Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/mon
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
61 files changed, 60089 insertions, 0 deletions
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
new file mode 100644
index 000000000..f08608c61
--- /dev/null
+++ b/src/mon/AuthMonitor.cc
@@ -0,0 +1,2033 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sstream>
+
+#include "mon/AuthMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MonitorDBStore.h"
+#include "mon/OSDMonitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/ConfigMonitor.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonGlobalID.h"
+#include "msg/Messenger.h"
+
+#include "auth/AuthServiceHandler.h"
+#include "auth/KeyRing.h"
+#include "include/stringify.h"
+#include "include/ceph_assert.h"
+
+#include "mds/MDSAuthCaps.h"
+#include "mgr/MgrCap.h"
+#include "osd/OSDCap.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_last_committed())
+using namespace TOPNSPC::common;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+// Writes the debug-log prefix "mon.<name>@<rank>(<state>).auth v<v> " to
+// *_dout and returns that stream so the caller can keep appending.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, version_t v) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").auth v" << v << " ";
+  return out;
+}
+
+// Stream representation of the auth monitor: always the literal "auth".
+ostream& operator<<(ostream &out, const AuthMonitor &pm)
+{
+  out << "auth";
+  return out;
+}
+
+// Asks the key server to prepare a rotating-key update. When it reports one,
+// the update is queued as a pending cephx incremental and true is returned;
+// otherwise nothing is queued and false is returned.
+bool AuthMonitor::check_rotate()
+{
+  KeyServerData::Incremental rotating_inc;
+  rotating_inc.op = KeyServerData::AUTH_INC_SET_ROTATING;
+
+  const bool have_update =
+    mon.key_server.prepare_rotating_update(rotating_inc.rotating_bl);
+  if (!have_update) {
+    return false;
+  }
+
+  dout(10) << __func__ << " updating rotating" << dendl;
+  push_cephx_inc(rotating_inc);
+  return true;
+}
+
+/*
+ * Periodic tick.  Does nothing unless this service is active.
+ *
+ * If the pool of preallocated global ids is running low, the leader queues
+ * an increase of max_global_id while any other monitor asks the leader for
+ * more ids.  The leader additionally checks whether the rotating keys need
+ * an update, and proposes the pending state if anything was queued.
+ */
+
+void AuthMonitor::tick()
+{
+  if (!is_active()) return;
+
+  dout(10) << *this << dendl;
+
+  // increase global_id?
+  bool propose = false;
+  bool increase;
+  {
+    // _should_increase_max_global_id() asserts that auth_lock is held
+    std::lock_guard l(mon.auth_lock);
+    increase = _should_increase_max_global_id();
+  }
+  if (increase) {
+    if (mon.is_leader()) {
+      increase_max_global_id();
+      propose = true;
+    } else {
+      // leading space keeps the function name and the message apart
+      dout(10) << __func__ << " requesting more ids from leader" << dendl;
+      int leader = mon.get_leader();
+      MMonGlobalID *req = new MMonGlobalID();
+      req->old_max_id = max_global_id;
+      mon.send_mon_message(req, leader);
+    }
+  }
+
+  // everything below is leader-only work
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  if (check_rotate()) {
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+}
+
+// Called when the service becomes active.  Only the leader acts: it starts
+// the key server and, if the service is writeable, queues a rotating-key
+// update and/or a max_global_id increase and proposes them.
+void AuthMonitor::on_active()
+{
+  dout(10) << "AuthMonitor::on_active()" << dendl;
+
+  if (!mon.is_leader())
+    return;
+
+  mon.key_server.start_server();
+
+  if (!is_writeable())
+    return;
+
+  // rotating keys are checked first, then the global id pool
+  bool need_proposal = check_rotate();
+
+  bool low_on_ids;
+  {
+    std::lock_guard l(mon.auth_lock);
+    low_on_ids = _should_increase_max_global_id();
+  }
+  if (low_on_ids) {
+    increase_max_global_id();
+    need_proposal = true;
+  }
+
+  if (need_proposal) {
+    propose_pending();
+  }
+}
+
+// Returns a bufferlist holding the encoded form of the capability string.
+bufferlist _encode_cap(const string& cap)
+{
+  bufferlist encoded;
+  encode(cap, encoded);
+  return encoded;
+}
+
+// Loads the keyring stored under ("mkfs", "keyring") in the monitor store
+// into *keyring.  A missing entry (-ENOENT) leaves *keyring untouched; any
+// other store error trips an assert.
+void AuthMonitor::get_initial_keyring(KeyRing *keyring)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(keyring != nullptr);
+
+  bufferlist stored;
+  const int r = mon.store->get("mkfs", "keyring", stored);
+  if (r != -ENOENT) {
+    // fail hard only if there's an error we're not expecting to see
+    ceph_assert(r == 0);
+
+    auto it = stored.cbegin();
+    decode(*keyring, it);
+  }
+}
+
+// Appends one (name, auth) pair per built-in bootstrap entity to *auth_lst.
+// Each entity is named "client.<id>", gets a freshly created AES key and the
+// capabilities listed in the table below.
+void _generate_bootstrap_keys(
+    list<pair<EntityName,EntityAuth> >* auth_lst)
+{
+  ceph_assert(auth_lst != nullptr);
+
+  // entity id -> (service -> encoded capability string)
+  map<string,map<string,bufferlist> > bootstrap = {
+    { "admin", {
+      { "mon", _encode_cap("allow *") },
+      { "osd", _encode_cap("allow *") },
+      { "mds", _encode_cap("allow *") },
+      { "mgr", _encode_cap("allow *") }
+    } },
+    { "bootstrap-osd", {
+      { "mon", _encode_cap("allow profile bootstrap-osd") }
+    } },
+    { "bootstrap-rgw", {
+      { "mon", _encode_cap("allow profile bootstrap-rgw") }
+    } },
+    { "bootstrap-mds", {
+      { "mon", _encode_cap("allow profile bootstrap-mds") }
+    } },
+    { "bootstrap-mgr", {
+      { "mon", _encode_cap("allow profile bootstrap-mgr") }
+    } },
+    { "bootstrap-rbd", {
+      { "mon", _encode_cap("allow profile bootstrap-rbd") }
+    } },
+    { "bootstrap-rbd-mirror", {
+      { "mon", _encode_cap("allow profile bootstrap-rbd-mirror") }
+    } }
+  };
+
+  for (auto& [id, caps] : bootstrap) {
+    EntityName entity;
+    entity.from_str("client." + id);
+
+    EntityAuth entity_auth;
+    entity_auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    entity_auth.caps = caps;
+
+    auth_lst->emplace_back(entity, entity_auth);
+  }
+}
+
+// Adds every generated bootstrap entity that *keyring does not already
+// contain; entities already present are left untouched.
+void AuthMonitor::create_initial_keys(KeyRing *keyring)
+{
+  dout(10) << __func__ << " with keyring" << dendl;
+  ceph_assert(keyring != nullptr);
+
+  list<pair<EntityName,EntityAuth> > generated;
+  _generate_bootstrap_keys(&generated);
+
+  for (auto& [entity, entity_auth] : generated) {
+    if (!keyring->exists(entity)) {
+      keyring->add(entity, entity_auth);
+    }
+  }
+}
+
+// Queues the pending incrementals for the very first auth map: a rotating
+// key update, the initial keyring entities (only when the monitor requires
+// a keyring) and the starting max_global_id.  Also sets format_version to 3.
+void AuthMonitor::create_initial()
+{
+  dout(10) << "create_initial -- creating initial map" << dendl;
+
+  // initialize rotating keys
+  mon.key_server.clear_secrets();
+  check_rotate();
+  // check_rotate() is expected to have queued exactly one incremental
+  ceph_assert(pending_auth.size() == 1);
+
+  if (mon.is_keyring_required()) {
+    KeyRing keyring;
+    // attempt to obtain an existing mkfs-time keyring
+    get_initial_keyring(&keyring);
+    // create missing keys in the keyring
+    create_initial_keys(&keyring);
+    // import the resulting keyring
+    import_keyring(keyring);
+  }
+
+  max_global_id = MIN_GLOBAL_ID;
+
+  // record the starting max_global_id as a pending GLOBAL_ID incremental
+  Incremental inc;
+  inc.inc_type = GLOBAL_ID;
+  inc.max_global_id = max_global_id;
+  pending_auth.push_back(inc);
+
+  format_version = 3;
+}
+
+// Brings the in-memory key server (and max_global_id) up to the last
+// committed version: optionally loads the latest full snapshot when it is
+// newer than the key server's version, then replays every incremental
+// version up to the last committed one.
+// need_bootstrap is not used by this implementation.
+void AuthMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+  load_health();
+
+  version_t version = get_last_committed();
+  version_t keys_ver = mon.key_server.get_ver();
+  // nothing to do when the key server is already at the committed version
+  if (version == keys_ver)
+    return;
+  ceph_assert(version > keys_ver);
+
+  version_t latest_full = get_version_latest_full();
+
+  dout(10) << __func__ << " version " << version << " keys ver " << keys_ver
+           << " latest " << latest_full << dendl;
+
+  // start from the latest full snapshot if it is ahead of the key server
+  if ((latest_full > 0) && (latest_full > keys_ver)) {
+    bufferlist latest_bl;
+    int err = get_version_full(latest_full, latest_bl);
+    ceph_assert(err == 0);
+    ceph_assert(latest_bl.length() != 0);
+    dout(7) << __func__ << " loading summary e " << latest_full << dendl;
+    dout(7) << __func__ << " latest length " << latest_bl.length() << dendl;
+    auto p = latest_bl.cbegin();
+    // same field order as written by encode_full(): version byte,
+    // max_global_id, key server
+    __u8 struct_v;
+    decode(struct_v, p);
+    decode(max_global_id, p);
+    decode(mon.key_server, p);
+    mon.key_server.set_ver(latest_full);
+    keys_ver = latest_full;
+  }
+
+  dout(10) << __func__ << " key server version " << mon.key_server.get_ver() << dendl;
+
+  // walk through incrementals
+  while (version > keys_ver) {
+    bufferlist bl;
+    int ret = get_version(keys_ver+1, bl);
+    ceph_assert(ret == 0);
+    ceph_assert(bl.length());
+
+    // reset if we are moving to initial state.  we will normally have
+    // keys in here temporarily for bootstrapping that we need to
+    // clear out.
+    if (keys_ver == 0)
+      mon.key_server.clear_secrets();
+
+    dout(20) << __func__ << " walking through version " << (keys_ver+1)
+             << " len " << bl.length() << dendl;
+
+    auto p = bl.cbegin();
+    // leading version byte, then a sequence of Incrementals until the end
+    __u8 v;
+    decode(v, p);
+    while (!p.end()) {
+      Incremental inc;
+      decode(inc, p);
+      switch (inc.inc_type) {
+      case GLOBAL_ID:
+	max_global_id = inc.max_global_id;
+	break;
+
+      case AUTH_DATA:
+        {
+          // the payload is an encoded key server incremental
+          KeyServerData::Incremental auth_inc;
+          auto iter = inc.auth_data.cbegin();
+          decode(auth_inc, iter);
+          mon.key_server.apply_data_incremental(auth_inc);
+          break;
+        }
+      }
+    }
+
+    keys_ver++;
+    mon.key_server.set_ver(keys_ver);
+
+    // once version 1 has been applied, drop the mkfs-time keyring from the
+    // store
+    if (keys_ver == 1 && mon.is_keyring_required()) {
+      auto t(std::make_shared<MonitorDBStore::Transaction>());
+      t->erase("mkfs", "keyring");
+      mon.store->apply_transaction(t);
+    }
+  }
+
+  {
+    std::lock_guard l(mon.auth_lock);
+    // seed the allocator the first time a max_global_id becomes known
+    if (last_allocated_id == 0) {
+      last_allocated_id = max_global_id;
+      dout(10) << __func__ << " last_allocated_id initialized to "
+	       << max_global_id << dendl;
+    }
+  }
+
+  dout(10) << __func__ << " max_global_id=" << max_global_id
+	   << " format_version " << format_version
+	   << dendl;
+
+  mon.key_server.dump();
+}
+
+// Returns true when max_global_id is below the configured preallocation
+// size, or when the next id to hand out has reached the last half of the
+// preallocated range.  The caller must hold mon.auth_lock.
+bool AuthMonitor::_should_increase_max_global_id()
+{
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+  auto prealloc = g_conf()->mon_globalid_prealloc;
+  return max_global_id < prealloc ||
+    (last_allocated_id + 1) >= max_global_id - prealloc / 2;
+}
+
+// Leader only: queues a pending GLOBAL_ID incremental that raises
+// max_global_id by mon_globalid_prealloc.  Does not propose by itself.
+void AuthMonitor::increase_max_global_id()
+{
+  ceph_assert(mon.is_leader());
+
+  Incremental bump;
+  bump.inc_type = GLOBAL_ID;
+  bump.max_global_id = max_global_id + g_conf()->mon_globalid_prealloc;
+
+  dout(10) << "increasing max_global_id to " << bump.max_global_id << dendl;
+  pending_auth.push_back(bump);
+}
+
+// A proposal is wanted whenever at least one incremental is pending.
+// The delay argument is left unmodified.
+bool AuthMonitor::should_propose(double& delay)
+{
+  const bool have_pending = !pending_auth.empty();
+  return have_pending;
+}
+
+// Starts a new pending state by discarding all queued incrementals.
+void AuthMonitor::create_pending()
+{
+  pending_auth.clear();
+
+  const version_t next_version = get_last_committed() + 1;
+  dout(10) << "create_pending v " << next_version << dendl;
+}
+
+// Encodes all pending incrementals as version last_committed + 1 into
+// transaction t, then recomputes the AUTH_BAD_CAPS health check from the
+// key server's current secrets adjusted by the pending add/delete
+// incrementals, and encodes the resulting health map into t as well.
+void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " v " << (get_last_committed() + 1) << dendl;
+
+  bufferlist bl;
+
+  // leading version byte, followed by every pending incremental
+  __u8 v = 1;
+  encode(v, bl);
+  vector<Incremental>::iterator p;
+  for (p = pending_auth.begin(); p != pending_auth.end(); ++p)
+    p->encode(bl, mon.get_quorum_con_features());
+
+  version_t version = get_last_committed() + 1;
+  put_version(t, version, bl);
+  put_last_committed(t, version);
+
+  // health
+  health_check_map_t next;
+  map<string,list<string>> bad_detail;  // entity -> details
+  // pass 1: validate the caps of every entity the key server holds now
+  for (auto i = mon.key_server.secrets_begin();
+       i != mon.key_server.secrets_end();
+       ++i) {
+    for (auto& p : i->second.caps) {
+      ostringstream ss;
+      if (!valid_caps(p.first, p.second, &ss)) {
+	ostringstream ss2;
+	ss2 << i->first << " " << ss.str();
+	bad_detail[i->first.to_str()].push_back(ss2.str());
+      }
+    }
+  }
+  // pass 2: account for pending changes -- deleted entities drop out of the
+  // report, added entities have their caps validated as well
+  for (auto& inc : pending_auth) {
+    if (inc.inc_type == AUTH_DATA) {
+      KeyServerData::Incremental auth_inc;
+      auto iter = inc.auth_data.cbegin();
+      decode(auth_inc, iter);
+      if (auth_inc.op == KeyServerData::AUTH_INC_DEL) {
+	bad_detail.erase(auth_inc.name.to_str());
+      } else if (auth_inc.op == KeyServerData::AUTH_INC_ADD) {
+	for (auto& p : auth_inc.auth.caps) {
+	  ostringstream ss;
+	  if (!valid_caps(p.first, p.second, &ss)) {
+	    ostringstream ss2;
+	    ss2 << auth_inc.name << " " << ss.str();
+	    bad_detail[auth_inc.name.to_str()].push_back(ss2.str());
+	  }
+	}
+      }
+    }
+  }
+  // raise a single HEALTH_ERR check listing every collected detail line
+  if (bad_detail.size()) {
+    ostringstream ss;
+    ss << bad_detail.size() << " auth entities have invalid capabilities";
+    health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str(),
+				      bad_detail.size());
+    for (auto& i : bad_detail) {
+      for (auto& j : i.second) {
+	check->detail.push_back(j);
+      }
+    }
+  }
+  encode_health(next, t);
+}
+
+// Stashes a full snapshot at the key server's current version into
+// transaction t: a version byte, max_global_id and the encoded key server.
+// Nothing is written while the key server is still at version 0.
+void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t)
+{
+  version_t version = mon.key_server.get_ver();
+  // do not stash full version 0 as it will never be removed nor read
+  if (version == 0)
+    return;
+
+  dout(10) << __func__ << " auth v " << version << dendl;
+  ceph_assert(get_last_committed() == version);
+
+  bufferlist full_bl;
+  // the key server lock is held from here until the function returns
+  std::scoped_lock l{mon.key_server.get_lock()};
+  dout(20) << __func__ << " key server has "
+           << (mon.key_server.has_secrets() ? "" : "no ")
+           << "secrets!" << dendl;
+  // field order must match what update_from_paxos() decodes
+  __u8 v = 1;
+  encode(v, full_bl);
+  encode(max_global_id, full_bl);
+  encode(mon.key_server, full_bl);
+
+  put_version_full(t, version, full_bl);
+  put_version_latest_full(t, version);
+}
+
+// Returns the version up to which old state may be trimmed: on the leader,
+// last_committed minus twice paxos_max_join_drift when that is positive;
+// 0 in every other case.
+version_t AuthMonitor::get_trim_to() const
+{
+  const unsigned keep = g_conf()->paxos_max_join_drift * 2;
+  const version_t committed = get_last_committed();
+  if (!mon.is_leader() || committed <= keep) {
+    return 0;
+  }
+  return committed - keep;
+}
+
+// Read-only first pass over an incoming message, dispatched on its type.
+// Returns true when the message has been fully handled here and false when
+// it must go on to prepare_update().  Unknown message types abort.
+bool AuthMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      // malformed command arguments: answer with -EINVAL right away
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+
+  if (msg_type == CEPH_MSG_AUTH) {
+    return prep_auth(op, false);
+  }
+
+  if (msg_type == MSG_MON_GLOBAL_ID) {
+    // always needs the update path
+    return false;
+  }
+
+  ceph_abort();
+  return true;
+}
+
+// Update pass over an incoming message, dispatched on its type; the return
+// value is whatever the per-type handler returns.  Unknown message types
+// abort.
+bool AuthMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      // malformed command arguments: answer with -EINVAL right away
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+
+  if (msg_type == MSG_MON_GLOBAL_ID) {
+    return prepare_global_id(op);
+  }
+
+  if (msg_type == CEPH_MSG_AUTH) {
+    return prep_auth(op, true);
+  }
+
+  ceph_abort();
+  return false;
+}
+
+// Records the monitor count and this monitor's rank, which
+// _assign_global_id() uses to space out the ids it hands out.
+// The caller must hold mon.auth_lock.
+void AuthMonitor::_set_mon_num_rank(int num, int rank)
+{
+  dout(10) << __func__ << " num " << num << " rank " << rank << dendl;
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+  mon_num = num;
+  mon_rank = rank;
+}
+
+// Hands out the next global id for this monitor, or 0 when none can be
+// assigned (mon_num/mon_rank not set, last_allocated_id still 0, or the
+// candidate id would reach max_global_id).  On success last_allocated_id is
+// advanced to the returned id.  The caller must hold mon.auth_lock.
+uint64_t AuthMonitor::_assign_global_id()
+{
+  ceph_assert(ceph_mutex_is_locked(mon.auth_lock));
+  if (mon_num < 1 || mon_rank < 0) {
+    dout(10) << __func__ << " inactive (num_mon " << mon_num
+	     << " rank " << mon_rank << ")" << dendl;
+    return 0;
+  }
+  if (!last_allocated_id) {
+    dout(10) << __func__ << " last_allocated_id == 0" << dendl;
+    return 0;
+  }
+
+  // round last_allocated_id + 1 up to the next multiple of mon_num, then
+  // add mon_rank, so the result is congruent to mon_rank modulo mon_num
+  uint64_t id = last_allocated_id + 1;
+  int remainder = id % mon_num;
+  if (remainder) {
+    remainder = mon_num - remainder;
+  }
+  id += remainder + mon_rank;
+
+  if (id >= max_global_id) {
+    dout(10) << __func__ << " failed (max " << max_global_id << ")" << dendl;
+    return 0;
+  }
+
+  last_allocated_id = id;
+  dout(10) << __func__ << " " << id << " (max " << max_global_id << ")"
+	   << dendl;
+  return id;
+}
+
+// Locked wrapper around _assign_global_id().  When should_increase_max is
+// set and the id pool is running low, the leader additionally queues a
+// max_global_id increase.  Returns the assigned id, or 0 if none could be
+// assigned.
+uint64_t AuthMonitor::assign_global_id(bool should_increase_max)
+{
+  uint64_t id;
+  {
+    std::lock_guard l(mon.auth_lock);
+    id =_assign_global_id();
+    // only consult the pool state when the caller allows an increase
+    if (should_increase_max) {
+      should_increase_max = _should_increase_max_global_id();
+    }
+  }
+  if (mon.is_leader() &&
+      should_increase_max) {
+    increase_max_global_id();
+  }
+  return id;
+}
+
+// Handles one MAuth message of an authentication exchange.
+//
+// On the first message (protocol 0 and no handler on the session) the
+// client's supported protocols, entity name and global id are decoded and an
+// auth service handler is chosen.  A global id is assigned if the connection
+// has none.  The request is then passed to the handler and an MAuthReply is
+// sent back.
+//
+// paxos_writable is false when called from preprocess_query() and true when
+// called from prepare_update().
+//
+// Returns true on every path except one: on the leader, when no global id
+// could be assigned and paxos_writable is false, it returns false.
+bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable)
+{
+  auto m = op->get_req<MAuth>();
+  dout(10) << "prep_auth() blob_size=" << m->get_auth_payload().length() << dendl;
+
+  MonSession *s = op->get_session();
+  if (!s) {
+    dout(10) << "no session, dropping" << dendl;
+    return true;
+  }
+
+  int ret = 0;
+  MAuthReply *reply;
+  bufferlist response_bl;
+  auto indata = m->auth_payload.cbegin();
+  __u32 proto = m->protocol;
+  bool start = false;
+  bool finished = false;
+  EntityName entity_name;
+  bool is_new_global_id = false;
+
+  // set up handler?
+  if (m->protocol == 0 && !s->auth_handler) {
+    set<__u32> supported;
+
+    try {
+      __u8 struct_v = 1;
+      decode(struct_v, indata);
+      decode(supported, indata);
+      decode(entity_name, indata);
+      decode(s->con->peer_global_id, indata);
+    } catch (const ceph::buffer::error &e) {
+      dout(10) << "failed to decode initial auth message" << dendl;
+      ret = -EINVAL;
+      goto reply;
+    }
+
+    // do we require cephx signatures?
+
+    // peers lacking a required feature get cephx removed from the set of
+    // protocols they may use; mon/osd/mds/mgr peers are checked against the
+    // "cluster" options, everything else against the "service" options
+    if (!m->get_connection()->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+      if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+	if (g_conf()->cephx_cluster_require_signatures ||
+	    g_conf()->cephx_require_signatures) {
+	  dout(1) << m->get_source_inst()
+                  << " supports cephx but not signatures and"
+                  << " 'cephx [cluster] require signatures = true';"
+                  << " disallowing cephx" << dendl;
+	  supported.erase(CEPH_AUTH_CEPHX);
+	}
+      } else {
+	if (g_conf()->cephx_service_require_signatures ||
+	    g_conf()->cephx_require_signatures) {
+	  dout(1) << m->get_source_inst()
+                  << " supports cephx but not signatures and"
+                  << " 'cephx [service] require signatures = true';"
+                  << " disallowing cephx" << dendl;
+	  supported.erase(CEPH_AUTH_CEPHX);
+	}
+      }
+    } else if (!m->get_connection()->has_feature(CEPH_FEATURE_CEPHX_V2)) {
+      if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+	  entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+	if (g_conf()->cephx_cluster_require_version >= 2 ||
+	    g_conf()->cephx_require_version >= 2) {
+	  dout(1) << m->get_source_inst()
+                  << " supports cephx but not v2 and"
+                  << " 'cephx [cluster] require version >= 2';"
+                  << " disallowing cephx" << dendl;
+	  supported.erase(CEPH_AUTH_CEPHX);
+	}
+      } else {
+	if (g_conf()->cephx_service_require_version >= 2 ||
+	    g_conf()->cephx_require_version >= 2) {
+	  dout(1) << m->get_source_inst()
+                  << " supports cephx but not v2 and"
+                  << " 'cephx [service] require version >= 2';"
+                  << " disallowing cephx" << dendl;
+	  supported.erase(CEPH_AUTH_CEPHX);
+	}
+      }
+    }
+
+    // pick the protocol from the cluster or service requirement list,
+    // depending on the peer's entity type
+    int type;
+    if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+	entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+	entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+	entity_name.get_type() == CEPH_ENTITY_TYPE_MGR)
+      type = mon.auth_cluster_required.pick(supported);
+    else
+      type = mon.auth_service_required.pick(supported);
+
+    s->auth_handler = get_auth_service_handler(type, g_ceph_context, &mon.key_server);
+    if (!s->auth_handler) {
+      dout(1) << "client did not provide supported auth type" << dendl;
+      ret = -ENOTSUP;
+      goto reply;
+    }
+    start = true;
+    proto = type;
+  } else if (!s->auth_handler) {
+      dout(10) << "protocol specified but no s->auth_handler" << dendl;
+      ret = -EINVAL;
+      goto reply;
+  }
+
+  /* assign a new global_id? we assume this should only happen on the first
+     request. If a client tries to send it later, it'll screw up its auth
+     session */
+  if (!s->con->peer_global_id) {
+    s->con->peer_global_id = assign_global_id(paxos_writable);
+    if (!s->con->peer_global_id) {
+
+      // no id available: drop the handler so that a retried message sets
+      // the session up again from scratch
+      delete s->auth_handler;
+      s->auth_handler = NULL;
+
+      if (mon.is_leader() && paxos_writable) {
+        dout(10) << "increasing global id, waitlisting message" << dendl;
+        wait_for_active(op, new C_RetryMessage(this, op));
+        goto done;
+      }
+
+      if (!mon.is_leader()) {
+	dout(10) << "not the leader, requesting more ids from leader" << dendl;
+	int leader = mon.get_leader();
+	MMonGlobalID *req = new MMonGlobalID();
+	req->old_max_id = max_global_id;
+	mon.send_mon_message(req, leader);
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+	return true;
+      }
+
+      // leader, but called with paxos_writable == false: hand the message
+      // back to the caller by returning false
+      ceph_assert(!paxos_writable);
+      return false;
+    }
+    is_new_global_id = true;
+  }
+
+  try {
+    if (start) {
+      // new session
+      ret = s->auth_handler->start_session(entity_name,
+					   s->con->peer_global_id,
+					   is_new_global_id,
+					   &response_bl,
+					   &s->con->peer_caps_info);
+    } else {
+      // request
+      ret = s->auth_handler->handle_request(
+	indata,
+	0, // no connection_secret needed
+	&response_bl,
+	&s->con->peer_caps_info,
+	nullptr, nullptr);
+    }
+    // -EIO is not reported to the client; the message is retried once the
+    // service is active again
+    if (ret == -EIO) {
+      wait_for_active(op, new C_RetryMessage(this,op));
+      goto done;
+    }
+    // a positive result is turned into success (0) for the reply
+    if (ret > 0) {
+      if (!s->authenticated &&
+	  mon.ms_handle_authentication(s->con.get()) > 0) {
+	finished = true;
+      }
+      ret = 0;
+    }
+  } catch (const ceph::buffer::error &err) {
+    ret = -EINVAL;
+    dout(0) << "caught error when trying to handle auth request, probably malformed request" << dendl;
+  }
+
+reply:
+  reply = new MAuthReply(proto, &response_bl, ret, s->con->peer_global_id);
+  mon.send_reply(op, reply);
+  if (finished) {
+    // always send the latest monmap.
+    if (m->monmap_epoch < mon.monmap->get_epoch())
+      mon.send_latest_monmap(m->get_connection().get());
+
+    mon.configmon()->check_sub(s);
+  }
+done:
+  return true;
+}
+
+// Read-only pass over an "auth ..." monitor command.
+//
+// Commands that modify state (auth add/del/rm/get-or-create/
+// get-or-create-key/import/caps and fs authorize) are not handled here:
+// false is returned so they continue to the update path.  Every other
+// command is answered directly and true is returned.
+//
+// Handled here: auth export, auth get, auth print-key / print_key / get-key,
+// auth list / ls.  Anything else is answered with -EINVAL.
+bool AuthMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  int r = -1;
+  bufferlist rdata;
+  stringstream ss, ds;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  // state-changing commands go on to the update path
+  if (prefix == "auth add" ||
+      prefix == "auth del" ||
+      prefix == "auth rm" ||
+      prefix == "auth get-or-create" ||
+      prefix == "auth get-or-create-key" ||
+      prefix == "fs authorize" ||
+      prefix == "auth import" ||
+      prefix == "auth caps") {
+    return false;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  // entity might not be supplied, but if it is, it should be valid
+  string entity_name;
+  cmd_getval(cmdmap, "entity", entity_name);
+  EntityName entity;
+  if (!entity_name.empty() && !entity.from_str(entity_name)) {
+    ss << "invalid entity_auth " << entity_name;
+    mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+    return true;
+  }
+
+  // f is tested for null below; without a formatter the plaintext encoding
+  // is used
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "auth export") {
+    KeyRing keyring;
+    export_keyring(keyring);
+    if (!entity_name.empty()) {
+      // export a single entity
+      EntityAuth eauth;
+      if (keyring.get_auth(entity, eauth)) {
+	KeyRing kr;
+	kr.add(entity, eauth);
+	if (f)
+	  kr.encode_formatted("auth", f.get(), rdata);
+	else
+	  kr.encode_plaintext(rdata);
+	ss << "export " << eauth;
+	r = 0;
+      } else {
+	// report the entity that was asked for; eauth was never filled in
+	// on this path
+	ss << "no key for " << entity;
+	r = -ENOENT;
+      }
+    } else {
+      // export everything
+      if (f)
+	keyring.encode_formatted("auth", f.get(), rdata);
+      else
+	keyring.encode_plaintext(rdata);
+
+      ss << "exported master keyring";
+      r = 0;
+    }
+  } else if (prefix == "auth get" && !entity_name.empty()) {
+    KeyRing keyring;
+    EntityAuth entity_auth;
+    if(!mon.key_server.get_auth(entity, entity_auth)) {
+      ss << "failed to find " << entity_name << " in keyring";
+      r = -ENOENT;
+    } else {
+      keyring.add(entity, entity_auth);
+      if (f)
+	keyring.encode_formatted("auth", f.get(), rdata);
+      else
+	keyring.encode_plaintext(rdata);
+      ss << "exported keyring for " << entity_name;
+      r = 0;
+    }
+  } else if (prefix == "auth print-key" ||
+	     prefix == "auth print_key" ||
+	     prefix == "auth get-key") {
+    EntityAuth auth;
+    if (!mon.key_server.get_auth(entity, auth)) {
+      ss << "don't have " << entity;
+      r = -ENOENT;
+      goto done;
+    }
+    if (f) {
+      auth.key.encode_formatted("auth", f.get(), rdata);
+    } else {
+      auth.key.encode_plaintext(rdata);
+    }
+    r = 0;
+  } else if (prefix == "auth list" ||
+	     prefix == "auth ls") {
+    if (f) {
+      mon.key_server.encode_formatted("auth", f.get(), rdata);
+    } else {
+      mon.key_server.encode_plaintext(rdata);
+      if (rdata.length() > 0)
+        ss << "installed auth entries:" << std::endl;
+      else
+        ss << "no installed auth entries!" << std::endl;
+    }
+    r = 0;
+    goto done;
+  } else {
+    ss << "invalid command";
+    r = -EINVAL;
+  }
+
+ done:
+  rdata.append(ds);
+  string rs;
+  getline(ss, rs, '\0');
+  mon.reply_command(op, r, rs, rdata, get_last_committed());
+  return true;
+}
+
+// Fills keyring by delegating to the key server's export_keyring().
+void AuthMonitor::export_keyring(KeyRing& keyring)
+{
+  mon.key_server.export_keyring(keyring);
+}
+
+// Queues an "add" incremental for every entity in keyring.  Stops and
+// returns -EINVAL at the first entity that has no caps (entities visited
+// before it stay queued); returns 0 once all entities have been queued.
+int AuthMonitor::import_keyring(KeyRing& keyring)
+{
+  dout(10) << __func__ << " " << keyring.size() << " keys" << dendl;
+
+  for (auto& [entity, entity_auth] : keyring.get_keys()) {
+    if (entity_auth.caps.empty()) {
+      dout(0) << "import: no caps supplied" << dendl;
+      return -EINVAL;
+    }
+    const int err = add_entity(entity, entity_auth);
+    ceph_assert(err == 0);
+  }
+  return 0;
+}
+
+// Queues a pending "delete" incremental for entity.  Returns -ENOENT when
+// the key server does not contain the entity, 0 otherwise.
+int AuthMonitor::remove_entity(const EntityName &entity)
+{
+  dout(10) << __func__ << " " << entity << dendl;
+  if (!mon.key_server.contains(entity))
+    return -ENOENT;
+
+  KeyServerData::Incremental auth_inc;
+  auth_inc.name = entity;
+  auth_inc.op = KeyServerData::AUTH_INC_DEL;
+  push_cephx_inc(auth_inc);
+
+  return 0;
+}
+
+// Returns true when the pending incrementals contain an "add" for entity,
+// i.e. the entity is about to be created by the next commit.
+bool AuthMonitor::entity_is_pending(EntityName& entity)
+{
+  for (auto& pending : pending_auth) {
+    if (pending.inc_type != AUTH_DATA) {
+      continue;
+    }
+
+    // AUTH_DATA payloads carry an encoded key server incremental
+    KeyServerData::Incremental auth_inc;
+    auto it = pending.auth_data.cbegin();
+    decode(auth_inc, it);
+
+    const bool is_add = (auth_inc.op == KeyServerData::AUTH_INC_ADD);
+    if (is_add && auth_inc.name == entity) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Convenience overload: unpacks entity into its name, auth and caps and
+// forwards to the five-argument exists_and_matches_entity().
+int AuthMonitor::exists_and_matches_entity(
+    const auth_entity_t& entity,
+    bool has_secret,
+    stringstream& ss)
+{
+  return exists_and_matches_entity(entity.name, entity.auth,
+                                   entity.auth.caps, has_secret, ss);
+}
+
+// Compares the given key and caps against what the key server holds for
+// name.
+//
+// Returns:
+//   -ENOENT  the key server has no such entity
+//   -EEXIST  has_secret is set and the stored key differs from auth's key
+//   -EINVAL  the caps differ in number or in content
+//   0        the entity exists and everything matches
+// On -EEXIST / -EINVAL an explanation is written to ss.
+int AuthMonitor::exists_and_matches_entity(
+    const EntityName& name,
+    const EntityAuth& auth,
+    const map<string,bufferlist>& caps,
+    bool has_secret,
+    stringstream& ss)
+{
+
+  dout(20) << __func__ << " entity " << name << " auth " << auth
+           << " caps " << caps << " has_secret " << has_secret << dendl;
+
+  EntityAuth existing_auth;
+  if (!mon.key_server.get_auth(name, existing_auth)) {
+    // no such entity
+    return -ENOENT;
+  }
+
+  // the key is only compared when the caller supplied one
+  if (has_secret &&
+      existing_auth.key.get_secret().cmp(auth.key.get_secret())) {
+    ss << "entity " << name << " exists but key does not match";
+    return -EEXIST;
+  }
+
+  // same number of caps, and every requested cap present with equal content
+  if (caps.size() != existing_auth.caps.size()) {
+    ss << "entity " << name << " exists but caps do not match";
+    return -EINVAL;
+  }
+  for (const auto& [service, wanted] : caps) {
+    auto found = existing_auth.caps.find(service);
+    if (found == existing_auth.caps.end() ||
+        !found->second.contents_equal(wanted)) {
+      ss << "entity " << name << " exists but cap "
+        << service << " does not match";
+      return -EINVAL;
+    }
+  }
+
+  // they match, no-op
+  return 0;
+}
+
+// Queues a pending "add" incremental carrying name and auth.  No check for
+// an existing entity is made here.  Always returns 0.
+int AuthMonitor::add_entity(
+    const EntityName& name,
+    const EntityAuth& auth)
+{
+
+  // okay, add it.
+  KeyServerData::Incremental auth_inc;
+  auth_inc.op = KeyServerData::AUTH_INC_ADD;
+  auth_inc.name = name;
+  auth_inc.auth = auth;
+
+  dout(10) << " add auth entity " << auth_inc.name << dendl;
+  dout(30) << "    " << auth_inc.auth << dendl;
+  push_cephx_inc(auth_inc);
+  return 0;
+}
+
+// Builds the entity names used by an OSD ("osd.<id>" and
+// "client.osd-lockbox.<uuid>") into cephx_entity / lockbox_entity and checks
+// whether the key server knows either of them.  Requires paxos to be
+// plugged.
+//
+// Returns -EINVAL (with a message in ss) if either name cannot be parsed,
+// -ENOENT if the key server contains neither entity, and 0 otherwise.
+int AuthMonitor::validate_osd_destroy(
+    int32_t id,
+    const uuid_d& uuid,
+    EntityName& cephx_entity,
+    EntityName& lockbox_entity,
+    stringstream& ss)
+{
+  ceph_assert(paxos.is_plugged());
+
+  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+
+  string cephx_str = "osd." + stringify(id);
+  string lockbox_str = "client.osd-lockbox." + stringify(uuid);
+
+  if (!cephx_entity.from_str(cephx_str)) {
+    dout(10) << __func__ << " invalid cephx entity '"
+             << cephx_str << "'" << dendl;
+    ss << "invalid cephx key entity '" << cephx_str << "'";
+    return -EINVAL;
+  }
+
+  if (!lockbox_entity.from_str(lockbox_str)) {
+    dout(10) << __func__ << " invalid lockbox entity '"
+             << lockbox_str << "'" << dendl;
+    ss << "invalid lockbox key entity '" << lockbox_str << "'";
+    return -EINVAL;
+  }
+
+  // only report -ENOENT when both entities are missing
+  if (!mon.key_server.contains(cephx_entity) &&
+      !mon.key_server.contains(lockbox_entity)) {
+    return -ENOENT;
+  }
+
+  return 0;
+}
+
+// Queues removal of an OSD's cephx and lockbox entities and, if at least
+// one of them was queued, encodes the pending state via propose_pending().
+// Requires paxos to be plugged.  Always returns 0.
+int AuthMonitor::do_osd_destroy(
+    const EntityName& cephx_entity,
+    const EntityName& lockbox_entity)
+{
+  ceph_assert(paxos.is_plugged());
+
+  dout(10) << __func__ << " cephx " << cephx_entity
+                       << " lockbox " << lockbox_entity << dendl;
+
+  const bool cephx_missing = (remove_entity(cephx_entity) == -ENOENT);
+  if (cephx_missing) {
+    dout(10) << __func__ << " " << cephx_entity << " does not exist" << dendl;
+  }
+
+  const bool lockbox_missing = (remove_entity(lockbox_entity) == -ENOENT);
+  if (lockbox_missing) {
+    dout(10) << __func__ << " " << lockbox_entity << " does not exist" << dendl;
+  }
+
+  if (cephx_missing && lockbox_missing) {
+    dout(10) << __func__ << " entities do not exist -- no-op." << dendl;
+    return 0;
+  }
+
+  // given we have paxos plugged, this will not result in a proposal
+  // being triggered, but it will still be needed so that we get our
+  // pending state encoded into the paxos' pending transaction.
+  propose_pending();
+  return 0;
+}
+
+// Fills auth from a base64-encoded key and a set of caps.
+// Returns -EINVAL when key is empty or cannot be decoded (auth.caps is left
+// unassigned in that case), 0 on success.
+int _create_auth(
+    EntityAuth& auth,
+    const string& key,
+    const map<string,bufferlist>& caps)
+{
+  if (key.empty())
+    return -EINVAL;
+  try {
+    auth.key.decode_base64(key);
+  } catch (ceph::buffer::error& e) {
+    // a decode failure is reported as an invalid argument
+    return -EINVAL;
+  }
+  auth.caps = caps;
+  return 0;
+}
+
+/*
+ * Validate the auth side of an `osd new` request and fill in the entities
+ * that do_osd_new() will later add.
+ *
+ * Returns:
+ *   -EINVAL  an entity name or a secret is not valid (reason goes to `ss`)
+ *   -EAGAIN  one of the entities is part of a pending change
+ *   < 0      any other error reported by exists_and_matches_entity()
+ *   EEXIST   (positive) every requested entity already exists and matches
+ *   0        at least one requested entity still has to be created
+ *
+ * The lockbox entity is only considered when `lockbox_secret` is not empty.
+ */
+int AuthMonitor::validate_osd_new(
+    int32_t id,
+    const uuid_d& uuid,
+    const string& cephx_secret,
+    const string& lockbox_secret,
+    auth_entity_t& cephx_entity,
+    auth_entity_t& lockbox_entity,
+    stringstream& ss)
+{
+  dout(10) << __func__ << " osd." << id << " uuid " << uuid << dendl;
+
+  const bool has_lockbox = !lockbox_secret.empty();
+
+  const string cephx_name = "osd." + stringify(id);
+  if (!cephx_entity.name.from_str(cephx_name)) {
+    dout(10) << __func__ << " invalid cephx entity '"
+             << cephx_name << "'" << dendl;
+    ss << "invalid cephx key entity '" << cephx_name << "'";
+    return -EINVAL;
+  }
+
+  const string lockbox_name = "client.osd-lockbox." + stringify(uuid);
+  if (has_lockbox && !lockbox_entity.name.from_str(lockbox_name)) {
+    dout(10) << __func__ << " invalid cephx lockbox entity '"
+             << lockbox_name << "'" << dendl;
+    ss << "invalid cephx lockbox entity '" << lockbox_name << "'";
+    return -EINVAL;
+  }
+
+  // If either entity is already part of a pending change, the safest
+  // option is to have the command retried later.  That can happen because
+  // another `osd new` ran (unlikely, as that operation forces a paxos
+  // proposal) or, more likely, because a competing client created the
+  // entities before we got to handle this command.
+  if (entity_is_pending(cephx_entity.name)) {
+    return -EAGAIN;
+  }
+  if (has_lockbox && entity_is_pending(lockbox_entity.name)) {
+    return -EAGAIN;
+  }
+
+  if (!is_valid_cephx_key(cephx_secret)) {
+    ss << "invalid cephx secret.";
+    return -EINVAL;
+  }
+  if (has_lockbox && !is_valid_cephx_key(lockbox_secret)) {
+    ss << "invalid cephx lockbox secret.";
+    return -EINVAL;
+  }
+
+  // --- cephx entity ---
+  map<string,bufferlist> cephx_caps = {
+    { "osd", _encode_cap("allow *") },
+    { "mon", _encode_cap("allow profile osd") },
+    { "mgr", _encode_cap("allow profile osd") }
+  };
+  int r = _create_auth(cephx_entity.auth, cephx_secret, cephx_caps);
+  ceph_assert(0 == r);
+
+  bool cephx_matches = false;
+  r = exists_and_matches_entity(cephx_entity, true, ss);
+  if (r == -ENOENT) {
+    // not there yet; it will have to be created
+  } else if (r < 0) {
+    return r;
+  } else {
+    ceph_assert(0 == r);
+    cephx_matches = true;
+  }
+
+  // --- lockbox entity (optional) ---
+  bool lockbox_matches = false;
+  if (has_lockbox) {
+    map<string,bufferlist> lockbox_caps = {
+      { "mon", _encode_cap("allow command \"config-key get\" "
+          "with key=\"dm-crypt/osd/" +
+          stringify(uuid) +
+          "/luks\"") }
+    };
+    r = _create_auth(lockbox_entity.auth, lockbox_secret, lockbox_caps);
+    ceph_assert(r == 0);
+
+    r = exists_and_matches_entity(lockbox_entity, true, ss);
+    if (r == -ENOENT) {
+      // not there yet; it will have to be created
+    } else if (r < 0) {
+      return r;
+    } else {
+      ceph_assert(0 == r);
+      lockbox_matches = true;
+    }
+  }
+
+  // everything that was asked for already exists with matching contents;
+  // note that this is reported as a *positive* EEXIST.
+  const bool all_match =
+    cephx_matches && (!has_lockbox || lockbox_matches);
+  if (all_match) {
+    return EEXIST;
+  }
+
+  return 0;
+}
+
+/*
+ * Add the entities prepared for an `osd new` request.
+ *
+ * Must be called with paxos plugged (asserted).  An entity is only added
+ * when the key server does not already contain it; the lockbox entity is
+ * ignored unless `has_lockbox` is set.  Always returns 0.
+ */
+int AuthMonitor::do_osd_new(
+    const auth_entity_t& cephx_entity,
+    const auth_entity_t& lockbox_entity,
+    bool has_lockbox)
+{
+  ceph_assert(paxos.is_plugged());
+
+  dout(10) << __func__ << " cephx " << cephx_entity.name
+           << " lockbox ";
+  if (!has_lockbox) {
+    *_dout << "n/a";
+  } else {
+    *_dout << lockbox_entity.name;
+  }
+  *_dout << dendl;
+
+  // Validation is expected to have happened before we get here, so a key
+  // that already exists is also known to match; otherwise the request
+  // would have failed before this function was called.
+  if (!mon.key_server.contains(cephx_entity.name)) {
+    int r = add_entity(cephx_entity.name, cephx_entity.auth);
+    ceph_assert(0 == r);
+  }
+
+  if (has_lockbox) {
+    if (!mon.key_server.contains(lockbox_entity.name)) {
+      int r = add_entity(lockbox_entity.name, lockbox_entity.auth);
+      ceph_assert(0 == r);
+    }
+  }
+
+  // with paxos plugged this does not trigger a proposal by itself, but
+  // it is still required so that our pending state gets encoded into
+  // the paxos' pending transaction.
+  propose_pending();
+  return 0;
+}
+
+/*
+ * Check that `caps` parses as a capability string for service `type`.
+ *
+ * "mon" caps are always parsed.  For every other type the check is only
+ * performed when the "mon_auth_validate_all_caps" option is true;
+ * otherwise the caps are accepted as they are.  Types other than
+ * mon/mgr/osd/mds are rejected.  Parser diagnostics go to `out`.
+ */
+bool AuthMonitor::valid_caps(
+    const string& type,
+    const string& caps,
+    ostream *out)
+{
+  if (type == "mon") {
+    MonCap moncap;
+    return moncap.parse(caps, out) ? true : false;
+  }
+
+  const bool validate_all =
+    g_conf().get_val<bool>("mon_auth_validate_all_caps");
+  if (!validate_all) {
+    return true;
+  }
+
+  if (type == "mgr") {
+    MgrCap mgrcap;
+    return mgrcap.parse(caps, out) ? true : false;
+  }
+  if (type == "osd") {
+    OSDCap ocap;
+    return ocap.parse(caps, out) ? true : false;
+  }
+  if (type == "mds") {
+    MDSAuthCaps mdscap;
+    return mdscap.parse(g_ceph_context, caps, out) ? true : false;
+  }
+
+  if (out) {
+    *out << "unknown cap type '" << type << "'";
+  }
+  return false;
+}
+
+/*
+ * Validate a flat list of capabilities laid out as
+ * [type0, caps0, type1, caps1, ...].
+ *
+ * Returns false if a type has no matching caps string (odd number of
+ * elements) or if any pair is rejected by valid_caps(type, caps, out).
+ * An empty list is valid.  `out` may be null, in which case no
+ * diagnostic is written for a dangling type.
+ */
+bool AuthMonitor::valid_caps(const vector<string>& caps, ostream *out)
+{
+  for (size_t i = 0; i < caps.size(); i += 2) {
+    if (i + 1 >= caps.size()) {
+      // guard the stream like the per-type overload does
+      if (out) {
+        *out << "cap '" << caps[i] << "' has no value";
+      }
+      return false;
+    }
+    if (!valid_caps(caps[i], caps[i + 1], out)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/*
+ * Handle the mutating auth commands carried by an MMonCommand.
+ *
+ * Dispatches on the command prefix: "auth import", "auth add",
+ * "auth get-or-create[-key]", "fs authorize", "auth caps" and
+ * "auth del" / "auth rm".  Branches that queue a change register a
+ * C_Command via wait_for_finished_proposal() and return true.  Paths that
+ * reach the `done` label reply right away with `err` and return false.
+ * A few early failures (unparsable command, missing session, bad
+ * "auth import" payload) reply immediately and return true.
+ */
+bool AuthMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  stringstream ss, ds;
+  bufferlist rdata;
+  string rs;
+  int err = -EINVAL;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  vector<string>caps_vec;
+  string entity_name;
+  EntityName entity;
+
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  cmd_getval(cmdmap, "caps", caps_vec);
+  // the "fs authorize" command can have an odd number of caps arguments
+  if ((prefix != "fs authorize") && (caps_vec.size() % 2) != 0) {
+    ss << "bad capabilities request; odd number of arguments";
+    err = -EINVAL;
+    goto done;
+  }
+
+  cmd_getval(cmdmap, "entity", entity_name);
+  if (!entity_name.empty() && !entity.from_str(entity_name)) {
+    ss << "bad entity name";
+    err = -EINVAL;
+    goto done;
+  }
+
+  if (prefix == "auth import") {
+    bufferlist bl = m->get_data();
+    if (bl.length() == 0) {
+      ss << "auth import: no data supplied";
+      getline(ss, rs);
+      mon.reply_command(op, -EINVAL, rs, get_last_committed());
+      return true;
+    }
+    auto iter = bl.cbegin();
+    KeyRing keyring;
+    try {
+      decode(keyring, iter);
+    } catch (const ceph::buffer::error &ex) {
+      ss << "error decoding keyring" << " " << ex.what();
+      err = -EINVAL;
+      goto done;
+    }
+    err = import_keyring(keyring);
+    if (err < 0) {
+      ss << "auth import: no caps supplied";
+      getline(ss, rs);
+      mon.reply_command(op, -EINVAL, rs, get_last_committed());
+      return true;
+    }
+    ss << "imported keyring";
+    getline(ss, rs);
+    err = 0;
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "auth add" && !entity_name.empty()) {
+    /* expected behavior:
+     *  - if command reproduces current state, return 0.
+     *  - if command adds brand new entity, handle it.
+     *  - if command adds new state to existing entity, return error.
+     */
+    KeyServerData::Incremental auth_inc;
+    auth_inc.name = entity;
+    bufferlist bl = m->get_data();
+    bool has_keyring = (bl.length() > 0);
+    map<string,bufferlist> new_caps;
+
+    KeyRing new_keyring;
+    if (has_keyring) {
+      auto iter = bl.cbegin();
+      try {
+        decode(new_keyring, iter);
+      } catch (const ceph::buffer::error &ex) {
+        ss << "error decoding keyring";
+        err = -EINVAL;
+        goto done;
+      }
+    }
+
+    if (!valid_caps(caps_vec, &ss)) {
+      err = -EINVAL;
+      goto done;
+    }
+
+    // are we about to have it?
+    if (entity_is_pending(entity)) {
+      wait_for_finished_proposal(op,
+          new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+      return true;
+    }
+
+    // build new caps from provided arguments (if available)
+    for (vector<string>::iterator it = caps_vec.begin();
+	 it != caps_vec.end() && (it + 1) != caps_vec.end();
+	 it += 2) {
+      string sys = *it;
+      bufferlist cap;
+      encode(*(it+1), cap);
+      new_caps[sys] = cap;
+    }
+
+    // pull info out of provided keyring
+    EntityAuth new_inc;
+    if (has_keyring) {
+      if (!new_keyring.get_auth(auth_inc.name, new_inc)) {
+	ss << "key for " << auth_inc.name
+	   << " not found in provided keyring";
+	err = -EINVAL;
+	goto done;
+      }
+      if (!new_caps.empty() && !new_inc.caps.empty()) {
+	ss << "caps cannot be specified both in keyring and in command";
+	err = -EINVAL;
+	goto done;
+      }
+      if (new_caps.empty()) {
+	new_caps = new_inc.caps;
+      }
+    }
+
+    err = exists_and_matches_entity(auth_inc.name, new_inc,
+                                    new_caps, has_keyring, ss);
+    // if entity/key/caps do not exist in the keyring, just fall through
+    // and add the entity; otherwise, make sure everything matches (in
+    // which case it's a no-op), because if not we must fail.
+    if (err != -ENOENT) {
+      if (err < 0) {
+        goto done;
+      }
+      // no-op.
+      ceph_assert(err == 0);
+      goto done;
+    }
+    err = 0;
+
+    // okay, add it.
+    if (!has_keyring) {
+      dout(10) << "AuthMonitor::prepare_command generating random key for "
+        << auth_inc.name << dendl;
+      new_inc.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    }
+    new_inc.caps = new_caps;
+
+    err = add_entity(auth_inc.name, new_inc);
+    ceph_assert(err == 0);
+
+    ss << "added key for " << auth_inc.name;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						   get_last_committed() + 1));
+    return true;
+  } else if ((prefix == "auth get-or-create-key" ||
+	      prefix == "auth get-or-create") &&
+	     !entity_name.empty()) {
+    // auth get-or-create <name> [mon osdcapa osd osdcapb ...]
+
+    if (!valid_caps(caps_vec, &ss)) {
+      err = -EINVAL;
+      goto done;
+    }
+
+    // Parse the list of caps into a map
+    std::map<std::string, bufferlist> wanted_caps;
+    for (vector<string>::const_iterator it = caps_vec.begin();
+	 it != caps_vec.end() && (it + 1) != caps_vec.end();
+	 it += 2) {
+      const std::string &sys = *it;
+      bufferlist cap;
+      encode(*(it+1), cap);
+      wanted_caps[sys] = cap;
+    }
+
+    // do we have it?
+    EntityAuth entity_auth;
+    if (mon.key_server.get_auth(entity, entity_auth)) {
+      for (const auto &sys_cap : wanted_caps) {
+	if (entity_auth.caps.count(sys_cap.first) == 0 ||
+	    !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
+	  ss << "key for " << entity << " exists but cap " << sys_cap.first
+            << " does not match";
+	  err = -EINVAL;
+	  goto done;
+	}
+      }
+
+      if (prefix == "auth get-or-create-key") {
+        if (f) {
+          entity_auth.key.encode_formatted("auth", f.get(), rdata);
+        } else {
+          ds << entity_auth.key;
+        }
+      } else {
+	KeyRing kr;
+	kr.add(entity, entity_auth.key);
+        if (f) {
+          kr.set_caps(entity, entity_auth.caps);
+          kr.encode_formatted("auth", f.get(), rdata);
+        } else {
+          kr.encode_plaintext(rdata);
+        }
+      }
+      err = 0;
+      goto done;
+    }
+
+    // ...or are we about to?
+    for (vector<Incremental>::iterator p = pending_auth.begin();
+	 p != pending_auth.end();
+	 ++p) {
+      if (p->inc_type == AUTH_DATA) {
+	KeyServerData::Incremental auth_inc;
+	auto q = p->auth_data.cbegin();
+	decode(auth_inc, q);
+	if (auth_inc.op == KeyServerData::AUTH_INC_ADD &&
+	    auth_inc.name == entity) {
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						get_last_committed() + 1));
+	  return true;
+	}
+      }
+    }
+
+    // create it
+    KeyServerData::Incremental auth_inc;
+    auth_inc.op = KeyServerData::AUTH_INC_ADD;
+    auth_inc.name = entity;
+    auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    auth_inc.auth.caps = wanted_caps;
+
+    push_cephx_inc(auth_inc);
+
+    if (prefix == "auth get-or-create-key") {
+      if (f) {
+        auth_inc.auth.key.encode_formatted("auth", f.get(), rdata);
+      } else {
+        ds << auth_inc.auth.key;
+      }
+    } else {
+      KeyRing kr;
+      kr.add(entity, auth_inc.auth.key);
+      if (f) {
+        kr.set_caps(entity, wanted_caps);
+        kr.encode_formatted("auth", f.get(), rdata);
+      } else {
+        kr.encode_plaintext(rdata);
+      }
+    }
+
+    rdata.append(ds);
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "fs authorize") {
+    string filesystem;
+    cmd_getval(cmdmap, "filesystem", filesystem);
+    string mon_cap_string = "allow r";
+    string mds_cap_string, osd_cap_string;
+    string osd_cap_wanted = "r";
+
+    std::shared_ptr<const Filesystem> fs;
+    if (filesystem != "*" && filesystem != "all") {
+      fs = mon.mdsmon()->get_fsmap().get_filesystem(filesystem);
+      if (fs == nullptr) {
+	ss << "filesystem " << filesystem << " does not exist.";
+	err = -EINVAL;
+	goto done;
+      } else {
+	mon_cap_string += " fsname=" + std::string(fs->mds_map.get_fs_name());
+      }
+    }
+
+    // caps_vec is a list of <path> <perms> [root_squash] groups
+    for (auto it = caps_vec.begin();
+	 it != caps_vec.end() && (it + 1) != caps_vec.end();
+	 it += 2) {
+      const string &path = *it;
+      const string &cap = *(it+1);
+      bool root_squash = false;
+      if ((it + 2) != caps_vec.end() && *(it+2) == "root_squash") {
+	root_squash = true;
+	// consume the extra "root_squash" token
+	++it;
+      }
+
+      if (cap != "r" && cap.compare(0, 2, "rw")) {
+	ss << "Permission flags must start with 'r' or 'rw'.";
+	err = -EINVAL;
+	goto done;
+      }
+      if (cap.compare(0, 2, "rw") == 0)
+	osd_cap_wanted = "rw";
+
+      // flags after the leading "rw" must be strictly increasing, which
+      // rejects both mis-ordered and repeated flags
+      char last='\0';
+      for (size_t i = 2; i < cap.size(); ++i) {
+	char c = cap.at(i);
+	if (last >= c) {
+	  ss << "Permission flags (except 'rw') must be specified in alphabetical order.";
+	  err = -EINVAL;
+	  goto done;
+	}
+	switch (c) {
+	case 'p':
+	  break;
+	case 's':
+	  break;
+	default:
+	  ss << "Unknown permission flag '" << c << "'.";
+	  err = -EINVAL;
+	  goto done;
+	}
+	// remember the flag so the next one can be compared against it
+	last = c;
+      }
+
+      mds_cap_string += mds_cap_string.empty() ? "" : ", ";
+      mds_cap_string += "allow " + cap;
+
+      if (filesystem != "*" && filesystem != "all" && fs != nullptr) {
+	mds_cap_string += " fsname=" + std::string(fs->mds_map.get_fs_name());
+      }
+
+      if (path != "/") {
+	mds_cap_string += " path=" + path;
+      }
+
+      if (root_squash) {
+	mds_cap_string += " root_squash";
+      }
+    }
+
+    osd_cap_string += osd_cap_string.empty() ? "" : ", ";
+    osd_cap_string += "allow " + osd_cap_wanted
+      + " tag " + pg_pool_t::APPLICATION_NAME_CEPHFS
+      + " data=" + filesystem;
+
+    std::map<string, bufferlist> wanted_caps = {
+      { "mon", _encode_cap(mon_cap_string) },
+      { "osd", _encode_cap(osd_cap_string) },
+      { "mds", _encode_cap(mds_cap_string) }
+    };
+
+    if (!valid_caps("mon", mon_cap_string, &ss) ||
+        !valid_caps("osd", osd_cap_string, &ss) ||
+	!valid_caps("mds", mds_cap_string, &ss)) {
+      err = -EINVAL;
+      goto done;
+    }
+
+    EntityAuth entity_auth;
+    if (mon.key_server.get_auth(entity, entity_auth)) {
+      for (const auto &sys_cap : wanted_caps) {
+	if (entity_auth.caps.count(sys_cap.first) == 0 ||
+	    !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) {
+	  ss << entity << " already has fs capabilities that differ from "
+	     << "those supplied. To generate a new auth key for " << entity
+	     << ", first remove " << entity << " from configuration files, "
+	     << "execute 'ceph auth rm " << entity << "', then execute this "
+	     << "command again.";
+	  err = -EINVAL;
+	  goto done;
+	}
+      }
+
+      KeyRing kr;
+      kr.add(entity, entity_auth.key);
+      if (f) {
+	kr.set_caps(entity, entity_auth.caps);
+	kr.encode_formatted("auth", f.get(), rdata);
+      } else {
+	kr.encode_plaintext(rdata);
+      }
+      err = 0;
+      goto done;
+    }
+
+    KeyServerData::Incremental auth_inc;
+    auth_inc.op = KeyServerData::AUTH_INC_ADD;
+    auth_inc.name = entity;
+    auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    auth_inc.auth.caps = wanted_caps;
+
+    push_cephx_inc(auth_inc);
+    KeyRing kr;
+    kr.add(entity, auth_inc.auth.key);
+    if (f) {
+      kr.set_caps(entity, wanted_caps);
+      kr.encode_formatted("auth", f.get(), rdata);
+    } else {
+      kr.encode_plaintext(rdata);
+    }
+
+    rdata.append(ds);
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata,
+						  get_last_committed() + 1));
+    return true;
+  } else if (prefix == "auth caps" && !entity_name.empty()) {
+    KeyServerData::Incremental auth_inc;
+    auth_inc.name = entity;
+    if (!mon.key_server.get_auth(auth_inc.name, auth_inc.auth)) {
+      ss << "couldn't find entry " << auth_inc.name;
+      err = -ENOENT;
+      goto done;
+    }
+
+    if (!valid_caps(caps_vec, &ss)) {
+      err = -EINVAL;
+      goto done;
+    }
+
+    // caps_vec has an even number of elements here (checked above for
+    // every prefix other than "fs authorize"), so it+1 is always valid
+    map<string,bufferlist> newcaps;
+    for (vector<string>::iterator it = caps_vec.begin();
+	 it != caps_vec.end(); it += 2)
+      encode(*(it+1), newcaps[*it]);
+
+    auth_inc.op = KeyServerData::AUTH_INC_ADD;
+    auth_inc.auth.caps = newcaps;
+    push_cephx_inc(auth_inc);
+
+    ss << "updated caps for " << auth_inc.name;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if ((prefix == "auth del" || prefix == "auth rm") &&
+             !entity_name.empty()) {
+    KeyServerData::Incremental auth_inc;
+    auth_inc.name = entity;
+    if (!mon.key_server.contains(auth_inc.name)) {
+      // removing something that is not there is reported as success
+      ss << "entity " << entity << " does not exist";
+      err = 0;
+      goto done;
+    }
+    auth_inc.op = KeyServerData::AUTH_INC_DEL;
+    push_cephx_inc(auth_inc);
+
+    ss << "updated";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  }
+done:
+  rdata.append(ds);
+  getline(ss, rs, '\0');
+  mon.reply_command(op, err, rs, rdata, get_last_committed());
+  return false;
+}
+
+/*
+ * Request a larger max_global_id.  The `op` argument is not inspected;
+ * the function always returns true.
+ */
+bool AuthMonitor::prepare_global_id(MonOpRequestRef op)
+{
+  dout(10) << "AuthMonitor::prepare_global_id" << dendl;
+
+  increase_max_global_id();
+  return true;
+}
+
+/*
+ * Format 0 -> 1: rewrite legacy mon caps as profiles.
+ *
+ * For every secret that has a decodable "mon" cap:
+ *  - osd and mds entities whose mon cap is exactly "allow rwx" get
+ *    "allow profile <type>";
+ *  - client.bootstrap-osd / client.bootstrap-mds get the matching
+ *    bootstrap profile.
+ * Returns true if at least one update was queued.
+ */
+bool AuthMonitor::_upgrade_format_to_dumpling()
+{
+  dout(1) << __func__ << " upgrading from format 0 to 1" << dendl;
+  ceph_assert(format_version == 0);
+
+  bool changed = false;
+  for (auto p = mon.key_server.secrets_begin();
+       p != mon.key_server.secrets_end();
+       ++p) {
+    // entities without a mon cap are left alone
+    auto mon_cap = p->second.caps.find("mon");
+    if (mon_cap == p->second.caps.end()) {
+      continue;
+    }
+
+    string old_caps;
+    try {
+      auto it = mon_cap->second.cbegin();
+      decode(old_caps, it);
+    }
+    catch (const ceph::buffer::error&) {
+      dout(10) << __func__ << " unable to parse mon cap for "
+	       << p->first << dendl;
+      continue;
+    }
+
+    const string entity_str = p->first.to_str();
+    string replacement;
+
+    // daemons move to their profile
+    const bool is_daemon = p->first.is_osd() || p->first.is_mds();
+    if (is_daemon && old_caps == "allow rwx") {
+      replacement = string("allow profile ") + std::string(p->first.get_type_name());
+    }
+
+    // bootstrap keys move to their bootstrap profile
+    if (entity_str == "client.bootstrap-osd") {
+      replacement = "allow profile bootstrap-osd";
+    }
+    if (entity_str == "client.bootstrap-mds") {
+      replacement = "allow profile bootstrap-mds";
+    }
+
+    if (replacement.empty()) {
+      continue;
+    }
+
+    dout(5) << __func__ << " updating " << p->first << " mon cap from "
+	    << old_caps << " to " << replacement << dendl;
+
+    bufferlist encoded;
+    encode(replacement, encoded);
+
+    KeyServerData::Incremental auth_inc;
+    auth_inc.name = p->first;
+    auth_inc.auth = p->second;
+    auth_inc.auth.caps["mon"] = encoded;
+    auth_inc.op = KeyServerData::AUTH_INC_ADD;
+    push_cephx_inc(auth_inc);
+    changed = true;
+  }
+  return changed;
+}
+
+/*
+ * Format 1 -> 2: give existing entities mgr caps and fix up mgr keys.
+ *
+ *  - client.admin gets mgr "allow *";
+ *  - osd.* / mds.* / mon.* get mgr "allow profile <type>";
+ *  - any other entity that has a mon cap gets mgr "allow r";
+ *  - mgr.* entities whose mon cap is "allow *" get mon "allow profile mgr".
+ * All changes for one entity are collected in a single copy and submitted
+ * with one add_entity() call, so a later change cannot discard an earlier
+ * one.  Finally client.bootstrap-mgr is created if it does not exist.
+ * Returns true if anything was queued.
+ */
+bool AuthMonitor::_upgrade_format_to_luminous()
+{
+  dout(1) << __func__ << " upgrading from format 1 to 2" << dendl;
+  ceph_assert(format_version == 1);
+
+  bool changed = false;
+  map<EntityName, EntityAuth>::iterator p;
+  for (p = mon.key_server.secrets_begin();
+       p != mon.key_server.secrets_end();
+       ++p) {
+    string n = p->first.to_str();
+
+    // working copy that accumulates every change for this entity
+    EntityAuth auth = p->second;
+    bool entity_changed = false;
+
+    string newcap;
+    if (n == "client.admin") {
+      // admin gets it all
+      newcap = "allow *";
+    } else if (n.find("osd.") == 0 ||
+	       n.find("mds.") == 0 ||
+	       n.find("mon.") == 0) {
+      // daemons follow their profile
+      string type = n.substr(0, 3);
+      newcap = "allow profile " + type;
+    } else if (p->second.caps.count("mon")) {
+      // if there are any mon caps, give them 'r' mgr caps
+      newcap = "allow r";
+    }
+
+    if (newcap.length() > 0) {
+      dout(5) << " giving " << n << " mgr '" << newcap << "'" << dendl;
+      bufferlist bl;
+      encode(newcap, bl);
+
+      auth.caps["mgr"] = bl;
+      entity_changed = true;
+    }
+
+    if (n.find("mgr.") == 0 &&
+	p->second.caps.count("mon")) {
+      // the kraken ceph-mgr@.service set the mon cap to 'allow *'.
+      string oldcaps;
+      bool decoded = true;
+      try {
+	auto blp = p->second.caps["mon"].cbegin();
+	decode(oldcaps, blp);
+      } catch (const ceph::buffer::error&) {
+	// same handling as the format 0 -> 1 upgrade: skip this cap
+	dout(10) << __func__ << " unable to parse mon cap for "
+		 << p->first << dendl;
+	decoded = false;
+      }
+      if (decoded && oldcaps == "allow *") {
+	dout(5) << " fixing " << n << " mon cap to 'allow profile mgr'"
+		<< dendl;
+	bufferlist bl;
+	encode("allow profile mgr", bl);
+
+	auth.caps["mon"] = bl;
+	entity_changed = true;
+      }
+    }
+
+    if (entity_changed) {
+      // submit the modified copy, not the stored entry
+      add_entity(p->first, auth);
+      changed = true;
+    }
+  }
+
+  // add bootstrap key if it does not already exist
+  // (might have already been get-or-create'd by
+  //  ceph-create-keys)
+  EntityName bootstrap_mgr_name;
+  int r = bootstrap_mgr_name.from_str("client.bootstrap-mgr");
+  ceph_assert(r);
+  if (!mon.key_server.contains(bootstrap_mgr_name)) {
+
+    EntityName name = bootstrap_mgr_name;
+    EntityAuth auth;
+    encode("allow profile bootstrap-mgr", auth.caps["mon"]);
+    auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    add_entity(name, auth);
+    changed = true;
+  }
+  return changed;
+}
+
+/*
+ * Format 2 -> 3: add every key produced by _generate_bootstrap_keys()
+ * that the key server does not contain yet.  Returns true if at least
+ * one key was added.
+ */
+bool AuthMonitor::_upgrade_format_to_mimic()
+{
+  dout(1) << __func__ << " upgrading from format 2 to 3" << dendl;
+  ceph_assert(format_version == 2);
+
+  list<pair<EntityName,EntityAuth> > bootstrap_keys;
+  _generate_bootstrap_keys(&bootstrap_keys);
+
+  bool changed = false;
+  for (auto &entry : bootstrap_keys) {
+    if (!mon.key_server.contains(entry.first)) {
+      int r = add_entity(entry.first, entry.second);
+      ceph_assert(r == 0);
+      changed = true;
+    }
+  }
+
+  return changed;
+}
+
+/*
+ * Bring the auth database format up to what the current quorum supports.
+ *
+ * The target format is derived from the quorum's mon features.  If the
+ * stored format_version is older, the upgrade step matching the stored
+ * version runs, and if it queued any change the new format is recorded
+ * and proposed.
+ */
+void AuthMonitor::upgrade_format()
+{
+  constexpr unsigned int FORMAT_NONE = 0;
+  constexpr unsigned int FORMAT_DUMPLING = 1;
+  constexpr unsigned int FORMAT_LUMINOUS = 2;
+  constexpr unsigned int FORMAT_MIMIC = 3;
+
+  // when upgrading from the current format to a new format, ensure that
+  // the new format doesn't break the older format. I.e., if a given format N
+  // changes or adds something, ensure that when upgrading from N-1 to N+1, we
+  // still observe the changes for format N if those have not been superseded
+  // by N+1.
+
+  // newest format the quorum can handle
+  unsigned int current;
+  if (!mon.get_quorum_mon_features().contains_all(
+	ceph::features::mon::FEATURE_LUMINOUS)) {
+    // pre-luminous quorum
+    current = FORMAT_DUMPLING;
+  } else if (!mon.get_quorum_mon_features().contains_all(
+	ceph::features::mon::FEATURE_MIMIC)) {
+    // pre-mimic quorum
+    current = FORMAT_LUMINOUS;
+  } else {
+    current = FORMAT_MIMIC;
+  }
+
+  if (format_version >= current) {
+    dout(20) << __func__ << " format " << format_version
+	     << " is current" << dendl;
+    return;
+  }
+
+  // perform a rolling upgrade of the new format, if necessary.
+  // i.e., if we are moving from format NONE to MIMIC, we will first upgrade
+  // to DUMPLING, then to LUMINOUS, and finally to MIMIC, in several different
+  // proposals.
+  //
+  // NOTE(review): only one step runs per call, yet format_version is set
+  // straight to `current` below rather than to the step that just ran --
+  // confirm that this matches the rolling upgrade described above.
+
+  bool changed = false;
+  switch (format_version) {
+  case FORMAT_NONE:
+    changed = _upgrade_format_to_dumpling();
+    break;
+  case FORMAT_DUMPLING:
+    changed = _upgrade_format_to_luminous();
+    break;
+  case FORMAT_LUMINOUS:
+    changed = _upgrade_format_to_mimic();
+    break;
+  default:
+    break;
+  }
+
+  if (!changed) {
+    return;
+  }
+
+  // note new format
+  dout(10) << __func__ << " proposing update from format " << format_version
+	   << " -> " << current << dendl;
+  format_version = current;
+  propose_pending();
+}
+
+/*
+ * Emit an "auth" object section holding the first/last committed
+ * versions and the number of secrets known to the key server.
+ */
+void AuthMonitor::dump_info(Formatter *f)
+{
+  /*** WARNING: do not include any privileged information here! ***/
+  f->open_object_section("auth");
+
+  // paxos versions
+  f->dump_unsigned("first_committed", get_first_committed());
+  f->dump_unsigned("last_committed", get_last_committed());
+
+  // only the count, never the secrets themselves
+  f->dump_unsigned("num_secrets", mon.key_server.get_num_secrets());
+
+  f->close_section();
+}
diff --git a/src/mon/AuthMonitor.h b/src/mon/AuthMonitor.h
new file mode 100644
index 000000000..4312b5607
--- /dev/null
+++ b/src/mon/AuthMonitor.h
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_AUTHMONITOR_H
+#define CEPH_AUTHMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "global/global_init.h"
+#include "include/ceph_features.h"
+#include "include/types.h"
+#include "mon/PaxosService.h"
+#include "mon/MonitorDBStore.h"
+
+class MAuth;
+class KeyRing;
+class Monitor;
+
+#define MIN_GLOBAL_ID 0x1000
+
+// Paxos service that owns the monitor's auth state: the pending list of
+// auth changes and the global id allocation counters declared below.
+class AuthMonitor : public PaxosService {
+public:
+  // discriminates the two kinds of Incremental
+  enum IncType {
+    GLOBAL_ID,
+    AUTH_DATA,
+  };
+  // One pending change.  A GLOBAL_ID incremental carries only
+  // max_global_id; an AUTH_DATA incremental carries auth_type plus an
+  // opaque auth_data payload.
+  struct Incremental {
+    IncType inc_type;
+    uint64_t max_global_id;
+    uint32_t auth_type;
+    ceph::buffer::list auth_data;
+
+    Incremental() : inc_type(GLOBAL_ID), max_global_id(0), auth_type(0) {}
+
+    // Only the fields relevant to inc_type are written; `features` is
+    // not consulted here.
+    void encode(ceph::buffer::list& bl, uint64_t features=-1) const {
+      using ceph::encode;
+      ENCODE_START(2, 2, bl);
+      __u32 _type = (__u32)inc_type;
+      encode(_type, bl);
+      if (_type == GLOBAL_ID) {
+	encode(max_global_id, bl);
+      } else {
+	encode(auth_type, bl);
+	encode(auth_data, bl);
+      }
+      ENCODE_FINISH(bl);
+    }
+    // Mirror of encode().  Asserts that the decoded type is one of the
+    // IncType values.
+    void decode(ceph::buffer::list::const_iterator& bl) {
+      DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+      __u32 _type;
+      decode(_type, bl);
+      inc_type = (IncType)_type;
+      ceph_assert(inc_type >= GLOBAL_ID && inc_type <= AUTH_DATA);
+      if (_type == GLOBAL_ID) {
+	decode(max_global_id, bl);
+      } else {
+	decode(auth_type, bl);
+	decode(auth_data, bl);
+      }
+      DECODE_FINISH(bl);
+    }
+    // Dumps the type, ids and only the *length* of auth_data.
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("type", inc_type);
+      f->dump_int("max_global_id", max_global_id);
+      f->dump_int("auth_type", auth_type);
+      f->dump_int("auth_data_len", auth_data.length());
+    }
+    // Produces a default instance, a GLOBAL_ID instance and an AUTH_DATA
+    // instance; the caller owns the pointers pushed into `ls`.
+    static void generate_test_instances(std::list<Incremental*>& ls) {
+      ls.push_back(new Incremental);
+      ls.push_back(new Incremental);
+      ls.back()->inc_type = GLOBAL_ID;
+      ls.back()->max_global_id = 1234;
+      ls.push_back(new Incremental);
+      ls.back()->inc_type = AUTH_DATA;
+      ls.back()->auth_type = 12;
+      ls.back()->auth_data.append("foo");
+    }
+  };
+
+  // an entity name together with its auth record (key and caps)
+  struct auth_entity_t {
+    EntityName name;
+    EntityAuth auth;
+  };
+
+
+private:
+  // changes queued since the last proposal
+  std::vector<Incremental> pending_auth;
+  uint64_t max_global_id;
+  uint64_t last_allocated_id;
+
+  // these are protected by mon->auth_lock
+  int mon_num = 0, mon_rank = 0;
+
+  bool _upgrade_format_to_dumpling();
+  bool _upgrade_format_to_luminous();
+  bool _upgrade_format_to_mimic();
+  void upgrade_format() override;
+
+  void export_keyring(KeyRing& keyring);
+  int import_keyring(KeyRing& keyring);
+
+  // Wrap a cephx key server incremental into an AUTH_DATA Incremental
+  // tagged CEPH_AUTH_CEPHX and append it to pending_auth.
+  void push_cephx_inc(KeyServerData::Incremental& auth_inc) {
+    Incremental inc;
+    inc.inc_type = AUTH_DATA;
+    encode(auth_inc, inc.auth_data);
+    inc.auth_type = CEPH_AUTH_CEPHX;
+    pending_auth.push_back(inc);
+  }
+
+  /* validate mon/osd/mds caps; fail on unrecognized service/type */
+  bool valid_caps(const std::string& type, const std::string& caps, std::ostream *out);
+  // Same check for caps stored as an encoded string.  Note that `out` is
+  // dereferenced unconditionally when the buffer cannot be decoded.
+  bool valid_caps(const std::string& type, const ceph::buffer::list& bl, std::ostream *out) {
+    auto p = bl.begin();
+    std::string v;
+    try {
+      using ceph::decode;
+      decode(v, p);
+    } catch (ceph::buffer::error& e) {
+      *out << "corrupt capability encoding";
+      return false;
+    }
+    return valid_caps(type, v, out);
+  }
+  // validates a flat [type, caps, type, caps, ...] list
+  bool valid_caps(const std::vector<std::string>& caps, std::ostream *out);
+
+  void on_active() override;
+  bool should_propose(double& delay) override;
+  void get_initial_keyring(KeyRing *keyring);
+  void create_initial_keys(KeyRing *keyring);
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;  // prepare a new pending
+  bool prepare_global_id(MonOpRequestRef op);
+  bool _should_increase_max_global_id(); ///< called under mon->auth_lock
+  void increase_max_global_id();
+  uint64_t assign_global_id(bool should_increase_max);
+public:
+  uint64_t _assign_global_id(); ///< called under mon->auth_lock
+  void _set_mon_num_rank(int num, int rank); ///< called under mon->auth_lock
+
+private:
+  // propose pending update to peers
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  void encode_full(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool prep_auth(MonOpRequestRef op, bool paxos_writable);
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  bool check_rotate();
+
+  bool entity_is_pending(EntityName& entity);
+  int exists_and_matches_entity(
+      const auth_entity_t& entity,
+      bool has_secret,
+      std::stringstream& ss);
+  int exists_and_matches_entity(
+      const EntityName& name,
+      const EntityAuth& auth,
+      const std::map<std::string,ceph::buffer::list>& caps,
+      bool has_secret,
+      std::stringstream& ss);
+  int remove_entity(const EntityName &entity);
+  int add_entity(
+      const EntityName& name,
+      const EntityAuth& auth);
+
+ public:
+  AuthMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name),
+      max_global_id(0),
+      last_allocated_id(0)
+  {}
+
+  void pre_auth(MAuth *m);
+
+  void tick() override;  // check state, take actions
+
+  // `osd destroy` support: validate first, then apply.
+  int validate_osd_destroy(
+      int32_t id,
+      const uuid_d& uuid,
+      EntityName& cephx_entity,
+      EntityName& lockbox_entity,
+      std::stringstream& ss);
+  int do_osd_destroy(
+      const EntityName& cephx_entity,
+      const EntityName& lockbox_entity);
+
+  // `osd new` support: validate first, then apply.
+  int do_osd_new(
+      const auth_entity_t& cephx_entity,
+      const auth_entity_t& lockbox_entity,
+      bool has_lockbox);
+  int validate_osd_new(
+      int32_t id,
+      const uuid_d& uuid,
+      const std::string& cephx_secret,
+      const std::string& lockbox_secret,
+      auth_entity_t& cephx_entity,
+      auth_entity_t& lockbox_entity,
+      std::stringstream& ss);
+
+  void dump_info(ceph::Formatter *f);
+
+  // True iff `k` is non-empty and decode_base64() accepts it without
+  // throwing ceph::buffer::error.  The decoded key is discarded.
+  bool is_valid_cephx_key(const std::string& k) {
+    if (k.empty())
+      return false;
+
+    EntityAuth ea;
+    try {
+      ea.key.decode_base64(k);
+      return true;
+    } catch (ceph::buffer::error& e) { /* fallthrough */ }
+    return false;
+  }
+};
+
+
+WRITE_CLASS_ENCODER_FEATURES(AuthMonitor::Incremental)
+
+#endif
diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt
new file mode 100644
index 000000000..b4056fdb1
--- /dev/null
+++ b/src/mon/CMakeLists.txt
@@ -0,0 +1,42 @@
+set(lib_mon_srcs  # sources compiled into the static "mon" library defined below
+  ${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxKeyServer.cc
+  ${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxServiceHandler.cc
+  ${CMAKE_SOURCE_DIR}/src/auth/AuthServiceHandler.cc
+  Paxos.cc
+  PaxosService.cc
+  OSDMonitor.cc
+  MDSMonitor.cc
+  CommandHandler.cc
+  FSCommands.cc
+  MgrMonitor.cc
+  MgrStatMonitor.cc
+  Monitor.cc
+  MonmapMonitor.cc
+  LogMonitor.cc
+  AuthMonitor.cc
+  ConfigMap.cc
+  ConfigMonitor.cc
+  Elector.cc
+  ElectionLogic.cc
+  ConnectionTracker.cc
+  HealthMonitor.cc
+  KVMonitor.cc
+  ../mds/MDSAuthCaps.cc  # the next three come from sibling directories
+  ../mgr/mgr_commands.cc
+  ../osd/OSDCap.cc
+  $<TARGET_OBJECTS:mgr_cap_obj>)  # object files of the mgr_cap_obj target
+
+if(HAVE_GSSAPI)  # the krb service handler is only added when HAVE_GSSAPI is set
+  list(APPEND lib_mon_srcs
+    ${CMAKE_SOURCE_DIR}/src/auth/krb/KrbServiceHandler.cpp)
+endif()
+
+add_library(mon STATIC
+  ${lib_mon_srcs})
+target_link_libraries(mon
+  kv
+  heap_profiler
+  fmt::fmt)
+if(WITH_JAEGER)  # jaeger-base is linked only when WITH_JAEGER is set
+  target_link_libraries(mon jaeger-base)
+endif()
diff --git a/src/mon/CommandHandler.cc b/src/mon/CommandHandler.cc
new file mode 100644
index 000000000..903d35927
--- /dev/null
+++ b/src/mon/CommandHandler.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "CommandHandler.h"
+
+#include "common/strtol.h"
+#include "include/ceph_assert.h"
+
+#include <ostream>
+#include <string>
+#include <string_view>
+
+int CommandHandler::parse_bool(std::string_view str, bool* result, std::ostream& ss)
+{
+  // Accept false|no|0 / true|yes|1 into *result (returns 0), else explain in ss (-EINVAL).
+  ceph_assert(result != nullptr);
+  // string_view::data() is not guaranteed NUL-terminated: parse an owned copy.
+  const std::string owned(str);
+  std::string interr;  // treated as "numeric parse succeeded" when left empty
+  const int64_t n = strict_strtoll(owned.c_str(), 10, &interr);
+  const bool numeric = interr.empty();
+
+  if (str == "false" || str == "no" || (numeric && n == 0)) {
+    *result = false;
+  } else if (str == "true" || str == "yes" || (numeric && n == 1)) {
+    *result = true;
+  } else {
+    ss << "value must be false|no|0 or true|yes|1";
+    return -EINVAL;
+  }
+  return 0;
+}
diff --git a/src/mon/CommandHandler.h b/src/mon/CommandHandler.h
new file mode 100644
index 000000000..167b4587f
--- /dev/null
+++ b/src/mon/CommandHandler.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef COMMAND_HANDLER_H_
+#define COMMAND_HANDLER_H_
+
+#include <ostream>
+#include <string_view>
+
+class CommandHandler
+{
+public:
+  /**
+   * Parse a false|no|0 or true|yes|1 style boolean string from `str`.
+   * `result` must be non-null.
+   * `ss` will be populated with error message on error.
+   *
+   * @return 0 on success, else -EINVAL
+   */
+  int parse_bool(std::string_view str, bool* result, std::ostream& ss);
+};
+
+#endif
diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc
new file mode 100644
index 000000000..763b8ce9b
--- /dev/null
+++ b/src/mon/ConfigMap.cc
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string/split.hpp>
+
+#include "ConfigMap.h"
+#include "crush/CrushWrapper.h"
+#include "common/entity_name.h"
+
+using namespace std::literals;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+int MaskedOption::get_precision(const CrushWrapper *crush)
+{
+  // Smaller is more precise (0 = most precise).  A location mask whose type
+  // crush knows ranks by that type id; otherwise a device-class mask ranks
+  // at the number of crush type names and anything else one past that.
+  if (!mask.location_type.empty()) {
+    const int type_id = crush->get_type_id(mask.location_type);
+    if (type_id >= 0) {
+      return type_id;
+    }
+    // bad type name, ignore it
+  }
+
+  const int type_count = crush->get_num_type_names();
+  return mask.device_class.empty() ? type_count + 1 : type_count;
+}
+
+// Emit only the mask components that are set; an empty mask emits nothing.
+void OptionMask::dump(Formatter *f) const
+{
+  if (!location_type.empty()) {  // type and value are always dumped as a pair
+    f->dump_string("location_type", location_type);
+    f->dump_string("location_value", location_value);
+  }
+  if (!device_class.empty())
+    f->dump_string("device_class", device_class);
+}
+
+void MaskedOption::dump(Formatter *f) const  // dump this option and its mask into f
+{
+  f->dump_string("name", opt->name);
+  f->dump_string("value", raw_value);  // the stored string, not a parsed value
+  f->dump_string("level", Option::level_to_str(opt->level));
+  f->dump_bool("can_update_at_runtime", opt->can_update_at_runtime());
+  f->dump_string("mask", mask.to_str());  // the mask as one string ...
+  mask.dump(f);  // ... followed by whatever OptionMask::dump emits
+}
+
+// Print as name[@<location_type>=<location_value>][@class=<device_class>].
+ostream& operator<<(ostream& out, const MaskedOption& o)
+{
+  const OptionMask& m = o.mask;
+  out << o.opt->name;
+  if (!m.location_type.empty())
+    out << "@" << m.location_type << '=' << m.location_value;
+  if (!m.device_class.empty())
+    out << "@class=" << m.device_class;
+  return out;
+}
+
+// ----------
+
+// Dump every entry of `options` as an object named after its map key.
+void Section::dump(Formatter *f) const
+{
+  for (const auto& [opt_name, masked] : options)
+    f->dump_object(opt_name.c_str(), masked);
+}
+
+// Render the options flagged NO_MON_UPDATE or MINIMAL_CONF as conf lines;
+// masked ones are emitted only as a comment noting their exclusion.
+std::string Section::get_minimal_conf() const
+{
+  std::string conf;
+  for (const auto& [key, mo] : options) {
+    const bool wanted = mo.opt->has_flag(Option::FLAG_NO_MON_UPDATE) ||
+      mo.opt->has_flag(Option::FLAG_MINIMAL_CONF);
+    if (!wanted) {
+      continue;
+    }
+    const char *lead = mo.mask.empty() ? "\t" : "\t# masked option excluded: ";
+    conf += lead + key + " = " + mo.raw_value + "\n";
+  }
+  return conf;
+}
+
+
+// ------------
+
+// Dump the global section, then "by_type" and "by_id" objects holding one
+// entry per section, each named after its map key.
+void ConfigMap::dump(Formatter *f) const
+{
+  auto dump_group = [f](const char *label, const auto& group) {
+    f->open_object_section(label);
+    for (const auto& [sec_name, sec] : group)
+      f->dump_object(sec_name.c_str(), sec);
+    f->close_section();
+  };
+  f->dump_object("global", global);
+  dump_group("by_type", by_type);
+  dump_group("by_id", by_id);
+}
+
+std::map<std::string,std::string,std::less<>>
+ConfigMap::generate_entity_map(
+  const EntityName& name,
+  const map<std::string,std::string>& crush_location,
+  const CrushWrapper *crush,
+  const std::string& device_class,
+  std::map<std::string,pair<std::string,const MaskedOption*>> *src)  // optional: filled with (section, option) per key
+{
+  // global, then by type, then by name prefix component(s), then name.
+  // name prefix components are .-separated,
+  // e.g. client.a.b.c -> [global, client, client.a, client.a.b, client.a.b.c]
+  vector<pair<string,Section*>> sections = { make_pair("global", &global) };
+  auto p = by_type.find(name.get_type_name());
+  if (p != by_type.end()) {
+    sections.emplace_back(name.get_type_name(), &p->second);
+  }
+  vector<std::string> name_bits;
+  boost::split(name_bits, name.to_str(), [](char c){ return c == '.'; });
+  std::string tname;  // grows by one '.'-separated component per iteration
+  for (unsigned p = 0; p < name_bits.size(); ++p) {  // note: shadows the iterator `p` above
+    if (p) {
+      tname += '.';
+    }
+    tname += name_bits[p];
+    auto q = by_id.find(tname);
+    if (q != by_id.end()) {
+      sections.push_back(make_pair(tname, &q->second));
+    }
+  }
+  std::map<std::string,std::string,std::less<>> out;
+  MaskedOption *prev = nullptr;  // last entry accepted into `out`, if any
+  for (auto s : sections) {  // sections are visited in order, so later ones overwrite earlier ones
+    for (auto& i : s.second->options) {
+      auto& o = i.second;
+      // skip entries whose device class or crush location mask does not match
+      if (o.mask.device_class.size() &&
+	  o.mask.device_class != device_class) {
+	continue;
+      }
+      if (o.mask.location_type.size()) {
+	auto p = crush_location.find(o.mask.location_type);
+	if (p == crush_location.end() ||
+	    p->second != o.mask.location_value) {
+	  continue;
+	}
+      }
+      if (prev && prev->opt->name != i.first) {  // precision is only compared between same-named entries
+	prev = nullptr;
+      }
+      if (prev &&
+	  prev->get_precision(crush) < o.get_precision(crush)) {  // previous one is more precise: keep it
+	continue;
+      }
+      out[i.first] = o.raw_value;
+      if (src) {
+	(*src)[i.first] = make_pair(s.first, &o);  // remember which section and entry supplied the value
+      }
+      prev = &o;
+    }
+  }
+  return out;
+}
+
+// Split `who` on '/' and classify each component: "global" or a daemon
+// type/name selects the section, "class:<c>" sets the device class, and any
+// other "<type>:<value>" sets the crush location.  Returns false as soon as
+// a component without ':' has a type for which str_to_ceph_entity_type()
+// yields CEPH_ENTITY_TYPE_ANY; outputs written before that point are kept.
+bool ConfigMap::parse_mask(
+  const std::string& who,
+  std::string *section,
+  OptionMask *mask)
+{
+  vector<std::string> parts;
+  boost::split(parts, who, [](char c){ return c == '/'; });
+  for (const auto& part : parts) {
+    if (part == "global") {
+      *section = "global";
+      continue;
+    }
+    const size_t colon = part.find(':');
+    if (colon != std::string::npos) {
+      const string lhs = part.substr(0, colon);
+      const string rhs = part.substr(colon + 1);
+      if (lhs == "class") {
+	mask->device_class = rhs;
+      } else {
+	mask->location_type = lhs;
+	mask->location_value = rhs;
+      }
+      continue;
+    }
+    // the type is everything before the first '.', or the whole component
+    // when there is no '.' (substr with npos takes the rest of the string)
+    string type = part.substr(0, part.find('.'));
+    if (EntityName::str_to_ceph_entity_type(type) == CEPH_ENTITY_TYPE_ANY) {
+      return false;
+    }
+    *section = part;
+  }
+  return true;
+}
+
+// Split "<who>/<name>" at the first "/mgr/" if present, else at the last
+// '/'.  Without any '/' the whole key is the name and *who is not written.
+void ConfigMap::parse_key(
+  const std::string& key,
+  std::string *name,
+  std::string *who)
+{
+  auto cut = key.rfind('/');
+  if (cut == std::string::npos) {
+    *name = key;
+    return;
+  }
+  if (auto mgr = key.find("/mgr/"); mgr != std::string::npos) cut = mgr;
+  *name = key.substr(cut + 1);
+  *who = key.substr(0, cut);
+}
+
+
+// --------------
+
+// Dump version, timestamp, name and a "changes" array (old/new value per key).
+void ConfigChangeSet::dump(Formatter *f) const
+{
+  f->dump_int("version", version);
+  f->dump_stream("timestamp") << stamp;
+  f->dump_string("name", name);
+  f->open_array_section("changes");
+  for (const auto& [key, change] : diff) {
+    const auto& [before, after] = change;
+    f->open_object_section("change");
+    f->dump_string("name", key);
+    if (before)
+      f->dump_string("previous_value", *before);
+    if (after)
+      f->dump_string("new_value", *after);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Print a "--- <version> --- <stamp>[ --- <name>] ---" header followed by
+// "- key = old" / "+ key = new" lines for the values present in each change.
+void ConfigChangeSet::print(ostream& out) const
+{
+  out << "--- " << version << " --- " << stamp;
+  if (!name.empty())
+    out << " --- " << name;
+  out << " ---\n";
+  for (const auto& [key, change] : diff) {
+    const auto& [before, after] = change;
+    if (before)
+      out << "- " << key << " = " << *before << "\n";
+    if (after)
+      out << "+ " << key << " = " << *after << "\n";
+  }
+}
diff --git a/src/mon/ConfigMap.h b/src/mon/ConfigMap.h
new file mode 100644
index 000000000..2ecdcc071
--- /dev/null
+++ b/src/mon/ConfigMap.h
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <string>
+
+#include "include/utime.h"
+#include "common/options.h"
+#include "common/entity_name.h"
+
+class CrushWrapper;
+
+// the precedence is thus:
+//
+//  global
+//   crush location (coarse to fine, ordered by type id)
+//  daemon type (e.g., osd)
+//   device class (osd only)
+//   crush location (coarse to fine, ordered by type id)
+//  daemon name (e.g., mds.foo)
+//
+// Note that this means that if we have
+//
+//  config/host:foo/a = 1
+//  config/osd/rack:foo/a = 2
+//
+// then we get a = 2.  The osd-level config wins, even though rack
+// is less precise than host, because the crush limiters are only
+// resolved within a section (global, per-daemon, per-instance).
+
+/// Restricts a config option to a crush location and/or a device class.
+/// With all three fields empty the mask is considered empty.
+struct OptionMask {
+  std::string location_type, location_value; ///< matches crush_location
+  std::string device_class;                  ///< matches device class
+
+  /// true when none of the three fields is set
+  bool empty() const {
+    return location_type.empty() && location_value.empty() &&
+      device_class.empty();
+  }
+
+  /// "<type>:<value>", "class:<class>", both joined by '/', or "" if unset
+  std::string to_str() const {
+    std::string out;
+    if (!location_type.empty()) {
+      out = location_type + ":" + location_value;
+    }
+    if (!device_class.empty()) {
+      out += (out.empty() ? "class:" : "/class:") + device_class;
+    }
+    return out;
+  }
+  void dump(ceph::Formatter *f) const;
+};
+
+/// One stored value for an option together with the mask limiting where it
+/// applies.  Move-constructible; copy and both assignments are unavailable.
+struct MaskedOption {
+  std::string raw_value;               ///< raw, unparsed, unvalidated value
+  const Option *opt;              ///< the option
+  OptionMask mask;
+  std::unique_ptr<const Option> unknown_opt; ///< if fabricated for an unknown option
+
+  /// With fab=true this object takes ownership of `o` through unknown_opt.
+  MaskedOption(const Option *o, bool fab=false)
+    : opt(o), unknown_opt(fab ? o : nullptr) {}
+  /// Moves the value, the mask and any owned fabricated option out of `o`.
+  MaskedOption(MaskedOption&& o)
+    : raw_value(std::move(o.raw_value)),
+      opt(o.opt),
+      mask(std::move(o.mask)),
+      unknown_opt(std::move(o.unknown_opt)) {}
+  const MaskedOption& operator=(const MaskedOption& o) = delete;
+  const MaskedOption& operator=(MaskedOption&& o) = delete;
+
+  /// return a precision metric (smaller is more precise)
+  int get_precision(const CrushWrapper *crush);
+
+  friend std::ostream& operator<<(std::ostream& out, const MaskedOption& o);
+
+  void dump(ceph::Formatter *f) const;
+};
+
+/// The options held by one config section (global, a type, or an id).
+struct Section {
+  /// a multimap, so the same key may hold several (differently masked) entries
+  std::multimap<std::string,MaskedOption> options;
+
+  void clear() { options.clear(); }
+  void dump(ceph::Formatter *f) const;
+  std::string get_minimal_conf() const;
+};
+
+/// The whole config: the global section plus per-type and per-id sections.
+struct ConfigMap {
+  Section global;
+  std::map<std::string,Section, std::less<>> by_type;
+  std::map<std::string,Section, std::less<>> by_id;
+  std::list<std::unique_ptr<Option>> stray_options;
+
+  /// Try "global", then by_type, then by_id; nullptr when nothing matches.
+  Section *find_section(const std::string& name) {
+    if (name == "global") {
+      return &global;
+    }
+    for (auto *group : { &by_type, &by_id }) {
+      if (auto hit = group->find(name); hit != group->end()) {
+	return &hit->second;
+      }
+    }
+    return nullptr;
+  }
+  /// Empty every section and the list of stray options.
+  void clear() {
+    global.clear();
+    by_type.clear();
+    by_id.clear();
+    stray_options.clear();
+  }
+  void dump(ceph::Formatter *f) const;
+  std::map<std::string,std::string,std::less<>> generate_entity_map(
+    const EntityName& name,
+    const std::map<std::string,std::string>& crush_location,
+    const CrushWrapper *crush,
+    const std::string& device_class,
+    std::map<std::string,std::pair<std::string,const MaskedOption*>> *src=nullptr);
+
+  void parse_key(
+    const std::string& key,
+    std::string *name,
+    std::string *who);
+  static bool parse_mask(
+    const std::string& in,
+    std::string *section,
+    OptionMask *mask);
+};
+
+
+struct ConfigChangeSet {  // one versioned set of config changes
+  version_t version;  ///< dumped as "version"
+  utime_t stamp;      ///< dumped as "timestamp"
+  std::string name;   ///< label; print() leaves it out when empty
+
+  // key -> (old value, new value); either side may be unset
+  std::map<std::string,std::pair<boost::optional<std::string>,boost::optional<std::string>>> diff;
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream& out) const;
+};
diff --git a/src/mon/ConfigMonitor.cc b/src/mon/ConfigMonitor.cc
new file mode 100644
index 000000000..c82a8417a
--- /dev/null
+++ b/src/mon/ConfigMonitor.cc
@@ -0,0 +1,1028 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "mon/Monitor.h"
+#include "mon/ConfigMonitor.h"
+#include "mon/KVMonitor.h"
+#include "mon/MgrMonitor.h"
+#include "mon/OSDMonitor.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+#include "messages/MMonCommand.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/cmdparse.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+using namespace TOPNSPC::common;
+
+using namespace std::literals;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon,  // used via the dout_prefix macro above
+                        const ConfigMonitor *hmon) {  // hmon is not used in the body
+  return *_dout << "mon." << mon.name << "@" << mon.rank
+		<< "(" << mon.get_state_name() << ").config ";
+}
+
+const string KEY_PREFIX("config/");  // prepended to each pending key before it is handed to the kv monitor
+const string HISTORY_PREFIX("config-history/");  // root of the per-version history keys ("<prefix><version>/...")
+
+// Nothing to set up beyond forwarding the arguments to PaxosService.
+ConfigMonitor::ConfigMonitor(Monitor &m, Paxos &p, const string& service_name)
+  : PaxosService(m, p, service_name) {}
+
+void ConfigMonitor::init()  // does nothing except log its own invocation
+{
+  dout(10) << __func__ << dendl;
+}
+
+void ConfigMonitor::create_initial()  // reset to version 0 with no pending changes
+{
+  dout(10) << __func__ << dendl;
+  version = 0;
+  pending.clear();  // pending_description is not touched here
+}
+
+// If the committed version differs: adopt it, load_config(), check_all_subs().
+void ConfigMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  const auto committed = get_last_committed();
+  if (committed == version) return;  // already current; need_bootstrap is never written
+  version = committed;
+  dout(10) << __func__ << " " << version << dendl;
+  load_config();
+  check_all_subs();
+}
+
+void ConfigMonitor::create_pending()  // discard staged changes and their description
+{
+  dout(10) << " " << version << dendl;
+  pending.clear();
+  pending_description.clear();
+}
+
+void ConfigMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << " " << (version+1) << dendl;
+  put_last_committed(t, version+1);  // the only write made to `t` here is this version bump
+  // NOTE: caller should have done encode_pending_to_kvmon() and
+  // kvmon->propose_pending() to commit the actual config changes.
+}
+
+void ConfigMonitor::encode_pending_to_kvmon()
+{
+  // we need to pass our data through KVMonitor so that it is properly
+  // versioned and shared with subscribers.
+  for (auto& [key, value] : pending_cleanup) {
+    if (pending.count(key) == 0) {  // an explicit pending change for the key takes priority
+      derr << __func__ << " repair: adjusting config key '" << key << "'"
+	   << dendl;
+      pending[key] = value;
+    }
+  }
+  pending_cleanup.clear();
+
+  // TODO: record changed sections (osd, mds.foo, rack:bar, ...)
+
+  string history = HISTORY_PREFIX + stringify(version+1) + "/";  // history keys for the next version
+  {
+    bufferlist metabl;  // metadata record: timestamp followed by the description
+    ::encode(ceph_clock_now(), metabl);
+    ::encode(pending_description, metabl);
+    mon.kvmon()->enqueue_set(history, metabl);
+  }
+  for (auto& p : pending) {  // p.second unset means "remove this key"
+    string key = KEY_PREFIX + p.first;
+    auto q = current.find(p.first);
+    if (q != current.end()) {
+      if (p.second && *p.second == q->second) {
+	continue;  // value unchanged: nothing to write
+      }
+      mon.kvmon()->enqueue_set(history + "-" + p.first, q->second);  // record the old value under "-"
+    } else if (!p.second) {
+      continue;  // removing a key that does not exist
+    }
+    if (p.second) {
+      dout(20) << __func__ << " set " << key << dendl;
+      mon.kvmon()->enqueue_set(key, *p.second);
+      mon.kvmon()->enqueue_set(history + "+" + p.first, *p.second);  // record the new value under "+"
+   } else {
+      dout(20) << __func__ << " rm " << key << dendl;
+      mon.kvmon()->enqueue_rm(key);
+    }
+  }
+}
+
+// Returns version - 5 once version exceeds 5, otherwise 0.
+version_t ConfigMonitor::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  constexpr version_t keep = 5;
+  if (version <= keep) return 0;
+  return version - keep;
+}
+
+// Handles MSG_MON_COMMAND only; a bad_cmd_get from it is replied as -EINVAL.
+bool ConfigMonitor::preprocess_query(MonOpRequestRef op)
+{
+  if (op->get_req()->get_type() != MSG_MON_COMMAND) {
+    return false;
+  }
+  try {
+    return preprocess_command(op);
+  } catch (const bad_cmd_get& e) {
+    bufferlist empty;
+    mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+  }
+  return true;
+}
+
+// Indent a section name for the "config dump" table: "global" is returned
+// as is, names without a '.' get two spaces, dotted names get four.
+static string indent_who(const string& who)
+{
+  if (who == "global") {
+    return who;
+  }
+  const bool dotted = who.find('.') != string::npos;
+  return (dotted ? "    " : "  ") + who;
+}
+
+bool ConfigMonitor::preprocess_command(MonOpRequestRef op)  // answers: help, ls, dump, get, log, generate-minimal-conf
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;  // status text sent back with the reply at the `reply` label
+  int err = 0;
+  // Returns true once a reply was sent, false for any prefix not handled here.
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));  // tested with `if (f)` below; null selects text output
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  bufferlist odata;
+  if (prefix == "config help") {
+    stringstream help;  // kept distinct from `ss` so the error below reaches the reply
+    string name;
+    cmd_getval(cmdmap, "key", name);
+    name = ConfFile::normalize_key_name(name);
+    const Option *opt = g_conf().find_option(name);
+    if (!opt) {
+      opt = mon.mgrmon()->find_module_option(name);
+    }
+    if (opt) {
+      if (f) {
+	f->dump_object("option", *opt);
+      } else {
+	opt->print(&help);
+      }
+    } else {
+      ss << "configuration option '" << name << "' not recognized";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (f) {
+      f->flush(odata);
+    } else {
+      odata.append(help.str());
+    }
+  } else if (prefix == "config ls") {
+    ostringstream ss;  // shadows the reply stream; only collects the plain-text listing
+    if (f) {
+      f->open_array_section("options");
+    }
+    for (auto& i : ceph_options) {
+      if (f) {
+	f->dump_string("option", i.name);
+      } else {
+	ss << i.name << "\n";
+      }
+    }
+    for (auto& i : mon.mgrmon()->get_mgr_module_options()) {
+      if (f) {
+	f->dump_string("option", i.first);
+      } else {
+	ss << i.first << "\n";
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(odata);
+    } else {
+      odata.append(ss.str());
+    }
+  } else if (prefix == "config dump") {
+    list<pair<string,Section*>> sections = {
+      make_pair("global", &config_map.global)
+    };
+    for (string type : { "mon", "mgr", "osd", "mds", "client" }) {
+      auto i = config_map.by_type.find(type);
+      if (i != config_map.by_type.end()) {
+	sections.push_back(make_pair(i->first, &i->second));
+      }
+      auto j = config_map.by_id.lower_bound(type);  // then every by_id section whose name starts with the type
+      while (j != config_map.by_id.end() &&
+	     j->first.find(type) == 0) {
+	sections.push_back(make_pair(j->first, &j->second));
+	++j;
+      }
+    }
+    TextTable tbl;
+    if (!f) {
+      tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT);
+    } else {
+      f->open_array_section("config");
+    }
+    for (auto s : sections) {
+      for (auto& i : s.second->options) {
+	if (!f) {
+	  tbl << indent_who(s.first);
+	  tbl << i.second.mask.to_str();
+	  tbl << Option::level_to_str(i.second.opt->level);
+          tbl << i.first;
+	  tbl << i.second.raw_value;
+	  tbl << (i.second.opt->can_update_at_runtime() ? "" : "*");  // "*" marks options that cannot change at runtime
+	  tbl << TextTable::endrow;
+	} else {
+	  f->open_object_section("option");
+	  f->dump_string("section", s.first);
+	  i.second.dump(f.get());
+	  f->close_section();
+	}
+      }
+    }
+    if (!f) {
+      odata.append(stringify(tbl));
+    } else {
+      f->close_section();
+      f->flush(odata);
+    }
+  } else if (prefix == "config get") {
+    string who, name;
+    cmd_getval(cmdmap, "who", who);
+
+    EntityName entity;
+    if (!entity.from_str(who) &&
+	!entity.from_str(who + ".")) {  // retry with a trailing '.' appended
+      ss << "unrecognized entity '" << who << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    map<string,string> crush_location;
+    string device_class;
+    if (entity.is_osd()) {  // location and device class are only looked up for OSD entities
+      mon.osdmon()->osdmap.crush->get_full_location(who, &crush_location);
+      int id = atoi(entity.get_id().c_str());
+      const char *c = mon.osdmon()->osdmap.crush->get_item_class(id);
+      if (c) {
+	device_class = c;
+      }
+      dout(10) << __func__ << " crush_location " << crush_location
+	       << " class " << device_class << dendl;
+    }
+
+    std::map<std::string,pair<std::string,const MaskedOption*>> src;  // per key: supplying section and entry
+    auto config = config_map.generate_entity_map(
+      entity,
+      crush_location,
+      mon.osdmon()->osdmap.crush.get(),
+      device_class,
+      &src);
+
+    if (cmd_getval(cmdmap, "key", name)) {
+      name = ConfFile::normalize_key_name(name);
+      const Option *opt = g_conf().find_option(name);
+      if (!opt) {
+	opt = mon.mgrmon()->find_module_option(name);
+      }
+      if (!opt) {
+        ss << "unrecognized key '" << name << "'";
+	err = -ENOENT;
+	goto reply;
+      }
+      if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+	// handle special options
+	if (name == "fsid") {
+	  odata.append(stringify(mon.monmap->get_fsid()));
+	  odata.append("\n");
+	  goto reply;
+	}
+	err = -EINVAL;
+	ss << name << " is special and cannot be stored by the mon";
+	goto reply;
+      }
+      // get a single value
+      auto p = config.find(name);
+      if (p != config.end()) {
+	odata.append(p->second);
+	odata.append("\n");
+	goto reply;
+      }
+      if (!entity.is_client() &&  // nothing configured: fall back to the option's built-in value
+	  !boost::get<boost::blank>(&opt->daemon_value)) {
+	odata.append(Option::to_str(opt->daemon_value));
+      } else {
+	odata.append(Option::to_str(opt->value));
+      }
+      odata.append("\n");
+    } else {
+      // dump all (non-default) values for this entity
+      TextTable tbl;
+      if (!f) {
+	tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT);
+	tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT);
+	tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT);
+	tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT);
+	tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
+	tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT);
+      } else {
+	f->open_object_section("config");
+      }
+      auto p = config.begin();  // config and src were filled for the same keys,
+      auto q = src.begin();     // so the two iterators are advanced in lockstep
+      for (; p != config.end(); ++p, ++q) {
+	if (name.size() && p->first != name) {
+	  continue;
+	}
+	if (!f) {
+	  tbl << q->second.first;
+	  tbl << q->second.second->mask.to_str();
+	  tbl << Option::level_to_str(q->second.second->opt->level);
+	  tbl << p->first;
+	  tbl << p->second;
+	  tbl << (q->second.second->opt->can_update_at_runtime() ? "" : "*");
+	  tbl << TextTable::endrow;
+	} else {
+	  f->open_object_section(p->first.c_str());
+	  f->dump_string("value", p->second);
+	  f->dump_string("section", q->second.first);
+	  f->dump_object("mask", q->second.second->mask);
+	  f->dump_bool("can_update_at_runtime",
+		       q->second.second->opt->can_update_at_runtime());
+	  f->close_section();
+	}
+      }
+      if (!f) {
+	odata.append(stringify(tbl));
+      } else {
+	f->close_section();
+	f->flush(odata);
+      }
+    }
+  } else if (prefix == "config log") {
+    int64_t num = 10;
+    cmd_getval(cmdmap, "num", num);
+    ostringstream ds;
+    if (f) {
+      f->open_array_section("changesets");
+    }
+    for (version_t v = version; v > version - std::min(version, (version_t)num); --v) {  // newest first, at most `version` entries
+      ConfigChangeSet ch;
+      load_changeset(v, &ch);
+      if (f) {
+	f->dump_object("changeset", ch);
+      } else {
+	ch.print(ds);
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(odata);
+    } else {
+      odata.append(ds.str());
+    }
+  } else if (prefix == "config generate-minimal-conf") {
+    ostringstream conf;
+    conf << "# minimal ceph.conf for " << mon.monmap->get_fsid() << "\n";
+
+    // the basics
+    conf << "[global]\n";
+    conf << "\tfsid = " << mon.monmap->get_fsid() << "\n";
+    conf << "\tmon_host = ";
+    for (auto i = mon.monmap->mon_info.begin();
+	 i != mon.monmap->mon_info.end();
+	 ++i) {
+      if (i != mon.monmap->mon_info.begin()) {
+	conf << " ";
+      }
+      if (i->second.public_addrs.size() == 1 &&
+	  i->second.public_addrs.front().is_legacy() &&
+	  i->second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
+	// if this is a legacy addr on the legacy default port, then
+	// use the legacy-compatible formatting so that old clients
+	// can use this config.  new code will see the :6789 and correctly
+	// interpret this as a v1 address.
+	conf << i->second.public_addrs.get_legacy_str();
+      } else {
+	conf << i->second.public_addrs;
+      }
+    }
+    conf << "\n";
+    conf << config_map.global.get_minimal_conf();
+    for (auto m : { &config_map.by_type, &config_map.by_id }) {  // note: shadows the message pointer `m`
+      for (auto& i : *m) {
+	auto s = i.second.get_minimal_conf();
+	if (s.size()) {  // sections contributing nothing get no header either
+	  conf << "\n[" << i.first << "]\n" << s;
+	}
+      }
+    }
+    odata.append(conf.str());
+    err = 0;
+  } else {
+    return false;
+  }
+
+  reply:
+  mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+  return true;
+}
+
+void ConfigMonitor::handle_get_config(MonOpRequestRef op)  // answer an MGetConfig with an MConfig
+{
+  auto m = op->get_req<MGetConfig>();
+  dout(10) << __func__ << " " << m->name << " host " << m->host << dendl;
+
+  const OSDMap& osdmap = mon.osdmon()->osdmap;
+  map<string,string> crush_location;
+  osdmap.crush->get_full_location(m->host, &crush_location);  // location is looked up by the host in the request
+  auto out = config_map.generate_entity_map(
+    m->name,
+    crush_location,
+    osdmap.crush.get(),
+    m->device_class);  // src is left at its default, so no provenance is collected
+  dout(20) << " config is " << out << dendl;
+  m->get_connection()->send_message(new MConfig{std::move(out)});  // sent on the request's own connection
+}
+
+// Logs the request; only MSG_MON_COMMAND is handled (bad_cmd_get => -EINVAL).
+bool ConfigMonitor::prepare_update(MonOpRequestRef op)
+{
+  Message *req = op->get_req();
+  dout(7) << "prepare_update " << *req
+	  << " from " << req->get_orig_source_inst() << dendl;
+  if (req->get_type() != MSG_MON_COMMAND) {
+    return false;
+  }
+  try {
+    return prepare_command(op);
+  } catch (const bad_cmd_get& e) {
+    bufferlist empty;
+    mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+  }
+  return true;
+}
+
+/**
+ * Handle the mutating config commands: "config set", "config rm",
+ * "config reset" and "config assimilate-conf".
+ *
+ * Changes are staged in `pending` and, if anything really differs from
+ * `current`, proposed together with the KV monitor.
+ *
+ * @param op the MMonCommand request being processed
+ * @return true when a proposal was queued (the reply is sent once the
+ *         proposal finishes) or when the command JSON could not be
+ *         parsed (an error reply is sent right away); false when the op
+ *         was parked until the KV monitor is writeable, or when a reply
+ *         was sent immediately without proposing anything.
+ */
+bool ConfigMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+  int err = -EINVAL;
+
+  // make sure kv is writeable.
+  if (!mon.kvmon()->is_writeable()) {
+    dout(10) << __func__ << " waiting for kv mon to be writeable" << dendl;
+    mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+    return false;
+  }
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  bufferlist odata;
+
+  if (prefix == "config set" ||
+      prefix == "config rm") {
+    string who;
+    string name, value;
+    bool force = false;
+    cmd_getval(cmdmap, "who", who);
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "value", value);
+    cmd_getval(cmdmap, "force", force);
+    name = ConfFile::normalize_key_name(name);
+    
+    // validation is only done for "config set", and only without force
+    if (prefix == "config set" && !force) {
+      const Option *opt = g_conf().find_option(name);
+      if (!opt) {
+	opt = mon.mgrmon()->find_module_option(name);
+      }
+      if (!opt) {
+	ss << "unrecognized config option '" << name << "'";
+	err = -EINVAL;
+	goto reply;
+      }
+
+      Option::value_t real_value;
+      string errstr;
+      // on success `value` is overwritten with the normalized form
+      err = opt->parse_value(value, &real_value, &errstr, &value);
+      if (err < 0) {
+	ss << "error parsing value: " << errstr;
+	goto reply;
+      }
+
+      if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+	err = -EINVAL;
+	ss << name << " is special and cannot be stored by the mon";
+	goto reply;
+      }
+    }
+
+    string section;
+    OptionMask mask;
+    if (!ConfigMap::parse_mask(who, &section, &mask)) {
+      ss << "unrecognized config target '" << who << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    // store key layout: <section or "global">/[<mask>/]<name>
+    string key;
+    if (section.size()) {
+      key += section + "/";
+    } else {
+      key += "global/";
+    }
+    string mask_str = mask.to_str();
+    if (mask_str.size()) {
+      key += mask_str + "/";
+    }
+    key += name;
+
+    if (prefix == "config set") {
+      bufferlist bl;
+      bl.append(value);
+      pending[key] = bl;
+    } else {
+      // boost::none marks the key for removal
+      pending[key] = boost::none;
+    }
+    goto update;
+  } else if (prefix == "config reset") {
+    int64_t revert_to = -1;
+    cmd_getval(cmdmap, "num", revert_to);
+    if (revert_to < 0 ||
+        revert_to > (int64_t)version) {
+      err = -EINVAL;
+      ss << "must specify a valid historical version to revert to; "
+         << "see 'ceph config log' for a list of available configuration "
+         << "historical versions";
+      goto reply;
+    }
+    if (revert_to == (int64_t)version) {
+      // already at the requested version
+      err = 0;
+      goto reply;
+    }
+    // walk the history backwards, undoing each changeset in turn
+    for (int64_t v = version; v > revert_to; --v) {
+      ConfigChangeSet ch;
+      load_changeset(v, &ch);
+      for (auto& i : ch.diff) {
+	if (i.second.first) {
+	  // key had a previous value: restore it
+	  bufferlist bl;
+	  bl.append(*i.second.first);
+	  pending[i.first] = bl;
+	} else if (i.second.second) {
+	  // key was newly added by that changeset: remove it
+	  pending[i.first] = boost::none;
+	}
+      }
+    }
+    pending_description = string("reset to ") + stringify(revert_to);
+    goto update;
+  } else if (prefix == "config assimilate-conf") {
+    ConfFile cf;
+    bufferlist bl = m->get_data();
+    err = cf.parse_bufferlist(&bl, &ss);
+    if (err < 0) {
+      goto reply;
+    }
+    bool updated = false;
+    // everything we could not assimilate is echoed back in conf format
+    ostringstream newconf;
+    for (auto& [section, s] : cf) {
+      dout(20) << __func__ << " [" << section << "]" << dendl;
+      bool did_section = false;
+      for (auto& [key, val] : s) {
+	Option::value_t real_value;
+	string value;
+	string errstr;
+	if (key.empty()) {
+	  continue;
+	}
+	// a known and worthy option?
+	const Option *o = g_conf().find_option(key);
+	if (!o) {
+	  o = mon.mgrmon()->find_module_option(key);
+	}
+	if (!o ||
+	    (o->flags & Option::FLAG_NO_MON_UPDATE) ||
+	    (o->flags & Option::FLAG_CLUSTER_CREATE)) {
+	  goto skip;
+	}
+	// normalize
+	err = o->parse_value(val, &real_value, &errstr, &value);
+	if (err < 0) {
+	  dout(20) << __func__ << " failed to parse " << key << " = '"
+		   << val << "'" << dendl;
+	  goto skip;
+	}
+	// does it conflict with an existing value?
+	{
+	  const Section *s = config_map.find_section(section);
+	  if (s) {
+	    auto k = s->options.find(key);
+	    if (k != s->options.end()) {
+	      if (value != k->second.raw_value) {
+		dout(20) << __func__ << " have " << key
+			 << " = " << k->second.raw_value
+			 << " (not " << value << ")" << dendl;
+		goto skip;
+	      }
+	      dout(20) << __func__ << " already have " << key
+		       << " = " << k->second.raw_value << dendl;
+	      continue;
+	    }
+	  }
+	}
+	dout(20) << __func__ << "  add " << key << " = " << value
+		 << " (" << val << ")" << dendl;
+	{
+	  bufferlist bl;
+	  bl.append(value);
+	  pending[section + "/" + key] = bl;
+	  updated = true;
+	}
+	continue;
+
+       skip:
+	dout(20) << __func__ << " skip " << key << " = " << value
+		 << " (" << val << ")" << dendl;
+	if (!did_section) {
+	  newconf << "\n[" << section << "]\n";
+	  did_section = true;
+	}
+	newconf << "\t" << key << " = " << val << "\n";
+      }
+    }
+    odata.append(newconf.str());
+    if (updated) {
+      goto update;
+    }
+  } else {
+    ss << "unknown command " << prefix;
+    err = -EINVAL;
+  }
+
+reply:
+  // answer immediately; nothing is proposed on this path
+  mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+  return false;
+
+update:
+  // see if there is an actual change
+  auto p = pending.begin();
+  while (p != pending.end()) {
+    auto q = current.find(p->first);
+    if (p->second && q != current.end() && *p->second == q->second) {
+      // set to same value
+      p = pending.erase(p);
+    } else if (!p->second && q == current.end()) {
+      // erasing non-existent value
+      p = pending.erase(p);
+    } else {
+      ++p;
+    }
+  }
+  if (pending.empty()) {
+    err = 0;
+    goto reply;
+  }
+  // immediately propose *with* KV mon
+  encode_pending_to_kvmon();
+  paxos.plug();
+  mon.kvmon()->propose_pending();
+  paxos.unplug();
+  force_immediate_propose();
+  wait_for_finished_proposal(
+    op,
+    new Monitor::C_Command(
+      mon, op, 0, ss.str(), odata,
+      get_last_committed() + 1));
+  return true;
+}
+
+/**
+ * Periodic housekeeping, leader only.  When pending_cleanup holds
+ * entries and the KV monitor is writeable, issue a KV-monitor proposal
+ * (with paxos plugged) followed by our own proposal.
+ */
+void ConfigMonitor::tick()
+{
+  if (!(is_active() && mon.is_leader())) {
+    return;
+  }
+  dout(10) << __func__ << dendl;
+
+  // no cleanup queued: nothing to propose
+  if (pending_cleanup.empty()) {
+    return;
+  }
+  if (!mon.kvmon()->is_writeable()) {
+    return;
+  }
+
+  paxos.plug();
+  encode_pending_to_kvmon();
+  mon.kvmon()->propose_pending();
+  paxos.unplug();
+  propose_pending();
+}
+
+void ConfigMonitor::on_active() // override of the PaxosService hook; intentionally a no-op
+{
+}
+
+void ConfigMonitor::load_config() // rebuild config_map/current from the store and queue key fix-ups in pending_cleanup
+{
+  std::map<std::string,std::string> renamed_pacific = { // old option name -> new option name
+    { "mon_osd_blacklist_default_expire", "mon_osd_blocklist_default_expire" },
+    { "mon_mds_blacklist_interval", "mon_mds_blocklist_interval" },
+    { "mon_mgr_blacklist_interval", "mon_mgr_blocklist_interval" },
+    { "rbd_blacklist_on_break_lock", "rbd_blocklist_on_break_lock" },
+    { "rbd_blacklist_expire_seconds", "rbd_blocklist_expire_seconds" },
+    { "mds_session_blacklist_on_timeout", "mds_session_blocklist_on_timeout" },
+    { "mds_session_blacklist_on_evict", "mds_session_blocklist_on_evict" },
+  };
+
+  unsigned num = 0; // number of options actually inserted into config_map
+  KeyValueDB::Iterator it = mon.store->get_iterator(KV_PREFIX);
+  it->lower_bound(KEY_PREFIX);
+  config_map.clear();
+  current.clear();
+  pending_cleanup.clear();
+  while (it->valid() &&
+	 it->key().compare(0, KEY_PREFIX.size(), KEY_PREFIX) == 0) { // stop at the first key outside KEY_PREFIX
+    string key = it->key().substr(KEY_PREFIX.size()); // key with KEY_PREFIX stripped
+    string value = it->value().to_str();
+
+    current[key] = it->value(); // remember the raw stored value, even for keys rejected below
+
+    string name;
+    string who;
+    config_map.parse_key(key, &name, &who);
+
+    // has this option been renamed?
+    {
+      auto p = renamed_pacific.find(name);
+      if (p != renamed_pacific.end()) {
+	if (mon.monmap->min_mon_release >= ceph_release_t::pacific) {
+	  // schedule a cleanup
+	  pending_cleanup[key] = boost::none; // drop the old key ...
+	  pending_cleanup[who + "/" + p->second] = it->value(); // ... and store the value under the new name
+	}
+	// continue loading under the new name
+	name = p->second;
+      }
+    }
+
+    const Option *opt = g_conf().find_option(name);
+    if (!opt) {
+      opt = mon.mgrmon()->find_module_option(name);
+    }
+    if (!opt) { // unknown everywhere: keep it as a string-typed placeholder owned by config_map
+      dout(10) << __func__ << " unrecognized option '" << name << "'" << dendl;
+      config_map.stray_options.push_back(
+	std::unique_ptr<Option>(
+	  new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN)));
+      opt = config_map.stray_options.back().get();
+    }
+
+    string err;
+    int r = opt->pre_validate(&value, &err);
+    if (r < 0) { // only logged; the value is still loaded below
+      dout(10) << __func__ << " pre-validate failed on '" << name << "' = '"
+	       << value << "' for " << name << dendl;
+    }
+    
+    MaskedOption mopt(opt);
+    mopt.raw_value = value;
+    string section_name;
+    if (who.size() &&
+	!ConfigMap::parse_mask(who, &section_name, &mopt.mask)) {
+      derr << __func__ << " invalid mask for key " << key << dendl;
+      pending_cleanup[key] = boost::none; // unparsable target: schedule removal, do not load
+    } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+      dout(10) << __func__ << " NO_MON_UPDATE option '"
+	       << name << "' = '" << value << "' for " << name
+	       << dendl;
+      pending_cleanup[key] = boost::none; // flagged NO_MON_UPDATE: schedule removal, do not load
+    } else {
+      if (section_name.empty()) {
+	// we prefer global/$option instead of just $option
+	derr << __func__ << " adding global/ prefix to key '" << key << "'"
+	     << dendl;
+	pending_cleanup[key] = boost::none;
+	pending_cleanup["global/"s + key] = it->value();
+      }
+      Section *section = &config_map.global;;
+      if (section_name.size() && section_name != "global") {
+	if (section_name.find('.') != std::string::npos) { // a '.' in the section name selects by_id, otherwise by_type
+	  section = &config_map.by_id[section_name];
+	} else {
+	  section = &config_map.by_type[section_name];
+	}
+      }
+      section->options.insert(make_pair(name, std::move(mopt)));
+      ++num;
+    }
+    it->next();
+  }
+  dout(10) << __func__ << " got " << num << " keys" << dendl;
+
+  // refresh our own config
+  {
+    const OSDMap& osdmap = mon.osdmon()->osdmap;
+    map<string,string> crush_location;
+    osdmap.crush->get_full_location(g_conf()->host, &crush_location);
+    auto out = config_map.generate_entity_map(
+      g_conf()->name,
+      crush_location,
+      osdmap.crush.get(),
+      string{}); // no device class
+    g_conf().set_mon_vals(g_ceph_context, out, nullptr);
+  }
+}
+
+/**
+ * Read historical changeset `v` from the store into `*ch`.
+ *
+ * All keys starting with HISTORY_PREFIX + v + "/" are visited.  The key
+ * equal to that prefix carries the encoded stamp and name.  Every other
+ * key is "<prefix><op><config key>": op '-' fills the first member of the
+ * diff entry, op '+' the second.
+ */
+void ConfigMonitor::load_changeset(version_t v, ConfigChangeSet *ch)
+{
+  const string base = HISTORY_PREFIX + stringify(v) + "/";
+  ch->version = v;
+
+  KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+  for (iter->lower_bound(base);
+       iter->valid() && iter->key().find(base) == 0;
+       iter->next()) {
+    const string full_key = iter->key();
+
+    if (full_key == base) {
+      // header record: stamp followed by name
+      bufferlist header = iter->value();
+      auto cursor = header.cbegin();
+      try {
+	decode(ch->stamp, cursor);
+	decode(ch->name, cursor);
+      }
+      catch (ceph::buffer::error& e) {
+	derr << __func__ << " failure decoding changeset " << v << dendl;
+      }
+      continue;
+    }
+
+    // first character after the prefix is the operation marker
+    const char marker = full_key[base.length()];
+    const string conf_key = full_key.substr(base.length() + 1);
+    switch (marker) {
+    case '-':
+      ch->diff[conf_key].first = iter->value().to_str();
+      break;
+    case '+':
+      ch->diff[conf_key].second = iter->value().to_str();
+      break;
+    default:
+      // unknown markers are ignored
+      break;
+    }
+  }
+}
+
+/**
+ * Recompute the effective config for session `s` and cache it in
+ * s->last_config.
+ *
+ * @return true if the session had no config yet or the newly generated
+ *         one differs from the cached copy; false if nothing changed.
+ */
+bool ConfigMonitor::refresh_config(MonSession *s)
+{
+  const OSDMap& osdmap = mon.osdmon()->osdmap;
+
+  // crush location is only looked up when the remote host is known
+  map<string,string> crush_location;
+  if (!s->remote_host.empty()) {
+    osdmap.crush->get_full_location(s->remote_host, &crush_location);
+    dout(10) << __func__ << " crush_location for remote_host " << s->remote_host
+	     << " is " << crush_location << dendl;
+  }
+
+  // device class is only looked up for OSD sessions
+  string device_class;
+  if (s->name.is_osd()) {
+    if (const char *cls = osdmap.crush->get_item_class(s->name.num())) {
+      device_class = cls;
+      dout(10) << __func__ << " device_class " << device_class << dendl;
+    }
+  }
+
+  dout(20) << __func__ << " " << s->entity_name << " crush " << crush_location
+	   << " device_class " << device_class << dendl;
+  auto generated = config_map.generate_entity_map(
+    s->entity_name,
+    crush_location,
+    osdmap.crush.get(),
+    device_class);
+
+  const bool unchanged = s->any_config && (generated == s->last_config);
+  if (unchanged) {
+    dout(20) << __func__ << " no change, " << generated << dendl;
+    return false;
+  }
+
+  // removing this to hide sensitive data going into logs
+  // leaving this for debugging purposes
+  //  dout(20) << __func__ << " " << out << dendl;
+  s->last_config = std::move(generated);
+  s->any_config = true;
+  return true;
+}
+
+/**
+ * Refresh the session's config and send it only if it changed.
+ *
+ * @return true if a config message was sent.
+ */
+bool ConfigMonitor::maybe_send_config(MonSession *s)
+{
+  const bool changed = refresh_config(s);
+  const char *state = changed ? "(changed)" : "(unchanged)";
+  dout(10) << __func__ << " to " << s->name << " " << state << dendl;
+  if (!changed) {
+    return false;
+  }
+  send_config(s);
+  return true;
+}
+
+/**
+ * Send the session's cached config (s->last_config) over its connection
+ * as an MConfig message.
+ */
+void ConfigMonitor::send_config(MonSession *s)
+{
+  dout(10) << __func__ << " to " << s->name << dendl;
+  s->con->send_message(new MConfig(s->last_config));
+}
+
+/**
+ * Service the "config" subscription of session `s`, if it has one.
+ * Unauthenticated sessions are ignored.
+ */
+void ConfigMonitor::check_sub(MonSession *s)
+{
+  if (s->authenticated) {
+    auto found = s->sub_map.find("config");
+    if (found != s->sub_map.end()) {
+      check_sub(found->second);
+    }
+    return;
+  }
+  dout(20) << __func__ << " not authenticated " << s->entity_name << dendl;
+}
+
+/**
+ * Service a single "config" subscription: if the subscriber is waiting
+ * for a version we already have, (maybe) send the config.  One-time
+ * subscriptions are then removed from the session map; others have their
+ * `next` advanced past the current version.
+ */
+void ConfigMonitor::check_sub(Subscription *sub)
+{
+  dout(10) << __func__
+	   << " next " << sub->next
+	   << " have " << version << dendl;
+  if (sub->next > version) {
+    // subscriber wants something newer than what we have
+    return;
+  }
+
+  maybe_send_config(sub->session);
+
+  if (!sub->onetime) {
+    sub->next = version + 1;
+    return;
+  }
+  mon.with_session_map([sub](MonSessionMap& session_map) {
+      session_map.remove_sub(sub);
+    });
+}
+
+/**
+ * Walk every "config" subscription in the session map and send a fresh
+ * config to each session whose effective config changed.
+ */
+void ConfigMonitor::check_all_subs()
+{
+  dout(10) << __func__ << dendl;
+  auto entry = mon.session_map.subs.find("config");
+  if (entry == mon.session_map.subs.end()) {
+    return;
+  }
+
+  int total = 0;
+  int updated = 0;
+  for (auto cursor = entry->second->begin(); !cursor.end(); ) {
+    auto sub = *cursor;
+    // step forward before handing the subscription on, as the original
+    // code does (presumably so the cursor never rests on an entry that
+    // gets unlinked -- confirm)
+    ++cursor;
+    ++total;
+    if (maybe_send_config(sub->session)) {
+      ++updated;
+    }
+  }
+  dout(10) << __func__ << " updated " << updated << " / " << total << dendl;
+}
diff --git a/src/mon/ConfigMonitor.h b/src/mon/ConfigMonitor.h
new file mode 100644
index 000000000..e6c12a3d7
--- /dev/null
+++ b/src/mon/ConfigMonitor.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/optional.hpp>
+
+#include "ConfigMap.h"
+#include "mon/PaxosService.h"
+
+class MonSession;
+
+class ConfigMonitor : public PaxosService // paxos service handling the "config ..." commands and config subscriptions
+{
+  version_t version = 0; // compared with Subscription::next in check_sub(); upper bound for "config reset"
+  ConfigMap config_map; // parsed options, rebuilt by load_config()
+  std::map<std::string,boost::optional<ceph::buffer::list>> pending; // staged changes by key; boost::none means remove
+  std::string pending_description; // set by "config reset" ("reset to <n>")
+  std::map<std::string,boost::optional<ceph::buffer::list>> pending_cleanup; // key fix-ups queued by load_config(); boost::none means remove
+
+  std::map<std::string,ceph::buffer::list> current; // raw key -> value as last read from the store by load_config()
+
+  void encode_pending_to_kvmon();
+
+public:
+  ConfigMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+  void init() override;
+
+  void load_config(); // reload config_map and current from the store
+  void load_changeset(version_t v, ConfigChangeSet *ch); // read historical changeset v into *ch
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op); // "config set|rm|reset|assimilate-conf"
+
+  void handle_get_config(MonOpRequestRef op);
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  void encode_full(MonitorDBStore::TransactionRef t) override { } // intentionally empty
+
+  void on_active() override;
+  void tick() override;
+
+  bool refresh_config(MonSession *s); // recompute s->last_config; true if it changed
+  bool maybe_send_config(MonSession *s); // refresh, and send only on change
+  void send_config(MonSession *s); // unconditionally send s->last_config
+  void check_sub(MonSession *s); // service the session's "config" subscription, if any
+  void check_sub(Subscription *sub);
+  void check_all_subs(); // service every "config" subscription
+};
diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc
new file mode 100644
index 000000000..272ad40c2
--- /dev/null
+++ b/src/mon/ConnectionTracker.cc
@@ -0,0 +1,361 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "ConnectionTracker.h"
+#include "common/Formatter.h"
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, rank, epoch, version)
+
+// Emits the per-line log prefix; referenced by the dout_prefix macro above.
+static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch,
+			     uint64_t version)
+{
+  std::ostream& out = *_dout;
+  out << "rank: " << rank << " version: " << version
+      << " ConnectionTracker(" << epoch << ") ";
+  return out;
+}
+
+// Human-readable one-line rendering of a ConnectionReport.
+std::ostream& operator<<(std::ostream&o, const ConnectionReport& c) {
+  return o << "rank=" << c.rank
+	   << ",epoch=" << c.epoch
+	   << ",version=" << c.epoch_version
+	   << ", current links: " << c.current
+	   << ", history: " << c.history;
+}
+
+// Human-readable one-line rendering of a ConnectionTracker.
+std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c) {
+  return o << "rank=" << c.rank
+	   << ", epoch=" << c.epoch
+	   << ", version=" << c.version
+	   << ", half_life=" << c.half_life
+	   << ", reports: " << c.peer_reports;
+}
+
+/**
+ * Return the report stored for rank `p`, creating an empty one if none
+ * exists yet.  Asserts that an entry for our own rank is never created
+ * this way.
+ */
+ConnectionReport *ConnectionTracker::reports(int p)
+{
+  auto found = peer_reports.find(p);
+  if (found != peer_reports.end()) {
+    return &found->second;
+  }
+  ceph_assert(p != rank);
+  auto inserted =
+    peer_reports.insert(std::pair<int,ConnectionReport>(p,ConnectionReport()));
+  return &inserted.first->second;
+}
+
+/**
+ * Read-only lookup of the report stored for rank `p`.
+ * Returns a null pointer when there is none; never inserts.
+ */
+const ConnectionReport *ConnectionTracker::reports(int p) const
+{
+  auto found = peer_reports.find(p);
+  return found == peer_reports.end() ? nullptr : &found->second;
+}
+
+/**
+ * Merge the peer reports carried by tracker `o` into ours.  A report
+ * replaces our copy only if it has a higher epoch, or the same epoch and
+ * a higher epoch_version.  Reports about our own rank are skipped.  The
+ * cached encoding is dropped afterwards.
+ */
+void ConnectionTracker::receive_peer_report(const ConnectionTracker& o)
+{
+  ldout(cct, 30) << __func__ << dendl;
+  for (const auto& [peer, incoming] : o.peer_reports) {
+    if (peer == rank) {
+      continue;
+    }
+    // reports() creates an empty entry for peers we have not seen yet
+    ConnectionReport& ours = *reports(peer);
+    const bool newer_epoch = incoming.epoch > ours.epoch;
+    const bool newer_version = incoming.epoch == ours.epoch &&
+      incoming.epoch_version > ours.epoch_version;
+    if (!(newer_epoch || newer_version)) {
+      continue;
+    }
+    ldout(cct, 30) << " new peer_report is more updated" << dendl;
+    ldout(cct, 30) << "existing: " << ours << dendl;
+    ldout(cct, 30) << "new: " << incoming << dendl;
+    ours = incoming;
+  }
+  encoding.clear();
+}
+
+/**
+ * Move to epoch `e` if it is newer than the current one, resetting the
+ * version to 0 and republishing our own report.
+ *
+ * @return true if the epoch was bumped, false if `e` is not newer.
+ */
+bool ConnectionTracker::increase_epoch(epoch_t e)
+{
+  ldout(cct, 30) << __func__ << " to " << e << dendl;
+  if (e <= epoch) {
+    return false;
+  }
+  version = 0;
+  my_reports.epoch_version = 0;
+  epoch = e;
+  my_reports.epoch = e;
+  peer_reports[rank] = my_reports;
+  encoding.clear();
+  return true;
+}
+
+/**
+ * Bump the version within the current epoch, republish our own report
+ * and, whenever the new version is a multiple of persist_interval, ask
+ * the owner to persist the scores.
+ */
+void ConnectionTracker::increase_version()
+{
+  ldout(cct, 30) << __func__ << " to " << version+1 << dendl;
+  encoding.clear();
+  ++version;
+  my_reports.epoch_version = version;
+  peer_reports[rank] = my_reports;
+
+  const bool due_for_persist = (version % persist_interval) == 0;
+  if (!due_for_persist) {
+    return;
+  }
+  ldout(cct, 30) << version << " % " << persist_interval << " == 0" << dendl;
+  owner->persist_connectivity_scores();
+}
+
+/**
+ * Record that the link to `peer_rank` was alive for `units_alive`.
+ *
+ * The peer's score moves towards 1 by the fraction
+ * units_alive / (2 * half_life), is capped at 1, and the peer is marked
+ * as currently connected.  Reports about our own rank are dropped.
+ */
+void ConnectionTracker::report_live_connection(int peer_rank, double units_alive)
+{
+  ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_alive: " << units_alive << dendl;
+  ldout(cct, 30) << "my_reports before: " << my_reports << dendl;
+  if (peer_rank == rank) {
+    lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl;
+    return;
+  }
+
+  // a peer without history starts out with a score of 1
+  auto entry = my_reports.history.find(peer_rank);
+  if (entry == my_reports.history.end()) {
+    ldout(cct, 30) << "couldn't find: " << peer_rank
+      << " in my_reports.history" << "... inserting: "
+      << "(" << peer_rank << ", 1" << dendl;
+    entry = my_reports.history.insert(std::pair<int,double>(peer_rank,1.0)).first;
+  }
+
+  ldout(cct, 30) << "adding new pscore to my_reports" << dendl;
+  const double weight = units_alive / (2 * half_life);
+  double& score = entry->second;
+  score = std::min(score * (1 - weight) + weight, 1.0);
+  my_reports.current[peer_rank] = true;
+
+  increase_version();
+  ldout(cct, 30) << "my_reports after: " << my_reports << dendl;
+}
+
+/**
+ * Record that the link to `peer_rank` was dead for `units_dead`.
+ *
+ * The peer's score is scaled by (1 - units_dead / (2 * half_life)), the
+ * same fraction is subtracted, the result is floored at 0, and the peer
+ * is marked as currently disconnected.  Reports about our own rank are
+ * dropped.
+ */
+void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead)
+{
+  ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_dead: " << units_dead << dendl;
+  ldout(cct, 30) << "my_reports before: " << my_reports << dendl;
+  if (peer_rank == rank) {
+    lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl;
+    return;
+  }
+
+  // a peer without history starts out with a score of 1
+  auto entry = my_reports.history.find(peer_rank);
+  if (entry == my_reports.history.end()) {
+    ldout(cct, 30) << "couldn't find: " << peer_rank
+    << " in my_reports.history" << "... inserting: "
+    << "(" << peer_rank << ", 1" << dendl;
+    entry = my_reports.history.insert(std::pair<int,double>(peer_rank,1.0)).first;
+  }
+
+  ldout(cct, 30) << "adding new pscore to my_reports" << dendl;
+  const double weight = units_dead / (2 * half_life);
+  double& score = entry->second;
+  score = std::max(score * (1 - weight) - weight, 0.0);
+  my_reports.current[peer_rank] = false;
+
+  increase_version();
+  ldout(cct, 30) << "my_reports after: " << my_reports << dendl;
+}
+
+/**
+ * Sum up how well `peer_rank` is connected according to all other ranks.
+ *
+ * For every stored report except the one belonging to `peer_rank`
+ * itself, the history score for `peer_rank` is added to the total if
+ * that report also marks the peer as currently alive.
+ *
+ * @param peer_rank  rank being rated
+ * @param rating     out: sum of the scores from reports that see it alive
+ * @param live_count out: number of reports that see it alive
+ */
+void ConnectionTracker::get_total_connection_score(int peer_rank, double *rating,
+						    int *live_count) const
+{
+  ldout(cct, 30) << __func__ << dendl;
+  *rating = 0;
+  *live_count = 0;
+  double rate = 0;
+  int live = 0;
+
+  for (const auto& [reporter, report] : peer_reports) { // loop through all the scores
+    if (reporter == peer_rank) { // ... except the ones it has for itself, of course!
+      continue;
+    }
+    auto score_i = report.history.find(peer_rank);
+    if (score_i == report.history.end()) {
+      continue;
+    }
+    // history and current are independent maps; a report that has a
+    // score but no liveness entry must not dereference end().  Such a
+    // peer is treated as not alive.
+    auto live_i = report.current.find(peer_rank);
+    if (live_i != report.current.end() && live_i->second) {
+      rate += score_i->second;
+      ++live;
+    }
+  }
+  *rating = rate;
+  *live_count = live;
+}
+
+/**
+ * Our rank changed to `new_rank`: drop the reports stored under both the
+ * old and the new rank, adopt the new rank, and bump the version (which
+ * republishes my_reports under the new rank).  No-op if the rank is
+ * unchanged.
+ */
+void ConnectionTracker::notify_rank_changed(int new_rank)
+{
+  ldout(cct, 20) << __func__ << " to " << new_rank << dendl;
+  if (rank == new_rank) {
+    return;
+  }
+  ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl;
+
+  const int old_rank = rank;
+  peer_reports.erase(old_rank);
+  peer_reports.erase(new_rank);
+  rank = new_rank;
+  my_reports.rank = new_rank;
+  encoding.clear();
+
+  ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl;
+
+  increase_version();
+}
+
+void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) // drop rank_removed and shift every higher rank down by one
+{
+  ldout(cct, 20) << __func__ << " " << rank_removed
+    << " new_rank: " << new_rank << dendl;
+  ldout(cct, 20) << "my_reports before: " << my_reports << dendl;
+  ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl;
+  ldout(cct, 20) << "my rank before: " << rank << dendl;
+
+  encoding.clear();
+  size_t starting_size_current = my_reports.current.size();
+  // Lets adjust everything in my report.
+  my_reports.current.erase(rank_removed);
+  my_reports.history.erase(rank_removed);
+  auto ci = my_reports.current.upper_bound(rank_removed); // first entry with a rank above the removed one
+  auto hi = my_reports.history.upper_bound(rank_removed);
+  while (ci != my_reports.current.end()) {
+    ceph_assert(ci->first == hi->first); // NOTE(review): hi is dereferenced without an end() check; assumes current and history hold the same keys -- confirm
+    my_reports.current[ci->first - 1] = ci->second; // re-key the entry one rank lower
+    my_reports.history[hi->first - 1] = hi->second;
+    my_reports.current.erase(ci++); // post-increment: advance before the old node is erased
+    my_reports.history.erase(hi++);
+  }
+  ceph_assert((my_reports.current.size() == starting_size_current) ||
+    (my_reports.current.size() + 1 == starting_size_current)); // at most one entry may have disappeared
+
+  size_t starting_size = peer_reports.size();
+  auto pi = peer_reports.upper_bound(rank_removed);
+  // Remove the target rank and adjust everything that comes after.
+  // Note that we don't adjust current and history for our peer_reports
+  // because it is better to rely on our peers on that information.
+  peer_reports.erase(rank_removed);
+  while (pi != peer_reports.end()) {
+    peer_reports[pi->first - 1] = pi->second; // copy content of next rank to ourself.
+    peer_reports.erase(pi++); // destroy our next rank and move on.
+  }
+
+  ceph_assert((peer_reports.size() == starting_size) ||
+	  (peer_reports.size() + 1 == starting_size));
+
+  if (rank_removed < rank) { // if the rank removed is lower than us, we need to adjust.
+    --rank;
+    my_reports.rank = rank; // also adjust my_reports.rank.
+  }
+
+  ldout(cct, 20) << "my rank after: " << rank << dendl;
+  ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl;
+  ldout(cct, 20) << "my_reports after: " << my_reports << dendl;
+
+  //check if the new_rank from monmap is equal to our adjusted rank.
+  ceph_assert(rank == new_rank);
+
+  increase_version(); // bumps the version and republishes my_reports under our (possibly new) rank
+}
+
+/**
+ * Sanity-check our state against the monmap.
+ *
+ * @param mon_rank    our rank according to the monmap
+ * @param monmap_size number of monitors in the monmap
+ * @return false if our rank or my_reports.rank differs from `mon_rank`,
+ *         or if any stored peer report has a rank beyond
+ *         monmap_size - 1; true otherwise.
+ */
+bool ConnectionTracker::is_clean(int mon_rank, int monmap_size)
+{
+  ldout(cct, 30) << __func__ << dendl;
+
+  // our rank and the rank recorded in our own report must both match
+  const bool rank_matches = (rank == mon_rank) && (my_reports.rank == mon_rank);
+  if (!rank_matches) {
+    return false;
+  }
+  if (peer_reports.empty()) {
+    return true;
+  }
+  // the map is ordered, so the last key is the highest reported rank
+  const int highest_peer = peer_reports.rbegin()->first;
+  return highest_peer <= monmap_size - 1;
+}
+
+void ConnectionTracker::encode(bufferlist &bl) const // serialize the tracker; field order must match decode()
+{
+  ENCODE_START(1, 1, bl);
+  encode(rank, bl);
+  encode(epoch, bl);
+  encode(version, bl);
+  encode(half_life, bl);
+  encode(peer_reports, bl); // my_reports is not written separately; decode() takes it from peer_reports[rank]
+  ENCODE_FINISH(bl);
+}
+
+void ConnectionTracker::decode(bufferlist::const_iterator& bl) { // replace our state with the encoded tracker
+  clear_peer_reports(); // start from an empty set of reports
+  encoding.clear();
+
+  DECODE_START(1, bl);
+  decode(rank, bl); // same field order as encode()
+  decode(epoch, bl);
+  decode(version, bl);
+  decode(half_life, bl);
+  decode(peer_reports, bl);
+  DECODE_FINISH(bl);
+  if (rank >=0) // only a non-negative rank has a slot of its own
+    my_reports = peer_reports[rank]; // operator[] creates an empty report if none was decoded for our rank
+}
+
+/**
+ * Return the cached encoding of this tracker, (re)building it first if
+ * the cache is empty.
+ */
+const bufferlist& ConnectionTracker::get_encoded_bl()
+{
+  const bool cache_empty = (encoding.length() == 0);
+  if (cache_empty) {
+    encode(encoding);
+  }
+  return encoding;
+}
+
+/**
+ * Dump this report to formatter `f`: rank, epoch, version, and a
+ * "peer_scores" section with one "peer" entry (rank, score, alive flag)
+ * per entry in `history`.
+ */
+void ConnectionReport::dump(ceph::Formatter *f) const
+{
+  f->dump_int("rank", rank);
+  f->dump_int("epoch", epoch);
+  f->dump_int("version", epoch_version);
+  f->open_object_section("peer_scores");
+  for (const auto& [peer, score] : history) {
+    // history and current are separate maps; if a peer has a score but
+    // no liveness entry, report it as not alive instead of dereferencing
+    // end().
+    const auto alive = current.find(peer);
+    f->open_object_section("peer");
+    f->dump_int("peer_rank", peer);
+    f->dump_float("peer_score", score);
+    f->dump_bool("peer_alive", alive != current.end() && alive->second);
+    f->close_section();
+  }
+  f->close_section(); // peer scores
+}
+
+/**
+ * Append two heap-allocated sample reports to `o`: a default-constructed
+ * one and one with every field populated.
+ */
+void ConnectionReport::generate_test_instances(std::list<ConnectionReport*>& o)
+{
+  o.push_back(new ConnectionReport);
+
+  o.push_back(new ConnectionReport);
+  ConnectionReport& populated = *o.back();
+  populated.rank = 1;
+  populated.epoch = 2;
+  populated.epoch_version = 3;
+  populated.current[0] = true;
+  populated.history[0] = .4;
+}
+
+/**
+ * Dump the tracker to formatter `f`: scalar state first, then a
+ * "reports" section holding one "report" per stored peer report.
+ */
+void ConnectionTracker::dump(ceph::Formatter *f) const
+{
+  f->dump_int("rank", rank);
+  f->dump_int("epoch", epoch);
+  f->dump_int("version", version);
+  f->dump_float("half_life", half_life);
+  f->dump_int("persist_interval", persist_interval);
+
+  f->open_object_section("reports");
+  for (const auto& [peer, report] : peer_reports) {
+    (void)peer; // only the report itself is dumped
+    f->open_object_section("report");
+    report.dump(f);
+    f->close_section();
+  }
+  f->close_section(); // reports
+}
+
+/**
+ * Append two heap-allocated sample trackers to `o`: a default-constructed
+ * one and one with rank/epoch/version set and empty reports for ranks
+ * 0, 1 and 2.
+ */
+void ConnectionTracker::generate_test_instances(std::list<ConnectionTracker*>& o)
+{
+  o.push_back(new ConnectionTracker);
+
+  o.push_back(new ConnectionTracker);
+  ConnectionTracker& populated = *o.back();
+  populated.rank = 2;
+  populated.epoch = 3;
+  populated.version = 4;
+  // operator[] creates the (empty) reports for ranks 0 and 1
+  populated.peer_reports[0];
+  populated.peer_reports[1];
+  populated.my_reports = populated.peer_reports[2];
+}
diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h
new file mode 100644
index 000000000..09506636d
--- /dev/null
+++ b/src/mon/ConnectionTracker.h
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#pragma once
+#include "include/types.h"
+
+struct ConnectionReport { // one monitor's view of its links to the other ranks
+  int rank = -1; // mon rank this state belongs to
+  std::map<int, bool> current; // true if connected to the other mon
+  std::map<int, double> history; // [0-1]; the connection reliability
+  epoch_t epoch = 0; // the (local) election epoch the ConnectionReport came from
+  uint64_t epoch_version = 0; // version of the ConnectionReport within the epoch
+  void encode(bufferlist& bl) const { // field order must match decode()
+    ENCODE_START(1, 1, bl);
+    encode(rank, bl);
+    encode(current, bl);
+    encode(history, bl);
+    encode(epoch, bl);
+    encode(epoch_version, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) { // reads the fields in the order encode() wrote them
+    DECODE_START(1, bl);
+    decode(rank, bl);
+    decode(current, bl);
+    decode(history, bl);
+    decode(epoch, bl);
+    decode(epoch_version, bl);
+    DECODE_FINISH(bl);
+  }
+  bool operator==(const ConnectionReport& o) const { // member-wise equality over all five fields
+    return o.rank == rank && o.current == current &&
+      o.history == history && o.epoch == epoch &&
+      o.epoch_version == epoch_version;
+  }
+  friend std::ostream& operator<<(std::ostream&o, const ConnectionReport& c);
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ConnectionReport*>& o);
+};
+WRITE_CLASS_ENCODER(ConnectionReport);
+
+class RankProvider {
+ public:
+  /**
+   * Get the rank of the running daemon.
+   * It can be -1, meaning unknown/invalid, or a valid rank.
+   * NOTE(review): this text used to say a valid rank "can be >1";
+   * ranks 0 and 1 are used elsewhere in this code, so ">= 0" was
+   * presumably meant -- confirm.
+   * You should not invoke the function get_total_connection_score()
+   * with an unknown rank.
+   */
+  virtual int get_my_rank() const = 0;
+  /**
+   * Asks our owner to encode us and persist it to disk.
+   * ConnectionTracker calls this whenever its version is a multiple
+   * of its persist_interval.
+   */
+  virtual void persist_connectivity_scores() = 0;
+  virtual ~RankProvider() {}
+};
+
+/**
+ * Tracks how reliably this monitor (identified by `rank`) and its peers
+ * can reach each other.  Our own observations live in my_reports; the
+ * latest report known for every rank lives in peer_reports.
+ */
+class ConnectionTracker {
+ public:
+  /**
+   * Receive a report from a peer and update our internal state
+   * if the peer has newer data.
+   */
+  void receive_peer_report(const ConnectionTracker& o);
+  /**
+   * Bump up the epoch to the specified number.
+   * Validates that it is > current epoch and resets
+   * version to 0; returns false if not.
+   */
+  bool increase_epoch(epoch_t e);
+  /**
+   * Bump up the version within our epoch.
+   * If the new version is a multiple of persist_interval, we also ask
+   * the owner to persist it.
+   */
+  void increase_version();
+  
+  /**
+   * Report a connection to a peer rank has been considered alive for
+   * the given time duration. We assume the units_alive is <= the time
+   * since the previous reporting call.
+   * (Or, more precisely, we assume that the total amount of time
+   * passed in is less than or equal to the time which has actually
+   * passed -- you can report a 10-second death immediately followed
+   * by reporting 5 seconds of liveness if your metrics are delayed.)
+   */
+  void report_live_connection(int peer_rank, double units_alive);
+  /**
+   * Report a connection to a peer rank has been considered dead for
+   * the given time duration, analogous to that above.
+   */
+  void report_dead_connection(int peer_rank, double units_dead);
+  /**
+   * Set the half-life for dropping connection state
+   * out of the ongoing score.
+   * Whenever you add a new data point:
+   * new_score = old_score * ( 1 - units / (2d)) + (units/(2d))
+   * where units is the units reported alive (for dead, you subtract them).
+   */
+  void set_half_life(double d) {
+    half_life = d;
+  }
+  /**
+   * Get the total connection score of a rank across
+   * all peers, and the count of how many electors think it's alive.
+   * For this summation, if a rank reports a peer as down its score is zero.
+   */
+  void get_total_connection_score(int peer_rank, double *rating,
+				  int *live_count) const;
+  /**
+   * Check if our ranks are clean and make
+   * sure there are no extra peer_report lingering.
+   * In the future we also want to check the reports
+   * current and history of each peer_report.
+   */
+  bool is_clean(int mon_rank, int monmap_size);
+  /**
+   * Encode this ConnectionTracker. Useful both for storing on disk
+   * and for sending off to peers for decoding and import
+   * with receive_peer_report() above.
+   */
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  /**
+   * Get a bufferlist containing the ConnectionTracker.
+   * This is like encode() but holds a copy so it
+   * doesn't re-encode on every invocation.
+   */
+  const bufferlist& get_encoded_bl();
+ private:
+  epoch_t epoch;
+  uint64_t version;
+  map<int,ConnectionReport> peer_reports; // latest known report per rank
+  ConnectionReport my_reports; // our own observations
+  double half_life;
+  RankProvider *owner; // asked to persist the scores; may be NULL
+  int rank;
+  int persist_interval; // persist whenever version is a multiple of this
+  bufferlist encoding; // cached result of encode(); cleared on change
+  CephContext *cct; // logging context used by ldout/lderr
+  int get_my_rank() const { return rank; }
+  ConnectionReport *reports(int p);
+  const ConnectionReport *reports(int p) const;
+
+  // Forget all reports (ours included) and drop the cached encoding.
+  void clear_peer_reports() {
+    encoding.clear();
+    peer_reports.clear();
+    my_reports = ConnectionReport();
+    my_reports.rank = rank;
+  }
+
+ public:
+  // cct is set to nullptr explicitly: it used to be left uninitialized
+  // here, so the copy constructor and any ldout(cct, ...) call would
+  // read an indeterminate pointer.
+  ConnectionTracker() : epoch(0), version(0), half_life(12*60*60),
+			owner(NULL), rank(-1), persist_interval(10),
+			cct(nullptr) {
+  }
+  ConnectionTracker(RankProvider *o, int rank, double hl,
+		    int persist_i, CephContext *c) :
+    epoch(0), version(0),
+    half_life(hl), owner(o), rank(rank), persist_interval(persist_i), cct(c) {
+    my_reports.rank = rank;
+  }
+  // Build a tracker by decoding `bl`; it has no owner.
+  ConnectionTracker(const bufferlist& bl, CephContext *c) :
+    epoch(0), version(0),
+    half_life(0), owner(NULL), rank(-1), persist_interval(10), cct(c)
+  {
+    auto bi = bl.cbegin();
+    decode(bi);
+  }
+  // Copies everything except the cached encoding.
+  ConnectionTracker(const ConnectionTracker& o) :
+    epoch(o.epoch), version(o.version),
+    half_life(o.half_life), owner(o.owner), rank(o.rank),
+    persist_interval(o.persist_interval), cct(o.cct)
+  {
+    peer_reports = o.peer_reports;
+    my_reports = o.my_reports;
+  }
+  void notify_reset() { clear_peer_reports(); }
+  void set_rank(int new_rank) {
+    rank = new_rank;
+    my_reports.rank = rank;
+  }
+
+  void notify_rank_changed(int new_rank);
+  void notify_rank_removed(int rank_removed, int new_rank);
+  friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c);
+  friend ConnectionReport *get_connection_reports(ConnectionTracker& ct);
+  friend map<int,ConnectionReport> *get_peer_reports(ConnectionTracker& ct);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ConnectionTracker*>& o);
+};
+
+WRITE_CLASS_ENCODER(ConnectionTracker);
diff --git a/src/mon/CreatingPGs.h b/src/mon/CreatingPGs.h
new file mode 100644
index 000000000..0075f81e7
--- /dev/null
+++ b/src/mon/CreatingPGs.h
@@ -0,0 +1,234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "include/encoding.h"
+#include "include/utime.h"
+
+#include "osd/osd_types.h"
+
+struct creating_pgs_t {
+  epoch_t last_scan_epoch = 0;
+
+  struct pg_create_info {
+    epoch_t create_epoch = 0;  // defined value even when default-constructed
+    utime_t create_stamp;
+
+    // NOTE: pre-octopus instances of this class will have a
+    // zeroed-out history
+    std::vector<int> up;
+    int up_primary = -1;
+    std::vector<int> acting;
+    int acting_primary = -1;
+    pg_history_t history;
+    PastIntervals past_intervals;
+
+    void encode(ceph::buffer::list& bl, uint64_t features) const {
+      using ceph::encode;
+      if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+	// was pair<epoch_t,utime_t> prior to octopus
+	encode(create_epoch, bl);
+	encode(create_stamp, bl);
+	return;
+      }
+      ENCODE_START(1, 1, bl);
+      encode(create_epoch, bl);
+      encode(create_stamp, bl);
+      encode(up, bl);
+      encode(up_primary, bl);
+      encode(acting, bl);
+      encode(acting_primary, bl);
+      encode(history, bl);
+      encode(past_intervals, bl);
+      ENCODE_FINISH(bl);
+    }
+    void decode_legacy(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;  // legacy form carries only epoch and stamp
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;  // field order mirrors the versioned encode() above
+      DECODE_START(1, p);
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+      decode(up, p);
+      decode(up_primary, p);
+      decode(acting, p);
+      decode(acting_primary, p);
+      decode(history, p);
+      decode(past_intervals, p);
+      DECODE_FINISH(p);
+    }
+    void dump(ceph::Formatter *f) const {
+      f->dump_unsigned("create_epoch", create_epoch);
+      f->dump_stream("create_stamp") << create_stamp;
+      f->open_array_section("up");
+      for (auto& i : up) {
+	f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("up_primary", up_primary);
+      f->open_array_section("acting");
+      for (auto& i : acting) {
+	f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("acting_primary", acting_primary);  // report the acting primary, not up_primary
+      f->dump_object("pg_history", history);
+      f->dump_object("past_intervals", past_intervals);
+    }
+
+    pg_create_info() {}
+    pg_create_info(epoch_t e, utime_t t)
+      : create_epoch(e),
+	create_stamp(t) {
+      // NOTE: we don't initialize the other fields here; see
+      // OSDMonitor::update_pending_pgs()
+    }
+  };
+
+  /// pgs we are currently creating
+  std::map<pg_t, pg_create_info> pgs;
+
+  struct pool_create_info {
+    epoch_t created;
+    utime_t modified;
+    uint64_t start = 0;  // dumped as "ps_start"
+    uint64_t end = 0;    // dumped as "ps_end"; create_pool() sets it to pg_num
+    bool done() const {
+      return start >= end;  // presumably: nothing left between start and end to queue
+    }
+    void encode(ceph::buffer::list& bl) const {
+      using ceph::encode;  // unversioned: fields are written back to back
+      encode(created, bl);
+      encode(modified, bl);
+      encode(start, bl);
+      encode(end, bl);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;  // must read in exactly the order encode() writes
+      decode(created, p);
+      decode(modified, p);
+      decode(start, p);
+      decode(end, p);
+    }
+  };
+
+  /// queue of pgs we still need to create (poolid -> <created, set of ps>)
+  std::map<int64_t,pool_create_info> queue;
+
+  /// pools that exist in the osdmap for which at least one pg has been created
+  std::set<int64_t> created_pools;
+
+  bool still_creating_pool(int64_t poolid) {
+    // true while any pg of the pool is still in the creating set ...
+    for (const auto& entry : pgs) {
+      const bool same_pool = (entry.first.pool() == poolid);
+      if (same_pool) {
+	return true;
+      }
+    }
+    // ... or while the pool still has an entry in the creation queue
+    return queue.count(poolid) != 0;
+  }
+  void create_pool(int64_t poolid, uint32_t pg_num,
+		   epoch_t created, utime_t modified) {
+    ceph_assert(!created_pools.count(poolid));  // a pool is only queued once
+    pool_create_info& entry = queue[poolid];
+    entry.end = pg_num;  // entry.start is left as it is
+    entry.modified = modified;
+    entry.created = created;
+    created_pools.insert(poolid);
+  }
+  unsigned remove_pool(int64_t removed_pool) {
+    // returns how many creating pgs were dropped together with the pool
+    const unsigned before = pgs.size();
+    const auto pool = (uint64_t)removed_pool;
+    pgs.erase(pgs.lower_bound(pg_t{0, pool}), pgs.lower_bound(pg_t{0, pool + 1}));
+    created_pools.erase(removed_pool);
+    queue.erase(removed_pool);
+    return before - pgs.size();
+  }
+  void encode(ceph::buffer::list& bl, uint64_t features) const {
+    // Targets lacking SERVER_OCTOPUS get struct version 2; everyone else
+    // gets version 3.  The same feature bits are handed down to the pg
+    // entries so they pick the matching per-pg encoding.
+    const unsigned v = HAVE_FEATURE(features, SERVER_OCTOPUS) ? 3 : 2;
+    ENCODE_START(v, 1, bl);
+    encode(last_scan_epoch, bl);
+    encode(pgs, bl, features);
+    encode(created_pools, bl);
+    encode(queue, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(last_scan_epoch, bl);
+    if (struct_v < 3) {
+      // legacy pg encoding: a count, then (pgid, legacy create info) entries
+      pgs.clear();
+      uint32_t remaining;
+      decode(remaining, bl);
+      for (; remaining > 0; --remaining) {
+	pg_t pgid;
+	decode(pgid, bl);
+	pgs[pgid].decode_legacy(bl);
+      }
+    } else {
+      decode(pgs, bl);
+    }
+    decode(created_pools, bl);
+    if (struct_v >= 2)
+      decode(queue, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("last_scan_epoch", last_scan_epoch);
+    f->open_array_section("creating_pgs");  // one object per creating pg
+    for (const auto& entry : pgs) {
+      f->open_object_section("pg");
+      f->dump_stream("pgid") << entry.first;
+      f->dump_object("pg_create_info", entry.second);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("queue");  // one object per queued pool
+    for (const auto& entry : queue) {
+      f->open_object_section("pool");
+      f->dump_unsigned("pool", entry.first);
+      f->dump_unsigned("created", entry.second.created);
+      f->dump_stream("modified") << entry.second.modified;
+      f->dump_unsigned("ps_start", entry.second.start);
+      f->dump_unsigned("ps_end", entry.second.end);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("created_pools");  // plain list of pool ids
+    for (const auto poolid : created_pools) {
+      f->dump_unsigned("pool", poolid);
+    }
+    f->close_section();
+  }
+  static void generate_test_instances(std::list<creating_pgs_t*>& o) {
+    auto first = new creating_pgs_t;  // two creating pgs, two created pools
+    first->last_scan_epoch = 17;
+    first->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113}));
+    first->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113}));
+    first->created_pools = {0, 1};
+    o.push_back(first);
+    auto second = new creating_pgs_t;  // one creating pg, no created pools
+    second->last_scan_epoch = 18;
+    second->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113}));
+    second->created_pools = {};
+    o.push_back(second);
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info)
+WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info)
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t)
diff --git a/src/mon/ElectionLogic.cc b/src/mon/ElectionLogic.cc
new file mode 100644
index 000000000..e22a85bed
--- /dev/null
+++ b/src/mon/ElectionLogic.cc
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "ElectionLogic.h"
+
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, epoch, elector)
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, epoch_t epoch, ElectionOwner* elector) {
+  std::ostream& out = *_dout;  // the dout_prefix macro passes the stream by pointer
+  return out << "paxos." << elector->get_my_rank() << ").electionLogic(" << epoch << ") ";
+}
+void ElectionLogic::init()
+{
+  epoch = elector->read_persisted_epoch();
+  if (epoch == 0) {
+    ldout(cct, 1) << "init, first boot, initializing epoch at 1 " << dendl;
+    epoch = 1;
+  } else if (epoch % 2 == 0) {
+    ldout(cct, 1) << "init, last seen epoch " << epoch << dendl;
+  } else {
+    // an odd epoch is an election cycle, so we stopped mid-election:
+    // step past it and persist the new value
+    ldout(cct, 1) << "init, last seen epoch " << epoch << ", mid-election, bumping" << dendl;
+    elector->persist_epoch(++epoch);
+  }
+}
+
+void ElectionLogic::bump_epoch(epoch_t e)
+{
+  ldout(cct, 10) << __func__ << " to "  << e << dendl;
+  ceph_assert(epoch <= e);	// the epoch never moves backwards
+  epoch = e;
+  peer_tracker->increase_epoch(e);
+  elector->persist_epoch(epoch);
+  // any candidacy and the acks collected for it belonged to the old epoch
+  electing_me = false;
+  acked_me.clear();
+  elector->notify_bump_epoch();
+}
+
+void ElectionLogic::declare_standalone_victory()
+{
+  ceph_assert(elector->paxos_size() == 1 && elector->get_my_rank() == 0); // only valid for a lone monitor at rank 0
+  init();               // reload the persisted epoch
+  bump_epoch(epoch+1);  // then advance it by one
+}
+
+void ElectionLogic::clear_live_election_state()
+{
+  leader_acked = -1;		// no ack outstanding
+  electing_me = false;		// not a candidate
+  reset_stable_tracker();	// fresh copy of peer_tracker
+  leader_peer_tracker.reset();	// drop the tracker kept for the acked leader
+}
+
+void ElectionLogic::reset_stable_tracker()
+{
+  stable_peer_tracker.reset(new ConnectionTracker(*peer_tracker)); // replace with a copy of peer_tracker as it is now
+}
+
+void ElectionLogic::connectivity_bump_epoch_in_election(epoch_t mepoch)
+{
+  ldout(cct, 30) << __func__ << " to " << mepoch << dendl;
+  ceph_assert(mepoch > epoch);
+  bump_epoch(mepoch);
+  reset_stable_tracker();
+  // re-score on the refreshed tracker; withdraw our ack if we now beat that leader
+  const double my_score = connectivity_election_score(elector->get_my_rank());
+  const double lscore = connectivity_election_score(leader_acked);
+  if (lscore < my_score) {
+    leader_acked = -1;
+    leader_peer_tracker.reset();
+  }
+}
+
+void ElectionLogic::start()
+{
+  if (!participating) {
+    ldout(cct, 0) << "not starting new election -- not participating" << dendl;
+    return;
+  }
+  ldout(cct, 5) << "start -- can i be leader?" << dendl;
+
+  acked_me.clear();
+  init();
+  
+  // start by trying to elect me
+  if (epoch % 2 == 0) {
+    bump_epoch(epoch+1);  // odd == election cycle
+  } else {
+    elector->validate_store();
+  }
+  acked_me.insert(elector->get_my_rank());
+  // clear_live_election_state() already rebuilds stable_peer_tracker
+  clear_live_election_state();
+  electing_me = true;
+
+  bufferlist bl;
+  if (strategy == CONNECTIVITY) {
+    stable_peer_tracker->encode(bl);  // only this strategy sends our tracker along
+  }
+  elector->propose_to_peers(epoch, bl);
+  elector->_start();
+}
+
+void ElectionLogic::defer(int who)
+{
+  // sanity-check the choice: classic only ever defers to a lower rank,
+  // the other strategies must never defer to a disallowed leader
+  if (strategy != CLASSIC) {
+    ldout(cct, 5) << "defer to " << who << ", disallowed_leaders=" << elector->get_disallowed_leaders() << dendl;
+    ceph_assert(elector->get_disallowed_leaders().count(who) == 0);
+  } else {
+    ldout(cct, 5) << "defer to " << who << dendl;
+    ceph_assert(who < elector->get_my_rank());
+  }
+  if (electing_me) {
+    // withdraw our own candidacy
+    acked_me.clear();
+    electing_me = false;
+  }
+  // record the ack, then have the owner send it
+  leader_acked = who;
+  elector->_defer_to(who);
+}
+
+void ElectionLogic::end_election_period()
+{
+  ldout(cct, 5) << "election period ended" << dendl;
+  // did i win? only if i was a candidate and more than half acked me
+  const bool i_won = electing_me &&
+    acked_me.size() > (elector->paxos_size() / 2);
+  if (i_won) {
+    declare_victory();
+    return;
+  }
+  // whoever i deferred to didn't declare victory quickly enough.
+  if (!elector->ever_participated()) {
+    elector->reset_election();
+    return;
+  }
+  start();
+}
+
+
+void ElectionLogic::declare_victory()
+{
+  ldout(cct, 5) << "I win! acked_me=" << acked_me << dendl;
+  last_election_winner = elector->get_my_rank();
+  last_voted_for = last_election_winner;
+  clear_live_election_state();
+  // take the ackers out of acked_me before bump_epoch() below clears it
+  set<int> new_quorum;
+  new_quorum.swap(acked_me);
+  
+  ceph_assert(epoch % 2 == 1);  // election
+  bump_epoch(epoch+1);     // is over!
+
+  elector->message_victory(new_quorum);
+}
+
+bool ElectionLogic::propose_classic_prefix(int from, epoch_t mepoch)
+{
+  if (mepoch > epoch) {
+    bump_epoch(mepoch);
+    return false;
+  }
+  if (mepoch == epoch)
+    return false;
+  // got an "old" propose; it is fully handled here, so the caller stops
+  const bool stable = (epoch % 2 == 0);  // in a non-election cycle
+  if (stable && !elector->is_current_member(from)) {  // from outside the quorum
+    ldout(cct, 5) << " got propose from old epoch, "
+	      << from << " must have just started" << dendl;
+    // call a new election so they can rejoin; we may be active, the owner resets the monitor
+    elector->trigger_new_election();
+  } else {
+    ldout(cct, 5) << " ignoring old propose" << dendl;
+  }
+  return true;
+}
+
+void ElectionLogic::receive_propose(int from, epoch_t mepoch,
+				    const ConnectionTracker *ct)
+{
+  ldout(cct, 20) << __func__ << " from " << from << dendl;
+  if (from == elector->get_my_rank()) { // a propose from ourselves is dropped
+    lderr(cct) << "I got a propose from my own rank, hopefully this is startup weirdness,dropping" << dendl;
+    return;
+  }
+  switch (strategy) { // hand off to the handler for the active strategy
+  case CLASSIC:
+    propose_classic_handler(from, mepoch);
+    break;
+  case DISALLOW:
+    propose_disallow_handler(from, mepoch);
+    break;
+  case CONNECTIVITY:
+    propose_connectivity_handler(from, mepoch, ct); // the only handler that uses ct
+    break;
+  default:
+    ceph_assert(0 == "how did election strategy become an invalid value?");
+  }
+}
+
+// DISALLOW strategy: rank decides as in classic, except that a monitor
+// listed in disallowed_leaders is never preferred as leader.
+void ElectionLogic::propose_disallow_handler(int from, epoch_t mepoch)
+{
+  if (propose_classic_prefix(from, mepoch)) {
+    return;
+  }
+  const set<int>& disallowed = elector->get_disallowed_leaders();
+  const int my_rank = elector->get_my_rank();
+  const bool me_disallowed = disallowed.count(my_rank) > 0;
+  const bool from_disallowed = disallowed.count(from) > 0;
+  // we beat them if we may lead and either outrank them or they may not lead
+  const bool my_win = !me_disallowed && (my_rank < from || from_disallowed);
+  // they beat us if they may lead, are a better choice than us, and are a
+  // better choice than (or are) our previously-acked choice
+  const bool their_win = !from_disallowed &&
+    (my_rank > from || me_disallowed) &&
+    (leader_acked < 0 || leader_acked >= from);
+
+  // i would win over them.
+  if (my_win && leader_acked >= 0) {
+    // we already acked someone, and they still win, of course
+    ceph_assert(leader_acked < from || from_disallowed);
+    ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+  } else if (my_win) {
+    // wait, i should win!
+    if (!electing_me) {
+      elector->trigger_new_election();
+    }
+  } else if (their_win) {
+    // they would win over me
+    defer(from);
+  } else {
+    // ignore them!
+    ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+  }
+}
+
+void ElectionLogic::propose_classic_handler(int from, epoch_t mepoch)
+{
+  if (propose_classic_prefix(from, mepoch)) {
+    return;
+  }
+  // CLASSIC strategy: the lower rank wins
+  const bool i_outrank_them = elector->get_my_rank() < from;
+  if (!i_outrank_them) {
+    // they would win over me; defer if we haven't acked anyone yet, or
+    // they would win over (or already are) the one we did ack
+    if (leader_acked < 0 || leader_acked >= from) {
+      defer(from);
+    } else {
+      // ignore them!
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    }
+    return;
+  }
+  // i would win over them.
+  if (leader_acked >= 0) {
+    // we already acked someone, and they still win, of course
+    ceph_assert(leader_acked < from);
+    ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+  } else if (!electing_me) {
+    // wait, i should win!
+    elector->trigger_new_election();
+  }
+}
+
+double ElectionLogic::connectivity_election_score(int rank)
+{
+  ldout(cct, 30) << __func__ << " of " << rank << dendl;
+  if (elector->get_disallowed_leaders().count(rank) != 0) {
+    return -1;  // disallowed leaders get a fixed score of -1
+  }
+  double score;
+  int liveness;  // filled in by the tracker but not used here
+  if (!stable_peer_tracker) {
+    ldout(cct, 30) << "stable_peer_tracker does not exists, using peer_tracker ..." << dendl;
+    peer_tracker->get_total_connection_score(rank, &score, &liveness);
+  } else {
+    ldout(cct, 30) << "stable_peer_tracker exists so using that ..." << dendl;
+    stable_peer_tracker->get_total_connection_score(rank, &score, &liveness);
+  }
+  return score;
+}
+
+void ElectionLogic::propose_connectivity_handler(int from, epoch_t mepoch,
+						 const ConnectionTracker *ct)
+{
+  ldout(cct, 10) << __func__ << " from " << from << " mepoch: "
+    << mepoch << " epoch: " << epoch << dendl;
+  ldout(cct, 30) << "last_election_winner: " << last_election_winner << dendl;
+  if ((epoch % 2 == 0) &&
+      last_election_winner != elector->get_my_rank() &&
+      !elector->is_current_member(from)) {
+    // To prevent election flapping, peons ignore proposals from out-of-quorum
+    // peers unless their vote would materially change from the last election
+    ldout(cct, 30) << "Lets see if this out-of-quorum peer is worth it " << dendl;
+    int best_scorer = 0;
+    double best_score = 0;
+    double last_voted_for_score = 0;
+    ldout(cct, 30) << "elector->paxos_size(): " << elector->paxos_size() << dendl;
+    for (unsigned i = 0; i < elector->paxos_size(); ++i) {
+      double score = connectivity_election_score(i);
+      if (score > best_score) {
+	best_scorer = i;
+	best_score = score;
+      }
+      if (last_voted_for >= 0 && i == static_cast<unsigned>(last_voted_for)) {
+	last_voted_for_score = score;
+      }
+    }
+    ldout(cct, 30) << "best_scorer: " << best_scorer << " best_score: " << best_score
+      << " last_voted_for: " << last_voted_for << " last_voted_for_score: " 
+      << last_voted_for_score << dendl;
+    if (best_scorer == last_voted_for ||
+	(best_score - last_voted_for_score < ignore_propose_margin)) {
+      // drop this message; it won't change our vote so we defer to leader
+      ldout(cct, 30) << "drop this message; it won't change our vote so we defer to leader " << dendl;
+      return;
+    }
+  }
+  if (mepoch > epoch) {
+    ldout(cct, 20) << "mepoch > epoch" << dendl;
+    connectivity_bump_epoch_in_election(mepoch);
+  } else if (mepoch < epoch) {
+    // got an "old" propose,
+    if (epoch % 2 == 0 &&    // in a non-election cycle
+	!elector->is_current_member(from)) {  // from someone outside the quorum
+      // a mon just started up, call a new election so they can rejoin!
+      ldout(cct, 5) << " got propose from old epoch, "
+	      << from << " must have just started" << dendl;
+      ldout(cct, 10) << "triggering new election" << dendl;
+      // we may be active; make sure we reset things in the monitor appropriately.
+      elector->trigger_new_election();
+    } else {
+      ldout(cct, 5) << " ignoring old propose" << dendl;
+    }
+    return;
+  }
+  // epochs match from here on: score ourselves, the proposer and any acked leader
+  int my_rank = elector->get_my_rank();
+  double my_score = connectivity_election_score(my_rank);
+  double from_score = connectivity_election_score(from);
+  double leader_score = -1;
+  if (leader_acked >= 0) {
+    leader_score = connectivity_election_score(leader_acked);
+  }
+
+  ldout(cct, 20) << "propose from rank=" << from << ", tracker: "
+		 << (stable_peer_tracker ? *stable_peer_tracker : *peer_tracker) << dendl;
+
+  ldout(cct, 10) << "propose from rank=" << from << ",from_score=" << from_score
+		 << "; my score=" << my_score
+		 << "; currently acked " << leader_acked
+		 << ",leader_score=" << leader_score << dendl;
+
+  bool my_win = (my_score >= 0) && // My score is non-negative (-1 means disallowed); I am allowed to lead
+    ((my_rank < from && my_score >= from_score) || // We have same scores and I have lower rank, or
+     (my_score > from_score)); // my score is higher
+  
+  bool their_win = (from_score >= 0) && // Their score is non-negative; they're allowed to lead, AND
+    ((from < my_rank && from_score >= my_score) || // Either they have lower rank and same score, or
+     (from_score > my_score)) && // their score is higher, AND
+    ((from <= leader_acked && from_score >= leader_score) || // same conditions compared to leader, or IS leader
+     (from_score > leader_score));
+
+  if (my_win) {
+    ldout(cct, 10) << " conditionally I win" << dendl;
+    // i would win over them.
+    if (leader_acked >= 0) {        // we already acked someone
+      ceph_assert(leader_score >= from_score);  // and they still win, of course
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    } else {
+      // wait, i should win!
+      if (!electing_me) {
+      ldout(cct, 10) << " wait, i should win! triggering new election ..." << dendl;
+	elector->trigger_new_election();
+      }
+    }
+  } else {
+    ldout(cct, 10) << " conditionally they win" << dendl;
+    // they would win over me
+    if (their_win || from == leader_acked) {
+      if (leader_acked >= 0 && from != leader_acked) {
+	// we have to make sure our acked leader will ALSO defer to them, or else
+	// we can't, to maintain guarantees!
+  ldout(cct, 10) << " make sure acked leader defer to: " << from << dendl;
+	double leader_from_score;
+	int leader_from_liveness;
+	leader_peer_tracker->
+	  get_total_connection_score(from, &leader_from_score,
+				     &leader_from_liveness);
+	double leader_leader_score;
+	int leader_leader_liveness;
+	leader_peer_tracker->
+	  get_total_connection_score(leader_acked, &leader_leader_score,
+				     &leader_leader_liveness);
+	if ((from < leader_acked && leader_from_score >= leader_leader_score) ||
+	    (leader_from_score > leader_leader_score)) {
+    ldout(cct, 10) << "defering to " << from << dendl;
+	  defer(from);
+	  leader_peer_tracker.reset(new ConnectionTracker(*ct)); // keep a copy of the new leader's tracker
+	} else { // we can't defer to them *this* round even though they should win...
+	  double cur_leader_score, cur_from_score;
+	  int cur_leader_live, cur_from_live;
+	  peer_tracker->get_total_connection_score(leader_acked, &cur_leader_score, &cur_leader_live);
+	  peer_tracker->get_total_connection_score(from, &cur_from_score, &cur_from_live);
+	  if ((from < leader_acked && cur_from_score >= cur_leader_score) ||
+	      (cur_from_score > cur_leader_score)) {
+	    ldout(cct, 5) << "Bumping epoch and starting new election; acked "
+			  << leader_acked << " should defer to " << from
+			  << " but there is score disagreement!" << dendl;
+	    bump_epoch(epoch+1);
+	    start();
+	  } else {
+	    ldout(cct, 5) << "no, we already acked " << leader_acked
+			  << " and it won't defer to " << from
+			  << " despite better round scores" << dendl;
+	  }
+	}
+      } else {
+  ldout(cct, 10) << "defering to " << from << dendl;
+	defer(from);
+	leader_peer_tracker.reset(new ConnectionTracker(*ct)); // keep a copy of the tracker sent with the propose
+      }
+    } else {
+      // ignore them!
+      ldout(cct, 5) << "no, we already acked " << leader_acked << " with score >=" << from_score << dendl;
+    }
+  }
+}
+
+void ElectionLogic::receive_ack(int from, epoch_t from_epoch)
+{
+  ceph_assert(from_epoch % 2 == 1); // sender in an election epoch
+  if (from_epoch > epoch) {
+    ldout(cct, 5) << "woah, that's a newer epoch, i must have rebooted.  bumping and re-starting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return;
+  }
+  if (!electing_me) {
+    // ignore, i'm deferring already.
+    ceph_assert(leader_acked >= 0);
+    return;
+  }
+  // still a candidate: count this ack
+  acked_me.insert(from);
+  // is that _everyone_? if yes, shortcut to election finish
+  if (acked_me.size() == elector->paxos_size()) {
+    declare_victory();
+  }
+}
+
+bool ElectionLogic::victory_makes_sense(int from)
+{
+  // would `from` legitimately have beaten us under the strategy
+  // that is currently in effect?
+  switch (strategy) {
+  case CLASSIC:
+    return from < elector->get_my_rank();
+  case DISALLOW:
+    // a lower rank wins, and so does anyone when we are disallowed ourselves
+    return (from < elector->get_my_rank()) ||
+      elector->get_disallowed_leaders().count(elector->get_my_rank());
+  case CONNECTIVITY: {
+    // the claimed leader must score at least as well as we do
+    const double my_score = connectivity_election_score(elector->get_my_rank());
+    const double leader_score = connectivity_election_score(from);
+    ldout(cct, 5) << "victory from " << from << " makes sense? lscore:"
+		  << leader_score
+		  << "; my score:" << my_score << dendl;
+    return leader_score >= my_score;
+  }
+  default:
+    ceph_assert(0 == "how did you get a nonsense election strategy assigned?");
+  }
+  // only reached if the assert above returns
+  return false;
+}
+
+bool ElectionLogic::receive_victory_claim(int from, epoch_t from_epoch)
+{
+  bool election_okay = victory_makes_sense(from); // judged before the state below is cleared
+  // record the outcome; leader_acked must be read before it is reset to -1
+  last_election_winner = from;
+  last_voted_for = leader_acked;
+  clear_live_election_state();
+
+  if (!election_okay) {
+    ceph_assert(strategy == CONNECTIVITY); // the other strategies are not expected to get here
+    ldout(cct, 1) << "I should have been elected over this leader; bumping and restarting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return false;
+  }
+
+  // i should have seen this election if i'm getting the victory.
+  if (from_epoch != epoch + 1) { 
+    ldout(cct, 5) << "woah, that's a funny epoch, i must have rebooted.  bumping and re-starting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return false;
+  }
+
+  bump_epoch(from_epoch);
+
+  // they win
+  return true;
+}
diff --git a/src/mon/ElectionLogic.h b/src/mon/ElectionLogic.h
new file mode 100644
index 000000000..65c727ca1
--- /dev/null
+++ b/src/mon/ElectionLogic.h
@@ -0,0 +1,459 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_ELECTIONLOGIC_H
+#define CEPH_ELECTIONLOGIC_H
+
+#include <map>
+#include "include/types.h"
+#include "ConnectionTracker.h"
+
+/**
+ * Interface through which ElectionLogic persists state, queries its
+ * environment and sends messages. All members are pure virtual; the
+ * implementer supplies storage, messaging and timing.
+ */
+class ElectionOwner {
+public:
+  /**
+   * Write down the given epoch in persistent storage, such that it
+   * can later be retrieved by read_persisted_epoch even across process
+   * or machine restarts.
+   *
+   * @param e The epoch to write
+   */
+  virtual void persist_epoch(epoch_t e) = 0;
+  /**
+   * Retrieve the most-previously-persisted epoch.
+   *
+   * @returns The latest epoch passed to persist_epoch()
+   */
+  virtual epoch_t read_persisted_epoch() const = 0;
+  /**
+   * Validate that the persistent store is working by committing
+   * to it. (There is no interface for retrieving the value; this
+   * tests local functionality before doing things like triggering
+   * elections to try and join a quorum.)
+   */
+  virtual void validate_store() = 0;
+  /**
+   * Notify the ElectionOwner that ElectionLogic has increased its
+   * election epoch. This resets an election (either on local loss or victory,
+   * or when trying a new election round) and the ElectionOwner
+   * should reset any tracking of its own to match. (The ElectionLogic
+   * will further trigger sending election messages if that is
+   * appropriate.)
+   */
+  virtual void notify_bump_epoch() = 0;
+  /**
+   * Notify the ElectionOwner we must start a new election.
+   */
+  virtual void trigger_new_election() = 0;
+  /**
+   * Retrieve this Paxos instance's rank.
+   */
+  virtual int get_my_rank() const = 0;
+  /**
+   * Send a PROPOSE message to all our peers. This happens when
+   * we have started a new election (which may mean attempting to
+   * override a current one).
+   *
+   * @param e The election epoch of our proposal.
+   * @param bl A bufferlist containing data the logic wishes to share
+   */
+  virtual void propose_to_peers(epoch_t e, bufferlist& bl) = 0;
+  /**
+   * The election has failed and we aren't sure what the state of the
+   * quorum is, so reset the entire system as if from scratch.
+   */
+  virtual void reset_election() = 0;
+  /**
+   * Ask the ElectionOwner if we-the-Monitor have ever participated in the
+   * quorum (including across process restarts!).
+   *
+   * @returns true if we have participated, false otherwise
+   */
+  virtual bool ever_participated() const = 0;
+  /**
+   * Ask the ElectionOwner for the size of the Paxos set. This includes
+   * those monitors which may not be in the current quorum!
+   * The value returned by this function can change between elections,
+   * but not during them. (In practical terms, it can be updated
+   * by making a paxos commit, but not by injecting values while
+   * an election is ongoing.)
+   */
+  virtual unsigned paxos_size() const = 0;
+  /**
+   * Retrieve a set of ranks which are not allowed to become the leader.
+   * Like paxos_size(), this set can change between elections, but not
+   * during them.
+   */
+  // Fully qualified so this header does not depend on a using-declaration
+  // for `set` being visible wherever it is included.
+  virtual const std::set<int>& get_disallowed_leaders() const = 0;
+  /**
+   * Tell the ElectionOwner we have started a new election.
+   *
+   * The ElectionOwner is responsible for timing out the election (by invoking
+   * end_election_period()) if it takes too long (as defined by the ElectionOwner).
+   * This function is the opportunity to do that and to clean up any other external
+   * election state it may be maintaining.
+   */
+  virtual void _start() = 0;
+  /**
+   * Tell the ElectionOwner to defer to the identified peer. Tell that peer
+   * we have deferred to it.
+   *
+   * @post  we sent an ack message to @p who
+   */
+  virtual void _defer_to(int who) = 0;
+  /**
+   * We have won an election, so have the ElectionOwner message that to
+   * our new quorum!
+   *
+   * @param quorum The ranks of our peers which deferred to us and
+   *        must be told of our victory
+   */
+  virtual void message_victory(const std::set<int>& quorum) = 0;
+  /**
+   * Query the ElectionOwner about if a given rank is in the
+   * currently active quorum.
+   * @param rank the Paxos rank whose status we are checking
+   * @returns true if the rank is in our current quorum, false otherwise.
+   */
+  virtual bool is_current_member(int rank) const = 0;
+  /**
+   * Virtual so that implementations can be destroyed through an
+   * ElectionOwner pointer.
+   */
+  virtual ~ElectionOwner() = default;
+};
+
+/**
+ * This class maintains local state for running an election
+ * between Paxos instances. It receives input requests
+ * and calls back out to its ElectionOwner to do persistence
+ * and message other entities.
+ */
+
+class ElectionLogic {
+  /// The owner we call back into for persistence, messaging and timing.
+  ElectionOwner *elector;
+  /// Connection tracker handed to the constructor; not owned by us.
+  ConnectionTracker *peer_tracker;
+  
+  CephContext *cct;
+  /**
+   * Latest epoch we've seen.
+   *
+   * @remarks if its value is odd, we're electing; if it's even, then we're
+   *	      stable.
+   */
+  epoch_t epoch = 0;
+  /**
+   * The last rank which won an election we participated in
+   */
+  int last_election_winner = -1;
+  /**
+   * Only used in the connectivity handler.
+   * The rank we voted for in the last election we voted in.
+   */
+  int last_voted_for = -1;
+  /// Margin supplied to the constructor (@c ipm); the in-class value is
+  /// always overwritten by it.
+  double ignore_propose_margin = 0.0001;
+  /**
+   * Only used in the connectivity handler.
+   * Points at a stable copy of the peer_tracker we use to keep scores
+   * throughout an election period.
+   */
+  std::unique_ptr<ConnectionTracker> stable_peer_tracker;
+  std::unique_ptr<ConnectionTracker> leader_peer_tracker;
+  /**
+   * Indicates who we have acked
+   */
+  int leader_acked;
+  
+public:
+  enum election_strategy {
+			  // Keep in sync with MonMap.h!
+    CLASSIC = 1, // the original rank-based one
+    DISALLOW = 2, // disallow a set from being leader
+    CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections
+  };
+  election_strategy strategy;
+    
+  /**
+   * Indicates if we are participating in the quorum.
+   *
+   * @remarks By default, we are created as participating. We may stop
+   *	      participating if something explicitly sets our value
+   *	      false, though. If that happens, it will
+   *	      have to set participating=true and invoke start() for us to resume
+   *	      participating in the quorum.
+   */
+  bool participating;
+  /**
+   * Indicates if we are the ones being elected.
+   *
+   * We always attempt to be the one being elected if we are the ones starting
+   * the election. If we are not the ones that started it, we will only attempt
+   * to be elected if we think we might have a chance (i.e., the other guy's
+   * rank is lower than ours).
+   */
+  bool electing_me;
+  /**
+   * Set containing all those that acked our proposal to become the Leader.
+   *
+   * If we are acked by ElectionOwner::paxos_size() peers, we will declare
+   * victory.
+   */
+  std::set<int> acked_me;
+
+  /**
+   * Construct the election logic.
+   *
+   * We start out participating, not electing ourselves, with no leader
+   * acked and no previous winner or vote recorded (all -1).
+   *
+   * @param e   The ElectionOwner we call back into
+   * @param es  The election strategy to start with
+   * @param t   The ConnectionTracker to use; stored as a raw pointer
+   * @param ipm Value for ignore_propose_margin
+   * @param c   The CephContext used for logging
+   */
+  ElectionLogic(ElectionOwner *e, election_strategy es, ConnectionTracker *t,
+		double ipm,
+		CephContext *c) : elector(e), peer_tracker(t), cct(c),
+				  last_election_winner(-1), last_voted_for(-1),
+				  ignore_propose_margin(ipm),
+				  stable_peer_tracker(),
+				  leader_peer_tracker(),
+				  leader_acked(-1),
+				  strategy(es),
+				  participating(true),
+				  electing_me(false) {}
+  /**
+   * Set the election strategy to use. If this is not consistent across the
+   * electing cluster, you're going to have a bad time.
+   * The initial strategy is whatever was passed to the constructor.
+   */
+  void set_election_strategy(election_strategy es) {
+    strategy = es;
+  }
+  /**
+   * If there are no other peers in this Paxos group, ElectionOwner
+   * can simply declare victory and we will make it so.
+   *
+   * @pre paxos_size() is 1
+   * @pre get_my_rank is 0
+   */
+  void declare_standalone_victory();
+  /**
+   * Start a new election by proposing ourselves as the new Leader.
+   *
+   * Basically, send propose messages to all the peers.
+   *
+   * @pre   participating is true
+   * @post  epoch is an odd value
+   * @post  electing_me is true
+   * @post  We have invoked propose_to_peers() on our ElectionOwner
+   * @post  We have invoked _start() on our ElectionOwner
+   */
+  void start();
+  /**
+   * ElectionOwner has decided the election has taken too long and expired.
+   *
+   * This will happen when no one declared victory or started a new election
+   * during the allowed time span.
+   *
+   * When the election expires, we will check if we were the ones who won, and
+   * if so we will declare victory. If that is not the case, then we assume
+   * that the one we deferred to didn't declare victory quickly enough (in fact,
+   * as far as we know, it may even be dead); so, just propose ourselves as the
+   * Leader.
+   */
+  void end_election_period();
+  /**
+   * Handle a proposal from some other node asking to become
+   * the Leader.
+   *
+   * If the message appears to be old (i.e., its epoch is lower than our epoch),
+   * then we may take one of two actions:
+   *
+   *  @li Ignore it because it's nothing more than an old proposal
+   *  @li Start new elections if we verify that it was sent by a monitor from
+   *	  outside the quorum; given its old state, it's fair to assume it just
+   *	  started, so we should start new elections so it may rejoin. (Some
+   *      handlers may choose to ignore even these, if they think it's flapping.)
+   *
+   * We pass the propose off to a propose_*_handler function based
+   * on the election strategy we're using.
+   * Only the Connectivity strategy cares about the ConnectionTracker; the
+   * classic and disallow handlers are not handed it at all. We never take
+   * ownership of @p ct (see the parameter notes below).
+   *
+   * @pre   Message epoch is from the current or a newer epoch
+   * @param mepoch The epoch of the proposal
+   * @param from The rank proposing itself as leader
+   * @param ct Any incoming ConnectionTracker data sent with the message;
+   *  may be NULL.
+   *  Callers are responsible for deleting this -- we will copy it if we want
+   *  to keep the data.
+   */
+  void receive_propose(int from, epoch_t mepoch, const ConnectionTracker *ct);
+  /**
+   * Handle a message from some other participant Acking us as the Leader.
+   *
+   * When we receive such a message, one of three things may be happening:
+   *  @li We received a message with a newer epoch, which means we must have
+   *	  somehow lost track of what was going on (maybe we rebooted), thus we
+   *	  will start a new election
+   *  @li We consider ourselves in the run for the Leader (i.e., @p electing_me
+   *	  is true), and we are actually being Acked by someone; thus simply add
+   *	  the one acking us to the @p acked_me set. If we do now have acks from
+   *	  all the participants, then we can declare victory
+   *  @li We already deferred the election to somebody else, so we will just
+   *	  ignore this message
+   *
+   * @pre   Message epoch is from the current or a newer epoch
+   * @post  Election is on-going if we deferred to somebody else
+   * @post  Election is on-going if we are still waiting for further Acks
+   * @post  Election is not on-going if we are victorious
+   * @post  Election is not on-going if we must start a new one
+   *
+   * @param from The rank which acked us
+   * @param from_epoch The election epoch the ack belongs to
+   */
+  void receive_ack(int from, epoch_t from_epoch);
+  /**
+   * Handle a message from some other participant declaring Victory.
+   *
+   * We just got a message from someone declaring themselves Victorious, thus
+   * the new Leader.
+   *
+   * If the claim does not make sense under our election strategy, or if the
+   * message's epoch happens to be different from our epoch+1 (which means we
+   * lost track of something), we bump our epoch to the one in the message
+   * and invoke start() to begin a new election.
+   *
+   * If that is not the case, then we will simply update our epoch to the one
+   * in the message and accept the claim.
+   *
+   * @pre   from_epoch is the current or a newer epoch
+   * @post  Updated @p epoch
+   * @post  Election is not on-going if we accepted the claim
+   * @post  We are a peon in a new quorum if we lost the election
+   *
+   * @param from The victory-claiming rank
+   * @param from_epoch The election epoch in which they claim victory
+   * @returns true if we accepted the claim, false if we rejected it and
+   *          started a new election instead
+   */
+  bool receive_victory_claim(int from, epoch_t from_epoch);
+  /**
+   * Obtain our epoch
+   *
+   * @returns Our current epoch number
+   */
+  epoch_t get_epoch() const { return epoch; }
+  /**
+   * Obtain the rank recorded as the winner of the last election we
+   * participated in (-1 if none has been recorded yet).
+   */
+  int get_election_winner() { return last_election_winner; }
+
+private:
+  /**
+   * Initiate the ElectionLogic class.
+   *
+   * Basically, we will simply read whatever epoch value we have in our stable
+   * storage, or consider it to be 1 if none is read.
+   *
+   * @post @p epoch is set to 1 or higher.
+   */
+  void init();
+  /**
+   * Update our epoch.
+   *
+   * If we come across a higher epoch, we simply update ours, also making
+   * sure we are no longer being elected (even though we could have been,
+   * we no longer are since we no longer are on that old epoch).
+   *
+   * @pre Our epoch is not larger than @p e
+   * @post Our epoch equals @p e
+   *
+   * @param e Epoch to which we will update our epoch
+   */
+  void bump_epoch(epoch_t e);
+  /**
+   * If the incoming proposal is newer, bump our own epoch; if
+   * it comes from an out-of-quorum peer, trigger a new election.
+   * @returns true if you should drop this proposal, false otherwise.
+   */
+  bool propose_classic_prefix(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using the classic strategy.
+   * We will take one of the following actions:
+   *
+   *  @li Ignore it because we already acked another node with higher rank
+   *  @li Ignore it and start a new election because we outrank it
+   *  @li Defer to it because it outranks us and the node we previously
+   *	  acked, if any
+   */
+  void propose_classic_handler(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using our disallow strategy.
+   * This is the same as the classic strategy except we also disallow
+   * certain ranks from becoming the leader.
+   */
+  void propose_disallow_handler(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using the connectivity strategy.
+   * We will choose to defer or not based on the ordered criteria:
+   *
+   * @li Whether the other monitor (or ourself) is on the disallow list
+   * @li Whether the other monitor or ourself has the most connectivity to peers
+   * @li Whether the other monitor or ourself has the lower rank
+   */
+  void propose_connectivity_handler(int from, epoch_t mepoch, const ConnectionTracker *ct);
+  /**
+   * Helper function for connectivity handler. Combines the disallowed list
+   * with ConnectionTracker scores.
+   */
+  double connectivity_election_score(int rank);
+  /**
+   * Defer the current election to some other monitor.
+   *
+   * This means that we will ack some other monitor and drop out from the run
+   * to become the Leader. We will only defer an election if the monitor we
+   * are deferring to outranks us.
+   *
+   * @pre   @p who outranks us (i.e., who < our rank)
+   * @pre   @p who outranks any other monitor we have deferred to in the past
+   * @post  electing_me is false
+   * @post  leader_acked equals @p who
+   * @post  we triggered ElectionOwner's _defer_to() on @p who
+   *
+   * @param who Some other monitor's numeric identifier.
+   */
+  void defer(int who);
+  /**
+   * Declare Victory.
+   *
+   * We won. Or at least we believe we won, but for all intents and purposes
+   * that does not matter. What matters is that we Won.
+   *
+   * That said, we must now bump our epoch to reflect that the election is over
+   * and then we must let everybody in the quorum know we are their brand new
+   * Leader.
+   *
+   * Actually, the quorum will be now defined as the group of monitors that
+   * acked us during the election process.
+   *
+   * @pre   Election is on-going
+   * @pre   electing_me is true
+   * @post  electing_me is false
+   * @post  epoch is bumped up into an even value
+   * @post  Election is not on-going
+   * @post  We have a quorum, composed of the monitors that acked us
+   * @post  We invoked message_victory() on the ElectionOwner
+   */
+  void declare_victory();
+  /**
+   * This is just a helper function to validate that the victory claim we
+   * get from another rank makes any sense.
+   */
+  bool victory_makes_sense(int from);
+  /**
+   * Reset some data members which we only care about while we are in an election
+   * or need to be set consistently during stable states.
+   */
+  void clear_live_election_state();
+  void reset_stable_tracker();
+  /**
+   * Only for the connectivity handler, Bump the epoch
+   * when we get a message from a newer one and clear
+   * out leader and stable tracker
+   * data so that we can switch our allegiance.
+   */
+  void connectivity_bump_epoch_in_election(epoch_t mepoch);
+};
+
+#endif
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
new file mode 100644
index 000000000..671c08d85
--- /dev/null
+++ b/src/mon/Elector.cc
@@ -0,0 +1,807 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "Elector.h"
+#include "Monitor.h"
+
+#include "common/Timer.h"
+#include "MonitorDBStore.h"
+#include "messages/MMonElection.h"
+#include "messages/MMonPing.h"
+
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_epoch())
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+/**
+ * Write the log-line prefix used by dout in this file: the monitor name,
+ * rank and state name followed by the elector's epoch.
+ *
+ * @returns the stream that @p _dout points at
+ */
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, epoch_t epoch) {
+  ostream& out = *_dout;
+  out << "mon." << mon->name << "@" << mon->rank;
+  out << "(" << mon->get_state_name();
+  out << ").elector(" << epoch << ") ";
+  return out;
+}
+
+/**
+ * Build the Elector for monitor @p m.
+ *
+ * The election logic, connection tracker and ping settings are configured
+ * from the monitor's config values; afterwards any "connectivity_scores"
+ * blob found in the monitor store is decoded into the connection tracker.
+ *
+ * @param m        The monitor this elector belongs to
+ * @param strategy Numeric election strategy, cast to
+ *                 ElectionLogic::election_strategy
+ */
+Elector::Elector(Monitor *m, int strategy)
+  : logic(this,
+	  static_cast<ElectionLogic::election_strategy>(strategy),
+	  &peer_tracker,
+	  m->cct->_conf.get_val<double>("mon_elector_ignore_propose_margin"),
+	  m->cct),
+    peer_tracker(this, m->rank,
+		 m->cct->_conf.get_val<uint64_t>("mon_con_tracker_score_halflife"),
+		 m->cct->_conf.get_val<uint64_t>("mon_con_tracker_persist_interval"),
+		 m->cct),
+    ping_timeout(m->cct->_conf.get_val<double>("mon_elector_ping_timeout")),
+    PING_DIVISOR(m->cct->_conf.get_val<uint64_t>("mon_elector_ping_divisor")),
+    mon(m), elector(this)
+{
+  // Restore previously persisted connectivity scores, if there are any.
+  bufferlist stored_scores;
+  mon->store->get(Monitor::MONITOR_NAME, "connectivity_scores", stored_scores);
+  if (stored_scores.length() > 0) {
+    auto bi = stored_scores.cbegin();
+    peer_tracker.decode(bi);
+  }
+}
+
+
+/**
+ * Store election epoch @p e in the monitor store. The current encoded
+ * connectivity scores are written in the same transaction.
+ */
+void Elector::persist_epoch(epoch_t e)
+{
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(Monitor::MONITOR_NAME, "election_epoch", e);
+  txn->put(Monitor::MONITOR_NAME, "connectivity_scores",
+	   peer_tracker.get_encoded_bl());
+  mon->store->apply_transaction(txn);
+}
+
+/**
+ * Write the connection tracker's encoded scores to the monitor store
+ * under the "connectivity_scores" key.
+ */
+void Elector::persist_connectivity_scores()
+{
+  dout(20) << __func__ << dendl;
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(Monitor::MONITOR_NAME, "connectivity_scores",
+	   peer_tracker.get_encoded_bl());
+  mon->store->apply_transaction(txn);
+}
+
+/**
+ * Read back the value stored under the "election_epoch" key of the
+ * monitor store.
+ *
+ * @returns the stored value, converted to epoch_t
+ */
+epoch_t Elector::read_persisted_epoch() const
+{
+  const epoch_t persisted =
+    mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
+  return persisted;
+}
+
+/**
+ * Check that the monitor store accepts writes by committing a random
+ * value under "election_writeable_test". Asserts if the commit fails.
+ */
+void Elector::validate_store()
+{
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  txn->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
+  // a negative result means the transaction could not be applied
+  const int r = mon->store->apply_transaction(txn);
+  ceph_assert(r >= 0);
+}
+
+/**
+ * Report whether @p rank is in the monitor's current quorum set.
+ */
+bool Elector::is_current_member(int rank) const
+{
+  return mon->quorum.count(rank) != 0;
+}
+
+/**
+ * ElectionOwner hook: a new election is required, so ask the monitor
+ * to start one.
+ */
+void Elector::trigger_new_election()
+{
+  mon->start_election();
+}
+
+/**
+ * ElectionOwner hook: our rank is the rank of the monitor we belong to.
+ */
+int Elector::get_my_rank() const
+{
+  return mon->rank;
+}
+
+/**
+ * ElectionOwner hook: the election failed and the quorum state is
+ * unknown, so have the monitor bootstrap again.
+ */
+void Elector::reset_election()
+{
+  mon->bootstrap();
+}
+
+/**
+ * ElectionOwner hook: report the monitor's has_ever_joined flag.
+ */
+bool Elector::ever_participated() const
+{
+  return mon->has_ever_joined;
+}
+
+/**
+ * ElectionOwner hook: the Paxos set size is the number of monitors in
+ * the current monmap.
+ */
+unsigned Elector::paxos_size() const
+{
+  return static_cast<unsigned>(mon->monmap->size());
+}
+
+/**
+ * Shut the elector down; the only thing to clean up here is the pending
+ * election expiry timer.
+ */
+void Elector::shutdown()
+{
+  cancel_timer();
+}
+
+/**
+ * ElectionOwner hook: the logic increased its election epoch, so tell
+ * the monitor to join the election.
+ */
+void Elector::notify_bump_epoch()
+{
+  mon->join_election();
+}
+
+/**
+ * Send an OP_PROPOSE message for epoch @p e to every rank in the monmap
+ * other than our own.
+ *
+ * @param e        The election epoch of the proposal
+ * @param logic_bl Data from the election logic, attached to each message
+ *                 as its sharing_bl
+ */
+void Elector::propose_to_peers(epoch_t e, bufferlist& logic_bl)
+{
+  for (unsigned peer = 0; peer < mon->monmap->size(); ++peer) {
+    // everyone but ourselves gets a proposal
+    if (static_cast<int>(peer) != mon->rank) {
+      MMonElection *proposal =
+	new MMonElection(MMonElection::OP_PROPOSE, e,
+			 peer_tracker.get_encoded_bl(),
+			 logic.strategy, mon->monmap);
+      proposal->sharing_bl = logic_bl;
+      proposal->mon_features = ceph::features::mon::get_supported();
+      proposal->mon_release = ceph_release();
+      mon->send_mon_message(proposal, peer);
+    }
+  }
+}
+
+/**
+ * ElectionOwner hook: a new election has started.
+ *
+ * Drops all collected peer info, records our own features, release and
+ * metadata under our rank, and (re)arms the election expiry timer.
+ */
+void Elector::_start()
+{
+  peer_info.clear();
+
+  // seed the table with our own entry
+  auto& self = peer_info[mon->rank];
+  self.cluster_features = CEPH_FEATURES_ALL;
+  self.mon_release = ceph_release();
+  self.mon_features = ceph::features::mon::get_supported();
+  mon->collect_metadata(&self.metadata);
+
+  reset_timer();
+}
+
+/**
+ * ElectionOwner hook: defer to rank @p who by sending it an OP_ACK for
+ * the current epoch, carrying our features, release and metadata.
+ * The expiry timer is re-armed with one extra second.
+ */
+void Elector::_defer_to(int who)
+{
+  MMonElection *ack = new MMonElection(MMonElection::OP_ACK, get_epoch(),
+				       peer_tracker.get_encoded_bl(),
+				       logic.strategy, mon->monmap);
+  ack->mon_features = ceph::features::mon::get_supported();
+  ack->mon_release = ceph_release();
+  mon->collect_metadata(&ack->metadata);
+
+  mon->send_mon_message(ack, who);
+
+  // give the leader some extra time to declare victory
+  reset_timer(1.0);
+}
+
+
+/**
+ * (Re)arm the election expiry timer.
+ *
+ * Any pending expiry event is cancelled first. The new event fires after
+ * mon_election_timeout + @p plus and hands control to
+ * ElectionLogic::end_election_period().
+ *
+ * If the event fires, an election (started by us or by some other
+ * participant) took too long. The logic then checks whether we were the
+ * ones who won and, if so, declares victory. Otherwise it assumes the one
+ * we deferred to didn't declare victory quickly enough (as far as we know,
+ * it may even be dead) and proposes ourselves as the Leader.
+ *
+ * @param plus Extra time added on top of mon_election_timeout
+ */
+void Elector::reset_timer(double plus)
+{
+  cancel_timer();
+
+  const auto delay = g_conf()->mon_election_timeout + plus;
+  expire_event = mon->timer.add_event_after(
+    delay,
+    new C_MonContext{mon, [this](int) {
+	logic.end_election_period();
+      }});
+}
+
+
+/**
+ * Cancel the pending election expiry event, if there is one, and forget it.
+ */
+void Elector::cancel_timer()
+{
+  if (!expire_event) {
+    return;  // nothing scheduled
+  }
+  mon->timer.cancel_event(expire_event);
+  expire_event = 0;
+}
+
+/**
+ * Decode a peer's encoded ConnectionTracker from @p tbl and feed it to our
+ * own tracker as a peer report.
+ */
+void Elector::assimilate_connection_reports(const bufferlist& tbl)
+{
+  dout(10) << __func__ << dendl;
+  ConnectionTracker peer_report(tbl, mon->cct);
+  peer_tracker.receive_peer_report(peer_report);
+}
+
+/**
+ * ElectionOwner hook: we won, so announce it.
+ *
+ * Intersects the cluster and monitor features of every quorum member,
+ * picks the lowest monitor release among them and gathers their metadata.
+ * Then cancels the expiry timer, sends OP_VICTORY to every quorum member
+ * except ourselves, and finally tells the monitor it won.
+ *
+ * @param quorum Ranks forming the new quorum; each must have an entry in
+ *               peer_info (asserted)
+ */
+void Elector::message_victory(const std::set<int>& quorum)
+{
+  uint64_t cluster_features = CEPH_FEATURES_ALL;
+  mon_feature_t mon_features = ceph::features::mon::get_supported();
+  map<int,Metadata> metadata;
+  ceph_release_t min_mon_release{ceph_release_t::unknown};
+
+  // fold every member's info into the quorum-wide values
+  for (const int id : quorum) {
+    auto i = peer_info.find(id);
+    ceph_assert(i != peer_info.end());
+    auto& member = i->second;
+
+    cluster_features &= member.cluster_features;
+    mon_features &= member.mon_features;
+    metadata[id] = member.metadata;
+
+    const bool first_release = (min_mon_release == ceph_release_t::unknown);
+    if (first_release || member.mon_release < min_mon_release) {
+      min_mon_release = member.mon_release;
+    }
+  }
+
+  cancel_timer();
+
+  // tell everyone!
+  for (const int peer : quorum) {
+    if (peer == mon->rank) {
+      continue;
+    }
+    MMonElection *victory =
+      new MMonElection(MMonElection::OP_VICTORY, get_epoch(),
+		       peer_tracker.get_encoded_bl(),
+		       logic.strategy, mon->monmap);
+    victory->quorum = quorum;
+    victory->quorum_features = cluster_features;
+    victory->mon_features = mon_features;
+    victory->sharing_bl = mon->get_local_commands_bl(mon_features);
+    victory->mon_release = min_mon_release;
+    mon->send_mon_message(victory, peer);
+  }
+
+  // tell monitor
+  mon->win_election(get_epoch(), quorum,
+                    cluster_features, mon_features, min_mon_release,
+		    metadata);
+}
+
+
+/**
+ * Handle an OP_PROPOSE election message.
+ *
+ * A proposal is rejected with a NAK (see nak_old_peer()) and not processed
+ * any further when the sender
+ *  @li lacks connection features we require,
+ *  @li runs a release older than the monmap's min_mon_release, or
+ *  @li lacks monitor features we require.
+ *
+ * Otherwise the proposal is handed to the election logic, together with
+ * the ConnectionTracker decoded from the message's sharing_bl if that
+ * buffer is non-empty.
+ *
+ * @pre the message epoch is odd (an election epoch); asserted
+ */
+void Elector::handle_propose(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_propose");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_propose from " << m->get_source() << dendl;
+  int from = m->get_source().num();
+
+  ceph_assert(m->epoch % 2 == 1); // election
+  uint64_t required_features = mon->get_required_features();
+  mon_feature_t required_mon_features = mon->get_required_mon_features();
+
+  dout(10) << __func__ << " required features " << required_features
+           << " " << required_mon_features
+           << ", peer features " << m->get_connection()->get_features()
+           << " " << m->mon_features
+           << dendl;
+
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring propose from mon" << from
+	    << " without required features" << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (mon->monmap->min_mon_release > m->mon_release) {
+    dout(5) << " ignoring propose from mon" << from
+	    << " release " << (int)m->mon_release
+	    << " < min_mon_release " << (int)mon->monmap->min_mon_release
+	    << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (!m->mon_features.contains_all(required_mon_features)) {
+    // all the features in 'required_mon_features' not in 'm->mon_features'
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring propose from mon." << from
+            << " without required mon_features " << missing
+            << dendl;
+    nak_old_peer(op);
+    // We just NAKed this peer; like the two cases above, its proposal
+    // must not reach the election logic.
+    return;
+  }
+
+  // The logic only borrows the tracker (it copies what it wants to keep),
+  // so let a unique_ptr release it on every exit path.
+  unique_ptr<ConnectionTracker> oct;
+  if (m->sharing_bl.length()) {
+    oct = std::make_unique<ConnectionTracker>(m->sharing_bl, mon->cct);
+  }
+  logic.receive_propose(from, m->epoch, oct.get());
+}
+
+/**
+ * Handle an OP_ACK election message.
+ *
+ * Acks from peers lacking required connection or monitor features are
+ * ignored. If we are currently electing ourselves, the sender's features,
+ * release and metadata are recorded in peer_info and the acks collected so
+ * far are logged. The ack is then passed on to the election logic.
+ *
+ * @pre the message epoch equals our current epoch (asserted)
+ */
+void Elector::handle_ack(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_ack");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_ack from " << m->get_source() << dendl;
+  const int from = m->get_source().num();
+
+  ceph_assert(m->epoch == get_epoch());
+
+  // any required connection feature the peer does not offer?
+  const uint64_t required_features = mon->get_required_features();
+  const uint64_t peer_features = m->get_connection()->get_features();
+  if ((required_features & ~peer_features) != 0) {
+    dout(5) << " ignoring ack from mon" << from
+	    << " without required features" << dendl;
+    return;
+  }
+
+  const mon_feature_t required_mon_features = mon->get_required_mon_features();
+  if (!m->mon_features.contains_all(required_mon_features)) {
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring ack from mon." << from
+            << " without required mon_features " << missing
+            << dendl;
+    return;
+  }
+
+  if (logic.electing_me) {
+    // thanks
+    auto& sender = peer_info[from];
+    sender.cluster_features = m->get_connection()->get_features();
+    sender.mon_features = m->mon_features;
+    sender.mon_release = m->mon_release;
+    sender.metadata = m->metadata;
+
+    dout(5) << " so far i have {";
+    const char *separator = "";
+    for (const int rank : logic.acked_me) {
+      auto p = peer_info.find(rank);
+      ceph_assert(p != peer_info.end());
+      *_dout << separator
+             << " mon." << p->first << ":"
+             << " features " << p->second.cluster_features
+             << " " << p->second.mon_features;
+      separator = ",";
+    }
+    *_dout << " }" << dendl;
+  }
+
+  logic.receive_ack(from, m->epoch);
+}
+
+/**
+ * Handle an OP_VICTORY election message.
+ *
+ * The claim is first given to the election logic. If the logic rejects
+ * it, nothing else happens here. Otherwise the monitor is told it lost
+ * the election, the expiry timer is cancelled, and the command set
+ * carried in the message's sharing_bl is decoded and installed as the
+ * leader's commands.
+ */
+void Elector::handle_victory(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_victory");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_victory from " << m->get_source()
+          << " quorum_features " << m->quorum_features
+          << " " << m->mon_features
+          << dendl;
+  const int from = m->get_source().num();
+
+  if (!logic.receive_victory_claim(from, m->epoch)) {
+    return;
+  }
+
+  mon->lose_election(get_epoch(), m->quorum, from,
+                     m->quorum_features, m->mon_features, m->mon_release);
+
+  // cancel my timer
+  cancel_timer();
+
+  // stash leader's commands
+  ceph_assert(m->sharing_bl.length());
+  vector<MonCommand> leader_cmds;
+  auto cursor = m->sharing_bl.cbegin();
+  MonCommand::decode_vector(leader_cmds, cursor);
+  mon->set_leader_commands(leader_cmds);
+}
+
+/**
+ * Reply to the sender of @p op with an OP_NAK.
+ *
+ * The reply carries the features and minimum release we require, plus our
+ * encoded monitor feature set in its sharing_bl. It is sent back over the
+ * connection the request arrived on.
+ */
+void Elector::nak_old_peer(MonOpRequestRef op)
+{
+  op->mark_event("elector:nak_old_peer");
+  auto m = op->get_req<MMonElection>();
+
+  const uint64_t supported_features = m->get_connection()->get_features();
+  const uint64_t required_features = mon->get_required_features();
+  const mon_feature_t required_mon_features = mon->get_required_mon_features();
+  dout(10) << "sending nak to peer " << m->get_source()
+	   << " supports " << supported_features << " " << m->mon_features
+	   << ", required " << required_features << " " << required_mon_features
+	   << ", release " << (int)m->mon_release
+	   << " vs required " << (int)mon->monmap->min_mon_release
+	   << dendl;
+
+  MMonElection *nak = new MMonElection(MMonElection::OP_NAK, m->epoch,
+                                       peer_tracker.get_encoded_bl(),
+				       logic.strategy, mon->monmap);
+  nak->quorum_features = required_features;
+  nak->mon_features = required_mon_features;
+  nak->mon_release = mon->monmap->min_mon_release;
+  mon->features.encode(nak->sharing_bl);
+  m->get_connection()->send_message(nak);
+}
+
+/**
+ * Handle an OP_NAK election message: a peer told us we are too old.
+ *
+ * Logs why (either our release is below the required minimum, or we lack
+ * required features) and then terminates the process with exit(0).
+ */
+void Elector::handle_nak(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_nak");
+  auto m = op->get_req<MMonElection>();
+  dout(1) << "handle_nak from " << m->get_source()
+	  << " quorum_features " << m->quorum_features
+          << " " << m->mon_features
+	  << " min_mon_release " << (int)m->mon_release
+          << dendl;
+
+  if (m->mon_release > ceph_release()) {
+    derr << "Shutting down because I am release " << (int)ceph_release()
+	 << " < min_mon_release " << (int)m->mon_release << dendl;
+  } else {
+    // the peer's feature set travels in sharing_bl
+    CompatSet required;
+    auto cursor = m->sharing_bl.cbegin();
+    required.decode(cursor);
+    CompatSet diff = Monitor::get_supported_features().unsupported(required);
+
+    // all features in 'm->mon_features' not in what we support
+    const mon_feature_t supported = ceph::features::mon::get_supported();
+    mon_feature_t mon_diff = m->mon_features.diff(supported);
+
+    derr << "Shutting down because I lack required monitor features: { "
+	 << diff << " } " << mon_diff << dendl;
+  }
+  exit(0);
+  // the end!
+}
+
+/**
+ * Start live-pinging @p peer.
+ *
+ * Does nothing if the peer is already being live-pinged or if the quorum
+ * does not have the PINGING monitor feature. Otherwise the peer is
+ * registered with the connection tracker, moved to the live set, a first
+ * ping is sent and, if that ping went out, a ping_check() is scheduled
+ * after ping_timeout / PING_DIVISOR.
+ */
+void Elector::begin_peer_ping(int peer)
+{
+  dout(20) << __func__ << " against " << peer << dendl;
+  if (live_pinging.count(peer) != 0) {
+    dout(20) << peer << " already in live_pinging ... return " << dendl;
+    return;
+  }
+
+  const bool quorum_can_ping = mon->get_quorum_mon_features().contains_all(
+    ceph::features::mon::FEATURE_PINGING);
+  if (!quorum_can_ping) {
+    return;
+  }
+
+  peer_tracker.report_live_connection(peer, 0); // init this peer as existing
+  live_pinging.insert(peer);
+  dead_pinging.erase(peer);
+  peer_acked_ping[peer] = ceph_clock_now();
+
+  const bool sent = send_peer_ping(peer);
+  if (!sent) {
+    return;
+  }
+  mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+			     new C_MonContext{mon, [this, peer](int) {
+				 ping_check(peer);
+			       }});
+}
+
+bool Elector::send_peer_ping(int peer, const utime_t *n)
+{
+  dout(10) << __func__ << " to peer " << peer << dendl;
+
+  const auto ranks_size = mon->monmap->ranks.size();
+  if (peer >= ranks_size) {
+    // The rank is past the end of the monmap, so there is no address
+    // to look up: stop live-pinging it instead of trying to send.
+    dout(5) << "peer: " << peer << " >= ranks_size: "
+      << ranks_size << " ... dropping to prevent "
+      << "https://tracker.ceph.com/issues/50089" << dendl;
+    live_pinging.erase(peer);
+    return false;
+  }
+
+  // stamp with the caller's time when given, otherwise read the clock
+  const utime_t now = (n != NULL) ? *n : ceph_clock_now();
+
+  MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
+  mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
+  // remembered so ping_check() and handle_ping() can compare against it
+  peer_sent_ping[peer] = now;
+  return true;
+}
+
+void Elector::ping_check(int peer)
+{
+  // Heartbeat tick for one peer: switch to dead-pinging once the last
+  // acked ping is older than ping_timeout, otherwise ping and reschedule.
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  if (!live_pinging.count(peer) &&
+      !dead_pinging.count(peer)) {
+    dout(20) << __func__ << " " << peer << " is no longer marked for pinging" << dendl;
+    return;
+  }
+  utime_t now = ceph_clock_now();
+  utime_t& acked_ping = peer_acked_ping[peer];
+  utime_t& newest_ping = peer_sent_ping[peer];
+  if (!acked_ping.is_zero() && acked_ping < now - ping_timeout) {
+    peer_tracker.report_dead_connection(peer, now - acked_ping);
+    acked_ping = now; // dead_ping() measures its next interval from here
+    begin_dead_ping(peer);
+    return;
+  }
+  // send a new ping only when the newest one we sent has been acked
+  if (acked_ping == newest_ping) {
+    if (!send_peer_ping(peer, &now)) return;
+  }
+  mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+			     new C_MonContext{mon, [this, peer](int) {
+				 ping_check(peer);
+			       }});
+}
+
+void Elector::begin_dead_ping(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  const bool newly_dead = dead_pinging.count(peer) == 0;
+  if (!newly_dead) {
+    return; // already in dead_pinging; nothing to do
+  }
+
+  // move the peer from the live set to the dead set, then start ticking
+  live_pinging.erase(peer);
+  dead_pinging.insert(peer);
+  auto *tick = new C_MonContext{mon, [this, peer](int) { dead_ping(peer); }};
+  mon->timer.add_event_after(ping_timeout, tick);
+}
+
+void Elector::dead_ping(int peer)
+{
+  // Recurring tick for an unresponsive peer: report the time elapsed since
+  // the previous report as dead time, then reschedule after ping_timeout.
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  if (!dead_pinging.count(peer)) {
+    dout(20) << __func__ << " " << peer << " is no longer marked for dead pinging" << dendl;
+    return;
+  }
+  ceph_assert(!live_pinging.count(peer));
+  utime_t now = ceph_clock_now();
+  utime_t& acked_ping = peer_acked_ping[peer];
+  peer_tracker.report_dead_connection(peer, now - acked_ping);
+  acked_ping = now; // so the next tick only reports the new interval
+  mon->timer.add_event_after(ping_timeout,
+			     new C_MonContext{mon, [this, peer](int) {
+				 dead_ping(peer);
+			       }});
+}
+
+void Elector::handle_ping(MonOpRequestRef op)
+{
+  MMonPing *ping = static_cast<MMonPing*>(op->get_req());
+  const int sender = mon->monmap->get_rank(ping->get_source_addr());
+  dout(20) << __func__ << " from: " << sender << dendl;
+  // NOTE(review): sender is used as looked up, with no validation here --
+  // confirm what get_rank() yields for an address missing from the monmap.
+  begin_peer_ping(sender);
+  assimilate_connection_reports(ping->tracker_bl);
+  if (ping->op == MMonPing::PING) {
+    // echo the sender's stamp back together with our own tracker state
+    MMonPing *reply = new MMonPing(MMonPing::PING_REPLY, ping->stamp, peer_tracker.get_encoded_bl());
+    ping->get_connection()->send_message(reply);
+    return;
+  }
+  if (ping->op != MMonPing::PING_REPLY) {
+    return; // any other op is silently ignored
+  }
+  const utime_t& last_acked = peer_acked_ping[sender];
+  const utime_t& last_sent = peer_sent_ping[sender];
+
+  // a reply stamped later than anything we sent is dropped
+  if (ping->stamp > last_sent && !last_sent.is_zero()) {
+    derr << "dropping PING_REPLY stamp " << ping->stamp
+	   << " as it is newer than newest sent " << last_sent << dendl;
+    return;
+  }
+  if (ping->stamp > last_acked) {
+    dout(20) << "m->stamp > previous_acked" << dendl;
+    peer_tracker.report_live_connection(sender, ping->stamp - last_acked);
+    peer_acked_ping[sender] = ping->stamp;
+  } else {
+    dout(20) << "m->stamp <= previous_acked .. we don't report_live_connection" << dendl;
+  }
+  // if the reply took longer than one ping interval, ping again right away
+  utime_t now = ceph_clock_now();
+  dout(30) << "now: " << now << " m->stamp: " << ping->stamp << " ping_timeout: "
+    << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl;
+  if (now - ping->stamp > ping_timeout / PING_DIVISOR) {
+    send_peer_ping(sender, &now);
+  }
+}
+
+void Elector::dispatch(MonOpRequestRef op)
+{
+  op->mark_event("elector:dispatch");
+  ceph_assert(op->is_type_election_or_ping());
+  // Election messages are vetted here before reaching a handler; pings go to handle_ping().
+  switch (op->get_req()->get_type()) {
+    
+  case MSG_MON_ELECTION:
+    {
+      if (!logic.participating) {
+        return; // not participating: election traffic is dropped
+      }
+      if (op->get_req()->get_source().num() >= mon->monmap->size()) {
+	dout(5) << " ignoring bogus election message with bad mon rank " 
+		<< op->get_req()->get_source() << dendl;
+	return;
+      }
+      // drop messages from a different cluster (fsid) or from an address we do not know
+      auto em = op->get_req<MMonElection>();
+      dout(20) << __func__ << " from: " << mon->monmap->get_rank(em->get_source_addr()) << dendl;
+      // assume an old message encoding would have matched
+      if (em->fsid != mon->monmap->fsid) {
+	dout(0) << " ignoring election msg fsid " 
+		<< em->fsid << " != " << mon->monmap->fsid << dendl;
+	return;
+      }
+
+      if (!mon->monmap->contains(em->get_source_addr())) {
+	dout(1) << "discarding election message: " << em->get_source_addr()
+		<< " not in my monmap " << *mon->monmap << dendl;
+	return;
+      }
+      // compare the sender's monmap with ours; a newer one is adopted and we re-bootstrap
+      MonMap peermap;
+      peermap.decode(em->monmap_bl);
+      if (peermap.epoch > mon->monmap->epoch) {
+	dout(0) << em->get_source_inst() << " has newer monmap epoch " << peermap.epoch
+		<< " > my epoch " << mon->monmap->epoch 
+		<< ", taking it"
+		<< dendl;
+	mon->monmap->decode(em->monmap_bl);
+        auto t(std::make_shared<MonitorDBStore::Transaction>());
+        t->put("monmap", mon->monmap->epoch, em->monmap_bl);
+        t->put("monmap", "last_committed", mon->monmap->epoch);
+        mon->store->apply_transaction(t);
+	//mon->monmon()->paxos->stash_latest(mon->monmap->epoch, em->monmap_bl);
+	cancel_timer();
+	mon->notify_new_monmap(false);
+	mon->bootstrap();
+	return;
+      }
+      if (peermap.epoch < mon->monmap->epoch) {
+	dout(0) << em->get_source_inst() << " has older monmap epoch " << peermap.epoch
+		<< " < my epoch " << mon->monmap->epoch 
+		<< dendl; // only logged; the message is still processed below
+      }
+
+      if (em->strategy != logic.strategy) {
+	dout(5) << __func__ << " somehow got an Election message with different strategy "
+		<< em->strategy << " from local " << logic.strategy
+		<< "; dropping for now to let race resolve" << dendl;
+	return;
+      }
+
+      if (em->scoring_bl.length()) {
+	assimilate_connection_reports(em->scoring_bl);
+      }
+      // any election message that got this far also (re)starts pinging of its sender
+      begin_peer_ping(mon->monmap->get_rank(em->get_source_addr()));
+      switch (em->op) {
+      case MMonElection::OP_PROPOSE:
+	handle_propose(op);
+	return;
+      }
+      // OP_PROPOSE was handled above regardless of epoch; other ops must not be stale
+      if (em->epoch < get_epoch()) {
+	dout(5) << "old epoch, dropping" << dendl;
+	break;
+      }
+
+      switch (em->op) {
+      case MMonElection::OP_ACK:
+	handle_ack(op);
+	return;
+      case MMonElection::OP_VICTORY:
+	handle_victory(op);
+	return;
+      case MMonElection::OP_NAK:
+	handle_nak(op);
+	return;
+      default:
+	ceph_abort(); // unknown election op
+      }
+    }
+    break;
+
+  case MSG_MON_PING:
+    handle_ping(op);
+    break;
+    
+  default: 
+    ceph_abort();
+  }
+}
+
+void Elector::start_participating()
+{
+  logic.participating = true; // dispatch() drops election messages while this is false
+}
+
+bool Elector::peer_tracker_is_clean()
+{
+  return peer_tracker.is_clean(mon->rank, paxos_size()); // check is delegated to the ConnectionTracker
+}
+
+void Elector::notify_clear_peer_state()
+{
+  dout(10) << __func__ << dendl;
+  dout(20) << " peer_tracker before: " << peer_tracker << dendl;
+  peer_tracker.notify_reset();
+  peer_tracker.set_rank(mon->rank); // set our rank again right after the reset
+  dout(20) << " peer_tracker after: " << peer_tracker << dendl;
+}
+
+void Elector::notify_rank_changed(int new_rank)
+{
+  dout(10) << __func__ << " to " << new_rank << dendl; 
+  peer_tracker.notify_rank_changed(new_rank);
+  live_pinging.erase(new_rank); // drop new_rank from both ping sets
+  dead_pinging.erase(new_rank);
+}
+
+void Elector::notify_rank_removed(int rank_removed, int new_rank)
+{
+  dout(10) << __func__ << ": " << rank_removed << dendl; 
+  peer_tracker.notify_rank_removed(rank_removed, new_rank);
+  /* we have to clean up the pinging state, which is annoying
+     because it's not indexed anywhere (and adding indexing
+     would also be annoying).
+     In the case where we are removing any rank that is not the
+     highest, we start with the removed rank and examine the state
+     of the surrounding ranks.
+     Everybody who remains with larger rank gets a new rank one lower
+     than before, and we have to figure out the remaining scheduled
+     ping contexts. So, starting one past the removed rank, we:
+     * check if the current rank is alive or dead
+     * examine our new rank (one less than before, initially the removed
+     rank)
+     * * erase it if it's in the wrong set
+     * * start pinging it if we're not already
+     * check if the next rank is in the same pinging set, and delete
+     * ourselves if not.
+     In the case where we are removing the highest rank,
+     we erase the removed rank from all sets.
+   */
+  if (rank_removed < paxos_size()) { // NOTE(review): int compared with the unsigned paxos_size()
+    for (unsigned i = rank_removed + 1; i <= paxos_size() ; ++i) {
+      if (live_pinging.count(i)) {
+        dead_pinging.erase(i-1);
+        if (!live_pinging.count(i-1)) {
+	  begin_peer_ping(i-1);
+        }
+        if (!live_pinging.count(i+1)) {
+	  live_pinging.erase(i);
+        }
+      }
+      else if (dead_pinging.count(i)) {
+        live_pinging.erase(i-1);
+        if (!dead_pinging.count(i-1)) {
+	  begin_dead_ping(i-1);
+        }
+        if (!dead_pinging.count(i+1)) {
+	  dead_pinging.erase(i);
+        }
+      } else {
+        // we aren't pinging rank i at all
+        if (i-1 == (unsigned)rank_removed) {
+	  // so we special case to make sure we
+	  // actually nuke the removed rank
+	  dead_pinging.erase(rank_removed);
+	  live_pinging.erase(rank_removed);
+        }
+      }
+     }
+   } else {
+     if (live_pinging.count(rank_removed)) { // erase() of an absent key is a no-op, so this guard is optional
+       live_pinging.erase(rank_removed);
+     }
+     if (dead_pinging.count(rank_removed)) {
+       dead_pinging.erase(rank_removed);
+     }
+   }
+}
+
+void Elector::notify_strategy_maybe_changed(int strategy)
+{
+  logic.set_election_strategy(static_cast<ElectionLogic::election_strategy>(strategy)); // int is cast to the enum without a range check here
+}
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
new file mode 100644
index 000000000..2a53c1fc4
--- /dev/null
+++ b/src/mon/Elector.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_MON_ELECTOR_H
+#define CEPH_MON_ELECTOR_H
+
+#include <map>
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "mon/MonOpRequest.h"
+#include "mon/mon_types.h"
+#include "mon/ElectionLogic.h"
+#include "mon/ConnectionTracker.h"
+
+class Monitor;
+
+
+/**
+ * This class is responsible for handling messages and maintaining
+ * an ElectionLogic which holds the local state when electing
+ * a new Leader. We may win or we may lose. If we win, it means we became the
+ * Leader; if we lose, it means we are a Peon.
+ */
+class Elector : public ElectionOwner, RankProvider {
+  /**
+   * @defgroup Elector_h_class Elector
+   * @{
+   */
+  ElectionLogic logic;
+  // connectivity validation and scoring
+  ConnectionTracker peer_tracker;
+  map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked
+  map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent
+  set<int> live_pinging; // ranks which we are currently pinging
+  set<int> dead_pinging; // ranks which didn't answer (degrading scores)
+  double ping_timeout; // the timeout after which we consider a ping to be dead
+  int PING_DIVISOR = 2;  // ping ticks are scheduled every ping_timeout / PING_DIVISOR
+
+  /**
+   * @defgroup Elector_h_internal_types Internal Types
+   * @{
+   */
+  /**
+   * This struct will hold the features from a given peer.
+   * Features may both be the cluster's (in the form of a uint64_t), or
+   * mon-specific features. Instead of keeping maps to hold them both, or
+   * a pair, which would be weird, a struct to keep them seems appropriate.
+   */
+  struct elector_info_t {
+    uint64_t cluster_features = 0;
+    mon_feature_t mon_features;
+    ceph_release_t mon_release{0};
+    std::map<std::string,std::string> metadata;
+  };
+
+  /**
+   * @}
+   */
+
+  /**
+   * The Monitor instance associated with this class.
+   */
+  Monitor *mon;
+
+  /**
+   * Event callback responsible for dealing with an expired election once a
+   * timer runs out and fires up.
+   */
+  Context *expire_event = nullptr;
+
+  /**
+   * Resets the expire_event timer, by cancelling any existing one and
+   * scheduling a new one.
+   *
+   * @remarks This function assumes as a default firing value the duration of
+   *	      the monitor's lease interval, and adds to it the value specified
+   *	      in @e plus
+   *
+   * @post expire_event is set
+   *
+   * @param plus The amount of time to be added to the default firing value.
+   */
+  void reset_timer(double plus=0.0);
+  /**
+   * Cancel the expire_event timer, if it is defined.
+   *
+   * @post expire_event is not set
+   */
+  void cancel_timer();
+
+  // electing me
+  /**
+   * @defgroup Elector_h_electing_me_vars We are being elected
+   * @{
+   */
+  /**
+   * Map containing info of all those that acked our proposal to become the Leader.
+   * Note each peer's info.
+   */
+  std::map<int, elector_info_t> peer_info;
+  /**
+   * @}
+   */
+ 
+  /**
+   * Handle a message from some other node proposing itself to become
+   * the Leader.
+   *
+   * We validate that the sending Monitor is allowed to participate based on
+   * its supported features, then pass the request to our ElectionLogic.
+   *
+   * @invariant The received message is an operation of type OP_PROPOSE
+   *
+   * @pre   Message epoch is from the current or a newer epoch
+   *
+   * @param op The op carrying a message sent by another participant in the quorum.
+   */
+  void handle_propose(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant Acking us as the Leader.
+   *
+   * We validate that the sending Monitor is allowed to participate based on
+   * its supported features, add it to peer_info, and pass the ack to our
+   * ElectionLogic.
+   *
+   * @pre   Message epoch is from the current or a newer epoch
+   *
+   * @param op The op carrying a message with an operation type of OP_ACK
+   */
+  void handle_ack(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant declaring Victory.
+   *
+   * We just got a message from someone declaring themselves Victorious, thus
+   * the new Leader.
+   *
+   * We pass the Victory to our ElectionLogic, and if it confirms the
+   * victory we lose the election and start following this Leader. Otherwise,
+   * drop the message.
+   *
+   * @pre   Message epoch is from the current or a newer epoch
+   * @post  Election is not on-going
+   * @post  Updated @p epoch
+   * @post  We have a new quorum if we lost the election
+   *
+   * @param op The op carrying a message with an operation type of OP_VICTORY
+   */
+  void handle_victory(MonOpRequestRef op);
+  /**
+   * Send a nak to a peer who's out of date, containing information about why.
+   *
+   * If we get a message from a peer who can't support the required quorum
+   * features, we have to ignore them. This function will at least send
+   * them a message about *why* they're being ignored -- if they're new
+   * enough to support such a message.
+   *
+   * @param op The op carrying a message from a monitor not supporting required
+   * features. We take ownership of the reference.
+   */
+  void nak_old_peer(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant declaring
+   * we cannot join the quorum.
+   *
+   * Apparently the quorum requires some feature that we do not implement. Log
+   * what is missing and exit the process.
+   *
+   * @pre Election is on-going.
+   * @post We've shut down.
+   *
+   * @param op The op carrying a message with an operation type of OP_NAK
+   */
+  void handle_nak(MonOpRequestRef op);
+  /**
+   * Send a ping to the specified peer; returns false if the peer's rank is not in the monmap.
+   * @param n optional time that we will use instead of calling ceph_clock_now()
+   */
+  bool send_peer_ping(int peer, const utime_t *n=NULL);
+  /**
+   * Check the state of pinging the specified peer. This is our
+   * "tick" for heartbeating; scheduled by itself and begin_peer_ping().
+   */
+  void ping_check(int peer);
+  /**
+   * Move the peer out of live_pinging into dead_pinging set
+   * and schedule dead_ping()ing on it.
+   */
+  void begin_dead_ping(int peer);
+  /**
+   * Checks that the peer is still marked for dead pinging,
+   * and then marks it as dead for the appropriate interval.
+   */
+  void dead_ping(int peer);
+  /**
+   * Handle a ping from another monitor and assimilate the data it contains.
+   */
+  void handle_ping(MonOpRequestRef op);
+  /**
+   * Update our view of everybody else's connectivity based on the provided
+   * tracker bufferlist
+   */
+  void assimilate_connection_reports(const bufferlist& bl);
+  
+ public:
+  /**
+   * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
+   * @{
+   */
+  /* Commit the given epoch to our MonStore.
+   * We also take the opportunity to persist our peer_tracker.
+   */
+  void persist_epoch(epoch_t e);
+  /* Read the epoch out of our MonStore */
+  epoch_t read_persisted_epoch() const;
+  /* Write a nonsense key "election_writeable_test" to our MonStore */
+  void validate_store();
+  /* Reset my tracking. Currently, just call Monitor::join_election() */
+  void notify_bump_epoch();
+  /* Call a new election: Invoke Monitor::start_election() */
+  void trigger_new_election();
+  /* Retrieve rank from the Monitor */
+  int get_my_rank() const;
+  /* Send MMonElection OP_PROPOSE to every monitor in the map. */
+  void propose_to_peers(epoch_t e, bufferlist &bl);
+  /* bootstrap() the Monitor */
+  void reset_election();
+  /* Retrieve the Monitor::has_ever_joined member */
+  bool ever_participated() const;
+  /* Retrieve monmap->size() */
+  unsigned paxos_size() const;
+  /* Right now we don't disallow anybody */
+  set<int> disallowed_leaders;
+  const set<int>& get_disallowed_leaders() const { return disallowed_leaders; }
+  /**
+   * Reset the expire_event timer so we can limit the amount of time we
+   * will be electing. Clean up our peer_info.
+   *
+   * @post  we reset the expire_event timer
+   */
+  void _start();
+  /**
+   * Send an MMonElection message deferring to the identified monitor. We
+   * also increase the election timeout so the monitor we defer to
+   * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
+   *
+   * @post  we sent an ack message to @p who
+   * @post  we reset the expire_event timer
+   *
+   * @param who Some other monitor's numeric identifier.
+   */
+  void _defer_to(int who);
+  /**
+   * Our ElectionLogic told us we won an election! Identify the quorum
+   * features, tell our new peons we've won, and invoke Monitor::win_election().
+   */
+  void message_victory(const std::set<int>& quorum);
+  /* Check if rank is in mon->quorum */
+  bool is_current_member(int rank) const;
+  /**
+   * @}
+   */
+  /**
+   * Persist our peer_tracker to disk.
+   */
+  void persist_connectivity_scores();
+
+  Elector *elector; // NOTE(review): not used anywhere in this header -- purpose to be confirmed
+  
+  /**
+   * Create an Elector class
+   *
+   * @param m A Monitor instance
+   * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
+   */
+  explicit Elector(Monitor *m, int strategy);
+  virtual ~Elector() {}
+
+  /**
+   * Inform this class it is supposed to shutdown.
+   *
+   * We will simply cancel the @p expire_event if any exists.
+   *
+   * @post @p expire_event is cancelled
+   */
+  void shutdown();
+
+  /**
+   * Obtain our epoch from ElectionLogic.
+   *
+   * @returns Our current epoch number
+   */
+  epoch_t get_epoch() { return logic.get_epoch(); }
+
+  /**
+   * If the Monitor knows there are no Paxos peers (so
+   * we are rank 0 and there are no others) we can declare victory.
+   */
+  void declare_standalone_victory() {
+    logic.declare_standalone_victory();
+  }
+  /**
+   * Tell the Elector to start pinging a given peer.
+   * Do this when you discover a peer and it has a rank assigned.
+   * We do it ourselves on receipt of pings and when receiving other messages.
+   */
+  void begin_peer_ping(int peer);
+  /**
+   * Handle received messages.
+   *
+   * Messages of type @p MSG_MON_ELECTION are validated and dispatched to
+   * their operation-specific functions; messages of type @p MSG_MON_PING go
+   * to handle_ping(). The op is asserted to be an election or ping op, and
+   * any other message type aborts.
+   *
+   * @param op The op carrying a received message
+   */
+  void dispatch(MonOpRequestRef op);
+
+  /**
+   * Call an election.
+   *
+   * This function simply calls ElectionLogic::start.
+   */
+  void call_election() {
+    logic.start();
+  }
+
+  /**
+   * Stop participating in subsequent Elections.
+   *
+   * @post @p participating is false
+   */
+  void stop_participating() { logic.participating = false; }
+  /**
+   * Start participating in Elections.
+   *
+   * If we are already participating (i.e., @p participating is true), then
+   * calling this function is moot.
+   *
+   * However, if we are not participating (i.e., @p participating is false),
+   * then we will start participating by setting @p participating to true.
+   * This function does not itself call for an Election.
+   *
+   * @post  @p participating is true
+   */
+  void start_participating();
+  /**
+   * Check if our peer_tracker is self-consistent, not suffering from
+   * https://tracker.ceph.com/issues/58049
+   */
+  bool peer_tracker_is_clean();
+  /**
+   * Forget everything about our peers. :(
+   */
+  void notify_clear_peer_state();
+  /**
+   * Notify that our local rank has changed
+   * and we may need to update internal data structures.
+   */
+  void notify_rank_changed(int new_rank);
+  /**
+   * A peer has been removed so we should clean up state related to it.
+   * This is safe to call even if we haven't joined or are currently
+   * in a quorum.
+   */
+  void notify_rank_removed(int rank_removed, int new_rank);
+  void notify_strategy_maybe_changed(int strategy);
+  /**
+   * Set the disallowed leaders.
+   *
+   * If you call this and the new disallowed set
+   * contains your current leader, you are
+   * responsible for calling an election!
+   *
+   * @returns false if the set is unchanged,
+   *   true if the set changed
+   */
+  bool set_disallowed_leaders(const set<int>& dl) {
+    if (dl == disallowed_leaders) return false;
+    disallowed_leaders = dl;
+    return true;
+  }
+  void dump_connection_scores(Formatter *f) {
+    f->open_object_section("connection scores");
+    peer_tracker.dump(f);
+    f->close_section();
+  }
+  /**
+   * @}
+   */
+};
+
+#endif
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
new file mode 100644
index 000000000..0b1bb2a03
--- /dev/null
+++ b/src/mon/FSCommands.cc
@@ -0,0 +1,1516 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include "OSDMonitor.h"
+
+#include "FSCommands.h"
+#include "MDSMonitor.h"
+#include "MgrStatMonitor.h"
+#include "mds/cephfs_features.h"
+
+using TOPNSPC::common::cmd_getval;
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::pair;
+using std::set;
+using std::string;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Command handler for "fs flag set"; "enable_multiple" is the only flag it knows.
+class FlagSetHandler : public FileSystemCommandHandler
+{
+  public:
+  FlagSetHandler()
+    : FileSystemCommandHandler("fs flag set") {}
+
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    // fetch all three arguments up front
+    string name;
+    cmd_getval(cmdmap, "flag_name", name);
+    string raw_value;
+    cmd_getval(cmdmap, "val", raw_value);
+    bool confirmed = false;  // read, but nothing below consults it
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+
+    // reject any other flag before looking at the value
+    if (name != "enable_multiple") {
+      ss << "Unknown flag '" << name << "'";
+      return -EINVAL;
+    }
+
+    // parse_bool() is handed ss too; our own message is appended on failure
+    bool enable = false;
+    const int rc = parse_bool(raw_value, &enable, ss);
+    if (rc != 0) {
+      ss << "Invalid boolean value '" << raw_value << "'";
+      return rc;
+    }
+
+    fsmap.set_enable_multiple(enable);
+    return 0;
+  }
+};
+
+// Command handler for "fs fail": marks a file system not joinable and fails its MDS daemons.
+class FailHandler : public FileSystemCommandHandler
+{
+  public:
+  FailHandler()
+    : FileSystemCommandHandler("fs fail") {}
+
+  // Returns 0 on success, -EAGAIN when the OSD monitor is not writeable yet,
+  // -EINVAL without a name, -ENOENT when no such file system exists.
+  int handle(
+      Monitor* mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream& ss) override
+  {
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+
+    std::string fs_name;
+    if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+      ss << "Filesystem not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+    auto f = [](auto fs) {
+      fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+    };
+    fsmap.modify_filesystem(fs->fscid, std::move(f));
+    // collect the gids first, then fail them one by one
+    std::vector<mds_gid_t> to_fail;
+    for (const auto& p : fs->mds_map.get_mds_info()) {
+      to_fail.push_back(p.first);
+    }
+    for (const auto& gid : to_fail) {
+      mon->mdsmon()->fail_mds_gid(fsmap, gid);
+    }
+    if (!to_fail.empty()) {
+      mon->osdmon()->propose_pending();
+    }
+    ss << fs_name;
+    ss << " marked not joinable; MDS cannot join the cluster. All MDS ranks marked failed.";
+    return 0;
+  }
+};
+
+class FsNewHandler : public FileSystemCommandHandler
+{
+  public:
+  explicit FsNewHandler(Paxos *paxos)
+    : FileSystemCommandHandler("fs new"), m_paxos(paxos)
+  {
+  }
+
+  bool batched_propose() override {
+    return true;
+  }
+  // Validates the name and pools, prepares the pools via the OSD monitor, then creates the file system.
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    ceph_assert(m_paxos->is_plugged());
+
+    string metadata_name;
+    cmd_getval(cmdmap, "metadata", metadata_name);
+    int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
+    if (metadata < 0) {
+      ss << "pool '" << metadata_name << "' does not exist";
+      return -ENOENT;
+    }
+
+    string data_name;
+    cmd_getval(cmdmap, "data", data_name);
+    int64_t data = mon->osdmon()->osdmap.lookup_pg_pool_name(data_name);
+    if (data < 0) {
+      ss << "pool '" << data_name << "' does not exist";
+      return -ENOENT;
+    }
+    if (data == 0) {
+      ss << "pool '" << data_name << "' has id 0, which CephFS does not allow. Use another pool or recreate it to get a non-zero pool id.";
+      return -EINVAL;
+    }
+
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    if (fs_name.empty()) {
+        // Ensure fs name is not empty so that we can implement
+        // commands that refer to FS by name in future.
+        ss << "Filesystem name may not be empty";
+        return -EINVAL;
+    }
+
+    if (fsmap.get_filesystem(fs_name)) {
+      auto fs = fsmap.get_filesystem(fs_name);
+      if (*(fs->mds_map.get_data_pools().begin()) == data
+          && fs->mds_map.get_metadata_pool() == metadata) {
+        // Identical FS created already, this is a no-op
+        ss << "filesystem '" << fs_name << "' already exists";
+        return 0;
+      } else {
+        ss << "filesystem already exists with name '" << fs_name << "'";
+        return -EINVAL;
+      }
+    }
+    // force bypasses the non-empty metadata pool check and is required for an explicit fscid
+    bool force = false;
+    cmd_getval(cmdmap, "force", force);
+
+    const pool_stat_t *stat = mon->mgrstatmon()->get_pool_stat(metadata);
+    if (stat) {
+      int64_t metadata_num_objects = stat->stats.sum.num_objects;
+      if (!force && metadata_num_objects > 0) {
+	ss << "pool '" << metadata_name
+	   << "' already contains some objects. Use an empty pool instead.";
+	return -EINVAL;
+      }
+    }
+
+    if (fsmap.filesystem_count() > 0
+        && !fsmap.get_enable_multiple()) {
+      ss << "Creation of multiple filesystems is disabled.  To enable "
+            "this experimental feature, use 'ceph fs flag set enable_multiple "
+            "true'";
+      return -EINVAL;
+    }
+
+    for (auto& fs : fsmap.get_filesystems()) {
+      const std::vector<int64_t> &data_pools = fs->mds_map.get_data_pools();
+      // NOTE(review): this lookup does not depend on fs and could be hoisted out of the loop
+      bool sure = false;
+      cmd_getval(cmdmap,
+                 "allow_dangerous_metadata_overlay", sure);
+      // NOTE(review): the message below prints the requested fs_name, not the fs that owns the pool
+      if ((std::find(data_pools.begin(), data_pools.end(), data) != data_pools.end()
+	   || fs->mds_map.get_metadata_pool() == metadata)
+	  && !sure) {
+	ss << "Filesystem '" << fs_name
+	   << "' is already using one of the specified RADOS pools. This should ONLY be done in emergencies and after careful reading of the documentation. Pass --allow-dangerous-metadata-overlay to permit this.";
+	return -EEXIST;
+      }
+    }
+
+    int64_t fscid = FS_CLUSTER_ID_NONE;
+    if (cmd_getval(cmdmap, "fscid", fscid)) {
+      if (!force) {
+        ss << "Pass --force to create a file system with a specific ID";
+        return -EINVAL;
+      }
+      if (fsmap.filesystem_exists(fscid)) {
+        ss << "filesystem already exists with id '" << fscid << "'";
+        return -EINVAL;
+      }
+    }
+
+    pg_pool_t const *data_pool = mon->osdmon()->osdmap.get_pg_pool(data);
+    ceph_assert(data_pool != NULL);  // Checked it existed above
+    pg_pool_t const *metadata_pool = mon->osdmon()->osdmap.get_pg_pool(metadata);
+    ceph_assert(metadata_pool != NULL);  // Checked it existed above
+
+    int r = _check_pool(mon->osdmon()->osdmap, data, POOL_DATA_DEFAULT, force, &ss);
+    if (r < 0) {
+      return r;
+    }
+
+    r = _check_pool(mon->osdmon()->osdmap, metadata, POOL_METADATA, force, &ss);
+    if (r < 0) {
+      return r;
+    }
+    // the pool changes below go through the OSD monitor, so it must be writeable first
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+    mon->osdmon()->do_application_enable(data,
+					 pg_pool_t::APPLICATION_NAME_CEPHFS,
+					 "data", fs_name, true);
+    mon->osdmon()->do_application_enable(metadata,
+					 pg_pool_t::APPLICATION_NAME_CEPHFS,
+					 "metadata", fs_name, true);
+    mon->osdmon()->do_set_pool_opt(metadata,
+				   pool_opts_t::RECOVERY_PRIORITY,
+				   static_cast<int64_t>(5));
+    mon->osdmon()->do_set_pool_opt(metadata,
+				   pool_opts_t::PG_NUM_MIN,
+				   static_cast<int64_t>(16));
+    mon->osdmon()->do_set_pool_opt(metadata,
+				   pool_opts_t::PG_AUTOSCALE_BIAS,
+				   static_cast<double>(4.0));
+    mon->osdmon()->propose_pending();
+
+    bool recover = false;
+    cmd_getval(cmdmap, "recover", recover);
+
+    // All checks passed, go ahead and create.
+    auto&& fs = fsmap.create_filesystem(fs_name, metadata, data,
+        mon->get_quorum_con_features(), fscid, recover);
+
+    ss << "new fs with metadata pool " << metadata << " and data pool " << data;
+
+    if (recover) {
+      return 0; // with recover set, no standby is promoted to rank 0
+    }
+
+    // assign a standby to rank 0 to avoid health warnings
+    auto info = fsmap.find_replacement_for({fs->fscid, 0});
+
+    if (info) {
+      mon->clog->info() << info->human_name() << " assigned to filesystem "
+          << fs_name << " as rank 0";
+      fsmap.promote(info->global_id, *fs, 0);
+    }
+
+    return 0;
+  }
+
+private:
+  Paxos *m_paxos;
+};
+
+/**
+ * Handler for the "fs set" command: changes one named setting (`var`) of an
+ * existing file system to the supplied value (`val`).
+ */
+class SetHandler : public FileSystemCommandHandler
+{
+public:
+  SetHandler()
+    : FileSystemCommandHandler("fs set")
+  {}
+
+  /**
+   * Validate the request and apply it to the file system named by the
+   * "fs_name" argument through FSMap::modify_filesystem().
+   *
+   * Human-readable status or error text is written to @p ss.
+   *
+   * @return 0 on success (including the deprecated variables that are
+   *         accepted but change nothing), -ENOENT if the file system is
+   *         not found, -EAGAIN if the OSD monitor is not writeable yet
+   *         (the op is queued for retry), otherwise the negative errno of
+   *         the failed validation (-EINVAL, -ERANGE, -EPERM, or whatever
+   *         parse_bool() returned).
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    std::string fs_name;
+    if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      // Every branch below dereferences fs, so refuse unknown names here
+      // (same reply as the other handlers in this file).
+      ss << "Not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+
+    string var;
+    if (!cmd_getval(cmdmap, "var", var) || var.empty()) {
+      ss << "Invalid variable";
+      return -EINVAL;
+    }
+    string val;
+    string interr;
+    int64_t n = 0;
+    if (!cmd_getval(cmdmap, "val", val)) {
+      return -EINVAL;
+    }
+    // We got a string; see if it contains an int.  The parse is done once
+    // up front and `interr` is only consulted by the integer-valued
+    // variables below.
+    n = strict_strtoll(val.c_str(), 10, &interr);
+    if (var == "max_mds") {
+      // NOTE: see also "mds set_max_mds", which can modify the same field.
+      if (interr.length()) {
+        ss << interr;
+        return -EINVAL;
+      }
+
+      if (n <= 0) {
+        ss << "You must specify at least one MDS";
+        return -EINVAL;
+      }
+
+      // Growing beyond a single active MDS is refused while the map says
+      // snapshots were once allowed but multi-MDS snapshots are not.
+      if (n > 1 && n > fs->mds_map.get_max_mds()) {
+        if (fs->mds_map.was_snaps_ever_allowed() &&
+            !fs->mds_map.allows_multimds_snaps()) {
+          ss << "multi-active MDS is not allowed while there are snapshots possibly created by pre-mimic MDS";
+          return -EINVAL;
+        }
+      }
+      if (n > MAX_MDS) {
+        ss << "may not have more than " << MAX_MDS << " MDS ranks";
+        return -EINVAL;
+      }
+
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        // Setting max_mds also clears the not-joinable flag.
+        fs->mds_map.clear_flag(CEPH_MDSMAP_NOT_JOINABLE);
+        fs->mds_map.set_max_mds(n);
+      });
+    } else if (var == "inline_data") {
+      bool enable_inline = false;
+      int r = parse_bool(val, &enable_inline, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      if (enable_inline) {
+        // Enabling requires an explicit confirmation flag.
+        bool confirm = false;
+        cmd_getval(cmdmap, "yes_i_really_really_mean_it", confirm);
+        if (!confirm) {
+          ss << "Inline data support is deprecated and will be removed in a future release. "
+             << "Add --yes-i-really-really-mean-it if you are certain you want this enabled.";
+          return -EPERM;
+        }
+        ss << "inline data enabled";
+
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_inline_data_enabled(true);
+        });
+      } else {
+        ss << "inline data disabled";
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_inline_data_enabled(false);
+        });
+      }
+    } else if (var == "balancer") {
+      if (val.empty()) {
+        ss << "unsetting the metadata load balancer";
+      } else {
+        ss << "setting the metadata load balancer to " << val;
+      }
+      fsmap.modify_filesystem(
+        fs->fscid,
+        [val](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_balancer(val);
+        });
+      // Falls through to the common `return 0` like every other successful
+      // branch; this used to `return true`, i.e. a result code of 1.
+    } else if (var == "max_file_size") {
+      if (interr.length()) {
+        ss << var << " requires an integer value";
+        return -EINVAL;
+      }
+      if (n < CEPH_MIN_STRIPE_UNIT) {
+        ss << var << " must be at least " << CEPH_MIN_STRIPE_UNIT;
+        return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_max_filesize(n);
+      });
+    } else if (var == "allow_new_snaps") {
+      bool enable_snaps = false;
+      int r = parse_bool(val, &enable_snaps, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      if (!enable_snaps) {
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.clear_snaps_allowed();
+        });
+        ss << "disabled new snapshots";
+      } else {
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_snaps_allowed();
+        });
+        ss << "enabled new snapshots";
+      }
+    } else if (var == "allow_multimds") {
+        // Accepted for compatibility; nothing is modified.
+        ss << "Multiple MDS is always enabled. Use the max_mds"
+           << " parameter to control the number of active MDSs"
+           << " allowed. This command is DEPRECATED and will be"
+           << " REMOVED from future releases.";
+    } else if (var == "allow_multimds_snaps") {
+      bool enable = false;
+      int r = parse_bool(val, &enable, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      // Only honoured when the caller passes the MDS-specific confirm token.
+      string confirm;
+      if (!cmd_getval(cmdmap, "confirm", confirm) ||
+          confirm != "--yes-i-am-really-a-mds") {
+        ss << "Warning! This command is for MDS only. Do not run it manually";
+        return -EPERM;
+      }
+
+      if (enable) {
+        ss << "enabled multimds with snapshot";
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_multimds_snaps_allowed();
+        });
+      } else {
+        ss << "disabled multimds with snapshot";
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.clear_multimds_snaps_allowed();
+        });
+      }
+    } else if (var == "allow_dirfrags") {
+        // Accepted for compatibility; nothing is modified.
+        ss << "Directory fragmentation is now permanently enabled."
+           << " This command is DEPRECATED and will be REMOVED from future releases.";
+    } else if (var == "down") {
+      bool is_down = false;
+      int r = parse_bool(val, &is_down, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      ss << fs->mds_map.get_fs_name();
+
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [is_down](std::shared_ptr<Filesystem> fs)
+      {
+        if (is_down) {
+          // Remember the current max_mds so "down false" can restore it.
+          if (fs->mds_map.get_max_mds() > 0) {
+            fs->mds_map.set_old_max_mds();
+            fs->mds_map.set_max_mds(0);
+          } /* else already down! */
+        } else {
+          // Restore the remembered value, or fall back to a single rank.
+          mds_rank_t oldmax = fs->mds_map.get_old_max_mds();
+          fs->mds_map.set_max_mds(oldmax ? oldmax : 1);
+        }
+      });
+
+      if (is_down) {
+        ss << " marked down. ";
+      } else {
+        ss << " marked up, max_mds = " << fs->mds_map.get_max_mds();
+      }
+    } else if (var == "cluster_down" || var == "joinable") {
+      bool joinable = true;
+      int r = parse_bool(val, &joinable, ss);
+      if (r != 0) {
+        return r;
+      }
+      // "cluster_down" is the deprecated, inverted spelling of "joinable".
+      if (var == "cluster_down") {
+        joinable = !joinable;
+      }
+
+      ss << fs->mds_map.get_fs_name();
+
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [joinable](std::shared_ptr<Filesystem> fs)
+      {
+        if (joinable) {
+          fs->mds_map.clear_flag(CEPH_MDSMAP_NOT_JOINABLE);
+        } else {
+          fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE);
+        }
+      });
+
+      if (joinable) {
+        ss << " marked joinable; MDS may join as newly active.";
+      } else {
+        ss << " marked not joinable; MDS cannot join as newly active.";
+      }
+
+      if (var == "cluster_down") {
+        ss << " WARNING: cluster_down flag is deprecated and will be"
+           << " removed in a future version. Please use \"joinable\".";
+      }
+    } else if (var == "standby_count_wanted") {
+      if (interr.length()) {
+       ss << var << " requires an integer value";
+       return -EINVAL;
+      }
+      if (n < 0) {
+       ss << var << " must be non-negative";
+       return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_standby_count_wanted(n);
+      });
+    } else if (var == "session_timeout") {
+      if (interr.length()) {
+       ss << var << " requires an integer value";
+       return -EINVAL;
+      }
+      if (n < 30) {
+       ss << var << " must be at least 30s";
+       return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_session_timeout((uint32_t)n);
+      });
+    } else if (var == "session_autoclose") {
+      if (interr.length()) {
+       ss << var << " requires an integer value";
+       return -EINVAL;
+      }
+      if (n < 30) {
+       ss << var << " must be at least 30s";
+       return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_session_autoclose((uint32_t)n);
+      });
+    } else if (var == "allow_standby_replay") {
+      bool allow = false;
+      int r = parse_bool(val, &allow, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      if (!allow) {
+        // Disabling fails every daemon currently in standby-replay, and
+        // that path ends in an osdmon proposal, so osdmon must be writeable.
+        if (!mon->osdmon()->is_writeable()) {
+          // not allowed to write yet, so retry when we can
+          mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+          return -EAGAIN;
+        }
+        // Collect first, then fail: fail_mds_gid() receives the fsmap and
+        // may alter the map being iterated (not visible from this block).
+        std::vector<mds_gid_t> to_fail;
+        for (const auto& [gid, info]: fs->mds_map.get_mds_info()) {
+          if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
+            to_fail.push_back(gid);
+          }
+        }
+
+        for (const auto& gid : to_fail) {
+          mon->mdsmon()->fail_mds_gid(fsmap, gid);
+        }
+        if (!to_fail.empty()) {
+          mon->osdmon()->propose_pending();
+        }
+      }
+
+      auto f = [allow](auto& fs) {
+        if (allow) {
+          fs->mds_map.set_standby_replay_allowed();
+        } else {
+          fs->mds_map.clear_standby_replay_allowed();
+        }
+      };
+      fsmap.modify_filesystem(fs->fscid, std::move(f));
+    } else if (var == "min_compat_client") {
+      auto vno = ceph_release_from_name(val.c_str());
+      if (!vno) {
+        ss << "version " << val << " is not recognized";
+        return -EINVAL;
+      }
+      ss << "WARNING: setting min_compat_client is deprecated"
+            " and may not do what you want.\n"
+            "The oldest release to set is octopus.\n"
+            "Please migrate to `ceph fs required_client_features ...`.";
+      auto f = [vno](auto&& fs) {
+        fs->mds_map.set_min_compat_client(vno);
+      };
+      fsmap.modify_filesystem(fs->fscid, std::move(f));
+    } else {
+      ss << "unknown variable " << var;
+      return -EINVAL;
+    }
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs compat": adds or removes a single compat / incompat
+ * feature bit in a file system's MDSMap CompatSet.
+ */
+class CompatSetHandler : public FileSystemCommandHandler
+{
+  public:
+    CompatSetHandler()
+      : FileSystemCommandHandler("fs compat")
+    {
+    }
+
+    /**
+     * Works on a copy of the file system's CompatSet and writes it back via
+     * FSMap::modify_filesystem() once the requested sub-operation succeeded.
+     *
+     * @return 0 on success (also when the request was already satisfied),
+     *         -EINVAL for bad arguments, -ENOENT for an unknown file system,
+     *         -EBUSY while any MDS is up, -EEXIST when the feature id
+     *         already exists under a different name.
+     */
+    int handle(
+	Monitor *mon,
+	FSMap &fsmap,
+	MonOpRequestRef op,
+	const cmdmap_t& cmdmap,
+	std::ostream &ss) override
+    {
+      static const std::set<std::string> subops = {"rm_incompat", "rm_compat", "add_incompat", "add_compat"};
+
+      std::string fs_name;
+      const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+      if (!have_name || fs_name.empty()) {
+        ss << "Missing filesystem name";
+        return -EINVAL;
+      }
+      auto fs = fsmap.get_filesystem(fs_name);
+      if (!fs) {
+        ss << "Not found: '" << fs_name << "'";
+        return -ENOENT;
+      }
+
+      string subop;
+      if (!cmd_getval(cmdmap, "subop", subop) || subops.find(subop) == subops.end()) {
+        ss << "subop `" << subop << "' not recognized. Must be one of: " << subops;
+        return -EINVAL;
+      }
+
+      int64_t feature = 0;
+      if (!cmd_getval(cmdmap, "feature", feature) || feature <= 0) {
+        ss << "Invalid feature";
+        return -EINVAL;
+      }
+
+      if (fs->mds_map.get_num_up_mds() > 0) {
+        ss << "file system must be failed or down; use `ceph fs fail` to bring down";
+        return -EBUSY;
+      }
+
+      // Edit a private copy; it only reaches the map at the very end.
+      CompatSet cs = fs->mds_map.compat;
+
+      // Drop `feature` from one feature set, reporting whether it was there.
+      auto drop = [&ss, feature](auto& fset, const char* removed_msg,
+                                 const char* absent_msg) {
+        if (!fset.contains(feature)) {
+          ss << absent_msg << feature;
+          return;
+        }
+        ss << removed_msg << feature;
+        fset.remove(feature);
+      };
+
+      // Insert `f` into one feature set unless the id is already present;
+      // an id present under another name is an error.
+      auto add = [&ss, feature](auto& fset, const auto& f,
+                                const std::string& wanted_name,
+                                const char* added_msg) -> int {
+        if (!fset.contains(feature)) {
+          fset.insert(f);
+          ss << added_msg << f;
+          return 0;
+        }
+        auto name = fset.get_name(feature);
+        if (name == wanted_name) {
+          ss << "feature already exists";
+          return 0;
+        }
+        ss << "feature with differing name `" << name << "' exists";
+        return -EEXIST;
+      };
+
+      if (subop == "rm_compat") {
+        drop(cs.compat, "removed compat feature ",
+             "already removed compat feature ");
+      } else if (subop == "rm_incompat") {
+        drop(cs.incompat, "removed incompat feature ",
+             "already removed incompat feature ");
+      } else if (subop == "add_compat" || subop == "add_incompat") {
+        string feature_str;
+        if (!cmd_getval(cmdmap, "feature_str", feature_str) || feature_str.empty()) {
+          ss << "adding a feature requires a feature string";
+          return -EINVAL;
+        }
+        auto f = CompatSet::Feature(feature, feature_str);
+        int rc = 0;
+        if (subop == "add_compat") {
+          rc = add(cs.compat, f, feature_str, "added compat feature ");
+        } else {
+          rc = add(cs.incompat, f, feature_str, "added incompat feature ");
+        }
+        if (rc != 0) {
+          return rc;
+        }
+      } else {
+        // subop was validated against `subops` above.
+        ceph_assert(0);
+      }
+
+      auto modifyf = [cs = std::move(cs)](auto&& fs) {
+        fs->mds_map.compat = cs;
+      };
+
+      fsmap.modify_filesystem(fs->fscid, std::move(modifyf));
+      return 0;
+    }
+};
+
+/**
+ * Handler for "fs required_client_features": adds a feature to, or removes
+ * one from, the set of client features a file system requires.
+ */
+class RequiredClientFeaturesHandler : public FileSystemCommandHandler
+{
+  public:
+    RequiredClientFeaturesHandler()
+      : FileSystemCommandHandler("fs required_client_features")
+    {
+    }
+
+    /**
+     * The feature may be given by name or as a decimal id.
+     *
+     * @return 0 on success (also when the bit already had the requested
+     *         state), -EINVAL for bad arguments, -ENOENT for an unknown
+     *         file system.
+     */
+    int handle(
+	Monitor *mon,
+	FSMap &fsmap,
+	MonOpRequestRef op,
+	const cmdmap_t& cmdmap,
+	std::ostream &ss) override
+    {
+      std::string fs_name;
+      const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+      if (!have_name || fs_name.empty()) {
+        ss << "Missing filesystem name";
+        return -EINVAL;
+      }
+      auto fs = fsmap.get_filesystem(fs_name);
+      if (!fs) {
+        ss << "Not found: '" << fs_name << "'";
+        return -ENOENT;
+      }
+
+      string subop;
+      const bool have_subop = cmd_getval(cmdmap, "subop", subop);
+      if (!have_subop || !(subop == "add" || subop == "rm")) {
+        ss << "Must either add or rm a feature; " << subop << " is not recognized";
+        return -EINVAL;
+      }
+
+      string val;
+      if (!cmd_getval(cmdmap, "val", val) || val.empty()) {
+        ss << "Missing feature id/name";
+        return -EINVAL;
+      }
+
+      // Try the symbolic name first, then fall back to a numeric id.
+      int feature = cephfs_feature_from_name(val);
+      if (feature < 0) {
+        string parse_err;
+        feature = strict_strtol(val.c_str(), 10, &parse_err);
+        if (!parse_err.empty()) {
+          ss << "Invalid feature name: " << val;
+          return -EINVAL;
+        }
+        if (feature < 0 || feature > CEPHFS_FEATURE_MAX) {
+          ss << "Invalid feature id: " << feature;
+          return -EINVAL;
+        }
+      }
+
+      const bool adding = (subop == "add");
+
+      // `changed` tells us afterwards whether the bit actually flipped.
+      bool changed = false;
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [feature, adding, &changed](auto&& fs)
+      {
+        const bool present =
+          fs->mds_map.get_required_client_features().test(feature);
+        if (present == adding) {
+          return;  // already in the requested state
+        }
+        if (adding) {
+          fs->mds_map.add_required_client_feature(feature);
+        } else {
+          fs->mds_map.remove_required_client_feature(feature);
+        }
+        changed = true;
+      });
+
+      if (adding) {
+        if (changed) {
+          ss << "added feature '" << cephfs_feature_name(feature) << "' to required_client_features";
+        } else {
+          ss << "feature '" << cephfs_feature_name(feature) << "' is already set";
+        }
+      } else {
+        if (changed) {
+          ss << "removed feature '" << cephfs_feature_name(feature) << "' from required_client_features";
+        } else {
+          ss << "feature '" << cephfs_feature_name(feature) << "' is already unset";
+        }
+      }
+      return 0;
+   }
+};
+
+
+/**
+ * Handler for "fs add_data_pool": attaches an additional data pool to an
+ * existing file system.
+ */
+class AddDataPoolHandler : public FileSystemCommandHandler
+{
+  public:
+  explicit AddDataPoolHandler(Paxos *paxos)
+    : FileSystemCommandHandler("fs add_data_pool"), m_paxos(paxos)
+  {}
+
+  bool batched_propose() override {
+    return true;
+  }
+
+  /**
+   * The "pool" argument may be a pool name or a numeric pool id.
+   *
+   * @return 0 on success or when the pool is already a data pool of the
+   *         file system, -EINVAL when the file system name is missing,
+   *         -ENOENT when the pool or the file system is unknown, -EAGAIN
+   *         when the OSD monitor is not writeable yet (the op is queued for
+   *         retry), or the non-zero result of _check_pool().
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    ceph_assert(m_paxos->is_plugged());
+
+    string poolname;
+    cmd_getval(cmdmap, "pool", poolname);
+
+    std::string fs_name;
+    if (!cmd_getval(cmdmap, "fs_name", fs_name)
+        || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    // Resolve by name first; if that fails, accept a decimal pool id.
+    int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname);
+    if (poolid < 0) {
+      string err;
+      poolid = strict_strtol(poolname.c_str(), 10, &err);
+      if (err.length()) {
+        ss << "pool '" << poolname << "' does not exist";
+        return -ENOENT;
+      }
+    }
+
+    int r = _check_pool(mon->osdmon()->osdmap, poolid, POOL_DATA_EXTRA, false, &ss);
+    if (r != 0) {
+      return r;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      // fs is dereferenced below; reject unknown names the same way the
+      // other handlers in this file do.
+      ss << "Not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+    // no-op when the data_pool already on fs
+    if (fs->mds_map.is_data_pool(poolid)) {
+      ss << "data pool " << poolid << " is already on fs " << fs_name;
+      return 0;
+    }
+
+    if (!mon->osdmon()->is_writeable()) {
+      // not allowed to write yet, so retry when we can
+      mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+    // Tag the pool with the cephfs application before recording it in the
+    // fsmap.
+    mon->osdmon()->do_application_enable(poolid,
+					 pg_pool_t::APPLICATION_NAME_CEPHFS,
+					 "data", fs_name, true);
+    mon->osdmon()->propose_pending();
+
+    fsmap.modify_filesystem(
+        fs->fscid,
+        [poolid](std::shared_ptr<Filesystem> fs)
+    {
+      fs->mds_map.add_data_pool(poolid);
+    });
+
+    ss << "added data pool " << poolid << " to fsmap";
+
+    return 0;
+  }
+
+private:
+  Paxos *m_paxos;
+};
+
+/**
+ * Handler for "fs set-default": records the named file system as the one
+ * used for legacy clients.
+ */
+class SetDefaultHandler : public FileSystemCommandHandler
+{
+  public:
+  SetDefaultHandler()
+    : FileSystemCommandHandler("fs set-default")
+  {}
+
+  /**
+   * @return 0 on success, -ENOENT when no file system has that name.
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    std::string wanted;
+    cmd_getval(cmdmap, "fs_name", wanted);
+
+    auto target = fsmap.get_filesystem(wanted);
+    if (!target) {
+      ss << "filesystem '" << wanted << "' does not exist";
+      return -ENOENT;
+    }
+
+    fsmap.set_legacy_client_fscid(target->fscid);
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs rm": removes a file system from the FSMap.
+ */
+class RemoveFilesystemHandler : public FileSystemCommandHandler
+{
+  public:
+  RemoveFilesystemHandler()
+    : FileSystemCommandHandler("fs rm")
+  {}
+
+  /**
+   * @return 0 when the file system was removed or did not exist, -EAGAIN
+   *         when the OSD monitor is not writeable yet (the op is queued for
+   *         retry), -EINVAL while any MDS is still up, -EPERM without the
+   *         confirmation flag.
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    /* Ranks may have to be blocklisted, which needs a writeable osdmon. */
+    auto *osdmon = mon->osdmon();
+    if (!osdmon->is_writeable()) {
+      // come back once osdmon accepts writes
+      osdmon->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
+      return -EAGAIN;
+    }
+
+    // The caller has to name the file system explicitly.
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+        // A missing file system counts as success so deletes are idempotent.
+        ss << "filesystem '" << fs_name << "' does not exist";
+        return 0;
+    }
+
+    // Refuse while any MDS is still up.
+    if (fs->mds_map.get_num_up_mds() > 0) {
+      ss << "all MDS daemons must be inactive/failed before removing filesystem. See `ceph fs fail`.";
+      return -EINVAL;
+    }
+
+    // Require the confirmation flag.
+    bool confirmed = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+    if (!confirmed) {
+      ss << "this is a DESTRUCTIVE operation and will make data in your filesystem permanently" \
+            " inaccessible.  Add --yes-i-really-mean-it if you are sure you wish to continue.";
+      return -EPERM;
+    }
+
+    // Do not leave the legacy-client default pointing at a removed fs.
+    const auto doomed_fscid = fs->fscid;
+    if (fsmap.get_legacy_client_fscid() == doomed_fscid) {
+      fsmap.set_legacy_client_fscid(FS_CLUSTER_ID_NONE);
+    }
+
+    // Whatever is left in the map at this point must be standby-replay.
+    std::vector<mds_gid_t> to_fail;
+    for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+      ceph_assert(info.state == MDSMap::STATE_STANDBY_REPLAY);
+      to_fail.push_back(gid);
+    }
+
+    // Standby replays don't write, so there is no need to wait for an
+    // osdmap propose here; the return value is deliberately ignored.
+    for (const auto& gid : to_fail) {
+      mon->mdsmon()->fail_mds_gid(fsmap, gid);
+    }
+    if (!to_fail.empty()) {
+      osdmon->propose_pending(); /* maybe new blocklists */
+    }
+
+    fsmap.erase_filesystem(doomed_fscid);
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs reset": resets a file system via
+ * FSMap::reset_filesystem().
+ */
+class ResetFilesystemHandler : public FileSystemCommandHandler
+{
+  public:
+  ResetFilesystemHandler()
+    : FileSystemCommandHandler("fs reset")
+  {}
+
+  /**
+   * @return 0 on success, -ENOENT for an unknown file system, -EINVAL
+   *         while any MDS is still up, -EPERM without the confirmation
+   *         flag.
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    string target_name;
+    cmd_getval(cmdmap, "fs_name", target_name);
+
+    auto target = fsmap.get_filesystem(target_name);
+    if (!target) {
+        // In contrast to "fs rm", a missing file system is an error here.
+        ss << "filesystem '" << target_name << "' does not exist";
+        return -ENOENT;
+    }
+
+    // Refuse while any MDS is still up.
+    const bool any_mds_up = target->mds_map.get_num_up_mds() > 0;
+    if (any_mds_up) {
+      ss << "all MDS daemons must be inactive before resetting filesystem: set the cluster_down flag"
+            " and use `ceph mds fail` to make this so";
+      return -EINVAL;
+    }
+
+    // Require the confirmation flag.
+    bool confirmed = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirmed);
+    if (!confirmed) {
+      ss << "this is a potentially destructive operation, only for use by experts in disaster recovery.  "
+        "Add --yes-i-really-mean-it if you are sure you wish to continue.";
+      return -EPERM;
+    }
+
+    fsmap.reset_filesystem(target->fscid);
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs rm_data_pool": detaches a data pool from a file system.
+ */
+class RemoveDataPoolHandler : public FileSystemCommandHandler
+{
+  public:
+  RemoveDataPoolHandler()
+    : FileSystemCommandHandler("fs rm_data_pool")
+  {}
+
+  /**
+   * The "pool" argument may be a pool name or a numeric pool id.
+   *
+   * @return 0 when the pool was removed or was not attached in the first
+   *         place, -EINVAL for a missing file system name, a negative pool
+   *         id or an attempt to remove the default (first) data pool,
+   *         -ENOENT for an unknown pool or file system, or any other error
+   *         returned by MDSMap::remove_data_pool().
+   */
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    string poolname;
+    cmd_getval(cmdmap, "pool", poolname);
+
+    std::string fs_name;
+    if (!cmd_getval(cmdmap, "fs_name", fs_name)
+        || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    // Resolve by name first; if that fails, accept a decimal pool id.
+    int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname);
+    if (poolid < 0) {
+      string err;
+      poolid = strict_strtol(poolname.c_str(), 10, &err);
+      if (err.length()) {
+        ss << "pool '" << poolname << "' does not exist";
+        return -ENOENT;
+      } else if (poolid < 0) {
+        ss << "invalid pool id '" << poolid << "'";
+        return -EINVAL;
+      }
+    }
+
+    ceph_assert(poolid >= 0);  // Checked by parsing code above
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      // fs is dereferenced below; reject unknown names the same way the
+      // other handlers in this file do.
+      ss << "Not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+    if (fs->mds_map.get_first_data_pool() == poolid) {
+      ss << "cannot remove default data pool";
+      return -EINVAL;
+    }
+
+    // The lambda reports remove_data_pool()'s result through `r`.
+    int r = 0;
+    fsmap.modify_filesystem(fs->fscid,
+        [&r, poolid](std::shared_ptr<Filesystem> fs)
+    {
+      r = fs->mds_map.remove_data_pool(poolid);
+    });
+    if (r == -ENOENT) {
+      // It was already removed, succeed in silence
+      return 0;
+    } else if (r == 0) {
+      // We removed it, succeed
+      ss << "removed data pool " << poolid << " from fsmap";
+      return 0;
+    } else {
+      // Unexpected error, bubble up
+      return r;
+    }
+  }
+};
+
+/**
+ * Wraps an existing handler type T so that it also answers to a second
+ * command prefix.  T must be default-constructible.
+ */
+template<typename T>
+class AliasHandler : public T
+{
+  // The alternative prefix reported in place of T's own.
+  std::string alias_prefix;
+
+  public:
+  explicit AliasHandler(const std::string &new_prefix)
+    : T(), alias_prefix(new_prefix)
+  {}
+
+  std::string const &get_prefix() const override
+  {
+    return alias_prefix;
+  }
+
+  // Command handling is delegated unchanged to the wrapped handler type.
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    const int rc = T::handle(mon, fsmap, op, cmdmap, ss);
+    return rc;
+  }
+};
+
+/**
+ * Handler for "fs mirror enable": turns mirroring on for a file system.
+ */
+class MirrorHandlerEnable : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerEnable()
+    : FileSystemCommandHandler("fs mirror enable")
+  {}
+
+  /**
+   * @return 0 on success (also when mirroring was already enabled),
+   *         -EINVAL for a missing name, -ENOENT for an unknown file system.
+   */
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, std::ostream &ss) override {
+    std::string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    // Only touch the map when the state actually changes.
+    if (!fs->mirror_info.is_mirrored()) {
+      fsmap.modify_filesystem(fs->fscid, [](auto &&fs) {
+        fs->mirror_info.enable_mirroring();
+      });
+    }
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs mirror disable": turns mirroring off for a file system.
+ */
+class MirrorHandlerDisable : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerDisable()
+    : FileSystemCommandHandler("fs mirror disable")
+  {}
+
+  /**
+   * @return 0 on success (also when mirroring was already disabled),
+   *         -EINVAL for a missing name, -ENOENT for an unknown file system.
+   */
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, std::ostream &ss) override {
+    std::string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    // Only touch the map when the state actually changes.
+    if (fs->mirror_info.is_mirrored()) {
+      fsmap.modify_filesystem(fs->fscid, [](auto &&fs) {
+        fs->mirror_info.disable_mirroring();
+      });
+    }
+
+    return 0;
+  }
+};
+
+/**
+ * Handler for "fs mirror peer_add": registers a mirror peer on a file
+ * system that has mirroring enabled.
+ */
+class MirrorHandlerAddPeer : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerAddPeer()
+    : FileSystemCommandHandler("fs mirror peer_add")
+  {}
+
+  /**
+   * Split "<client>@<cluster>" at the first '@'.
+   *
+   * @return the (client, cluster) pair, or an empty optional when @p spec
+   *         contains no '@'.
+   */
+  boost::optional<std::pair<string, string>>
+  extract_remote_cluster_conf(const std::string &spec) {
+    const auto at = spec.find('@');
+    if (at == std::string::npos) {
+      return {};
+    }
+
+    return std::make_pair(spec.substr(0, at), spec.substr(at + 1));
+  }
+
+  /**
+   * Add the peer described by the "uuid", "remote_cluster_spec" and
+   * "remote_fs_name" arguments.
+   *
+   * @return false only when the remote cluster spec is malformed; true
+   *         when the peer was added or already existed.
+   */
+  bool peer_add(FSMap &fsmap, Filesystem::const_ref &&fs,
+                const cmdmap_t &cmdmap, std::ostream &ss) {
+    string peer_uuid;
+    string remote_spec;
+    string remote_fs_name;
+    cmd_getval(cmdmap, "uuid", peer_uuid);
+    cmd_getval(cmdmap, "remote_cluster_spec", remote_spec);
+    cmd_getval(cmdmap, "remote_fs_name", remote_fs_name);
+
+    // The spec must have the form <client>@<cluster>.
+    auto remote_conf = extract_remote_cluster_conf(remote_spec);
+    if (!remote_conf) {
+      ss << "invalid remote cluster spec -- should be <client>@<cluster>";
+      return false;
+    }
+    const string &client = remote_conf->first;
+    const string &cluster = remote_conf->second;
+
+    // A peer is a duplicate if either its uuid or its
+    // (client, cluster, fs name) triple is already registered.
+    const auto &mirror_info = fs->mirror_info;
+    if (mirror_info.has_peer(peer_uuid)) {
+      ss << "peer already exists";
+      return true;
+    }
+    if (mirror_info.has_peer(client, cluster, remote_fs_name)) {
+      ss << "peer already exists";
+      return true;
+    }
+
+    fsmap.modify_filesystem(
+        fs->fscid,
+        [peer_uuid, client, cluster, remote_fs_name](auto &&fs) {
+          fs->mirror_info.peer_add(peer_uuid, client, cluster, remote_fs_name);
+        });
+    return true;
+  }
+
+  /**
+   * @return 0 on success, -EINVAL for a missing name, a malformed spec or
+   *         when mirroring is not enabled, -ENOENT for an unknown file
+   *         system.
+   */
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, std::ostream &ss) override {
+    std::string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    if (!fs->mirror_info.is_mirrored()) {
+      ss << "Mirroring not enabled for filesystem '" << fs_name << "'";
+      return -EINVAL;
+    }
+
+    return peer_add(fsmap, std::move(fs), cmdmap, ss) ? 0 : -EINVAL;
+  }
+};
+
+/**
+ * Handler for "fs mirror peer_remove": unregisters a mirror peer from a
+ * file system that has mirroring enabled.
+ */
+class MirrorHandlerRemovePeer : public FileSystemCommandHandler
+{
+public:
+  MirrorHandlerRemovePeer()
+    : FileSystemCommandHandler("fs mirror peer_remove")
+  {}
+
+  /**
+   * Remove the peer identified by the "uuid" argument.
+   *
+   * @return always true; an unknown uuid is only reported through @p ss.
+   */
+  bool peer_remove(FSMap &fsmap, Filesystem::const_ref &&fs,
+                   const cmdmap_t &cmdmap, std::ostream &ss) {
+    string peer_uuid;
+    cmd_getval(cmdmap, "uuid", peer_uuid);
+
+    const bool known = fs->mirror_info.has_peer(peer_uuid);
+    if (known) {
+      fsmap.modify_filesystem(fs->fscid, [peer_uuid](auto &&fs) {
+        fs->mirror_info.peer_remove(peer_uuid);
+      });
+    } else {
+      ss << "cannot find peer with uuid: " << peer_uuid;
+    }
+    return true;
+  }
+
+  /**
+   * @return 0 on success, -EINVAL for a missing name or when mirroring is
+   *         not enabled, -ENOENT for an unknown file system.
+   */
+  int handle(Monitor *mon,
+             FSMap &fsmap, MonOpRequestRef op,
+             const cmdmap_t& cmdmap, std::ostream &ss) override {
+    std::string fs_name;
+    const bool have_name = cmd_getval(cmdmap, "fs_name", fs_name);
+    if (!have_name || fs_name.empty()) {
+      ss << "Missing filesystem name";
+      return -EINVAL;
+    }
+
+    auto fs = fsmap.get_filesystem(fs_name);
+    if (!fs) {
+      ss << "Filesystem '" << fs_name << "' not found";
+      return -ENOENT;
+    }
+
+    if (!fs->mirror_info.is_mirrored()) {
+      ss << "Mirroring not enabled for filesystem '" << fs_name << "'";
+      return -EINVAL;
+    }
+
+    return peer_remove(fsmap, std::move(fs), cmdmap, ss) ? 0 : -EINVAL;
+  }
+};
+
+/**
+ * Build the list of all file system command handlers, in lookup order.
+ *
+ * @param paxos handed to the handlers that take a Paxos pointer
+ *              (AddDataPoolHandler and FsNewHandler).
+ */
+std::list<std::shared_ptr<FileSystemCommandHandler> >
+FileSystemCommandHandler::load(Paxos *paxos)
+{
+  std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
+
+  // Settings, flags and pool management.
+  handlers.emplace_back(std::make_shared<SetHandler>());
+  handlers.emplace_back(std::make_shared<FailHandler>());
+  handlers.emplace_back(std::make_shared<FlagSetHandler>());
+  handlers.emplace_back(std::make_shared<CompatSetHandler>());
+  handlers.emplace_back(std::make_shared<RequiredClientFeaturesHandler>());
+  handlers.emplace_back(std::make_shared<AddDataPoolHandler>(paxos));
+  handlers.emplace_back(std::make_shared<RemoveDataPoolHandler>());
+
+  // File system lifecycle.
+  handlers.emplace_back(std::make_shared<FsNewHandler>(paxos));
+  handlers.emplace_back(std::make_shared<RemoveFilesystemHandler>());
+  handlers.emplace_back(std::make_shared<ResetFilesystemHandler>());
+
+  // "fs set-default" plus its underscore-spelled alias.
+  handlers.emplace_back(std::make_shared<SetDefaultHandler>());
+  handlers.emplace_back(
+      std::make_shared<AliasHandler<SetDefaultHandler> >("fs set_default"));
+
+  // Mirroring.
+  handlers.emplace_back(std::make_shared<MirrorHandlerEnable>());
+  handlers.emplace_back(std::make_shared<MirrorHandlerDisable>());
+  handlers.emplace_back(std::make_shared<MirrorHandlerAddPeer>());
+  handlers.emplace_back(std::make_shared<MirrorHandlerRemovePeer>());
+
+  return handlers;
+}
+
+/**
+ * Decide whether a pool may be used by CephFS in the given role.
+ *
+ * @param osd_map map used to look up the pool, its name and its write tier
+ * @param pool_id id of the pool to check
+ * @param type POOL_METADATA, POOL_DATA_DEFAULT or POOL_DATA_EXTRA
+ * @param force bypasses the "EC pool as default data pool" rejection and
+ *              the "non-CephFS application enabled" rejection
+ * @param ss receives the reason on failure; must not be null (asserted)
+ * @return 0 if the pool is acceptable, -ENOENT if it does not exist,
+ *         -EINVAL if one of the checks below rejects it
+ */
+int FileSystemCommandHandler::_check_pool(
+    OSDMap &osd_map,
+    const int64_t pool_id,
+    int type,
+    bool force,
+    std::ostream *ss) const
+{
+  ceph_assert(ss != NULL);
+
+  const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
+  if (!pool) {
+    *ss << "pool id '" << pool_id << "' does not exist";
+    return -ENOENT;
+  }
+
+  const string& pool_name = osd_map.get_pool_name(pool_id);
+
+  // Erasure-coded pools: never for metadata, only with --force as the
+  // default data pool, and otherwise only if overwrites are possible
+  // (natively or through a suitable cache tier).
+  if (pool->is_erasure()) {
+    if (type == POOL_METADATA) {
+      *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+         << " is an erasure-coded pool.  Use of erasure-coded pools"
+         << " for CephFS metadata is not permitted";
+      return -EINVAL;
+    } else if (type == POOL_DATA_DEFAULT && !force) {
+      *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+             " is an erasure-coded pool."
+             " Use of an EC pool for the default data pool is discouraged;"
+             " see the online CephFS documentation for more information."
+             " Use --force to override.";
+      return -EINVAL;
+    } else if (!pool->allows_ecoverwrites()) {
+      // non-overwriteable EC pools are only acceptable with a cache tier overlay
+      if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) {
+        *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
+            << " is an erasure-coded pool, with no overwrite support";
+        return -EINVAL;
+      }
+
+      // That cache tier overlay must be writeback, not readonly (it's the
+      // write operations like modify+truncate we care about support for)
+      const pg_pool_t *write_tier = osd_map.get_pg_pool(
+          pool->write_tier);
+      ceph_assert(write_tier != NULL);  // OSDMonitor shouldn't allow DNE tier
+      if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD
+          || write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) {
+        *ss << "EC pool '" << pool_name << "' has a write tier ("
+            << osd_map.get_pool_name(pool->write_tier)
+            << ") that is configured "
+               "to forward writes.  Use a cache mode such as 'writeback' for "
+               "CephFS";
+        return -EINVAL;
+      }
+    }
+  }
+
+  // A pool that is itself a cache tier is rejected regardless of `force`.
+  if (pool->is_tier()) {
+    *ss << " pool '" << pool_name << "' (id '" << pool_id
+      << "') is already in use as a cache tier.";
+    return -EINVAL;
+  }
+
+  // A pool with application metadata that does not include the CephFS
+  // application is rejected unless forced; a pool with no application
+  // metadata at all passes this check.
+  if (!force && !pool->application_metadata.empty() &&
+      pool->application_metadata.count(
+        pg_pool_t::APPLICATION_NAME_CEPHFS) == 0) {
+    *ss << " pool '" << pool_name << "' (id '" << pool_id
+        << "') has a non-CephFS application enabled.";
+    return -EINVAL;
+  }
+
+  // Nothing special about this pool, so it is permissible
+  return 0;
+}
+
+/**
+ * Check whether the session behind @p op may run this command on the file
+ * system named by the "fs_name" argument.
+ *
+ * @return 1 if allowed, -ENOENT if the file system is not visible to the
+ *         session, -EPERM if the session lacks write capability on it
+ */
+int FileSystemCommandHandler::is_op_allowed(
+    const MonOpRequestRef& op, const FSMap& fsmap, const cmdmap_t& cmdmap,
+    std::ostream &ss) const
+{
+  string fs_name;
+  cmd_getval(cmdmap, "fs_name", fs_name);
+
+  // Filter a private copy so the caller's fsmap stays untouched.
+  FSMap visible = fsmap;
+  visible.filter(op->get_session()->get_allowed_fs_names());
+
+  if (visible.get_filesystem(fs_name) == nullptr) {
+    /* let "fs rm" handle idempotent case where file system does not exist */
+    const bool idempotent_rm =
+      get_prefix() == "fs rm" && fsmap.get_filesystem(fs_name) == nullptr;
+    if (!idempotent_rm) {
+      ss << "Filesystem not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+  }
+
+  if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+    ss << "Permission denied: '" << fs_name << "'";
+    return -EPERM;
+  }
+
+  return 1;
+}
diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h
new file mode 100644
index 000000000..4b59225f9
--- /dev/null
+++ b/src/mon/FSCommands.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef FS_COMMANDS_H_
+#define FS_COMMANDS_H_
+
+#include "Monitor.h"
+#include "CommandHandler.h"
+
+#include "osd/OSDMap.h"
+#include "mds/FSMap.h"
+
+#include <string>
+#include <ostream>
+
+/**
+ * Base class for the handler of a single "fs ..." monitor command.
+ *
+ * Each handler is identified by the command prefix it was constructed
+ * with; can_handle() matches an incoming prefix against it and handle()
+ * carries out the command.
+ */
+class FileSystemCommandHandler : protected CommandHandler
+{
+protected:
+  std::string prefix;  ///< command prefix this handler answers to
+
+  /// Intended role of a pool, passed to _check_pool() as `type`.
+  enum {
+    POOL_METADATA,
+    POOL_DATA_DEFAULT,
+    POOL_DATA_EXTRA,
+  };
+  /**
+   * Return 0 if the pool is suitable for use with CephFS, or
+   * in case of errors return a negative error code, and populate
+   * the passed ostream with an explanation.
+   *
+   * @param osd_map map used to look up the pool
+   * @param pool_id id of the pool to check
+   * @param type one of POOL_METADATA, POOL_DATA_DEFAULT, POOL_DATA_EXTRA
+   *             (metadata gets the strictest checks)
+   * @param force override the checks that are only discouraged rather
+   *              than forbidden
+   * @param ss receives the explanation on failure; must not be null
+   */
+  int _check_pool(
+      OSDMap &osd_map,
+      const int64_t pool_id,
+      int type,
+      bool force,
+      std::ostream *ss) const;
+
+  /// Prefix used for matching; virtual so a subclass can report another one.
+  virtual std::string const &get_prefix() const {return prefix;}
+
+public:
+  FileSystemCommandHandler(const std::string &prefix_)
+    : prefix(prefix_)
+  {}
+
+  virtual ~FileSystemCommandHandler()
+  {}
+
+  /**
+   * Check that the session behind `op` may run this command on the file
+   * system named in `cmdmap`.
+   *
+   * @return 1 if allowed, otherwise a negative error code with the reason
+   *         written to `ss`
+   */
+  int is_op_allowed(const MonOpRequestRef& op, const FSMap& fsmap,
+		    const cmdmap_t& cmdmap, std::ostream &ss) const;
+
+  /**
+   * Decide whether this handler should process a command.
+   *
+   * @return 0 if `prefix_` is not this handler's prefix; 1 for "fs new"
+   *         and "fs flag set", which skip the per-file-system permission
+   *         check; otherwise the result of is_op_allowed()
+   */
+  int can_handle(std::string const &prefix_, MonOpRequestRef& op, FSMap& fsmap,
+	         const cmdmap_t& cmdmap, std::ostream &ss) const
+  {
+    if (get_prefix() != prefix_) {
+      return 0;
+    }
+
+    if (get_prefix() == "fs new" || get_prefix() == "fs flag set") {
+      return 1;
+    }
+
+    return is_op_allowed(op, fsmap, cmdmap, ss);
+  }
+
+  /// Create one instance of every known "fs ..." command handler.
+  static std::list<std::shared_ptr<FileSystemCommandHandler> > load(Paxos *paxos);
+
+  /// Defaults to false; handlers may override.
+  /// NOTE(review): how callers act on this flag is not visible here.
+  virtual bool batched_propose() {
+    return false;
+  }
+
+  /**
+   * Carry out the command against `fsmap`, writing any message for the
+   * user to `ss`.
+   *
+   * NOTE(review): the handlers visible alongside this class return 0 on
+   * success and a negative errno on failure; confirm for the others.
+   */
+  virtual int handle(
+    Monitor *mon,
+    FSMap &fsmap,
+    MonOpRequestRef op,
+    const cmdmap_t& cmdmap,
+    std::ostream &ss) = 0;
+};
+
+#endif
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc
new file mode 100644
index 000000000..a45159e7c
--- /dev/null
+++ b/src/mon/HealthMonitor.cc
@@ -0,0 +1,877 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <sstream>
+#include <regex>
+#include <time.h>
+#include <iterator>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "include/stringify.h"
+
+#include "mon/Monitor.h"
+#include "mon/HealthMonitor.h"
+
+#include "messages/MMonHealthChecks.h"
+
+#include "common/Formatter.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+using namespace TOPNSPC::common;
+
+using namespace std::literals;
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::parse_timespan;
+using ceph::timespan_str;
+// Write the debug-log prefix for this service: the monitor's name, rank and
+// state followed by the ".health" tag.  hmon is not used.
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
+                        const HealthMonitor *hmon) {
+  ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").health ";
+  return out;
+}
+
+// Forwards all arguments to PaxosService; the health monitor needs no
+// further construction-time setup.
+HealthMonitor::HealthMonitor(Monitor &m, Paxos &p, const string& service_name)
+  : PaxosService(m, p, service_name) {
+}
+
+// Nothing to initialise; only logs the call at debug level 10.
+void HealthMonitor::init()
+{
+  dout(10) << __func__ << dendl;
+}
+
+// No initial state is created here; only logs the call at debug level 10.
+void HealthMonitor::create_initial()
+{
+  dout(10) << __func__ << dendl;
+}
+
+/**
+ * Reload the in-memory health state from the monitor store.
+ *
+ * Sets `version` to the last committed version, calls load_health(), and
+ * refreshes quorum_checks, leader_checks and mutes from the "quorum",
+ * "leader" and "mutes" keys of this service.  A key with no data clears
+ * the corresponding member.  The result is dumped as JSON at debug
+ * level 20.
+ *
+ * @param need_bootstrap not used by this implementation
+ */
+void HealthMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  version = get_last_committed();
+  dout(10) << __func__ << dendl;
+  load_health();
+
+  // per-quorum-member checks
+  bufferlist qbl;
+  mon.store->get(service_name, "quorum", qbl);
+  if (qbl.length()) {
+    auto p = qbl.cbegin();
+    decode(quorum_checks, p);
+  } else {
+    quorum_checks.clear();
+  }
+
+  // leader-only checks
+  bufferlist lbl;
+  mon.store->get(service_name, "leader", lbl);
+  if (lbl.length()) {
+    auto p = lbl.cbegin();
+    decode(leader_checks, p);
+  } else {
+    leader_checks.clear();
+  }
+
+  // committed mutes
+  {
+    bufferlist bl;
+    mon.store->get(service_name, "mutes", bl);
+    if (bl.length()) {
+      auto p = bl.cbegin();
+      decode(mutes, p);
+    } else {
+      mutes.clear();
+    }
+  }
+
+  // The log statement is left open here and closed by the final
+  // `*_dout << dendl` so the JSON dump lands inside the same log entry.
+  dout(20) << "dump:";
+  JSONFormatter jf(true);
+  jf.open_object_section("health");
+  jf.open_object_section("quorum_health");
+  for (auto& p : quorum_checks) {
+    string s = string("mon.") + stringify(p.first);
+    jf.dump_object(s.c_str(), p.second);
+  }
+  jf.close_section();
+  jf.dump_object("leader_health", leader_checks);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+}
+
+// Start a new pending state: pending_mutes begins as a copy of the
+// committed mutes.
+void HealthMonitor::create_pending()
+{
+  dout(10) << " " << version << dendl;
+  pending_mutes = mutes;
+}
+
+/**
+ * Encode the pending health state into transaction @p t.
+ *
+ * Bumps `version`, stores quorum_checks, leader_checks and pending_mutes
+ * under the "quorum", "leader" and "mutes" keys, then builds the combined
+ * health map: the per-monitor checks are merged, the %hasorhave%,
+ * %names%, %plurals% and %isorare% placeholders in their summaries are
+ * filled in from the set of monitors reporting each code, and the leader
+ * checks are merged on top before encode_health() is called.
+ */
+void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  ++version;
+  dout(10) << " " << version << dendl;
+  put_last_committed(t, version);
+
+  bufferlist qbl;
+  encode(quorum_checks, qbl);
+  t->put(service_name, "quorum", qbl);
+  bufferlist lbl;
+  encode(leader_checks, lbl);
+  t->put(service_name, "leader", lbl);
+  {
+    bufferlist bl;
+    encode(pending_mutes, bl);
+    t->put(service_name, "mutes", bl);
+  }
+
+  health_check_map_t pending_health;
+
+  // combine per-mon details carefully...
+  map<string,set<string>> names; // code -> <mon names>
+  // Iterate by const reference: copying each per-monitor check map (and
+  // each individual check) on every encode is pure overhead.
+  for (const auto& p : quorum_checks) {
+    for (const auto& q : p.second.checks) {
+      names[q.first].insert(mon.monmap->get_name(p.first));
+    }
+    pending_health.merge(p.second);
+  }
+
+  // The placeholder patterns never change, so compile them once instead
+  // of once per check.
+  const std::regex re_hasorhave("%hasorhave%");
+  const std::regex re_names("%names%");
+  const std::regex re_plurals("%plurals%");
+  const std::regex re_isorare("%isorare%");
+  for (auto &p : pending_health.checks) {
+    // one lookup per check instead of one per placeholder
+    const auto& who = names[p.first];
+    const bool several = who.size() > 1;
+    auto& summary = p.second.summary;
+    summary = std::regex_replace(summary, re_hasorhave,
+                                 several ? "have" : "has");
+    summary = std::regex_replace(summary, re_names, stringify(who));
+    summary = std::regex_replace(summary, re_plurals, several ? "s" : "");
+    summary = std::regex_replace(summary, re_isorare,
+                                 several ? "are" : "is");
+  }
+
+  pending_health.merge(leader_checks);
+  encode_health(pending_health, t);
+}
+
+// Returns the version to trim to: everything but the newest few versions,
+// or 0 while there are not yet more than that many.
+version_t HealthMonitor::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  const version_t keep = 5;
+  return version > keep ? version - keep : 0;
+}
+
+/**
+ * First-stage dispatch of an incoming message.
+ *
+ * @return the result of preprocess_command() for commands, false for
+ *         health-check reports, true for any other type (which is logged
+ *         and marked as needing no reply)
+ */
+bool HealthMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+
+  if (type == MSG_MON_COMMAND) {
+    return preprocess_command(op);
+  }
+  if (type == MSG_MON_HEALTH_CHECKS) {
+    // not answered here; prepare_update() has the handler for these
+    return false;
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+/**
+ * Second-stage dispatch: route health-check reports and commands to
+ * their prepare_* handler.
+ *
+ * @return the handler's result, or false for any other message type
+ */
+bool HealthMonitor::prepare_update(MonOpRequestRef op)
+{
+  Message *m = op->get_req();
+  dout(7) << "prepare_update " << *m
+          << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto type = m->get_type();
+  if (type == MSG_MON_HEALTH_CHECKS) {
+    return prepare_health_checks(op);
+  }
+  if (type == MSG_MON_COMMAND) {
+    return prepare_command(op);
+  }
+  return false;
+}
+
+/**
+ * Sanity-check a command before it reaches prepare_command().
+ *
+ * Replies and returns true when the command JSON cannot be parsed
+ * (-EINVAL), when the op has no session (-EACCES), or when reading the
+ * "format" / "prefix" arguments throws bad_cmd_get (-EINVAL).
+ *
+ * @return true if a reply was sent here, false otherwise
+ */
+bool HealthMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    mon.reply_command(op, -EINVAL, ss.str(), rdata, get_last_committed());
+    return true;
+  }
+
+  if (op->get_session() == nullptr) {
+    mon.reply_command(op, -EACCES, "access denied", rdata,
+                      get_last_committed());
+    return true;
+  }
+
+  // more sanity checks: the values are discarded, only a failure to read
+  // them matters
+  try {
+    string format_arg;
+    cmd_getval(cmdmap, "format", format_arg);
+    string prefix_arg;
+    cmd_getval(cmdmap, "prefix", prefix_arg);
+  } catch (const bad_cmd_get& e) {
+    mon.reply_command(op, -EINVAL, e.what(), rdata, get_last_committed());
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Apply a "health mute" or "health unmute" command to pending_mutes.
+ *
+ * "health mute" needs a non-empty "code"; optional "sticky" and "ttl"
+ * arguments are honoured.  A non-sticky mute is refused with -ENOENT
+ * unless the alert is currently raised, and records the alert's current
+ * count and summary.  "health unmute" erases the given code, or all
+ * mutes when no code is given.  Any other prefix yields -ENOSYS.
+ *
+ * @return true if the command was accepted (the reply is deferred until
+ *         the proposal finishes) or if an early error reply was sent for
+ *         unparsable JSON / a missing session; false if an error reply
+ *         was sent from the common exit path
+ */
+bool HealthMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  // NOTE(review): the formatter is created but not used in this function.
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  int r = 0;
+
+  if (prefix == "health mute") {
+    string code;
+    bool sticky = false;
+    if (!cmd_getval(cmdmap, "code", code) ||
+	code == "") {
+      r = -EINVAL;
+      ss << "must specify an alert code to mute";
+      goto out;
+    }
+    cmd_getval(cmdmap, "sticky", sticky);
+    string ttl_str;
+    // stays default-constructed (no expiry) unless a ttl argument is given
+    utime_t ttl;
+    if (cmd_getval(cmdmap, "ttl", ttl_str)) {
+      auto secs = parse_timespan(ttl_str);
+      if (secs == 0s) {
+	r = -EINVAL;
+	ss << "not a valid duration: " << ttl_str;
+	goto out;
+      }
+      // store the absolute expiry time: now + requested duration
+      ttl = ceph_clock_now();
+      ttl += std::chrono::duration<double>(secs).count();
+    }
+    health_check_map_t all;
+    gather_all_health_checks(&all);
+    string summary;
+    int64_t count = 0;
+    if (!sticky) {
+      // a non-sticky mute can only be placed on an alert that is raised
+      auto p = all.checks.find(code);
+      if (p == all.checks.end()) {
+	r = -ENOENT;
+	ss << "health alert " << code << " is not currently raised";
+	goto out;
+      }
+      count = p->second.count;
+      summary = p->second.summary;
+    }
+    // creates the mute, or overwrites an existing one for the same code
+    auto& m = pending_mutes[code];
+    m.code = code;
+    m.ttl = ttl;
+    m.sticky = sticky;
+    m.summary = summary;
+    m.count = count;
+  } else if (prefix == "health unmute") {
+    string code;
+    if (cmd_getval(cmdmap, "code", code)) {
+      pending_mutes.erase(code);
+    } else {
+      pending_mutes.clear();
+    }
+  } else {
+    ss << "Command '" << prefix << "' not implemented!";
+    r = -ENOSYS;
+  }
+
+out:
+  dout(4) << __func__ << " done, r=" << r << dendl;
+  /* Compose response */
+  // only the first line of the accumulated message is sent back
+  string rs;
+  getline(ss, rs);
+
+  if (r >= 0) {
+    // success.. delay reply
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else {
+    // reply immediately
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return false;
+  }
+}
+
+// Record the health checks reported by another monitor, keyed by the
+// number of the sending entity.  Always returns true.
+bool HealthMonitor::prepare_health_checks(MonOpRequestRef op)
+{
+  auto report = op->get_req<MMonHealthChecks>();
+  // no need to check if it's changed, the peon has done so
+  auto& slot = quorum_checks[report->get_source().num()];
+  slot = std::move(report->health_checks);
+  return true;
+}
+
+/**
+ * Periodic work: every active monitor refreshes its own member checks;
+ * the leader additionally refreshes the leader checks and the mutes and
+ * proposes if any of the three reported a change.
+ */
+void HealthMonitor::tick()
+{
+  if (!is_active()) {
+    return;
+  }
+  dout(10) << __func__ << dendl;
+
+  bool changed = check_member_health();
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  // |= rather than ||: both checks must run even if one already changed
+  changed |= check_leader_health();
+  changed |= check_mutes();
+  if (changed) {
+    propose_pending();
+  }
+}
+
+/**
+ * Expire or tighten pending health mutes that no longer apply.
+ *
+ * A mute is removed when its TTL has passed.  A non-sticky mute is also
+ * removed when its alert is no longer raised, when the alert's count has
+ * grown past the muted count, or (for mutes without a count) when the
+ * alert's summary differs from the muted one.  A count-based mute whose
+ * alert count has dropped is lowered to the new count.
+ *
+ * @return true only if pending_mutes was actually modified
+ */
+bool HealthMonitor::check_mutes()
+{
+  // Must start false: tick() proposes whenever this returns true, so
+  // starting from true would trigger a proposal on every leader tick.
+  bool changed = false;
+  auto now = ceph_clock_now();
+  health_check_map_t all;
+  gather_all_health_checks(&all);
+  auto p = pending_mutes.begin();
+  while (p != pending_mutes.end()) {
+    // a default-constructed ttl means "no expiry"
+    if (p->second.ttl != utime_t() &&
+        p->second.ttl <= now) {
+      mon.clog->info() << "Health alert mute " << p->first
+                        << " cleared (passed TTL " << p->second.ttl << ")";
+      p = pending_mutes.erase(p);
+      changed = true;
+      continue;
+    }
+    if (!p->second.sticky) {
+      auto q = all.checks.find(p->first);
+      if (q == all.checks.end()) {
+        mon.clog->info() << "Health alert mute " << p->first
+                          << " cleared (health alert cleared)";
+        p = pending_mutes.erase(p);
+        changed = true;
+        continue;
+      }
+      if (p->second.count) {
+        // count-based mute
+        if (q->second.count > p->second.count) {
+          mon.clog->info() << "Health alert mute " << p->first
+                            << " cleared (count increased from " << p->second.count
+                            << " to " << q->second.count << ")";
+          p = pending_mutes.erase(p);
+          changed = true;
+          continue;
+        }
+        if (q->second.count < p->second.count) {
+          // ratchet down the mute
+          dout(10) << __func__ << " mute " << p->first << " count "
+                   << p->second.count << " -> " << q->second.count
+                   << dendl;
+          p->second.count = q->second.count;
+          changed = true;
+        }
+      } else {
+        // summary-based mute
+        if (p->second.summary != q->second.summary) {
+          mon.clog->info() << "Health alert mute " << p->first
+                            << " cleared (summary changed)";
+          p = pending_mutes.erase(p);
+          changed = true;
+          continue;
+        }
+      }
+    }
+    ++p;
+  }
+  return changed;
+}
+
+// Merge the health checks of every paxos service of this monitor into
+// *all.
+void HealthMonitor::gather_all_health_checks(health_check_map_t *all)
+{
+  for (auto svc = std::begin(mon.paxos_service);
+       svc != std::end(mon.paxos_service);
+       ++svc) {
+    all->merge((*svc)->get_health_checks());
+  }
+}
+
+/**
+ * Compute the overall health status and render the current checks.
+ *
+ * The status starts at HEALTH_OK and is replaced by the severity of any
+ * unmuted check whose severity value is lower (presumably lower values
+ * are more severe - confirm against health_status_t).
+ *
+ * @param want_detail include per-check detail lines
+ * @param f if non-null, the result is dumped to this formatter and
+ *          `plain` is not touched
+ * @param plain receives the text rendering when `f` is null; it is
+ *              dereferenced unconditionally in that case
+ * @param sep1 separator between the status and what follows it
+ * @param sep2 separator between individual summaries
+ * @return the overall status over all unmuted checks
+ */
+health_status_t HealthMonitor::get_health_status(
+  bool want_detail,
+  Formatter *f,
+  std::string *plain,
+  const char *sep1,
+  const char *sep2)
+{
+  health_check_map_t all;
+  gather_all_health_checks(&all);
+  health_status_t r = HEALTH_OK;
+  for (auto& p : all.checks) {
+    if (!mutes.count(p.first)) {
+      if (r > p.second.severity) {
+	r = p.second.severity;
+      }
+    }
+  }
+  if (f) {
+    // structured output: status, every check (muted or not, flagged with
+    // "muted"), then the list of mutes
+    f->open_object_section("health");
+    f->dump_stream("status") << r;
+    f->open_object_section("checks");
+    for (auto& p : all.checks) {
+      f->open_object_section(p.first.c_str());
+      p.second.dump(f, want_detail);
+      f->dump_bool("muted", mutes.count(p.first));
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("mutes");
+    for (auto& p : mutes) {
+      f->dump_object("mute", p.second);
+    }
+    f->close_section();
+    f->close_section();
+  } else {
+    auto now = ceph_clock_now();
+    // one-liner: HEALTH_FOO[ thing1[; thing2 ...]]
+    string summary;
+    for (auto& p : all.checks) {
+      if (!mutes.count(p.first)) {
+	if (!summary.empty()) {
+	  summary += sep2;
+	}
+	summary += p.second.summary;
+      }
+    }
+    *plain = stringify(r);
+    if (summary.size()) {
+      *plain += sep1;
+      *plain += summary;
+    }
+    // append "(muted: CODE[(time left)] ...)" when any mute exists
+    if (!mutes.empty()) {
+      if (summary.size()) {
+	*plain += sep2;
+      } else {
+	*plain += sep1;
+      }
+      *plain += "(muted:";
+      for (auto& p : mutes) {
+	*plain += " ";
+	*plain += p.first;
+	if (p.second.ttl) {
+	  if (p.second.ttl > now) {
+	    auto left = p.second.ttl;
+	    left -= now;
+	    *plain += "("s + utimespan_str(left) + ")";
+	  } else {
+	    *plain += "(0s)";
+	  }
+	}
+      }
+      *plain += ")";
+    }
+    *plain += "\n";
+    // detail
+    if (want_detail) {
+      // unlike the one-liner, the detail section lists muted checks too,
+      // prefixed with a "(MUTED ...)" marker
+      for (auto& p : all.checks) {
+	auto q = mutes.find(p.first);
+	if (q != mutes.end()) {
+	  *plain += "(MUTED";
+	  if (q->second.ttl != utime_t()) {
+	    if (q->second.ttl > now) {
+	      auto left = q->second.ttl;
+	      left -= now;
+	      *plain += " ttl ";
+	      *plain += utimespan_str(left);
+	    } else {
+	      // NOTE(review): this yields "(MUTED0s" with no " ttl "
+	      // separator, unlike the branch above - confirm intended.
+	      *plain += "0s";
+	    }
+	  }
+	  if (q->second.sticky) {
+	    *plain += ", STICKY";
+	  }
+	  *plain += ") ";
+	}
+	*plain += "["s + short_health_string(p.second.severity) + "] " +
+	  p.first + ": " + p.second.summary + "\n";
+	for (auto& d : p.second.detail) {
+	  *plain += "    ";
+	  *plain += d;
+	  *plain += "\n";
+	}
+      }
+    }
+  }
+  return r;
+}
+
+/**
+ * Evaluate the health checks every monitor performs about itself.
+ *
+ * Builds a fresh check map covering data-directory space (MON_DISK_CRIT /
+ * MON_DISK_LOW), store size (MON_DISK_BIG), a zero
+ * mon_osd_down_out_interval, and insecure global_id reclaim (in use by
+ * clients, and allowed by configuration).  If the result differs from
+ * what is recorded for this monitor's rank, the leader stores it in
+ * quorum_checks, while any other monitor sends it to the leader.
+ *
+ * @return true only when this monitor is the leader and quorum_checks
+ *         was updated
+ */
+bool HealthMonitor::check_member_health()
+{
+  dout(20) << __func__ << dendl;
+  bool changed = false;
+  // cap on the number of detail lines collected for one check
+  const auto max = g_conf().get_val<uint64_t>("mon_health_max_detail");
+
+  // snapshot of usage
+  DataStats stats;
+  get_fs_stats(stats.fs_stats, g_conf()->mon_data.c_str());
+  map<string,uint64_t> extra;
+  uint64_t store_size = mon.store->get_estimated_size(extra);
+  ceph_assert(store_size > 0);
+  stats.store_stats.bytes_total = store_size;
+  stats.store_stats.bytes_sst = extra["sst"];
+  stats.store_stats.bytes_log = extra["log"];
+  stats.store_stats.bytes_misc = extra["misc"];
+  stats.last_update = ceph_clock_now();
+  dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
+	   << " total " << byte_u_t(stats.fs_stats.byte_total)
+	   << ", used " << byte_u_t(stats.fs_stats.byte_used)
+	   << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl;
+
+  // MON_DISK_{LOW,CRIT,BIG}
+  // The %plurals%, %names%, %isorare% and %hasorhave% placeholders in the
+  // summaries below are substituted in encode_pending().
+  health_check_map_t next;
+  if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) {
+    stringstream ss, ss2;
+    ss << "mon%plurals% %names% %isorare% very low on available space";
+    auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1);
+    ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent
+	<< "% avail";
+    d.detail.push_back(ss2.str());
+  } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) {
+    stringstream ss, ss2;
+    ss << "mon%plurals% %names% %isorare% low on available space";
+    auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1);
+    ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent
+	<< "% avail";
+    d.detail.push_back(ss2.str());
+  }
+  if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) {
+    stringstream ss, ss2;
+    ss << "mon%plurals% %names% %isorare% using a lot of disk space";
+    auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1);
+    ss2 << "mon." << mon.name << " is "
+	<< byte_u_t(stats.store_stats.bytes_total)
+	<< " >= mon_data_size_warn ("
+	<< byte_u_t(g_conf()->mon_data_size_warn) << ")";
+    d.detail.push_back(ss2.str());
+  }
+
+  // OSD_NO_DOWN_OUT_INTERVAL
+  {
+    // Warn if 'mon_osd_down_out_interval' is set to zero.
+    // Having this option set to zero on the leader acts much like the
+    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
+    // without the 'noout' flag set but acting like that just the same, so
+    // we report a HEALTH_WARN in case this option is set to zero.
+    // This is an ugly hack to get the warning out, but until we find a way
+    // to spread global options throughout the mon cluster and have all mons
+    // using a base set of the same options, we need to work around this sort
+    // of things.
+    // There's also the obvious drawback that if this is set on a single
+    // monitor on a 3-monitor cluster, this warning will only be shown every
+    // third monitor connection.
+    if (g_conf()->mon_warn_on_osd_down_out_interval_zero &&
+        g_conf()->mon_osd_down_out_interval == 0) {
+      ostringstream ss, ds;
+      ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
+      auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1);
+      ds << "mon." << mon.name << " has mon_osd_down_out_interval set to 0";
+      d.detail.push_back(ds.str());
+    }
+  }
+
+  // AUTH_INSECURE_GLOBAL_ID_RECLAIM
+  if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim") &&
+      g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+    // Warn if there are any clients that are insecurely renewing their global_id
+    // session_map_lock is held for the rest of this if-block
+    std::lock_guard l(mon.session_map_lock);
+    list<std::string> detail;
+    for (auto p = mon.session_map.sessions.begin();
+	 p != mon.session_map.sessions.end();
+	 ++p) {
+      if ((*p)->global_id_status == global_id_status_t::RECLAIM_INSECURE) {
+	ostringstream ds;
+	ds << (*p)->entity_name << " at " << (*p)->addrs
+	   << " is using insecure global_id reclaim";
+	detail.push_back(ds.str());
+	// stop once the cap is reached and mark the list as truncated
+	if (detail.size() >= max) {
+	  detail.push_back("...");
+	  break;
+	}
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "client%plurals% %isorare% using insecure global_id reclaim";
+      auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM", HEALTH_WARN, ss.str(),
+			 detail.size());
+      d.detail.swap(detail);
+    }
+  }
+  // AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED
+  if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim_allowed") &&
+      g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+    ostringstream ss, ds;
+    ss << "mon%plurals% %isorare% allowing insecure global_id reclaim";
+    auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED", HEALTH_WARN, ss.str(), 1);
+    ds << "mon." << mon.name << " has auth_allow_insecure_global_id_reclaim set to true";
+    d.detail.push_back(ds.str());
+  }
+
+  // Nothing to do if the result matches what is already recorded for
+  // this rank (no record counts as matching an empty result).
+  auto p = quorum_checks.find(mon.rank);
+  if (p == quorum_checks.end()) {
+    if (next.empty()) {
+      return false;
+    }
+  } else {
+    if (p->second == next) {
+      return false;
+    }
+  }
+
+  if (mon.is_leader()) {
+    // prepare to propose
+    quorum_checks[mon.rank] = next;
+    changed = true;
+  } else {
+    // tell the leader
+    mon.send_mon_message(new MMonHealthChecks(next), mon.get_leader());
+  }
+
+  return changed;
+}
+
+/**
+ * Evaluate the checks that only the leader performs.
+ *
+ * Drops quorum_checks entries of monitors that are no longer in quorum,
+ * then rebuilds the leader checks (old daemon versions, monitors down,
+ * clock skew, msgr2 not enabled).
+ *
+ * @return true if an entry was pruned or leader_checks was replaced
+ */
+bool HealthMonitor::check_leader_health()
+{
+  dout(20) << __func__ << dendl;
+  bool changed = false;
+
+  // prune quorum_health
+  {
+    auto& qset = mon.get_quorum();
+    for (auto it = quorum_checks.begin(); it != quorum_checks.end(); ) {
+      if (qset.count(it->first) != 0) {
+        ++it;
+        continue;
+      }
+      it = quorum_checks.erase(it);
+      changed = true;
+    }
+  }
+
+  health_check_map_t next;
+
+  // DAEMON_OLD_VERSION
+  if (g_conf().get_val<bool>("mon_warn_on_older_version")) {
+    check_for_older_version(&next);
+  }
+
+  // MON_DOWN
+  check_for_mon_down(&next);
+
+  // MON_CLOCK_SKEW
+  check_for_clock_skew(&next);
+
+  // MON_MSGR2_NOT_ENABLED
+  if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) {
+    check_if_msgr2_enabled(&next);
+  }
+
+  if (next != leader_checks) {
+    changed = true;
+    leader_checks = next;
+  }
+  return changed;
+}
+
+/**
+ * Raise DAEMON_OLD_VERSION when daemons run something other than the
+ * highest version present.
+ *
+ * Nothing is reported until more than mon_warn_older_version_delay has
+ * elapsed since a function-local start time.  That start time is set on
+ * the first call and reset whenever a check made after the delay finds at
+ * most one version.  The check is HEALTH_ERR when more than one older
+ * version is present and HEALTH_WARN otherwise.
+ *
+ * @param checks map the check is added to
+ */
+void HealthMonitor::check_for_older_version(health_check_map_t *checks)
+{
+  // Persists across calls (function-local static); zero means "not
+  // started".
+  static ceph::coarse_mono_time old_version_first_time =
+    ceph::coarse_mono_clock::zero();
+
+  auto now = ceph::coarse_mono_clock::now();
+  if (ceph::coarse_mono_clock::is_zero(old_version_first_time)) {
+    old_version_first_time = now;
+  }
+  const auto warn_delay = g_conf().get_val<std::chrono::seconds>("mon_warn_older_version_delay");
+  if (now - old_version_first_time > warn_delay) {
+    std::map<string, std::list<string> > all_versions;
+    mon.get_all_versions(all_versions);
+    if (all_versions.size() > 1) {
+      dout(20) << __func__ << " all_versions=" << all_versions << dendl;
+      // The last entry has the largest version
+      dout(20) << __func__ << " highest version daemon count "
+	       << all_versions.rbegin()->second.size() << dendl;
+      // Erase last element (the highest version running)
+      all_versions.erase(all_versions.rbegin()->first);
+      ceph_assert(all_versions.size() > 0);
+      ostringstream ss;
+      // total number of daemons across all remaining (older) versions
+      unsigned daemon_count = 0;
+      for (auto& g : all_versions) {
+	daemon_count += g.second.size();
+      }
+      int ver_count = all_versions.size();
+      // a single daemon cannot account for more than one version
+      ceph_assert(!(daemon_count == 1 && ver_count != 1));
+      ss << "There " << (daemon_count == 1 ? "is a daemon" : "are daemons")
+	 << " running " << (ver_count > 1 ? "multiple old versions" : "an older version")  << " of ceph";
+      health_status_t status;
+      if (ver_count > 1)
+	status = HEALTH_ERR;
+      else
+	status = HEALTH_WARN;
+      auto& d = checks->add("DAEMON_OLD_VERSION", status, ss.str(), all_versions.size());
+      // one detail line per older version, listing the daemons running it
+      for (auto& g : all_versions) {
+	ostringstream ds;
+	for (auto& i : g.second) { // Daemon list
+	  ds << i << " ";
+	}
+	ds << (g.second.size() == 1 ? "is" : "are")
+	   << " running an older version of ceph: " << g.first;
+	d.detail.push_back(ds.str());
+      }
+    } else {
+      // versions agree again: restart the delay on the next call
+      old_version_first_time = ceph::coarse_mono_clock::zero();
+    }
+  }
+}
+
+/**
+ * Raise MON_DOWN when fewer monitors are in quorum than the monmap holds,
+ * once mon_down_mkfs_grace has passed since the monmap was created.  One
+ * detail line is added per rank missing from the quorum.
+ *
+ * @param checks map the check is added to
+ */
+void HealthMonitor::check_for_mon_down(health_check_map_t *checks)
+{
+  int total = mon.monmap->size();
+  int in_quorum = mon.get_quorum().size();
+  const auto now = ceph::real_clock::now();
+
+  if (in_quorum >= total) {
+    return;
+  }
+  if (now <= mon.monmap->created.to_real_time() + g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace")) {
+    // still inside the grace period after creation
+    return;
+  }
+
+  ostringstream summary;
+  summary << (total-in_quorum) << "/" << total << " mons down, quorum "
+          << mon.get_quorum_names();
+  auto& d = checks->add("MON_DOWN", HEALTH_WARN, summary.str(),
+                        total - in_quorum);
+
+  set<int> quorum = mon.get_quorum();
+  for (int rank = 0; rank < total; rank++) {
+    if (quorum.count(rank) != 0) {
+      continue;
+    }
+    ostringstream line;
+    line << "mon." << mon.monmap->get_name(rank) << " (rank " << rank
+         << ") addr " << mon.monmap->get_addrs(rank)
+         << " is down (out of quorum)";
+    d.detail.push_back(line.str());
+  }
+}
+
+/**
+ * Raise MON_CLOCK_SKEW for every monitor whose recorded time-check skew
+ * is not rated HEALTH_OK by Monitor::timecheck_status().
+ *
+ * @param checks map the check is added to
+ */
+void HealthMonitor::check_for_clock_skew(health_check_map_t *checks)
+{
+  if (mon.timecheck_skews.empty()) {
+    return;
+  }
+
+  list<string> skewed;   // names of the offending monitors
+  list<string> details;  // one explanatory line per offending monitor
+  for (auto& entry : mon.timecheck_skews) {
+    double skew = entry.second;
+    double latency = mon.timecheck_latencies[entry.first];
+    string name = mon.monmap->get_name(entry.first);
+    ostringstream tcss;
+    health_status_t tcstatus = mon.timecheck_status(tcss, skew, latency);
+    if (tcstatus == HEALTH_OK) {
+      continue;
+    }
+    skewed.push_back(name);
+    ostringstream line;
+    line << "mon." << name << " " << tcss.str()
+         << " (latency " << latency << "s)";
+    details.push_back(line.str());
+  }
+  if (skewed.empty()) {
+    return;
+  }
+
+  // summary: "clock skew detected on mon.a, mon.b"
+  ostringstream ss;
+  ss << "clock skew detected on";
+  for (auto it = skewed.begin(); it != skewed.end(); ++it) {
+    if (it != skewed.begin()) {
+      ss << ",";
+    }
+    ss << " mon." << *it;
+  }
+  auto& d = checks->add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str(), details.size());
+  d.detail.swap(details);
+}
+
+/**
+ * Raise MON_MSGR2_NOT_ENABLED for monitors whose public addresses have
+ * no msgr2 entry.  Only evaluated when ms_bind_msgr2 is set and the
+ * monmap requires the nautilus feature.
+ *
+ * @param checks map the check is added to
+ */
+void HealthMonitor::check_if_msgr2_enabled(health_check_map_t *checks)
+{
+  if (!g_conf().get_val<bool>("ms_bind_msgr2")) {
+    return;
+  }
+  if (!mon.monmap->get_required_features().contains_all(
+        ceph::features::mon::FEATURE_NAUTILUS)) {
+    return;
+  }
+
+  list<string> unbound;
+  for (auto& entry : mon.monmap->mon_info) {
+    if (entry.second.public_addrs.has_msgr2()) {
+      continue;
+    }
+    ostringstream ds;
+    ds << "mon." << entry.first << " is not bound to a msgr2 port, only "
+       << entry.second.public_addrs;
+    unbound.push_back(ds.str());
+  }
+  if (unbound.empty()) {
+    return;
+  }
+
+  ostringstream ss;
+  ss << unbound.size() << " monitors have not enabled msgr2";
+  auto &d = checks->add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str(),
+                        unbound.size());
+  d.detail.swap(unbound);
+}
diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h
new file mode 100644
index 000000000..c0e79d033
--- /dev/null
+++ b/src/mon/HealthMonitor.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_HEALTH_MONITOR_H
+#define CEPH_HEALTH_MONITOR_H
+
+#include "mon/PaxosService.h"
+
+/**
+ * Paxos service holding the cluster's health checks and health mutes.
+ *
+ * Checks are kept per quorum member (quorum_checks) and, separately, for
+ * the leader (leader_checks).  The private check_*() helpers fill in a
+ * health_check_map_t; see HealthMonitor.cc for their exact conditions.
+ */
+class HealthMonitor : public PaxosService
+{
+  version_t version = 0;
+  std::map<int,health_check_map_t> quorum_checks;  // for each quorum member
+  health_check_map_t leader_checks;           // leader only
+  std::map<std::string,health_mute_t> mutes;
+
+  // NOTE(review): presumably the mutes staged for the next proposal --
+  // confirm against HealthMonitor.cc
+  std::map<std::string,health_mute_t> pending_mutes;
+
+public:
+  HealthMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+  /**
+   * @defgroup HealthMonitor_Inherited_h Inherited abstract methods
+   * @{
+   */
+  void init() override;
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  // intentionally a no-op for this service
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+  void tick() override;
+
+  // NOTE(review): the two methods below are not marked 'override' although
+  // they sit inside the "Inherited abstract methods" doxygen group.
+  void gather_all_health_checks(health_check_map_t *all);
+  health_status_t get_health_status(
+    bool want_detail,
+    ceph::Formatter *f,
+    std::string *plain,
+    const char *sep1 = " ",
+    const char *sep2 = "; ");
+
+  /**
+   * @} // HealthMonitor_Inherited_h
+   */
+private:
+  bool preprocess_command(MonOpRequestRef op);
+
+  bool prepare_command(MonOpRequestRef op);
+  bool prepare_health_checks(MonOpRequestRef op);
+  void check_for_older_version(health_check_map_t *checks);
+  void check_for_mon_down(health_check_map_t *checks);
+  // adds MON_CLOCK_SKEW when a recorded timecheck is not HEALTH_OK
+  void check_for_clock_skew(health_check_map_t *checks);
+  // adds MON_MSGR2_NOT_ENABLED for monitors without a msgr2 public address
+  void check_if_msgr2_enabled(health_check_map_t *checks);
+  bool check_leader_health();
+  bool check_member_health();
+  bool check_mutes();
+};
+
+#endif // CEPH_HEALTH_MONITOR_H
diff --git a/src/mon/KVMonitor.cc b/src/mon/KVMonitor.cc
new file mode 100644
index 000000000..699cbe417
--- /dev/null
+++ b/src/mon/KVMonitor.cc
@@ -0,0 +1,525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mon/Monitor.h"
+#include "mon/KVMonitor.h"
+#include "include/stringify.h"
+#include "messages/MKVData.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+
+// Log-line prefix used by dout() in this file (see dout_prefix above):
+// "mon.<name>@<rank>(<state>).kv ".  The KVMonitor pointer is accepted
+// only so the macro can pass 'this'; it is not used.
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
+                        const KVMonitor *hmon) {
+  return *_dout << "mon." << mon.name << "@" << mon.rank
+		<< "(" << mon.get_state_name() << ").kv ";
+}
+
+// Store prefix under which the key/value entries themselves are kept
+// (as opposed to the per-version deltas of the paxos service).
+const string KV_PREFIX = "mon_config_key";
+
+// Number of most recent versions get_trim_to() leaves untrimmed.
+const int MAX_HISTORY = 50;
+
+
+// Returns true when @p s holds at least one byte that is neither plain
+// printable ASCII nor '\n' / '\t'; such values are reported as binary
+// blobs instead of being dumped verbatim.
+static bool is_binary_string(const string& s)
+{
+  for (size_t i = 0; i < s.size(); ++i) {
+    const char ch = s[i];
+    // DEL and anything above it is never treated as text
+    if (ch >= 0x7f) {
+      return true;
+    }
+    // \n and \t are escaped in JSON; other control characters are not.
+    const bool json_escaped = (ch == '\n' || ch == '\t');
+    if (ch < 0x20 && !json_escaped) {
+      return true;
+    }
+  }
+  return false;
+}
+
+
+// Forwards monitor, paxos instance and service name to PaxosService;
+// no additional state is set up here.
+KVMonitor::KVMonitor(Monitor &m, Paxos &p, const string& service_name)
+  : PaxosService(m, p, service_name) {
+}
+
+// Service init hook; only emits a debug line.
+void KVMonitor::init()
+{
+  dout(10) << __func__ << dendl;
+}
+
+// Start from an empty state: version 0 and no pending changes.
+void KVMonitor::create_initial()
+{
+  dout(10) << __func__ << dendl;
+  version = 0;
+  pending.clear();
+}
+
+// Sync the in-memory version with the last committed paxos version and,
+// when it moved, give kv subscribers a chance to receive the update.
+// need_bootstrap is not touched.
+void KVMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  const version_t committed = get_last_committed();
+  if (version != committed) {
+    version = committed;
+    dout(10) << __func__ << " " << version << dendl;
+    check_all_subs();
+  }
+}
+
+// Begin a fresh pending delta on top of the current version.
+void KVMonitor::create_pending()
+{
+  // include the function name so the line is identifiable in the log,
+  // like the other debug lines in this file
+  dout(10) << __func__ << " " << version << dendl;
+  pending.clear();
+}
+
+/**
+ * Encode the pending delta into transaction @p t as version+1.
+ *
+ * Two things are written: the encoded delta itself (stored as the new
+ * version) and the resulting puts/erases under KV_PREFIX.  An entry with a
+ * value is a set; an entry without one (boost::none) is a removal.
+ */
+void KVMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  const version_t next = version + 1;
+  dout(10) << __func__ << " " << next << dendl;
+  put_last_committed(t, next);
+
+  // record the delta for this commit point
+  bufferlist bl;
+  encode(pending, bl);
+  put_version(t, next, bl);
+
+  // make actual changes
+  for (auto& p : pending) {
+    const string& key = p.first;   // no need to copy the key
+    if (p.second) {
+      dout(20) << __func__ << " set " << key << dendl;
+      t->put(KV_PREFIX, key, *p.second);
+    } else {
+      dout(20) << __func__ << " rm " << key << dendl;
+      t->erase(KV_PREFIX, key);
+    }
+  }
+}
+
+// Trim target: everything older than the last MAX_HISTORY versions, or 0
+// (nothing) while fewer than that exist.
+version_t KVMonitor::get_trim_to() const
+{
+  // we don't need that many old states, but keep a few
+  return (version > MAX_HISTORY) ? version - MAX_HISTORY : 0;
+}
+
+// Report the store prefixes this service writes to: its own service
+// prefix plus KV_PREFIX, where the key/value data lives.
+void KVMonitor::get_store_prefixes(set<string>& s) const
+{
+  s.insert(service_name);
+  s.insert(KV_PREFIX);
+}
+
+// Periodic tick; on an active leader this only emits a debug line.
+void KVMonitor::tick()
+{
+  if (!is_active() || !mon.is_leader()) {
+    return;
+  }
+  dout(10) << __func__ << dendl;
+}
+
+// Nothing to do when the service becomes active.
+void KVMonitor::on_active()
+{
+}
+
+
+// Read-only dispatch.  Only MSG_MON_COMMAND is handled; a bad_cmd_get
+// thrown while parsing arguments is answered with -EINVAL.  Returns true
+// when the request was fully handled here, false otherwise.
+bool KVMonitor::preprocess_query(MonOpRequestRef op)
+{
+  switch (op->get_req()->get_type()) {
+  case MSG_MON_COMMAND:
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Serve the read-only config-key commands straight from the store.
+ *
+ * Handles "config-key get", "exists", "list"/"ls" and "dump".  Any other
+ * prefix is left for prepare_command().
+ *
+ * @return true if a reply was sent, false if the command is not handled here
+ */
+bool KVMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+    return true;
+  }
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  string key;
+  cmd_getval(cmdmap, "key", key);
+
+  int err = 0;
+  bufferlist odata;
+
+  if (prefix == "config-key get") {
+    err = mon.store->get(KV_PREFIX, key, odata);
+  } else if (prefix == "config-key exists") {
+    const bool found = mon.store->exists(KV_PREFIX, key);
+    ss << "key '" << key << "'";
+    ss << (found ? " exists" : " doesn't exist");
+    err = found ? 0 : -ENOENT;
+  } else if (prefix == "config-key list" ||
+	     prefix == "config-key ls") {
+    // fall back to pretty JSON when the requested format has no formatter
+    if (!f) {
+      f.reset(Formatter::create("json-pretty"));
+    }
+    f->open_array_section("keys");
+    for (KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+	 iter->valid();
+	 iter->next()) {
+      string name(iter->key());
+      f->dump_string("key", name);
+    }
+    f->close_section();
+
+    stringstream out;
+    f->flush(out);
+    odata.append(out);
+    err = 0;
+  } else if (prefix == "config-key dump") {
+    if (!f) {
+      f.reset(Formatter::create("json-pretty"));
+    }
+
+    // an optional key restricts the dump to entries starting with it
+    const bool filtered = key.size() > 0;
+    KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+    if (filtered) {
+      iter->lower_bound(key);
+    }
+    f->open_object_section("config-key store");
+    for (; iter->valid(); iter->next()) {
+      if (filtered && iter->key().find(key) != 0) {
+	break;
+      }
+      string value = iter->value().to_str();
+      if (is_binary_string(value)) {
+	ostringstream blob;
+	blob << "<<< binary blob of length " << value.size() << " >>>";
+	f->dump_string(iter->key().c_str(), blob.str());
+      } else {
+	f->dump_string(iter->key().c_str(), value);
+      }
+    }
+    f->close_section();
+
+    stringstream out;
+    f->flush(out);
+    odata.append(out);
+    err = 0;
+  } else {
+    return false;
+  }
+
+  mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+  return true;
+}
+
+// Mutating dispatch.  Only MSG_MON_COMMAND is handled; a bad_cmd_get
+// thrown while parsing arguments is answered with -EINVAL.  The return
+// value is whatever prepare_command() returns (true when a change was
+// queued), true after a bad_cmd_get reply, false for other message types.
+bool KVMonitor::prepare_update(MonOpRequestRef op)
+{
+  Message *m = op->get_req();
+  dout(7) << "prepare_update " << *m
+	  << " from " << m->get_orig_source_inst() << dendl;
+  switch (m->get_type()) {
+  case MSG_MON_COMMAND:
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/**
+ * Handle the mutating config-key commands.
+ *
+ * "config-key set"/"put" stage a value for the key (taken from the 'val'
+ * argument or, failing that, from the message payload); "config-key
+ * del"/"rm" stage a removal.  Staged changes are proposed immediately and
+ * the client is answered once the proposal finishes.
+ *
+ * @return true when a change was staged and a proposal is awaited,
+ *         false when the command was answered right away
+ *         (error, or nothing to change)
+ */
+bool KVMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+  int err = 0;
+  bufferlist odata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  string key;
+  if (!cmd_getval(cmdmap, "key", key)) {
+    err = -EINVAL;
+    ss << "must specify a key";
+    goto reply;
+  }
+
+
+  if (prefix == "config-key set" ||
+      prefix == "config-key put") {
+    bufferlist data;
+    string val;
+    if (cmd_getval(cmdmap, "val", val)) {
+      // they specified a value in the command instead of a file
+      data.append(val);
+    } else if (m->get_data_len() > 0) {
+      // they specified '-i <file>'
+      data = m->get_data();
+    }
+    // with neither source present an empty value is stored
+    if (data.length() > (size_t) g_conf()->mon_config_key_max_entry_size) {
+      err = -EFBIG; // File too large
+      ss << "error: entry size limited to "
+         << g_conf()->mon_config_key_max_entry_size << " bytes. "
+         << "Use 'mon config key max entry size' to manually adjust";
+      goto reply;
+    }
+
+    ss << "set " << key;
+    pending[key] = data;
+    goto update;
+  }
+  else if (prefix == "config-key del" ||
+	   prefix == "config-key rm") {
+    ss << "key deleted";
+    // boost::none marks the key for removal in encode_pending()
+    pending[key] = boost::none;
+    goto update;
+  }
+  else {
+    ss << "unknown command " << prefix;
+    err = -EINVAL;
+  }
+
+  // answer immediately; nothing is proposed on this path
+reply:
+  mon.reply_command(op, err, ss.str(), odata, get_last_committed());
+  return false;
+
+update:
+  // see if there is an actual change
+  if (pending.empty()) {
+    err = 0;
+    goto reply;
+  }
+  force_immediate_propose();  // faster response
+  // the reply is deferred until the proposal has finished
+  wait_for_finished_proposal(
+    op,
+    new Monitor::C_Command(
+      mon, op, 0, ss.str(), odata,
+      get_last_committed() + 1));
+  return true;
+}
+
+
+
+
+// Build the config-key name "dm-crypt/osd/<uuid>/<k>".  With an empty @p k
+// the result is the prefix shared by all dm-crypt keys of that OSD.
+// @p k is taken by reference to avoid a string copy on every call.
+static string _get_dmcrypt_prefix(const uuid_d& uuid, const string& k)
+{
+  return "dm-crypt/osd/" + stringify(uuid) + "/" + k;
+}
+
+/**
+ * Tell whether at least one stored key starts with @p prefix.
+ *
+ * Keys are iterated in sorted order, so the first key that is not less
+ * than @p prefix is the only candidate: if any key carries the prefix,
+ * that one does.  This replaces a scan over every key in the store.
+ */
+bool KVMonitor::_have_prefix(const string &prefix)
+{
+  KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+  iter->lower_bound(prefix);
+  return iter->valid() &&
+    iter->key().compare(0, prefix.size(), prefix) == 0;
+}
+
+// Check whether any config-key data exists for the given OSD, either
+// under its dm-crypt prefix or its daemon-private prefix.
+// Returns 0 if something exists, -ENOENT otherwise.
+int KVMonitor::validate_osd_destroy(
+  const int32_t id,
+  const uuid_d& uuid)
+{
+  const string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, "");
+  const string daemon_prefix =
+    "daemon-private/osd." + stringify(id) + "/";
+
+  const bool have_any =
+    _have_prefix(dmcrypt_prefix) || _have_prefix(daemon_prefix);
+  return have_any ? 0 : -ENOENT;
+}
+
+/**
+ * Stage removal of every config-key entry that belongs to an OSD and
+ * propose the change.
+ *
+ * Both the dm-crypt prefix and the daemon-private prefix are swept
+ * independently.  For each prefix, all keys starting with it are marked
+ * for removal -- not only the first one -- and the iterator is checked
+ * for validity before its key is read.  An empty first prefix no longer
+ * prevents the second one from being processed.
+ */
+void KVMonitor::do_osd_destroy(int32_t id, uuid_d& uuid)
+{
+  string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, "");
+  string daemon_prefix =
+    "daemon-private/osd." + stringify(id) + "/";
+
+  for (const auto& prefix : { dmcrypt_prefix, daemon_prefix }) {
+    KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+    for (iter->lower_bound(prefix);
+	 iter->valid() &&
+	   iter->key().compare(0, prefix.size(), prefix) == 0;
+	 iter->next()) {
+      // boost::none marks the key for removal in encode_pending()
+      pending[iter->key()] = boost::none;
+    }
+  }
+
+  propose_pending();
+}
+
+/**
+ * Check a dm-crypt (luks) key against what is already stored for @p uuid.
+ *
+ * @return 0        no key stored yet
+ *         EEXIST   (positive) the identical key is already stored
+ *         -EEXIST  a different key is stored; @p ss carries the message
+ *         <0       error returned by the store while reading the key
+ */
+int KVMonitor::validate_osd_new(
+  const uuid_d& uuid,
+  const string& dmcrypt_key,
+  stringstream& ss)
+{
+  const string luks_key_name = _get_dmcrypt_prefix(uuid, "luks");
+  bufferlist wanted;
+  wanted.append(dmcrypt_key);
+
+  if (!mon.store->exists(KV_PREFIX, luks_key_name)) {
+    return 0;
+  }
+
+  bufferlist stored;
+  int err = mon.store->get(KV_PREFIX, luks_key_name, stored);
+  if (err < 0) {
+    dout(10) << __func__ << " unable to get dm-crypt key from store (r = "
+             << err << ")" << dendl;
+    return err;
+  }
+  if (stored.contents_equal(wanted)) {
+    // both values match; this will be an idempotent op.
+    return EEXIST;
+  }
+  ss << "dm-crypt key already exists and does not match";
+  return -EEXIST;
+}
+
+// Stage the dm-crypt (luks) key of a new OSD under
+// "dm-crypt/osd/<uuid>/luks" and propose it.  Paxos must be plugged when
+// this is called (asserted).
+void KVMonitor::do_osd_new(
+  const uuid_d& uuid,
+  const string& dmcrypt_key)
+{
+  ceph_assert(paxos.is_plugged());
+
+  string dmcrypt_key_prefix = _get_dmcrypt_prefix(uuid, "luks");
+  bufferlist dmcrypt_key_value;
+  dmcrypt_key_value.append(dmcrypt_key);
+
+  pending[dmcrypt_key_prefix] = dmcrypt_key_value;
+
+  propose_pending();
+}
+
+
+// Check every "kv:" subscription of an authenticated session;
+// unauthenticated sessions are ignored.
+void KVMonitor::check_sub(MonSession *s)
+{
+  if (!s->authenticated) {
+    dout(20) << __func__ << " not authenticated " << s->entity_name << dendl;
+    return;
+  }
+  for (auto& p : s->sub_map) {
+    // only subscription types that start with "kv:" belong to this service
+    if (p.first.find("kv:") == 0) {
+      check_sub(p.second);
+    }
+  }
+}
+
+// Send an update for a single subscription if it is due (its next wanted
+// version is not beyond the current one) and drop the subscription
+// afterwards when it was a one-time request.
+void KVMonitor::check_sub(Subscription *sub)
+{
+  dout(10) << __func__
+	   << " next " << sub->next
+	   << " have " << version << dendl;
+  if (sub->next <= version) {
+    maybe_send_update(sub);
+    if (sub->onetime) {
+      mon.with_session_map([sub](MonSessionMap& session_map) {
+	  session_map.remove_sub(sub);
+	});
+    }
+  }
+}
+
+// Offer the current version to every "kv:" subscription known to the
+// monitor and log how many of them were actually sent an update.
+void KVMonitor::check_all_subs()
+{
+  dout(10) << __func__ << dendl;
+  int total = 0;
+  int updated = 0;
+  for (auto& by_type : mon.session_map.subs) {
+    if (by_type.first.find("kv:") != 0) {
+      continue;
+    }
+    for (auto it = by_type.second->begin(); !it.end(); ) {
+      // step the iterator before handing the subscription out
+      auto sub = *it;
+      ++it;
+      ++total;
+      if (maybe_send_update(sub)) {
+	++updated;
+      }
+    }
+  }
+  dout(10) << __func__ << " updated " << updated << " / " << total << dendl;
+}
+
+/**
+ * Send the subscriber everything it is missing under its prefix.
+ *
+ * The prefix is the subscription type with its first three characters
+ * ("kv:") removed.  If the subscriber already has some version and that
+ * version is still newer than the first committed one, the per-version
+ * deltas from sub->next up to the current version are replayed and sent
+ * as an incremental.  Otherwise a full dump of all matching keys is sent.
+ *
+ * @return false if the subscriber wants a version we do not have yet,
+ *         true after a message was sent (sub->next is advanced)
+ */
+bool KVMonitor::maybe_send_update(Subscription *sub)
+{
+  if (sub->next > version) {
+    return false;
+  }
+
+  auto m = new MKVData;
+  m->prefix = sub->type.substr(3);
+  m->version = version;
+  m->incremental = sub->next && sub->next > get_first_committed();
+
+  if (m->incremental) {
+    // incremental
+    for (version_t v = sub->next; v <= version; ++v) {
+      bufferlist bl;
+      int err = get_version(v, bl);
+      ceph_assert(err == 0);
+
+      std::map<std::string,boost::optional<ceph::buffer::list>> delta;
+      auto it = bl.cbegin();
+      ceph::decode(delta, it);
+
+      for (auto& change : delta) {
+	if (change.first.find(m->prefix) != 0) {
+	  continue;
+	}
+	m->data[change.first] = change.second;
+      }
+    }
+
+    dout(10) << __func__ << " incremental keys for " << m->prefix
+	     << ", v " << sub->next << ".." << version
+	     << ", " << m->data.size() << " keys"
+	     << dendl;
+  } else {
+    KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX);
+    for (iter->lower_bound(m->prefix);
+	 iter->valid() && iter->key().find(m->prefix) == 0;
+	 iter->next()) {
+      m->data[iter->key()] = iter->value();
+    }
+
+    dout(10) << __func__ << " sending full dump of " << m->prefix
+	     << ", " << m->data.size() << " keys"
+	     << dendl;
+  }
+
+  sub->session->con->send_message(m);
+  sub->next = version + 1;
+  return true;
+}
diff --git a/src/mon/KVMonitor.h b/src/mon/KVMonitor.h
new file mode 100644
index 000000000..c14c16380
--- /dev/null
+++ b/src/mon/KVMonitor.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/optional.hpp>
+
+#include "mon/PaxosService.h"
+
+class MonSession;
+
+extern const std::string KV_PREFIX;
+
+/**
+ * Paxos service backing the config-key store.
+ *
+ * Changes are collected in 'pending' (key -> new value, or boost::none
+ * for a removal) and written out by encode_pending().
+ *
+ * Standard library and buffer types are spelled with their namespace so
+ * this header does not depend on using-declarations made elsewhere.
+ */
+class KVMonitor : public PaxosService
+{
+  version_t version = 0;
+  std::map<std::string,boost::optional<ceph::buffer::list>> pending;
+
+  // true if at least one stored key starts with prefix
+  bool _have_prefix(const std::string &prefix);
+
+public:
+  KVMonitor(Monitor &m, Paxos &p, const std::string& service_name);
+
+  void init() override;
+
+  void get_store_prefixes(std::set<std::string>& s) const override;
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  // intentionally a no-op for this service
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+  void on_active() override;
+  void tick() override;
+
+  int validate_osd_destroy(const int32_t id, const uuid_d& uuid);
+  void do_osd_destroy(int32_t id, uuid_d& uuid);
+  int validate_osd_new(
+      const uuid_d& uuid,
+      const std::string& dmcrypt_key,
+      std::stringstream& ss);
+  void do_osd_new(const uuid_d& uuid, const std::string& dmcrypt_key);
+
+  void check_sub(MonSession *s);
+  void check_sub(Subscription *sub);
+  void check_all_subs();
+
+  bool maybe_send_update(Subscription *sub);
+
+
+  // used by other services to adjust kv content; note that callers MUST ensure that
+  // propose_pending() is called and a commit is forced to provide atomicity and
+  // proper subscriber notifications.
+  void enqueue_set(const std::string& key, const ceph::buffer::list &v) {
+    pending[key] = v;
+  }
+  void enqueue_rm(const std::string& key) {
+    pending[key] = boost::none;
+  }
+};
diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc
new file mode 100644
index 000000000..88327663a
--- /dev/null
+++ b/src/mon/LogMonitor.cc
@@ -0,0 +1,947 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include <sstream>
+#include <syslog.h>
+
+#include "LogMonitor.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "common/Graylog.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/compat.h"
+
+#define dout_subsys ceph_subsys_mon
+
+using namespace TOPNSPC::common;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+// Return the log file name for a channel, with the channel meta variable
+// expanded.  The expansion is computed once per channel and cached in
+// expanded_log_file.
+string LogMonitor::log_channel_info::get_log_file(const string &channel)
+{
+  dout(25) << __func__ << " for channel '"
+	   << channel << "'" << dendl;
+
+  // already expanded earlier: serve the cached value
+  if (expanded_log_file.count(channel) != 0) {
+    return expanded_log_file[channel];
+  }
+
+  string fname = expand_channel_meta(
+    get_str_map_key(log_file, channel, &CLOG_CONFIG_DEFAULT_KEY),
+    channel);
+  expanded_log_file[channel] = fname;
+
+  dout(20) << __func__ << " for channel '"
+	   << channel << "' expanded to '"
+	   << fname << "'" << dendl;
+  return fname;
+}
+
+
+// Expand the channel meta variable in every value of m, using the entry's
+// own key as the channel name.  The map is modified in place.
+void LogMonitor::log_channel_info::expand_channel_meta(map<string,string> &m)
+{
+  dout(20) << __func__ << " expand map: " << m << dendl;
+  for (auto& kv : m) {
+    kv.second = expand_channel_meta(kv.second, kv.first);
+  }
+  dout(20) << __func__ << " expanded map: " << m << dendl;
+}
+
+/**
+ * Replace every occurrence of LOG_META_CHANNEL in @p input with
+ * @p change_to and return the result.
+ *
+ * The search resumes right after the text that was just inserted, so the
+ * replacement is never re-scanned.  This guarantees termination even if
+ * @p change_to itself contains LOG_META_CHANNEL, where restarting the
+ * search from the beginning would loop forever.
+ *
+ * @param input      string possibly containing the meta variable
+ * @param change_to  text substituted for each occurrence
+ * @return the expanded string
+ */
+string LogMonitor::log_channel_info::expand_channel_meta(
+    const string &input,
+    const string &change_to)
+{
+  string s(input);
+  size_t pos = 0;
+  while ((pos = s.find(LOG_META_CHANNEL, pos)) != string::npos) {
+    s.replace(pos, LOG_META_CHANNEL.length(), change_to);
+    pos += change_to.length();   // skip over what we just inserted
+  }
+  dout(20) << __func__ << " from '" << input
+	   << "' to '" << s << "'" << dendl;
+
+  return s;
+}
+
+// Tell whether entries of the given channel should go to syslog.
+//
+// The setting lives as a string in the 'log_to_syslog' key/value map, so
+// it is parsed here in a way that stays compatible with regular boolean
+// options (a modified version of how md_config_t::set_val_raw() handles
+// them): "true"/"false" are accepted case-insensitively, as are "1" and
+// "0".  The main distinction from the original handling is that anything
+// else is treated as false.
+bool LogMonitor::log_channel_info::do_log_to_syslog(const string &channel) {
+  string v = get_str_map_key(log_to_syslog, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+  if (boost::iequals(v, "false")) {
+    return false;
+  }
+  if (boost::iequals(v, "true")) {
+    return true;
+  }
+
+  // numeric form: only a cleanly parsed 1 enables syslog
+  std::string err;
+  int b = strict_strtol(v.c_str(), 10, &err);
+  return err.empty() && b == 1;
+}
+
+/**
+ * Return the Graylog sink for a channel, creating and caching it on first
+ * use.
+ *
+ * Host and port are resolved once through get_str_map_key() (channel
+ * entry, else the default key) and the same values are used for both the
+ * destination and the debug line.  The maps are no longer indexed with
+ * operator[], which silently inserted empty entries for the channel and
+ * made the debug line show empty values whenever the default was in use.
+ */
+ceph::logging::Graylog::Ref LogMonitor::log_channel_info::get_graylog(
+    const string &channel)
+{
+  dout(25) << __func__ << " for channel '"
+	   << channel << "'" << dendl;
+
+  if (graylogs.count(channel) == 0) {
+    const string host = get_str_map_key(log_to_graylog_host, channel,
+					&CLOG_CONFIG_DEFAULT_KEY);
+    const string port = get_str_map_key(log_to_graylog_port, channel,
+					&CLOG_CONFIG_DEFAULT_KEY);
+
+    auto graylog(std::make_shared<ceph::logging::Graylog>("mon"));
+
+    graylog->set_fsid(g_conf().get_val<uuid_d>("fsid"));
+    graylog->set_hostname(g_conf()->host);
+    graylog->set_destination(host, atoi(port.c_str()));
+
+    graylogs[channel] = graylog;
+    dout(20) << __func__ << " for channel '"
+	     << channel << "' to graylog host '"
+	     << host << ":"
+	     << port
+	     << "'" << dendl;
+  }
+  return graylogs[channel];
+}
+
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_last_committed())
+// Log-line prefix used by dout() from here on (see dout_prefix above):
+// "mon.<name>@<rank>(<state>).log v<version> ".
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, version_t v) {
+  return *_dout << "mon." << mon.name << "@" << mon.rank
+		<< "(" << mon.get_state_name()
+		<< ").log v" << v << " ";
+}
+
+// Stream a LogMonitor as the fixed tag "log"; the object itself is not
+// inspected.
+ostream& operator<<(ostream &out, const LogMonitor &pm)
+{
+  return out << "log";
+}
+
+/*
+ Periodic tick. While the service is active this only emits a debug line
+ identifying the service; no state is changed here.
+*/
+
+void LogMonitor::tick() 
+{
+  if (!is_active()) return;
+
+  dout(10) << *this << dendl;
+
+}
+
+// Seed the pending log with a single CLOG_INFO entry "mkfs <fsid>",
+// attributed to this monitor and stamped with the current time.
+void LogMonitor::create_initial()
+{
+  dout(10) << "create_initial -- creating initial map" << dendl;
+
+  std::stringstream text;
+  text << "mkfs " << mon.monmap->get_fsid();
+
+  LogEntry e;
+  e.name = g_conf()->name;
+  e.rank = entity_name_t::MON(mon.rank);
+  e.addrs = mon.messenger->get_myaddrs();
+  e.stamp = ceph_clock_now();
+  e.prio = CLOG_INFO;
+  e.msg = text.str();
+  e.seq = 0;
+
+  pending_log.insert(make_pair(e.stamp, e));
+}
+
+/**
+ * Bring the in-memory summary up to the last committed version.
+ *
+ * If a newer full summary is stored it is loaded first; the remaining
+ * incrementals are then replayed one by one.  Every replayed entry is
+ * optionally written to stderr, syslog, graylog and the per-channel log
+ * file (file output is batched per channel and flushed at the end), and
+ * added to the summary.  Subscribers are checked afterwards.
+ *
+ * need_bootstrap is not touched.
+ */
+void LogMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+  version_t version = get_last_committed();
+  dout(10) << __func__ << " version " << version
+           << " summary v " << summary.version << dendl;
+  if (version == summary.version)
+    return;
+  ceph_assert(version >= summary.version);
+
+  // text to append to each channel's log file, keyed by channel
+  map<string,bufferlist> channel_blog;
+
+  version_t latest_full = get_version_latest_full();
+  dout(10) << __func__ << " latest full " << latest_full << dendl;
+  if ((latest_full > 0) && (latest_full > summary.version)) {
+    bufferlist latest_bl;
+    get_version_full(latest_full, latest_bl);
+    ceph_assert(latest_bl.length() != 0);
+    dout(7) << __func__ << " loading summary e" << latest_full << dendl;
+    auto p = latest_bl.cbegin();
+    decode(summary, p);
+    dout(7) << __func__ << " loaded summary e" << summary.version << dendl;
+  }
+
+  // walk through incrementals
+  while (version > summary.version) {
+    bufferlist bl;
+    int err = get_version(summary.version+1, bl);
+    ceph_assert(err == 0);
+    ceph_assert(bl.length());
+
+    auto p = bl.cbegin();
+    __u8 v;
+    decode(v, p);
+    while (!p.end()) {
+      LogEntry le;
+      le.decode(p);
+      dout(7) << "update_from_paxos applying incremental log " << summary.version+1 <<  " " << le << dendl;
+
+      string channel = le.channel;
+      if (channel.empty()) // keep retrocompatibility
+        channel = CLOG_CHANNEL_CLUSTER;
+
+      if (g_conf().get_val<bool>("mon_cluster_log_to_stderr")) {
+	cerr << channel << " " << le << std::endl;
+      }
+
+      if (channels.do_log_to_syslog(channel)) {
+        string level = channels.get_level(channel);
+        string facility = channels.get_facility(channel);
+        if (level.empty() || facility.empty()) {
+          derr << __func__ << " unable to log to syslog -- level or facility"
+               << " not defined (level: " << level << ", facility: "
+               << facility << ")" << dendl;
+          // NOTE(review): this skips the rest of the handling for this
+          // entry, including graylog/file output and summary.add(le) --
+          // confirm that dropping the entry from the summary is intended.
+          continue;
+        }
+        // reuse the values validated above instead of looking them up again
+        le.log_to_syslog(level, facility);
+      }
+
+      if (channels.do_log_to_graylog(channel)) {
+	ceph::logging::Graylog::Ref graylog = channels.get_graylog(channel);
+	if (graylog) {
+	  graylog->log_log_entry(&le);
+	}
+	dout(7) << "graylog: " << channel << " " << graylog
+		<< " host:" << channels.log_to_graylog_host << dendl;
+      }
+
+      if (g_conf()->mon_cluster_log_to_file) {
+	string log_file = channels.get_log_file(channel);
+	dout(20) << __func__ << " logging for channel '" << channel
+		 << "' to file '" << log_file << "'" << dendl;
+
+	if (!log_file.empty()) {
+	  string log_file_level = channels.get_log_file_level(channel);
+	  if (log_file_level.empty()) {
+	    dout(1) << __func__ << " warning: log file level not defined for"
+		    << " channel '" << channel << "' yet a log file is --"
+		    << " will assume lowest level possible" << dendl;
+	  }
+
+	  // only entries at or above the configured severity are written
+	  int min = string_to_syslog_level(log_file_level);
+	  int l = clog_type_to_syslog_level(le.prio);
+	  if (l <= min) {
+	    stringstream ss;
+	    ss << le << "\n";
+	    // init entry if DNE
+	    bufferlist &blog = channel_blog[channel];
+	    blog.append(ss.str());
+	  }
+	}
+      }
+
+      summary.add(le);
+    }
+
+    summary.version++;
+    summary.prune(g_conf()->mon_log_max_summary);
+  }
+
+  dout(15) << __func__ << " logging for "
+           << channel_blog.size() << " channels" << dendl;
+  for (auto& p : channel_blog) {
+    const string& chan = p.first;
+    bufferlist& blog = p.second;
+    if (!blog.length()) {
+      dout(15) << __func__ << " channel '" << chan
+               << "': nothing to log" << dendl;
+      continue;
+    }
+
+    dout(15) << __func__ << " channel '" << chan
+             << "' logging " << blog.length() << " bytes" << dendl;
+    string log_file = channels.get_log_file(chan);
+
+    int fd = ::open(log_file.c_str(), O_WRONLY|O_APPEND|O_CREAT|O_CLOEXEC, 0600);
+    if (fd < 0) {
+      int err = -errno;
+      dout(1) << "unable to write to '" << log_file << "' for channel '"
+              << chan << "': " << cpp_strerror(err) << dendl;
+    } else {
+      int err = blog.write_fd(fd);
+      if (err < 0) {
+	// closing quote after the channel name added to match the line above
+	dout(1) << "error writing to '" << log_file << "' for channel '"
+                << chan << "': " << cpp_strerror(err) << dendl;
+      }
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+    }
+  }
+
+  check_subs();
+}
+
+// Start a new pending round: no pending entries, and a pending summary
+// that begins as a copy of the committed one.
+void LogMonitor::create_pending()
+{
+  pending_log.clear();
+  pending_summary = summary;
+  dout(10) << "create_pending v " << (get_last_committed() + 1) << dendl;
+}
+
+// Encode all pending log entries as the next version: a one-byte format
+// marker (1) followed by the entries, each encoded with the quorum's
+// connection features.
+void LogMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  const version_t next = get_last_committed() + 1;
+  dout(10) << __func__ << " v" << next << dendl;
+
+  bufferlist bl;
+  __u8 struct_v = 1;
+  encode(struct_v, bl);
+  for (auto& item : pending_log) {
+    item.second.encode(bl, mon.get_quorum_con_features());
+  }
+
+  put_version(t, next, bl);
+  put_last_committed(t, next);
+}
+
+// Store the current summary as a full version and mark it as the latest
+// full one.  The summary must be at the last committed version (asserted).
+void LogMonitor::encode_full(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " log v " << summary.version << dendl;
+  ceph_assert(get_last_committed() == summary.version);
+
+  bufferlist summary_bl;
+  encode(summary, summary_bl, mon.get_quorum_con_features());
+
+  put_version_full(t, summary.version, summary_bl);
+  put_version_latest_full(t, summary.version);
+}
+
+// Trim target: on the leader, everything older than the last
+// mon_max_log_epochs versions; 0 (nothing) on non-leaders or while fewer
+// versions than that exist.
+version_t LogMonitor::get_trim_to() const
+{
+  if (!mon.is_leader()) {
+    return 0;
+  }
+
+  unsigned keep = g_conf()->mon_max_log_epochs;
+  version_t last = get_last_committed();
+  return (last > keep) ? last - keep : 0;
+}
+
+// Read-only dispatch for MSG_MON_COMMAND and MSG_LOG.  A bad_cmd_get
+// thrown while parsing a command is answered with -EINVAL.  Any other
+// message type aborts.
+bool LogMonitor::preprocess_query(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_query");
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
+  switch (m->get_type()) {
+  case MSG_MON_COMMAND:
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+
+  case MSG_LOG:
+    return preprocess_log(op);
+
+  default:
+    ceph_abort();
+    return true;
+  }
+}
+
+// Mutating dispatch for MSG_MON_COMMAND and MSG_LOG.  A bad_cmd_get
+// thrown while parsing a command is answered with -EINVAL.  Any other
+// message type aborts.
+bool LogMonitor::prepare_update(MonOpRequestRef op)
+{
+  op->mark_logmon_event("prepare_update");
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
+  switch (m->get_type()) {
+  case MSG_MON_COMMAND:
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  case MSG_LOG:
+    return prepare_log(op);
+  default:
+    ceph_abort();
+    return false;
+  }
+}
+
+// Filter an incoming MLog before it reaches prepare_log().
+//
+// The message is dropped without a reply (returns true) when there is no
+// session, the session lacks write capability on "log", or none of its
+// entries is new with respect to the pending summary.  Otherwise returns
+// false so the message goes on to prepare_log().
+bool LogMonitor::preprocess_log(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_log");
+  auto m = op->get_req<MLog>();
+  dout(10) << "preprocess_log " << *m << " from " << m->get_orig_source() << dendl;
+  int num_new = 0;
+
+  MonSession *session = op->get_session();
+  if (!session)
+    goto done;
+  if (!session->is_capable("log", MON_CAP_W)) {
+    dout(0) << "preprocess_log got MLog from entity with insufficient privileges "
+	    << session->caps << dendl;
+    goto done;
+  }
+  
+  // count the entries the pending summary does not contain yet
+  for (auto p = m->entries.begin();
+       p != m->entries.end();
+       ++p) {
+    if (!pending_summary.contains(p->key()))
+      num_new++;
+  }
+  if (!num_new) {
+    dout(10) << "  nothing new" << dendl;
+    goto done;
+  }
+
+  return false;
+
+ done:
+  mon.no_reply(op);
+  return true;
+}
+
+// Completion run once the proposal carrying an MLog has finished: calls
+// _updated_log() for the op unless the result is -ECANCELED.
+struct LogMonitor::C_Log : public C_MonOp {
+  LogMonitor *logmon;
+  C_Log(LogMonitor *p, MonOpRequestRef o) :
+    C_MonOp(o), logmon(p) {}
+  void _finish(int r) override {
+    if (r == -ECANCELED) {
+      return;
+    }
+    logmon->_updated_log(op);
+  }
+};
+
+// Queue the new entries of an MLog for the next proposal.
+//
+// Messages carrying a foreign fsid are ignored (returns false).  Entries
+// already present in the pending summary are skipped; the rest are added
+// to both the pending summary and the pending log.  The sender is
+// acknowledged once the proposal has finished.
+bool LogMonitor::prepare_log(MonOpRequestRef op) 
+{
+  op->mark_logmon_event("prepare_log");
+  auto m = op->get_req<MLog>();
+  dout(10) << "prepare_log " << *m << " from " << m->get_orig_source() << dendl;
+
+  if (m->fsid != mon.monmap->fsid) {
+    dout(0) << "handle_log on fsid " << m->fsid << " != " << mon.monmap->fsid 
+	    << dendl;
+    return false;
+  }
+
+  for (auto& entry : m->entries) {
+    dout(10) << " logging " << entry << dendl;
+    if (pending_summary.contains(entry.key())) {
+      continue;
+    }
+    pending_summary.add(entry);
+    pending_log.insert(pair<utime_t,LogEntry>(entry.stamp, entry));
+  }
+  pending_summary.prune(g_conf()->mon_log_max_summary);
+  wait_for_finished_proposal(op, new C_Log(this, op));
+  return true;
+}
+
+// Acknowledge a committed MLog: reply with an MLogAck carrying the seq of the
+// last entry in the message.
+void LogMonitor::_updated_log(MonOpRequestRef op)
+{
+  auto m = op->get_req<MLog>();
+  dout(7) << "_updated_log for " << m->get_orig_source_inst() << dendl;
+
+  const auto newest_seq = m->entries.rbegin()->seq;
+  auto *ack = new MLogAck(m->fsid, newest_seq);
+  mon.send_reply(op, ack);
+}
+
+// Propose right away once pending_log has reached
+// mon_max_log_entries_per_event entries (when that option is positive);
+// otherwise defer to the generic PaxosService policy.
+bool LogMonitor::should_propose(double& delay)
+{
+  const auto batch_limit = g_conf()->mon_max_log_entries_per_event;
+  const bool batch_full =
+    batch_limit > 0 && pending_log.size() >= (unsigned)batch_limit;
+  if (batch_full) {
+    return true;
+  }
+
+  return PaxosService::should_propose(delay);
+}
+
+
+/**
+ * Handle the read-only log commands; at present that is only "log last".
+ *
+ * "log last" reports the newest `num` (default 20) summary entries whose
+ * priority is at least `level` (default CLOG_INFO), oldest first, taken
+ * either from one channel (default CLOG_CHANNEL_DEFAULT) or, for channel
+ * "*", from the merged tail of all channels.  With a formatter the entries
+ * are dumped into a "tail" array, otherwise they are written one per line.
+ *
+ * @return true when a reply was sent (including error replies), false when
+ *         the command is not handled here.
+ */
+bool LogMonitor::preprocess_command(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_command");
+  auto m = op->get_req<MMonCommand>();
+  int r = -EINVAL;
+  bufferlist rdata;
+  stringstream ss;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "log last") {
+    int64_t num = 20;
+    cmd_getval(cmdmap, "num", num);
+    if (f) {
+      f->open_array_section("tail");
+    }
+
+    std::string level_str;
+    clog_type level;
+    if (cmd_getval(cmdmap, "level", level_str)) {
+      level = LogEntry::str_to_level(level_str);
+      if (level == CLOG_UNKNOWN) {
+        ss << "Invalid severity '" << level_str << "'";
+        mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+        return true;
+      }
+    } else {
+      level = CLOG_INFO;
+    }
+
+    std::string channel;
+    if (!cmd_getval(cmdmap, "channel", channel)) {
+      channel = CLOG_CHANNEL_DEFAULT;
+    }
+
+    // severity filter applied while selecting entries
+    auto match = [level](const LogEntry &entry) {
+      return entry.prio >= level;
+    };
+
+    // Matching entries, newest first.  Collecting them while walking
+    // backwards caps the result at exactly `num` entries (the previous
+    // iterator arithmetic started one entry too early whenever more than
+    // `num` entries matched, returned one entry for num <= 0, and
+    // decremented rbegin() on an empty container).
+    vector<const LogEntry*> newest_first;
+    auto want_more = [&newest_first, num]() {
+      return num > 0 &&
+	newest_first.size() < static_cast<uint64_t>(num);
+    };
+
+    // owns the merged tail for channel "*"; must outlive newest_first
+    list<LogEntry> full_tail;
+    if (channel == "*") {
+      summary.build_ordered_tail(&full_tail);
+      // whole-tail dump is debugging aid only, keep it off the error log
+      dout(20) << "full " << full_tail << dendl;
+      for (auto rp = full_tail.rbegin();
+	   want_more() && rp != full_tail.rend();
+	   ++rp) {
+	if (match(*rp)) {
+	  newest_first.push_back(&*rp);
+	}
+      }
+    } else {
+      auto p = summary.tail_by_channel.find(channel);
+      if (p != summary.tail_by_channel.end()) {
+	for (auto rp = p->second.rbegin();
+	     want_more() && rp != p->second.rend();
+	     ++rp) {
+	  if (match(rp->second)) {
+	    newest_first.push_back(&rp->second);
+	  }
+	}
+      }
+    }
+
+    // emit oldest first, i.e. in reverse order of collection
+    ostringstream plain_out;
+    for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
+      const LogEntry &entry = **it;
+      if (f) {
+	f->dump_object("entry", entry);
+      } else {
+	plain_out << entry << "\n";
+      }
+    }
+
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      rdata.append(plain_out.str());
+    }
+    r = 0;
+  } else {
+    return false;
+  }
+
+  string rs;
+  getline(ss, rs);
+  mon.reply_command(op, r, rs, rdata, get_last_committed());
+  return true;
+}
+
+
+/**
+ * Handle the mutating log commands; at present that is only "log", which
+ * queues the supplied text as a new entry on CLOG_CHANNEL_DEFAULT.
+ *
+ * The optional "level" argument (default "info") must name a known
+ * severity; an unknown one is answered with -EINVAL, as "log last" does,
+ * rather than being stored with priority CLOG_UNKNOWN.
+ *
+ * @return true when a reply was sent for unparsable JSON or a missing
+ *         session, or when an entry was queued and the reply is deferred
+ *         until the proposal finishes; false when the command was rejected
+ *         without changing any pending state.
+ */
+bool LogMonitor::prepare_command(MonOpRequestRef op)
+{
+  op->mark_logmon_event("prepare_command");
+  auto m = op->get_req<MMonCommand>();
+  stringstream ss;
+  string rs;
+  int err = -EINVAL;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  if (prefix == "log") {
+    vector<string> logtext;
+    string level_str;
+    cmd_getval(cmdmap, "logtext", logtext);
+    cmd_getval(cmdmap, "level", level_str, string("info"));
+
+    // validate before touching any pending state
+    const clog_type prio = LogEntry::str_to_level(level_str);
+    if (prio == CLOG_UNKNOWN) {
+      ss << "Invalid severity '" << level_str << "'";
+      mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+      return false;
+    }
+
+    LogEntry le;
+    le.rank = m->get_orig_source();
+    le.addrs.v.push_back(m->get_orig_source_addr());
+    le.name = session->entity_name;
+    le.stamp = m->get_recv_stamp();
+    le.seq = 0;
+    le.prio = prio;
+    le.channel = CLOG_CHANNEL_DEFAULT;
+    le.msg = str_join(logtext, " ");
+    pending_summary.add(le);
+    pending_summary.prune(g_conf()->mon_log_max_summary);
+    pending_log.insert(pair<utime_t,LogEntry>(le.stamp, le));
+    // reply once the entry has been committed
+    wait_for_finished_proposal(op, new Monitor::C_Command(
+          mon, op, 0, string(), get_last_committed() + 1));
+    return true;
+  }
+
+  // unknown command: answer with the default error
+  getline(ss, rs);
+  mon.reply_command(op, err, rs, get_last_committed());
+  return false;
+}
+
+
+// Map a subscription name of the form "log-<level>" to the value returned by
+// LogEntry::str_to_level() for <level>; any other name yields CLOG_UNKNOWN.
+int LogMonitor::sub_name_to_id(const string& n)
+{
+  const bool has_prefix = (n.compare(0, 4, "log-") == 0);
+  if (!has_prefix || n.size() <= 4) {
+    return CLOG_UNKNOWN;
+  }
+  return LogEntry::str_to_level(n.substr(4));
+}
+
+/**
+ * Run check_sub() on every subscription whose type is a recognised
+ * "log-<level>" name.
+ */
+void LogMonitor::check_subs()
+{
+  dout(10) << __func__ << dendl;
+  for (auto& by_type : mon.session_map.subs) {
+    auto j = by_type.second->begin();
+    while (!j.end()) {
+      Subscription *sub = *j;
+      // Step past this subscription before handling it: check_sub() calls
+      // session_map.remove_sub() for one-time subscriptions, and advancing
+      // an iterator that still refers to the removed item is not safe.
+      ++j;
+      if (sub_name_to_id(sub->type) >= 0)
+	check_sub(sub);
+    }
+  }
+}
+
+/**
+ * Send a log subscriber whatever it has not seen yet.
+ *
+ * Nothing happens when the subscriber already asks for a version beyond
+ * summary.version.  Otherwise an MLog is built starting at s->next (or at
+ * the last committed version when s->next is 0) and sent if it has entries.
+ * Afterwards a one-time subscription is removed; any other has its next
+ * version set to summary.version + 1.
+ */
+void LogMonitor::check_sub(Subscription *s)
+{
+  dout(10) << __func__ << " client wants " << s->type << " ver " << s->next << dendl;
+
+  const int sub_level = sub_name_to_id(s->type);
+  ceph_assert(sub_level >= 0);
+
+  const version_t summary_version = summary.version;
+  if (s->next > summary_version) {
+    dout(10) << __func__ << " client " << s->session->name
+	    << " requested version (" << s->next << ") is greater than ours (" 
+	    << summary_version << "), which means we already sent him" 
+	    << " everything we have." << dendl;
+    return;
+  }
+
+  // a first-time subscriber (next == 0) starts at the last committed version
+  const version_t start = (s->next == 0) ? get_last_committed() : s->next;
+
+  MLog *mlog = new MLog(mon.monmap->fsid);
+  _create_sub_incremental(mlog, sub_level, start);
+
+  dout(10) << __func__ << " sending message to " << s->session->name
+	  << " with " << mlog->entries.size() << " entries"
+	  << " (version " << mlog->version << ")" << dendl;
+
+  if (mlog->entries.empty()) {
+    // nothing to deliver; release the unused message
+    mlog->put();
+  } else {
+    s->session->con->send_message(mlog);
+  }
+
+  if (s->onetime) {
+    mon.session_map.remove_sub(s);
+  } else {
+    s->next = summary_version+1;
+  }
+}
+
+/**
+ * Fill \p mlog with the log entries of versions \p sv .. \p summary.version
+ *
+ * If \p sv is older than the first committed version, a CLOG_WARN entry
+ * saying that messages were skipped is added and \p sv is moved up to the
+ * first committed version.
+ *
+ * @param mlog	Log message that receives the entries; its version is set to
+ *		the last version that was read.
+ * @param level	Entries with a priority below this value are left out.
+ * @param sv	The first version the client is looking for.
+ */
+void LogMonitor::_create_sub_incremental(MLog *mlog, int level, version_t sv)
+{
+  dout(10) << __func__ << " level " << level << " ver " << sv 
+	  << " cur summary ver " << summary.version << dendl; 
+
+  const version_t first = get_first_committed();
+  if (sv < first) {
+    dout(10) << __func__ << " skipped from " << sv
+	     << " to first_committed " << first << dendl;
+    ostringstream note;
+    note << "skipped log messages from " << sv << " to " << first;
+
+    LogEntry gap;
+    gap.stamp = ceph_clock_now();
+    gap.prio = CLOG_WARN;
+    gap.msg = note.str();
+    mlog->entries.push_back(gap);
+    sv = first;
+  }
+
+  const version_t last = summary.version;
+  for (; sv && sv <= last; ++sv) {
+    bufferlist bl;
+    int err = get_version(sv, bl);
+    ceph_assert(err == 0);
+    ceph_assert(bl.length());
+
+    auto p = bl.cbegin();
+    __u8 v;
+    decode(v,p);
+    while (!p.end()) {
+      LogEntry le;
+      le.decode(p);
+
+      if (le.prio >= level) {
+	mlog->entries.push_back(le);
+      } else {
+	dout(20) << __func__ << " requested " << level 
+		 << " entry " << le.prio << dendl;
+      }
+    }
+    mlog->version = sv;
+  }
+
+  dout(10) << __func__ << " incremental message ready (" 
+	   << mlog->entries.size() << " entries)" << dendl;
+}
+
+/**
+ * Rebuild the per-channel settings in `channels` from the
+ * mon_cluster_log_* options.
+ *
+ * The settings are cleared first and the options are then parsed in a fixed
+ * order.  The first option that fails to parse is reported and ends the
+ * update, leaving the remaining maps empty and the $channel expansion
+ * undone.
+ */
+void LogMonitor::update_log_channels()
+{
+  ostringstream oss;
+
+  channels.clear();
+
+  // option name -> map it is parsed into, in processing order
+  const std::pair<const char*, std::map<std::string,std::string>*> sources[] = {
+    {"mon_cluster_log_to_syslog",          &channels.log_to_syslog},
+    {"mon_cluster_log_to_syslog_level",    &channels.syslog_level},
+    {"mon_cluster_log_to_syslog_facility", &channels.syslog_facility},
+    {"mon_cluster_log_file",               &channels.log_file},
+    {"mon_cluster_log_file_level",         &channels.log_file_level},
+    {"mon_cluster_log_to_graylog",         &channels.log_to_graylog},
+    {"mon_cluster_log_to_graylog_host",    &channels.log_to_graylog_host},
+    {"mon_cluster_log_to_graylog_port",    &channels.log_to_graylog_port},
+  };
+
+  // plain loop rather than a lambda so that __func__ keeps naming this method
+  for (const auto& src : sources) {
+    int r = get_conf_str_map_helper(
+      g_conf().get_val<string>(src.first),
+      oss, src.second,
+      CLOG_CONFIG_DEFAULT_KEY);
+    if (r < 0) {
+      derr << __func__ << " error parsing '" << src.first << "'" << dendl;
+      return;
+    }
+  }
+
+  channels.expand_channel_meta();
+}
+
+
+// Config observer callback: rebuild the channel settings as soon as any of
+// the cluster-log options is among the changed keys.
+void LogMonitor::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  static const char* const channel_options[] = {
+    "mon_cluster_log_to_syslog",
+    "mon_cluster_log_to_syslog_level",
+    "mon_cluster_log_to_syslog_facility",
+    "mon_cluster_log_file",
+    "mon_cluster_log_file_level",
+    "mon_cluster_log_to_graylog",
+    "mon_cluster_log_to_graylog_host",
+    "mon_cluster_log_to_graylog_port",
+  };
+
+  for (const char* option : channel_options) {
+    if (changed.count(option)) {
+      update_log_channels();
+      return;
+    }
+  }
+}
diff --git a/src/mon/LogMonitor.h b/src/mon/LogMonitor.h
new file mode 100644
index 000000000..6d6a0b71c
--- /dev/null
+++ b/src/mon/LogMonitor.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_LOGMONITOR_H
+#define CEPH_LOGMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/types.h"
+#include "PaxosService.h"
+
+#include "common/config_fwd.h"
+#include "common/LogEntry.h"
+#include "include/str_map.h"
+
+class MLog;
+
+static const std::string LOG_META_CHANNEL = "$channel";
+
+namespace ceph {
+namespace logging {
+  class Graylog;
+}
+}
+
+/**
+ * Paxos service for the cluster log.
+ *
+ * Holds the committed LogSummary next to the pending one, queues incoming
+ * log entries until they are proposed, serves log subscriptions, and acts
+ * as a config observer for the mon_cluster_log_* options listed in
+ * get_tracked_conf_keys().
+ */
+class LogMonitor : public PaxosService,
+                   public md_config_obs_t {
+private:
+  // entries queued for the next proposal, keyed by their timestamp
+  std::multimap<utime_t,LogEntry> pending_log;
+  LogSummary pending_summary, summary;
+
+  /**
+   * Per-channel output settings.  Each map is looked up by channel name,
+   * with CLOG_CONFIG_DEFAULT_KEY as the fallback key in the inline getters
+   * below.
+   */
+  struct log_channel_info {
+
+    std::map<std::string,std::string> log_to_syslog;
+    std::map<std::string,std::string> syslog_level;
+    std::map<std::string,std::string> syslog_facility;
+    std::map<std::string,std::string> log_file;
+    std::map<std::string,std::string> expanded_log_file;
+    std::map<std::string,std::string> log_file_level;
+    std::map<std::string,std::string> log_to_graylog;
+    std::map<std::string,std::string> log_to_graylog_host;
+    std::map<std::string,std::string> log_to_graylog_port;
+
+    std::map<std::string, std::shared_ptr<ceph::logging::Graylog>> graylogs;
+    uuid_d fsid;
+    std::string host;
+
+    // Empties every settings map and the Graylog handles; fsid and host are
+    // left as they are.
+    void clear() {
+      log_to_syslog.clear();
+      syslog_level.clear();
+      syslog_facility.clear();
+      log_file.clear();
+      expanded_log_file.clear();
+      log_file_level.clear();
+      log_to_graylog.clear();
+      log_to_graylog_host.clear();
+      log_to_graylog_port.clear();
+      graylogs.clear();
+    }
+
+    /** expands $channel meta variable on all maps *EXCEPT* log_file
+     *
+     * We won't expand the log_file map meta variables here because we
+     * intend to do that selectively during get_log_file()
+     *
+     * NOTE(review): only the syslog maps and log_file_level are expanded
+     * below; the three graylog maps are not touched either -- confirm that
+     * this is intended.
+     */
+    void expand_channel_meta() {
+      expand_channel_meta(log_to_syslog);
+      expand_channel_meta(syslog_level);
+      expand_channel_meta(syslog_facility);
+      expand_channel_meta(log_file_level);
+    }
+    void expand_channel_meta(std::map<std::string,std::string> &m);
+    std::string expand_channel_meta(const std::string &input,
+				    const std::string &change_to);
+
+    bool do_log_to_syslog(const std::string &channel);
+
+    // syslog facility configured for the channel
+    std::string get_facility(const std::string &channel) {
+      return get_str_map_key(syslog_facility, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    // syslog level configured for the channel
+    std::string get_level(const std::string &channel) {
+      return get_str_map_key(syslog_level, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    std::string get_log_file(const std::string &channel);
+
+    // log file level configured for the channel
+    std::string get_log_file_level(const std::string &channel) {
+      return get_str_map_key(log_file_level, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+    }
+
+    // true only when the configured value is exactly the string "true"
+    bool do_log_to_graylog(const std::string &channel) {
+      return (get_str_map_key(log_to_graylog, channel,
+			      &CLOG_CONFIG_DEFAULT_KEY) == "true");
+    }
+
+    std::shared_ptr<ceph::logging::Graylog> get_graylog(const std::string &channel);
+  } channels;
+
+  // re-parse the mon_cluster_log_* options into `channels`
+  void update_log_channels();
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;  // prepare a new pending
+  // propose pending update to peers
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  void encode_full(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+
+  // MLog handling: screen, queue, and acknowledge after the proposal
+  bool preprocess_log(MonOpRequestRef op);
+  bool prepare_log(MonOpRequestRef op);
+  void _updated_log(MonOpRequestRef op);
+
+  bool should_propose(double& delay) override;
+
+  bool should_stash_full() override {
+    // commit a LogSummary on every commit
+    return true;
+  }
+
+  // completion used by prepare_log(); defined in LogMonitor.cc
+  struct C_Log;
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  // fill mlog with the entries from version sv onwards, filtered by level
+  void _create_sub_incremental(MLog *mlog, int level, version_t sv);
+
+ public:
+  LogMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name) { }
+
+  // registers this object as a config observer and builds the initial
+  // channel settings
+  void init() override {
+    generic_dout(10) << "LogMonitor::init" << dendl;
+    g_conf().add_observer(this);
+    update_log_channels();
+  }
+  
+  void tick() override;  // check state, take actions
+
+  void check_subs();
+  void check_sub(Subscription *s);
+
+  /**
+   * translate log sub name ('log-info') to integer id
+   *
+   * @param n name
+   * @return id, or -1 if unrecognized
+   */
+  int sub_name_to_id(const std::string& n);
+
+  // undo the observer registration made in init()
+  void on_shutdown() override {
+    g_conf().remove_observer(this);
+  }
+
+  // NULL-terminated list of the options this observer is interested in
+  const char **get_tracked_conf_keys() const override {
+    static const char* KEYS[] = {
+      "mon_cluster_log_to_syslog",
+      "mon_cluster_log_to_syslog_level",
+      "mon_cluster_log_to_syslog_facility",
+      "mon_cluster_log_file",
+      "mon_cluster_log_file_level",
+      "mon_cluster_log_to_graylog",
+      "mon_cluster_log_to_graylog_host",
+      "mon_cluster_log_to_graylog_port",
+      NULL
+    };
+    return KEYS;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+};
+#endif
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
new file mode 100644
index 000000000..2ec7a2018
--- /dev/null
+++ b/src/mon/MDSMonitor.cc
@@ -0,0 +1,2370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <regex>
+#include <sstream>
+#include <boost/utility.hpp>
+
+#include "MDSMonitor.h"
+#include "FSCommands.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+#include "OSDMonitor.h"
+
+#include "common/strtol.h"
+#include "common/perf_counters.h"
+#include "common/config.h"
+#include "common/cmdparse.h"
+#include "messages/MMDSMap.h"
+#include "messages/MFSMap.h"
+#include "messages/MFSMapUser.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMonCommand.h"
+#include "messages/MGenericMessage.h"
+
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "mds/mdstypes.h"
+#include "Session.h"
+
+using namespace TOPNSPC::common;
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::string_view;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_fsmap())
+// Writes the debug-line prefix used by dout in this file: monitor name, rank
+// and state, followed by the epoch of the given FSMap.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
+  std::ostream &out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name();
+  out << ").mds e" << fsmap.get_epoch() << " ";
+  return out;
+}
+
+static const string MDS_METADATA_PREFIX("mds_metadata");
+static const string MDS_HEALTH_PREFIX("mds_health");
+
+
+/*
+ * Specialized implementations of cmd_getval that allow us to parse
+ * out strongly-typed values (mds_gid_t, mds_rank_t, MDSMap::DaemonState).
+ *
+ * Each one reads the argument as a plain int64_t and converts it afterwards.
+ * Binding an int64_t reference directly onto the target object would
+ * reinterpret its storage, which breaks the aliasing rules and writes past
+ * the object whenever the target type is narrower than int64_t.
+ * The target is only assigned when the key was found.
+ */
+namespace TOPNSPC::common {
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
+			   const std::string& k, mds_gid_t &val)
+{
+  int64_t raw = 0;
+  if (!cmd_getval(cmdmap, k, raw)) {
+    return false;
+  }
+  val = mds_gid_t(raw);
+  return true;
+}
+
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
+			   const std::string& k, mds_rank_t &val)
+{
+  int64_t raw = 0;
+  if (!cmd_getval(cmdmap, k, raw)) {
+    return false;
+  }
+  val = static_cast<mds_rank_t>(raw);
+  return true;
+}
+
+template<> bool cmd_getval(const cmdmap_t& cmdmap,
+			   const std::string& k, MDSMap::DaemonState &val)
+{
+  int64_t raw = 0;
+  if (!cmd_getval(cmdmap, k, raw)) {
+    return false;
+  }
+  val = static_cast<MDSMap::DaemonState>(raw);
+  return true;
+}
+}
+// my methods
+
+// Print the given FSMap into the debug log at level dblV; the three
+// statements together form a single dout ... dendl log entry.
+template <int dblV>
+void MDSMonitor::print_map(const FSMap& m)
+{
+  dout(dblV) << "print_map\n";
+  m.print(*_dout);
+  *_dout << dendl;
+}
+
+// service methods
+// Nothing to set up here: this only leaves a debug trace.
+void MDSMonitor::create_initial()
+{
+  dout(10) << "create_initial" << dendl;
+}
+
+// Adds the store prefixes used by this service to s: the service name plus
+// the MDS metadata and MDS health prefixes.
+void MDSMonitor::get_store_prefixes(std::set<string>& s) const
+{
+  s.insert({service_name, MDS_METADATA_PREFIX, MDS_HEALTH_PREFIX});
+}
+
+/**
+ * Bring the in-memory FSMap up to the last committed version.
+ *
+ * Does nothing when the map epoch already equals that version.  Otherwise
+ * the health data is reloaded, the committed map is read and decoded
+ * (a decode failure is logged and rethrown), the new map is printed and,
+ * unless mon_mds_skip_sanity is set, sanity-checked; finally subscribers
+ * are checked.
+ */
+void MDSMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  const version_t committed = get_last_committed();
+  const auto current = get_fsmap().epoch;
+  if (committed == current) {
+    return;
+  }
+
+  dout(10) << __func__ << " version " << committed
+	   << ", my e " << current << dendl;
+  ceph_assert(committed > current);
+
+  load_health();
+
+  // read and decode
+  bufferlist encoded;
+  const int r = get_version(committed, encoded);
+  ceph_assert(r == 0);
+
+  ceph_assert(encoded.length() > 0);
+  dout(10) << __func__ << " got " << committed << dendl;
+  try {
+    PaxosFSMap::decode(encoded);
+  } catch (const ceph::buffer::malformed_input& e) {
+    derr << "unable to decode FSMap: " << e.what() << dendl;
+    throw;
+  }
+
+  // new map
+  dout(0) << "new map" << dendl;
+  print_map<0>(get_fsmap());
+  const bool skip_sanity = g_conf()->mon_mds_skip_sanity;
+  if (!skip_sanity) {
+    get_fsmap().sanity();
+  }
+
+  check_subs();
+}
+
+// Load the stored MDS metadata into pending_metadata; the result of the
+// load is deliberately ignored.
+void MDSMonitor::init()
+{
+  (void)load_metadata(pending_metadata);
+}
+
+// Start a new pending FSMap and, when the OSD monitor is readable, sanitize
+// it against the pools that exist in the current OSDMap.
+void MDSMonitor::create_pending()
+{
+  auto &pending = PaxosFSMap::create_pending();
+
+  auto *osdmon = mon.osdmon();
+  if (osdmon->is_readable()) {
+    const auto &osdmap = osdmon->osdmap;
+    auto pool_exists = [&osdmap](int64_t pool) {
+      return osdmap.have_pg_pool(pool);
+    };
+    pending.sanitize(pool_exists);
+  }
+
+  dout(10) << "create_pending e" << pending.epoch << dendl;
+}
+
+/**
+ * Encode the pending FSMap, the per-daemon health records and the health
+ * checks derived from them into transaction t.
+ *
+ * Daemons whose health record is erased in this transaction are left out of
+ * the health checks.  The set of removed gids is therefore moved into a
+ * local before the member is emptied; clearing the member first made the
+ * "was removed" test in the health loop unable to ever match.
+ */
+void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  auto &pending = get_pending_fsmap_writeable();
+  auto &epoch = pending.epoch;
+
+  dout(10) << "encode_pending e" << epoch << dendl;
+
+  // print map iff 'debug mon = 30' or higher
+  print_map<30>(pending);
+  if (!g_conf()->mon_mds_skip_sanity) {
+    pending.sanity(true);
+  }
+
+  // Set 'modified' on maps modified this epoch
+  for (auto &p : pending.filesystems) {
+    if (p.second->mds_map.epoch == epoch) {
+      p.second->mds_map.modified = ceph_clock_now();
+    }
+  }
+
+  // apply to paxos
+  ceph_assert(get_last_committed() + 1 == pending.epoch);
+  bufferlist pending_bl;
+  pending.encode(pending_bl, mon.get_quorum_con_features());
+
+  /* put everything in the transaction */
+  put_version(t, pending.epoch, pending_bl);
+  put_last_committed(t, pending.epoch);
+
+  // Encode MDSHealth data
+  for (auto &h : pending_daemon_health) {
+    bufferlist bl;
+    h.second.encode(bl);
+    t->put(MDS_HEALTH_PREFIX, stringify(h.first), bl);
+  }
+
+  // Take over the removal set: the member is empty from here on, as before,
+  // while the gids stay available for the health loop below.
+  decltype(pending_daemon_health_rm) removed_gids;
+  removed_gids.swap(pending_daemon_health_rm);
+  for (const auto &removed : removed_gids) {
+    t->erase(MDS_HEALTH_PREFIX, stringify(removed));
+  }
+  remove_from_metadata(pending, t);
+
+  // health
+  health_check_map_t new_checks;
+  const auto &info_map = pending.get_mds_info();
+  for (const auto &i : info_map) {
+    const auto &gid = i.first;
+    const auto &info = i.second;
+    if (removed_gids.count(gid)) {
+      // health record erased above; do not report stale data for it
+      continue;
+    }
+    MDSHealth health;
+    auto p = pending_daemon_health.find(gid);
+    if (p != pending_daemon_health.end()) {
+      health = p->second;
+    } else {
+      // no update pending for this daemon: fall back to the stored record
+      bufferlist bl;
+      mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
+      if (!bl.length()) {
+	derr << "Missing health data for MDS " << gid << dendl;
+	continue;
+      }
+      auto bl_i = bl.cbegin();
+      health.decode(bl_i);
+    }
+    for (const auto &metric : health.metrics) {
+      if (metric.type == MDS_HEALTH_DUMMY) {
+        continue;
+      }
+      const auto rank = info.rank;
+      health_check_t *check = &new_checks.get_or_add(
+	mds_metric_name(metric.type),
+	metric.sev,
+	mds_metric_summary(metric.type),
+	1);
+      ostringstream ss;
+      ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
+      // metadata is appended as " k: v, k: v"
+      bool first = true;
+      for (auto &p : metric.metadata) {
+	if (first) {
+	  ss << " ";
+	} else {
+	  ss << ", ";
+        }
+	ss << p.first << ": " << p.second;
+        first = false;
+      }
+      check->detail.push_back(ss.str());
+    }
+  }
+  pending.get_health_checks(&new_checks);
+
+  // fill in the placeholders of each summary from the number of detail lines
+  for (auto& p : new_checks.checks) {
+    p.second.summary = std::regex_replace(
+      p.second.summary,
+      std::regex("%num%"),
+      stringify(p.second.detail.size()));
+    p.second.summary = std::regex_replace(
+      p.second.summary,
+      std::regex("%plurals%"),
+      p.second.detail.size() > 1 ? "s" : "");
+    p.second.summary = std::regex_replace(
+      p.second.summary,
+      std::regex("%isorare%"),
+      p.second.detail.size() > 1 ? "are" : "is");
+    p.second.summary = std::regex_replace(
+      p.second.summary,
+      std::regex("%hasorhave%"),
+      p.second.detail.size() > 1 ? "have" : "has");
+  }
+  encode_health(new_checks, t);
+}
+
+/**
+ * Work out the version up to which old FSMap epochs may be trimmed.
+ *
+ * Starts from mon_mds_force_trim_to when that is positive and not beyond
+ * the last committed version, and raises the result to
+ * (last committed - mon_max_mdsmap_epochs) when more than that many epochs
+ * are being kept.  Returns 0 when neither applies.
+ */
+version_t MDSMonitor::get_trim_to() const
+{
+  version_t floor = 0;
+
+  const auto forced = g_conf()->mon_mds_force_trim_to;
+  if (forced > 0 && forced <= (int)get_last_committed()) {
+    floor = forced;
+    dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
+             << floor << dendl;
+  }
+
+  const unsigned max = g_conf()->mon_max_mdsmap_epochs;
+  const version_t last = get_last_committed();
+  const bool too_many = (last - get_first_committed() > max);
+  if (too_many && floor < last - max) {
+    floor = last - max;
+  }
+
+  dout(20) << __func__ << " = " << floor << dendl;
+  return floor;
+}
+
+/**
+ * Dispatch the read-only phase of a request by message type.
+ *
+ * A bad_cmd_get thrown while preprocessing a command is turned into an
+ * -EINVAL reply.  Any message type not listed here aborts.
+ */
+bool MDSMonitor::preprocess_query(MonOpRequestRef op)
+{
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
+	   << " " << m->get_orig_source_addrs() << dendl;
+
+  const auto type = m->get_type();
+
+  if (type == MSG_MDS_BEACON) {
+    return preprocess_beacon(op);
+  }
+
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+
+  if (type == MSG_MDS_OFFLOAD_TARGETS) {
+    return preprocess_offload_targets(op);
+  }
+
+  ceph_abort();
+  return true;
+}
+
+// Record, for the sending daemon's gid, the current monotonic time and the
+// sequence number of this beacon in last_beacon.
+void MDSMonitor::_note_beacon(MMDSBeacon *m)
+{
+  const mds_gid_t gid(m->get_global_id());
+  const version_t seq = m->get_seq();
+
+  dout(5) << "_note_beacon " << *m << " noting time" << dendl;
+
+  auto &entry = last_beacon[gid];
+  entry.stamp = mono_clock::now();
+  entry.seq = seq;
+}
+
+/**
+ * Read-only screening of an MDS beacon.
+ *
+ * Three outcomes:
+ *  - `ignore`: the beacon is dropped without a reply (no session, missing
+ *    "mds" capability, foreign fsid, no port in the source address, stale
+ *    sequence number, or a BOOT beacon from a daemon that is already known);
+ *  - `reply`: the beacon time is noted and an MMDSBeacon is sent back;
+ *  - `return false`: the beacon is not finished here (we are not the
+ *    leader, the daemon is booting and unknown, or something about the
+ *    daemon differs from what the map holds).
+ *
+ * @return true when the beacon was fully handled here, false otherwise.
+ */
+bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMDSBeacon>();
+  MDSMap::DaemonState state = m->get_state();
+  mds_gid_t gid = m->get_global_id();
+  version_t seq = m->get_seq();
+  MDSMap::mds_info_t info;
+  // set below before any jump to `reply`, which asserts that it is > 0
+  epoch_t effective_epoch = 0;
+
+  const auto &fsmap = get_fsmap();
+
+  // check privileges, ignore if fails
+  MonSession *session = op->get_session();
+  if (!session)
+    goto ignore;
+  if (!session->is_capable("mds", MON_CAP_X)) {
+    dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
+	    << session->caps << dendl;
+    goto ignore;
+  }
+
+  if (m->get_fsid() != mon.monmap->fsid) {
+    dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
+    goto ignore;
+  }
+
+  dout(5)  << "preprocess_beacon " << *m
+	   << " from " << m->get_orig_source()
+	   << " " << m->get_orig_source_addrs()
+	   << " " << m->get_compat()
+	   << dendl;
+
+  // make sure the address has a port
+  if (m->get_orig_source_addr().get_port() == 0) {
+    dout(1) << " ignoring boot message without a port" << dendl;
+    goto ignore;
+  }
+
+  // fw to leader?
+  if (!is_leader())
+    return false;
+
+  // booted, but not in map?
+  if (!fsmap.gid_exists(gid)) {
+    if (state != MDSMap::STATE_BOOT) {
+      dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
+              << ceph_mds_state_name(state) << ")" << dendl;
+
+      /* We can't send an MDSMap this MDS was a part of because we no longer
+       * know which FS it was part of. Nor does this matter. Sending an empty
+       * MDSMap is sufficient for getting the MDS to respawn.
+       */
+      // note: this `m` shadows the beacon `m` for the rest of this scope
+      auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+      mon.send_reply(op, m.detach());
+      return true;
+    } else {
+      return false;  // not booted yet.
+    }
+  }
+  dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
+  info = fsmap.get_info_gid(gid);
+
+  if (state == MDSMap::STATE_DNE) {
+    return false;
+  }
+
+  // old seq?
+  if (info.state_seq > seq) {
+    dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
+    goto ignore;
+  }
+
+  // Work out the latest epoch that this daemon should have seen
+  {
+    fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      // not assigned to a filesystem: use the epoch kept for standbys
+      effective_epoch = fsmap.standby_epochs.at(gid);
+    } else {
+      effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
+    }
+    if (effective_epoch != m->get_last_epoch_seen()) {
+      dout(10) << "mds_beacon " << *m
+               << " ignoring requested state, because mds hasn't seen latest map" << dendl;
+      goto reply;
+    }
+  }
+
+  if (info.laggy()) {
+    _note_beacon(m);
+    return false;  // no longer laggy, need to update map.
+  }
+  if (state == MDSMap::STATE_BOOT) {
+    // ignore, already booted.
+    goto ignore;
+  }
+
+  // did the join_fscid change
+  if (m->get_fs().size()) {
+    // resolve the filesystem name from the beacon; an unknown name maps to
+    // FS_CLUSTER_ID_NONE
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    auto f = fsmap.get_filesystem(m->get_fs());
+    if (f) {
+      fscid = f->fscid;
+    }
+    if (info.join_fscid != fscid) {
+      dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
+               << " (" << m->get_fs() << ")" << dendl;
+      _note_beacon(m);
+      return false;
+    }
+  } else {
+    if (info.join_fscid != FS_CLUSTER_ID_NONE) {
+      dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
+      _note_beacon(m);
+      return false;
+    }
+  }
+
+  // is there a state change here?
+  if (info.state != state) {
+    _note_beacon(m);
+    return false;
+  }
+
+  // Comparing known daemon health with m->get_health()
+  // and return false (i.e. require proposal) if they
+  // do not match, to update our stored
+  // NOTE(review): operator[] on pending_daemon_health inserts a default
+  // entry for gid when none exists -- confirm that modifying the pending
+  // state from this read-only path is intended.
+  if (!(pending_daemon_health[gid] == m->get_health())) {
+    dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
+    _note_beacon(m);
+    return false;
+  }
+
+ reply:
+  // note time and reply
+  ceph_assert(effective_epoch > 0);
+  _note_beacon(m);
+  {
+    auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
+        m->get_global_id(), m->get_name(), effective_epoch,
+        state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    mon.send_reply(op, beacon.detach());
+  }
+  return true;
+
+ ignore:
+  // I won't reply this beacon, drop it.
+  mon.no_reply(op);
+  return true;
+}
+
+bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
+{
+  // Read-only screening of an MMDSLoadTargets message.  Returns true after
+  // dropping the message (no reply is sent); returns false when none of the
+  // drop conditions below apply.
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMDSLoadTargets>();
+  dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
+
+  const auto &fsmap = get_fsmap();
+
+  // The message is dropped when there is no session, when the session lacks
+  // the "mds" execute cap, or when the reported targets already match what
+  // the current FSMap records for this gid.
+  bool drop = false;
+  MonSession *session = op->get_session();
+  if (!session) {
+    drop = true;
+  } else if (!session->is_capable("mds", MON_CAP_X)) {
+    dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
+            << session->caps << dendl;
+    drop = true;
+  } else if (fsmap.gid_exists(m->global_id) &&
+             m->targets == fsmap.get_info_gid(m->global_id).export_targets) {
+    drop = true;
+  }
+
+  if (!drop) {
+    return false;
+  }
+
+  mon.no_reply(op);
+  return true;
+}
+
+
+bool MDSMonitor::prepare_update(MonOpRequestRef op)
+{
+  // Route an update-path message to the handler for its message type and
+  // return that handler's result.  Any other message type aborts.
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(7) << "prepare_update " << *m << dendl;
+
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MDS_BEACON) {
+    return prepare_beacon(op);
+  }
+
+  if (msg_type == MSG_MDS_OFFLOAD_TARGETS) {
+    return prepare_offload_targets(op);
+  }
+
+  if (msg_type != MSG_MON_COMMAND) {
+    ceph_abort();
+    return true;
+  }
+
+  try {
+    return prepare_command(op);
+  } catch (const bad_cmd_get& e) {
+    // bad_cmd_get is reported back to the sender as -EINVAL with its what() text
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+    return true;
+  }
+}
+
+bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
+{
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMDSBeacon>();
+  // Update path for an MDS beacon: applies it to the pending FSMap and defers the reply until the proposal finishes.
+  dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
+	   << " " << m->get_orig_source_addrs() << dendl;
+  entity_addrvec_t addrs = m->get_orig_source_addrs();
+  mds_gid_t gid = m->get_global_id();
+  MDSMap::DaemonState state = m->get_state();
+  version_t seq = m->get_seq();
+
+  auto &pending = get_pending_fsmap_writeable();
+
+  dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
+
+  // Calculate deltas of health metrics created and removed.
+  // Do this by type rather than MDSHealthMetric equality, because messages can
+  // change a lot when they include e.g. a number of items.
+  const auto &old_health = pending_daemon_health[gid].metrics;
+  const auto &new_health = m->get_health().metrics;
+
+  std::set<mds_metric_t> old_types;
+  for (const auto &i : old_health) {
+    old_types.insert(i.type);
+  }
+
+  std::set<mds_metric_t> new_types;
+  for (const auto &i : new_health) {
+    if (i.type == MDS_HEALTH_DUMMY) {
+      continue;
+    }
+    new_types.insert(i.type);
+  }
+  // Debug-log (level 10) every non-dummy metric whose type was not stored before
+  for (const auto &new_metric: new_health) {
+    if (new_metric.type == MDS_HEALTH_DUMMY) {
+      continue;
+    }
+    if (old_types.count(new_metric.type) == 0) {
+      dout(10) << "MDS health message (" << m->get_orig_source()
+	       << "): " << new_metric.sev << " " << new_metric.message << dendl;
+    }
+  }
+
+  // Log the disappearance of health messages to the cluster log at INFO
+  for (const auto &old_metric : old_health) {
+    if (new_types.count(old_metric.type) == 0) {
+      mon.clog->info() << "MDS health message cleared ("
+        << m->get_orig_source() << "): " << old_metric.message;
+    }
+  }
+
+  // Store health (this happens before gid is checked against the pending map below)
+  pending_daemon_health[gid] = m->get_health();
+
+  const auto& cs = m->get_compat();
+  if (state == MDSMap::STATE_BOOT) {
+    // Fail every daemon already in the pending map that uses this beacon's name
+    if (g_conf()->mds_enforce_unique_name) {
+      bool failed_mds = false;
+      while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;  // op was handed to C_RetryMessage; presumably re-dispatched once osdmon is writeable
+        }
+        const auto& existing_info = pending.get_info_gid(existing);
+        mon.clog->info() << existing_info.human_name() << " restarted";
+	fail_mds_gid(pending, existing);
+        failed_mds = true;
+      }
+      if (failed_mds) {
+        ceph_assert(mon.osdmon()->is_writeable());
+        request_proposal(mon.osdmon());
+      }
+    }
+
+    // Add this daemon to the map as a standby, unless its gid already has an entry
+    if (pending.mds_roles.count(gid) == 0) {
+      MDSMap::mds_info_t new_info;
+      new_info.global_id = gid;
+      new_info.name = m->get_name();
+      new_info.addrs = addrs;
+      new_info.mds_features = m->get_mds_features();
+      new_info.state = MDSMap::STATE_STANDBY;
+      new_info.state_seq = seq;
+      new_info.compat = cs;
+      if (m->get_fs().size()) {
+	fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+	auto f = pending.get_filesystem(m->get_fs());
+	if (f) {
+	  fscid = f->fscid;
+	}
+        new_info.join_fscid = fscid;
+      }
+      pending.insert(new_info);
+    }
+
+    // initialize the beacon timer
+    auto &beacon = last_beacon[gid];
+    beacon.stamp = mono_clock::now();
+    beacon.seq = seq;
+
+    update_metadata(m->get_global_id(), m->get_sys_info());
+  } else {
+    // state update (any beacon state other than BOOT)
+
+    if (!pending.gid_exists(gid)) {
+      /* gid has been removed from pending, send null map */
+      dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
+              << ceph_mds_state_name(state) << ")" << dendl;
+
+      /* We can't send an MDSMap this MDS was a part of because we no longer
+       * know which FS it was part of. Nor does this matter. Sending an empty
+       * MDSMap is sufficient for getting the MDS to respawn.
+       */
+      goto null;
+    }
+
+    const auto& info = pending.get_info_gid(gid);
+
+    // did the reported compat change? That's illegal! Fail the daemon.
+    if (cs.compare(info.compat) != 0) {
+      if (!mon.osdmon()->is_writeable()) {
+        mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+        return false;
+      }
+      mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
+      fail_mds_gid(pending, gid);
+      request_proposal(mon.osdmon());
+      return true;  // NOTE(review): unlike the evict path, no reply is queued here -- confirm intended
+    }
+
+    if (state == MDSMap::STATE_DNE) {
+      dout(1) << __func__ << ": DNE from " << info << dendl;
+      goto evict;
+    }
+
+    // legal state change? Illegal transitions get the daemon evicted.
+    if ((info.state == MDSMap::STATE_STANDBY && state != info.state) ||
+        (info.state == MDSMap::STATE_STANDBY_REPLAY && state != info.state && state != MDSMap::STATE_DAMAGED)) {
+      // Standby daemons should never modify their own state.
+      // Except that standby-replay can indicate the rank is damaged due to failure to replay.
+      // Reject any attempts to do so.
+      derr << "standby " << gid << " attempted to change state to "
+           << ceph_mds_state_name(state) << ", rejecting" << dendl;
+      goto evict;
+    } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
+               !MDSMap::state_transition_valid(info.state, state)) {
+      // Validate state transitions for daemons that hold a rank
+      derr << "daemon " << gid << " (rank " << info.rank << ") "
+           << "reported invalid state transition "
+           << ceph_mds_state_name(info.state) << " -> "
+           << ceph_mds_state_name(state) << dendl;
+      goto evict;
+    }
+
+    if (info.laggy()) {
+      dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
+      pending.modify_daemon(info.global_id, [](auto& info)
+        {
+          info.clear_laggy();
+        }
+      );
+    }
+
+    dout(5)  << "prepare_beacon mds." << info.rank
+	     << " " << ceph_mds_state_name(info.state)
+	     << " -> " << ceph_mds_state_name(state)
+	     << dendl;
+
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    if (m->get_fs().size()) {
+      auto f = pending.get_filesystem(m->get_fs());
+      if (f) {
+        fscid = f->fscid;
+      }
+    }
+    pending.modify_daemon(gid, [fscid](auto& info) {
+      info.join_fscid = fscid;
+    });
+
+    if (state == MDSMap::STATE_STOPPED) {
+      const auto fscid = pending.mds_roles.at(gid);
+      const auto &fs = pending.get_filesystem(fscid);
+
+      mon.clog->info() << info.human_name() << " finished "
+                        << "stopping rank " << info.rank << " in filesystem "
+                        << fs->mds_map.fs_name << " (now has "
+                        << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
+
+      auto erased = pending.stop(gid);  // gids returned by stop() get the same cleanup as gid itself
+      erased.push_back(gid);
+
+      for (const auto& erased_gid : erased) {
+        last_beacon.erase(erased_gid);
+        if (pending_daemon_health.count(erased_gid)) {
+          pending_daemon_health.erase(erased_gid);
+          pending_daemon_health_rm.insert(erased_gid);
+        }
+      }
+    } else if (state == MDSMap::STATE_DAMAGED) {
+      if (!mon.osdmon()->is_writeable()) {
+        dout(1) << __func__ << ": DAMAGED from rank " << info.rank
+                << " waiting for osdmon writeable to blocklist it" << dendl;
+        mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+        return false;
+      }
+
+      auto rank = info.rank;
+
+      // Record this MDS rank as damaged, so that other daemons
+      // won't try to run it.
+      dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
+
+      auto fs = pending.get_filesystem(gid);
+      auto rankgid = fs->mds_map.get_gid(rank);
+      auto rankinfo = pending.get_info_gid(rankgid);
+      auto followergid = fs->mds_map.get_standby_replay(rank);
+
+      ceph_assert(gid == rankgid || gid == followergid);  // sender is either the rank's gid or its standby-replay gid
+
+      utime_t until = ceph_clock_now();
+      until += g_conf().get_val<double>("mon_mds_blocklist_interval");
+      const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
+      if (followergid != MDS_GID_NONE) {
+        fail_mds_gid(pending, followergid);
+        last_beacon.erase(followergid);
+      }
+      request_proposal(mon.osdmon());
+      pending.damaged(rankgid, blocklist_epoch);
+      last_beacon.erase(rankgid);
+
+      /* MDS expects beacon reply back, so fall through to the common reply path below */
+    } else {
+      if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
+        const auto &fscid = pending.mds_roles.at(gid);
+        const auto &fs = pending.get_filesystem(fscid);
+        mon.clog->info() << info.human_name() << " is now active in "
+                          << "filesystem " << fs->mds_map.fs_name << " as rank "
+                          << info.rank;
+      }
+
+      // Made it through special cases and validations, record the
+      // daemon's reported state to the FSMap.
+      pending.modify_daemon(gid, [state, seq](auto& info) {
+        info.state = state;
+        info.state_seq = seq;
+      });
+    }
+  }
+
+  dout(5) << "prepare_beacon pending map now:" << dendl;
+  print_map(pending);
+  // After the proposal: r >= 0 replies via _updated(), -ECANCELED drops the op, anything else re-dispatches it
+  wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
+    if (r >= 0)
+      _updated(op);   // success
+    else if (r == -ECANCELED) {
+      mon.no_reply(op);
+    } else {
+      dispatch(op);        // try again
+    }
+  }));
+
+  return true;
+
+evict:  // fail the sending gid (needs a writeable osdmon), then answer with a null MDSMap
+  if (!mon.osdmon()->is_writeable()) {
+    dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
+    mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+    return false;
+  }
+
+  fail_mds_gid(pending, gid);
+  request_proposal(mon.osdmon());
+  dout(5) << __func__ << ": pending map now:" << dendl;
+  print_map(pending);
+
+  goto null;  // the label follows immediately; the jump only makes the flow explicit
+
+null:  // send a null MDSMap once the proposal finishes; any negative result re-dispatches the op
+  wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
+    if (r >= 0) {
+      auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+      mon.send_reply(op, m.detach());
+    } else {
+      dispatch(op);        // try again
+    }
+  }));
+
+  return true;
+}
+
+bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
+{
+  // Store the export targets reported by an MDS in the pending FSMap, but
+  // only when the sending gid holds a rank there.  The sender never gets a
+  // reply, and the function always returns true.
+  auto &pending = get_pending_fsmap_writeable();
+
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMDSLoadTargets>();
+  const mds_gid_t gid = m->global_id;
+
+  const bool has_rank = pending.gid_has_rank(gid);
+  if (!has_rank) {
+    dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
+  } else {
+    dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
+    pending.update_export_targets(gid, m->targets);
+  }
+
+  mon.no_reply(op);
+  return true;
+}
+
+bool MDSMonitor::should_propose(double& delay)
+{
+  // No MDSMonitor-specific policy: the PaxosService base class makes the
+  // decision, with 'delay' passed through to it by reference.
+  const bool propose = PaxosService::should_propose(delay);
+  return propose;
+}
+
+void MDSMonitor::_updated(MonOpRequestRef op)
+{
+  // Reply to a beacon: logs the sender's reported state to the cluster log,
+  // then answers with either a beacon ack or, for STOPPED, a null MDSMap.
+  const auto &fsmap = get_fsmap();
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMDSBeacon>();
+  dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
+  mon.clog->debug() << m->get_orig_source() << " "
+                    << m->get_orig_source_addrs() << " "
+                    << ceph_mds_state_name(m->get_state());
+
+  if (m->get_state() != MDSMap::STATE_STOPPED) {
+    // Ordinary case: echo the beacon back, stamped with the current fsmap epoch.
+    auto ack = make_message<MMDSBeacon>(mon.monmap->fsid,
+        m->get_global_id(), m->get_name(), fsmap.get_epoch(),
+        m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
+    mon.send_reply(op, ack.detach());
+    return;
+  }
+
+  // send the map manually (they're out of the map, so they won't get it automatic)
+  auto null_map = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
+  mon.send_reply(op, null_map.detach());
+}
+
+void MDSMonitor::on_active()
+{
+  // Run a tick immediately; the leader additionally writes the current
+  // fsmap to the cluster log at debug level.
+  tick();
+
+  if (!is_leader()) {
+    return;
+  }
+  mon.clog->debug() << "fsmap " << get_fsmap();
+}
+
+void MDSMonitor::dump_info(Formatter *f)
+{
+  // Emit the current fsmap as an "fsmap" object section, followed by the
+  // first and last committed versions of this service.
+  {
+    f->open_object_section("fsmap");
+    get_fsmap().dump(f);
+    f->close_section();
+  }
+
+  const auto first_committed = get_first_committed();
+  f->dump_unsigned("mdsmap_first_committed", first_committed);
+  const auto last_committed = get_last_committed();
+  f->dump_unsigned("mdsmap_last_committed", last_committed);
+}
+
+bool MDSMonitor::preprocess_command(MonOpRequestRef op)
+{
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMonCommand>();
+  int r = -1;  // stays -1 when no prefix branch below matches; see the check at out:
+  bufferlist rdata;
+  stringstream ss, ds;
+  // Answers the query commands matched by prefix below from a filtered copy of the FSMap; ss carries status text, ds the payload.
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  std::unique_ptr<Formatter> f(Formatter::create(format));  // may be null; branches below test f before using it
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  // Filter a private copy of the FSMap by the session's allowed fs names, then expose it as const
+  FSMap _fsmap_copy = get_fsmap();
+  _fsmap_copy.filter(session->get_allowed_fs_names());
+  const auto& fsmap = _fsmap_copy;
+
+  if (prefix == "mds stat") {
+    if (f) {
+      f->open_object_section("mds_stat");
+      dump_info(f.get());  // NOTE(review): dump_info() dumps get_fsmap(), not the filtered copy printed in the else branch -- confirm intended
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << fsmap;
+    }
+    r = 0;
+  } else if (prefix == "mds ok-to-stop") {
+    vector<string> ids;
+    if (!cmd_getval(cmdmap, "ids", ids)) {
+      r = -EINVAL;
+      ss << "must specify mds id";
+      goto out;
+    }
+    if (fsmap.is_any_degraded()) {
+      ss << "one or more filesystems is currently degraded";
+      r = -EBUSY;
+      goto out;
+    }
+    set<mds_gid_t> stopping;
+    for (auto& id : ids) {
+      ostringstream ess;
+      mds_gid_t gid = gid_from_arg(fsmap, id, ess);
+      if (gid == MDS_GID_NONE) {
+	// the mds doesn't exist, but no file systems are unhappy, so losing it
+	// can't have any effect.
+	continue;
+      }
+      stopping.insert(gid);
+    }
+    set<mds_gid_t> active;
+    set<mds_gid_t> standby;
+    for (auto gid : stopping) {
+      if (fsmap.gid_has_rank(gid)) {
+	// ignore standby-replay daemons (at this level)
+	if (!fsmap.is_standby_replay(gid)) {
+	  auto standby = fsmap.get_standby_replay(gid);  // a gid; shadows the outer 'standby' set within this block
+	  if (standby == MDS_GID_NONE ||
+	      stopping.count(standby)) {
+	    // no standby-replay, or we're also stopping the standby-replay
+	    // for this mds
+	    active.insert(gid);
+	  }
+	}
+      } else {
+	// net loss of a standby
+	standby.insert(gid);
+      }
+    }
+    if (fsmap.get_num_standby() - standby.size() < active.size()) {  // standbys left over would not cover the actives being stopped
+      r = -EBUSY;
+      ss << "insufficent standby MDS daemons to stop active gids "
+	 << stringify(active)
+	 << " and/or standby gids " << stringify(standby);;
+      goto out;
+    }
+    r = 0;
+    ss << "should be safe to stop " << ids;
+  } else if (prefix == "fs dump") {
+    int64_t epocharg;
+    epoch_t epoch;
+
+    const FSMap *fsmapp = &fsmap;  // defaults to the filtered current map
+    FSMap dummy;
+    if (cmd_getval(cmdmap, "epoch", epocharg)) {
+      epoch = epocharg;
+      bufferlist b;
+      int err = get_version(epoch, b);
+      if (err == -ENOENT) {
+	r = -ENOENT;
+        goto out;
+      } else {
+	ceph_assert(err == 0);
+	ceph_assert(b.length());
+	dummy.decode(b);
+        fsmapp = &dummy;  // decoded historical map; filter() is not applied to it here
+      }
+    }
+
+    stringstream ds;  // shadows the function-level ds; appended to rdata below, so the outer ds stays empty here
+    if (f != NULL) {
+      f->open_object_section("fsmap");
+      fsmapp->dump(f.get());
+      f->close_section();
+      f->flush(ds);
+      r = 0;
+    } else {
+      fsmapp->print(ds);
+      r = 0;
+    }
+
+    rdata.append(ds);
+    ss << "dumped fsmap epoch " << fsmapp->get_epoch();
+  } else if (prefix == "mds metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+
+    string who;
+    bool all = !cmd_getval(cmdmap, "who", who);
+    dout(1) << "all = " << all << dendl;
+    if (all) {
+      r = 0;
+      // Dump all MDSs' metadata
+      const auto all_info = fsmap.get_mds_info();
+
+      f->open_array_section("mds_metadata");
+      for(const auto &i : all_info) {
+        const auto &info = i.second;
+
+        f->open_object_section("mds");
+        f->dump_string("name", info.name);
+        std::ostringstream get_err;
+        r = dump_metadata(fsmap, info.name, f.get(), get_err);
+        if (r == -EINVAL || r == -ENOENT) {
+          // Drop error, list what metadata we do have
+          dout(1) << get_err.str() << dendl;
+          r = 0;
+        } else if (r != 0) {
+          derr << "Unexpected error reading metadata: " << cpp_strerror(r)
+               << dendl;
+          ss << get_err.str();
+          f->close_section();
+          break;
+        }
+        f->close_section();
+      }
+      f->close_section();
+    } else {
+      // Dump a single daemon's metadata
+      f->open_object_section("mds_metadata");
+      r = dump_metadata(fsmap, who, f.get(), ss);
+      f->close_section();
+    }
+    f->flush(ds);
+  } else if (prefix == "mds versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    count_metadata("ceph_version", f.get());
+    f->flush(ds);
+    r = 0;
+  } else if (prefix == "mds count-metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    string field;
+    cmd_getval(cmdmap, "property", field);
+    count_metadata(field, f.get());
+    f->flush(ds);
+    r = 0;
+  } else if (prefix == "fs compat show") {
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    const auto &fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      ss << "filesystem '" << fs_name << "' not found";
+      r = -ENOENT;
+      goto out;
+    }
+
+    if (f) {
+      f->open_object_section("mds_compat");
+      fs->mds_map.compat.dump(f.get());
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << fs->mds_map.compat;
+    }
+    r = 0;
+  } else if (prefix == "mds compat show") {
+      if (f) {
+	f->open_object_section("mds_compat");
+	fsmap.default_compat.dump(f.get());
+	f->close_section();
+	f->flush(ds);
+      } else {
+	ds << fsmap.default_compat;
+      }
+      r = 0;
+  } else if (prefix == "fs get") {
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    const auto &fs = fsmap.get_filesystem(fs_name);
+    if (fs == nullptr) {
+      ss << "filesystem '" << fs_name << "' not found";
+      r = -ENOENT;
+    } else {
+      if (f != nullptr) {
+        f->open_object_section("filesystem");
+        fs->dump(f.get());
+        f->close_section();
+        f->flush(ds);
+        r = 0;
+      } else {
+        fs->print(ds);
+        r = 0;
+      }
+    }
+  } else if (prefix == "fs ls") {
+    if (f) {
+      f->open_array_section("filesystems");
+      for (const auto &p : fsmap.filesystems) {
+        const auto &fs = p.second;
+        f->open_object_section("filesystem");
+        {
+          const MDSMap &mds_map = fs->mds_map;
+          f->dump_string("name", mds_map.fs_name);
+          /* Output both the names and IDs of pools, for use by
+           * humans and machines respectively */
+          f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
+                mds_map.metadata_pool));
+          f->dump_int("metadata_pool_id", mds_map.metadata_pool);
+          f->open_array_section("data_pool_ids");
+          for (const auto &id : mds_map.data_pools) {
+            f->dump_int("data_pool_id", id);
+          }
+          f->close_section();
+
+          f->open_array_section("data_pools");
+          for (const auto &id : mds_map.data_pools) {
+            const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
+            f->dump_string("data_pool", name);
+          }
+          f->close_section();
+        }
+        f->close_section();
+      }
+      f->close_section();
+      f->flush(ds);
+    } else {
+      for (const auto &p : fsmap.filesystems) {
+        const auto &fs = p.second;
+        const MDSMap &mds_map = fs->mds_map;
+        const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
+            mds_map.metadata_pool);
+        
+        ds << "name: " << mds_map.fs_name << ", metadata pool: "
+           << md_pool_name << ", data pools: [";
+        for (const auto &id : mds_map.data_pools) {
+          const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
+          ds << pool_name << " ";
+        }
+        ds << "]" << std::endl;
+      }
+
+      if (fsmap.filesystems.empty()) {
+        ds << "No filesystems enabled" << std::endl;
+      }
+    }
+    r = 0;
+  } else if (prefix == "fs feature ls") {
+    if (f) {
+      f->open_array_section("cephfs_features");
+      for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
+	f->open_object_section("feature");
+	f->dump_int("index", i);
+	f->dump_string("name", cephfs_feature_name(i));
+	f->close_section();
+      }
+      f->close_section();
+      f->flush(ds);
+    } else {
+      for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
+        ds << i << " " << cephfs_feature_name(i) << std::endl;
+      }
+    }
+    r = 0;
+  }
+
+out:
+  if (r != -1) {  // some branch set a result (success or error): reply now
+    rdata.append(ds);
+    string rs;
+    getline(ss, rs);  // only the first line of ss is used as the status string
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return true;
+  } else
+    return false;  // prefix not handled here; no reply sent (presumably left for the update path -- confirm against caller)
+}
+
+bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
+{
+  // Remove 'gid' from 'fsmap' and drop its beacon and health bookkeeping.
+  // A daemon that holds a rank and is not in standby-replay has its addrs
+  // blocklisted through the OSD monitor first.  The caller must make sure the
+  // OSD monitor is writeable (asserted).  Returns true iff a non-zero
+  // blocklist epoch was obtained.
+  const auto& info = fsmap.get_info_gid(gid);
+  dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
+
+  ceph_assert(mon.osdmon()->is_writeable());
+
+  const bool needs_blocklist =
+    info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY;
+
+  epoch_t blocklist_epoch = 0;
+  if (needs_blocklist) {
+    utime_t until = ceph_clock_now();
+    until += g_conf().get_val<double>("mon_mds_blocklist_interval");
+    blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
+  }
+
+  // 'info' refers into fsmap and must not be used after the erase below.
+  fsmap.erase(gid, blocklist_epoch);
+  last_beacon.erase(gid);
+
+  const bool had_health = pending_daemon_health.count(gid) != 0;
+  if (had_health) {
+    pending_daemon_health.erase(gid);
+    pending_daemon_health_rm.insert(gid);
+  }
+
+  return blocklist_epoch != 0;
+}
+
+mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
+{
+  // Resolve 'arg' to a GID.  It is tried as a role first; if that does not
+  // yield an up rank, it is tried as an integer GID; only when it does not
+  // parse as an integer is it looked up as a daemon name.  Returns
+  // MDS_GID_NONE when nothing matches; 'ss' is written only for a failed
+  // name lookup.
+  mds_role_t role;
+  std::ostringstream ignore_err;  // Don't spam 'ss' with parse_role errors
+  if (fsmap.parse_role(arg, &role, ignore_err) == 0) {
+    // See if a GID is assigned to this role
+    const auto &fs = fsmap.get_filesystem(role.fscid);
+    ceph_assert(fs != nullptr);  // parse_role ensures it exists
+    if (fs->mds_map.is_up(role.rank)) {
+      dout(10) << __func__ << ": validated rank/GID " << role
+               << " as a rank" << dendl;
+      return fs->mds_map.get_mds_info(role.rank).global_id;
+    }
+  }
+
+  // Try parsing as a gid
+  std::string err;
+  unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
+
+  if (err.empty()) {
+    // Not a role, but parses as an integer, might be a GID
+    dout(10) << __func__ << ": treating MDS reference '" << arg
+             << "' as an integer " << maybe_gid << dendl;
+
+    const mds_gid_t candidate(maybe_gid);
+    if (fsmap.gid_exists(candidate)) {
+      return candidate;
+    }
+
+    dout(1) << __func__ << ": rank/GID " << arg
+            << " not a existent rank or GID" << dendl;
+    return MDS_GID_NONE;
+  }
+
+  // Not a role or a GID, try as a daemon name
+  const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
+  if (mds_info == nullptr) {
+    ss << "MDS named '" << arg
+       << "' does not exist, or is not up";
+    return MDS_GID_NONE;
+  }
+
+  dout(10) << __func__ << ": resolved MDS name '" << arg
+           << "' to GID " << mds_info->global_id << dendl;
+  return mds_info->global_id;
+}
+
+int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
+    const std::string &arg, MDSMap::mds_info_t *failed_info)
+{
+  // Resolve 'arg' to a GID and fail that daemon in 'fsmap'.
+  //   returns 0        when the daemon was failed, or when 'arg' matched nothing
+  //                    (in which case *failed_info is left untouched)
+  //   returns -EAGAIN  when the OSD monitor is not writeable
+  ceph_assert(failed_info != nullptr);
+
+  int result = 0;
+  const mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
+
+  if (gid != MDS_GID_NONE) {
+    if (mon.osdmon()->is_writeable()) {
+      // Take a copy of the info before removing the MDS from the map,
+      // so that the caller knows which mds (if any) they ended up removing.
+      *failed_info = fsmap.get_info_gid(gid);
+
+      fail_mds_gid(fsmap, gid);
+      ss << "failed mds gid " << gid;
+      ceph_assert(mon.osdmon()->is_writeable());
+      request_proposal(mon.osdmon());
+    } else {
+      result = -EAGAIN;
+    }
+  }
+
+  return result;
+}
+
+bool MDSMonitor::prepare_command(MonOpRequestRef op)
+{
+  // Update-path entry point for MMonCommand.  The command is offered to each
+  // registered handler in turn and, if none claims it, to
+  // filesystem_command().  A non-negative result delays the reply until the
+  // proposal finishes (returns true); a negative result is replied to
+  // immediately (returns false); -EAGAIN means the op was queued for retry,
+  // so nothing is replied (returns false).
+  op->mark_mdsmon_event(__func__);
+  auto m = op->get_req<MMonCommand>();
+  int r = -EINVAL;
+  stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  const bool parsed = cmdmap_from_json(m->cmd, &cmdmap, ss);
+  if (!parsed) {
+    string reason = ss.str();
+    mon.reply_command(op, -EINVAL, reason, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  /* Refuse access if message not associated with a valid session */
+  MonSession *session = op->get_session();
+  if (session == nullptr) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  auto &pending = get_pending_fsmap_writeable();
+
+  bool batched_propose = false;
+  bool resolved = false;  // true once a handler has produced the final r
+
+  for (const auto &h : handlers) {
+    r = h->can_handle(prefix, op, pending, cmdmap, ss);
+    if (r == 0) {
+      // not this handler's command, try the next one
+      continue;
+    }
+
+    if (r == 1) {
+      // we got the right handler: run it, plugging paxos if it asks for it
+      batched_propose = h->batched_propose();
+      if (batched_propose) {
+        paxos.plug();
+      }
+      r = h->handle(&mon, pending, op, cmdmap, ss);
+      if (batched_propose) {
+        paxos.unplug();
+      }
+
+      if (r == -EAGAIN) {
+        // message has been enqueued for retry; return.
+        dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
+        return false;
+      }
+      if (r == 0) {
+        // On successful updates, print the updated map
+        print_map(pending);
+      }
+    }
+
+    // Either can_handle() returned something other than 0/1, or the handler
+    // ran; successful or not, we're done: respond.
+    resolved = true;
+    break;
+  }
+
+  if (!resolved) {
+    r = filesystem_command(pending, op, prefix, cmdmap, ss);
+    if (r == -EAGAIN) {
+      // Do not reply, the message has been enqueued for retry
+      dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
+      return false;
+    }
+    if (r == -ENOSYS && ss.str().empty()) {
+      ss << "unrecognized command";
+    }
+  }
+
+  dout(4) << __func__ << " done, r=" << r << dendl;
+  /* Compose response */
+  string rs;
+  getline(ss, rs);
+
+  if (r < 0) {
+    // reply immediately
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return false;
+  }
+
+  // success.. delay reply
+  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
+                                                        get_last_committed() + 1));
+  if (batched_propose) {
+    force_immediate_propose();
+  }
+  return true;
+}
+
+int MDSMonitor::filesystem_command(
+    FSMap &fsmap,
+    MonOpRequestRef op,
+    std::string const &prefix,
+    const cmdmap_t& cmdmap,
+    std::stringstream &ss)
+{
+  dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
+  op->mark_mdsmon_event(__func__);
+  int r = 0;
+  string whostr;
+  cmd_getval(cmdmap, "role", whostr);
+
+  if (prefix == "mds set_state") {
+    mds_gid_t gid;
+    if (!cmd_getval(cmdmap, "gid", gid)) {
+      ss << "error parsing 'gid' value '"
+         << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+      return -EINVAL;
+    }
+    MDSMap::DaemonState state;
+    if (!cmd_getval(cmdmap, "state", state)) {
+      ss << "error parsing 'state' string value '"
+         << cmd_vartype_stringify(cmdmap.at("state")) << "'";
+      return -EINVAL;
+    }
+    if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      fsmap.modify_daemon(gid, [state](auto& info) {
+        info.state = state;
+      });
+      ss << "set mds gid " << gid << " to state " << state << " "
+         << ceph_mds_state_name(state);
+      return 0;
+    }
+  } else if (prefix == "mds fail") {
+    string who;
+    cmd_getval(cmdmap, "role_or_gid", who);
+
+    MDSMap::mds_info_t failed_info;
+    mds_gid_t gid = gid_from_arg(fsmap, who, ss);
+    if (gid == MDS_GID_NONE) {
+      ss << "MDS named '" << who << "' does not exist, is not up or you "
+	 << "lack the permission to see.";
+      return 0;
+    }
+    if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      ss << "MDS named '" << who << "' does not exist, is not up or you "
+	 << "lack the permission to see.";
+      return -EINVAL;
+    }
+    string_view fs_name = fsmap.fs_name_from_gid(gid);
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+
+    r = fail_mds(fsmap, ss, who, &failed_info);
+    if (r < 0 && r == -EAGAIN) {
+      mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return -EAGAIN; // don't propose yet; wait for message to be retried
+    } else if (r == 0) {
+      // Only log if we really did something (not when was already gone)
+      if (failed_info.global_id != MDS_GID_NONE) {
+        mon.clog->info() << failed_info.human_name() << " marked failed by "
+                          << op->get_session()->entity_name;
+      }
+    }
+  } else if (prefix == "mds rm") {
+    mds_gid_t gid;
+    if (!cmd_getval(cmdmap, "gid", gid)) {
+      ss << "error parsing 'gid' value '"
+         << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+      return -EINVAL;
+    }
+    if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      ss << "mds gid " << gid << " does not exist";
+      return 0;
+    }
+    string_view fs_name = fsmap.fs_name_from_gid(gid);
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+    const auto &info = fsmap.get_info_gid(gid);
+    MDSMap::DaemonState state = info.state;
+    if (state > 0) {
+    ss << "cannot remove active mds." << info.name
+	<< " rank " << info.rank;
+    return -EBUSY;
+    } else {
+    fsmap.erase(gid, {});
+    ss << "removed mds gid " << gid;
+    return 0;
+    }
+  } else if (prefix == "mds rmfailed") {
+    bool confirm = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+    if (!confirm) {
+         ss << "WARNING: this can make your filesystem inaccessible! "
+               "Add --yes-i-really-mean-it if you are sure you wish to continue.";
+         return -EPERM;
+    }
+    
+    std::string role_str;
+    cmd_getval(cmdmap, "role", role_str);
+    mds_role_t role;
+    const auto fs_names = op->get_session()->get_allowed_fs_names();
+    int r = fsmap.parse_role(role_str, &role, ss, fs_names);
+    if (r < 0) {
+      ss << "invalid role '" << role_str << "'";
+      return -EINVAL;
+    }
+    string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+
+    fsmap.modify_filesystem(
+        role.fscid,
+        [role](std::shared_ptr<Filesystem> fs)
+    {
+      fs->mds_map.failed.erase(role.rank);
+    });
+
+    ss << "removed failed mds." << role;
+    return 0;
+    /* TODO: convert to fs commands to update defaults */
+  } else if (prefix == "mds compat rm_compat") {
+    int64_t f;
+    if (!cmd_getval(cmdmap, "feature", f)) {
+      ss << "error parsing feature value '"
+         << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
+      return -EINVAL;
+    }
+    if (fsmap.default_compat.compat.contains(f)) {
+      ss << "removing compat feature " << f;
+      fsmap.default_compat.compat.remove(f);
+    } else {
+      ss << "compat feature " << f << " not present in " << fsmap.default_compat;
+    }
+    r = 0;
+  } else if (prefix == "mds compat rm_incompat") {
+    int64_t f;
+    if (!cmd_getval(cmdmap, "feature", f)) {
+      ss << "error parsing feature value '"
+         << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
+      return -EINVAL;
+    }
+    if (fsmap.default_compat.incompat.contains(f)) {
+      ss << "removing incompat feature " << f;
+      fsmap.default_compat.incompat.remove(f);
+    } else {
+      ss << "incompat feature " << f << " not present in " << fsmap.default_compat;
+    }
+    r = 0;
+  } else if (prefix == "mds repaired") {
+    std::string role_str;
+    cmd_getval(cmdmap, "role", role_str);
+    mds_role_t role;
+    const auto fs_names = op->get_session()->get_allowed_fs_names();
+    r = fsmap.parse_role(role_str, &role, ss, fs_names);
+    if (r < 0) {
+      return r;
+    }
+    string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+
+    bool modified = fsmap.undamaged(role.fscid, role.rank);
+    if (modified) {
+      ss << "repaired: restoring rank " << role;
+    } else {
+      ss << "nothing to do: rank is not damaged";
+    }
+
+    r = 0;
+  } else if (prefix == "mds freeze") {
+    std::string who;
+    cmd_getval(cmdmap, "role_or_gid", who);
+    mds_gid_t gid = gid_from_arg(fsmap, who, ss);
+    if (gid == MDS_GID_NONE) {
+      return -EINVAL;
+    }
+
+    string_view fs_name = fsmap.fs_name_from_gid(gid);
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+
+    bool freeze = false;
+    {
+      std::string str;
+      cmd_getval(cmdmap, "val", str);
+      if ((r = parse_bool(str, &freeze, ss)) != 0) {
+        return r;
+      }
+    }
+
+    auto f = [freeze,gid,&ss](auto& info) {
+      if (freeze) {
+        ss << "freezing mds." << gid;
+        info.freeze();
+      } else {
+        ss << "unfreezing mds." << gid;
+        info.unfreeze();
+      }
+    };
+    fsmap.modify_daemon(gid, f);
+    r = 0;
+  } else {
+    return -ENOSYS;
+  }
+
+  return r;
+}
+
+void MDSMonitor::check_subs()
+{
+  // We service subscriptions to "fsmap" (the full state of all filesystems),
+  // "fsmap.user", "mdsmap" (MDS daemons and legacy clients) and one
+  // "mdsmap.<fscid>" per filesystem.  Collect every such type name first.
+  std::vector<std::string> sub_types = {"fsmap", "fsmap.user", "mdsmap"};
+  for (const auto &entry : get_fsmap().filesystems) {
+    CachedStackStringStream name;
+    *name << "mdsmap." << entry.first;
+    sub_types.push_back(std::string(name->strv()));
+  }
+
+  // Walk the subscribers registered under each of those types.
+  auto& subs_by_type = mon.session_map.subs;
+  for (const auto &type : sub_types) {
+    auto found = subs_by_type.find(type);
+    if (found == subs_by_type.end()) {
+      continue;
+    }
+    for (auto cursor = found->second->begin(); !cursor.end(); ) {
+      auto current = *cursor;
+      ++cursor;  // step past it first: check_sub may remove the subscription
+      check_sub(current);
+    }
+  }
+}
+
+
+/**
+ * Service one subscription.
+ *
+ * Depending on sub->type ("fsmap", "fsmap.user", or "mdsmap" optionally
+ * followed by ".<fscid>"), send the subscriber the corresponding map if its
+ * epoch is at least sub->next.  After sending, a one-time subscription is
+ * removed from the session map; otherwise sub->next is advanced to one past
+ * the epoch that was just sent.
+ *
+ * All maps are taken from a copy of the current FSMap that has been filtered
+ * by the filesystem names the subscriber's session is allowed to see.
+ *
+ * @param sub the subscription to service; may be removed by this call
+ */
+void MDSMonitor::check_sub(Subscription *sub)
+{
+  dout(20) << __func__ << ": " << sub->type << dendl;
+
+  // Filter a private copy of the FSMap up front so that everything below can
+  // work through a const reference.
+  FSMap _fsmap_copy = get_fsmap();
+  _fsmap_copy.filter(sub->session->get_allowed_fs_names());
+  const auto& fsmap = _fsmap_copy;
+  // The subscriber already has everything up to the current FSMap epoch.
+  if (sub->next > fsmap.get_epoch()) {
+    return;
+  }
+
+  if (sub->type == "fsmap") {
+    sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
+    if (sub->onetime) {
+      mon.session_map.remove_sub(sub);
+    } else {
+      sub->next = fsmap.get_epoch() + 1;
+    }
+  } else if (sub->type == "fsmap.user") {
+    // Build the reduced FSMapUser: epoch, legacy client fscid, and the
+    // id/name of each filesystem.
+    FSMapUser fsmap_u;
+    fsmap_u.epoch = fsmap.get_epoch();
+    fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
+    for (const auto &p : fsmap.filesystems) {
+      FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
+      fs_info.cid = p.second->fscid;
+      fs_info.name = p.second->mds_map.fs_name;
+    }
+    sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
+    if (sub->onetime) {
+      mon.session_map.remove_sub(sub);
+    } else {
+      sub->next = fsmap.get_epoch() + 1;
+    }
+  } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
+    // Matches both "mdsmap" and "mdsmap.<fscid>".
+    const bool is_mds = sub->session->name.is_mds();
+    mds_gid_t mds_gid = MDS_GID_NONE;
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    if (is_mds) {
+      // What (if any) namespace are you assigned to?
+      // The daemon is identified by matching the session's addresses.
+      auto mds_info = fsmap.get_mds_info();
+      for (const auto &p : mds_info) {
+        if (p.second.addrs == sub->session->addrs) {
+          mds_gid = p.first;
+          fscid = fsmap.mds_roles.at(mds_gid);
+        }
+      }
+    } else {
+      // You're a client.  Did you request a particular
+      // namespace?
+      if (sub->type.compare(0, 7, "mdsmap.") == 0) {
+        auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
+        dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
+        std::string err;
+        fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
+        if (!err.empty()) {
+          // The namespace id could not be parsed as a number, send them nothing
+          dout(1) << "Invalid client subscription '" << sub->type
+                  << "'" << dendl;
+          return;
+        }
+      } else {
+        // Unqualified request for "mdsmap": give it the one marked
+        // for use by legacy clients.
+        if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+          fscid = fsmap.legacy_client_fscid;
+        } else {
+          dout(1) << "Client subscribed for legacy filesystem but "
+                     "none is configured" << dendl;
+          return;
+        }
+      }
+      if (!fsmap.filesystem_exists(fscid)) {
+        // Client asked for a non-existent namespace, send them nothing
+        // TODO: something more graceful for when a client has a filesystem
+        // mounted, and the filesystem is deleted.  Add a "shut down you fool"
+        // flag to MMDSMap?
+        dout(1) << "Client subscribed to non-existent namespace '" <<
+                fscid << "'" << dendl;
+        return;
+      }
+    }
+    dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
+
+    // Work out the effective latest epoch
+    const MDSMap *mds_map = nullptr;
+    MDSMap null_map = MDSMap::create_null_mdsmap();
+    if (fscid == FS_CLUSTER_ID_NONE) {
+      // For a client, we should have already dropped out
+      ceph_assert(is_mds);
+
+      auto it = fsmap.standby_daemons.find(mds_gid);
+      if (it != fsmap.standby_daemons.end()) {
+        // For an MDS, we need to feed it an MDSMap with its own state in
+        null_map.mds_info[mds_gid] = it->second;
+        null_map.epoch = fsmap.standby_epochs.at(mds_gid);
+      } else {
+        // Not a known standby: send the null map stamped with the FSMap epoch.
+        null_map.epoch = fsmap.epoch;
+      }
+      mds_map = &null_map;
+    } else {
+      // Use the MDSMap (and thus the epoch) of the selected filesystem
+      mds_map = &fsmap.get_filesystem(fscid)->mds_map;
+    }
+
+    ceph_assert(mds_map != nullptr);
+    dout(10) << __func__ << " selected MDS map epoch " <<
+      mds_map->epoch << " for namespace " << fscid << " for subscriber "
+      << sub->session->name << " who wants epoch " << sub->next << dendl;
+
+    // Nothing newer than what the subscriber asked for.
+    if (sub->next > mds_map->epoch) {
+      return;
+    }
+    auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map,
+			             mds_map->fs_name);
+
+    sub->session->con->send_message(msg.detach());
+    if (sub->onetime) {
+      mon.session_map.remove_sub(sub);
+    } else {
+      sub->next = mds_map->get_epoch() + 1;
+    }
+  }
+}
+
+
+void MDSMonitor::update_metadata(mds_gid_t gid,
+				 const map<string, string>& metadata)
+{
+  // An empty report leaves the recorded metadata untouched.
+  if (!metadata.empty()) {
+    pending_metadata[gid] = metadata;
+
+    // Store the complete pending table under "last_metadata" in the pending
+    // paxos transaction and trigger a proposal.
+    MonitorDBStore::TransactionRef txn = paxos.get_pending_transaction();
+    bufferlist encoded;
+    encode(pending_metadata, encoded);
+    txn->put(MDS_METADATA_PREFIX, "last_metadata", encoded);
+    paxos.trigger_propose();
+  }
+}
+
+void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
+{
+  // Drop the pending metadata of every GID that fsmap no longer knows about.
+  bool pruned = false;
+  auto entry = pending_metadata.begin();
+  while (entry != pending_metadata.end()) {
+    if (fsmap.gid_exists(entry->first)) {
+      ++entry;
+      continue;
+    }
+    entry = pending_metadata.erase(entry);
+    pruned = true;
+  }
+
+  // Only rewrite "last_metadata" when something was actually removed.
+  if (pruned) {
+    bufferlist encoded;
+    encode(pending_metadata, encoded);
+    t->put(MDS_METADATA_PREFIX, "last_metadata", encoded);
+  }
+}
+
+int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
+{
+  // Read the "last_metadata" blob from the monitor store and decode it
+  // into m.  Returns 0 on success, or the store's error code otherwise.
+  bufferlist stored;
+  const int rc = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", stored);
+  if (rc != 0) {
+    dout(5) << "Unable to load 'last_metadata'" << dendl;
+    return rc;
+  }
+
+  auto cursor = stored.cbegin();
+  ceph::decode(m, cursor);
+  return 0;
+}
+
+void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
+{
+  // Tally, over all stored daemon metadata, how often each value of `field`
+  // occurs; daemons without that field are counted under "unknown".
+  map<mds_gid_t,Metadata> meta;
+  load_metadata(meta);
+  for (auto& entry : meta) {
+    auto& daemon_meta = entry.second;
+    auto value = daemon_meta.find(field);
+    if (value != daemon_meta.end()) {
+      ++(*out)[value->second];
+    } else {
+      ++(*out)["unknown"];
+    }
+  }
+}
+
+void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
+{
+  // Emit the per-value tally of `field` as an object section named after it.
+  map<string,int> tally;
+  count_metadata(field, &tally);
+
+  f->open_object_section(field.c_str());
+  for (auto& [value, occurrences] : tally) {
+    f->dump_int(value.c_str(), occurrences);
+  }
+  f->close_section();
+}
+
+void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
+{
+  // Group daemon names ("mds.<name>") by their "ceph_version_short" metadata.
+  map<mds_gid_t,Metadata> meta;
+  load_metadata(meta);
+  const auto &fsmap = get_fsmap();
+  std::map<mds_gid_t, mds_info_t> daemon_infos = fsmap.get_mds_info();
+  dout(10) << __func__ << " mds meta=" << meta << dendl;
+  for (auto& [gid, daemon_meta] : meta) {
+    auto version = daemon_meta.find("ceph_version_short");
+    if (version != daemon_meta.end()) {
+      // operator[] on purpose: a GID missing from the FSMap yields a
+      // default-constructed info.
+      versions[version->second].push_back(string("mds.") + daemon_infos[gid].name);
+    }
+  }
+}
+
+int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
+    Formatter *f, ostream& err)
+{
+  // Dump the stored metadata of the daemon designated by `who` as strings.
+  // Returns -EINVAL if `who` cannot be resolved to a GID, the load error if
+  // the metadata cannot be read, and -ENOENT if nothing is stored for it.
+  ceph_assert(f);
+
+  const mds_gid_t gid = gid_from_arg(fsmap, who, err);
+  if (gid == MDS_GID_NONE) {
+    return -EINVAL;
+  }
+
+  map<mds_gid_t, Metadata> metadata;
+  const int rc = load_metadata(metadata);
+  if (rc) {
+    err << "Unable to load 'last_metadata'";
+    return rc;
+  }
+
+  const auto found = metadata.find(gid);
+  if (found == metadata.end()) {
+    return -ENOENT;
+  }
+  for (const auto& [key, value] : found->second) {
+    f->dump_string(key.c_str(), value);
+  }
+  return 0;
+}
+
+int MDSMonitor::print_nodes(Formatter *f)
+{
+  // Dump, per hostname, the names of the MDS daemons that are both present
+  // in the stored metadata and still exist in the current FSMap.
+  ceph_assert(f);
+
+  const auto &fsmap = get_fsmap();
+
+  map<mds_gid_t, Metadata> metadata;
+  const int rc = load_metadata(metadata);
+  if (rc) {
+    return rc;
+  }
+
+  map<string, list<string> > by_host; // hostname => mds names
+  for (const auto &[gid, daemon_meta] : metadata) {
+    const auto host = daemon_meta.find("hostname");
+    if (host == daemon_meta.end()) {
+      // no hostname recorded; not likely though
+      continue;
+    }
+    if (!fsmap.gid_exists(gid)) {
+      dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
+      continue;
+    }
+    by_host[host->second].push_back(fsmap.get_info_gid(gid).name);
+  }
+
+  dump_services(f, by_host, "mds");
+  return 0;
+}
+
+/**
+ * Move the number of "in" ranks of one filesystem one step towards max_mds.
+ *
+ * With fewer ranks in than max_mds (and the filesystem joinable), a
+ * replacement daemon is looked up for the lowest rank that is not in and is
+ * promoted to it.  With more ranks in than max_mds, the highest rank is set
+ * to STATE_STOPPING if it is currently active.
+ *
+ * @param fsmap the (pending) FSMap to modify
+ * @param fscid the filesystem to examine
+ * @return true if fsmap was changed, false otherwise
+ */
+bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
+{
+  auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
+  auto&& fs = fsmap.get_filesystem(fscid);
+  auto &mds_map = fs->mds_map;
+
+  const int in = mds_map.get_num_in_mds();
+  const int max = mds_map.get_max_mds();
+
+  dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+  /* Both the committed mds_map and the pending one must be resizeable.  The
+   * pending one matters if an MDS is becoming active in the next epoch.
+   */
+  if (!(current_mds_map.is_resizeable() && mds_map.is_resizeable())) {
+    dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
+    return false;
+  }
+
+  if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+    // Grow: pick the lowest rank that is not in yet.
+    mds_rank_t new_rank = mds_rank_t(0);
+    for (; mds_map.is_in(new_rank); ++new_rank) {
+    }
+    auto standby = fsmap.find_replacement_for({fscid, new_rank});
+    if (!standby) {
+      return false;
+    }
+
+    dout(1) << "assigned standby " << standby->addrs
+            << " as mds." << new_rank << dendl;
+    mon.clog->info() << standby->human_name()
+                     << " assigned to filesystem " << mds_map.fs_name
+                     << " as rank " << new_rank
+                     << " (now has " << mds_map.get_num_in_mds() + 1
+                     << " ranks)";
+    fsmap.promote(standby->global_id, *fs, new_rank);
+    return true;
+  }
+
+  if (in <= max) {
+    // Neither growing nor shrinking applies.
+    return false;
+  }
+
+  // Shrink: stop the highest rank, provided it is active.
+  const mds_rank_t target = in - 1;
+  const auto &info = mds_map.get_info(target);
+  if (!mds_map.is_active(target)) {
+    dout(20) << "skipping stop of " << target << dendl;
+    return false;
+  }
+
+  dout(1) << "stopping " << target << dendl;
+  mon.clog->info() << "stopping " << info.human_name();
+  fsmap.modify_daemon(info.global_id, [](auto& daemon) {
+    daemon.state = MDSMap::STATE_STOPPING;
+  });
+  return true;
+}
+
+
+/**
+ * Fail a daemon and, if it held a rank, replace it with the given standby.
+ *
+ * Frozen daemons are never dropped.  Standby and standby-replay daemons are
+ * simply failed.  A daemon holding a rank is only failed when rep_info is
+ * provided and its filesystem is joinable; rep_info is then promoted to the
+ * vacated rank.
+ *
+ * @param fsmap       the FSMap to modify
+ * @param gid         the daemon to drop
+ * @param rep_info    replacement daemon, or nullptr if none is available
+ * @param osd_propose OR-ed with the result of fail_mds_gid(); must not be null
+ * @return true if the daemon was dropped, false otherwise
+ */
+bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
+{
+  ceph_assert(osd_propose != nullptr);
+
+  const auto fscid = fsmap.mds_roles.at(gid);
+  const auto& info = fsmap.get_info_gid(gid);
+  // Copy what is needed after the daemon has been failed.
+  const auto rank = info.rank;
+  const auto state = info.state;
+
+  if (info.is_frozen()) {
+    return false;
+  }
+
+  const bool is_standby = (state == MDSMap::STATE_STANDBY_REPLAY ||
+                           state == MDSMap::STATE_STANDBY);
+  if (is_standby) {
+    dout(1)  << " failing and removing standby " << gid << " " << info.addrs
+             << " mds." << rank
+             << "." << info.inc << " " << ceph_mds_state_name(state)
+             << dendl;
+    *osd_propose |= fail_mds_gid(fsmap, gid);
+    return true;
+  }
+
+  if (rank < 0 || !rep_info) {
+    // No rank held, or nothing to replace it with.
+    return false;
+  }
+
+  auto fs = fsmap.filesystems.at(fscid);
+  if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+    return false;
+  }
+
+  // The daemon is in and a standby is available to take over for it.
+  dout(1)  << " replacing " << gid << " " << info.addrs
+           << " mds." << rank << "." << info.inc
+           << " " << ceph_mds_state_name(state)
+           << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
+           << dendl;
+
+  mon.clog->warn() << "Replacing " << info.human_name()
+                   << " as rank " << rank
+                   << " with standby " << rep_info->human_name();
+
+  // Take the old daemon out first, then promote the replacement into its rank.
+  *osd_propose |= fail_mds_gid(fsmap, gid);
+  fsmap.promote(rep_info->global_id, *fs, rank);
+
+  return true;
+}
+
+/**
+ * Check beacon freshness and standby affinity for all daemons in fsmap.
+ *
+ * Daemons whose last beacon is older than mds_beacon_grace are marked laggy
+ * and, when the OSDMap is writeable and some other beacon arrived recently,
+ * dropped (and replaced where a replacement is found).  Afterwards, for each
+ * joinable and resizeable filesystem, at most one daemon whose join_fscid
+ * does not match the filesystem is dropped in favor of a standby with better
+ * affinity.
+ *
+ * @param fsmap          the FSMap to examine and modify
+ * @param propose_osdmap OR-ed with the results of fail_mds_gid()/drop_mds()
+ * @return true if fsmap was changed in a way that should be proposed
+ */
+bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
+{
+  bool do_propose = false;
+  const auto now = mono_clock::now();
+  const bool osdmap_writeable = mon.osdmon()->is_writeable();
+  const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
+  const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
+
+  // First call: there is no previous tick to measure from.
+  if (mono_clock::is_zero(last_tick)) {
+    last_tick = now;
+  }
+
+  {
+    auto since_last = std::chrono::duration<double>(now-last_tick);
+
+    if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
+      // This case handles either local slowness (calls being delayed
+      // for whatever reason) or cluster election slowness (a long gap
+      // between calls while an election happened)
+      dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
+              "(slow election?) of " << since_last.count() << " seconds" << dendl;
+      for (auto& p : last_beacon) {
+        p.second.stamp = now;
+      }
+    }
+  }
+
+  // make sure last_beacon is fully populated
+  // (emplace leaves entries that already exist untouched)
+  for (auto& p : fsmap.mds_roles) {
+    auto& gid = p.first;
+    last_beacon.emplace(std::piecewise_construct,
+        std::forward_as_tuple(gid),
+        std::forward_as_tuple(now, 0));
+  }
+
+  // We will only take decisive action (replacing/removing a daemon)
+  // if we have some indication that some other daemon(s) are successfully
+  // getting beacons through recently.
+  mono_time latest_beacon = mono_clock::zero();
+  for (const auto& p : last_beacon) {
+    latest_beacon = std::max(p.second.stamp, latest_beacon);
+  }
+  auto since = std::chrono::duration<double>(now-latest_beacon);
+  const bool may_replace = since.count() <
+      std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
+
+  // check beacon timestamps
+  std::vector<mds_gid_t> to_remove;
+  const bool mon_down = mon.is_mon_down();
+  const auto mds_beacon_mon_down_grace =
+      g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace");
+  const auto quorum_age = std::chrono::seconds(mon.quorum_age());
+  const bool new_quorum = quorum_age < mds_beacon_mon_down_grace;
+  // No increment in the for-header: entries may be erased inside the loop.
+  for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
+    auto& [gid, beacon_info] = *it;
+    auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
+
+    if (!fsmap.gid_exists(gid)) {
+      // gid no longer exists, remove from tracked beacons
+      it = last_beacon.erase(it);
+      continue;
+    }
+
+    if (since_last.count() >= g_conf()->mds_beacon_grace) {
+      auto& info = fsmap.get_info_gid(gid);
+      dout(1) << "no beacon from mds." << info.rank << "." << info.inc
+              << " (gid: " << gid << " addr: " << info.addrs
+              << " state: " << ceph_mds_state_name(info.state) << ")"
+              << " since " << since_last.count() << dendl;
+      if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) {
+        /* The MDS may be sending beacons to a monitor not yet in quorum or
+         * temporarily partitioned. Hold off on removal for a little longer...
+         */
+        dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl;
+        ++it;
+        continue;
+      }
+      // If the OSDMap is writeable, we can blocklist things, so we can
+      // try failing any laggy MDS daemons.  Consider each one for failure.
+      if (!info.laggy()) {
+        dout(1)  << " marking " << gid << " " << info.addrs
+	         << " mds." << info.rank << "." << info.inc
+	         << " " << ceph_mds_state_name(info.state)
+	         << " laggy" << dendl;
+        fsmap.modify_daemon(info.global_id, [](auto& info) {
+            info.laggy_since = ceph_clock_now();
+        });
+        do_propose = true;
+      }
+      if (osdmap_writeable && may_replace) {
+        to_remove.push_back(gid); // drop_mds may invalidate iterator
+      }
+    }
+
+    ++it;
+  }
+
+  // Now drop the daemons collected above, outside of the iteration.
+  for (const auto& gid : to_remove) {
+    // Taken by value: used for the log message after the drop below.
+    auto info = fsmap.get_info_gid(gid);
+    const mds_info_t* rep_info = nullptr;
+    if (info.rank >= 0) {
+      auto fscid = fsmap.fscid_from_gid(gid);
+      rep_info = fsmap.find_replacement_for({fscid, info.rank});
+    }
+    bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
+    if (dropped) {
+      mon.clog->info() << "MDS " << info.human_name()
+                        << " is removed because it is dead or otherwise unavailable.";
+      do_propose = true;
+    }
+  }
+
+  if (osdmap_writeable) {
+    for (auto& [fscid, fs] : fsmap.filesystems) {
+      if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
+          fs->mds_map.is_resizeable()) {
+        // Check if a rank or standby-replay should be replaced with a stronger
+        // affinity standby. This looks at ranks and standby-replay:
+        for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
+          const auto join_fscid = info.join_fscid;
+          // Already has affinity for this filesystem: nothing to improve.
+          if (join_fscid == fscid)
+            continue;
+          const auto rank = info.rank;
+          const auto state = info.state;
+          const mds_info_t* rep_info = nullptr;
+          if (state == MDSMap::STATE_STANDBY_REPLAY) {
+            rep_info = fsmap.get_available_standby(*fs);
+          } else if (state == MDSMap::STATE_ACTIVE) {
+            rep_info = fsmap.find_replacement_for({fscid, rank});
+          } else {
+            /* N.B. !is_degraded() */
+            ceph_abort_msg("invalid state in MDSMap");
+          }
+          // No candidate found: stop examining this filesystem.
+          if (!rep_info) {
+            break;
+          }
+          // A candidate is better if it names this filesystem, or, when the
+          // current daemon names some other filesystem, if it names none.
+          bool better_affinity = false;
+          if (join_fscid == FS_CLUSTER_ID_NONE) {
+            better_affinity = (rep_info->join_fscid == fscid);
+          } else {
+            better_affinity = (rep_info->join_fscid == fscid) ||
+                              (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
+          }
+          if (better_affinity) {
+            if (state == MDSMap::STATE_STANDBY_REPLAY) {
+              mon.clog->info() << "Dropping low affinity standby-replay "
+                                << info.human_name()
+                                << " in favor of higher affinity standby.";
+              *propose_osdmap |= fail_mds_gid(fsmap, gid);
+              /* Now let maybe_promote_standby do the promotion. */
+            } else {
+              mon.clog->info() << "Dropping low affinity active "
+                                << info.human_name()
+                                << " in favor of higher affinity standby.";
+              do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
+            }
+            break; /* don't replace more than one per tick per fs */
+          }
+        }
+      }
+    }
+  }
+  return do_propose;
+}
+
+bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
+{
+  // Fill failed ranks of fs with replacement daemons and, where allowed,
+  // turn available standbys into standby-replay followers.  Returns true if
+  // fsmap was changed.  Nothing is done for a filesystem that is not joinable.
+  if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+    return false;
+  }
+
+  bool changed_map = false;
+
+  // Have a standby take over each failed rank, if one can be found.
+  set<mds_rank_t> failed_ranks;
+  fs.mds_map.get_failed_mds_set(failed_ranks);
+  for (const auto& rank : failed_ranks) {
+    auto replacement = fsmap.find_replacement_for({fs.fscid, rank});
+    if (!replacement) {
+      continue;
+    }
+    dout(1) << " taking over failed mds." << rank << " with " << replacement->global_id
+            << "/" << replacement->name << " " << replacement->addrs << dendl;
+    mon.clog->info() << "Standby " << replacement->human_name()
+                     << " assigned to filesystem " << fs.mds_map.fs_name
+                     << " as rank " << rank;
+
+    fsmap.promote(replacement->global_id, fs, rank);
+    changed_map = true;
+  }
+
+  if (!(fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay())) {
+    return changed_map;
+  }
+
+  // No failures are left to replace, so use any available standbys as
+  // standby-replay daemons.  This is not done while the cluster is degraded,
+  // as a standby-replay daemon may try to read a journal being migrated.
+  while (auto standby = fsmap.get_available_standby(fs)) {
+    dout(20) << "standby available mds." << standby->global_id << dendl;
+    bool assigned = false;
+    for (const auto& rank : fs.mds_map.in) {
+      dout(20) << "examining " << rank << dendl;
+      if (!fs.mds_map.is_followable(rank)) {
+        continue;
+      }
+      dout(1) << "  setting mds." << standby->global_id
+              << " to follow mds rank " << rank << dendl;
+      fsmap.assign_standby_replay(standby->global_id, fs.fscid, rank);
+      assigned = true;
+      break;
+    }
+    if (!assigned) {
+      // No rank can be followed; more standbys would not help either.
+      break;
+    }
+    changed_map = true;
+  }
+
+  return changed_map;
+}
+
+void MDSMonitor::tick()
+{
+  // Periodic work; only done while active and leader.
+  if (!(is_active() && is_leader())) {
+    return;
+  }
+
+  auto &pending = get_pending_fsmap_writeable();
+
+  bool do_propose = false;
+  bool propose_osdmap = false;
+
+  if (check_fsmap_struct_version) {
+    /* Only look every 30 seconds: allow time for trimming, otherwise
+     * PaxosService::is_writeable will always be false.
+     */
+    const auto now = clock::now();
+    if (now - last_fsmap_struct_flush > std::chrono::seconds(30)) {
+      FSMap oldest;
+      bufferlist encoded;
+      const auto first = get_first_committed();
+      if (get_version(first, encoded)) {
+        derr << "could not get version " << first << dendl;
+        ceph_abort();
+      }
+      try {
+        oldest.decode(encoded);
+      } catch (const ceph::buffer::malformed_input& e) {
+        dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
+      }
+      /* N.B. FSMap::is_struct_old is also true for an FSMap that failed to decode */
+      if (!oldest.is_struct_old()) {
+        dout(20) << "struct is recent" << dendl;
+        check_fsmap_struct_version = false;
+      } else {
+        dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
+        do_propose = true;
+        last_fsmap_struct_flush = now;
+      }
+    }
+  }
+
+  if (pending.check_health()) {
+    do_propose = true;
+  }
+
+  /* Check health and affinity of ranks */
+  if (check_health(pending, &propose_osdmap)) {
+    do_propose = true;
+  }
+
+  /* Resize the cluster according to max_mds. */
+  for (auto& entry : pending.filesystems) {
+    if (maybe_resize_cluster(pending, entry.second->fscid)) {
+      do_propose = true;
+    }
+  }
+
+  /* Replace any failed ranks. */
+  for (auto& entry : pending.filesystems) {
+    if (maybe_promote_standby(pending, *entry.second)) {
+      do_propose = true;
+    }
+  }
+
+  if (propose_osdmap) {
+    request_proposal(mon.osdmon());
+  }
+
+  if (do_propose) {
+    propose_pending();
+  }
+
+  last_tick = mono_clock::now();
+}
+
+// Construct the service and load the filesystem command handlers.
+MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
+  : PaxosService(mn, p, service_name),
+    handlers(FileSystemCommandHandler::load(&p))
+{
+}
+
+void MDSMonitor::on_restart()
+{
+  // Reset the leader-specific state: forget all tracked beacons and restart
+  // the tick clock from now.
+  last_beacon.clear();
+  last_tick = mono_clock::now();
+}
+
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
new file mode 100644
index 000000000..c70814996
--- /dev/null
+++ b/src/mon/MDSMonitor.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+ 
+/* Metadata Server Monitor
+ */
+
+#ifndef CEPH_MDSMONITOR_H
+#define CEPH_MDSMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/types.h"
+#include "PaxosFSMap.h"
+#include "PaxosService.h"
+#include "msg/Messenger.h"
+#include "messages/MMDSBeacon.h"
+#include "CommandHandler.h"
+
+class FileSystemCommandHandler;
+
+/**
+ * Paxos service of the monitor that maintains the FSMap: it tracks MDS
+ * daemon beacons, serves fsmap/mdsmap subscriptions and handles the
+ * filesystem and MDS commands.
+ */
+class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHandler {
+ public:
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  MDSMonitor(Monitor &mn, Paxos &p, std::string service_name);
+
+  // service methods
+  void create_initial() override;
+  void get_store_prefixes(std::set<std::string>& s) const override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void init() override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  // we don't require full versions; don't encode any.
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+  version_t get_trim_to() const override;
+
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+  bool should_propose(double& delay) override;
+
+  /// True if the FSMap has at least one filesystem or one standby daemon.
+  bool should_print_status() const {
+    auto& fs = get_fsmap();
+    auto fs_count = fs.filesystem_count();
+    auto standby_count = fs.get_num_standby();
+    return fs_count > 0 || standby_count > 0;
+  }
+
+  void on_active() override;
+  void on_restart() override;
+
+  /// Service every subscription to the fsmap/mdsmap types.
+  void check_subs();
+  /// Service one subscription; may remove it.
+  void check_sub(Subscription *sub);
+
+  void dump_info(ceph::Formatter *f);
+  /// Dump MDS daemon names grouped by hostname; returns 0 or a load error.
+  int print_nodes(ceph::Formatter *f);
+
+  /**
+   * Return true if a blocklist was done (i.e. OSD propose needed)
+   */
+  bool fail_mds_gid(FSMap &fsmap, mds_gid_t gid);
+
+  bool is_leader() const override { return mon.is_leader(); }
+
+ protected:
+  using mds_info_t = MDSMap::mds_info_t;
+
+  // my helpers
+  template<int dblV = 7>
+  void print_map(const FSMap &m);
+
+  void _updated(MonOpRequestRef op);
+
+  void _note_beacon(class MMDSBeacon *m);
+  bool preprocess_beacon(MonOpRequestRef op);
+  bool prepare_beacon(MonOpRequestRef op);
+
+  bool preprocess_offload_targets(MonOpRequestRef op);
+  bool prepare_offload_targets(MonOpRequestRef op);
+
+  int fail_mds(FSMap &fsmap, std::ostream &ss,
+      const std::string &arg, mds_info_t *failed_info);
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  int filesystem_command(
+      FSMap &fsmap,
+      MonOpRequestRef op,
+      std::string const &prefix,
+      const cmdmap_t& cmdmap,
+      std::stringstream &ss);
+
+  // beacons
+  /// Time stamp and sequence number recorded for a daemon's beacon.
+  struct beacon_info_t {
+    ceph::mono_time stamp = ceph::mono_clock::zero();
+    uint64_t seq = 0;
+    beacon_info_t() {}
+    beacon_info_t(ceph::mono_time stamp, uint64_t seq) : stamp(stamp), seq(seq) {}
+  };
+  // Beacon bookkeeping per daemon GID; cleared in on_restart().
+  std::map<mds_gid_t, beacon_info_t> last_beacon;
+
+  std::list<std::shared_ptr<FileSystemCommandHandler> > handlers;
+
+  /// Promote standbys into failed ranks / standby-replay; true if changed.
+  bool maybe_promote_standby(FSMap& fsmap, Filesystem& fs);
+  /// Grow or shrink one filesystem towards max_mds; true if changed.
+  bool maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid);
+  /// Fail a daemon, promoting rep_info into its rank if it had one.
+  bool drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool* osd_propose);
+  /// Check beacons and standby affinity; true if fsmap should be proposed.
+  bool check_health(FSMap &fsmap, bool* osd_propose);
+  void tick() override;     // check state, take actions
+
+  int dump_metadata(const FSMap &fsmap, const std::string &who, ceph::Formatter *f,
+                    std::ostream& err);
+
+  void update_metadata(mds_gid_t gid, const Metadata& metadata);
+  void remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t);
+  int load_metadata(std::map<mds_gid_t, Metadata>& m);
+  void count_metadata(const std::string& field, ceph::Formatter *f);
+
+public:
+  // std::ostream is spelled out, as in the other declarations of this class,
+  // so the header does not rely on a using-declaration for ostream.
+  void print_fs_summary(std::ostream& out) {
+    get_fsmap().print_fs_summary(out);
+  }
+  void count_metadata(const std::string& field, std::map<std::string,int> *out);
+  void get_versions(std::map<std::string, std::list<std::string>> &versions);
+
+protected:
+  // MDS daemon GID to latest health state from that GID
+  std::map<uint64_t, MDSHealth> pending_daemon_health;
+  std::set<uint64_t> pending_daemon_health_rm;
+
+  // Daemon metadata per GID, as persisted under "last_metadata".
+  std::map<mds_gid_t, Metadata> pending_metadata;
+
+  mds_gid_t gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream& err);
+
+  // When did the mon last call into our tick() method?  Used for detecting
+  // when the mon was not updating us for some period (e.g. during slow
+  // election) to reset last_beacon timeouts
+  ceph::mono_time last_tick = ceph::mono_clock::zero();
+
+private:
+  // Last time tick() proposed in order to flush out an old fsmap struct.
+  time last_fsmap_struct_flush = clock::zero();
+  // Whether tick() still needs to check the oldest committed fsmap struct.
+  bool check_fsmap_struct_version = true;
+};
+
+#endif
diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h
new file mode 100644
index 000000000..5342fc51f
--- /dev/null
+++ b/src/mon/MgrMap.h
@@ -0,0 +1,601 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MGR_MAP_H_
+#define MGR_MAP_H_
+
+#include <sstream>
+#include <set>
+
+#include "msg/msg_types.h"
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/ceph_releases.h"
+#include "common/version.h"
+#include "common/options.h"
+#include "common/Clock.h"
+
+
+class MgrMap
+{
+public:
+  /// Description of one configuration option exposed by a mgr module.
+  struct ModuleOption {
+    std::string name;
+    uint8_t type = Option::TYPE_STR;         // an Option::type_t TYPE_* value
+    uint8_t level = Option::LEVEL_ADVANCED;  // an Option::level_t LEVEL_* value
+    uint32_t flags = 0;                      // Option::flag_t FLAG_* bits
+    std::string default_value;
+    std::string min, max;
+    std::set<std::string> enum_allowed;
+    std::string desc, long_desc;
+    std::set<std::string> tags;
+    std::set<std::string> see_also;
+
+    /// Serialize every field, in declaration order, as struct version 1.
+    void encode(ceph::buffer::list& bl) const {
+      ENCODE_START(1, 1, bl);
+      encode(name, bl);
+      encode(type, bl);
+      encode(level, bl);
+      encode(flags, bl);
+      encode(default_value, bl);
+      encode(min, bl);
+      encode(max, bl);
+      encode(enum_allowed, bl);
+      encode(desc, bl);
+      encode(long_desc, bl);
+      encode(tags, bl);
+      encode(see_also, bl);
+      ENCODE_FINISH(bl);
+    }
+    /// Read the fields back in the same order encode() wrote them.
+    void decode(ceph::buffer::list::const_iterator& p) {
+      DECODE_START(1, p);
+      decode(name, p);
+      decode(type, p);
+      decode(level, p);
+      decode(flags, p);
+      decode(default_value, p);
+      decode(min, p);
+      decode(max, p);
+      decode(enum_allowed, p);
+      decode(desc, p);
+      decode(long_desc, p);
+      decode(tags, p);
+      decode(see_also, p);
+      DECODE_FINISH(p);
+    }
+    /// Emit every field to f; each string set becomes an array section.
+    void dump(ceph::Formatter *f) const {
+      auto dump_string_set = [f](const char *section, const char *item,
+                                 const std::set<std::string>& values) {
+        f->open_array_section(section);
+        for (const auto& value : values) {
+          f->dump_string(item, value);
+        }
+        f->close_section();
+      };
+      f->dump_string("name", name);
+      f->dump_string("type",
+                     Option::type_to_str(static_cast<Option::type_t>(type)));
+      f->dump_string("level",
+                     Option::level_to_str(static_cast<Option::level_t>(level)));
+      f->dump_unsigned("flags", flags);
+      f->dump_string("default_value", default_value);
+      f->dump_string("min", min);
+      f->dump_string("max", max);
+      dump_string_set("enum_allowed", "value", enum_allowed);
+      f->dump_string("desc", desc);
+      f->dump_string("long_desc", long_desc);
+      dump_string_set("tags", "tag", tags);
+      dump_string_set("see_also", "option", see_also);
+    }
+  };
+
+  /// A module reported to the map: name, runnability and declared options.
+  class ModuleInfo {
+  public:
+    std::string name;
+    bool can_run = true;
+    std::string error_string;
+    std::map<std::string,ModuleOption> module_options;
+
+    // We do not include the module's `failed` field in the beacon,
+    // because it is exposed via health checks.
+    void encode(ceph::buffer::list &bl) const {
+      ENCODE_START(2, 1, bl);
+      encode(name, bl);
+      encode(can_run, bl);
+      encode(error_string, bl);
+      encode(module_options, bl);  // field added in struct version 2
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      // Declare v2, the newest layout handled below, to match encode().
+      DECODE_START(2, bl);
+      decode(name, bl);
+      decode(can_run, bl);
+      decode(error_string, bl);
+      if (struct_v >= 2) {
+        decode(module_options, bl);
+      }
+      DECODE_FINISH(bl);
+    }
+    /// Compares name and can_run only (not error_string/module_options).
+    bool operator==(const ModuleInfo &rhs) const {
+      return (name == rhs.name) && (can_run == rhs.can_run);
+    }
+    /// Emit a "module" object containing a nested "module_options" object.
+    void dump(ceph::Formatter *f) const {
+      f->open_object_section("module");
+      f->dump_string("name", name);
+      f->dump_bool("can_run", can_run);
+      f->dump_string("error_string", error_string);
+      f->open_object_section("module_options");
+      for (auto& i : module_options) {
+        f->dump_object(i.first.c_str(), i.second);
+      }
+      f->close_section();
+      f->close_section();
+    }
+  };
+
+  /// A standby mgr daemon as recorded in the map.
+  class StandbyInfo {
+  public:
+    uint64_t gid = 0;
+    std::string name;
+    std::vector<ModuleInfo> available_modules;
+    uint64_t mgr_features = 0;
+
+    StandbyInfo(uint64_t gid_, const std::string &name_,
+                const std::vector<ModuleInfo>& am, uint64_t feat)
+      : gid(gid_), name(name_), available_modules(am), mgr_features(feat)
+    {}
+
+    StandbyInfo() {}
+
+    /// Encode as struct version 4 (compat 1).  Module names are written
+    /// twice: as a plain name set (v2 field) and as full ModuleInfo (v3).
+    void encode(ceph::buffer::list& bl) const {
+      ENCODE_START(4, 1, bl);
+      encode(gid, bl);
+      encode(name, bl);
+      std::set<std::string> old_available_modules;
+      for (const auto &i : available_modules) {
+        old_available_modules.insert(i.name);
+      }
+      encode(old_available_modules, bl);  // version 2
+      encode(available_modules, bl);  // version 3
+      encode(mgr_features, bl); // v4
+      ENCODE_FINISH(bl);
+    }
+
+    /// Decode struct versions 1 through 4.
+    void decode(ceph::buffer::list::const_iterator& p) {
+      DECODE_START(4, p);
+      decode(gid, p);
+      decode(name, p);
+      if (struct_v >= 2) {
+        std::set<std::string> old_available_modules;
+        decode(old_available_modules, p);
+        if (struct_v < 3) {
+          // v2 carries names only: build a default ModuleInfo for each.
+          // (module_name rather than name, which would shadow the member.)
+          for (const auto &module_name : old_available_modules) {
+            MgrMap::ModuleInfo info;
+            info.name = module_name;
+            available_modules.push_back(std::move(info));
+          }
+        }
+      }
+      if (struct_v >= 3) {
+        decode(available_modules, p);
+      }
+      if (struct_v >= 4) {
+        decode(mgr_features, p);
+      }
+      DECODE_FINISH(p);
+    }
+
+    /// True when available_modules contains a module called module_name.
+    bool have_module(const std::string &module_name) const {
+      // Capture by reference: the lambda is only used within this call, so
+      // copying the string into the closure is unnecessary.
+      const auto found = std::find_if(
+        available_modules.begin(), available_modules.end(),
+        [&module_name](const ModuleInfo &m) { return m.name == module_name; });
+      return found != available_modules.end();
+    }
+  };
+
+  epoch_t epoch = 0;
+  epoch_t last_failure_osd_epoch = 0;
+
+  /// global_id of the ceph-mgr instance selected as a leader
+  uint64_t active_gid = 0;
+  /// server address reported by the leader once it is active
+  entity_addrvec_t active_addrs;
+  /// whether the nominated leader is active (i.e. has initialized its server)
+  bool available = false;
+  /// the name (foo in mgr.<foo>) of the active daemon
+  std::string active_name;
+  /// when the active mgr became active, or we lost the active mgr
+  utime_t active_change;
+  /// features
+  uint64_t active_mgr_features = 0;
+
+  std::vector<entity_addrvec_t> clients; // for blocklist
+
+  std::map<uint64_t, StandbyInfo> standbys;
+
+  // Modules which are enabled
+  std::set<std::string> modules;
+
+  // Modules which should always be enabled, keyed by release number. A manager
+  // daemon enables the union of the `modules` set above and the entry that
+  // get_always_on_modules() selects for the running release.
+  std::map<uint32_t, std::set<std::string>> always_on_modules;
+
+  // Modules which are reported to exist
+  std::vector<ModuleInfo> available_modules;
+
+  // Map of module name to URI, indicating services exposed by
+  // running modules on the active mgr daemon.
+  std::map<std::string, std::string> services;
+
+  epoch_t get_epoch() const { return epoch; }
+  epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
+  const entity_addrvec_t& get_active_addrs() const { return active_addrs; }
+  uint64_t get_active_gid() const { return active_gid; }  // print_summary() treats 0 as "no daemons active"
+  bool get_available() const { return available; }
+  const std::string &get_active_name() const { return active_name; }
+  const utime_t& get_active_change() const { return active_change; }
+  int get_num_standby() const { return standbys.size(); }  // container size converted to int
+
+  /// True when `module` is in available_modules and in every standby's list.
+  /// Const-qualified (it only reads the map), matching any_supports_module().
+  bool all_support_module(const std::string& module) const {
+    if (!have_module(module)) {
+      return false;
+    }
+    for (const auto& standby : standbys) {
+      if (!standby.second.have_module(module)) return false;
+    }
+    return true;
+  }
+
+  /// Report whether available_modules lists a module named module_name.
+  bool have_module(const std::string &module_name) const
+  {
+    auto it = available_modules.begin();
+    while (it != available_modules.end() && it->name != module_name) {
+      ++it;
+    }
+    // The scan stops before end() only when a matching entry was found.
+    return it != available_modules.end();
+  }
+
+  const ModuleInfo *get_module_info(const std::string &module_name) const {
+    // First match wins; nullptr means no such module is listed.
+    const ModuleInfo *found = nullptr;
+    for (const ModuleInfo &info : available_modules) {
+      if (info.name == module_name) { found = &info; break; }
+    }
+    return found;
+  }
+
+  bool can_run_module(const std::string &module_name, std::string *error) const
+  {
+    for (const auto &i : available_modules) {
+      if (i.name == module_name) {
+        *error = i.error_string;
+        return i.can_run;
+      }
+    }
+
+    std::ostringstream oss;
+    oss << "Module '" << module_name << "' does not exist";
+    throw std::logic_error(oss.str());
+  }
+
+  /// True when module_name is a member of the enabled `modules` set.
+  bool module_enabled(const std::string& module_name) const {
+    return modules.count(module_name) > 0;
+  }
+
+  /// True when `module` is listed in available_modules or by any standby.
+  bool any_supports_module(const std::string& module) const {
+    // Check the map-level list first, then each standby until one matches.
+    bool supported = have_module(module);
+    auto it = standbys.begin();
+    while (!supported && it != standbys.end()) {
+      supported = it->second.have_module(module);
+      ++it;
+    }
+    return supported;
+  }
+
+  /// True when `name` is the active daemon's name or some standby's name.
+  bool have_name(const std::string& name) const {
+    if (active_name == name) {
+      return true;
+    }
+    // Otherwise look for a standby carrying that name.
+    const auto named = [&name](const std::pair<const uint64_t, StandbyInfo>& e) {
+      return e.second.name == name;
+    };
+    return std::find_if(standbys.begin(), standbys.end(), named) != standbys.end();
+  }
+
+  /// Collect the names of all daemons in the map: the active daemon's name
+  /// (skipped when empty) plus the name of every standby.
+  std::set<std::string> get_all_names() const {
+    std::set<std::string> names;
+    for (const auto& standby : standbys) {
+      names.insert(standby.second.name);
+    }
+    if (!active_name.empty()) names.insert(active_name);
+    return names;
+  }
+
+  /// Pick the always-on module set for the running release: the entry keyed
+  /// by ceph_release() if present, otherwise the highest-keyed entry provided
+  /// its key is below the running release, otherwise an empty set.
+  std::set<std::string> get_always_on_modules() const {
+    const unsigned rnum = to_integer<uint32_t>(ceph_release());
+    const auto exact = always_on_modules.find(rnum);
+    if (exact != always_on_modules.end()) {
+      return exact->second;
+    }
+    if (always_on_modules.empty()) {
+      return {};  // nothing recorded at all
+    }
+    // No exact match: fall back to the most recent release we know about.
+    const auto& newest = *always_on_modules.rbegin();
+    return newest.first < rnum ? newest.second : std::set<std::string>{};
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const
+  {
+    if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {  // features lack SERVER_NAUTILUS: emit the legacy version-5 layout
+      ENCODE_START(5, 1, bl);
+      encode(epoch, bl);
+      encode(active_addrs.legacy_addr(), bl, features);  // legacy_addr() of the addrvec rather than the whole vector
+      encode(active_gid, bl);
+      encode(available, bl);
+      encode(active_name, bl);
+      encode(standbys, bl);
+      encode(modules, bl);
+
+      // Pre-version 4 set of available module names
+      // (replaced by direct encode of ModuleInfo below)
+      std::set<std::string> old_available_modules;
+      for (const auto &i : available_modules) {
+	old_available_modules.insert(i.name);
+      }
+      encode(old_available_modules, bl);
+
+      encode(services, bl);
+      encode(available_modules, bl);
+      ENCODE_FINISH(bl);
+      return;
+    }
+    ENCODE_START(11, 6, bl);  // current layout: version 11, compat 6; no name-only module set
+    encode(epoch, bl);
+    encode(active_addrs, bl, features);
+    encode(active_gid, bl);
+    encode(available, bl);
+    encode(active_name, bl);
+    encode(standbys, bl);
+    encode(modules, bl);
+    encode(services, bl);
+    encode(available_modules, bl);
+    encode(active_change, bl);  // decode() reads this from v7
+    encode(always_on_modules, bl);  // decode() reads this from v8
+    encode(active_mgr_features, bl);  // decode() reads this from v9
+    encode(last_failure_osd_epoch, bl);  // decode() reads this from v10
+    encode(clients, bl, features);  // decode() reads this from v11
+    ENCODE_FINISH(bl);
+    return;
+  }
+
+  void decode(ceph::buffer::list::const_iterator& p)
+  {
+    DECODE_START(11, p);  // newest struct version this decoder declares
+    decode(epoch, p);
+    decode(active_addrs, p);
+    decode(active_gid, p);
+    decode(available, p);
+    decode(active_name, p);
+    decode(standbys, p);
+    if (struct_v >= 2) {
+      decode(modules, p);
+
+      if (struct_v < 6) {  // the name-only module set is present only before v6 (see encode())
+	// Reconstitute ModuleInfos from names
+	std::set<std::string> module_name_list;
+	decode(module_name_list, p);
+	// Only need to unpack this field if we won't have the full
+	// MgrMap::ModuleInfo structures added in v4
+	if (struct_v < 4) {
+	  for (const auto &i : module_name_list) {
+	    MgrMap::ModuleInfo info;
+	    info.name = i;
+	    available_modules.push_back(std::move(info));
+	  }
+	}
+      }
+    }
+    if (struct_v >= 3) {
+      decode(services, p);
+    }
+    if (struct_v >= 4) {
+      decode(available_modules, p);
+    }
+    if (struct_v >= 7) {
+      decode(active_change, p);
+    } else {
+      active_change = {};  // not present before v7: reset to a value-initialized utime_t
+    }
+    if (struct_v >= 8) {
+      decode(always_on_modules, p);
+    }
+    if (struct_v >= 9) {
+      decode(active_mgr_features, p);
+    }
+    if (struct_v >= 10) {
+      decode(last_failure_osd_epoch, p);
+    }
+    if (struct_v >= 11) {
+      decode(clients, p);
+    }
+    DECODE_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const {  // emit the whole map to f
+    f->dump_int("epoch", epoch);
+    f->dump_int("active_gid", get_active_gid());
+    f->dump_string("active_name", get_active_name());
+    f->dump_object("active_addrs", active_addrs);
+    f->dump_stream("active_addr") << active_addrs.get_legacy_str();  // legacy string form alongside the full addrvec above
+    f->dump_stream("active_change") << active_change;
+    f->dump_unsigned("active_mgr_features", active_mgr_features);
+    f->dump_bool("available", available);
+    f->open_array_section("standbys");
+    for (const auto &i : standbys) {
+      f->open_object_section("standby");
+      f->dump_int("gid", i.second.gid);
+      f->dump_string("name", i.second.name);
+      f->dump_unsigned("mgr_features", i.second.mgr_features);
+      f->open_array_section("available_modules");
+      for (const auto& j : i.second.available_modules) {
+        j.dump(f);
+      }
+      f->close_section();
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("modules");
+    for (auto& i : modules) {
+      f->dump_string("module", i);
+    }
+    f->close_section();
+    f->open_array_section("available_modules");
+    for (const auto& j : available_modules) {
+      j.dump(f);
+    }
+    f->close_section();
+
+    f->open_object_section("services");
+    for (const auto &i : services) {
+      f->dump_string(i.first.c_str(), i.second);
+    }
+    f->close_section();
+
+    f->open_object_section("always_on_modules");
+    for (auto& v : always_on_modules) {
+      f->open_array_section(ceph_release_name(v.first));  // one array per release, named by ceph_release_name()
+      for (auto& m : v.second) {
+        f->dump_string("module", m);
+      }
+      f->close_section();
+    }
+    f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);  // NOTE(review): emitted while "always_on_modules" is still open — confirm the nesting is intended
+    f->open_array_section("active_clients");
+    for (const auto &c : clients) {
+      f->dump_object("client", c);
+    }
+    f->close_section();  // closes "active_clients"
+    f->close_section();  // closes "always_on_modules"
+  }
+
+  static void generate_test_instances(std::list<MgrMap*> &l) {
+    l.push_back(new MgrMap);  // one default-constructed map; presumably freed by whoever consumes the list — confirm
+  }
+
+  /// Write a short summary either to formatter f or to stream ss; exactly
+  /// one of the two must be non-null (asserted).
+  void print_summary(ceph::Formatter *f, std::ostream *ss) const
+  {
+    // One or the other, not both
+    ceph_assert((ss != nullptr) != (f != nullptr));
+    if (f) {
+      f->dump_bool("available", available);
+      f->dump_int("num_standbys", standbys.size());
+      f->open_array_section("modules");
+      for (const auto& enabled : modules) {
+        f->dump_string("module", enabled);
+      }
+      f->close_section();
+      f->open_object_section("services");
+      for (const auto& service : services) {
+        f->dump_string(service.first.c_str(), service.second);
+      }
+      f->close_section();
+      return;
+    }
+
+    // Plain-text form: "<name>(active[, starting][, since <span>])" or
+    // "no daemons active[ (since <span>)]", then the standby names.
+    std::ostream& out = *ss;
+    utime_t now = ceph_clock_now();
+    if (get_active_gid() == 0) {
+      out << "no daemons active";
+      if (active_change) {
+        out << " (since " << utimespan_str(now - active_change) << ")";
+      }
+    } else {
+      // If the daemon hasn't gone active yet, indicate that.
+      out << get_active_name()
+          << (available ? "(active" : "(active, starting");
+      if (active_change) {
+        out << ", since " << utimespan_str(now - active_change);
+      }
+      out << ")";
+    }
+    // Standby names follow, comma separated.
+    if (!standbys.empty()) {
+      out << ", standbys: ";
+      const char *separator = "";
+      for (const auto& standby : standbys) {
+        out << separator << standby.second.name;
+        separator = ", ";
+      }
+    }
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const MgrMap& m) {  // stream the text form of print_summary()
+    std::ostringstream ss;  // the summary is built here first, then written to out in one insertion
+    m.print_summary(nullptr, &ss);  // null formatter selects the stream branch
+    return out << ss.str();
+  }
+
+  /// Stream each module's name followed by a single space.
+  friend std::ostream& operator<<(std::ostream& out, const std::vector<ModuleInfo>& mi) {
+    for (auto it = mi.begin(); it != mi.end(); ++it)
+      out << it->name << " ";
+    return out;
+  }
+};
+
+WRITE_CLASS_ENCODER_FEATURES(MgrMap)
+WRITE_CLASS_ENCODER(MgrMap::StandbyInfo)
+WRITE_CLASS_ENCODER(MgrMap::ModuleInfo);
+WRITE_CLASS_ENCODER(MgrMap::ModuleOption);
+
+#endif
+
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
new file mode 100644
index 000000000..bf5e2ed31
--- /dev/null
+++ b/src/mon/MgrMonitor.cc
@@ -0,0 +1,1356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <boost/tokenizer.hpp>
+
+#include "messages/MMgrBeacon.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrDigest.h"
+
+#include "include/stringify.h"
+#include "mgr/MgrContext.h"
+#include "mgr/mgr_commands.h"
+#include "OSDMonitor.h"
+#include "ConfigMonitor.h"
+#include "HealthMonitor.h"
+
+#include "MgrMonitor.h"
+
+#define MGR_METADATA_PREFIX "mgr_metadata"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, map)
+using namespace TOPNSPC::common;
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Build the dout line prefix: "mon.<name>@<rank>(<state>).mgr e<epoch> ".
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, const MgrMap& mgrmap) {
+  *_dout << "mon." << mon.name << "@" << mon.rank << "(" << mon.get_state_name()
+         << ").mgr e" << mgrmap.get_epoch() << " ";
+  return *_dout;
+}
+
+// the system treats always_on_modules as if they provide built-in functionality
+// by ensuring that they are always enabled.
+const static std::map<uint32_t, std::set<std::string>> always_on_modules = {  // keyed by CEPH_RELEASE_* number
+  {
+    CEPH_RELEASE_NAUTILUS, {
+      "crash",
+      "status",
+      "progress",
+      "balancer",
+      "devicehealth",
+      "orchestrator_cli",  // the later release lists below carry "orchestrator" instead
+      "rbd_support",
+      "volumes",
+    }
+  },
+  {
+    CEPH_RELEASE_OCTOPUS, {  // compared with Nautilus: adds pg_autoscaler and telemetry
+      "crash",
+      "status",
+      "progress",
+      "balancer",
+      "devicehealth",
+      "orchestrator",
+      "rbd_support",
+      "volumes",
+      "pg_autoscaler",
+      "telemetry",
+    }
+  },
+  {
+    CEPH_RELEASE_PACIFIC, {  // same module list as Octopus
+      "crash",
+      "status",
+      "progress",
+      "balancer",
+      "devicehealth",
+      "orchestrator",
+      "rbd_support",
+      "volumes",
+      "pg_autoscaler",
+      "telemetry",
+    }
+  }
+};
+
+// Prefix for mon store of active mgr's command descriptions
+const static std::string command_descs_prefix = "mgr_command_descs";
+
+// Resolve a "mgr/..." option name to its entry in mgr_module_options,
+// returning nullptr when the name is malformed or unknown.
+const Option *MgrMonitor::find_module_option(const string& name)
+{
+  // we have two forms of names: "mgr/$module/$option" and
+  // localized "mgr/$module/$instance/$option".  normalize to the
+  // former by stripping out $instance.
+  if (name.compare(0, 4, "mgr/") != 0) {
+    return nullptr;
+  }
+  const auto second_slash = name.find('/', 5);
+  if (second_slash == std::string::npos) {
+    return nullptr;
+  }
+  string real_name = name;
+  const auto third_slash = name.find('/', second_slash + 1);
+  if (third_slash != std::string::npos) {
+    // drop the $instance part between the second and third slash
+    real_name.erase(second_slash, third_slash - second_slash);
+  }
+  const auto p = mgr_module_options.find(real_name);
+  if (p == mgr_module_options.end()) {
+    return nullptr;
+  }
+  return &p->second;
+}
+
+// Version to trim to: the current epoch minus mon_max_mgrmap_epochs once
+// the epoch exceeds that limit, and 0 before then.
+version_t MgrMonitor::get_trim_to() const
+{
+  const int64_t max = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs");
+  const bool over_limit = map.epoch > max;
+  return over_limit ? (map.epoch - max) : 0;
+}
+
+// Fill pending_map and pending_command_descs with their initial contents:
+// configured initial modules, the always-on table and the built-in commands.
+void MgrMonitor::create_initial()
+{
+  // Take a local copy of initial_modules for tokenizer to iterate over.
+  auto initial_modules = g_conf().get_val<std::string>("mgr_initial_modules");
+  boost::tokenizer<> tok(initial_modules);
+  pending_map.modules.insert(tok.begin(), tok.end());
+  pending_map.always_on_modules = always_on_modules;
+  pending_command_descs = mgr_commands;
+  dout(10) << __func__ << " initial modules " << pending_map.modules
+	   << ", always on modules " << pending_map.get_always_on_modules()
+           << ", " << pending_command_descs.size() << " commands"
+	   << dendl;
+}
+
+// Insert this service's three store prefixes into s.
+void MgrMonitor::get_store_prefixes(std::set<string>& s) const {
+  s.emplace(service_name);
+  s.emplace(command_descs_prefix);
+  s.emplace(MGR_METADATA_PREFIX);
+}
+
+void MgrMonitor::update_from_paxos(bool *need_bootstrap)  // need_bootstrap is not referenced in this body
+{
+  version_t version = get_last_committed();
+  if (version != map.epoch) {  // reload only when the committed version differs from the loaded epoch
+    dout(4) << "loading version " << version << dendl;
+
+    bufferlist bl;
+    int err = get_version(version, bl);
+    ceph_assert(err == 0);
+
+    bool old_available = map.get_available();  // remembered to detect an availability transition below
+    uint64_t old_gid = map.get_active_gid();  // remembered to detect a change of active daemon below
+
+    auto p = bl.cbegin();
+    map.decode(p);
+
+    dout(4) << "active server: " << map.active_addrs
+	    << "(" << map.active_gid << ")" << dendl;
+
+    ever_had_active_mgr = get_value("ever_had_active_mgr");
+
+    load_health();
+
+    if (map.available) {
+      first_seen_inactive = utime_t();  // default utime_t() is what should_warn_about_mgr_down() treats as "unset"
+    } else {
+      first_seen_inactive = ceph_clock_now();
+    }
+
+    check_subs();
+
+    if (version == 1
+        || command_descs.empty()
+        || (map.get_available()
+            && (!old_available || old_gid != map.get_active_gid()))) {
+      dout(4) << "mkfs or daemon transitioned to available, loading commands"
+	      << dendl;
+      bufferlist loaded_commands;
+      int r = mon.store->get(command_descs_prefix, "", loaded_commands);  // same prefix and empty key that encode_pending() writes
+      if (r < 0) {
+        derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
+      } else {
+        auto p = loaded_commands.cbegin();
+        decode(command_descs, p);
+      }
+    }
+  }
+
+  // populate module options
+  mgr_module_options.clear();  // rebuilt from map.available_modules on every call
+  misc_option_strings.clear();
+  for (auto& i : map.available_modules) {
+    for (auto& j : i.module_options) {
+      string name = string("mgr/") + i.name + "/" + j.second.name;  // "mgr/<module>/<option>"
+      auto p = mgr_module_options.emplace(
+	name,
+	Option(name, static_cast<Option::type_t>(j.second.type),
+	       static_cast<Option::level_t>(j.second.level)));
+      Option& opt = p.first->second;
+      opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
+      opt.set_flag(Option::FLAG_MGR);
+      opt.set_description(j.second.desc.c_str());
+      opt.set_long_description(j.second.long_desc.c_str());
+      for (auto& k : j.second.tags) {
+	opt.add_tag(k.c_str());
+      }
+      for (auto& k : j.second.see_also) {
+	if (i.module_options.count(k)) {
+	  // it's another module option
+	  misc_option_strings.push_back(string("mgr/") + i.name + "/" + k);
+	  opt.add_see_also(misc_option_strings.back().c_str());  // the string itself stays stored in misc_option_strings
+	} else {
+	  // it's a native option
+	  opt.add_see_also(k.c_str());
+	}
+      }
+      Option::value_t v, v2;
+      std::string err;
+      if (j.second.default_value.size() &&
+	  !opt.parse_value(j.second.default_value, &v, &err)) {  // NOTE(review): the negation implies 0 means success — confirm against Option::parse_value
+	opt.set_default(v);
+      }
+      if (j.second.min.size() &&
+	  j.second.max.size() &&
+	  !opt.parse_value(j.second.min, &v, &err) &&
+	  !opt.parse_value(j.second.max, &v2, &err)) {
+	opt.set_min_max(v, v2);  // applied only when both bounds are given and both parse
+      }
+      std::vector<const char *> enum_allowed;
+      for (auto& k : j.second.enum_allowed) {
+	enum_allowed.push_back(k.c_str());
+      }
+      opt.set_enum_allowed(enum_allowed);
+    }
+  }
+  // force ConfigMonitor to refresh, since it uses const Option *
+  // pointers into our mgr_module_options (which we just rebuilt).
+  mon.configmon()->load_config();
+
+  if (!mon.is_init()) {
+    // feed our pet MgrClient, unless we are in Monitor::[pre]init()
+    prime_mgr_client();
+  }
+}
+
+void MgrMonitor::prime_mgr_client()
+{
+  dout(10) << __func__ << dendl;
+  mon.mgr_client.ms_dispatch2(make_message<MMgrMap>(map));  // hand the current map to the monitor's own mgr_client as an MMgrMap message
+}
+
+void MgrMonitor::create_pending()
+{
+  pending_map = map;    // start the pending map as a copy of the current map...
+  ++pending_map.epoch;  // ...one epoch ahead of it
+}
+
+// Pick the health level for having no active mgr: HEALTH_OK, WARN or ERR.
+health_status_t MgrMonitor::should_warn_about_mgr_down()
+{
+  utime_t now = ceph_clock_now();
+  // we warn if we have osds AND we've exceeded the grace period
+  // which means a new mon cluster can be HEALTH_OK indefinitely as long as
+  // no OSDs are ever created.
+  const bool past_mkfs_grace = mon.osdmon()->osdmap.get_num_osds() > 0 &&
+    now > mon.monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace");
+  if (!past_mkfs_grace) {
+    return HEALTH_OK;
+  }
+  // Escalate to an error once the inactive grace period has also elapsed.
+  const bool inactive_too_long = first_seen_inactive != utime_t() &&
+    now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace");
+  return inactive_too_long ? HEALTH_ERR : HEALTH_WARN;
+}
+
+// While digest_event is set, (re)send digests whenever any paxos service's
+// health checks differ from the copy recorded here on the previous call.
+void MgrMonitor::post_paxos_update()
+{
+  // are we handling digest subscribers?
+  if (!digest_event) {
+    return;
+  }
+  // First call: size the cache to the service list and force a send.
+  bool send = prev_health_checks.empty();
+  if (send) {
+    prev_health_checks.resize(mon.paxos_service.size());
+  }
+  ceph_assert(prev_health_checks.size() == mon.paxos_service.size());
+  for (auto i = 0u; i < prev_health_checks.size(); i++) {
+    const auto& curr = mon.paxos_service[i]->get_health_checks();
+    send = send || (curr != prev_health_checks[i]);
+    prev_health_checks[i] = curr;
+  }
+  if (!send) {
+    return;
+  }
+  if (is_active()) {
+    send_digests();
+    return;
+  }
+  cancel_timer();
+  wait_for_active_ctx(new C_MonContext{&mon, [this](int) { send_digests(); }});
+}
+
+void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)  // stage pending_map and related state into transaction t
+{
+  dout(10) << __func__ << " " << pending_map << dendl;
+  bufferlist bl;
+  pending_map.encode(bl, mon.get_quorum_con_features());  // the feature bits select MgrMap's legacy or current layout
+  put_version(t, pending_map.epoch, bl);
+  put_last_committed(t, pending_map.epoch);
+
+  for (auto& p : pending_metadata) {
+    dout(10) << __func__ << " set metadata for " << p.first << dendl;
+    t->put(MGR_METADATA_PREFIX, p.first, p.second);
+  }
+  for (auto& name : pending_metadata_rm) {
+    dout(10) << __func__ << " rm metadata for " << name << dendl;
+    t->erase(MGR_METADATA_PREFIX, name);
+  }
+  pending_metadata.clear();  // both queues are now staged in t
+  pending_metadata_rm.clear();
+
+  health_check_map_t next;
+  if (pending_map.active_gid == 0) {  // no active mgr in the pending map
+    auto level = should_warn_about_mgr_down();
+    if (level != HEALTH_OK) {
+      next.add("MGR_DOWN", level, "no active mgr", 0);
+    } else {
+      dout(10) << __func__ << " no health warning (never active and new cluster)"
+	       << dendl;
+    }
+  } else {
+    put_value(t, "ever_had_active_mgr", 1);  // read back by update_from_paxos()
+  }
+  encode_health(next, t);
+
+  if (pending_command_descs.size()) {
+    dout(4) << __func__ << " encoding " << pending_command_descs.size()
+            << " command_descs" << dendl;
+    for (auto& p : pending_command_descs) {
+      p.set_flag(MonCommand::FLAG_MGR);
+    }
+    bufferlist bl;
+    encode(pending_command_descs, bl);
+    t->put(command_descs_prefix, "", bl);  // same prefix and empty key that update_from_paxos() loads
+    pending_command_descs.clear();
+  }
+}
+
+// Allow an op only when its session holds MON_CAP_X on "mgr" and fsid
+// matches the monmap's fsid.
+bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
+{
+  // check permissions
+  MonSession *session = op->get_session();
+  if (session == nullptr) return false;
+  if (!session->is_capable("mgr", MON_CAP_X)) {
+    dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
+    return false;
+  }
+  if (fsid != mon.monmap->fsid) {
+    dout(1) << __func__ << " op fsid " << fsid << " != " << mon.monmap->fsid << dendl;
+    return false;
+  }
+  return true;
+}
+
+// Dispatch a query by message type; unknown types are logged and get no reply.
+bool MgrMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+  if (type == MSG_MGR_BEACON) {
+    return preprocess_beacon(op);
+  }
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+// Dispatch an update by message type; unknown types are logged and get no reply.
+bool MgrMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+  if (type == MSG_MGR_BEACON) {
+    return prepare_beacon(op);
+  }
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      // A bad_cmd_get is answered with -EINVAL and the exception's message.
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+
+
+// Completion context: nothing to do on success, mark the op as needing no
+// reply on -ECANCELED, and re-dispatch it on any other error.
+class C_Updated : public Context {
+  MgrMonitor *mm;
+  MonOpRequestRef op;
+public:
+  C_Updated(MgrMonitor *a, MonOpRequestRef c) : mm(a), op(c) {}
+  void finish(int r) override {
+    if (r >= 0) return;        // Success
+    if (r == -ECANCELED) {
+      mm->mon.no_reply(op);
+    } else {
+      mm->dispatch(op);        // try again
+    }
+  }
+};
+
+// Beacons are never answered: those failing check_caps() are dropped here,
+// all others are passed on to prepare_beacon().
+bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMgrBeacon>();
+  mon.no_reply(op); // we never reply to beacons
+  dout(4) << "beacon from " << m->get_gid() << dendl;
+
+  // Return value:
+  //   true  -> drop it on the floor (caps check failed)
+  //   false -> always send this to the leader's prepare_beacon()
+  const bool allowed = check_caps(op, m->get_fsid());
+  return !allowed;
+}
+
+/**
+ * Fold a mgr beacon into pending_map.
+ *
+ * Steps, in order:
+ *  - a beacon carrying the active daemon's name under a different gid
+ *    means the active restarted: the old instance is dropped, which
+ *    requires a writeable osdmon (otherwise the op is parked on the
+ *    osdmon with a C_RetryMessage and we return early);
+ *  - a standby whose name reappears under a different gid is dropped;
+ *  - the beacon's arrival time is recorded in last_beacon;
+ *  - the beacon is then applied as coming from the current active, as
+ *    the new active if the slot is vacant, or as a new/existing standby.
+ *
+ * @return true if pending_map was modified (a C_Updated is then queued
+ *         on the proposal), false if nothing changed or the op was parked
+ */
+bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMgrBeacon>();
+  const auto gid = m->get_gid();
+  dout(4) << "beacon from " << gid << dendl;
+
+  // See if we are seeing same name, new GID for the active daemon
+  if (m->get_name() == pending_map.active_name
+      && gid != pending_map.active_gid)
+  {
+    dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
+    if (!mon.osdmon()->is_writeable()) {
+      dout(1) << __func__ << ":  waiting for osdmon writeable to"
+                 " blocklist old instance." << dendl;
+      mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return false;
+    }
+    // Write the cluster-log entry only once we are actually going to
+    // drop the old instance, so a parked op does not log the same
+    // restart again each time it passes through here.
+    mon.clog->info() << "Active manager daemon " << m->get_name()
+                      << " restarted";
+    drop_active();
+  }
+
+  // See if we are seeing same name, new GID for any standbys
+  for (const auto &i : pending_map.standbys) {
+    const MgrMap::StandbyInfo &s = i.second;
+    if (s.name == m->get_name() && s.gid != gid) {
+      dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
+      mon.clog->debug() << "Standby manager daemon " << m->get_name()
+                         << " restarted";
+      // drop_standby() erases from the map we are iterating, so we must
+      // leave the loop immediately afterwards.
+      drop_standby(i.first);
+      break;
+    }
+  }
+
+  last_beacon[gid] = ceph::coarse_mono_clock::now();
+
+  // Track whether we modified pending_map
+  bool updated = false;
+
+  if (pending_map.active_gid == gid) {
+    // A beacon from the currently active daemon
+    if (pending_map.services != m->get_services()) {
+      dout(4) << "updated services from mgr." << m->get_name()
+              << ": " << m->get_services() << dendl;
+      pending_map.services = m->get_services();
+      updated = true;
+    }
+
+    if (pending_map.active_addrs != m->get_server_addrs()) {
+      dout(4) << "learned address " << m->get_server_addrs()
+              << " (was " << pending_map.active_addrs << ")" << dendl;
+      pending_map.active_addrs = m->get_server_addrs();
+      updated = true;
+    }
+
+    if (pending_map.get_available() != m->get_available()) {
+      dout(4) << "available " << gid << dendl;
+      mon.clog->info() << "Manager daemon " << pending_map.active_name
+                        << " is now available";
+
+      // This beacon should include command descriptions
+      pending_command_descs = m->get_command_descs();
+      if (pending_command_descs.empty()) {
+        // This should not happen, but it also isn't fatal: we just
+        // won't successfully update our list of commands.
+        dout(4) << "First available beacon from " << pending_map.active_name
+                << "(" << gid << ") does not include command descs"
+                << dendl;
+      } else {
+        dout(4) << "First available beacon from " << pending_map.active_name
+                << "(" << gid << ") includes "
+                << pending_command_descs.size() << " command descs" << dendl;
+      }
+
+      pending_map.available = m->get_available();
+      updated = true;
+    }
+    if (pending_map.available_modules != m->get_available_modules()) {
+      dout(4) << "available_modules " << m->get_available_modules()
+              << " (was " << pending_map.available_modules << ")" << dendl;
+      pending_map.available_modules = m->get_available_modules();
+      updated = true;
+    }
+    const auto& clients = m->get_clients();
+    if (pending_map.clients != clients) {
+      dout(4) << "active's RADOS clients " << clients
+              << " (was " << pending_map.clients << ")" << dendl;
+      pending_map.clients = clients;
+      updated = true;
+    }
+  } else if (pending_map.active_gid == 0) {
+    // There is no currently active daemon, select this one.
+    if (pending_map.standbys.count(gid)) {
+      // Keep the metadata: the daemon is only changing role.
+      drop_standby(gid, false);
+    }
+    dout(4) << "selecting new active " << gid
+            << " " << m->get_name()
+            << " (was " << pending_map.active_gid << " "
+            << pending_map.active_name << ")" << dendl;
+    pending_map.active_gid = gid;
+    pending_map.active_name = m->get_name();
+    pending_map.active_change = ceph_clock_now();
+    pending_map.active_mgr_features = m->get_mgr_features();
+    pending_map.available_modules = m->get_available_modules();
+    encode(m->get_metadata(), pending_metadata[m->get_name()]);
+    pending_metadata_rm.erase(m->get_name());
+
+    mon.clog->info() << "Activating manager daemon "
+                      << pending_map.active_name;
+
+    updated = true;
+  } else {
+    // Someone else is active: this beacon comes from a standby.
+    auto standby = pending_map.standbys.find(gid);
+    if (standby != pending_map.standbys.end()) {
+      dout(10) << "from existing standby " << gid << dendl;
+      auto &known_modules = standby->second.available_modules;
+      if (known_modules != m->get_available_modules()) {
+        dout(10) << "existing standby " << gid << " available_modules "
+                 << m->get_available_modules() << " (was "
+                 << known_modules << ")"
+                 << dendl;
+        known_modules = m->get_available_modules();
+        updated = true;
+      }
+    } else {
+      dout(10) << "new standby " << gid << dendl;
+      mon.clog->debug() << "Standby manager daemon " << m->get_name()
+                         << " started";
+      pending_map.standbys[gid] = {gid, m->get_name(),
+                                   m->get_available_modules(),
+                                   m->get_mgr_features()};
+      encode(m->get_metadata(), pending_metadata[m->get_name()]);
+      pending_metadata_rm.erase(m->get_name());
+      updated = true;
+    }
+  }
+
+  if (updated) {
+    dout(4) << "updating map" << dendl;
+    wait_for_finished_proposal(op, new C_Updated(this, op));
+  } else {
+    dout(10) << "no change" << dendl;
+  }
+
+  return updated;
+}
+
+/**
+ * Run check_sub() on every current "mgrmap" subscription.
+ */
+void MgrMonitor::check_subs()
+{
+  const std::string type = "mgrmap";
+  if (mon.session_map.subs.count(type) == 0)
+    return;
+
+  // check_sub() calls remove_sub() on a one-time subscription once it
+  // has been served, i.e. it can remove the element we are visiting.
+  // Collect the subscriptions first so we never advance an iterator of
+  // the live list past a removed entry.
+  // NOTE(review): assumes remove_sub() only invalidates the
+  // subscription passed to it -- confirm against MonSessionMap.
+  std::vector<Subscription*> current;
+  for (auto sub : *(mon.session_map.subs[type])) {
+    current.push_back(sub);
+  }
+  for (auto sub : current) {
+    check_sub(sub);
+  }
+}
+
+/**
+ * Serve a single subscription.
+ *
+ * "mgrmap" subscribers are sent the current map when their `next`
+ * epoch has been reached; a one-time subscription is then removed,
+ * otherwise `next` moves past the current epoch.  Any other
+ * subscription must be "mgrdigest" and is served through the digest
+ * timer.
+ */
+void MgrMonitor::check_sub(Subscription *sub)
+{
+  if (sub->type != "mgrmap") {
+    ceph_assert(sub->type == "mgrdigest");
+    if (sub->next == 0) {
+      // new registration; cancel previous timer
+      cancel_timer();
+    }
+    if (digest_event == nullptr) {
+      send_digests();
+    }
+    return;
+  }
+
+  if (sub->next > map.get_epoch()) {
+    // subscriber is not yet due a map
+    return;
+  }
+
+  dout(20) << "Sending map to subscriber " << sub->session->con
+           << " " << sub->session->con->get_peer_addr() << dendl;
+  sub->session->con->send_message2(make_message<MMgrMap>(map));
+  if (!sub->onetime) {
+    sub->next = map.get_epoch() + 1;
+  } else {
+    mon.session_map.remove_sub(sub);
+  }
+}
+
+/**
+ * Send a digest (health status and mon status, both as JSON) to every
+ * "mgrdigest" subscriber, then re-arm the digest timer.
+ *
+ * Digest subscriptions are handled here, outside of check_sub(),
+ * because they are periodic rather than version-driven.
+ */
+void MgrMonitor::send_digests()
+{
+  cancel_timer();
+
+  const std::string type = "mgrdigest";
+  if (mon.session_map.subs.count(type) == 0) {
+    // nobody is listening: forget state and leave the timer disarmed
+    prev_health_checks.clear();
+    return;
+  }
+
+  // if paxos is currently not active, don't send a digest but still
+  // reenable the timer below
+  if (is_active()) {
+    dout(10) << __func__ << dendl;
+
+    for (auto sub : *(mon.session_map.subs[type])) {
+      dout(10) << __func__ << " sending digest to subscriber " << sub->session->con
+               << " " << sub->session->con->get_peer_addr() << dendl;
+      auto digest = make_message<MMgrDigest>();
+
+      JSONFormatter jf;
+      mon.healthmon()->get_health_status(true, &jf, nullptr, nullptr, nullptr);
+      jf.flush(digest->health_json);
+      jf.reset();
+
+      mon.get_mon_status(&jf);
+      jf.flush(digest->mon_status_json);
+      jf.reset();
+
+      sub->session->con->send_message2(digest);
+    }
+  }
+
+  const auto period = g_conf().get_val<int64_t>("mon_mgr_digest_period");
+  digest_event = mon.timer.add_event_after(
+    period,
+    new C_MonContext{&mon, [this](int) {
+      send_digests();
+  }});
+}
+
+/**
+ * Cancel the pending digest timer event, if one is armed.
+ */
+void MgrMonitor::cancel_timer()
+{
+  if (!digest_event) {
+    return;
+  }
+  mon.timer.cancel_event(digest_event);
+  digest_event = nullptr;
+}
+
+/**
+ * On the leader only: log the current mgrmap to the cluster log and,
+ * when the quorum has the SERVER_NAUTILUS feature, propose an update
+ * if the pending map's always-on modules differ from the wanted set.
+ */
+void MgrMonitor::on_active()
+{
+  if (!mon.is_leader()) {
+    return;
+  }
+  mon.clog->debug() << "mgrmap e" << map.epoch << ": " << map;
+
+  const bool quorum_is_nautilus =
+    HAVE_FEATURE(mon.get_quorum_con_features(), SERVER_NAUTILUS);
+  if (!quorum_is_nautilus) {
+    return;
+  }
+
+  if (!(pending_map.always_on_modules == always_on_modules)) {
+    dout(4) << "always on modules changed, pending "
+            << pending_map.always_on_modules << " != wanted "
+            << always_on_modules << dendl;
+    pending_map.always_on_modules = always_on_modules;
+    propose_pending();
+  }
+}
+
+/**
+ * Periodic housekeeping; does nothing unless this service is active
+ * and this mon is the leader.
+ *
+ *  - Resets all beacon timestamps if the gap since the previous tick
+ *    exceeds (beacon grace - mgr tick period), so that a stalled mon
+ *    does not make every mgr look laggy.
+ *  - Drops standbys whose last beacon is older than the grace period.
+ *  - Drops a laggy active (only when osdmon is writeable) and tries to
+ *    promote a standby, or promotes one if there is no active at all.
+ *  - Proposes when the mkfs-grace warning condition holds, and removes
+ *    the obsolete 'orchestrator_cli' module from the enabled set.
+ *
+ * A proposal is issued at the end if any of the above changed state.
+ */
+void MgrMonitor::tick()
+{
+  if (!is_active() || !mon.is_leader())
+    return;
+
+  const auto now = ceph::coarse_mono_clock::now();
+
+  const auto mgr_beacon_grace =
+      g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace");
+
+  // Note that this is the mgr daemon's tick period, not ours (the
+  // beacon is sent with this period).
+  const auto mgr_tick_period =
+      g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
+
+  if (last_tick != ceph::coarse_mono_clock::time_point::min()
+      && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
+    // This case handles either local slowness (calls being delayed
+    // for whatever reason) or cluster election slowness (a long gap
+    // between calls while an election happened)
+    dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
+            "(slow election?) of " << now - last_tick << " seconds" << dendl;
+    for (auto &i : last_beacon) {
+      i.second = now;
+    }
+  }
+
+  last_tick = now;
+
+  // Populate any missing beacons (i.e. no beacon since MgrMonitor
+  // instantiation) with the current time, so that they will
+  // eventually look laggy if they fail to give us a beacon.
+  // emplace() leaves an existing timestamp untouched.
+  if (pending_map.active_gid != 0) {
+    last_beacon.emplace(pending_map.active_gid, now);
+  }
+  for (const auto &s : pending_map.standbys) {
+    last_beacon.emplace(s.first, now);
+  }
+
+  // Cull standbys first so that any remaining standbys
+  // will be eligible to take over from the active if we cull him.
+  // Collect the gids first: drop_standby() erases from the map we
+  // would otherwise be iterating.
+  std::vector<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
+  for (const auto &i : pending_map.standbys) {
+    if (last_beacon.at(i.first) < cutoff) {
+      dead_standbys.push_back(i.first);
+    }
+  }
+
+  bool propose = false;
+
+  for (const auto gid : dead_standbys) {
+    dout(4) << "Dropping laggy standby " << gid << dendl;
+    drop_standby(gid);
+    propose = true;
+  }
+
+  if (pending_map.active_gid != 0
+      && last_beacon.at(pending_map.active_gid) < cutoff
+      && mon.osdmon()->is_writeable()) {
+    const std::string old_active_name = pending_map.active_name;
+    // Log before dropping: drop_active() resets active_gid to 0.
+    dout(4) << "Dropping active " << pending_map.active_gid << dendl;
+    drop_active();
+    propose = true;
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                        << " is unresponsive, replacing it with standby"
+                        << " daemon " << pending_map.active_name;
+    } else {
+      dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                        << " is unresponsive.  No standby daemons available.";
+    }
+  } else if (pending_map.active_gid == 0) {
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Activating manager daemon "
+                      << pending_map.active_name;
+      propose = true;
+    }
+  }
+
+  if (!pending_map.available &&
+      !ever_had_active_mgr &&
+      should_warn_about_mgr_down() != HEALTH_OK) {
+    dout(10) << " exceeded mon_mgr_mkfs_grace "
+             << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
+             << " seconds" << dendl;
+    propose = true;
+  }
+
+  // obsolete modules?
+  if (mon.monmap->min_mon_release >= ceph_release_t::octopus &&
+      pending_map.module_enabled("orchestrator_cli")) {
+    dout(10) << " disabling obsolete/renamed 'orchestrator_cli'" << dendl;
+    // we don't need to enable 'orchestrator' because it's now always-on
+    pending_map.modules.erase("orchestrator_cli");
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+}
+
+/**
+ * Reset the leader-specific state: restart the tick clock from now and
+ * forget every recorded beacon time.
+ */
+void MgrMonitor::on_restart()
+{
+  last_tick = ceph::coarse_mono_clock::now();
+  last_beacon.clear();
+}
+
+
+/**
+ * Make a standby the active daemon in pending_map.  Must only be
+ * called while there is no active daemon (asserted).
+ *
+ * The promoted daemon starts out unavailable and with no addresses.
+ *
+ * @return true if a standby was promoted, false if there were none
+ */
+bool MgrMonitor::promote_standby()
+{
+  ceph_assert(pending_map.active_gid == 0);
+  if (pending_map.standbys.empty()) {
+    return false;
+  }
+
+  // Promote a replacement (arbitrary choice of standby: the first entry)
+  const auto chosen = pending_map.standbys.begin();
+  const auto replacement_gid = chosen->first;
+  const auto &info = chosen->second;
+
+  pending_map.active_gid = replacement_gid;
+  pending_map.active_name = info.name;
+  pending_map.available_modules = info.available_modules;
+  pending_map.active_mgr_features = info.mgr_features;
+  pending_map.available = false;
+  pending_map.active_addrs = entity_addrvec_t();
+  pending_map.active_change = ceph_clock_now();
+
+  // `info` is dead after this call; the metadata is kept because the
+  // daemon is only changing role.
+  drop_standby(replacement_gid, false);
+
+  return true;
+}
+
+/**
+ * Remove the active daemon from pending_map.
+ *
+ * Requires a writeable osdmon and an active daemon (both asserted).
+ * The old instance's addresses and its RADOS clients are blocklisted
+ * for mon_mgr_blocklist_interval, an osdmon proposal is requested, the
+ * daemon's metadata is queued for removal, and every active-related
+ * field of pending_map is reset.
+ */
+void MgrMonitor::drop_active()
+{
+  ceph_assert(mon.osdmon()->is_writeable());
+
+  // erase-by-key is a no-op when no beacon was recorded for this gid
+  last_beacon.erase(pending_map.active_gid);
+
+  ceph_assert(pending_map.active_gid > 0);
+  auto expiry = ceph_clock_now();
+  expiry += g_conf().get_val<double>("mon_mgr_blocklist_interval");
+  dout(5) << "blocklisting previous mgr." << pending_map.active_name << "."
+          << pending_map.active_gid << " ("
+          << pending_map.active_addrs << ")" << dendl;
+  auto osd_epoch = mon.osdmon()->blocklist(pending_map.active_addrs, expiry);
+
+  /* blocklist RADOS clients in use by the mgr */
+  for (const auto& client_addr : pending_map.clients) {
+    mon.osdmon()->blocklist(client_addr, expiry);
+  }
+  request_proposal(mon.osdmon());
+
+  // Metadata bookkeeping needs the name, so do it before clearing it.
+  pending_metadata_rm.insert(pending_map.active_name);
+  pending_metadata.erase(pending_map.active_name);
+
+  pending_map.active_gid = 0;
+  pending_map.active_name = "";
+  pending_map.active_addrs = entity_addrvec_t();
+  pending_map.active_mgr_features = 0;
+  pending_map.active_change = ceph_clock_now();
+  pending_map.available = false;
+  pending_map.services.clear();
+  pending_map.clients.clear();
+  pending_map.last_failure_osd_epoch = osd_epoch;
+
+  // So that when new active mgr subscribes to mgrdigest, it will
+  // get an immediate response instead of waiting for next timer
+  cancel_timer();
+}
+
+/**
+ * Remove `gid` from the standbys in pending_map and forget its last
+ * beacon time.
+ *
+ * @param gid        standby to remove; an unknown gid only clears any
+ *                   beacon record and otherwise changes nothing
+ * @param drop_meta  when true, also queue the daemon's metadata for
+ *                   removal and discard any pending metadata for it
+ */
+void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
+{
+  // Look the standby up once.  Using operator[] here would
+  // default-insert an entry for an unknown gid and then queue an empty
+  // daemon name for metadata removal.
+  auto standby = pending_map.standbys.find(gid);
+  if (standby != pending_map.standbys.end()) {
+    if (drop_meta) {
+      const std::string &name = standby->second.name;
+      pending_metadata_rm.insert(name);
+      pending_metadata.erase(name);
+    }
+    pending_map.standbys.erase(standby);
+  }
+  last_beacon.erase(gid);
+}
+
+/**
+ * Handle the mgr commands that only read the current map:
+ * "mgr stat", "mgr dump", "mgr module ls", "mgr services",
+ * "mgr metadata", "mgr versions" and "mgr count-metadata".
+ *
+ * A malformed command is answered with -EINVAL and a command without a
+ * session with -EACCES.
+ *
+ * @return true if the command was answered here, false if the prefix
+ *         is not one of the commands above
+ */
+bool MgrMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata,
+                      get_last_committed());
+    return true;
+  }
+
+  string format;
+  cmd_getval(cmdmap, "format", format);
+  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+                                                   "json-pretty"));
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  int r = 0;
+
+  if (prefix == "mgr stat") {
+    f->open_object_section("stat");
+    f->dump_unsigned("epoch", map.get_epoch());
+    f->dump_bool("available", map.get_available());
+    f->dump_string("active_name", map.get_active_name());
+    f->dump_unsigned("num_standby", map.get_num_standby());
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "mgr dump") {
+    const auto current_epoch = static_cast<int64_t>(map.get_epoch());
+    int64_t epoch = 0;
+    cmd_getval(cmdmap, "epoch", epoch, current_epoch);
+    if (epoch == current_epoch) {
+      f->dump_object("mgrmap", map);
+    } else {
+      bufferlist bl;
+      int err = get_version(epoch, bl);
+      if (err == -ENOENT) {
+        r = -ENOENT;
+        ss << "there is no map for epoch " << epoch;
+        goto reply;
+      }
+      if (err < 0) {
+        // Any other read failure: report it rather than going on to
+        // decode a buffer that was never filled in.
+        r = err;
+        ss << "failed to load map for epoch " << epoch
+           << " (error " << err << ")";
+        goto reply;
+      }
+      MgrMap old_map;
+      auto p = bl.cbegin();
+      old_map.decode(p);
+      f->dump_object("mgrmap", old_map);
+    }
+    f->flush(rdata);
+  } else if (prefix == "mgr module ls") {
+    f->open_object_section("modules");
+    {
+      f->open_array_section("always_on_modules");
+      for (auto& p : map.get_always_on_modules()) {
+        f->dump_string("module", p);
+      }
+      f->close_section();
+      f->open_array_section("enabled_modules");
+      for (auto& p : map.modules) {
+        if (map.get_always_on_modules().count(p) > 0)
+          continue;
+        // We only show the name for enabled modules.  Any errors
+        // etc. will show up as health checks.
+        f->dump_string("module", p);
+      }
+      f->close_section();
+      f->open_array_section("disabled_modules");
+      for (auto& p : map.available_modules) {
+        if (map.modules.count(p.name) == 0 &&
+            map.get_always_on_modules().count(p.name) == 0) {
+          // For disabled modules, we show the full info, to
+          // give a hint about whether enabling it will work
+          p.dump(f.get());
+        }
+      }
+      f->close_section();
+    }
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "mgr services") {
+    f->open_object_section("services");
+    for (const auto &i : map.services) {
+      f->dump_string(i.first.c_str(), i.second);
+    }
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "mgr metadata") {
+    string name;
+    cmd_getval(cmdmap, "who", name);
+    if (name.size() > 0 && !map.have_name(name)) {
+      ss << "mgr." << name << " does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    // The formatter created above is still unused at this point, so
+    // there is no need to build a second one for this command.
+    if (name.size()) {
+      f->open_object_section("mgr_metadata");
+      f->dump_string("name", name);
+      r = dump_metadata(name, f.get(), &ss);
+      if (r < 0)
+        goto reply;
+      f->close_section();
+    } else {
+      r = 0;
+      f->open_array_section("mgr_metadata");
+      for (auto& i : map.get_all_names()) {
+        f->open_object_section("mgr");
+        f->dump_string("name", i);
+        r = dump_metadata(i, f.get(), NULL);
+        if (r == -EINVAL || r == -ENOENT) {
+          // Drop error, continue to get other daemons' metadata
+          dout(4) << "No metadata for mgr." << i << dendl;
+          r = 0;
+        } else if (r < 0) {
+          // Unexpected error
+          goto reply;
+        }
+        f->close_section();
+      }
+      f->close_section();
+    }
+    f->flush(rdata);
+  } else if (prefix == "mgr versions") {
+    count_metadata("ceph_version", f.get());
+    f->flush(rdata);
+    r = 0;
+  } else if (prefix == "mgr count-metadata") {
+    string field;
+    cmd_getval(cmdmap, "property", field);
+    count_metadata(field, f.get());
+    f->flush(rdata);
+    r = 0;
+  } else {
+    return false;
+  }
+
+reply:
+  // Only the first line of the status stream is sent back.
+  string rs;
+  getline(ss, rs);
+  mon.reply_command(op, r, rs, rdata, get_last_committed());
+  return true;
+}
+
+/**
+ * Handle the mgr commands that modify pending_map:
+ * "mgr fail", "mgr module enable" and "mgr module disable".
+ *
+ * A malformed command is answered at once with -EINVAL, a command
+ * without a session with -EACCES (both return true).  "mgr fail" on
+ * the active daemon needs a writeable osdmon; if it is not, the op is
+ * parked with a C_RetryMessage and false is returned without replying.
+ *
+ * @return true when the reply is deferred until the proposal finishes
+ *         (r >= 0), false when an error was replied immediately or the
+ *         op was parked
+ */
+bool MgrMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  int r = 0;
+
+  if (prefix == "mgr fail") {
+    string who;
+    if (!cmd_getval(cmdmap, "who", who)) {
+      // No target given: default to the current active daemon
+      if (!map.active_gid) {
+        ss << "Currently no active mgr";
+        goto out;
+      }
+      who = map.active_name;
+    }
+
+    // gids are 64-bit, so parse with the long long variant; the int
+    // variant would reject a valid gid above INT_MAX and send it down
+    // the by-name path.  As before, an argument that parses as a
+    // number is always treated as a gid, never as a name.
+    std::string err;
+    uint64_t gid = strict_strtoll(who.c_str(), 10, &err);
+    bool changed = false;
+    if (!err.empty()) {
+      // Does not parse as a gid, treat it as a name
+      if (pending_map.active_name == who) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;
+        }
+        drop_active();
+        changed = true;
+      } else {
+        gid = 0;
+        for (const auto &i : pending_map.standbys) {
+          if (i.second.name == who) {
+            gid = i.first;
+            break;
+          }
+        }
+        if (gid != 0) {
+          drop_standby(gid);
+          changed = true;
+        } else {
+          ss << "Daemon not found '" << who << "', already failed?";
+        }
+      }
+    } else {
+      if (pending_map.active_gid == gid) {
+        if (!mon.osdmon()->is_writeable()) {
+          mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+          return false;
+        }
+        drop_active();
+        changed = true;
+      } else if (pending_map.standbys.count(gid) > 0) {
+        drop_standby(gid);
+        changed = true;
+      } else {
+        ss << "Daemon not found '" << gid << "', already failed?";
+      }
+    }
+
+    // If we just removed the active, try to fill the slot right away
+    if (changed && pending_map.active_gid == 0) {
+      promote_standby();
+    }
+  } else if (prefix == "mgr module enable") {
+    string module;
+    cmd_getval(cmdmap, "module", module);
+    if (module.empty()) {
+      r = -EINVAL;
+      goto out;
+    }
+    if (pending_map.get_always_on_modules().count(module) > 0) {
+      ss << "module '" << module << "' is already enabled (always-on)";
+      goto out;
+    }
+    string force;
+    cmd_getval(cmdmap, "force", force);
+    if (!pending_map.all_support_module(module) &&
+        force != "--force") {
+      ss << "all mgr daemons do not support module '" << module << "', pass "
+         << "--force to force enablement";
+      r = -ENOENT;
+      goto out;
+    }
+
+    std::string can_run_error;
+    if (force != "--force" && !pending_map.can_run_module(module, &can_run_error)) {
+      ss << "module '" << module << "' reports that it cannot run on the active "
+            "manager daemon: " << can_run_error << " (pass --force to force "
+            "enablement)";
+      r = -ENOENT;
+      goto out;
+    }
+
+    if (pending_map.module_enabled(module)) {
+      ss << "module '" << module << "' is already enabled";
+      r = 0;
+      goto out;
+    }
+    pending_map.modules.insert(module);
+  } else if (prefix == "mgr module disable") {
+    string module;
+    cmd_getval(cmdmap, "module", module);
+    if (module.empty()) {
+      r = -EINVAL;
+      goto out;
+    }
+    if (pending_map.get_always_on_modules().count(module) > 0) {
+      ss << "module '" << module << "' cannot be disabled (always-on)";
+      r = -EINVAL;
+      goto out;
+    }
+    if (!pending_map.module_enabled(module)) {
+      ss << "module '" << module << "' is already disabled";
+      r = 0;
+      goto out;
+    }
+    if (!pending_map.modules.count(module)) {
+      ss << "module '" << module << "' is not enabled";
+    }
+    pending_map.modules.erase(module);
+  } else {
+    ss << "Command '" << prefix << "' not implemented!";
+    r = -ENOSYS;
+  }
+
+out:
+  dout(4) << __func__ << " done, r=" << r << dendl;
+  /* Compose response (only the first line of the stream is used) */
+  string rs;
+  getline(ss, rs);
+
+  if (r >= 0) {
+    // success.. delay reply
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
+                                              get_last_committed() + 1));
+    return true;
+  } else {
+    // reply immediately
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return false;
+  }
+}
+
+/**
+ * Start the digest cycle unless a digest timer event is already armed.
+ */
+void MgrMonitor::init()
+{
+  if (digest_event != nullptr) {
+    return;
+  }
+  send_digests();  // To get it to schedule its own event
+}
+
+/**
+ * Shutdown hook: cancel the digest timer so that no further
+ * send_digests() call is scheduled.
+ */
+void MgrMonitor::on_shutdown()
+{
+  cancel_timer();
+}
+
+/**
+ * Read the stored metadata of mgr daemon `name` into `m`.
+ *
+ * @param name  daemon name, used as the key under MGR_METADATA_PREFIX
+ * @param m     receives the decoded key/value pairs
+ * @param err   optional stream for a message when decoding fails
+ * @return 0 on success, the store's negative error if the read fails,
+ *         -EIO if the stored blob cannot be decoded
+ */
+int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
+                              ostream *err) const
+{
+  bufferlist raw;
+  const int rc = mon.store->get(MGR_METADATA_PREFIX, name, raw);
+  if (rc < 0) {
+    return rc;
+  }
+
+  try {
+    auto it = raw.cbegin();
+    decode(m, it);
+    return 0;
+  } catch (ceph::buffer::error& e) {
+    if (err != nullptr) {
+      *err << "mgr." << name << " metadata is corrupt";
+    }
+  }
+  return -EIO;
+}
+
+/**
+ * Tally the mgr daemons in the map by the value of one metadata field.
+ *
+ * Daemons whose metadata lacks the field are counted under "unknown".
+ * That includes daemons whose metadata could not be loaded, provided
+ * load_metadata() left the map empty.
+ *
+ * @param field  metadata key to group by
+ * @param out    value -> number of daemons; counts are added to
+ *               whatever is already there
+ */
+void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
+{
+  const std::set<string> names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    load_metadata(daemon, meta, nullptr);
+    const auto found = meta.find(field);
+    const string bucket =
+      (found == meta.end()) ? string("unknown") : found->second;
+    ++(*out)[bucket];
+  }
+}
+
+/**
+ * Dump the per-value daemon counts for `field` as an object section
+ * named after the field, with one integer entry per distinct value.
+ */
+void MgrMonitor::count_metadata(const string& field, Formatter *f)
+{
+  std::map<string,int> tally;
+  count_metadata(field, &tally);
+
+  f->open_object_section(field.c_str());
+  for (const auto& entry : tally) {
+    const string& value = entry.first;
+    f->dump_int(value.c_str(), entry.second);
+  }
+  f->close_section();
+}
+
+/**
+ * Add every mgr daemon to `versions`, grouped by the
+ * "ceph_version_short" value in its metadata.  Daemons are listed as
+ * "mgr.<name>"; those without that field are left out.
+ */
+void MgrMonitor::get_versions(std::map<string, list<string> > &versions)
+{
+  const std::set<string> names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    load_metadata(daemon, meta, nullptr);
+    const auto ver = meta.find("ceph_version_short");
+    if (ver != meta.end()) {
+      versions[ver->second].push_back(string("mgr.") + daemon);
+    }
+  }
+}
+
+/**
+ * Dump each metadata key/value of daemon `name` as a string field on
+ * `f`.  Nothing is dumped if loading fails.
+ *
+ * @param err  passed on to load_metadata() for its error message
+ * @return 0 on success, otherwise the non-zero result of load_metadata()
+ */
+int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
+{
+  std::map<string,string> meta;
+  const int rc = load_metadata(name, meta, err);
+  if (rc != 0) {
+    return rc;
+  }
+  for (const auto& kv : meta) {
+    f->dump_string(kv.first.c_str(), kv.second);
+  }
+  return 0;
+}
+
+/**
+ * Group the mgr daemons by the "hostname" entry of their metadata and
+ * hand the result to dump_services() under the type "mgr".  Daemons
+ * whose metadata cannot be loaded or has no hostname are left out.
+ */
+void MgrMonitor::print_nodes(Formatter *f) const
+{
+  ceph_assert(f);
+
+  std::map<string, list<string> > by_host; // hostname => mgr
+  const auto names = map.get_all_names();
+  for (const auto& daemon : names) {
+    std::map<string,string> meta;
+    if (load_metadata(daemon, meta, nullptr) != 0) {
+      continue;
+    }
+    const auto host = meta.find("hostname");
+    if (host != meta.end()) {
+      by_host[host->second].push_back(daemon);
+    }
+    // a daemon without a hostname is not likely, and is just skipped
+  }
+
+  dump_services(f, by_host, "mgr");
+}
+
+/**
+ * @return the command descriptions held in command_descs, or the
+ *         static mgr_commands table when none are held
+ */
+const std::vector<MonCommand> &MgrMonitor::get_command_descs() const
+{
+  if (!command_descs.empty()) {
+    return command_descs;
+  }
+  // must have just upgraded; fallback to static commands
+  return mgr_commands;
+}
diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h
new file mode 100644
index 000000000..be75602ab
--- /dev/null
+++ b/src/mon/MgrMonitor.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_MGRMONITOR_H
+#define CEPH_MGRMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/Context.h"
+#include "MgrMap.h"
+#include "PaxosService.h"
+#include "MonCommand.h"
+
+class MgrMonitor: public PaxosService  // PaxosService that maintains the MgrMap (active and standby mgr daemons)
+{
+  MgrMap map;  // map that commands and subscribers are served from
+  MgrMap pending_map;  // working copy changed by beacons, commands and tick()
+  bool ever_had_active_mgr = false;  // read by tick() when deciding whether the mkfs grace warning applies
+
+  std::map<std::string, ceph::buffer::list> pending_metadata;  // daemon name -> encoded metadata queued for storing
+  std::set<std::string> pending_metadata_rm;  // daemon names whose stored metadata is queued for removal
+
+  std::map<std::string,Option> mgr_module_options;
+  std::list<std::string> misc_option_strings;
+
+  utime_t first_seen_inactive;
+
+  std::map<uint64_t, ceph::coarse_mono_clock::time_point> last_beacon;  // gid -> time its last beacon was handled (tick() fills gaps with "now")
+
+  /**
+   * If a standby is available, make it active, given that
+   * there is currently no active daemon (this is asserted).
+   *
+   * @return true if a standby was promoted
+   */
+  bool promote_standby();
+  void drop_active();  // blocklist and remove the active daemon; asserts that osdmon is writeable
+
+  /**
+   * Remove this gid from the list of standbys.  By default,
+   * also remove metadata (i.e. forget the daemon entirely).
+   *
+   * Set `drop_meta` to false if you would like to keep
+   * the daemon's metadata, for example if you're dropping
+   * it as a standby before reinstating it as the active daemon.
+   */
+  void drop_standby(uint64_t gid, bool drop_meta=true);
+
+  Context *digest_event = nullptr;  // armed send_digests() timer event, nullptr when none is scheduled
+  void cancel_timer();  // cancel digest_event if armed and reset it to nullptr
+
+  std::vector<health_check_map_t> prev_health_checks;  // cleared by send_digests() when there are no mgrdigest subscribers
+
+  bool check_caps(MonOpRequestRef op, const uuid_d& fsid);
+
+  health_status_t should_warn_about_mgr_down();
+
+  // Command descriptions we've learned from the active mgr
+  std::vector<MonCommand> command_descs;
+  std::vector<MonCommand> pending_command_descs;  // taken from the beacon in which the active's availability changes
+
+public:
+  MgrMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name)
+  {}
+  ~MgrMonitor() override {}
+
+  void init() override;  // starts the digest cycle if no timer event is armed
+  void on_shutdown() override;  // cancels the digest timer
+
+  const MgrMap &get_map() const { return map; }
+
+  const std::map<std::string,Option>& get_mgr_module_options() {
+    return mgr_module_options;
+  }
+  const Option *find_module_option(const std::string& name);
+
+  bool in_use() const { return map.epoch > 0; }  // true once the map has a non-zero epoch
+
+  version_t get_trim_to() const override;
+
+  void prime_mgr_client();
+
+  void create_initial() override;
+  void get_store_prefixes(std::set<std::string>& s) const override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void post_paxos_update() override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool preprocess_command(MonOpRequestRef op);  // read-only mgr commands; false if the prefix is not handled
+  bool prepare_command(MonOpRequestRef op);  // "mgr fail" and "mgr module enable/disable"
+
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+  bool preprocess_beacon(MonOpRequestRef op);  // caps check; true means the beacon was dropped
+  bool prepare_beacon(MonOpRequestRef op);  // apply a beacon to pending_map; true if it changed
+
+  void check_sub(Subscription *sub);  // serve one "mgrmap" or "mgrdigest" subscription
+  void check_subs();  // check_sub() for every "mgrmap" subscription
+  void send_digests();  // send digests to "mgrdigest" subscribers and re-arm the timer
+
+  void on_active() override;
+  void on_restart() override;  // clears last_beacon and resets last_tick
+
+  void tick() override;  // leader only: cull laggy daemons and promote standbys
+
+  void print_summary(ceph::Formatter *f, std::ostream *ss) const;
+
+  const std::vector<MonCommand> &get_command_descs() const;  // command_descs, or the static mgr_commands when empty
+
+  int load_metadata(const std::string& name, std::map<std::string, std::string>& m,
+		    std::ostream *err) const;
+  int dump_metadata(const std::string& name, ceph::Formatter *f, std::ostream *err);
+  void print_nodes(ceph::Formatter *f) const;  // mgr daemons grouped by the "hostname" metadata entry
+  void count_metadata(const std::string& field, ceph::Formatter *f);
+  void count_metadata(const std::string& field, std::map<std::string,int> *out);  // daemons lacking `field` count as "unknown"
+  void get_versions(std::map<std::string, std::list<std::string>> &versions);  // "ceph_version_short" -> list of "mgr.<name>"
+
+  // When did the mon last call into our tick() method?  Used for detecting
+  // when the mon was not updating us for some period (e.g. during slow
+  // election) to reset last_beacon timeouts
+  ceph::coarse_mono_clock::time_point last_tick;
+};
+
+#endif
diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc
new file mode 100644
index 000000000..9da4c50da
--- /dev/null
+++ b/src/mon/MgrStatMonitor.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MgrStatMonitor.h"
+#include "mon/OSDMonitor.h"
+#include "mon/MgrMonitor.h"
+#include "mon/PGMap.h"
+#include "messages/MGetPoolStats.h"
+#include "messages/MGetPoolStatsReply.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MStatfs.h"
+#include "messages/MStatfsReply.h"
+#include "messages/MServiceMap.h"
+
+#include "include/ceph_assert.h"	// re-clobber assert
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon)
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Write the per-line debug prefix for this service,
+// "mon.<name>@<rank>(<state>).mgrstat ", to *_dout and return that stream.
+// Referenced through the dout_prefix macro defined above.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
+  ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").mgrstat ";
+  return out;
+}
+
+// Construct the mgrstat service; every argument is handed straight to the
+// PaxosService base, no additional state is set up here.
+MgrStatMonitor::MgrStatMonitor(Monitor &mn,
+                               Paxos &p,
+                               const string& service_name)
+  : PaxosService(mn, p, service_name) {}
+
+// Nothing to release explicitly; members clean up after themselves.
+MgrStatMonitor::~MgrStatMonitor() = default;
+
+// Set up the very first state: version 0 and a service map at epoch 1 stamped
+// with the current time.  The service map is encoded (with CEPH_FEATURES_ALL)
+// into pending_service_map_bl so that encode_pending(), which asserts that
+// buffer is non-empty, has something to append.
+void MgrStatMonitor::create_initial()
+{
+  dout(10) << __func__ << dendl;
+
+  version = 0;
+
+  service_map.epoch = 1;
+  service_map.modified = ceph_clock_now();
+
+  pending_service_map_bl.clear();
+  encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL);
+}
+
+// Refresh the live state from the last committed version: reload health,
+// decode digest / service map / progress events from the stored blob, then
+// notify subscribers, update perf counters and tell the OSD monitor that a
+// new PG digest is available.  need_bootstrap is not touched here.
+void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  version = get_last_committed();
+  dout(10) << " " << version << dendl;
+  load_health();
+
+  bufferlist stored;
+  get_version(version, stored);
+  if (version != 0) {
+    // a committed version must always have a payload
+    ceph_assert(stored.length());
+    try {
+      auto it = stored.cbegin();
+      decode(digest, it);
+      decode(service_map, it);
+      // the blob may end right after the service map (presumably an older
+      // encoding); in that case progress_events is left as it was
+      const bool has_progress_events = !it.end();
+      if (has_progress_events) {
+        decode(progress_events, it);
+      }
+      dout(10) << __func__ << " v" << version
+               << " service_map e" << service_map.epoch
+               << " " << progress_events.size() << " progress events"
+               << dendl;
+    } catch (ceph::buffer::error& e) {
+      // decode failures are logged and otherwise ignored
+      derr << "failed to decode mgrstat state; luminous dev version? "
+           << e.what() << dendl;
+    }
+  }
+
+  check_subs();
+  update_logger();
+  mon.osdmon()->notify_new_pg_digest();
+}
+
+// Publish the current digest through the monitor's cluster perf counters:
+// raw OSD capacity, pool / PG counts, PG state counts and object totals.
+void MgrStatMonitor::update_logger()
+{
+  dout(20) << __func__ << dendl;
+
+  auto& logger = mon.cluster_logger;
+
+  // raw capacity as summed over all OSDs
+  auto& osd_statfs = digest.osd_sum.statfs;
+  logger->set(l_cluster_osd_bytes, osd_statfs.total);
+  logger->set(l_cluster_osd_bytes_used, osd_statfs.get_used_raw());
+  logger->set(l_cluster_osd_bytes_avail, osd_statfs.available);
+
+  // pool and PG totals
+  logger->set(l_cluster_num_pool, digest.pg_pool_sum.size());
+  uint64_t pg_total = 0;
+  for (const auto& per_pool : digest.num_pg_by_pool) {
+    pg_total += per_pool.second;
+  }
+  logger->set(l_cluster_num_pg, pg_total);
+
+  // the map key is a bitmask of PG states, the value the number of PGs in
+  // exactly that combination
+  unsigned active = 0, active_clean = 0, peering = 0;
+  for (const auto& by_state : digest.num_pg_by_state) {
+    const auto state = by_state.first;
+    const auto count = by_state.second;
+    if (state & PG_STATE_PEERING) {
+      peering += count;
+    }
+    if (!(state & PG_STATE_ACTIVE)) {
+      continue;
+    }
+    active += count;
+    if (state & PG_STATE_CLEAN) {
+      active_clean += count;
+    }
+  }
+  logger->set(l_cluster_num_pg_active_clean, active_clean);
+  logger->set(l_cluster_num_pg_active, active);
+  logger->set(l_cluster_num_pg_peering, peering);
+
+  // object and byte totals over all PGs
+  auto& totals = digest.pg_sum.stats.sum;
+  logger->set(l_cluster_num_object, totals.num_objects);
+  logger->set(l_cluster_num_object_degraded, totals.num_objects_degraded);
+  logger->set(l_cluster_num_object_misplaced, totals.num_objects_misplaced);
+  logger->set(l_cluster_num_object_unfound, totals.num_objects_unfound);
+  logger->set(l_cluster_num_bytes, totals.num_bytes);
+}
+
+// Start a new pending state from what is currently live: copy the digest and
+// health checks and re-encode the service map with the quorum's connection
+// features.  pending_progress_events is not reset here.
+void MgrStatMonitor::create_pending()
+{
+  dout(10) << " " << version << dendl;
+
+  pending_digest = digest;
+  pending_health_checks = get_health_checks();
+
+  pending_service_map_bl.clear();
+  encode(service_map, pending_service_map_bl,
+         mon.get_quorum_con_features());
+}
+
+// Serialize the pending state into transaction t as the next version.  The
+// blob layout is: digest, service map (already encoded), progress events --
+// the same order update_from_paxos() decodes them in.  Health checks are
+// written separately through encode_health().
+void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  ++version;
+  dout(10) << " " << version << dendl;
+
+  bufferlist payload;
+  encode(pending_digest, payload, mon.get_quorum_con_features());
+  // create_initial() / create_pending() must have produced a service map blob
+  ceph_assert(pending_service_map_bl.length());
+  payload.append(pending_service_map_bl);
+  encode(pending_progress_events, payload);
+
+  put_version(t, version, payload);
+  put_last_committed(t, version);
+
+  encode_health(pending_health_checks, t);
+}
+
+// Version up to which old states may be trimmed: everything except the five
+// most recent, or 0 while there are five or fewer.
+version_t MgrStatMonitor::get_trim_to() const
+{
+  // we don't actually need *any* old states, but keep a few.
+  const version_t keep = 5;
+  return version > keep ? version - keep : 0;
+}
+
+// on_active hook: re-publish the cluster perf counters from the current digest.
+void MgrStatMonitor::on_active() { update_logger(); }
+
+// tick hook: this service has no periodic work to do.
+void MgrStatMonitor::tick() {}
+
+// Read-only dispatch by message type.  Returns whatever the per-type handler
+// returns; unknown types are logged, marked as not needing a reply and
+// reported as handled (true).
+bool MgrStatMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+
+  if (type == CEPH_MSG_STATFS) {
+    return preprocess_statfs(op);
+  }
+  if (type == MSG_MON_MGR_REPORT) {
+    return preprocess_report(op);
+  }
+  if (type == MSG_GETPOOLSTATS) {
+    return preprocess_getpoolstats(op);
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+// Update dispatch by message type.  Only mgr reports modify state; any other
+// type is logged, marked as not needing a reply and reported as handled.
+bool MgrStatMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto type = m->get_type();
+
+  if (type == MSG_MON_MGR_REPORT) {
+    return prepare_report(op);
+  }
+
+  mon.no_reply(op);
+  derr << "Unhandled message type " << type << dendl;
+  return true;
+}
+
+// Filter incoming mgr reports.  Reports never get a reply.  A report whose
+// gid is non-zero and differs from the active mgr's gid is dropped (returns
+// true); everything else proceeds to prepare_report() (returns false).
+bool MgrStatMonitor::preprocess_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  mon.no_reply(op);
+
+  // gid 0 is accepted without consulting the mgr map
+  const bool accept =
+    !m->gid || m->gid == mon.mgrmon()->get_map().get_active_gid();
+  if (accept) {
+    return false;
+  }
+
+  dout(10) << "ignoring report from non-active mgr " << m->gid
+           << dendl;
+  return true;
+}
+
+// Stage the contents of a mgr report as the pending state: decode the PG
+// digest from the message data and take over (by swap) the health checks,
+// the encoded service map (only if the report carries one) and the progress
+// events.  The staged state is then logged, in summary at level 10 and as
+// pretty-printed JSON at level 20.  Always returns true.
+bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  bufferlist bl = m->get_data();
+  auto p = bl.cbegin();
+  decode(pending_digest, p);
+  pending_health_checks.swap(m->health_checks);
+  // only replace the staged service map when the report actually has one
+  if (m->service_map_bl.length()) {
+    pending_service_map_bl.swap(m->service_map_bl);
+  }
+  pending_progress_events.swap(m->progress_events);
+  // summarize what was just staged; the progress event count must come from
+  // the pending map, not from the previously committed progress_events
+  dout(10) << __func__ << " " << pending_digest << ", "
+	   << pending_health_checks.checks.size() << " health checks, "
+	   << pending_progress_events.size() << " progress events" << dendl;
+  // NOTE(review): each dout(20) ... dendl pair below spans several
+  // statements, re-declares `jf` and writes to `_dout`; this presumably
+  // relies on the dout()/dendl macros opening and closing a scope -- confirm
+  // against the macro definitions before restructuring.
+  dout(20) << "pending_digest:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("pending_digest");
+  pending_digest.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "health checks:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("health_checks");
+  pending_health_checks.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "progress events:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("progress_events");
+  for (auto& i : pending_progress_events) {
+    jf.dump_object(i.first.c_str(), i.second);
+  }
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  return true;
+}
+
+// Answer an MGetPoolStats request from the live digest.  The request is
+// silently dropped (still returning true) when there is no session, the
+// session lacks read capability on "pg", or the fsid does not match.  Pools
+// that are unknown by name or have no stats are omitted from the reply.
+bool MgrStatMonitor::preprocess_getpoolstats(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto req = op->get_req<MGetPoolStats>();
+  auto session = op->get_session();
+  if (!session) {
+    return true;
+  }
+
+  const bool may_read = session->is_capable("pg", MON_CAP_R);
+  if (!may_read) {
+    dout(0) << "MGetPoolStats received from entity with insufficient caps "
+            << session->caps << dendl;
+    return true;
+  }
+
+  if (req->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " on fsid "
+            << req->fsid << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  epoch_t ver = get_last_committed();
+  auto reply = new MGetPoolStatsReply(req->fsid, req->get_tid(), ver);
+  reply->per_pool = digest.use_per_pool_stats();
+
+  auto& osdmap = mon.osdmon()->osdmap;
+  for (const auto& pool_name : req->pools) {
+    const auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+    if (pool_id == -ENOENT) {
+      continue;
+    }
+    if (const auto pool_stat = get_pool_stat(pool_id)) {
+      reply->pool_stats[pool_name] = *pool_stat;
+    }
+  }
+
+  mon.send_reply(op, reply);
+  return true;
+}
+
+// Answer an MStatfs request from the live digest.  The request is dropped
+// (still returning true) when there is no session, the session lacks read
+// capability on "pg", the fsid does not match, or the request names a data
+// pool that is not in the current OSD map.
+bool MgrStatMonitor::preprocess_statfs(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto req = op->get_req<MStatfs>();
+  auto session = op->get_session();
+  if (!session) {
+    return true;
+  }
+
+  const bool may_read = session->is_capable("pg", MON_CAP_R);
+  if (!may_read) {
+    dout(0) << "MStatfs received from entity with insufficient privileges "
+            << session->caps << dendl;
+    return true;
+  }
+
+  if (req->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " on fsid " << req->fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  auto& osdmap = mon.osdmon()->osdmap;
+  const auto& pool = req->data_pool;
+  if (pool && !osdmap.have_pg_pool(*pool)) {
+    // There's no error field for MStatfsReply so just ignore the request.
+    // This is known to happen when a client is still accessing a removed fs.
+    dout(1) << __func__ << " on removed pool " << *pool << dendl;
+    return true;
+  }
+
+  dout(10) << __func__ << " " << *req
+           << " from " << req->get_orig_source() << dendl;
+  epoch_t ver = get_last_committed();
+  auto reply = new MStatfsReply(req->fsid, req->get_tid(), ver);
+  reply->h.st = get_statfs(osdmap, pool);
+  mon.send_reply(op, reply);
+  return true;
+}
+
+// Send the current service map to one subscriber if it has not yet seen this
+// epoch.  One-time subscriptions are removed afterwards; persistent ones are
+// advanced to wait for the next epoch.
+void MgrStatMonitor::check_sub(Subscription *sub)
+{
+  dout(10) << __func__
+           << " next " << sub->next
+           << " vs service_map.epoch " << service_map.epoch << dendl;
+
+  if (sub->next > service_map.epoch) {
+    return;  // nothing new for this subscriber
+  }
+
+  sub->session->con->send_message(new MServiceMap(service_map));
+
+  if (!sub->onetime) {
+    sub->next = service_map.epoch + 1;
+    return;
+  }
+  mon.with_session_map([sub](MonSessionMap& session_map) {
+    session_map.remove_sub(sub);
+  });
+}
+
+// Run check_sub() over every "servicemap" subscription.  Does nothing while
+// the service map epoch is still 0 or when nobody is subscribed.
+void MgrStatMonitor::check_subs()
+{
+  dout(10) << __func__ << dendl;
+  if (!service_map.epoch) {
+    return;
+  }
+
+  auto subs = mon.session_map.subs.find("servicemap");
+  if (subs == mon.session_map.subs.end()) {
+    return;
+  }
+
+  for (auto p = subs->second->begin(); !p.end(); ) {
+    auto sub = *p;
+    // step past the entry first: check_sub() removes one-time subscriptions
+    ++p;
+    check_sub(sub);
+  }
+}
diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h
new file mode 100644
index 000000000..7c31f2c13
--- /dev/null
+++ b/src/mon/MgrStatMonitor.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/Context.h"
+#include "PaxosService.h"
+#include "mon/PGMap.h"
+#include "mgr/ServiceMap.h"
+
+// Paxos service holding the statistics reported by the mgr: the PG map
+// digest, the service map and the progress events.  The "live" members mirror
+// the last committed version; the "pending" members collect what will be
+// written by the next encode_pending().
+class MgrStatMonitor : public PaxosService {
+  // live version
+  version_t version = 0;
+  PGMapDigest digest;
+  ServiceMap service_map;
+  std::map<std::string,ProgressEvent> progress_events;
+
+  // pending commit
+  PGMapDigest pending_digest;
+  health_check_map_t pending_health_checks;
+  std::map<std::string,ProgressEvent> pending_progress_events;
+  // service map is kept pre-encoded; encode_pending() appends it verbatim
+  ceph::buffer::list pending_service_map_bl;
+
+public:
+  MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name);
+  ~MgrStatMonitor() override;
+
+  void init() override {}
+  void on_shutdown() override {}
+
+  void create_initial() override;
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  version_t get_trim_to() const override;
+
+  bool definitely_converted_snapsets() const {
+    return digest.definitely_converted_snapsets();
+  }
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  // no full-state encoding for this service
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+  bool preprocess_report(MonOpRequestRef op);
+  bool prepare_report(MonOpRequestRef op);
+
+  bool preprocess_getpoolstats(MonOpRequestRef op);
+  bool preprocess_statfs(MonOpRequestRef op);
+
+  void check_sub(Subscription *sub);
+  void check_subs();
+  // NOTE(review): no definition of send_digests() exists in
+  // MgrStatMonitor.cc; looks like an unused leftover declaration -- confirm
+  // before relying on it.
+  void send_digests();
+
+  void on_active() override;
+  void tick() override;
+
+  uint64_t get_last_osd_stat_seq(int osd) {
+    return digest.get_last_osd_stat_seq(osd);
+  }
+
+  void update_logger();
+
+  // Read-only views of the live (last committed) state.
+  const ServiceMap& get_service_map() const {
+    return service_map;
+  }
+
+  const std::map<std::string,ProgressEvent>& get_progress_events() const {
+    return progress_events;
+  }
+
+  // pg stat access
+  // Returns the stats for poolid, or nullptr if the digest has none.
+  const pool_stat_t* get_pool_stat(int64_t poolid) const {
+    auto i = digest.pg_pool_sum.find(poolid);
+    if (i != digest.pg_pool_sum.end()) {
+      return &i->second;
+    }
+    return nullptr;
+  }
+
+  const PGMapDigest& get_digest() const {
+    return digest;
+  }
+
+  // The remaining helpers forward directly to the live digest.
+  ceph_statfs get_statfs(OSDMap& osdmap,
+			 boost::optional<int64_t> data_pool) const {
+    return digest.get_statfs(osdmap, data_pool);
+  }
+
+  void print_summary(ceph::Formatter *f, std::ostream *out) const {
+    digest.print_summary(f, out);
+  }
+  void dump_info(ceph::Formatter *f) const {
+    digest.dump(f);
+    f->dump_object("servicemap", get_service_map());
+  }
+  void dump_cluster_stats(std::stringstream *ss,
+			  ceph::Formatter *f,
+			  bool verbose) const {
+    digest.dump_cluster_stats(ss, f, verbose);
+  }
+  void dump_pool_stats(const OSDMap& osdm, std::stringstream *ss, ceph::Formatter *f,
+		       bool verbose) const {
+    digest.dump_pool_stats_full(osdm, ss, f, verbose);
+  }
+};
diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc
new file mode 100644
index 000000000..e1dc37239
--- /dev/null
+++ b/src/mon/MonCap.cc
@@ -0,0 +1,679 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/spirit/include/qi_uint.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+#include <boost/fusion/adapted/struct/adapt_struct.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "MonCap.h"
+#include "include/stringify.h"
+#include "include/ipaddr.h"
+#include "common/debug.h"
+#include "common/Formatter.h"
+
+#include <algorithm>
+#include <regex>
+
+#include "include/ceph_assert.h"
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// True when c is anything other than an ASCII-style letter, a digit, '-' or
+// '_'.  Despite the name, a space is NOT in the accepted set, so strings
+// containing spaces are treated as needing quotes by maybe_quote_string().
+static inline bool is_not_alnum_space(char c)
+{
+  // the <cctype> classifiers require a value representable as unsigned char;
+  // passing a negative plain char is undefined behaviour
+  const unsigned char uc = static_cast<unsigned char>(c);
+  return !(isalpha(uc) || isdigit(uc) || (c == '-') || (c == '_'));
+}
+
+// Return str unchanged when every character passes the is_not_alnum_space()
+// check (letters, digits, '-', '_'); otherwise return it wrapped in double
+// quotes.  Embedded quotes are not escaped.
+static std::string maybe_quote_string(const std::string& str)
+{
+  bool needs_quotes = false;
+  for (const char ch : str) {
+    if (is_not_alnum_space(ch)) {
+      needs_quotes = true;
+      break;
+    }
+  }
+  if (!needs_quotes) {
+    return str;
+  }
+
+  std::string quoted("\"");
+  quoted += str;
+  quoted += "\"";
+  return quoted;
+}
+
+#define dout_subsys ceph_subsys_mon
+
+// Print a permission mask: "*" for MON_CAP_ANY, otherwise the letters r, w
+// and x for whichever of those bits are set (nothing if none are).
+ostream& operator<<(ostream& out, const mon_rwxa_t& p)
+{
+  if (p == MON_CAP_ANY) {
+    out << "*";
+    return out;
+  }
+
+  if (p & MON_CAP_R) {
+    out << "r";
+  }
+  if (p & MON_CAP_W) {
+    out << "w";
+  }
+  if (p & MON_CAP_X) {
+    out << "x";
+  }
+  return out;
+}
+
+// Print a constraint as "<kind> <value>", where kind is "value", "prefix" or
+// "regex"; an unrecognised match type prints nothing.
+ostream& operator<<(ostream& out, const StringConstraint& c)
+{
+  const char *kind = nullptr;
+  switch (c.match_type) {
+  case StringConstraint::MATCH_TYPE_EQUAL:
+    kind = "value ";
+    break;
+  case StringConstraint::MATCH_TYPE_PREFIX:
+    kind = "prefix ";
+    break;
+  case StringConstraint::MATCH_TYPE_REGEX:
+    kind = "regex ";
+    break;
+  default:
+    return out;
+  }
+  return out << kind << c.value;
+}
+
+// Print a grant in cap-string form: "allow" followed by whichever of
+// service / command (with its argument constraints) / profile / permission
+// mask / network are set.  Names and values are quoted when they contain
+// characters outside [A-Za-z0-9_-].  fs_name is not printed.
+ostream& operator<<(ostream& out, const MonCapGrant& m)
+{
+  out << "allow";
+
+  if (m.service.length()) {
+    out << " service " << maybe_quote_string(m.service);
+  }
+
+  if (m.command.length()) {
+    out << " command " << maybe_quote_string(m.command);
+    if (!m.command_args.empty()) {
+      out << " with";
+      for (const auto& arg : m.command_args) {
+        // text placed between the argument name and its constraint value
+        const char *joiner = nullptr;
+        switch (arg.second.match_type) {
+        case StringConstraint::MATCH_TYPE_EQUAL:
+          joiner = "=";
+          break;
+        case StringConstraint::MATCH_TYPE_PREFIX:
+          joiner = " prefix ";
+          break;
+        case StringConstraint::MATCH_TYPE_REGEX:
+          joiner = " regex ";
+          break;
+        default:
+          break;
+        }
+        if (!joiner) {
+          continue;  // unknown match type: print nothing for this argument
+        }
+        out << " " << maybe_quote_string(arg.first) << joiner
+            << maybe_quote_string(arg.second.value);
+      }
+    }
+  }
+
+  if (m.profile.length()) {
+    out << " profile " << maybe_quote_string(m.profile);
+  }
+  if (m.allow != 0) {
+    out << " " << m.allow;
+  }
+  if (m.network.size()) {
+    out << " network " << m.network;
+  }
+  return out;
+}
+
+
+// <magic>
+//  fusion lets us easily populate structs via the qi parser.
+
+// Alias without a comma in its spelling, so the map type can be passed as a
+// single macro argument below.
+using kvmap = map<string,StringConstraint>;
+
+BOOST_FUSION_ADAPT_STRUCT(MonCapGrant,
+			  (std::string, service)
+			  (std::string, profile)
+			  (std::string, command)
+			  (kvmap, command_args)
+			  (mon_rwxa_t, allow)
+			  (std::string, network)
+                          (std::string, fs_name))
+
+BOOST_FUSION_ADAPT_STRUCT(StringConstraint,
+                          (StringConstraint::MatchType, match_type)
+			  (std::string, value))
+
+// </magic>
+
+// Parse the textual `network` member into network_parsed / network_prefix
+// using the global ::parse_network() helper, and record in network_valid
+// whether that succeeded.
+void MonCapGrant::parse_network()
+{
+  const char *spec = network.c_str();
+  network_valid = ::parse_network(spec, &network_parsed, &network_prefix);
+}
+
+// Expand this grant's `profile` name into the list of concrete grants it
+// stands for, stored in profile_grants (which this const method fills in as
+// a cache).  `name` is only used to build the per-daemon
+// "daemon-private/<name>/" config-key prefix for the osd/mds/mon/mgr
+// profiles.  The expansion is skipped when profile_grants is already
+// non-empty; a profile name that matches nothing leaves the list empty and
+// is therefore re-checked on every call.
+void MonCapGrant::expand_profile(const EntityName& name) const
+{
+  // only generate this list once
+  if (!profile_grants.empty())
+    return;
+
+  if (profile == "read-only") {
+    // grants READ-ONLY caps monitor-wide
+    // 'auth' requires MON_CAP_X even for RO, which we do not grant here.
+    profile_grants.push_back(mon_rwxa_t(MON_CAP_R));
+    return;
+  }
+
+  if (profile == "read-write") {
+    // grants READ-WRITE caps monitor-wide
+    // 'auth' requires MON_CAP_X for all operations, which we do not grant.
+    profile_grants.push_back(mon_rwxa_t(MON_CAP_R | MON_CAP_W));
+    return;
+  }
+
+  if (profile == "mon") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_ALL));
+    profile_grants.push_back(MonCapGrant("log", MON_CAP_ALL));
+  }
+  if (profile == "osd") {
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_ALL));
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("log", MON_CAP_W));
+    StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX,
+                                string("osd_mclock_max_capacity_iops_(hdd|ssd)"));
+    profile_grants.push_back(MonCapGrant("config set", "name", constraint));
+  }
+  if (profile == "mds") {
+    profile_grants.push_back(MonCapGrant("mds", MON_CAP_ALL));
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    // This command grant is checked explicitly in MRemoveSnaps handling
+    profile_grants.push_back(MonCapGrant("osd pool rmsnap"));
+    profile_grants.push_back(MonCapGrant("osd blocklist"));
+    profile_grants.push_back(MonCapGrant("osd blacklist")); // for compat
+    profile_grants.push_back(MonCapGrant("log", MON_CAP_W));
+  }
+  if (profile == "mgr") {
+    profile_grants.push_back(MonCapGrant("mgr", MON_CAP_ALL));
+    profile_grants.push_back(MonCapGrant("log", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("mds", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("fs", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("auth", MON_CAP_R | MON_CAP_W | MON_CAP_X));
+    profile_grants.push_back(MonCapGrant("config-key", MON_CAP_R | MON_CAP_W));
+    profile_grants.push_back(MonCapGrant("config", MON_CAP_R | MON_CAP_W));
+    // cephadm orchestrator provisions new daemon keys and updates caps
+    profile_grants.push_back(MonCapGrant("auth get-or-create"));
+    profile_grants.push_back(MonCapGrant("auth caps"));
+    profile_grants.push_back(MonCapGrant("auth rm"));
+    // tell commands (this is a bit of a kludge)
+    profile_grants.push_back(MonCapGrant("smart"));
+  }
+  if (profile == "osd" || profile == "mds" || profile == "mon" ||
+      profile == "mgr") {
+    // every daemon may manage config-keys under its own private prefix
+    StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                string("daemon-private/") + stringify(name) +
+                                string("/"));
+    profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+    profile_grants.push_back(MonCapGrant("config-key put", "key", constraint));
+    profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+    profile_grants.push_back(MonCapGrant("config-key exists", "key", constraint));
+    profile_grants.push_back(MonCapGrant("config-key delete", "key", constraint));
+  }
+  if (profile == "bootstrap-osd") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));  // read monmap
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));  // read osdmap
+    profile_grants.push_back(MonCapGrant("mon getmap"));
+    profile_grants.push_back(MonCapGrant("osd new"));
+    profile_grants.push_back(MonCapGrant("osd purge-new"));
+  }
+  if (profile == "bootstrap-mds") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));  // read monmap
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));  // read osdmap
+    profile_grants.push_back(MonCapGrant("mon getmap"));
+    profile_grants.push_back(MonCapGrant("auth get-or-create"));  // FIXME: this can expose other mds keys
+    profile_grants.back().command_args["entity"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_PREFIX, "mds.");
+    profile_grants.back().command_args["caps_mon"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow profile mds");
+    profile_grants.back().command_args["caps_osd"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
+    profile_grants.back().command_args["caps_mds"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow");
+  }
+  if (profile == "bootstrap-mgr") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));  // read monmap
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));  // read osdmap
+    profile_grants.push_back(MonCapGrant("mon getmap"));
+    profile_grants.push_back(MonCapGrant("auth get-or-create"));  // FIXME: this can expose other mgr keys
+    profile_grants.back().command_args["entity"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_PREFIX, "mgr.");
+    profile_grants.back().command_args["caps_mon"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow profile mgr");
+  }
+  if (profile == "bootstrap-rgw") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));  // read monmap
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));  // read osdmap
+    profile_grants.push_back(MonCapGrant("mon getmap"));
+    profile_grants.push_back(MonCapGrant("auth get-or-create"));  // FIXME: this can expose other rgw keys
+    profile_grants.back().command_args["entity"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_PREFIX, "client.rgw.");
+    profile_grants.back().command_args["caps_mon"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow rw");
+    profile_grants.back().command_args["caps_osd"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "allow rwx");
+  }
+  if (profile == "bootstrap-rbd" || profile == "bootstrap-rbd-mirror") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));  // read monmap
+    profile_grants.push_back(MonCapGrant("auth get-or-create"));  // FIXME: this can expose other rbd keys
+    profile_grants.back().command_args["entity"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_PREFIX, "client.");
+    profile_grants.back().command_args["caps_mon"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL,
+      (profile == "bootstrap-rbd-mirror" ? "profile rbd-mirror" :
+                                           "profile rbd"));
+    profile_grants.back().command_args["caps_osd"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_REGEX,
+      "^([ ,]*profile(=|[ ]+)['\"]?rbd[^ ,'\"]*['\"]?([ ]+pool(=|[ ]+)['\"]?[^,'\"]+['\"]?)?)+$");
+  }
+  if (profile == "fs-client") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("mds", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+  }
+  if (profile == "simple-rados-client") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+  }
+  if (profile == "simple-rados-client-with-blocklist") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd blocklist"));
+    profile_grants.back().command_args["blocklistop"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "add");
+    profile_grants.back().command_args["addr"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+  }
+  // matches "rbd" and every profile whose name begins with it
+  // (e.g. "rbd-mirror", "rbd-mirror-peer", which add more grants below)
+  if (boost::starts_with(profile, "rbd")) {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+
+    // exclusive lock dead-client blocklisting (IP+nonce required)
+    profile_grants.push_back(MonCapGrant("osd blocklist"));
+    profile_grants.back().command_args["blocklistop"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "add");
+    profile_grants.back().command_args["addr"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+    // for compat,
+    profile_grants.push_back(MonCapGrant("osd blacklist"));
+    profile_grants.back().command_args["blacklistop"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_EQUAL, "add");
+    profile_grants.back().command_args["addr"] = StringConstraint(
+      StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$");
+
+  }
+  if (profile == "rbd-mirror") {
+    StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                "rbd/mirror/");
+    profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+  } else if (profile == "rbd-mirror-peer") {
+    StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX,
+                                "rbd/mirror/[^/]+");
+    profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+
+    constraint = StringConstraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                  "rbd/mirror/peer/");
+    profile_grants.push_back(MonCapGrant("config-key set", "key", constraint));
+  }
+  else if (profile == "crash") {
+    // TODO: we could limit this to getting the monmap and mgrmap...
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+  }
+  if (profile == "cephfs-mirror") {
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("mds", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("pg", MON_CAP_R));
+    StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX,
+                                "cephfs/mirror/peer/");
+    profile_grants.push_back(MonCapGrant("config-key get", "key", constraint));
+
+  }
+  if (profile == "role-definer") {
+    // grants ALL caps to the auth subsystem, read-only on the
+    // monitor subsystem and nothing else.
+    profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
+    profile_grants.push_back(MonCapGrant("auth", MON_CAP_ALL));
+  }
+}
+
+// Compute the permission bits this single grant gives for service `s` or
+// command `c` (with arguments `c_args`) to entity `name`.
+//  - profile grant: union of what each expanded sub-grant allows
+//  - service grant: `allow` if the service matches, else nothing
+//  - command grant: MON_CAP_ALL if the command matches and every constrained
+//    argument is present and satisfies its constraint, else nothing
+//  - blanket grant: `allow`, except that "config-key" is never covered
+mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct,
+				    EntityName name,
+				    const std::string& s, const std::string& c,
+				    const map<string,string>& c_args) const
+{
+  if (profile.length()) {
+    expand_profile(name);
+    mon_rwxa_t granted;
+    for (const auto& sub_grant : profile_grants) {
+      granted = granted | sub_grant.get_allowed(cct, name, s, c, c_args);
+    }
+    return granted;
+  }
+
+  if (service.length()) {
+    if (service == s)
+      return allow;
+    return 0;
+  }
+
+  if (command.length()) {
+    if (command != c)
+      return 0;
+    for (const auto& constrained : command_args) {
+      const StringConstraint& constraint = constrained.second;
+      const auto given = c_args.find(constrained.first);
+      // argument must be present if a constraint exists
+      if (given == c_args.end())
+        return 0;
+      const std::string& actual = given->second;
+
+      switch (constraint.match_type) {
+      case StringConstraint::MATCH_TYPE_EQUAL:
+        if (constraint.value != actual)
+          return 0;
+        break;
+      case StringConstraint::MATCH_TYPE_PREFIX:
+        // the supplied value has to start with the constraint value
+        if (actual.compare(0, constraint.value.size(), constraint.value) != 0)
+          return 0;
+        break;
+      case StringConstraint::MATCH_TYPE_REGEX:
+        try {
+          std::regex pattern(constraint.value, std::regex::extended);
+          if (!std::regex_match(actual, pattern))
+            return 0;
+        } catch (const std::regex_error&) {
+          // an invalid pattern never matches
+          return 0;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+    return MON_CAP_ALL;
+  }
+
+  // we don't allow config-key service to be accessed with blanket caps other
+  // than '*' (i.e., 'any'), and that should have been checked by the caller
+  // via 'is_allow_all()'.
+  if (s == "config-key") {
+    return 0;
+  }
+  return allow;
+}
+
+// Print all grants of a cap, separated by ", ".
+ostream& operator<<(ostream&out, const MonCap& m)
+{
+  bool first = true;
+  for (const auto& grant : m.grants) {
+    if (!first) {
+      out << ", ";
+    }
+    first = false;
+    out << grant;
+  }
+  return out;
+}
+
+// True if at least one grant is an allow-all grant.
+bool MonCap::is_allow_all() const
+{
+  for (const auto& grant : grants) {
+    if (grant.is_allow_all()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Replace every existing grant with a single MON_CAP_ANY grant and set the
+// textual form to match.
+void MonCap::set_allow_all()
+{
+  text = "allow *";
+
+  grants.clear();
+  grants.push_back(MonCapGrant(MON_CAP_ANY));
+}
+
+// Decide whether this cap permits the requested operation.  Grants are
+// visited in order and their permission bits accumulated; the answer is true
+// as soon as an applicable allow-all grant is seen or the accumulated bits
+// cover every requested access type (read / write / exec).  Grants bound to a
+// network are skipped unless that network parsed correctly and contains
+// `addr`.  Logging only happens when `cct` is non-null.
+bool MonCap::is_capable(
+  CephContext *cct,
+  EntityName name,
+  const string& service,
+  const string& command, const map<string,string>& command_args,
+  bool op_may_read, bool op_may_write, bool op_may_exec,
+  const entity_addr_t& addr) const
+{
+  if (cct) {
+    ldout(cct, 20) << "is_capable service=" << service << " command=" << command
+                   << (op_may_read ? " read":"")
+                   << (op_may_write ? " write":"")
+                   << (op_may_exec ? " exec":"")
+                   << " addr " << addr
+                   << " on cap " << *this
+                   << dendl;
+  }
+
+  mon_rwxa_t granted = 0;
+  for (const auto& grant : grants) {
+    if (cct) {
+      ldout(cct, 20) << " allow so far " << granted << ", doing grant " << grant
+                     << dendl;
+    }
+
+    // a network-restricted grant only applies to peers inside that network
+    if (grant.network.size()) {
+      const bool in_network =
+        grant.network_valid &&
+        network_contains(grant.network_parsed, grant.network_prefix, addr);
+      if (!in_network) {
+        continue;
+      }
+    }
+
+    if (grant.is_allow_all()) {
+      if (cct) {
+        ldout(cct, 20) << " allow all" << dendl;
+      }
+      return true;
+    }
+
+    // check enumerated caps
+    granted = granted | grant.get_allowed(cct, name, service, command,
+                                          command_args);
+    const bool read_ok = !op_may_read || (granted & MON_CAP_R);
+    const bool write_ok = !op_may_write || (granted & MON_CAP_W);
+    const bool exec_ok = !op_may_exec || (granted & MON_CAP_X);
+    if (read_ok && write_ok && exec_ok) {
+      if (cct) {
+        ldout(cct, 20) << " match" << dendl;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+// Serialize the cap: only its textual form (text) is written to bl.
+void MonCap::encode(bufferlist& bl) const
+{
+  ENCODE_START(4, 4, bl);   // legacy MonCaps was 3, 3
+  encode(text, bl);
+  ENCODE_FINISH(bl);
+}
+// Read the textual form from bl and rebuild the grants by parsing it.
+void MonCap::decode(bufferlist::const_iterator& bl)
+{
+  std::string s;
+  DECODE_START(4, bl);
+  decode(s, bl);
+  DECODE_FINISH(bl);
+  parse(s, NULL);  // result is ignored; a failed parse leaves grants empty
+}
+// Dump the cap to the formatter as a single string field named "text".
+void MonCap::dump(Formatter *f) const
+{
+  f->dump_string("text", text);
+}
+
+void MonCap::generate_test_instances(list<MonCap*>& ls)
+{
+  ls.push_back(new MonCap);  // first: a default-constructed, unparsed cap
+  // every further instance is parsed from one of these cap strings, in order
+  static const char *const samples[] = {
+    "allow *",
+    "allow rwx",
+    "allow service foo x",
+    "allow command bar x",
+    "allow service foo r, allow command bar x",
+    "allow command bar with k1=v1 x",
+    "allow command bar with k1=v1 k2=v2 x",
+  };
+  for (const char *sample : samples) {
+    ls.push_back(new MonCap);
+    ls.back()->parse(sample);
+  }
+}
+
+// grammar: boost::spirit::qi parser for the textual form of a mon cap
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+// Parses a whole cap string into a MonCap; the start rule is 'moncap'.
+template <typename Iterator>
+struct MonCapParser : qi::grammar<Iterator, MonCap()>
+{
+  MonCapParser() : MonCapParser::base_type(moncap)
+  {
+    using qi::char_;
+    using qi::int_;
+    using qi::ulong_long;
+    using qi::lexeme;
+    using qi::alnum;
+    using qi::_val;
+    using qi::_1;
+    using qi::_2;
+    using qi::_3;
+    using qi::eps;
+    using qi::lit;
+    // str := "double quoted" | 'single quoted' | unquoted word
+    quoted_string %=
+      lexeme['"' >> +(char_ - '"') >> '"'] | 
+      lexeme['\'' >> +(char_ - '\'') >> '\''];
+    unquoted_word %= +char_("a-zA-Z0-9_./-");
+    str %= quoted_string | unquoted_word;
+    network_str %= +char_("/.:a-fA-F0-9][");
+    fs_name_str %= +char_("a-zA-Z0-9_.-");
+
+    spaces = +(lit(' ') | lit('\n') | lit('\t'));
+
+    // command := allow command[=]cmd [with kv ...] [network net]; kv := k=v | k prefix v | k regex v
+    str_match = '=' >> qi::attr(StringConstraint::MATCH_TYPE_EQUAL) >> str;
+    str_prefix = spaces >> lit("prefix") >> spaces >>
+                 qi::attr(StringConstraint::MATCH_TYPE_PREFIX) >> str;
+    str_regex = spaces >> lit("regex") >> spaces >>
+                 qi::attr(StringConstraint::MATCH_TYPE_REGEX) >> str;
+    kv_pair = str >> (str_match | str_prefix | str_regex);
+    kv_map %= kv_pair >> *(spaces >> kv_pair);
+    command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces)
+			    >> qi::attr(string()) >> qi::attr(string())
+			    >> str
+			    >> -(spaces >> lit("with") >> spaces >> kv_map)
+			    >> qi::attr(0)
+			    >> -(spaces >> lit("network") >> spaces >> network_str);
+
+    // service := allow service[=]foo rwxa [network net]
+    service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces)
+			     >> str >> qi::attr(string()) >> qi::attr(string())
+			     >> qi::attr(map<string,StringConstraint>())
+                             >> spaces >> rwxa
+			     >> -(spaces >> lit("network") >> spaces >> network_str);
+
+    // profile := [allow] profile[=]foo [network net]
+    profile_match %= -spaces >> -(lit("allow") >> spaces)
+                             >> lit("profile") >> (lit('=') | spaces)
+			     >> qi::attr(string())
+			     >> str
+			     >> qi::attr(string())
+			     >> qi::attr(map<string,StringConstraint>())
+			     >> qi::attr(0)
+			     >> -(spaces >> lit("network") >> spaces >> network_str);
+
+    // blanket := allow rwxa [network net] [fsname[=]name]
+    rwxa_match %= -spaces >> lit("allow") >> spaces
+			  >> qi::attr(string()) >> qi::attr(string()) >> qi::attr(string())
+			  >> qi::attr(map<string,StringConstraint>())
+			  >> rwxa
+			  >> -(spaces >> lit("network") >> spaces >> network_str)
+			  >> -(spaces >> lit("fsname") >> (lit('=') | spaces) >> fs_name_str);
+
+    // rwxa := * | all | [r][w][x]
+    rwxa =
+      (lit("*")[_val = MON_CAP_ANY]) |
+      (lit("all")[_val = MON_CAP_ANY]) |
+      ( eps[_val = 0] >>
+	( lit('r')[_val |= MON_CAP_R] ||
+	  lit('w')[_val |= MON_CAP_W] ||
+	  lit('x')[_val |= MON_CAP_X]
+	  )
+	);
+
+    // grant := blanket | profile | service | command, tried in that order
+    grant = -spaces >> (rwxa_match | profile_match | service_match | command_match) >> -spaces;
+
+    // moncap := grant [(';' | ',') grant ...]
+    grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' ')));
+    moncap = grants  [_val = phoenix::construct<MonCap>(_1)]; 
+
+  }
+  qi::rule<Iterator> spaces;
+  qi::rule<Iterator, unsigned()> rwxa;
+  qi::rule<Iterator, string()> quoted_string;
+  qi::rule<Iterator, string()> unquoted_word;
+  qi::rule<Iterator, string()> str, network_str;
+  qi::rule<Iterator, string()> fs_name_str;
+
+  qi::rule<Iterator, StringConstraint()> str_match, str_prefix, str_regex;
+  qi::rule<Iterator, pair<string, StringConstraint>()> kv_pair;
+  qi::rule<Iterator, map<string, StringConstraint>()> kv_map;
+
+  qi::rule<Iterator, MonCapGrant()> rwxa_match;
+  qi::rule<Iterator, MonCapGrant()> command_match;
+  qi::rule<Iterator, MonCapGrant()> service_match;
+  qi::rule<Iterator, MonCapGrant()> profile_match;
+  qi::rule<Iterator, MonCapGrant()> grant;
+  qi::rule<Iterator, std::vector<MonCapGrant>()> grants;
+  qi::rule<Iterator, MonCap()> moncap;
+};
+
+bool MonCap::parse(const string& str, ostream *err)
+{
+  string::const_iterator pos = str.begin();
+  const string::const_iterator last = str.end();
+
+  // success needs both a grammar match and full consumption of the input
+  MonCapParser<string::const_iterator> grammar;
+  const bool matched = qi::parse(pos, last, grammar, *this);
+  if (!matched || pos != last) {
+    // Make sure no grants are kept after parsing failed!
+    grants.clear();
+    if (!err)
+      return false;
+    if (pos == last) {
+      *err << "mon capability parse failed, stopped at end of '" << str << "'";
+    } else {
+      *err << "mon capability parse failed, stopped at '"
+	   << std::string(pos, last)
+	   << "' of '" << str << "'";
+    }
+    return false;
+  }
+
+  // remember the source text and resolve each grant's network restriction
+  text = str;
+  for (auto& grant : grants)
+    grant.parse_network();
+  return true;
+}
+
diff --git a/src/mon/MonCap.h b/src/mon/MonCap.h
new file mode 100644
index 000000000..ab4e35bc9
--- /dev/null
+++ b/src/mon/MonCap.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MONCAP_H
+#define CEPH_MONCAP_H
+
+#include <ostream>
+
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/entity_name.h"
+#include "mds/mdstypes.h"
+
+static const __u8 MON_CAP_R     = (1 << 1);      // read
+static const __u8 MON_CAP_W     = (1 << 2);      // write
+static const __u8 MON_CAP_X     = (1 << 3);      // execute
+static const __u8 MON_CAP_ALL   = MON_CAP_R | MON_CAP_W | MON_CAP_X;
+static const __u8 MON_CAP_ANY   = 0xff;          // *
+
+// Set of MON_CAP_* permission bits, stored in and convertible to a __u8.
+struct mon_rwxa_t {
+  __u8 val;  // the raw permission bits
+
+  // Left implicit so plain __u8 masks can be used where a mon_rwxa_t is taken.
+  // cppcheck-suppress noExplicitConstructor
+  mon_rwxa_t(__u8 v = 0) : val(v) {}
+  // Replace the stored bits with v.
+  mon_rwxa_t& operator=(__u8 v) { val = v; return *this; }
+
+  // Read the stored bits back as a plain __u8.
+  operator __u8() const { return val; }
+};
+
+std::ostream& operator<<(std::ostream& out, const mon_rwxa_t& p);
+
+// How a command argument's value is compared against a constraint value.
+struct StringConstraint {
+  enum MatchType {
+    MATCH_TYPE_NONE,    // no constraint set (default)
+    MATCH_TYPE_EQUAL,   // argument must equal value
+    MATCH_TYPE_PREFIX,  // argument must start with value
+    MATCH_TYPE_REGEX    // argument must match value as a regex
+  };
+
+  MatchType match_type = MATCH_TYPE_NONE;
+  std::string value;  // text the argument is compared against
+  StringConstraint() {}
+  // value is a sink parameter: moved into place instead of copied again
+  StringConstraint(MatchType match_type, std::string value)
+    : match_type(match_type), value(std::move(value)) {}
+};
+
+std::ostream& operator<<(std::ostream& out, const StringConstraint& c);
+
+struct MonCapGrant {
+  /*
+   * A grant can come in one of five forms:
+   *
+   *  - a blanket allow ('allow rw', 'allow *')
+   *    - this will match against any service and the read/write/exec flags
+   *      in the mon code.  semantics of what X means are somewhat ad hoc.
+   *
+   *  - a service allow ('allow service mds rw')
+   *    - this will match against a specific service and the r/w/x flags.
+   *
+   *  - a profile ('allow profile osd')
+   *    - this will match against specific monitor-enforced semantics of what
+   *      this type of user should need to do.  examples include 'osd', 'mds',
+   *      'bootstrap-osd'.
+   *
+   *  - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2')
+   *      this includes the command name (the prefix string), and a set
+   *      of key/value pairs that constrain use of that command.  if no pairs
+   *      are specified, any arguments are allowed; if a pair is specified, that
+   *      argument must be present and equal, prefix-match or regex-match.
+   *
+   *  - an fs name ('allow rw fsname=foo')
+   *    - this will restrict access to MDSMaps in the FSMap to the provided
+   *      fs name.
+   */
+  std::string service;
+  std::string profile;
+  std::string command;
+  std::map<std::string, StringConstraint> command_args;
+  std::string fs_name;
+
+  // restrict by network
+  std::string network;
+
+  // these are filled in by parse_network(), called by MonCap::parse()
+  entity_addr_t network_parsed;
+  unsigned network_prefix = 0;
+  bool network_valid = true;
+
+  void parse_network();
+
+  mon_rwxa_t allow;
+
+  // explicit grants that a profile grant expands to; populated as
+  // needed by expand_profile() and cached here.
+  mutable std::list<MonCapGrant> profile_grants;
+
+  void expand_profile(const EntityName& name) const;
+
+  MonCapGrant() : allow(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  MonCapGrant(mon_rwxa_t a) : allow(a) {}
+  MonCapGrant(std::string s, mon_rwxa_t a) : service(std::move(s)), allow(a) {}
+  // cppcheck-suppress noExplicitConstructor
+  MonCapGrant(std::string c) : command(std::move(c)) {}
+  MonCapGrant(std::string c, std::string a, StringConstraint co) : command(std::move(c)) {
+    command_args[std::move(a)] = std::move(co);  // by-value params are sinks
+  }
+  MonCapGrant(mon_rwxa_t a, std::string fsname) : fs_name(std::move(fsname)), allow(a) {}
+
+  /**
+   * check if given request parameters match our constraints
+   *
+   * @param cct context
+   * @param name entity name
+   * @param service service (if any)
+   * @param command command (if any)
+   * @param command_args command args (if any)
+   * @return bits we allow
+   */
+  mon_rwxa_t get_allowed(CephContext *cct,
+			 EntityName name,
+			 const std::string& service,
+			 const std::string& command,
+			 const std::map<std::string, std::string>& command_args) const;
+
+  // true only for an unqualified '*' grant: no service/profile/command/fs_name
+  bool is_allow_all() const {
+    return
+      allow == MON_CAP_ANY &&
+      service.length() == 0 &&
+      profile.length() == 0 &&
+      command.length() == 0 &&
+      fs_name.empty();
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, const MonCapGrant& g);
+
+struct MonCap {
+  std::string text;                 // textual form of the cap
+  std::vector<MonCapGrant> grants;  // grants the cap consists of
+
+  MonCap() {}
+  explicit MonCap(const std::vector<MonCapGrant> &g) : grants(g) {}
+
+  std::string get_str() const {
+    return text;
+  }
+
+  bool is_allow_all() const;
+  void set_allow_all();
+  bool parse(const std::string& str, std::ostream *err=NULL);
+
+  /**
+   * check if we are capable of something
+   *
+   * Checks a description of a particular operation against what the
+   * capability has specified.
+   * @param cct context (may be null; only used for debug output)
+   * @param name entity name, passed on to each grant's get_allowed()
+   * @param service service name
+   * @param command command id
+   * @param command_args arguments supplied with the command
+   * @param op_may_read / op_may_write / op_may_exec access the operation may need
+   * @param addr peer address, checked against grants restricted by network
+   * @return true if the operation is allowed, false otherwise
+   */
+  bool is_capable(CephContext *cct,
+		  EntityName name,
+		  const std::string& service,
+		  const std::string& command,
+		  const std::map<std::string, std::string>& command_args,
+		  bool op_may_read, bool op_may_write, bool op_may_exec,
+		  const entity_addr_t& addr) const;
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<MonCap*>& ls);
+  // fs names the grants are limited to; empty if any grant has no fs_name
+  std::vector<std::string> allowed_fs_names() const {
+    std::vector<std::string> ret;
+    for (auto& g : grants) {
+      if (not g.fs_name.empty()) {
+	ret.push_back(g.fs_name);
+      } else {
+	return {};
+      }
+    }
+    return ret;
+  }
+  // true if a grant, or a grant its profile expands to, allows mask on fs_name
+  bool fs_name_capable(const EntityName& ename, std::string_view fs_name,
+		       __u8 mask) {
+    for (auto& g : grants) {
+      if (g.is_allow_all()) {
+	return true;
+      }
+
+      if ((g.fs_name.empty() || g.fs_name == fs_name) && (mask & g.allow)) {
+	return true;
+      }
+
+      g.expand_profile(ename);
+      for (auto& pg : g.profile_grants) {
+	if ((pg.service == "fs" || pg.service == "mds") &&
+	    (pg.fs_name.empty() || pg.fs_name == fs_name) &&
+	    (pg.allow & mask)) {
+	  return true;
+	}
+      }
+    }
+
+    return false;
+  }
+
+};
+WRITE_CLASS_ENCODER(MonCap)
+
+std::ostream& operator<<(std::ostream& out, const MonCap& cap);
+
+#endif
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
new file mode 100644
index 000000000..9c637bf8a
--- /dev/null
+++ b/src/mon/MonClient.cc
@@ -0,0 +1,2025 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <algorithm>
+#include <iterator>
+#include <random>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm_ext/copy_n.hpp>
+#include "common/weighted_shuffle.h"
+
+#include "include/random.h"
+#include "include/scope_guard.h"
+#include "include/stringify.h"
+
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonMap.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+#include "messages/MAuth.h"
+#include "messages/MLogAck.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MPing.h"
+
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/LogClient.h"
+
+#include "MonClient.h"
+#include "error_code.h"
+#include "MonMap.h"
+
+#include "auth/Auth.h"
+#include "auth/KeyRing.h"
+#include "auth/AuthClientHandler.h"
+#include "auth/AuthRegistry.h"
+#include "auth/RotatingKeyRing.h"
+
+#define dout_subsys ceph_subsys_monc
+#undef dout_prefix
+#define dout_prefix *_dout << "monclient" << (_hunting() ? "(hunting)":"") << ": "
+
+namespace bs = boost::system;
+using std::string;
+using namespace std::literals;
+
+// Start with no messenger and no log client; a monmap is wanted right away.
+MonClient::MonClient(CephContext *cct_, boost::asio::io_context& service)
+  : Dispatcher(cct_),
+    AuthServer(cct_),
+    messenger(nullptr),
+    timer(cct_, monc_lock),
+    service(service),
+    initialized(false),
+    log_client(nullptr),
+    more_log_pending(false),
+    want_monmap(true),
+    had_a_connection(false),
+    reopen_interval_multiplier(
+      cct_->_conf.get_val<double>("mon_client_hunt_interval_min_multiple")),
+    last_mon_command_tid(0), version_req_id(0)
+{}
+
+// No explicit cleanup is done here: the destructor body is empty, so the
+// out-of-line definition is simply defaulted.
+MonClient::~MonClient() = default;
+// Build the initial monmap, log it at level 10 and return build_initial()'s result.
+int MonClient::build_initial_monmap()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  int r = monmap.build_initial(cct, false, std::cerr);
+  ldout(cct,10) << "monmap:\n";  // entry is left open so print() can append to it
+  monmap.print(*_dout);
+  *_dout << dendl;               // ends the log entry started two lines above
+  return r;
+}
+
+int MonClient::get_monmap()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  std::unique_lock locker(monc_lock);
+  // subscribe to the monmap and (re)open a session if none is open
+  sub.want("monmap", 0, 0);
+  if (!_opened()) _reopen_session();
+  // sleep on map_cond until want_monmap has been cleared
+  while (want_monmap) map_cond.wait(locker);
+  ldout(cct, 10) << __func__ << " done" << dendl;
+  return 0;
+}
+// Fetch the monmap and bootstrap config over a temporary messenger (up to 10 tries).
+int MonClient::get_monmap_and_config()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  ceph_assert(!messenger);  // a temporary messenger is created below
+
+  int tries = 10;  // upper bound on init/authenticate/fetch attempts
+
+  cct->init_crypto();
+  auto shutdown_crypto = make_scope_guard([this] {  // undone on every return path
+    cct->shutdown_crypto();
+  });
+
+  int r = build_initial_monmap();
+  if (r < 0) {
+    lderr(cct) << __func__ << " cannot identify monitors to contact" << dendl;
+    return r;
+  }
+
+  messenger = Messenger::create_client_messenger(
+    cct, "temp_mon_client");
+  ceph_assert(messenger);
+  messenger->add_dispatcher_head(this);
+  messenger->start();
+  auto shutdown_msgr = make_scope_guard([this] {  // tears the temporary messenger down
+    messenger->shutdown();
+    messenger->wait();
+    delete messenger;
+    messenger = nullptr;
+    if (!monmap.fsid.is_zero()) {
+      cct->_conf.set_val("fsid", stringify(monmap.fsid));
+    }
+  });
+
+  want_bootstrap_config = true;  // makes handle_config() stash the config for us
+  auto shutdown_config = make_scope_guard([this] {
+    std::unique_lock l(monc_lock);
+    want_bootstrap_config = false;
+    bootstrap_config.reset();
+  });
+
+  ceph::ref_t<MConfig> config;
+  while (tries-- > 0) {
+    r = init();
+    if (r < 0) {
+      return r;
+    }
+    r = authenticate(std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count());
+    if (r == -ETIMEDOUT) {  // timed out: shut down and try again
+      shutdown();
+      continue;
+    }
+    if (r < 0) {  // any other authentication error ends the retries
+      break;
+    }
+    {
+      std::unique_lock l(monc_lock);
+      if (monmap.get_epoch() &&
+	  !monmap.persistent_features.contains_all(
+	    ceph::features::mon::FEATURE_MIMIC)) {
+	ldout(cct,10) << __func__ << " pre-mimic monitor, no config to fetch"
+		      << dendl;
+	r = 0;
+	break;
+      }
+      while ((!bootstrap_config || monmap.get_epoch() == 0) && r == 0) {
+	ldout(cct,20) << __func__ << " waiting for monmap|config" << dendl;
+	auto status = map_cond.wait_for(l, ceph::make_timespan(
+	    cct->_conf->mon_client_hunt_interval));
+	if (status == std::cv_status::timeout) {  // stop waiting in this attempt
+	  r = -ETIMEDOUT;
+	}
+      }
+      // a config that did arrive counts as success even if the wait timed out
+      if (bootstrap_config) {
+	ldout(cct,10) << __func__ << " success" << dendl;
+	config = std::move(bootstrap_config);
+	r = 0;
+	break;
+      }
+    }
+    lderr(cct) << __func__ << " failed to get config" << dendl;  // no config this attempt
+    shutdown();
+    continue;
+  }
+
+  if (config) {
+    // apply the bootstrap config to ensure its applied prior to completing
+    // the bootstrap
+    cct->_conf.set_mon_vals(cct, config->config, config_cb);
+  }
+
+  shutdown();
+  return r;
+}
+
+
+/**
+ * Ping the monitor with id @p mon_id and set the resulting reply in
+ * the provided @p result_reply, if this last parameter is not NULL.
+ *
+ * So that we don't rely on the MonClient's default messenger, set up
+ * during connect(), we create our own messenger to communicate with the
+ * specified monitor.  This is advantageous in the following ways:
+ *
+ * - Isolate the ping procedure from the rest of the MonClient's operations,
+ *   allowing us to not acquire or manage the big monc_lock, thus not
+ *   having to block waiting for some other operation to finish before we
+ *   can proceed.
+ *   * for instance, we can ping mon.FOO even if we are currently hunting
+ *     or blocked waiting for auth to complete with mon.BAR.
+ *
+ * - Ping a monitor prior to establishing a connection (using connect())
+ *   and properly establish the MonClient's messenger.  This frees us
+ *   from dealing with the complex foo that happens in connect().
+ *
+ * We also don't rely on MonClient as a dispatcher for this messenger,
+ * unlike what happens with the MonClient's default messenger.  This allows
+ * us to sandbox the whole ping, having it much as a separate entity in
+ * the MonClient class, considerably simplifying the handling and dispatching
+ * of messages without needing to consider monc_lock.
+ *
+ * Current drawback is that we will establish a messenger for each ping
+ * we want to issue, instead of keeping a single messenger instance that
+ * would be used for all pings.
+ */
+int MonClient::ping_monitor(const string &mon_id, string *result_reply)
+{
+  ldout(cct, 10) << __func__ << dendl;
+
+  string new_mon_id;
+  if (monmap.contains("noname-"+mon_id)) {  // prefer the "noname-" entry if the monmap has one
+    new_mon_id = "noname-"+mon_id;
+  } else {
+    new_mon_id = mon_id;
+  }
+
+  if (new_mon_id.empty()) {
+    ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl;
+    return -EINVAL;
+  } else if (!monmap.contains(new_mon_id)) {
+    ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'"
+                   << dendl;
+    return -ENOENT;
+  }
+
+  // N.B. monc isn't initialized
+
+  auth_registry.refresh_config();
+
+  KeyRing keyring;  // local keyring; the MonClient's own keyring member is not used here
+  keyring.from_ceph_context(cct);
+  RotatingKeyRing rkeyring(cct, cct->get_module_type(), &keyring);
+
+  MonClientPinger *pinger = new MonClientPinger(cct,  // deleted at the end of this function
+						&rkeyring,
+						result_reply);
+
+  Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client");  // deleted below
+  smsgr->add_dispatcher_head(pinger);
+  smsgr->set_auth_client(pinger);
+  smsgr->start();
+
+  ConnectionRef con = smsgr->connect_to_mon(monmap.get_addrs(new_mon_id));
+  ldout(cct, 10) << __func__ << " ping mon." << new_mon_id
+                 << " " << con->get_peer_addr() << dendl;
+
+  pinger->mc.reset(new MonConnection(cct, con, 0, &auth_registry));
+  pinger->mc->start(monmap.get_epoch(), entity_name);
+  con->send_message(new MPing);
+
+  int ret = pinger->wait_for_reply(cct->_conf->mon_client_ping_timeout);
+  if (ret == 0) {
+    ldout(cct,10) << __func__ << " got ping reply" << dendl;
+  } else {
+    ret = -ret;  // a non-zero wait result is returned negated
+  }
+
+  con->mark_down();  // teardown: connection, session, messenger, then the pinger
+  pinger->mc.reset();
+  smsgr->shutdown();
+  smsgr->wait();
+  delete smsgr;
+  delete pinger;
+  return ret;
+}
+// Dispatch one incoming message; returns false to leave it to another dispatcher.
+bool MonClient::ms_dispatch(Message *m)
+{
+  // we only care about these message types
+  switch (m->get_type()) {
+  case CEPH_MSG_MON_MAP:
+  case CEPH_MSG_AUTH_REPLY:
+  case CEPH_MSG_MON_SUBSCRIBE_ACK:
+  case CEPH_MSG_MON_GET_VERSION_REPLY:
+  case MSG_MON_COMMAND_ACK:
+  case MSG_COMMAND_REPLY:
+  case MSG_LOGACK:
+  case MSG_CONFIG:
+    break;
+  case CEPH_MSG_PING:  // pings are consumed here without further handling
+    m->put();
+    return true;
+  default:
+    return false;
+  }
+
+  std::lock_guard lock(monc_lock);
+
+  if (!m->get_connection()->is_anon() &&
+      m->get_source().type() == CEPH_ENTITY_TYPE_MON) {
+    if (_hunting()) {
+      auto p = _find_pending_con(m->get_connection());
+      if (p == pending_cons.end()) {
+	// ignore any messages outside hunting sessions
+	ldout(cct, 10) << "discarding stray monitor message " << *m << dendl;
+	m->put();
+	return true;
+      }
+    } else if (!active_con || active_con->get_con() != m->get_connection()) {
+      // ignore any messages outside our session(s)
+      ldout(cct, 10) << "discarding stray monitor message " << *m << dendl;
+      m->put();
+      return true;
+    }
+  }
+
+  switch (m->get_type()) {
+  case CEPH_MSG_MON_MAP:
+    handle_monmap(static_cast<MMonMap*>(m));
+    if (passthrough_monmap) {  // keep the reference: another dispatcher gets the message
+      return false;
+    } else {
+      m->put();
+    }
+    break;
+  case CEPH_MSG_AUTH_REPLY:
+    handle_auth(static_cast<MAuthReply*>(m));  // handle_auth() puts m itself
+    break;
+  case CEPH_MSG_MON_SUBSCRIBE_ACK:
+    handle_subscribe_ack(static_cast<MMonSubscribeAck*>(m));
+    break;
+  case CEPH_MSG_MON_GET_VERSION_REPLY:
+    handle_get_version_reply(static_cast<MMonGetVersionReply*>(m));
+    break;
+  case MSG_MON_COMMAND_ACK:
+    handle_mon_command_ack(static_cast<MMonCommandAck*>(m));
+    break;
+  case MSG_COMMAND_REPLY:
+    if (m->get_connection()->is_anon() &&
+        m->get_source().type() == CEPH_ENTITY_TYPE_MON) {
+      // this connection is from 'tell'... ignore everything except our command
+      // reply.  (we'll get misc other message because we authenticated, but we
+      // don't need them.)
+      handle_command_reply(static_cast<MCommandReply*>(m));
+      return true;
+    }
+    // leave the message for another dispatch handler (e.g., Objecter)
+    return false;
+  case MSG_LOGACK:
+    if (log_client) {
+      log_client->handle_log_ack(static_cast<MLogAck*>(m));
+      m->put();
+      if (more_log_pending) {  // more log entries are waiting: send them now
+	send_log();
+      }
+    } else {
+      m->put();
+    }
+    break;
+  case MSG_CONFIG:
+    handle_config(static_cast<MConfig*>(m));
+    break;
+  }
+  return true;
+}
+
+void MonClient::send_log(bool flush)
+{
+  if (!log_client)
+    return;  // no log client attached, nothing to send
+  if (auto msg = log_client->get_mon_log_message(flush); msg) {
+    _send_mon_message(std::move(msg));
+  }
+  more_log_pending = log_client->are_pending();
+}
+
+void MonClient::flush_log()
+{
+  std::lock_guard locker(monc_lock);  // send_log() itself takes no lock
+  send_log();
+}
+
+/* Unlike all the other message-handling functions, we don't put away a reference
+* because we want to support MMonMap passthrough to other Dispatchers. */
+void MonClient::handle_monmap(MMonMap *m)
+{
+  ldout(cct, 10) << __func__ << " " << *m << dendl;
+  auto con_addrs = m->get_source_addrs();
+  string old_name = monmap.get_name(con_addrs);  // sender's name per the map we held so far
+  const auto old_epoch = monmap.get_epoch();     // captured before decode() overwrites monmap
+
+  auto p = m->monmapbl.cbegin();
+  decode(monmap, p);
+
+  ldout(cct, 10) << " got monmap " << monmap.epoch
+		 << " from mon." << old_name
+		 << " (according to old e" << old_epoch << ")"
+ 		 << dendl;
+  ldout(cct, 10) << "dump:\n";
+  monmap.print(*_dout);
+  *_dout << dendl;
+
+  if (old_epoch != monmap.get_epoch()) {  // the map changed: forget the 'tried' set
+    tried.clear();
+  }
+  if (old_name.size() == 0) {
+    ldout(cct,10) << " can't identify which mon we were connected to" << dendl;
+    _reopen_session();
+  } else {
+    auto new_name = monmap.get_name(con_addrs);  // same addrs looked up in the new map
+    if (new_name.empty()) {
+      ldout(cct, 10) << "mon." << old_name << " at " << con_addrs
+		     << " went away" << dendl;
+      // can't find the mon we were talking to (above)
+      _reopen_session();
+    } else if (messenger->should_use_msgr2() &&
+	       monmap.get_addrs(new_name).has_msgr2() &&
+	       !con_addrs.has_msgr2()) {
+      ldout(cct,1) << " mon." << new_name << " has (v2) addrs "
+		   << monmap.get_addrs(new_name) << " but i'm connected to "
+		   << con_addrs << ", reconnecting" << dendl;
+      _reopen_session();
+    }
+  }
+
+  cct->set_mon_addrs(monmap);
+
+  sub.got("monmap", monmap.get_epoch());
+  map_cond.notify_all();  // wakes threads waiting on map_cond (e.g. get_monmap())
+  want_monmap = false;
+
+  if (authenticate_err == 1) {
+    _finish_auth(0);
+  }
+}
+// Handle an MConfig: stash it for the bootstrap path, or apply it on finish_strand.
+void MonClient::handle_config(MConfig *m)
+{
+  ldout(cct,10) << __func__ << " " << *m << dendl;
+
+  if (want_bootstrap_config) {
+    // get_monmap_and_config is waiting for config which it will apply
+    // synchronously
+    bootstrap_config = ceph::ref_t<MConfig>(m, false);  // NOTE(review): presumably adopts m's reference (add_ref=false) - confirm
+    map_cond.notify_all();
+    return;
+  }
+
+  // Take the sledgehammer approach to ensuring we don't depend on
+  // anything in MonClient.
+  boost::asio::post(finish_strand,
+		    [m, cct = boost::intrusive_ptr<CephContext>(cct),
+		     config_notify_cb = config_notify_cb,
+		     config_cb = config_cb]() {
+		      cct->_conf.set_mon_vals(cct.get(), m->config, config_cb);
+		      if (config_notify_cb) {
+			config_notify_cb();
+		      }
+		      m->put();  // the message is released only after it has been applied
+		    });
+}
+
+// ----------------------
+// Set up keyring, rotating secrets, dispatcher and timer; returns 0 or a negative error.
+int MonClient::init()
+{
+  ldout(cct, 10) << __func__ << dendl;
+
+  entity_name = cct->_conf->name;
+
+  auth_registry.refresh_config();
+
+  std::lock_guard l(monc_lock);  // everything below runs with monc_lock held
+  keyring.reset(new KeyRing);
+  if (auth_registry.is_supported_method(messenger->get_mytype(),
+					CEPH_AUTH_CEPHX)) {
+    // this should succeed, because auth_registry just checked!
+    int r = keyring->from_ceph_context(cct);
+    if (r != 0) {
+      // but be somewhat graceful in case there was a race condition
+      lderr(cct) << "keyring not found" << dendl;
+      return r;
+    }
+  }
+  if (!auth_registry.any_supported_methods(messenger->get_mytype())) {
+    return -ENOENT;  // no usable auth method for our entity type
+  }
+
+  rotating_secrets.reset(
+    new RotatingKeyRing(cct, cct->get_module_type(), keyring.get()));
+
+  initialized = true;
+
+  messenger->set_auth_client(this);
+  messenger->add_dispatcher_head(this);
+
+  timer.init();
+  schedule_tick();
+
+  return 0;
+}
+// Cancel outstanding requests, drop all sessions and auth state, and stop the timer.
+void MonClient::shutdown()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  monc_lock.lock();
+  stopping = true;
+  while (!version_requests.empty()) {  // complete each pending request with shutting_down
+    ceph::async::post(std::move(version_requests.begin()->second),
+		      monc_errc::shutting_down, 0, 0);
+    ldout(cct, 20) << __func__ << " canceling and discarding version request "
+		   << version_requests.begin()->first << dendl;
+    version_requests.erase(version_requests.begin());
+  }
+  while (!mon_commands.empty()) {  // loop ends only once mon_commands is empty
+    auto tid = mon_commands.begin()->first;
+    _cancel_mon_command(tid);
+  }
+  ldout(cct, 20) << __func__ << " discarding " << waiting_for_session.size()
+		 << " pending message(s)" << dendl;
+  waiting_for_session.clear();
+
+  active_con.reset();
+  pending_cons.clear();
+
+  auth.reset();
+  global_id = 0;
+  authenticate_err = 0;
+  authenticated = false;
+
+  monc_lock.unlock();
+
+  if (initialized) {  // note: the flag is cleared while monc_lock is not held
+    initialized = false;
+  }
+  monc_lock.lock();  // timer.shutdown() is called with monc_lock held
+  timer.shutdown();
+  stopping = false;
+  monc_lock.unlock();
+}
+// Wait for an authenticated session; timeout (seconds) <= 0 waits forever. Returns authenticate_err.
+int MonClient::authenticate(double timeout)
+{
+  std::unique_lock lock{monc_lock};
+
+  if (active_con) {
+    ldout(cct, 5) << "already authenticated" << dendl;
+    return 0;
+  }
+  sub.want("monmap", monmap.get_epoch() ? monmap.get_epoch() + 1 : 0, 0);
+  sub.want("config", 0, 0);
+  if (!_opened())
+    _reopen_session();
+
+  auto until = ceph::real_clock::now();  // deadline; only consulted when timeout > 0
+  until += ceph::make_timespan(timeout);
+  if (timeout > 0.0)
+    ldout(cct, 10) << "authenticate will time out at " << until << dendl;
+  while (!active_con && authenticate_err >= 0) {  // until a session exists or an error is recorded
+    if (timeout > 0.0) {
+      auto r = auth_cond.wait_until(lock, until);
+      if (r == std::cv_status::timeout && !active_con) {
+	ldout(cct, 0) << "authenticate timed out after " << timeout << dendl;
+	authenticate_err = -ETIMEDOUT;  // negative value also ends the loop
+      }
+    } else {
+      auth_cond.wait(lock);
+    }
+  }
+
+  if (active_con) {
+    ldout(cct, 5) << __func__ << " success, global_id "
+		  << active_con->get_global_id() << dendl;
+    // active_con should not have been set if there was an error
+    ceph_assert(authenticate_err >= 0);
+    authenticated = true;
+  }
+
+  if (authenticate_err < 0 && auth_registry.no_keyring_disabled_cephx()) {
+    lderr(cct) << __func__ << " NOTE: no keyring found; disabled cephx authentication" << dendl;
+  }
+
+  return authenticate_err;
+}
+// Handle an auth reply for an anon (tell) connection, the active session, or a hunting one.
+void MonClient::handle_auth(MAuthReply *m)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  if (m->get_connection()->is_anon()) {
+    // anon connection, used for mon tell commands
+    for (auto& p : mon_commands) {
+      if (p.second->target_con == m->get_connection()) {
+	auto& mc = p.second->target_session;
+	int ret = mc->handle_auth(m, entity_name,
+				  CEPH_ENTITY_TYPE_MON,
+				  rotating_secrets.get());
+	(void)ret; // we don't care
+	break;
+      }
+    }
+    m->put();
+    return;
+  }
+
+  if (!_hunting()) {  // reply belongs to the already established session
+    std::swap(active_con->get_auth(), auth);  // swapped back right after authenticate()
+    int ret = active_con->authenticate(m);
+    m->put();
+    std::swap(auth, active_con->get_auth());
+    if (global_id != active_con->get_global_id()) {
+      lderr(cct) << __func__ << " peer assigned me a different global_id: "
+		 << active_con->get_global_id() << dendl;
+    }
+    if (ret != -EAGAIN) {  // -EAGAIN: not finished yet, nothing to report
+      _finish_auth(ret);
+    }
+    return;
+  }
+
+  // hunting
+  auto found = _find_pending_con(m->get_connection());
+  ceph_assert(found != pending_cons.end());
+  int auth_err = found->second.handle_auth(m, entity_name, want_keys,
+					   rotating_secrets.get());
+  m->put();
+  if (auth_err == -EAGAIN) {
+    return;
+  }
+  if (auth_err) {
+    pending_cons.erase(found);
+    if (!pending_cons.empty()) {
+      // keep trying with pending connections
+      return;
+    }
+    // the last try just failed, give up.
+  } else {
+    auto& mc = found->second;
+    ceph_assert(mc.have_session());
+    active_con.reset(new MonConnection(std::move(mc)));  // winner becomes the active session
+    pending_cons.clear();
+  }
+
+  _finish_hunting(auth_err);
+  _finish_auth(auth_err);
+}
+
+// Record the outcome of an authentication attempt and wake every thread
+// waiting on auth_cond.  On success with a live session, ticket renewal is
+// checked right away.
+void MonClient::_finish_auth(int auth_err)
+{
+  ldout(cct,10) << __func__ << " " << auth_err << dendl;
+  authenticate_err = auth_err;
+
+  // active_con is re-checked here because _resend_mon_commands() could
+  // _reopen_session() if the connected mon is not the one the MonCommand
+  // is targeting.
+  const bool session_ready = (auth_err == 0) && active_con;
+  if (session_ready) {
+    ceph_assert(auth);
+    _check_auth_tickets();
+  }
+
+  auth_cond.notify_all();
+}
+
+// ---------
+
+// Locking wrapper around _send_mon_message(): acquires monc_lock and
+// forwards the message.
+void MonClient::send_mon_message(MessageRef m)
+{
+  std::lock_guard locker{monc_lock};
+  _send_mon_message(std::move(m));
+}
+
+// Send a message on the active monitor connection, or park it in
+// waiting_for_session when there is no active connection.
+// Caller must hold monc_lock.
+void MonClient::_send_mon_message(MessageRef m)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  if (!active_con) {
+    // no session yet: queue the message
+    waiting_for_session.push_back(std::move(m));
+    return;
+  }
+
+  auto mon_con = active_con->get_con();
+  ldout(cct, 10) << "_send_mon_message to mon."
+                 << monmap.get_name(mon_con->get_peer_addr())
+                 << " at " << mon_con->get_peer_addr() << dendl;
+  mon_con->send_message2(std::move(m));
+}
+
+// Drop the current session (active and pending connections) and start
+// hunting again.  With rank >= 0 only that monitor is contacted; otherwise
+// _add_conns() picks the candidates.  Queued messages are discarded and
+// outstanding version requests are completed with session_reset.
+// Caller must hold monc_lock.
+void MonClient::_reopen_session(int rank)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  ldout(cct, 10) << __func__ << " rank " << rank << dendl;
+
+  active_con.reset();
+  pending_cons.clear();
+
+  // a positive value marks authentication as in progress
+  authenticate_err = 1;
+
+  _start_hunting();
+
+  if (rank < 0) {
+    _add_conns();
+  } else {
+    _add_conn(rank);
+  }
+
+  // messages queued for the old session are thrown away
+  waiting_for_session.clear();
+
+  // so are version check requests: each one is completed with session_reset
+  for (auto req = version_requests.begin();
+       req != version_requests.end();
+       req = version_requests.erase(req)) {
+    ceph::async::post(std::move(req->second),
+                      monc_errc::session_reset, 0, 0);
+  }
+
+  // kick off the handshake on every candidate connection
+  for (auto& candidate : pending_cons) {
+    candidate.second.start(monmap.get_epoch(), entity_name);
+  }
+
+  if (sub.reload()) {
+    _renew_subs();
+  }
+}
+
+// Open a connection to the monitor with the given rank and register it in
+// pending_cons.  If we already hold an auth handler, the new connection
+// receives a clone of it.
+void MonClient::_add_conn(unsigned rank)
+{
+  auto mon_addrs = monmap.get_addrs(rank);
+  auto mon_con = messenger->connect_to_mon(mon_addrs);
+
+  MonConnection candidate(cct, mon_con, global_id, &auth_registry);
+  if (auth) {
+    candidate.get_auth().reset(auth->clone());
+  }
+  pending_cons.insert(std::make_pair(mon_addrs, std::move(candidate)));
+
+  ldout(cct, 10) << "picked mon." << monmap.get_name(rank)
+                 << " con " << mon_con
+                 << " addr " << mon_addrs
+                 << dendl;
+}
+
+// Choose the next set of monitors to hunt and open connections to them.
+//
+// Candidates are the not-yet-tried monitors that share the numerically
+// lowest priority value.  They are shuffled (weighted, unless all weights
+// are zero) and the first mon_client_hunt_parallel of them are contacted;
+// a setting of 0 means all of them.
+void MonClient::_add_conns()
+{
+  // collect the next batch of candidates who are listed right next to the
+  // ones already tried
+  auto get_next_batch = [this]() -> std::vector<unsigned> {
+    std::multimap<uint16_t, unsigned> ranks_by_priority;
+    // each insertion is hinted just past the previously inserted element
+    auto hint = ranks_by_priority.end();
+    for (auto& info : monmap.mon_info) {
+      auto rank = monmap.get_rank(info.first);
+      if (tried.count(rank) != 0) {
+        continue;  // already tried
+      }
+      hint = ranks_by_priority.insert(
+        hint, std::make_pair(info.second.priority, rank));
+      ++hint;
+    }
+    if (ranks_by_priority.empty()) {
+      return {};
+    }
+    // keep only the monitors whose priority value equals the first
+    // (lowest) key
+    const auto lowest =
+      ranks_by_priority.equal_range(ranks_by_priority.begin()->first);
+    std::vector<unsigned> ranks;
+    for (auto it = lowest.first; it != lowest.second; ++it) {
+      ranks.push_back(it->second);
+    }
+    return ranks;
+  };
+
+  auto ranks = get_next_batch();
+  if (ranks.empty()) {
+    tried.clear();  // start over
+    ranks = get_next_batch();
+  }
+  ceph_assert(!ranks.empty());
+
+  if (ranks.size() > 1) {
+    std::vector<uint16_t> weights;
+    for (auto r : ranks) {
+      weights.push_back(monmap.get_weight(monmap.get_name(r)));
+    }
+    random_device_t rd;
+    const bool no_weights =
+      std::accumulate(begin(weights), end(weights), 0u) == 0;
+    if (no_weights) {
+      std::shuffle(begin(ranks), end(ranks), std::mt19937{rd()});
+    } else {
+      weighted_shuffle(begin(ranks), end(ranks), begin(weights), end(weights),
+                       std::mt19937{rd()});
+    }
+  }
+  ldout(cct, 10) << __func__ << " ranks=" << ranks << dendl;
+
+  // 0 (or anything larger than the batch) means "all of them"
+  unsigned n = cct->_conf->mon_client_hunt_parallel;
+  if (n == 0 || n > ranks.size()) {
+    n = ranks.size();
+  }
+  for (auto it = ranks.begin(); it != ranks.begin() + n; ++it) {
+    _add_conn(*it);
+    tried.insert(*it);
+  }
+}
+
+// Messenger callback for a reset connection.
+//
+// Returns false for non-monitor peers and after reopening the session
+// because the active monitor connection was reset; returns true in every
+// other case.
+bool MonClient::ms_handle_reset(Connection *con)
+{
+  std::lock_guard lock(monc_lock);
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON)
+    return false;
+
+  if (con->is_anon()) {
+    // anonymous connection: re-drive the first tell command that owns it
+    for (auto& entry : mon_commands) {
+      MonCommand *cmd = entry.second;
+      if (cmd->target_con == con) {
+        _send_command(cmd); // may retry or fail
+        break;
+      }
+    }
+    return true;
+  }
+
+  if (_hunting()) {
+    // nothing to do while hunting beyond noting which kind of mon it was
+    if (pending_cons.count(con->get_peer_addrs())) {
+      ldout(cct, 10) << __func__ << " hunted mon " << con->get_peer_addrs()
+                     << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " stray mon " << con->get_peer_addrs()
+                     << dendl;
+    }
+    return true;
+  }
+
+  const bool is_current = active_con && con == active_con->get_con();
+  if (!is_current) {
+    ldout(cct, 10) << "ms_handle_reset stray mon " << con->get_peer_addrs()
+                   << dendl;
+    return true;
+  }
+
+  ldout(cct, 10) << __func__ << " current mon " << con->get_peer_addrs()
+                 << dendl;
+  _reopen_session();
+  return false;
+}
+
+// True when there is an active connection or a hunt is in progress.
+// Caller must hold monc_lock.
+bool MonClient::_opened() const
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  if (active_con) {
+    return true;
+  }
+  return _hunting();
+}
+
+// True while at least one candidate connection is pending.
+bool MonClient::_hunting() const
+{
+  const bool no_candidates = pending_cons.empty();
+  return !no_candidates;
+}
+
+// Prepare for a new hunt.  Once we have had a connection before, the
+// reopen interval multiplier is scaled by mon_client_hunt_interval_backoff
+// and capped at mon_client_hunt_interval_max_multiple.
+void MonClient::_start_hunting()
+{
+  ceph_assert(!_hunting());
+
+  // adjust timeouts only if we have had a connection before
+  if (had_a_connection) {
+    reopen_interval_multiplier *= cct->_conf->mon_client_hunt_interval_backoff;
+    const auto ceiling = cct->_conf->mon_client_hunt_interval_max_multiple;
+    if (reopen_interval_multiplier > ceiling) {
+      reopen_interval_multiplier = ceiling;
+    }
+  }
+}
+
+// Conclude a hunt, successful or not.  pending_cons must already be empty.
+// On success the queued messages and mon commands are (re)sent and the
+// auth handler and global_id are taken over from the new active
+// connection.
+// Caller must hold monc_lock.
+void MonClient::_finish_hunting(int auth_err)
+{
+  ldout(cct,10) << __func__ << " " << auth_err << dendl;
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  // the pending conns have been cleaned.
+  ceph_assert(!_hunting());
+
+  if (!active_con) {
+    ldout(cct, 1) << "no mon sessions established" << dendl;
+  } else {
+    auto mon_con = active_con->get_con();
+    ldout(cct, 1) << "found mon."
+                  << monmap.get_name(mon_con->get_peer_addr())
+                  << dendl;
+  }
+
+  had_a_connection = true;
+  _un_backoff();
+
+  if (auth_err) {
+    return;
+  }
+
+  last_rotating_renew_sent = utime_t();
+
+  // flush whatever was queued while there was no session
+  for (; !waiting_for_session.empty(); waiting_for_session.pop_front()) {
+    _send_mon_message(std::move(waiting_for_session.front()));
+  }
+  _resend_mon_commands();
+  send_log(true);
+
+  // active_con is checked again before it is used
+  if (active_con) {
+    auth = std::move(active_con->get_auth());
+    const auto assigned_id = active_con->get_global_id();
+    if (global_id && global_id != assigned_id) {
+      lderr(cct) << __func__ << " global_id changed from " << global_id
+                 << " to " << active_con->get_global_id() << dendl;
+    }
+    global_id = active_con->get_global_id();
+  }
+}
+
+// Periodic maintenance: check auth tickets and tell commands, continue a
+// hunt if one is running, otherwise renew subscriptions, send keepalives
+// (reopening the session when acks stop arriving) and flush the log.
+// The next tick is scheduled on every exit path by the scope guard.
+void MonClient::tick()
+{
+  ldout(cct, 10) << __func__ << dendl;
+
+  utime_t now = ceph_clock_now();
+
+  auto reschedule_tick = make_scope_guard([this] {
+      schedule_tick();
+    });
+
+  _check_auth_tickets();
+  _check_tell_commands();
+
+  if (_hunting()) {
+    ldout(cct, 1) << "continuing hunt" << dendl;
+    _reopen_session();
+    return;
+  }
+  if (!active_con) {
+    return;
+  }
+
+  // we have a session: just renew as needed
+  auto cur_con = active_con->get_con();
+  if (!cur_con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) {
+    const bool maybe_renew = sub.need_renew();
+    ldout(cct, 10) << "renew subs? -- " << (maybe_renew ? "yes" : "no")
+                   << dendl;
+    if (maybe_renew) {
+      _renew_subs();
+    }
+  }
+
+  const bool keepalive_due =
+    now > last_keepalive + cct->_conf->mon_client_ping_interval;
+  if (keepalive_due) {
+    cur_con->send_keepalive();
+    last_keepalive = now;
+
+    const bool check_acks =
+      cct->_conf->mon_client_ping_timeout > 0 &&
+      cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2);
+    if (check_acks) {
+      utime_t lk = cur_con->get_last_keepalive_ack();
+      utime_t interval = now - lk;
+      if (interval > cct->_conf->mon_client_ping_timeout) {
+        ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+                      << " seconds), reconnecting" << dendl;
+        _reopen_session();
+        return;
+      }
+    }
+
+    _un_backoff();
+  }
+
+  const bool log_due =
+    now > last_send_log + cct->_conf->mon_client_log_interval;
+  if (log_due) {
+    send_log();
+    last_send_log = now;
+  }
+}
+
+// Relax the reconnect backoff: divide the multiplier by
+// mon_client_hunt_interval_backoff, never going below
+// mon_client_hunt_interval_min_multiple.
+void MonClient::_un_backoff()
+{
+  const double min_multiple =
+    cct->_conf.get_val<double>("mon_client_hunt_interval_min_multiple");
+  const double relaxed =
+    reopen_interval_multiplier /
+    cct->_conf.get_val<double>("mon_client_hunt_interval_backoff");
+  reopen_interval_multiplier = std::max(min_multiple, relaxed);
+
+  ldout(cct, 20) << __func__ << " reopen_interval_multipler now "
+                 << reopen_interval_multiplier << dendl;
+}
+
+// Arm the timer for the next tick().  While connected the delay is the
+// smaller of the ping and log intervals; otherwise it is the hunt interval
+// scaled by the current reopen multiplier.
+void MonClient::schedule_tick()
+{
+  auto do_tick = make_lambda_context([this](int) { tick(); });
+
+  if (is_connected()) {
+    // keep in touch
+    const auto keepalive_delay =
+      std::min(cct->_conf->mon_client_ping_interval,
+               cct->_conf->mon_client_log_interval);
+    timer.add_event_after(keepalive_delay, do_tick);
+  } else {
+    // start another round of hunting
+    const auto hunt_interval = (cct->_conf->mon_client_hunt_interval *
+                                reopen_interval_multiplier);
+    timer.add_event_after(hunt_interval, do_tick);
+  }
+}
+
+// ---------
+
+// Send the current subscription set to the monitor if there is anything
+// new to send.  Without an open session a new one is opened instead.
+// Caller must hold monc_lock.
+void MonClient::_renew_subs()
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  if (!sub.have_new()) {
+    ldout(cct, 10) << __func__ << " - empty" << dendl;
+    return;
+  }
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (!_opened()) {
+    _reopen_session();
+    return;
+  }
+
+  auto request = ceph::make_message<MMonSubscribe>();
+  request->what = sub.get_subs();
+  request->hostname = ceph_get_short_hostname();
+  _send_mon_message(std::move(request));
+  sub.renewed();
+}
+
+// Record the interval carried by a subscribe ack, then release the
+// message.
+void MonClient::handle_subscribe_ack(MMonSubscribeAck *m)
+{
+  const auto acked_interval = m->interval;
+  sub.acked(acked_interval);
+  m->put();
+}
+
+// Request new tickets from the monitor if the auth handler says they are
+// needed, then check the rotating keys.  Does nothing without both an
+// active connection and an auth handler.
+// Caller must hold monc_lock.  Always returns 0.
+int MonClient::_check_auth_tickets()
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  if (active_con && auth) {
+    if (auth->need_tickets()) {
+      ldout(cct, 10) << __func__ << " getting new tickets!" << dendl;
+      auto m = ceph::make_message<MAuth>();
+      m->protocol = auth->get_protocol();
+      auth->prepare_build_request();
+      auth->build_request(m->auth_payload);
+      // hand the reference over instead of copying it, matching the other
+      // senders in this file
+      _send_mon_message(std::move(m));
+    }
+
+    _check_auth_rotating();
+  }
+  return 0;
+}
+
+// Renew the rotating service keys when they are about to expire.
+//
+// Nothing happens if this entity does not need rotating keys, if there is
+// no authenticated session yet, if the current keys are still fresh, or if
+// a renewal was sent less than a second ago.
+// Caller must hold monc_lock.  Always returns 0.
+int MonClient::_check_auth_rotating()
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  const bool wants_rotating =
+    rotating_secrets && auth_principal_needs_rotating_keys(entity_name);
+  if (!wants_rotating) {
+    ldout(cct, 20) << "_check_auth_rotating not needed by " << entity_name << dendl;
+    return 0;
+  }
+
+  const bool have_auth_session = active_con && auth;
+  if (!have_auth_session) {
+    ldout(cct, 10) << "_check_auth_rotating waiting for auth session" << dendl;
+    return 0;
+  }
+
+  utime_t now = ceph_clock_now();
+
+  // keys expiring before `cutoff` are considered in need of renewal
+  utime_t cutoff = now;
+  cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+
+  // one full ticket ttl in the past
+  utime_t issued_at_lower_bound = now;
+  issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl;
+
+  if (!rotating_secrets->need_new_secrets(cutoff)) {
+    ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl;
+    rotating_secrets->dump_rotating();
+    return 0;
+  }
+
+  ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl;
+
+  // the key has expired before it has been issued?
+  const bool expired_too_early =
+    !rotating_secrets->need_new_secrets() &&
+    rotating_secrets->need_new_secrets(issued_at_lower_bound);
+  if (expired_too_early) {
+    lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early"
+               << " (before " << issued_at_lower_bound << ")" << dendl;
+  }
+
+  // do not send renewals more often than once a second
+  const bool sent_recently =
+    (now > last_rotating_renew_sent) &&
+    double(now - last_rotating_renew_sent) < 1;
+  if (sent_recently) {
+    ldout(cct, 10) << __func__ << " called too often (last: "
+                   << last_rotating_renew_sent << "), skipping refresh" << dendl;
+    return 0;
+  }
+
+  auto request = ceph::make_message<MAuth>();
+  request->protocol = auth->get_protocol();
+  if (auth->build_rotating_request(request->auth_payload)) {
+    last_rotating_renew_sent = now;
+    _send_mon_message(std::move(request));
+  }
+  return 0;
+}
+
+// Block for up to `timeout` seconds until the rotating keys are fresh, or
+// until this entity is found not to need them.
+//
+// Returns 0 immediately when the protocol is CEPH_AUTH_NONE or there is no
+// rotating key store; returns -ETIMEDOUT if the wait expires.
+int MonClient::wait_auth_rotating(double timeout)
+{
+  std::unique_lock l(monc_lock);
+
+  // Must be initialized
+  ceph_assert(auth != nullptr);
+
+  if (auth->get_protocol() == CEPH_AUTH_NONE)
+    return 0;
+
+  if (!rotating_secrets)
+    return 0;
+
+  ldout(cct, 10) << __func__ << " waiting for " << timeout << dendl;
+
+  // the freshness cutoff is computed once, before waiting
+  utime_t cutoff = ceph_clock_now();
+  cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0);
+
+  auto keys_ready = [this, cutoff] {
+    return (!auth_principal_needs_rotating_keys(entity_name) ||
+            !rotating_secrets->need_new_secrets(cutoff));
+  };
+  const bool ready =
+    auth_cond.wait_for(l, ceph::make_timespan(timeout), keys_ready);
+  if (!ready) {
+    ldout(cct, 0) << __func__ << " timed out after " << timeout << dendl;
+    return -ETIMEDOUT;
+  }
+
+  ldout(cct, 10) << __func__ << " done" << dendl;
+  return 0;
+}
+
+// ---------
+
+// (Re)send a mon command.
+//
+// Tell-style commands are limited to mon_client_directed_command_retry
+// attempts.  When min_mon_release is octopus or later they go over their
+// own anonymous connection to the target monitor.  For older monitors the
+// main session is reopened towards the target if we are connected
+// elsewhere; otherwise the command is sent like a normal CLI command.
+//
+// NOTE: the failure paths call _finish_command(), which deletes `r`.
+void MonClient::_send_command(MonCommand *r)
+{
+  if (r->is_tell()) {
+    if (++r->send_attempts > cct->_conf->mon_client_directed_command_retry) {
+      _finish_command(r, monc_errc::mon_unavailable, "mon unavailable", {});
+      return;
+    }
+
+    // tell-style command
+    if (monmap.min_mon_release >= ceph_release_t::octopus) {
+      // tear down the connection of any previous attempt
+      if (r->target_con) {
+        r->target_con->mark_down();
+      }
+
+      const bool by_rank = r->target_rank >= 0;
+      if (by_rank) {
+        if (r->target_rank >= (int)monmap.size()) {
+          ldout(cct, 10) << " target " << r->target_rank
+                         << " >= max mon " << monmap.size() << dendl;
+          _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+          return;
+        }
+        r->target_con = messenger->connect_to_mon(
+          monmap.get_addrs(r->target_rank), true /* anon */);
+      } else {
+        if (!monmap.contains(r->target_name)) {
+          ldout(cct, 10) << " target " << r->target_name
+                         << " not present in monmap" << dendl;
+          _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+          return;
+        }
+        r->target_con = messenger->connect_to_mon(
+          monmap.get_addrs(r->target_name), true /* anon */);
+      }
+
+      // start a dedicated session on the new connection and queue the
+      // command on it
+      r->target_session.reset(new MonConnection(cct, r->target_con, 0,
+                                                &auth_registry));
+      r->target_session->start(monmap.get_epoch(), entity_name);
+      r->last_send_attempt = ceph_clock_now();
+
+      MCommand *tell = new MCommand(monmap.fsid);
+      tell->set_tid(r->tid);
+      tell->cmd = r->cmd;
+      tell->set_data(r->inbl);
+      r->target_session->queue_command(tell);
+      return;
+    }
+
+    // ugly legacy handling of pre-octopus mons
+    entity_addr_t peer;
+    if (active_con) {
+      peer = active_con->get_con()->get_peer_addr();
+    }
+
+    const bool wrong_rank =
+      r->target_rank >= 0 &&
+      r->target_rank != monmap.get_rank(peer);
+    if (wrong_rank) {
+      ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+                     << " wants rank " << r->target_rank
+                     << ", reopening session"
+                     << dendl;
+      if (r->target_rank >= (int)monmap.size()) {
+        ldout(cct, 10) << " target " << r->target_rank
+                       << " >= max mon " << monmap.size() << dendl;
+        _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+        return;
+      }
+      _reopen_session(r->target_rank);
+      return;
+    }
+
+    const bool wrong_name =
+      r->target_name.length() &&
+      r->target_name != monmap.get_name(peer);
+    if (wrong_name) {
+      ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+                     << " wants mon " << r->target_name
+                     << ", reopening session"
+                     << dendl;
+      if (!monmap.contains(r->target_name)) {
+        ldout(cct, 10) << " target " << r->target_name
+                       << " not present in monmap" << dendl;
+        _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+        return;
+      }
+      _reopen_session(monmap.get_rank(r->target_name));
+      return;
+    }
+    // already connected to the target: fall through and send it as a
+    // 'normal' CLI command
+  }
+
+  // normal CLI command
+  ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl;
+  auto request = ceph::make_message<MMonCommand>(monmap.fsid);
+  request->set_tid(r->tid);
+  request->cmd = r->cmd;
+  request->set_data(r->inbl);
+  _send_mon_message(std::move(request));
+}
+
+// Resend tell commands whose last send attempt is older than
+// mon_client_hunt_interval.
+void MonClient::_check_tell_commands()
+{
+  auto now = ceph_clock_now();
+
+  for (auto it = mon_commands.begin(); it != mon_commands.end(); ) {
+    // step past the entry first: _send_command() might remove it from
+    // mon_commands
+    auto cmd = (it++)->second;
+
+    const bool timed_out =
+      cmd->is_tell() &&
+      cmd->last_send_attempt != utime_t() &&
+      now - cmd->last_send_attempt > cct->_conf->mon_client_hunt_interval;
+    if (timed_out) {
+      ldout(cct,5) << __func__ << " timeout tell command " << cmd->tid << dendl;
+      _send_command(cmd);
+    }
+  }
+}
+
+// Resend every outstanding mon command, except tell commands when
+// min_mon_release is octopus or later.
+void MonClient::_resend_mon_commands()
+{
+  for (auto it = mon_commands.begin(); it != mon_commands.end(); ) {
+    // step past the entry first: _send_command() might remove it from
+    // mon_commands
+    auto cmd = (it++)->second;
+
+    // starting with octopus, tell commands use their own connection and
+    // need no special resend when we finish hunting.
+    const bool has_own_connection =
+      cmd->is_tell() && monmap.min_mon_release >= ceph_release_t::octopus;
+    if (!has_own_connection) {
+      _send_command(cmd);
+    }
+  }
+}
+
+// Complete the mon command matching an MMonCommandAck.  An ack with tid 0
+// is attributed to the first outstanding command, if any.  Acks for
+// unknown tids are dropped.
+void MonClient::handle_mon_command_ack(MMonCommandAck *ack)
+{
+  const uint64_t tid = ack->get_tid();
+  MonCommand *r = nullptr;
+
+  if (tid == 0 && !mon_commands.empty()) {
+    r = mon_commands.begin()->second;
+    ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << r->tid << dendl;
+  } else {
+    auto match = mon_commands.find(tid);
+    if (match == mon_commands.end()) {
+      ldout(cct, 10) << __func__ << " " << ack->get_tid() << " not found" << dendl;
+      ack->put();
+      return;
+    }
+    r = match->second;
+  }
+
+  ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl;
+
+  // a negative result becomes an error code in the mon category
+  bs::error_code ec;
+  if (ack->r < 0) {
+    ec = bs::error_code(-ack->r, mon_category());
+  }
+  _finish_command(r, ec, ack->rs,
+                  std::move(ack->get_data()));
+  ack->put();
+}
+
+// Complete the mon command matching an MCommandReply.  A reply with tid 0
+// is attributed to the first outstanding command, if any.  Replies for
+// unknown tids are dropped.
+void MonClient::handle_command_reply(MCommandReply *reply)
+{
+  const uint64_t tid = reply->get_tid();
+  MonCommand *r = nullptr;
+
+  if (tid == 0 && !mon_commands.empty()) {
+    r = mon_commands.begin()->second;
+    ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << r->tid
+                   << dendl;
+  } else {
+    auto match = mon_commands.find(tid);
+    if (match == mon_commands.end()) {
+      ldout(cct, 10) << __func__ << " " << reply->get_tid() << " not found"
+                     << dendl;
+      reply->put();
+      return;
+    }
+    r = match->second;
+  }
+
+  ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl;
+
+  // a negative result becomes an error code in the mon category
+  bs::error_code ec;
+  if (reply->r < 0) {
+    ec = bs::error_code(-reply->r, mon_category());
+  }
+  _finish_command(r, ec, reply->rs, std::move(reply->get_data()));
+  reply->put();
+}
+
+int MonClient::_cancel_mon_command(uint64_t tid)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  auto it = mon_commands.find(tid);
+  if (it == mon_commands.end()) {
+    ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+    return -ENOENT;
+  }
+
+  ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+
+  MonCommand *cmd = it->second;
+  _finish_command(cmd, monc_errc::timed_out, "timed out"sv, {});
+  return 0;
+}
+
+// Complete a mon command: post its completion with the given result, mark
+// down its dedicated connection if it has one, remove it from mon_commands
+// and delete it.  `r` must not be used after this call.
+void MonClient::_finish_command(MonCommand *r, bs::error_code ret,
+                                std::string_view rs, ceph::buffer::list&& bl)
+{
+  ldout(cct, 10) << __func__ << " " << r->tid << " = " << ret << " " << rs
+                 << dendl;
+
+  ceph::async::post(std::move(r->onfinish), ret, std::string(rs),
+                    std::move(bl));
+
+  if (auto& dedicated_con = r->target_con; dedicated_con) {
+    dedicated_con->mark_down();
+  }
+
+  mon_commands.erase(r->tid);
+  delete r;
+}
+
+// ---------
+
+// Complete the version request identified by the reply's handle, passing
+// on the reported version and oldest version.  A reply with an unknown
+// handle is only logged.
+// Caller must hold monc_lock.
+void MonClient::handle_get_version_reply(MMonGetVersionReply* m)
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+
+  auto pending = version_requests.find(m->handle);
+  if (pending != version_requests.end()) {
+    // take the completion out of the map before posting it
+    auto completion = std::move(pending->second);
+    ldout(cct, 10) << __func__ << " finishing " << pending->first << " version "
+                   << m->version << dendl;
+    version_requests.erase(pending);
+    ceph::async::post(std::move(completion), bs::error_code(),
+                      m->version, m->oldest_version);
+  } else {
+    ldout(cct, 0) << __func__ << " version request with handle " << m->handle
+                  << " not found" << dendl;
+  }
+
+  m->put();
+}
+
+// Produce the initial auth request for an outgoing connection.
+//
+// For monitor peers the request is built by the MonConnection that owns
+// `con` (a tell command's session for anonymous connections, otherwise a
+// pending connection); -ENOENT is returned when nobody owns it.  For every
+// other peer type an authorizer is built from our auth handler and its
+// payload is returned in `bl`; -EACCES is returned when that is not
+// possible.
+int MonClient::get_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t *auth_method,
+  std::vector<uint32_t> *preferred_modes,
+  ceph::buffer::list *bl)
+{
+  std::lock_guard l(monc_lock);
+  ldout(cct,10) << __func__ << " con " << con << " auth_method " << *auth_method
+                << dendl;
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+    // not a monitor: generate an authorizer
+    if (!auth) {
+      lderr(cct) << __func__ << " but no auth handler is set up" << dendl;
+      return -EACCES;
+    }
+    auth_meta->authorizer.reset(auth->build_authorizer(con->get_peer_type()));
+    if (!auth_meta->authorizer) {
+      lderr(cct) << __func__ << " failed to build_authorizer for type "
+                 << ceph_entity_type_name(con->get_peer_type()) << dendl;
+      return -EACCES;
+    }
+    auth_meta->auth_method = auth_meta->authorizer->protocol;
+    auth_registry.get_supported_modes(con->get_peer_type(),
+                                      auth_meta->auth_method,
+                                      preferred_modes);
+    *bl = auth_meta->authorizer->bl;
+    return 0;
+  }
+
+  // connection to a mon
+  ceph_assert(!auth_meta->authorizer);
+  if (con->is_anon()) {
+    for (auto& entry : mon_commands) {
+      MonCommand *cmd = entry.second;
+      if (cmd->target_con == con) {
+        return cmd->target_session->get_auth_request(
+          auth_method, preferred_modes, bl,
+          entity_name, want_keys, rotating_secrets.get());
+      }
+    }
+  }
+  for (auto& entry : pending_cons) {
+    auto& candidate = entry.second;
+    if (candidate.is_con(con)) {
+      return candidate.get_auth_request(
+        auth_method, preferred_modes, bl,
+        entity_name, want_keys, rotating_secrets.get());
+    }
+  }
+  return -ENOENT;
+}
+
+// Continue a multi-step auth exchange on an outgoing connection.
+//
+// For monitor peers the payload is forwarded to the MonConnection that
+// owns `con`; -ENOENT is returned when nobody owns it.  For every other
+// peer type the payload is added to the authorizer as a challenge and the
+// updated authorizer payload is returned in `reply`.
+int MonClient::handle_auth_reply_more(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  const ceph::buffer::list& bl,
+  ceph::buffer::list *reply)
+{
+  std::lock_guard l(monc_lock);
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+    // authorizer challenges
+    const bool have_authorizer = auth && auth_meta->authorizer;
+    if (!have_authorizer) {
+      lderr(cct) << __func__ << " no authorizer?" << dendl;
+      return -1;
+    }
+    auth_meta->authorizer->add_challenge(cct, bl);
+    *reply = auth_meta->authorizer->bl;
+    return 0;
+  }
+
+  if (con->is_anon()) {
+    for (auto& entry : mon_commands) {
+      MonCommand *cmd = entry.second;
+      if (cmd->target_con == con) {
+        return cmd->target_session->handle_auth_reply_more(
+          auth_meta, bl, reply);
+      }
+    }
+  }
+  for (auto& entry : pending_cons) {
+    auto& candidate = entry.second;
+    if (candidate.is_con(con)) {
+      return candidate.handle_auth_reply_more(auth_meta, bl, reply);
+    }
+  }
+  return -ENOENT;
+}
+
+// Handle the final auth message on an outgoing connection.
+//
+// For monitor peers (under monc_lock) the payload is forwarded to the
+// MonConnection that owns `con`.  When that is a pending connection, a
+// success promotes it to the active connection and ends the hunt; a
+// failure drops it and only ends the hunt if it was the last candidate.
+// -ENOENT is returned when nobody owns the connection.
+//
+// For every other peer type the authorizer reply is verified and the
+// session key is copied into auth_meta; -EACCES is returned when the
+// verification fails or when there is no authorizer to verify against.
+int MonClient::handle_auth_done(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint64_t global_id,
+  uint32_t con_mode,
+  const ceph::buffer::list& bl,
+  CryptoKey *session_key,
+  std::string *connection_secret)
+{
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+    std::lock_guard l(monc_lock);
+    if (con->is_anon()) {
+      for (auto& i : mon_commands) {
+	if (i.second->target_con == con) {
+	  return i.second->target_session->handle_auth_done(
+	    auth_meta, global_id, bl,
+	    session_key, connection_secret);
+	}
+      }
+    }
+    for (auto& i : pending_cons) {
+      if (i.second.is_con(con)) {
+	int r = i.second.handle_auth_done(
+	  auth_meta, global_id, bl,
+	  session_key, connection_secret);
+	if (r) {
+	  // `i` refers to the erased entry from here on and must not be
+	  // touched again
+	  pending_cons.erase(i.first);
+	  if (!pending_cons.empty()) {
+	    return r;
+	  }
+	} else {
+	  active_con.reset(new MonConnection(std::move(i.second)));
+	  pending_cons.clear();
+	  ceph_assert(active_con->have_session());
+	}
+
+	_finish_hunting(r);
+	// on success, _finish_auth() is only called here once the monmap
+	// epoch is non-zero
+	if (r || monmap.get_epoch() > 0) {
+	  _finish_auth(r);
+	}
+	return r;
+      }
+    }
+    return -ENOENT;
+  } else {
+    // verify authorizer reply
+    if (!auth_meta->authorizer) {
+      // same guard as handle_auth_reply_more(): never dereference a
+      // missing authorizer
+      lderr(cct) << __func__ << " no authorizer?" << dendl;
+      return -EACCES;
+    }
+    auto p = bl.begin();
+    if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) {
+      ldout(cct, 0) << __func__ << " failed verifying authorizer reply"
+		    << dendl;
+      return -EACCES;
+    }
+    auth_meta->session_key = auth_meta->authorizer->session_key;
+    return 0;
+  }
+}
+
+// Handle a peer rejecting the auth method we proposed.
+//
+// The allowed methods are recorded in auth_meta.  For monitor peers the
+// rejection is forwarded to the MonConnection that owns `con`: a tell
+// command is failed with "auth failed" on a negative result; a pending
+// connection either retries with another method (result 0) or is dropped,
+// ending the hunt if it was the last candidate.  -ENOENT is returned when
+// nobody owns the connection.  For every other peer type the rejection is
+// logged and -EACCES returned.
+int MonClient::handle_auth_bad_method(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t old_auth_method,
+  int result,
+  const std::vector<uint32_t>& allowed_methods,
+  const std::vector<uint32_t>& allowed_modes)
+{
+  auth_meta->allowed_methods = allowed_methods;
+
+  std::lock_guard l(monc_lock);
+
+  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+    // huh...
+    ldout(cct,10) << __func__ << " hmm, they didn't like " << old_auth_method
+                  << " result " << cpp_strerror(result)
+                  << " and auth is " << (auth ? auth->get_protocol() : 0)
+                  << dendl;
+    return -EACCES;
+  }
+
+  if (con->is_anon()) {
+    for (auto& entry : mon_commands) {
+      MonCommand *cmd = entry.second;
+      if (cmd->target_con != con) {
+        continue;
+      }
+      int r = cmd->target_session->handle_auth_bad_method(
+        old_auth_method,
+        result,
+        allowed_methods,
+        allowed_modes);
+      if (r < 0) {
+        // this removes the command from mon_commands; we return right
+        // after, so the loop never advances
+        auto ec = bs::error_code(-r, mon_category());
+        _finish_command(cmd, ec, "auth failed"sv, {});
+      }
+      return r;
+    }
+  }
+
+  for (auto& entry : pending_cons) {
+    if (!entry.second.is_con(con)) {
+      continue;
+    }
+    int r = entry.second.handle_auth_bad_method(old_auth_method,
+                                                result,
+                                                allowed_methods,
+                                                allowed_modes);
+    if (r == 0) {
+      return r; // try another method on this con
+    }
+    // `entry` refers to the erased element from here on
+    pending_cons.erase(entry.first);
+    if (!pending_cons.empty()) {
+      return r;  // fail this con, maybe another con will succeed
+    }
+    // fail hunt
+    _finish_hunting(r);
+    _finish_auth(r);
+    return r;
+  }
+  return -ENOENT;
+}
+
+// Verify the authorizer presented on an incoming connection.
+//
+// Returns 1 when the peer is accepted, 0 when a challenge was added and
+// another round is expected, -EOPNOTSUPP when no handler exists for the
+// auth method, and -EACCES otherwise.
+int MonClient::handle_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  bool more,
+  uint32_t auth_method,
+  const ceph::buffer::list& payload,
+  ceph::buffer::list *reply)
+{
+  if (payload.length() == 0) {
+    // for some channels prior to nautilus (osd heartbeat), we
+    // tolerate the lack of an authorizer.
+    if (con->get_messenger()->require_authorizer) {
+      return -EACCES;
+    }
+    handle_authentication_dispatcher->ms_handle_authentication(con);
+    return 1;
+  }
+
+  // the first payload byte is the auth mode; only authorizer modes are
+  // accepted here
+  auth_meta->auth_mode = payload[0];
+  const bool authorizer_mode =
+    auth_meta->auth_mode >= AUTH_MODE_AUTHORIZER &&
+    auth_meta->auth_mode <= AUTH_MODE_AUTHORIZER_MAX;
+  if (!authorizer_mode) {
+    return -EACCES;
+  }
+
+  AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(),
+                                                        auth_method);
+  if (!ah) {
+    lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method "
+               << auth_method << dendl;
+    return -EOPNOTSUPP;
+  }
+
+  // pass the challenge slot to the handler unless challenges are skipped
+  auto challenge_slot = &auth_meta->authorizer_challenge;
+  if (auth_meta->skip_authorizer_challenge) {
+    ldout(cct, 10) << __func__ << " skipping challenge on " << con << dendl;
+    challenge_slot = nullptr;
+  }
+
+  // remember whether a challenge already existed before verification
+  const bool had_challenge =
+    static_cast<bool>(auth_meta->authorizer_challenge);
+  const bool isvalid = ah->verify_authorizer(
+    cct,
+    *rotating_secrets,
+    payload,
+    auth_meta->get_connection_secret_length(),
+    reply,
+    &con->peer_name,
+    &con->peer_global_id,
+    &con->peer_caps_info,
+    &auth_meta->session_key,
+    &auth_meta->connection_secret,
+    challenge_slot);
+  if (isvalid) {
+    handle_authentication_dispatcher->ms_handle_authentication(con);
+    return 1;
+  }
+
+  // a challenge created by this very call means the peer gets another try
+  const bool challenge_added =
+    !more && !had_challenge && auth_meta->authorizer_challenge;
+  if (challenge_added) {
+    ldout(cct,10) << __func__ << " added challenge on " << con << dendl;
+    return 0;
+  }
+
+  ldout(cct,10) << __func__ << " bad authorizer on " << con << dendl;
+  // discard old challenge
+  auth_meta->authorizer_challenge.reset();
+  return -EACCES;
+}
+
+// Build an authorizer for the given service type using our auth handler.
+// Returns nullptr (and logs) when no auth handler is available.
+AuthAuthorizer* MonClient::build_authorizer(int service_id) const
+{
+  std::lock_guard l(monc_lock);
+
+  if (!auth) {
+    ldout(cct, 0) << __func__ << " for " << ceph_entity_type_name(service_id)
+                  << ", but no auth is available now" << dendl;
+    return nullptr;
+  }
+  return auth->build_authorizer(service_id);
+}
+
+#define dout_subsys ceph_subsys_monc
+#undef dout_prefix
+#define dout_prefix *_dout << "monclient" << (have_session() ? ": " : "(hunting): ")
+
+// Wrap a messenger connection to a monitor; only stores its arguments.
+MonConnection::MonConnection(
+  CephContext *cct,
+  ConnectionRef con,
+  uint64_t global_id,
+  AuthRegistry *ar)
+  : cct(cct),
+    con(con),
+    global_id(global_id),
+    auth_registry(ar)
+{
+}
+
+// Mark the underlying connection down, if one is still held, and release
+// our reference to it.
+MonConnection::~MonConnection()
+{
+  if (!con) {
+    return;
+  }
+  con->mark_down();
+  con.reset();
+}
+
+// True once this connection has reached State::HAVE_SESSION.
+bool MonConnection::have_session() const
+{
+  const bool established = (State::HAVE_SESSION == state);
+  return established;
+}
+
+// Begin the handshake on this connection.
+//
+// For msgr2 peers only an MMonGetMap is sent.  Otherwise a keepalive is
+// sent, followed by an initial MAuth carrying the supported auth methods,
+// our entity name and our global_id.
+void MonConnection::start(epoch_t epoch,
+                          const EntityName& entity_name)
+{
+  using ceph::encode;
+  auth_start = ceph_clock_now();
+
+  if (con->get_peer_addr().is_msgr2()) {
+    ldout(cct, 10) << __func__ << " opening mon connection" << dendl;
+    state = State::AUTHENTICATING;
+    con->send_message(new MMonGetMap());
+    return;
+  }
+
+  // restart authentication handshake
+  state = State::NEGOTIATING;
+
+  // send an initial keepalive to ensure our timestamp is valid by the
+  // time we are in an OPENED state (by sequencing this before
+  // authentication).
+  con->send_keepalive();
+
+  auto hello = new MAuth;
+  hello->protocol = CEPH_AUTH_UNKNOWN;
+  hello->monmap_epoch = epoch;
+
+  // payload: struct version, supported methods, entity name, global id
+  __u8 struct_v = 1;
+  encode(struct_v, hello->auth_payload);
+
+  std::vector<uint32_t> auth_supported;
+  auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported);
+  encode(auth_supported, hello->auth_payload);
+
+  encode(entity_name, hello->auth_payload);
+  encode(global_id, hello->auth_payload);
+
+  con->send_message(hello);
+}
+
+// Build the initial auth request for this monitor connection.
+//
+// If no auth method has been chosen yet, the first supported one is used.
+// Returns -EACCES when no method or no connection mode is available,
+// 0 otherwise.
+int MonConnection::get_auth_request(
+  uint32_t *method,
+  std::vector<uint32_t> *preferred_modes,
+  ceph::buffer::list *bl,
+  const EntityName& entity_name,
+  uint32_t want_keys,
+  RotatingKeyRing* keyring)
+{
+  using ceph::encode;
+
+  // choose method
+  if (auth_method < 0) {
+    std::vector<uint32_t> supported;
+    auth_registry->get_supported_methods(con->get_peer_type(), &supported);
+    if (supported.empty()) {
+      return -EACCES;
+    }
+    auth_method = supported.front();
+  }
+  *method = auth_method;
+
+  auth_registry->get_supported_modes(con->get_peer_type(), auth_method,
+                                     preferred_modes);
+  ldout(cct,10) << __func__ << " method " << *method
+                << " preferred_modes " << *preferred_modes << dendl;
+  if (preferred_modes->empty()) {
+    return -EACCES;
+  }
+
+  const int r = _init_auth(*method, entity_name, want_keys, keyring, true);
+  ceph_assert(r == 0);
+
+  // initial request includes some boilerplate...
+  encode((char)AUTH_MODE_MON, *bl);
+  encode(entity_name, *bl);
+  encode(global_id, *bl);
+
+  // and (maybe) some method-specific initial payload
+  auth->build_initial_request(bl);
+
+  return 0;
+}
+
+// Feed an intermediate auth payload to the auth handler.
+//
+// When the handler returns -EAGAIN, the next request is built into `reply`
+// and 0 is returned.  A negative result is logged and returned.  A
+// non-negative result is not handled here and aborts.
+int MonConnection::handle_auth_reply_more(
+  AuthConnectionMeta *auth_meta,
+  const ceph::buffer::list& bl,
+  ceph::buffer::list *reply)
+{
+  ldout(cct, 10) << __func__ << " payload " << bl.length() << dendl;
+  ldout(cct, 30) << __func__ << " got\n";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  auto cursor = bl.cbegin();
+  ldout(cct, 10) << __func__ << " payload_len " << bl.length() << dendl;
+  const int r = auth->handle_response(0, cursor, &auth_meta->session_key,
+                                      &auth_meta->connection_secret);
+
+  if (r == -EAGAIN) {
+    // another round trip is needed
+    auth->prepare_build_request();
+    auth->build_request(*reply);
+    ldout(cct, 10) << __func__ << " responding with " << reply->length()
+                   << " bytes" << dendl;
+    return 0;
+  }
+
+  if (r < 0) {
+    lderr(cct) << __func__ << " handle_response returned " << r << dendl;
+    return r;
+  }
+
+  ldout(cct, 10) << __func__ << " authenticated!" << dendl;
+  // FIXME
+  ceph_abort(cct, "write me");
+  return r;
+}
+
+int MonConnection::handle_auth_done(  // record the new global_id and process the final auth payload
+  AuthConnectionMeta *auth_meta,  ///< its session_key / connection_secret are handed to the auth handler
+  uint64_t new_global_id,  ///< stored in global_id and passed to the auth handler
+  const ceph::buffer::list& bl,  ///< payload for auth->handle_response()
+  CryptoKey *session_key,  ///< not referenced in this body (auth_meta->session_key is used)
+  std::string *connection_secret)  ///< not referenced in this body (auth_meta->connection_secret is used)
+{  // returns the result of auth->handle_response()
+  ldout(cct,10) << __func__ << " global_id " << new_global_id
+		<< " payload " << bl.length()
+		<< dendl;
+  global_id = new_global_id;
+  auth->set_global_id(global_id);
+  auto p = bl.begin();
+  int auth_err = auth->handle_response(0, p, &auth_meta->session_key,
+				       &auth_meta->connection_secret);
+  if (auth_err >= 0) {  // only a non-negative result moves the connection to HAVE_SESSION
+    state = State::HAVE_SESSION;
+  }
+  con->set_last_keepalive_ack(auth_start);
+
+  if (pending_tell_command) {  // a queued command is sent regardless of auth_err
+    con->send_message2(std::move(pending_tell_command));
+  }
+  return auth_err;
+}
+
+int MonConnection::handle_auth_bad_method(  // pick the next auth method to try after one was rejected
+  uint32_t old_auth_method,  ///< method that was just tried; must be one of our supported methods
+  int result,  ///< error code for the rejected attempt (only logged)
+  const std::vector<uint32_t>& allowed_methods,  ///< methods the server allows
+  const std::vector<uint32_t>& allowed_modes)  ///< not used by this function
+{  // returns 0 with auth_method updated, or -EACCES when no later supported method is allowed
+  ldout(cct,10) << __func__ << " old_auth_method " << old_auth_method
+		<< " result " << cpp_strerror(result)
+		<< " allowed_methods " << allowed_methods << dendl;
+  std::vector<uint32_t> auth_supported;
+  auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported);
+  auto p = std::find(auth_supported.begin(), auth_supported.end(),
+		     old_auth_method);
+  ceph_assert(p != auth_supported.end());  // must be checked in every build: std::next(end()) below is undefined
+  p = std::find_first_of(std::next(p), auth_supported.end(),  // only consider methods after the rejected one
+			 allowed_methods.begin(), allowed_methods.end());
+  if (p == auth_supported.end()) {
+    lderr(cct) << __func__ << " server allowed_methods " << allowed_methods
+	       << " but i only support " << auth_supported << dendl;
+    return -EACCES;
+  }
+  auth_method = *p;
+  ldout(cct,10) << __func__ << " will try " << auth_method << " next" << dendl;
+  return 0;
+}
+
+int MonConnection::handle_auth(MAuthReply* m,
+			       const EntityName& entity_name,
+			       uint32_t want_keys,
+			       RotatingKeyRing* keyring)
+{
+  int rc = 0;  // result of the most recent step; non-zero stops the sequence
+  if (state == State::NEGOTIATING) {
+    rc = _negotiate(m, entity_name, want_keys, keyring);
+    if (rc == 0)
+      state = State::AUTHENTICATING;  // negotiation done, fall through to authenticate
+  }
+  if (rc == 0) {
+    rc = authenticate(m);
+    if (rc == 0)
+      state = State::HAVE_SESSION;  // authenticate() returned 0
+  }
+  return rc;
+}
+
+int MonConnection::_negotiate(MAuthReply *m,
+			      const EntityName& entity_name,
+			      uint32_t want_keys,
+			      RotatingKeyRing* keyring)
+{  // set up a handler for m->protocol; returns _init_auth()'s result or m->result
+  const int rc = _init_auth(m->protocol, entity_name, want_keys, keyring, false);
+  if (rc != -ENOTSUP)
+    return rc;
+  // no handler for m->protocol: hand back the result carried in the reply
+  if (m->result == -ENOTSUP) {
+    ldout(cct, 10) << "none of our auth protocols are supported by the server"
+		   << dendl;
+  }
+  return m->result;
+}
+
+int MonConnection::_init_auth(  // ensure `auth` is a reset or newly created handler for `method`
+  uint32_t method,  ///< auth protocol the handler must implement
+  const EntityName& entity_name,  ///< passed to auth->init() when a new handler is created
+  uint32_t want_keys,  ///< requested key mask; the MGR bit may be cleared below
+  RotatingKeyRing* keyring,  ///< passed to AuthClientHandler::create()
+  bool msgr2)  ///< when true, the pre-kraken MGR key check is skipped
+{  // returns 0, or -ENOTSUP when no handler can be created for `method`
+  ldout(cct, 10) << __func__ << " method " << method << dendl;
+  if (auth && auth->get_protocol() == (int)method) {  // reuse: only reset(); want_keys/entity_name are not re-applied
+    ldout(cct, 10) << __func__ << " already have auth, reseting" << dendl;
+    auth->reset();
+    return 0;
+  }
+
+  ldout(cct, 10) << __func__ << " creating new auth" << dendl;
+  auth.reset(AuthClientHandler::create(cct, method, keyring));  // replaces any handler for another protocol
+  if (!auth) {
+    ldout(cct, 10) << " no handler for protocol " << method << dendl;
+    return -ENOTSUP;
+  }
+
+  // do not request MGR key unless the mon has the SERVER_KRAKEN
+  // feature.  otherwise it will give us an auth error.  note that
+  // we have to use the FEATUREMASK because pre-jewel the kraken
+  // feature bit was used for something else.
+  if (!msgr2 &&
+      (want_keys & CEPH_ENTITY_TYPE_MGR) &&
+      !(con->has_features(CEPH_FEATUREMASK_SERVER_KRAKEN))) {
+    ldout(cct, 1) << __func__
+		  << " not requesting MGR keys from pre-kraken monitor"
+		  << dendl;
+    want_keys &= ~CEPH_ENTITY_TYPE_MGR;  // want_keys is a by-value parameter, so the caller's mask is untouched
+  }
+  auth->set_want_keys(want_keys);
+  auth->init(entity_name);
+  auth->set_global_id(global_id);
+  return 0;
+}
+
+int MonConnection::authenticate(MAuthReply *m)  // feed an MAuthReply to the auth handler; returns handle_response()'s result
+{
+  ceph_assert(auth);  // a handler must already exist (see _init_auth())
+  if (!m->global_id) {  // only logged; processing continues with the zero id
+    ldout(cct, 1) << "peer sent an invalid global_id" << dendl;
+  }
+  if (m->global_id != global_id) {
+    // it's a new session
+    auth->reset();
+    global_id = m->global_id;
+    auth->set_global_id(global_id);
+    ldout(cct, 10) << "my global_id is " << m->global_id << dendl;
+  }
+  auto p = m->result_bl.cbegin();
+  int ret = auth->handle_response(m->result, p, nullptr, nullptr);  // no session key / connection secret outputs requested
+  if (ret == -EAGAIN) {  // handler needs another round: build and send the next MAuth
+    auto ma = new MAuth;
+    ma->protocol = auth->get_protocol();
+    auth->prepare_build_request();
+    auth->build_request(ma->auth_payload);
+    con->send_message(ma);
+  }
+  if (ret == 0 && pending_tell_command) {  // a queued command goes out once the handler returns 0
+    con->send_message2(std::move(pending_tell_command));
+  }
+
+  return ret;
+}
+
+void MonClient::register_config_callback(md_config_t::config_callback fn) {  // store the config callback
+  ceph_assert(!config_cb);  // asserts that no callback has been registered before
+  config_cb = fn;
+}
+
+md_config_t::config_callback MonClient::get_config_callback() {  // returns a copy of the stored callback
+  return config_cb;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+class monc_error_category : public ceph::converting_category {  // error category for monc_errc values
+public:
+  monc_error_category(){}
+  const char* name() const noexcept override;  // "monc"
+  const char* message(int ev, char*, std::size_t) const noexcept override;  // description as a string literal; buffer args unused
+  std::string message(int ev) const override;  // same text, returned as std::string
+  bs::error_condition default_error_condition(int ev) const noexcept
+    override;
+  bool equivalent(int ev, const bs::error_condition& c) const
+    noexcept override;
+  using ceph::converting_category::equivalent;  // keep the base-class overloads of equivalent() visible
+  int from_code(int ev) const noexcept override;  // monc_errc value -> negative errno (0 stays 0)
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+const char* monc_error_category::name() const noexcept {  // name of this error category
+  return "monc";
+}
+
+const char* monc_error_category::message(int ev, char*, std::size_t) const noexcept {
+  // Describe a monc_errc value with a string literal.  The buffer and
+  // length parameters are left unnamed and are never written to, so the
+  // returned pointer always refers to static storage.  Zero is reported
+  // as "No error" before the enum is consulted.
+  if (ev == 0) {
+    return "No error";
+  }
+
+  const char* text = "Unknown error";  // kept for values that are not monc_errc enumerators
+  switch (static_cast<monc_errc>(ev)) {
+  case monc_errc::shutting_down:   text = "Command failed due to MonClient shutting down"; break;
+  case monc_errc::session_reset:   text = "Monitor session was reset"; break;
+  case monc_errc::rank_dne:        text = "Requested monitor rank does not exist"; break;
+  case monc_errc::mon_dne:         text = "Requested monitor does not exist"; break;
+  case monc_errc::timed_out:       text = "Monitor operation timed out"; break;
+  case monc_errc::mon_unavailable: text = "Monitor unavailable"; break;
+  }
+
+  return text;
+}
+
+std::string monc_error_category::message(int ev) const {  // std::string form of the three-argument overload
+  return message(ev, nullptr, 0);  // no buffer is supplied; the returned literal is copied into a std::string
+}
+
+bs::error_condition monc_error_category::default_error_condition(int ev) const noexcept {
+  // Translate each monc_errc value into the portable condition it should
+  // compare equal to.
+  const auto code = static_cast<monc_errc>(ev);
+  if (code == monc_errc::shutting_down)
+    return bs::errc::operation_canceled;
+  if (code == monc_errc::session_reset)
+    return bs::errc::resource_unavailable_try_again;
+  if (code == monc_errc::rank_dne || code == monc_errc::mon_dne)
+    return ceph::errc::not_in_map;
+  if (code == monc_errc::timed_out)
+    return bs::errc::timed_out;
+  if (code == monc_errc::mon_unavailable)
+    return bs::errc::no_such_device;
+  // anything else becomes a condition in this category with the same value
+  return { ev, *this };
+}
+
+bool monc_error_category::equivalent(int ev, const bs::error_condition& c) const noexcept {
+  // A code must always match its own default condition (for the two *_dne
+  // codes that is ceph::errc::not_in_map).  Those two codes additionally
+  // match ENOENT, so comparing against either condition succeeds.
+  const auto code = static_cast<monc_errc>(ev);
+  if ((code == monc_errc::rank_dne || code == monc_errc::mon_dne) &&
+      c == bs::errc::no_such_file_or_directory)
+    return true;
+  return default_error_condition(ev) == c;
+}
+
+int monc_error_category::from_code(int ev) const noexcept {
+  // Convert a monc_errc value into a negative errno.  Zero stays zero and
+  // anything that is not a monc_errc enumerator becomes -EDOM.
+  if (ev == 0)
+    return 0;
+
+  const auto code = static_cast<monc_errc>(ev);
+  if (code == monc_errc::shutting_down)
+    return -ECANCELED;
+  if (code == monc_errc::session_reset)
+    return -EAGAIN;
+  if (code == monc_errc::rank_dne || code == monc_errc::mon_dne)
+    return -ENOENT;
+  if (code == monc_errc::timed_out)
+    return -ETIMEDOUT;
+  if (code == monc_errc::mon_unavailable)
+    return -ENXIO;
+  // not a monc_errc enumerator
+  return -EDOM;
+}
+
+const bs::error_category& monc_category() noexcept {  // accessor for the single monc_error_category instance
+  static const monc_error_category c;  // function-local static: every call returns the same object
+  return c;
+}
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
new file mode 100644
index 000000000..6a7daa814
--- /dev/null
+++ b/src/mon/MonClient.h
@@ -0,0 +1,774 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_MONCLIENT_H
+#define CEPH_MONCLIENT_H
+
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "msg/Messenger.h"
+
+#include "MonMap.h"
+#include "MonSub.h"
+
+#include "common/async/completion.h"
+#include "common/Timer.h"
+#include "common/config.h"
+#include "messages/MMonGetVersion.h"
+
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+class MMonMap;
+class MConfig;
+class MMonGetVersionReply;
+class MMonCommandAck;
+class LogClient;
+class AuthClientHandler;
+class AuthRegistry;
+class KeyRing;
+class RotatingKeyRing;
+
+class MonConnection {  // one monitor connection together with its authentication state
+public:
+  MonConnection(CephContext *cct,
+		ConnectionRef conn,
+		uint64_t global_id,
+		AuthRegistry *auth_registry);
+  ~MonConnection();
+  MonConnection(MonConnection&& rhs) = default;  // movable ...
+  MonConnection& operator=(MonConnection&&) = default;
+  MonConnection(const MonConnection& rhs) = delete;  // ... but not copyable
+  MonConnection& operator=(const MonConnection&) = delete;
+  int handle_auth(MAuthReply *m,  // handle an MAuthReply: negotiate first if still NEGOTIATING, then authenticate()
+		  const EntityName& entity_name,
+		  uint32_t want_keys,
+		  RotatingKeyRing* keyring);
+  int authenticate(MAuthReply *m);  // pass the reply to the auth handler; sends another MAuth on -EAGAIN
+  void start(epoch_t epoch,
+             const EntityName& entity_name);
+  bool have_session() const;
+  uint64_t get_global_id() const {
+    return global_id;
+  }
+  ConnectionRef get_con() {
+    return con;
+  }
+  std::unique_ptr<AuthClientHandler>& get_auth() {  // reference to the owning pointer; null until a handler is created
+    return auth;
+  }
+
+  int get_auth_request(  // choose method/modes and encode the initial auth request into *out
+    uint32_t *method,
+    std::vector<uint32_t> *preferred_modes,
+    ceph::buffer::list *out,
+    const EntityName& entity_name,
+    uint32_t want_keys,
+    RotatingKeyRing* keyring);
+  int handle_auth_reply_more(  // process an intermediate payload; may fill *reply with the next request
+    AuthConnectionMeta *auth_meta,
+    const ceph::buffer::list& bl,
+    ceph::buffer::list *reply);
+  int handle_auth_done(  // record the new global_id and process the final payload
+    AuthConnectionMeta *auth_meta,
+    uint64_t global_id,
+    const ceph::buffer::list& bl,
+    CryptoKey *session_key,
+    std::string *connection_secret);
+  int handle_auth_bad_method(  // select the next auth method to try, or fail with -EACCES
+    uint32_t old_auth_method,
+    int result,
+    const std::vector<uint32_t>& allowed_methods,
+    const std::vector<uint32_t>& allowed_modes);
+
+  bool is_con(Connection *c) const {  // true if c is the connection wrapped by this object
+    return con.get() == c;
+  }
+  void queue_command(Message *m) {  // remember a command to send once authentication completes
+    pending_tell_command = m;  // NOTE(review): raw Message* assigned to a MessageRef; confirm who owns the caller's reference
+  }
+
+private:
+  int _negotiate(MAuthReply *m,  // set up the handler for m->protocol (used while NEGOTIATING)
+		 const EntityName& entity_name,
+		 uint32_t want_keys,
+		 RotatingKeyRing* keyring);
+  int _init_auth(uint32_t method,  // reset the existing handler or create one for `method`
+		 const EntityName& entity_name,
+		 uint32_t want_keys,
+		 RotatingKeyRing* keyring,
+		 bool msgr2);
+
+private:
+  CephContext *cct;
+  enum class State {
+    NONE,
+    NEGOTIATING,       // v1 only
+    AUTHENTICATING,    // v1 and v2
+    HAVE_SESSION,
+  };
+  State state = State::NONE;
+  ConnectionRef con;
+  int auth_method = -1;  ///< auth method in use; negative until get_auth_request() picks one
+  utime_t auth_start;  ///< passed to set_last_keepalive_ack() in handle_auth_done(); presumably set when auth begins
+
+  std::unique_ptr<AuthClientHandler> auth;  ///< current auth handler; created by _init_auth()
+  uint64_t global_id;
+
+  MessageRef pending_tell_command;  ///< set by queue_command(); sent after successful auth
+
+  AuthRegistry *auth_registry;  ///< queried for supported methods and modes
+};
+
+
+struct MonClientPinger : public Dispatcher,  // dispatcher + auth client that waits for a single ping reply
+			 public AuthClient {
+  ceph::mutex lock = ceph::make_mutex("MonClientPinger::lock");  ///< held while touching done and decoding into *result
+  ceph::condition_variable ping_recvd_cond;  ///< notified whenever done is set
+  std::string *result;  ///< optional destination for the ping payload; may be null
+  bool done;  ///< set by a ping reply or a connection reset
+  RotatingKeyRing *keyring;  ///< forwarded to mc->get_auth_request()
+  std::unique_ptr<MonConnection> mc;  ///< target of all AuthClient calls below; dereferenced without a null check
+
+  MonClientPinger(CephContext *cct_,
+		  RotatingKeyRing *keyring,
+		  std::string *res_) :
+    Dispatcher(cct_),
+    result(res_),
+    done(false),
+    keyring(keyring)
+  { }
+
+  int wait_for_reply(double timeout = 0.0) {  // block until done is set or `timeout` seconds elapse
+    std::unique_lock locker{lock};
+    if (timeout <= 0) {  // non-positive timeout: fall back to the client_mount_timeout option
+      timeout = std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count();
+    }
+    done = false;
+    if (ping_recvd_cond.wait_for(locker,
+				 ceph::make_timespan(timeout),
+				 [this] { return done; })) {
+      return 0;
+    } else {
+      return ETIMEDOUT;  // NOTE: positive errno value, not negated
+    }
+  }
+
+  bool ms_dispatch(Message *m) override {  // consume CEPH_MSG_PING replies; other messages are not handled
+    using ceph::decode;
+    std::lock_guard l(lock);
+    if (m->get_type() != CEPH_MSG_PING)
+      return false;
+
+    ceph::buffer::list &payload = m->get_payload();
+    if (result && payload.length() > 0) {  // decode only when the caller wants it and there is data
+      auto p = std::cbegin(payload);
+      decode(*result, p);
+    }
+    done = true;
+    ping_recvd_cond.notify_all();
+    m->put();
+    return true;
+  }
+  bool ms_handle_reset(Connection *con) override {  // a reset also wakes the waiter (wait_for_reply() then returns 0)
+    std::lock_guard l(lock);
+    done = true;
+    ping_recvd_cond.notify_all();
+    return true;
+  }
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override {
+    return false;
+  }
+
+  // AuthClient: each hook forwards to mc; con is never used and auth_meta only where mc needs it
+  int get_auth_request(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t *auth_method,
+    std::vector<uint32_t> *preferred_modes,
+    ceph::buffer::list *bl) override {
+    return mc->get_auth_request(auth_method, preferred_modes, bl,
+				cct->_conf->name, 0, keyring);  // want_keys = 0
+  }
+  int handle_auth_reply_more(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    const ceph::buffer::list& bl,
+    ceph::buffer::list *reply) override {
+    return mc->handle_auth_reply_more(auth_meta, bl, reply);
+  }
+  int handle_auth_done(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint64_t global_id,
+    uint32_t con_mode,
+    const ceph::buffer::list& bl,
+    CryptoKey *session_key,
+    std::string *connection_secret) override {
+    return mc->handle_auth_done(auth_meta, global_id, bl,  // con_mode is not forwarded
+				session_key, connection_secret);
+  }
+  int handle_auth_bad_method(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t old_auth_method,
+    int result,
+    const std::vector<uint32_t>& allowed_methods,
+    const std::vector<uint32_t>& allowed_modes) override {
+    return mc->handle_auth_bad_method(old_auth_method, result,
+				      allowed_methods, allowed_modes);
+  }
+};
+
+const boost::system::error_category& monc_category() noexcept;
+
+enum class monc_errc {  // error codes belonging to monc_category(); numbering starts at 1
+  shutting_down = 1, // Command failed due to MonClient shutting down
+  session_reset, // Monitor session was reset
+  rank_dne, // Requested monitor rank does not exist
+  mon_dne, // Requested monitor does not exist
+  timed_out, // Monitor operation timed out
+  mon_unavailable // Monitor unavailable
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::monc_errc> {  // marks monc_errc as an error-code enum for boost::system
+  static const bool value = true;
+};
+}
+
+// implicit conversion: wraps a monc_errc value as an error_code in monc_category()
+inline boost::system::error_code make_error_code(monc_errc e) noexcept {
+  return { static_cast<int>(e), monc_category() };
+}
+
+// explicit conversion: wraps a monc_errc value as an error_condition in monc_category()
+inline boost::system::error_condition make_error_condition(monc_errc e) noexcept {
+  return { static_cast<int>(e), monc_category() };
+}
+
+const boost::system::error_category& monc_category() noexcept;
+
+class MonClient : public Dispatcher,  // client-side monitor session: auth hooks, subscriptions, commands
+		  public AuthClient,
+		  public AuthServer /* for mgr, osd, mds */ {
+  static constexpr auto dout_subsys = ceph_subsys_monc;
+public:
+  // Error, Newest, Oldest
+  using VersionSig = void(boost::system::error_code, version_t, version_t);
+  using VersionCompletion = ceph::async::Completion<VersionSig>;  // completion type used by get_version()
+
+  using CommandSig = void(boost::system::error_code, std::string,  // error, status string, output data
+			  ceph::buffer::list);
+  using CommandCompletion = ceph::async::Completion<CommandSig>;  // completion type used by start_mon_command()
+
+  MonMap monmap;  // public, but the accessors in this class read it under monc_lock
+  std::map<std::string,std::string> config_mgr;
+
+private:
+  Messenger *messenger;  // set via set_messenger()
+
+  std::unique_ptr<MonConnection> active_con;  // non-null means is_connected()
+  std::map<entity_addrvec_t, MonConnection> pending_cons;  // searched by _find_pending_con()
+  std::set<unsigned> tried;
+
+  EntityName entity_name;  // set via set_entity_name()
+
+  mutable ceph::mutex monc_lock = ceph::make_mutex("MonClient::monc_lock");  // mutable so const accessors can lock it
+  SafeTimer timer;
+  boost::asio::io_context& service;  // supplies the executor for command/version completions
+  boost::asio::io_context::strand finish_strand{service};
+
+  bool initialized;  // start_mon_command() fails with shutting_down while this is false
+  bool stopping = false;  // start_mon_command() fails with shutting_down once this is true
+
+  LogClient *log_client;  // set via set_log_client()
+  bool more_log_pending;
+
+  void send_log(bool flush = false);
+
+  bool ms_dispatch(Message *m) override;
+  bool ms_handle_reset(Connection *con) override;
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override { return false; }
+
+  void handle_monmap(MMonMap *m);
+  void handle_config(MConfig *m);
+
+  void handle_auth(MAuthReply *m);
+
+  // monitor session
+  utime_t last_keepalive;
+  utime_t last_send_log;
+
+  void tick();
+  void schedule_tick();
+
+  // monclient
+  bool want_monmap;
+  ceph::condition_variable map_cond;
+  bool passthrough_monmap = false;  // toggled by set_/unset_passthrough_monmap()
+
+  bool want_bootstrap_config = false;
+  ceph::ref_t<MConfig> bootstrap_config;
+
+  // authenticate
+  std::unique_ptr<AuthClientHandler> auth;
+  uint32_t want_keys = 0;  // set via set_want_keys()
+  uint64_t global_id = 0;  // returned by get_global_id() under monc_lock
+  ceph::condition_variable auth_cond;
+  int authenticate_err = 0;
+  bool authenticated = false;  // returned by is_authenticated()
+
+  std::list<MessageRef> waiting_for_session;
+  utime_t last_rotating_renew_sent;
+  bool had_a_connection;
+  double reopen_interval_multiplier;
+
+  Dispatcher *handle_authentication_dispatcher = nullptr;  // set via set_handle_authentication_dispatcher()
+  bool _opened() const;
+  bool _hunting() const;
+  void _start_hunting();
+  void _finish_hunting(int auth_err);
+  void _finish_auth(int auth_err);
+  void _reopen_session(int rank = -1);  // public reopen_session() calls this under monc_lock
+  void _add_conn(unsigned rank);
+  void _add_conns();
+  void _un_backoff();
+  void _send_mon_message(MessageRef m);
+
+  std::map<entity_addrvec_t, MonConnection>::iterator _find_pending_con(  // linear search of pending_cons by connection
+    const ConnectionRef& con) {
+    for (auto i = pending_cons.begin(); i != pending_cons.end(); ++i) {
+      if (i->second.get_con() == con) {  // compares the ConnectionRef values
+	return i;
+      }
+    }
+    return pending_cons.end();  // not found
+  }
+
+public:
+  // AuthClient (overrides; defined out of line)
+  int get_auth_request(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t *method,
+    std::vector<uint32_t> *preferred_modes,
+    ceph::buffer::list *bl) override;
+  int handle_auth_reply_more(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    const ceph::buffer::list& bl,
+    ceph::buffer::list *reply) override;
+  int handle_auth_done(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint64_t global_id,
+    uint32_t con_mode,
+    const ceph::buffer::list& bl,
+    CryptoKey *session_key,
+    std::string *connection_secret) override;
+  int handle_auth_bad_method(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t old_auth_method,
+    int result,
+    const std::vector<uint32_t>& allowed_methods,
+    const std::vector<uint32_t>& allowed_modes) override;
+  // AuthServer (override; defined out of line)
+  int handle_auth_request(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    bool more,
+    uint32_t auth_method,
+    const ceph::buffer::list& bl,
+    ceph::buffer::list *reply) override;
+
+  void set_entity_name(EntityName name) { entity_name = name; }  // note: does not take monc_lock
+  void set_handle_authentication_dispatcher(Dispatcher *d) {  // note: does not take monc_lock
+    handle_authentication_dispatcher = d;
+  }
+  int _check_auth_tickets();
+  int _check_auth_rotating();
+  int wait_auth_rotating(double timeout);
+
+  int authenticate(double timeout=0.0);
+  bool is_authenticated() const {return authenticated;}  // reads the flag without taking monc_lock
+
+  bool is_connected() const { return active_con != nullptr; }  // reads active_con without taking monc_lock
+
+  /**
+   * Try to flush as many log messages as we can in a single
+   * message.  Use this before shutting down to transmit your
+   * last message.
+   */
+  void flush_log();
+
+private:
+  // mon subscriptions
+  MonSub sub;  // only accessed through the wrappers below, which hold monc_lock
+  void _renew_subs();
+  void handle_subscribe_ack(MMonSubscribeAck* m);
+
+public:
+  void renew_subs() {  // locked wrapper around _renew_subs()
+    std::lock_guard l(monc_lock);
+    _renew_subs();
+  }
+  bool sub_want(std::string what, version_t start, unsigned flags) {  // locked wrapper around sub.want()
+    std::lock_guard l(monc_lock);
+    return sub.want(what, start, flags);
+  }
+  void sub_got(std::string what, version_t have) {  // locked wrapper around sub.got()
+    std::lock_guard l(monc_lock);
+    sub.got(what, have);
+  }
+  void sub_unwant(std::string what) {  // locked wrapper around sub.unwant()
+    std::lock_guard l(monc_lock);
+    sub.unwant(what);
+  }
+  bool sub_want_increment(std::string what, version_t start, unsigned flags) {  // locked wrapper around sub.inc_want()
+    std::lock_guard l(monc_lock);
+    return sub.inc_want(what, start, flags);
+  }
+
+  std::unique_ptr<KeyRing> keyring;  // public data member
+  std::unique_ptr<RotatingKeyRing> rotating_secrets;  // public data member
+
+ public:
+  MonClient(CephContext *cct_, boost::asio::io_context& service);
+  MonClient(const MonClient &) = delete;  // not copyable
+  MonClient& operator=(const MonClient &) = delete;
+  ~MonClient() override;
+
+  int init();
+  void shutdown();
+
+  void set_log_client(LogClient *clog) {  // stores the pointer without taking monc_lock
+    log_client = clog;
+  }
+  LogClient *get_log_client() {
+    return log_client;
+  }
+
+  int build_initial_monmap();
+  int get_monmap();
+  int get_monmap_and_config();
+  /**
+   * If you want to see MonMap messages, set this and
+   * the MonClient will tell the Messenger it hasn't
+   * dealt with it.
+   * Note that if you do this, *you* are of course responsible for
+   * putting the message reference!
+   */
+  void set_passthrough_monmap() {
+    std::lock_guard l(monc_lock);
+    passthrough_monmap = true;
+  }
+  void unset_passthrough_monmap() {
+    std::lock_guard l(monc_lock);
+    passthrough_monmap = false;
+  }
+  /**
+   * Ping monitor with ID @p mon_id and record the resulting
+   * reply in @p result_reply.
+   *
+   * @param[in]  mon_id Target monitor's ID
+   * @param[out] result_reply reply from mon.ID, if param != NULL
+   * @returns    0 in case of success; < 0 in case of error,
+   *             -ETIMEDOUT if monitor didn't reply before timeout
+   *             expired (default: conf->client_mount_timeout).
+   */
+  int ping_monitor(const std::string &mon_id, std::string *result_reply);
+
+  void send_mon_message(Message *m) {  // raw-pointer convenience overload
+    send_mon_message(MessageRef{m, false});  // NOTE(review): `false` presumably adopts the caller's reference; confirm against MessageRef
+  }
+  void send_mon_message(MessageRef m);
+
+  void reopen_session() {  // locked wrapper around _reopen_session() with its default rank
+    std::lock_guard l(monc_lock);
+    _reopen_session();
+  }
+
+  const uuid_d& get_fsid() const {  // returns a reference into monmap; monc_lock is not taken
+    return monmap.fsid;
+  }
+
+  entity_addrvec_t get_mon_addrs(unsigned i) const {  // addresses of mon rank i, or an empty vector if out of range
+    std::lock_guard l(monc_lock);
+    if (i < monmap.size())
+      return monmap.get_addrs(i);
+    return entity_addrvec_t();
+  }
+  int get_num_mon() const {  // monmap.size() read under monc_lock
+    std::lock_guard l(monc_lock);
+    return monmap.size();
+  }
+
+  uint64_t get_global_id() const {  // global_id read under monc_lock
+    std::lock_guard l(monc_lock);
+    return global_id;
+  }
+
+  void set_messenger(Messenger *m) { messenger = m; }
+  entity_addrvec_t get_myaddrs() const { return messenger->get_myaddrs(); }  // messenger must have been set
+  AuthAuthorizer* build_authorizer(int service_id) const;
+
+  void set_want_keys(uint32_t want) {  // stores the mask without taking monc_lock
+    want_keys = want;
+  }
+
+  // admin commands
+private:
+  uint64_t last_mon_command_tid;  // pre-incremented to produce each new command's tid
+
+  struct MonCommand {  // state of one in-flight monitor command
+    // for tell only
+    std::string target_name;  // non-empty when a specific monitor name is targeted
+    int target_rank = -1;  // >= 0 when a specific monitor rank is targeted
+    ConnectionRef target_con;
+    std::unique_ptr<MonConnection> target_session;
+    unsigned send_attempts = 0;  ///< attempt count for legacy mons
+    utime_t last_send_attempt;
+    uint64_t tid;  // key of this command in mon_commands
+    std::vector<std::string> cmd;
+    ceph::buffer::list inbl;
+    std::unique_ptr<CommandCompletion> onfinish;
+    std::optional<boost::asio::steady_timer> cancel_timer;  // engaged only when rados_mon_op_timeout > 0
+
+    MonCommand(MonClient& monc, uint64_t t, std::unique_ptr<CommandCompletion> onfinish)
+      : tid(t), onfinish(std::move(onfinish)) {
+      auto timeout =
+          monc.cct->_conf.get_val<std::chrono::seconds>("rados_mon_op_timeout");
+      if (timeout.count() > 0) {  // a non-positive timeout means no cancel timer
+	cancel_timer.emplace(monc.service, timeout);
+	cancel_timer->async_wait(  // the handler captures this and monc, so both must outlive the wait
+          [this, &monc](boost::system::error_code ec) {
+	    if (ec)  // any error (presumably a cancelled wait): do nothing
+	      return;
+	    std::scoped_lock l(monc.monc_lock);
+	    monc._cancel_mon_command(tid);
+	  });
+      }
+    }
+
+    bool is_tell() const {  // true when a specific monitor (by name or rank) is targeted
+      return target_name.size() || target_rank >= 0;
+    }
+  };
+  friend MonCommand;  // the timer handler above uses private MonClient members
+  std::map<uint64_t,MonCommand*> mon_commands;  // tid -> command; holds raw pointers created with new
+
+  void _send_command(MonCommand *r);
+  void _check_tell_commands();
+  void _resend_mon_commands();
+  int _cancel_mon_command(uint64_t tid);
+  void _finish_command(MonCommand *r, boost::system::error_code ret, std::string_view rs,
+		       bufferlist&& bl);
+  void _finish_auth();
+  void handle_mon_command_ack(MMonCommandAck *ack);
+  void handle_command_reply(MCommandReply *reply);
+
+public:
+  template<typename CompletionToken>  // token is completed with CommandSig (error, status string, output data)
+  auto start_mon_command(const std::vector<std::string>& cmd,  // submit cmd without naming a target monitor
+                         const ceph::buffer::list& inbl,
+			 CompletionToken&& token) {
+    ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+    boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+    {
+      std::scoped_lock l(monc_lock);
+      auto h = CommandCompletion::create(service.get_executor(),
+					 std::move(init.completion_handler));
+      if (!initialized || stopping) {  // not usable: complete with monc_errc::shutting_down
+	ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{},
+			  bufferlist{});
+      } else {
+	auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h));  // raw pointer stored in mon_commands below
+	r->cmd = cmd;
+	r->inbl = inbl;
+	mon_commands.emplace(r->tid, r);
+	_send_command(r);
+      }
+    }
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>  // token is completed with CommandSig (error, status string, output data)
+  auto start_mon_command(int mon_rank, const std::vector<std::string>& cmd,  // submit cmd to the monitor with rank mon_rank
+			 const ceph::buffer::list& inbl, CompletionToken&& token) {
+    ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+    boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+    {
+      std::scoped_lock l(monc_lock);
+      auto h = CommandCompletion::create(service.get_executor(),
+					 std::move(init.completion_handler));
+      if (!initialized || stopping) {  // not usable: complete with monc_errc::shutting_down
+	ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{},
+			  bufferlist{});
+      } else {
+	auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h));  // raw pointer stored in mon_commands below
+	r->target_rank = mon_rank;  // stored unchecked; a negative rank leaves is_tell() false
+	r->cmd = cmd;
+	r->inbl = inbl;
+	mon_commands.emplace(r->tid, r);
+	_send_command(r);
+      }
+    }
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto start_mon_command(const std::string& mon_name,
+                         const std::vector<std::string>& cmd,
+			 const ceph::buffer::list& inbl,
+			 CompletionToken&& token) {
+    ldout(cct,10) << __func__ << " cmd=" << cmd << dendl;
+    boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+    {
+      std::scoped_lock l(monc_lock);
+      auto h = CommandCompletion::create(service.get_executor(),
+					 std::move(init.completion_handler));
+      if (!initialized || stopping) {
+	ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{},
+			  bufferlist{});
+      } else {
+	auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h));
+	// detect/tolerate mon *rank* passed as a string
+	std::string err;
+	int rank = strict_strtoll(mon_name.c_str(), 10, &err);
+	if (err.size() == 0 && rank >= 0) {
+	  ldout(cct,10) << __func__ << " interpreting name '" << mon_name
+			<< "' as rank " << rank << dendl;
+	  r->target_rank = rank;
+	} else {
+	  r->target_name = mon_name;
+	}
+	r->cmd = cmd;
+	r->inbl = inbl;
+	mon_commands.emplace(r->tid, r);
+	_send_command(r);
+      }
+    }
+    return init.result.get();
+  }
+
+  class ContextVerter {  // completion handler that forwards a CommandSig result to out-pointers and a Context
+    std::string* outs;  // optional destination for the status string
+    ceph::bufferlist* outbl;  // optional destination for the output data
+    Context* onfinish;  // optional Context to complete
+
+  public:
+    ContextVerter(std::string* outs, ceph::bufferlist* outbl, Context* onfinish)
+      : outs(outs), outbl(outbl), onfinish(onfinish) {}
+    ~ContextVerter() = default;
+    ContextVerter(const ContextVerter&) = default;
+    ContextVerter& operator =(const ContextVerter&) = default;
+    ContextVerter(ContextVerter&&) = default;
+    ContextVerter& operator =(ContextVerter&&) = default;
+
+    void operator()(boost::system::error_code e,  // each destination is written only if its pointer is non-null
+		    std::string s,
+		    ceph::bufferlist bl) {
+      if (outs)
+	*outs = std::move(s);
+      if (outbl)
+	*outbl = std::move(bl);
+      if (onfinish)
+	onfinish->complete(ceph::from_error_code(e));  // outputs are filled in before the Context runs
+    }
+  };
+
+  void start_mon_command(const vector<string>& cmd, const bufferlist& inbl,  // Context-based form: no target monitor
+			 bufferlist *outbl, string *outs,
+			 Context *onfinish) {
+    start_mon_command(cmd, inbl, ContextVerter(outs, outbl, onfinish));  // note the (outs, outbl) argument order
+  }
+  void start_mon_command(int mon_rank,  // Context-based form: target by rank
+			 const vector<string>& cmd, const bufferlist& inbl,
+			 bufferlist *outbl, string *outs,
+			 Context *onfinish) {
+    start_mon_command(mon_rank, cmd, inbl, ContextVerter(outs, outbl, onfinish));
+  }
+  void start_mon_command(const string &mon_name,  ///< mon name, with mon. prefix
+			 const vector<string>& cmd, const bufferlist& inbl,
+			 bufferlist *outbl, string *outs,
+			 Context *onfinish) {
+    start_mon_command(mon_name, cmd, inbl, ContextVerter(outs, outbl, onfinish));
+  }
+
+
+  // version requests
+public:
+  /**
+   * get latest known version(s) of cluster map
+   *
+   * @param map string name of map (e.g., 'osdmap')
+   * @param token context that will be triggered on completion
+   * @return (via Completion) {} on success,
+   *         boost::system::errc::resource_unavailable_try_again if we need to
+   *         resubmit our request
+   */
+  template<typename CompletionToken>
+  auto get_version(std::string&& map, CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, VersionSig> init(token);
+    {
+      std::scoped_lock l(monc_lock);
+      auto m = ceph::make_message<MMonGetVersion>();
+      m->what = std::move(map);
+      m->handle = ++version_req_id;  // the handle is also the key of the pending completion
+      version_requests.emplace(m->handle,
+			       VersionCompletion::create(
+				 service.get_executor(),
+				 std::move(init.completion_handler)));
+      _send_mon_message(m);  // sent while monc_lock is still held
+    }
+    return init.result.get();
+  }
+
+  /**
+   * Run a callback within our lock, with a reference
+   * to the MonMap
+   */
+  template<typename Callback, typename...Args>
+  auto with_monmap(Callback&& cb, Args&&...args) const ->
+    decltype(cb(monmap, std::forward<Args>(args)...)) {
+    std::lock_guard l(monc_lock);  // held for the whole callback
+    return std::forward<Callback>(cb)(monmap, std::forward<Args>(args)...);
+  }
+
+  void register_config_callback(md_config_t::config_callback fn);  // asserts that none is registered yet
+  void register_config_notify_callback(std::function<void(void)> f) {  // overwrites any earlier callback; no lock taken
+    config_notify_cb = f;
+  }
+  md_config_t::config_callback get_config_callback();
+
+private:
+
+  std::map<ceph_tid_t, std::unique_ptr<VersionCompletion>> version_requests;  // pending get_version() calls by handle
+  ceph_tid_t version_req_id;  // pre-incremented to produce each request handle
+  void handle_get_version_reply(MMonGetVersionReply* m);
+  md_config_t::config_callback config_cb;  // set by register_config_callback()
+  std::function<void(void)> config_notify_cb;  // set by register_config_notify_callback()
+};
+
+#endif
diff --git a/src/mon/MonCommand.h b/src/mon/MonCommand.h
new file mode 100644
index 000000000..cb60d3d17
--- /dev/null
+++ b/src/mon/MonCommand.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#pragma once
+
+#include <string>
+#include "include/encoding.h"
+
+/**
+ * Description of a single command: its signature string, help text, owning
+ * module, required permissions and a set of FLAG_* bits.
+ *
+ * The non-static data members are, in declaration order: cmdstring,
+ * helpstring, module, req_perms, flags.
+ */
+struct MonCommand {
+  std::string cmdstring;
+  std::string helpstring;
+  std::string module;
+  std::string req_perms;
+  // Bitwise OR of the FLAG_* constants below.  Defaults to FLAG_NONE so that
+  // a default-constructed MonCommand never carries an indeterminate value
+  // into has_flag().
+  uint64_t flags = 0;
+
+  // MonCommand flags.  Declared constexpr (rather than plain const) so that,
+  // under C++17 inline-variable rules, they can be odr-used without a
+  // separate out-of-class definition.
+  static constexpr uint64_t FLAG_NONE       = 0;
+  static constexpr uint64_t FLAG_NOFORWARD  = 1 << 0;
+  static constexpr uint64_t FLAG_OBSOLETE   = 1 << 1;
+  static constexpr uint64_t FLAG_DEPRECATED = 1 << 2;
+  static constexpr uint64_t FLAG_MGR        = 1 << 3;
+  static constexpr uint64_t FLAG_POLL       = 1 << 4;
+  static constexpr uint64_t FLAG_HIDDEN     = 1 << 5;
+  // asok and tell commands are not forwarded, and they should not be listed
+  // in --help output.
+  static constexpr uint64_t FLAG_TELL       = (FLAG_NOFORWARD | FLAG_HIDDEN);
+
+  // encode_array()/encode_vector() store the element count as a u16; this is
+  // the largest count that fits.
+  static constexpr unsigned MAX_ENCODED_COUNT = 0xffff;
+
+  /**
+   * @return true when every bit of @p flag is set in flags.  Consequently
+   *         has_flag(FLAG_NONE) is always true and has_flag(FLAG_TELL)
+   *         requires both NOFORWARD and HIDDEN.
+   */
+  bool has_flag(uint64_t flag) const { return (flags & flag) == flag; }
+  /// Set every bit of @p flag in flags.
+  void set_flag(uint64_t flag) { flags |= flag; }
+  /// Clear every bit of @p flag in flags.
+  void unset_flag(uint64_t flag) { flags &= ~flag; }
+
+  /**
+   * Versioned encoding (version 1, compat 1): the bare fields followed by
+   * flags.
+   */
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode_bare(bl);
+    encode(flags, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  /// Inverse of encode().
+  void decode(ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode_bare(bl);
+    decode(flags, bl);
+    DECODE_FINISH(bl);
+  }
+
+  /**
+   * Unversioned encoding of the string fields, used by encode() and by the
+   * array/vector encoders.  flags is NOT part of the bare encoding.
+   */
+  void encode_bare(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    encode(cmdstring, bl);
+    encode(helpstring, bl);
+    encode(module, bl);
+    encode(req_perms, bl);
+    std::string availability = "cli,rest";  // Removed field, for backward compat
+    encode(availability, bl);
+  }
+  /// Inverse of encode_bare(); the legacy availability string is read and
+  /// discarded.
+  void decode_bare(ceph::buffer::list::const_iterator &bl) {
+    using ceph::decode;
+    decode(cmdstring, bl);
+    decode(helpstring, bl);
+    decode(module, bl);
+    decode(req_perms, bl);
+    std::string availability;  // Removed field, for backward compat
+    decode(availability, bl);
+  }
+
+  /**
+   * @param o command to compare against; must not be null
+   * @return true when cmdstring, module and req_perms all match.  The help
+   *         text and flags are not compared.
+   */
+  bool is_compat(const MonCommand* o) const {
+    return cmdstring == o->cmdstring &&
+      module == o->module && req_perms == o->req_perms;
+  }
+
+  bool is_tell() const {
+    return has_flag(MonCommand::FLAG_TELL);
+  }
+
+  bool is_noforward() const {
+    return has_flag(MonCommand::FLAG_NOFORWARD);
+  }
+
+  bool is_obsolete() const {
+    return has_flag(MonCommand::FLAG_OBSOLETE);
+  }
+
+  bool is_deprecated() const {
+    return has_flag(MonCommand::FLAG_DEPRECATED);
+  }
+
+  bool is_mgr() const {
+    return has_flag(MonCommand::FLAG_MGR);
+  }
+
+  bool is_hidden() const {
+    return has_flag(MonCommand::FLAG_HIDDEN);
+  }
+
+  /**
+   * Encode @p size commands from @p cmds: a u16 count, then every command's
+   * bare encoding, then every command's flags (struct version 2).
+   *
+   * The count is clamped to [0, MAX_ENCODED_COUNT] and exactly that many
+   * entries are written, so the stored count always matches the payload
+   * (a plain int -> u16 conversion would wrap while the loops kept using
+   * the unwrapped size).
+   */
+  static void encode_array(const MonCommand *cmds, int size, ceph::buffer::list &bl) {
+    ENCODE_START(2, 1, bl);
+    unsigned count = size > 0 ? static_cast<unsigned>(size) : 0;
+    if (count > MAX_ENCODED_COUNT) {
+      count = MAX_ENCODED_COUNT;
+    }
+    uint16_t s = static_cast<uint16_t>(count);
+    encode(s, bl);
+    for (unsigned i = 0; i < count; ++i) {
+      cmds[i].encode_bare(bl);
+    }
+    for (unsigned i = 0; i < count; ++i) {
+      encode(cmds[i].flags, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  /**
+   * Inverse of encode_array().
+   *
+   * @param cmds receives a new[]-allocated array; the caller owns it and
+   *             must delete[] it.  *cmds is assigned before the entries are
+   *             decoded, so it already points at the array if a later
+   *             decode step throws.
+   * @param size receives the number of entries
+   *
+   * Encodings older than struct version 2 carry no flags; those entries get
+   * flags == 0.
+   */
+  static void decode_array(MonCommand **cmds, int *size,
+                           ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(2, bl);
+    uint16_t s = 0;
+    decode(s, bl);
+    *size = s;
+    *cmds = new MonCommand[*size];
+    for (int i = 0; i < *size; ++i) {
+      (*cmds)[i].decode_bare(bl);
+    }
+    if (struct_v >= 2) {
+      for (int i = 0; i < *size; i++)
+        decode((*cmds)[i].flags, bl);
+    } else {
+      for (int i = 0; i < *size; i++)
+        (*cmds)[i].flags = 0;
+    }
+    DECODE_FINISH(bl);
+  }
+
+  // this uses a u16 for the count, so we need a special encoder/decoder.
+  /**
+   * Vector counterpart of encode_array(), with the same wire layout.  At
+   * most MAX_ENCODED_COUNT commands are written; the count is clamped
+   * rather than wrapped.
+   */
+  static void encode_vector(const std::vector<MonCommand>& cmds,
+                            ceph::buffer::list &bl) {
+    ENCODE_START(2, 1, bl);
+    uint16_t s = static_cast<uint16_t>(
+      cmds.size() > MAX_ENCODED_COUNT ? MAX_ENCODED_COUNT : cmds.size());
+    encode(s, bl);
+    for (unsigned i = 0; i < s; ++i) {
+      cmds[i].encode_bare(bl);
+    }
+    for (unsigned i = 0; i < s; i++) {
+      encode(cmds[i].flags, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  /**
+   * Inverse of encode_vector(); @p cmds is resized to the decoded count.
+   * Encodings older than struct version 2 yield flags == 0.
+   */
+  static void decode_vector(std::vector<MonCommand> &cmds,
+                            ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(2, bl);
+    uint16_t s = 0;
+    decode(s, bl);
+    cmds.resize(s);
+    for (unsigned i = 0; i < s; ++i) {
+      cmds[i].decode_bare(bl);
+    }
+    if (struct_v >= 2) {
+      for (unsigned i = 0; i < s; i++)
+        decode(cmds[i].flags, bl);
+    } else {
+      for (unsigned i = 0; i < s; i++)
+        cmds[i].flags = 0;
+    }
+    DECODE_FINISH(bl);
+  }
+
+  /// @return true when the character @p p occurs anywhere in req_perms.
+  bool requires_perm(char p) const {
+    return (req_perms.find(p) != std::string::npos);
+  }
+};
+WRITE_CLASS_ENCODER(MonCommand)
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
new file mode 100644
index 000000000..f5ca47eb4
--- /dev/null
+++ b/src/mon/MonCommands.h
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/* no guard; may be included multiple times */
+
+/*
+ * Define commands that are reported by the monitor's
+ * "get_command_descriptions" command, and parsed by the Python
+ * frontend 'ceph' (and perhaps by other frontends, such as a RESTful
+ * server). The format is:
+ *
+ * COMMAND(signature, helpstring, modulename, req perms)
+ * where:
+ * signature:  describes the command and its parameters (more below)
+ * helpstring: displays in CLI help, API help (nice if it refers to
+ *             parameter names from signature, 40-a few hundred chars)
+ * modulename: the monitor module or daemon this applies to:
+ *             mds, osd, pg (osd), mon, auth, log, config-key, mgr
+ * req perms:  required permission in that modulename space to execute command
+ *             this also controls what type of REST command is accepted
+ *
+ * The commands describe themselves completely enough for the separate
+ * frontend(s) to be able to accept user input and validate it against
+ * the command descriptions, and generate a JSON object that contains
+ * key:value mappings of parameter names to validated parameter values.
+ *
+ * 'signature' is a space-separated list of individual command descriptors;
+ * each descriptor is either a literal string, which can contain no spaces or
+ * '=' signs (for instance, in "pg stat", both "pg" and "stat" are literal
+ * strings representing one descriptor each), or a list of key=val[,key=val...]
+ * which also includes no spaces.
+ *
+ * The key=val form describes a non-literal parameter.  Each will have at
+ * least a name= and type=, and each type can have its own type-specific
+ * parameters.  The parser is the arbiter of these types and their
+ * interpretation.  A few more non-type-specific key=val pairs exist:
+ *
+ *    req=false marks an optional parameter (default for req is 'true')
+ *    n=<n> is a repeat count for how many of this argument must be supplied.
+ *          n=1 is the default.
+ *          n=N is a special case that means "1 or more".
+ *
+ * A perhaps-incomplete list of types:
+ *
+ * CephInt: Optional: range=min[|max]
+ * CephFloat: Optional range
+ * CephString: optional badchars
+ * CephSocketpath: validation involves "is it S_ISSOCK"
+ * CephIPAddr: v4 or v6 addr with optional port, syntax validated
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
+ * CephPoolname: Plainold string
+ * CephObjectname: Another plainold string
+ * CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
+ * CephName: daemon name, '*' or '<type>.<id>' (id must be int for type osd)
+ * CephOsdName: osd name, '*' or '<id>' or 'osd.<id>' (id must be int)
+ * CephChoices: strings="foo|bar" means this param can be either
+ * CephFilepath: openable file
+ * CephFragment: cephfs 'fragID': val/bits, val in hex 0xnnn, bits in dec
+ * CephUUID: uuid in text matching Python uuid.UUID()
+ * CephPrefix: special type assigned to literals
+ *
+ * Example:
+ *
+ * COMMAND("auth add "
+ *   	   "name=entity,type=CephString "
+ *   	   "name=caps,type=CephString,n=N,req=false",
+ *   	   "add auth info for <name> from input file, or random key "
+ *   	   "if no input given, and/or any caps specified in the command",
+ *   	   "auth", "rwx")
+ *
+ * defines a command "auth add" that takes a required argument "entity"
+ * of type "CephString", and an optional list of one or more arguments
+ * named "caps" of type CephString (n=N allows any number of values;
+ * req=false allows the list to be omitted).  The front end will
+ * validate user input against this description.  Let's say the user
+ * enters auth add client.admin 'mon rwx' 'osd *'.  The result will be a
+ * JSON object like {"prefix":"auth add", "entity":"client.admin",
+ * "caps":["mon rwx", "osd *"]}.
+ * Note that
+ * 	- string literals are accumulated into 'prefix'
+ * 	- n=1 descriptors are given normal string or int object values
+ * 	- n=N descriptors are given array values
+ *
+ * NOTE: be careful with spaces.  Each descriptor must be separated by
+ * one space, no other characters, so if you split lines as above, be
+ * sure to close and reopen the quotes, and be careful to include the
+ * separating spaces in the quoted string.
+ *
+ * The monitor marshals this JSON into a std::map<string, cmd_vartype>
+ * where cmd_vartype is a boost::variant type-enforcing discriminated
+ * type, so the monitor is expected to know the type of each argument.
+ * See cmdparse.cc/h for more details.
+ *
+ * The flag parameter for the COMMAND_WITH_FLAG macro must be passed using
+ * FLAG(f), where 'f' may be one of the following:
+ *
+ *  NONE      - no flag assigned
+ *  NOFORWARD - command may not be forwarded
+ *  OBSOLETE  - command is considered obsolete
+ *  DEPRECATED - command is considered deprecated
+ *  MGR       - command goes to ceph-mgr (for luminous+)
+ *  POLL      - command is intended to be called periodically by the
+ *              client (see iostat)
+ *  HIDDEN    - command is hidden (not reported by help etc)
+ *  TELL      - tell/asok command. it's an alias of (NOFORWARD | HIDDEN)
+ *
+ * A command should always be first considered DEPRECATED before being
+ * considered OBSOLETE, giving due consideration to users and conforming
+ * to any guidelines regarding deprecating commands.
+ */
+
+// Entries declared ahead of the per-module sections below.  The trailing
+// backslashes on some lines only splice the source lines together; other
+// entries in this file span lines without them.
+COMMAND("pg map name=pgid,type=CephPgid", "show mapping of pg to osds", \
+	"pg", "r")
+// Note: "pg repeer" is registered under module "osd" with "rw", unlike
+// "pg map" above, which uses module "pg".
+COMMAND("pg repeer name=pgid,type=CephPgid", "force a PG to repeer",
+	"osd", "rw")
+COMMAND("osd last-stat-seq name=id,type=CephOsdName", \
+	"get the last pg stats sequence number reported for this osd", \
+	"osd", "r")
+
+/*
+ * auth commands AuthMonitor.cc
+ */
+
+// Read-side auth commands; all require "rx" in the "auth" module.
+COMMAND("auth export name=entity,type=CephString,req=false", \
+       	"write keyring for requested entity, or master keyring if none given", \
+	"auth", "rx")
+COMMAND("auth get name=entity,type=CephString", \
+	"write keyring file with requested key", "auth", "rx")
+// "get-key", "print-key" and "print_key" are three spellings registered
+// with the same signature shape, help text and permissions.
+COMMAND("auth get-key name=entity,type=CephString", "display requested key", \
+	"auth", "rx")
+COMMAND("auth print-key name=entity,type=CephString", "display requested key", \
+	"auth", "rx")
+COMMAND("auth print_key name=entity,type=CephString", "display requested key", \
+	"auth", "rx")
+// "auth list" is flagged DEPRECATED; "auth ls" below carries the same help
+// text and permissions.
+COMMAND_WITH_FLAG("auth list", "list authentication state", "auth", "rx",
+		  FLAG(DEPRECATED))
+COMMAND("auth ls", "list authentication state", "auth", "rx")
+// Write-side auth commands; all require "rwx" in the "auth" module.
+COMMAND("auth import", "auth import: read keyring file from -i <file>",
+	"auth", "rwx")
+COMMAND("auth add "
+	"name=entity,type=CephString "
+	"name=caps,type=CephString,n=N,req=false",
+	"add auth info for <entity> from input file, or random key if no "
+        "input is given, and/or any caps specified in the command",
+	"auth", "rwx")
+// NOTE(review): the help text below says <name> although the parameter is
+// called "entity".
+COMMAND("auth get-or-create-key "
+	"name=entity,type=CephString "
+	"name=caps,type=CephString,n=N,req=false",
+	"get, or add, key for <name> from system/caps pairs specified in the command.  If key already exists, any given caps must match the existing caps for that key.",
+	"auth", "rwx")
+COMMAND("auth get-or-create "
+	"name=entity,type=CephString "
+	"name=caps,type=CephString,n=N,req=false",
+	"add auth info for <entity> from input file, or random key if no input given, and/or any caps specified in the command",
+	"auth", "rwx")
+// Registered under module "auth" even though the command prefix is "fs".
+// Here "caps" has no req=false, so at least one value must be supplied.
+COMMAND("fs authorize "
+   "name=filesystem,type=CephString "
+   "name=entity,type=CephString "
+	"name=caps,type=CephString,n=N",
+	"add auth for <entity> to access file system <filesystem> based on following directory and permissions pairs",
+	"auth", "rwx")
+// NOTE(review): help text says <name>; the parameter is "entity".
+COMMAND("auth caps "
+	"name=entity,type=CephString "
+	"name=caps,type=CephString,n=N",
+	"update caps for <name> from caps specified in the command",
+	"auth", "rwx")
+// "auth del" is flagged DEPRECATED; "auth rm" below takes the same
+// parameter.
+COMMAND_WITH_FLAG("auth del "
+	"name=entity,type=CephString",
+	"delete all caps for <name>",
+	"auth", "rwx",
+    FLAG(DEPRECATED))
+COMMAND("auth rm "
+	"name=entity,type=CephString",
+	"remove all caps for <name>",
+	"auth", "rwx")
+
+/*
+ * Monitor commands (Monitor.cc)
+ */
+// TELL is documented above as an alias of (NOFORWARD | HIDDEN).
+COMMAND_WITH_FLAG("compact", "cause compaction of monitor's leveldb/rocksdb storage",
+	     "mon", "rw",
+             FLAG(TELL))
+// Bare "scrub" is flagged OBSOLETE; a "mon scrub" entry with the same help
+// text is registered further down in this file.
+COMMAND_WITH_FLAG("scrub", "scrub the monitor stores",
+             "mon", "rw",
+             FLAG(OBSOLETE))
+COMMAND("fsid", "show cluster FSID/UUID", "mon", "r")
+COMMAND("log name=logtext,type=CephString,n=N",
+	"log supplied text to the monitor log", "mon", "rw")
+// All three parameters are optional; "num" uses range=1, i.e. a minimum of
+// 1 with no maximum (range=min[|max]).
+COMMAND("log last "
+        "name=num,type=CephInt,range=1,req=false "
+        "name=level,type=CephChoices,strings=debug|info|sec|warn|error,req=false "
+        "name=channel,type=CephChoices,strings=*|cluster|audit|cephadm,req=false",
+	"print last few lines of the cluster log",
+	"mon", "r")
+
+COMMAND("status", "show cluster status", "mon", "r")
+COMMAND("health name=detail,type=CephChoices,strings=detail,req=false",
+	"show cluster health", "mon", "r")
+// The two health mute commands require only "w" (not "rw").
+COMMAND("health mute "\
+	"name=code,type=CephString "
+	"name=ttl,type=CephString,req=false "
+	"name=sticky,type=CephBool,req=false",
+	"mute health alert", "mon", "w")
+COMMAND("health unmute "\
+	"name=code,type=CephString,req=false",
+	"unmute existing health alert mute(s)", "mon", "w")
+COMMAND("time-sync-status", "show time sync status", "mon", "r")
+COMMAND("df name=detail,type=CephChoices,strings=detail,req=false",
+	"show cluster free space stats", "mon", "r")
+COMMAND("report name=tags,type=CephString,n=N,req=false",
+	"report full status of cluster, optional title tag strings",
+	"mon", "r")
+COMMAND("features", "report of connected features",
+        "mon", "r")
+COMMAND("quorum_status", "report status of monitor quorum",
+	"mon", "r")
+// Read-only checks about changing the set of monitors.
+COMMAND("mon ok-to-stop "
+	"name=ids,type=CephString,n=N",
+	"check whether mon(s) can be safely stopped without reducing immediate "
+	"availability",
+	"mon", "r")
+COMMAND("mon ok-to-add-offline",
+	"check whether adding a mon and not starting it would break quorum",
+	"mon", "r")
+COMMAND("mon ok-to-rm "
+	"name=id,type=CephString",
+	"check whether removing the specified mon would break quorum",
+	"mon", "r")
+
+COMMAND("tell "
+	"name=target,type=CephName "
+	"name=args,type=CephString,n=N",
+	"send a command to a specific daemon", "mon", "rw")
+COMMAND_WITH_FLAG("version", "show mon daemon version", "mon", "r",
+                  FLAG(TELL))
+
+COMMAND("node ls "
+	"name=type,type=CephChoices,strings=all|osd|mon|mds|mgr,req=false",
+	"list all nodes in cluster [type]", "mon", "r")
+/*
+ * Monitor-specific commands under module 'mon'
+ */
+// NOTE(review): registered through COMMAND_WITH_FLAG with FLAG(NONE);
+// presumably equivalent to a plain COMMAND -- confirm against the macro
+// definitions in the including file.
+COMMAND_WITH_FLAG("mon scrub",
+    "scrub the monitor stores",
+    "mon", "rw",
+    FLAG(NONE))
+COMMAND("mon metadata name=id,type=CephString,req=false",
+	"fetch metadata for mon <id>",
+	"mon", "r")
+COMMAND("mon count-metadata name=property,type=CephString",
+	"count mons by metadata field property",
+	"mon", "r")
+COMMAND("mon versions",
+	"check running versions of monitors",
+	"mon", "r")
+// Unprefixed "versions" covers all ceph daemons per its help text; it is
+// still registered under module "mon".
+COMMAND("versions",
+	"check running versions of ceph daemons",
+	"mon", "r")
+
+
+
+/*
+ * MDS commands (MDSMonitor.cc)
+ */
+
+// Many "mds ..." entries in this section are flagged OBSOLETE, HIDDEN or
+// DEPRECATED; several have "fs ..." counterparts in this file.
+COMMAND_WITH_FLAG("mds stat", "show MDS status", "mds", "r", FLAG(HIDDEN))
+COMMAND_WITH_FLAG("mds dump "
+	"name=epoch,type=CephInt,req=false,range=0",
+	"dump legacy MDS cluster info, optionally from epoch",
+        "mds", "r", FLAG(OBSOLETE))
+COMMAND("fs dump "
+	"name=epoch,type=CephInt,req=false,range=0",
+	"dump all CephFS status, optionally from epoch", "mds", "r")
+COMMAND_WITH_FLAG("mds getmap "
+	"name=epoch,type=CephInt,req=false,range=0",
+	"get MDS map, optionally from epoch", "mds", "r", FLAG(OBSOLETE))
+// NOTE(review): help text says <role> although the parameter is "who".
+COMMAND("mds metadata name=who,type=CephString,req=false",
+	"fetch metadata for mds <role>",
+	"mds", "r")
+COMMAND("mds count-metadata name=property,type=CephString",
+	"count MDSs by metadata field property",
+	"mds", "r")
+COMMAND("mds versions",
+	"check running versions of MDSs",
+	"mds", "r")
+COMMAND_WITH_FLAG("mds tell "
+	"name=who,type=CephString "
+	"name=args,type=CephString,n=N",
+	"send command to particular mds", "mds", "rw", FLAG(OBSOLETE))
+COMMAND_WITH_FLAG("mds stop name=role,type=CephString", "stop mds",
+	"mds", "rw", FLAG(OBSOLETE))
+COMMAND_WITH_FLAG("mds deactivate name=role,type=CephString",
+        "clean up specified MDS rank (use with `set max_mds` to shrink cluster)",
+	"mds", "rw", FLAG(OBSOLETE))
+COMMAND("mds ok-to-stop name=ids,type=CephString,n=N",
+	"check whether stopping the specified MDS would reduce immediate availability",
+	"mds", "r")
+COMMAND_WITH_FLAG("mds set_max_mds "
+	"name=maxmds,type=CephInt,range=0",
+	"set max MDS index", "mds", "rw", FLAG(OBSOLETE))
+// The strings= choice list is split across two adjacent literals; the
+// first ends in '|' so the concatenation stays a single descriptor.
+COMMAND_WITH_FLAG("mds set "
+	"name=var,type=CephChoices,strings=max_mds|max_file_size|inline_data|"
+	"allow_new_snaps|allow_multimds|allow_multimds_snaps|allow_dirfrags "
+	"name=val,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"set mds parameter <var> to <val>", "mds", "rw", FLAG(OBSOLETE))
+// Here the separating space is at the start of the second literal rather
+// than at the end of the first.
+COMMAND_WITH_FLAG("mds freeze name=role_or_gid,type=CephString"
+	" name=val,type=CephString",
+	"freeze MDS yes/no", "mds", "rw", FLAG(HIDDEN))
+// arbitrary limit 0-20 below; worth standing on head to make it
+// relate to actual state definitions?
+// #include "include/ceph_fs.h"
+COMMAND_WITH_FLAG("mds set_state "
+	"name=gid,type=CephInt,range=0 "
+	"name=state,type=CephInt,range=0|20",
+	"set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
+COMMAND("mds fail name=role_or_gid,type=CephString",
+	"Mark MDS failed: trigger a failover if a standby is available",
+        "mds", "rw")
+COMMAND("mds repaired name=role,type=CephString",
+	"mark a damaged MDS rank as no longer damaged", "mds", "rw")
+COMMAND("mds rm "
+	"name=gid,type=CephInt,range=0",
+	"remove nonactive mds", "mds", "rw")
+COMMAND_WITH_FLAG("mds rmfailed name=role,type=CephString "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+	"remove failed rank", "mds", "rw", FLAG(HIDDEN))
+COMMAND_WITH_FLAG("mds cluster_down", "take MDS cluster down", "mds", "rw", FLAG(OBSOLETE))
+COMMAND_WITH_FLAG("mds cluster_up", "bring MDS cluster up", "mds", "rw", FLAG(OBSOLETE))
+COMMAND_WITH_FLAG("mds compat show", "show mds compatibility settings",
+	"mds", "r", FLAG(DEPRECATED))
+// NOTE(review): the signature literal ends with a trailing space after the
+// last descriptor; the header comment asks for exactly one space between
+// descriptors -- confirm the parser ignores a trailing one.
+COMMAND("fs compat show "
+        "name=fs_name,type=CephString ",
+        "show fs compatibility settings",
+	"mds", "r")
+COMMAND_WITH_FLAG("mds compat rm_compat "
+	"name=feature,type=CephInt,range=0",
+	"remove compatible feature", "mds", "rw", FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("mds compat rm_incompat "
+	"name=feature,type=CephInt,range=0",
+	"remove incompatible feature", "mds", "rw", FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("mds add_data_pool "
+	"name=pool,type=CephString",
+	"add data pool <pool>", "mds", "rw", FLAG(OBSOLETE))
+// "rm_data_pool" and "remove_data_pool" are two spellings with the same
+// help text; both are OBSOLETE.
+COMMAND_WITH_FLAG("mds rm_data_pool "
+	"name=pool,type=CephString",
+	"remove data pool <pool>", "mds", "rw", FLAG(OBSOLETE))
+COMMAND_WITH_FLAG("mds remove_data_pool "
+	"name=pool,type=CephString",
+	"remove data pool <pool>", "mds", "rw", FLAG(OBSOLETE))
+// Takes the pools as CephInt parameters; the "fs new" entry that follows
+// in this file takes pools as CephString.
+COMMAND_WITH_FLAG("mds newfs "
+	"name=metadata,type=CephInt,range=0 "
+	"name=data,type=CephInt,range=0 "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"make new filesystem using pools <metadata> and <data>",
+	"mds", "rw", FLAG(OBSOLETE))
+// "fs ..." commands.  Note that the module column is not uniform in this
+// group: some entries use "fs", others "mds".
+COMMAND("fs new "
+	"name=fs_name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=metadata,type=CephString "
+	"name=data,type=CephString "
+	"name=force,type=CephBool,req=false "
+	"name=allow_dangerous_metadata_overlay,type=CephBool,req=false "
+	"name=fscid,type=CephInt,range=0,req=false "
+	"name=recover,type=CephBool,req=false",
+	"make new filesystem using named pools <metadata> and <data>",
+	"fs", "rw")
+// NOTE(review): signature literal ends with a trailing space (also true of
+// several other entries in this group) -- confirm the parser tolerates it.
+COMMAND("fs fail "
+	"name=fs_name,type=CephString ",
+	"bring the file system down and all of its ranks",
+	"fs", "rw")
+COMMAND("fs rm "
+	"name=fs_name,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"disable the named filesystem",
+	"fs", "rw")
+COMMAND("fs reset "
+	"name=fs_name,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"disaster recovery only: reset to a single-MDS map",
+	"fs", "rw")
+COMMAND("fs ls ",
+	"list filesystems",
+	"fs", "r")
+COMMAND("fs get name=fs_name,type=CephString",
+	"get info about one filesystem",
+	"fs", "r")
+// The strings= choice list is continued across adjacent literals, each
+// continuation starting with '|'.  Registered under module "mds".
+COMMAND("fs set "
+	"name=fs_name,type=CephString "
+	"name=var,type=CephChoices,strings=max_mds|max_file_size"
+        "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer"
+        "|standby_count_wanted|session_timeout|session_autoclose"
+        "|allow_standby_replay|down|joinable|min_compat_client "
+	"name=val,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false "
+	"name=yes_i_really_really_mean_it,type=CephBool,req=false",
+	"set fs parameter <var> to <val>", "mds", "rw")
+COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
+        "name=val,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"Set a global CephFS flag",
+	"fs", "rw")
+
+COMMAND("fs feature ls",
+        "list available cephfs features to be set/unset",
+	"mds", "r")
+
+// "feature" carries no range= here, unlike the "mds compat rm_*" entries
+// in this file, which use range=0.
+COMMAND("fs compat "
+        "name=fs_name,type=CephString "
+        "name=subop,type=CephChoices,strings=rm_compat|rm_incompat|add_compat|add_incompat "
+        "name=feature,type=CephInt "
+        "name=feature_str,type=CephString,req=false ",
+        "manipulate compat settings", "fs", "rw")
+
+COMMAND("fs required_client_features "
+        "name=fs_name,type=CephString "
+        "name=subop,type=CephChoices,strings=add|rm "
+        "name=val,type=CephString ",
+        "add/remove required features of clients", "mds", "rw")
+
+COMMAND("fs add_data_pool name=fs_name,type=CephString "
+	"name=pool,type=CephString",
+	"add data pool <pool>", "mds", "rw")
+COMMAND("fs rm_data_pool name=fs_name,type=CephString "
+	"name=pool,type=CephString",
+	"remove data pool <pool>", "mds", "rw")
+// "set_default" is flagged DEPRECATED; "set-default" below has the same
+// parameter and help text.
+COMMAND_WITH_FLAG("fs set_default name=fs_name,type=CephString",
+		  "set the default to the named filesystem",
+		  "fs", "rw",
+		  FLAG(DEPRECATED))
+COMMAND("fs set-default name=fs_name,type=CephString",
+	"set the default to the named filesystem",
+	"fs", "rw")
+// Mirroring commands; all registered under module "mds".
+COMMAND("fs mirror enable "
+	"name=fs_name,type=CephString ",
+	"enable mirroring for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror disable "
+	"name=fs_name,type=CephString ",
+	"disable mirroring for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror peer_add "
+	"name=fs_name,type=CephString "
+	"name=uuid,type=CephString "
+	"name=remote_cluster_spec,type=CephString "
+	"name=remote_fs_name,type=CephString",
+	"add a mirror peer for a ceph filesystem", "mds", "rw")
+COMMAND("fs mirror peer_remove "
+	"name=fs_name,type=CephString "
+	"name=uuid,type=CephString ",
+	"remove a mirror peer for a ceph filesystem", "mds", "rw")
+
+/*
+ * Monmap commands
+ */
+// Read-only monmap queries.
+COMMAND("mon dump "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"dump formatted monmap (optionally from epoch)",
+	"mon", "r")
+COMMAND("mon stat", "summarize monitor status", "mon", "r")
+COMMAND("mon getmap "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"get monmap", "mon", "r")
+// Monmap modifications; all require "rw".
+COMMAND("mon add "
+	"name=name,type=CephString "
+	"name=addr,type=CephIPAddr "
+	"name=location,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false",
+	"add new monitor named <name> at <addr>, possibly with CRUSH location <location>", "mon", "rw")
+// "mon rm" and the DEPRECATED "mon remove" take the same parameter and
+// share their help text.
+COMMAND("mon rm "
+	"name=name,type=CephString",
+	"remove monitor named <name>", "mon", "rw")
+COMMAND_WITH_FLAG("mon remove "
+	"name=name,type=CephString",
+	"remove monitor named <name>", "mon", "rw",
+    FLAG(DEPRECATED))
+COMMAND("mon feature ls "
+        "name=with_value,type=CephChoices,strings=--with-value,req=false",
+        "list available mon map features to be set/unset",
+        "mon", "r")
+COMMAND("mon feature set "
+        "name=feature_name,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+        "set provided feature on mon map",
+        "mon", "rw")
+COMMAND("mon set-rank "
+	"name=name,type=CephString "
+	"name=rank,type=CephInt",
+	"set the rank for the specified mon",
+	"mon", "rw")
+COMMAND("mon set-addrs "
+	"name=name,type=CephString "
+	"name=addrs,type=CephString",
+	"set the addrs (IPs and ports) a specific monitor binds to",
+	"mon", "rw")
+COMMAND("mon set-weight "
+        "name=name,type=CephString "
+        "name=weight,type=CephInt,range=0|65535",
+        "set the weight for the specified mon",
+        "mon", "rw")
+COMMAND("mon enable-msgr2",
+	"enable the msgr2 protocol on port 3300",
+	"mon", "rw")
+// "strategy" is a free-form CephString; the accepted values are only named
+// in the help text, not enforced through CephChoices in the signature.
+COMMAND("mon set election_strategy " \
+	"name=strategy,type=CephString", \
+	"set the election strategy to use; choices classic, disallow, connectivity", \
+	"mon", "rw")
+COMMAND("mon add disallowed_leader " \
+	"name=name,type=CephString", \
+	"prevent the named mon from being a leader", \
+	"mon", "rw")
+COMMAND("mon rm disallowed_leader " \
+	"name=name,type=CephString", \
+	"allow the named mon to be a leader again", \
+	"mon", "rw")
+COMMAND("mon set_location " \
+	"name=name,type=CephString "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"specify location <args> for the monitor <name>, using CRUSH bucket names", \
+	"mon", "rw")
+// NOTE(review): unlike every other entry in this file, each descriptor
+// below ends with a ',' before the separating space, and the signature
+// ends with ", " -- confirm the parser accepts this form.  The help text
+// does not mention <new_crush_rule>.
+COMMAND("mon enable_stretch_mode " \
+	"name=tiebreaker_mon,type=CephString, "
+	"name=new_crush_rule,type=CephString, "
+	"name=dividing_bucket,type=CephString, ",
+	"enable stretch mode, changing the peering rules and "
+	"failure handling on all pools with <tiebreaker_mon> "
+	"as the tiebreaker and setting <dividing_bucket> locations "
+	"as the units for stretching across",
+	"mon", "rw")
+COMMAND("mon set_new_tiebreaker " \
+	"name=name,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"switch the stretch tiebreaker to be the named mon", \
+	"mon", "rw")
+
+/*
+ * OSD commands
+ */
+// Read-only OSD map queries; every entry in this group requires "r".
+// NOTE(review): "osd stat" and "osd dump" carry the same help text.
+COMMAND("osd stat", "print summary of OSD map", "osd", "r")
+COMMAND("osd dump "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"print summary of OSD map", "osd", "r")
+COMMAND("osd info "
+	"name=id,type=CephOsdName,req=false",
+	"print osd's {id} information (instead of all osds from map)",
+	"osd", "r")
+COMMAND("osd tree "
+	"name=epoch,type=CephInt,range=0,req=false "
+	"name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false",
+	"print OSD tree", "osd", "r")
+// Same as "osd tree" plus a required "bucket" parameter placed between
+// the two optional ones.
+COMMAND("osd tree-from "
+	"name=epoch,type=CephInt,range=0,req=false "
+	"name=bucket,type=CephString "
+	"name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false",
+	"print OSD tree in bucket", "osd", "r")
+COMMAND("osd ls "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"show all OSD ids", "osd", "r")
+COMMAND("osd getmap "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"get OSD map", "osd", "r")
+COMMAND("osd getcrushmap "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"get CRUSH map", "osd", "r")
+COMMAND("osd getmaxosd", "show largest OSD id", "osd", "r")
+// req=true is spelled out for "name" here; it is the documented default.
+COMMAND("osd ls-tree "
+        "name=epoch,type=CephInt,range=0,req=false "
+        "name=name,type=CephString,req=true",
+        "show OSD ids under bucket <name> in the CRUSH map",
+        "osd", "r")
+COMMAND("osd find "
+	"name=id,type=CephOsdName",
+	"find osd <id> in the CRUSH map and show its location",
+	"osd", "r")
+COMMAND("osd metadata "
+	"name=id,type=CephOsdName,req=false",
+	"fetch metadata for osd {id} (default all)",
+	"osd", "r")
+COMMAND("osd count-metadata name=property,type=CephString",
+	"count OSDs by metadata field property",
+	"osd", "r")
+COMMAND("osd versions",
+	"check running versions of OSDs",
+	"osd", "r")
+COMMAND("osd numa-status",
+	"show NUMA status of OSDs",
+	"osd", "r")
+COMMAND("osd map "
+	"name=pool,type=CephPoolname "
+	"name=object,type=CephObjectname "
+	"name=nspace,type=CephString,req=false",
+	"find pg for <object> in <pool> with [namespace]", "osd", "r")
+COMMAND_WITH_FLAG("osd lspools",
+		  "list pools", "osd", "r", FLAG(DEPRECATED))
+// "rule list" is flagged DEPRECATED; "rule ls" below has the same help
+// text.
+COMMAND_WITH_FLAG("osd crush rule list", "list crush rules", "osd", "r",
+		  FLAG(DEPRECATED))
+COMMAND("osd crush rule ls", "list crush rules", "osd", "r")
+COMMAND("osd crush rule ls-by-class "
+        "name=class,type=CephString,goodchars=[A-Za-z0-9-_.]",
+        "list all crush rules that reference the same <class>",
+        "osd", "r")
+COMMAND("osd crush rule dump "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.],req=false",
+	"dump crush rule <name> (default all)",
+	"osd", "r")
+COMMAND("osd crush dump",
+	"dump crush map",
+	"osd", "r")
+COMMAND("osd setcrushmap name=prior_version,type=CephInt,req=false",
+	"set crush map from input file",
+	"osd", "rw")
+COMMAND("osd crush set name=prior_version,type=CephInt,req=false",
+	"set crush map from input file",
+	"osd", "rw")
+COMMAND("osd crush add-bucket "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+        "name=type,type=CephString "
+        "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false",
+	"add no-parent (probably root) crush bucket <name> of type <type> "
+        "to location <args>",
+	"osd", "rw")
+COMMAND("osd crush rename-bucket "
+	"name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]",
+	"rename bucket <srcname> to <dstname>",
+	"osd", "rw")
+COMMAND("osd crush set "
+	"name=id,type=CephOsdName "
+	"name=weight,type=CephFloat,range=0.0 "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"update crushmap position and weight for <name> to <weight> with location <args>",
+	"osd", "rw")
+COMMAND("osd crush add "
+	"name=id,type=CephOsdName "
+	"name=weight,type=CephFloat,range=0.0 "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"add or update crushmap position and weight for <name> with <weight> and location <args>",
+	"osd", "rw")
+COMMAND("osd crush set-all-straw-buckets-to-straw2",
+        "convert all CRUSH current straw buckets to use the straw2 algorithm",
+	"osd", "rw")
+COMMAND("osd crush class create "
+        "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+        "create crush device class <class>",
+        "osd", "rw")
+COMMAND("osd crush class rm "
+        "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+        "remove crush device class <class>",
+        "osd", "rw")
+COMMAND("osd crush set-device-class "
+        "name=class,type=CephString "
+	"name=ids,type=CephString,n=N",
+	"set the <class> of the osd(s) <id> [<id>...], "  // trailing space: adjacent literals are joined verbatim
+        "or use <all|any> to set all.",
+	"osd", "rw")
+COMMAND("osd crush rm-device-class "
+        "name=ids,type=CephString,n=N",
+        "remove class of the osd(s) <id> [<id>...], "  // trailing space: adjacent literals are joined verbatim
+        "or use <all|any> to remove all.",
+        "osd", "rw")
+COMMAND("osd crush class rename "
+        "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_] "
+        "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_]",
+        "rename crush device class <srcname> to <dstname>",
+        "osd", "rw")
+COMMAND("osd crush create-or-move "
+	"name=id,type=CephOsdName "
+	"name=weight,type=CephFloat,range=0.0 "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"create entry or move existing entry for <name> <weight> at/to location <args>",
+	"osd", "rw")
+COMMAND("osd crush move "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"move existing entry for <name> to location <args>",
+	"osd", "rw")
+COMMAND("osd crush swap-bucket "
+	"name=source,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=dest,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"swap existing bucket contents from (orphan) bucket <source> and <dest>",  // placeholders match the argument names above
+	"osd", "rw")
+COMMAND("osd crush link "
+	"name=name,type=CephString "
+	"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]",
+	"link existing entry for <name> under location <args>",
+	"osd", "rw")
+COMMAND("osd crush rm "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",  // <ancestor> is optional
+	"remove <name> from crush map (everywhere, or just at <ancestor>)",
+	"osd", "rw")
+COMMAND_WITH_FLAG("osd crush remove "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",
+	"remove <name> from crush map (everywhere, or just at <ancestor>)",
+	"osd", "rw",
+    FLAG(DEPRECATED))
+COMMAND("osd crush unlink "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]",
+	"unlink <name> from crush map (everywhere, or just at <ancestor>)",
+	"osd", "rw")
+COMMAND("osd crush reweight-all",
+	"recalculate the weights for the tree to ensure they sum correctly",
+	"osd", "rw")
+COMMAND("osd crush reweight "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=weight,type=CephFloat,range=0.0",
+	"change <name>'s weight to <weight> in crush map",
+	"osd", "rw")
+COMMAND("osd crush reweight-subtree "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=weight,type=CephFloat,range=0.0",
+	"change all leaf items beneath <name> to <weight> in crush map",
+	"osd", "rw")
+COMMAND("osd crush tunables "
+	"name=profile,type=CephChoices,strings=legacy|argonaut|bobtail|firefly|hammer|jewel|optimal|default",
+	"set crush tunables values to <profile>", "osd", "rw")
+COMMAND("osd crush set-tunable "
+	"name=tunable,type=CephChoices,strings=straw_calc_version "
+	"name=value,type=CephInt",
+	"set crush tunable <tunable> to <value>",
+	"osd", "rw")
+COMMAND("osd crush get-tunable "
+	"name=tunable,type=CephChoices,strings=straw_calc_version",
+	"get crush tunable <tunable>",
+	"osd", "r")
+COMMAND("osd crush show-tunables",
+	"show current crush tunables", "osd", "r")
+COMMAND("osd crush rule create-simple "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=root,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=type,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=mode,type=CephChoices,strings=firstn|indep,req=false",
+	"create crush rule <name> to start from <root>, replicate across buckets of type <type>, using a choose mode of <firstn|indep> (default firstn; indep best for erasure pools)",
+	"osd", "rw")
+COMMAND("osd crush rule create-replicated "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=root,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=type,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=class,type=CephString,goodchars=[A-Za-z0-9-_.],req=false",
+	"create crush rule <name> for replicated pool to start from <root>, replicate across buckets of type <type>, use devices of type <class> (ssd or hdd)",
+	"osd", "rw")
+COMMAND("osd crush rule create-erasure "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.=]",
+	"create crush rule <name> for erasure coded pool created with <profile> (default default)",
+	"osd", "rw")
+COMMAND("osd crush rule rm "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] ",
+	"remove crush rule <name>", "osd", "rw")
+COMMAND("osd crush rule rename "
+        "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] "
+        "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]",
+        "rename crush rule <srcname> to <dstname>",
+        "osd", "rw")
+COMMAND("osd crush tree "
+        "name=shadow,type=CephChoices,strings=--show-shadow,req=false",
+	"dump crush buckets and items in a tree view",
+	"osd", "r")
+COMMAND("osd crush ls name=node,type=CephString,goodchars=[A-Za-z0-9-_.]",
+	"list items beneath a node in the CRUSH tree",
+	"osd", "r")
+COMMAND("osd crush class ls",
+	"list all crush device classes",
+	"osd", "r")
+COMMAND("osd crush class ls-osd "
+        "name=class,type=CephString,goodchars=[A-Za-z0-9-_]",
+        "list all osds belonging to the specific <class>",
+        "osd", "r")
+COMMAND("osd crush get-device-class "
+        "name=ids,type=CephString,n=N",
+        "get classes of specified osd(s) <id> [<id>...]",
+        "osd", "r")
+COMMAND("osd crush weight-set ls",
+	"list crush weight sets",
+	"osd", "r")
+COMMAND("osd crush weight-set dump",
+	"dump crush weight sets",
+	"osd", "r")
+COMMAND("osd crush weight-set create-compat",
+	"create a default backward-compatible weight-set",
+	"osd", "rw")
+COMMAND("osd crush weight-set create "
+        "name=pool,type=CephPoolname "\
+        "name=mode,type=CephChoices,strings=flat|positional",
+	"create a weight-set for a given pool",
+	"osd", "rw")
+COMMAND("osd crush weight-set rm name=pool,type=CephPoolname",
+	"remove the weight-set for a given pool",
+	"osd", "rw")
+COMMAND("osd crush weight-set rm-compat",
+	"remove the backward-compatible weight-set",
+	"osd", "rw")
+COMMAND("osd crush weight-set reweight "
+        "name=pool,type=CephPoolname "
+	"name=item,type=CephString "
+        "name=weight,type=CephFloat,range=0.0,n=N",
+	"set weight for an item (bucket or osd) in a pool's weight-set",
+	"osd", "rw")
+COMMAND("osd crush weight-set reweight-compat "
+	"name=item,type=CephString "
+        "name=weight,type=CephFloat,range=0.0,n=N",
+	"set weight for an item (bucket or osd) in the backward-compatible weight-set",
+	"osd", "rw")
+COMMAND("osd setmaxosd "
+	"name=newmax,type=CephInt,range=0",
+	"set new maximum osd value", "osd", "rw")
+COMMAND("osd set-full-ratio "
+	"name=ratio,type=CephFloat,range=0.0|1.0",
+	"set usage ratio at which OSDs are marked full",
+	"osd", "rw")
+COMMAND("osd set-backfillfull-ratio "
+	"name=ratio,type=CephFloat,range=0.0|1.0",
+	"set usage ratio at which OSDs are marked too full to backfill",
+	"osd", "rw")
+COMMAND("osd set-nearfull-ratio "
+	"name=ratio,type=CephFloat,range=0.0|1.0",
+	"set usage ratio at which OSDs are marked near-full",
+	"osd", "rw")
+COMMAND("osd get-require-min-compat-client",
+        "get the minimum client version we will maintain compatibility with",
+        "osd", "r")
+COMMAND("osd set-require-min-compat-client "
+	"name=version,type=CephString "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+	"set the minimum client version we will maintain compatibility with",
+	"osd", "rw")
+COMMAND("osd pause", "pause osd", "osd", "rw")
+COMMAND("osd unpause", "unpause osd", "osd", "rw")
+COMMAND("osd erasure-code-profile set "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=profile,type=CephString,n=N,req=false "
+	"name=force,type=CephBool,req=false",
+	"create erasure code profile <name> with [<key[=value]> ...] pairs. Add a --force at the end to override an existing profile (VERY DANGEROUS)",
+	"osd", "rw")
+COMMAND("osd erasure-code-profile get "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.]",
+	"get erasure code profile <name>",
+	"osd", "r")
+COMMAND("osd erasure-code-profile rm "
+	"name=name,type=CephString,goodchars=[A-Za-z0-9-_.]",
+	"remove erasure code profile <name>",
+	"osd", "rw")
+COMMAND("osd erasure-code-profile ls",
+	"list all erasure code profiles",
+	"osd", "r")
+COMMAND("osd set "
+	"name=key,type=CephChoices,strings=full|pause|noup|nodown|"
+	"noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|"
+	"notieragent|nosnaptrim|pglog_hardlimit "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+	"set <key>", "osd", "rw")
+COMMAND("osd unset "
+	"name=key,type=CephChoices,strings=full|pause|noup|nodown|"\
+	"noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|"
+	"notieragent|nosnaptrim",
+	"unset <key>", "osd", "rw")
+COMMAND("osd require-osd-release "\
+	"name=release,type=CephChoices,strings=luminous|mimic|nautilus|octopus|pacific "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+	"set the minimum allowed OSD release to participate in the cluster",
+	"osd", "rw")
+COMMAND("osd down "
+	"name=ids,type=CephString,n=N "
+	"name=definitely_dead,type=CephBool,req=false",
+	"set osd(s) <id> [<id>...] down, "
+        "or use <any|all> to set all osds down",
+        "osd", "rw")
+COMMAND("osd stop "
+        "type=CephString,name=ids,n=N",
+        "stop the corresponding osd daemons and mark them as down",
+        "osd", "rw")
+COMMAND("osd out "
+	"name=ids,type=CephString,n=N",
+	"set osd(s) <id> [<id>...] out, "
+        "or use <any|all> to set all osds out",
+        "osd", "rw")
+COMMAND("osd in "
+	"name=ids,type=CephString,n=N",
+	"set osd(s) <id> [<id>...] in, "
+        "can use <any|all> to automatically set all previously out osds in",
+        "osd", "rw")
+COMMAND_WITH_FLAG("osd rm "
+	"name=ids,type=CephString,n=N",
+	"remove osd(s) <id> [<id>...], "
+        "or use <any|all> to remove all osds",
+	"osd", "rw",
+	FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noup "
+        "name=ids,type=CephString,n=N",
+        "mark osd(s) <id> [<id>...] as noup, "
+        "or use <all|any> to mark all osds as noup",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-nodown "
+        "name=ids,type=CephString,n=N",
+        "mark osd(s) <id> [<id>...] as nodown, "
+        "or use <all|any> to mark all osds as nodown",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noin "
+        "name=ids,type=CephString,n=N",
+        "mark osd(s) <id> [<id>...] as noin, "
+        "or use <all|any> to mark all osds as noin",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd add-noout "
+        "name=ids,type=CephString,n=N",
+        "mark osd(s) <id> [<id>...] as noout, "
+        "or use <all|any> to mark all osds as noout",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noup "
+        "name=ids,type=CephString,n=N",
+        "allow osd(s) <id> [<id>...] to be marked up "
+        "(if they are currently marked as noup), "
+        "can use <all|any> to automatically filter out all noup osds",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-nodown "
+        "name=ids,type=CephString,n=N",
+        "allow osd(s) <id> [<id>...] to be marked down "
+        "(if they are currently marked as nodown), "
+        "can use <all|any> to automatically filter out all nodown osds",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noin "
+        "name=ids,type=CephString,n=N",
+        "allow osd(s) <id> [<id>...] to be marked in "
+        "(if they are currently marked as noin), "
+        "can use <all|any> to automatically filter out all noin osds",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd rm-noout "
+        "name=ids,type=CephString,n=N",
+        "allow osd(s) <id> [<id>...] to be marked out "
+        "(if they are currently marked as noout), "
+        "can use <all|any> to automatically filter out all noout osds",
+        "osd", "rw",
+        FLAG(DEPRECATED))
+COMMAND("osd set-group "
+        "name=flags,type=CephString "
+        "name=who,type=CephString,n=N",
+        "set <flags> for batch osds or crush nodes, "
+        "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}",
+        "osd", "rw")
+COMMAND("osd unset-group "
+        "name=flags,type=CephString "
+        "name=who,type=CephString,n=N",
+        "unset <flags> for batch osds or crush nodes, "
+        "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}",
+        "osd", "rw")
+COMMAND("osd reweight "
+	"name=id,type=CephOsdName "
+	"type=CephFloat,name=weight,range=0.0|1.0",  // same range descriptor as "osd primary-affinity"
+	"reweight osd to 0.0 <= <weight> <= 1.0", "osd", "rw")
+COMMAND("osd reweightn "
+	"name=weights,type=CephString",
+	"reweight osds with {<id>: <weight>,...}",
+	"osd", "rw")
+COMMAND("osd force-create-pg "
+	"name=pgid,type=CephPgid "\
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"force creation of pg <pgid>",
+        "osd", "rw")
+COMMAND("osd pg-temp "
+	"name=pgid,type=CephPgid "
+	"name=id,type=CephOsdName,n=N,req=false",
+	"set pg_temp mapping pgid:[<id> [<id>...]] (developers only)",
+        "osd", "rw")
+COMMAND("osd pg-upmap "
+	"name=pgid,type=CephPgid "
+	"name=id,type=CephOsdName,n=N",
+	"set pg_upmap mapping <pgid>:[<id> [<id>...]] (developers only)",
+        "osd", "rw")
+COMMAND("osd rm-pg-upmap "
+	"name=pgid,type=CephPgid",
+	"clear pg_upmap mapping for <pgid> (developers only)",
+        "osd", "rw")
+COMMAND("osd pg-upmap-items "
+	"name=pgid,type=CephPgid "
+	"name=id,type=CephOsdName,n=N",
+	"set pg_upmap_items mapping <pgid>:{<id> to <id>, [...]} (developers only)",
+        "osd", "rw")
+COMMAND("osd rm-pg-upmap-items "
+	"name=pgid,type=CephPgid",
+	"clear pg_upmap_items mapping for <pgid> (developers only)",
+        "osd", "rw")
+COMMAND("osd primary-temp "
+	"name=pgid,type=CephPgid "
+	"name=id,type=CephOsdName",
+        "set primary_temp mapping pgid:<id>|-1 (developers only)",
+        "osd", "rw")
+COMMAND("osd primary-affinity "
+	"name=id,type=CephOsdName "
+	"type=CephFloat,name=weight,range=0.0|1.0",
+	"adjust osd primary-affinity from 0.0 <= <weight> <= 1.0",
+	"osd", "rw")
+COMMAND_WITH_FLAG("osd destroy-actual "
+        "name=id,type=CephOsdName "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+        "mark osd as being destroyed. Keeps the ID intact (allowing reuse), "
+        "but removes cephx keys, config-key data and lockbox keys, "\
+        "rendering data permanently unreadable.",
+		  "osd", "rw", FLAG(HIDDEN))
+COMMAND("osd purge-new "
+        "name=id,type=CephOsdName "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+        "purge all traces of an OSD that was partially created but never "
+	"started",
+        "osd", "rw")
+COMMAND_WITH_FLAG("osd purge-actual "
+        "name=id,type=CephOsdName "
+        "name=yes_i_really_mean_it,type=CephBool,req=false",
+        "purge all osd data from the monitors. Combines `osd destroy`, "
+        "`osd rm`, and `osd crush rm`.",
+		  "osd", "rw", FLAG(HIDDEN))
+COMMAND("osd lost "
+	"name=id,type=CephOsdName "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL",
+	"osd", "rw")
+COMMAND_WITH_FLAG("osd create "
+	"name=uuid,type=CephUUID,req=false "
+	"name=id,type=CephOsdName,req=false",
+	"create new osd (with optional UUID and ID)", "osd", "rw",
+	FLAG(DEPRECATED))
+COMMAND("osd new "
+        "name=uuid,type=CephUUID,req=true "
+        "name=id,type=CephOsdName,req=false",
+        "Create a new OSD. If supplied, the `id` to be replaced needs to "
+        "exist and have been previously destroyed. "
+        "Reads secrets from JSON file via `-i <file>` (see man page).",
+        "osd", "rw")
+COMMAND("osd blocklist "
+	"name=range,type=CephString,goodchars=[range],req=false "
+	"name=blocklistop,type=CephChoices,strings=add|rm "
+	"name=addr,type=CephEntityAddr "
+	"name=expire,type=CephFloat,range=0.0,req=false",
+	"add (optionally until <expire> seconds from now) or remove <addr> from blocklist",
+	"osd", "rw")
+COMMAND("osd blocklist ls", "show blocklisted clients", "osd", "r")
+COMMAND("osd blocklist clear", "clear all blocklisted clients", "osd", "rw")
+
+COMMAND_WITH_FLAG("osd blacklist "
+	"name=blacklistop,type=CephChoices,strings=add|rm "
+	"name=addr,type=CephEntityAddr "
+	"name=expire,type=CephFloat,range=0.0,req=false",
+	"add (optionally until <expire> seconds from now) or remove <addr> from blacklist",
+	"osd", "rw",
+	FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd blacklist ls", "show blacklisted clients", "osd", "r",
+	FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("osd blacklist clear", "clear all blacklisted clients", "osd", "rw",
+	FLAG(DEPRECATED))
+
+COMMAND("osd pool mksnap "
+	"name=pool,type=CephPoolname "
+	"name=snap,type=CephString",
+	"make snapshot <snap> in <pool>", "osd", "rw")
+COMMAND("osd pool rmsnap "
+	"name=pool,type=CephPoolname "
+	"name=snap,type=CephString",
+	"remove snapshot <snap> from <pool>", "osd", "rw")
+COMMAND("osd pool ls "
+	"name=detail,type=CephChoices,strings=detail,req=false",
+	"list pools", "osd", "r")
+COMMAND("osd pool create "
+	"name=pool,type=CephPoolname "
+	"name=pg_num,type=CephInt,range=0,req=false "
+	"name=pgp_num,type=CephInt,range=0,req=false "
+        "name=pool_type,type=CephChoices,strings=replicated|erasure,req=false "
+	"name=erasure_code_profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.] "
+	"name=rule,type=CephString,req=false "
+        "name=expected_num_objects,type=CephInt,range=0,req=false "
+        "name=size,type=CephInt,range=0,req=false "
+	"name=pg_num_min,type=CephInt,range=0,req=false "
+	"name=pg_num_max,type=CephInt,range=0,req=false "
+	"name=autoscale_mode,type=CephChoices,strings=on|off|warn,req=false "
+	"name=bulk,type=CephBool,req=false "
+	"name=target_size_bytes,type=CephInt,range=0,req=false "
+	"name=target_size_ratio,type=CephFloat,range=0.0,req=false",\
+	"create pool", "osd", "rw")
+COMMAND_WITH_FLAG("osd pool delete "
+	"name=pool,type=CephPoolname "
+	"name=pool2,type=CephPoolname,req=false "
+	"name=yes_i_really_really_mean_it,type=CephBool,req=false "
+	"name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ",
+	"delete pool",
+	"osd", "rw",
+    FLAG(DEPRECATED))
+COMMAND("osd pool rm "
+	"name=pool,type=CephPoolname "
+	"name=pool2,type=CephPoolname,req=false "
+	"name=yes_i_really_really_mean_it,type=CephBool,req=false "
+	"name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ",
+	"remove pool",
+	"osd", "rw")
+COMMAND("osd pool rename "
+	"name=srcpool,type=CephPoolname "
+	"name=destpool,type=CephPoolname",
+	"rename <srcpool> to <destpool>", "osd", "rw")
+COMMAND("osd pool get "
+	"name=pool,type=CephPoolname "
+	"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|bulk",
+	"get pool parameter <var>", "osd", "r")
+COMMAND("osd pool set "
+	"name=pool,type=CephPoolname "
+	"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|bulk "
+	"name=val,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"set pool parameter <var> to <val>", "osd", "rw")
+// 'val' is a CephString because it can include a unit.  Perhaps
+// there should be a Python type for validation/conversion of strings
+// with units.
+COMMAND("osd pool set-quota "
+	"name=pool,type=CephPoolname "
+	"name=field,type=CephChoices,strings=max_objects|max_bytes "
+	"name=val,type=CephString",
+	"set object or byte limit on pool", "osd", "rw")
+COMMAND("osd pool get-quota "
+        "name=pool,type=CephPoolname ",
+        "obtain object or byte limits for pool",
+        "osd", "r")
+COMMAND("osd pool application enable "
+        "name=pool,type=CephPoolname "
+        "name=app,type=CephString,goodchars=[A-Za-z0-9-_.] "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+        "enable use of an application <app> [cephfs,rbd,rgw] on pool <poolname>",
+        "osd", "rw")
+COMMAND("osd pool application disable "
+        "name=pool,type=CephPoolname "
+        "name=app,type=CephString "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+        "disables use of an application <app> on pool <poolname>",
+        "osd", "rw")
+COMMAND("osd pool application set "
+        "name=pool,type=CephPoolname "
+        "name=app,type=CephString "
+        "name=key,type=CephString,goodchars=[A-Za-z0-9-_.] "
+        "name=value,type=CephString,goodchars=[A-Za-z0-9-_.=]",
+        "sets application <app> metadata key <key> to <value> on pool <poolname>",
+        "osd", "rw")
+COMMAND("osd pool application rm "
+        "name=pool,type=CephPoolname "
+        "name=app,type=CephString "
+        "name=key,type=CephString",
+        "removes application <app> metadata key <key> on pool <poolname>",
+        "osd", "rw")
+COMMAND("osd pool application get "
+        "name=pool,type=CephPoolname,req=false "  // optional, like <app> and <key> below
+        "name=app,type=CephString,req=false "
+        "name=key,type=CephString,req=false",
+        "get value of key <key> of application <app> on pool <poolname>",
+        "osd", "r")
+COMMAND("osd utilization",
+	"get basic pg distribution stats",
+	"osd", "r")
+COMMAND("osd force_healthy_stretch_mode " \
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"force a healthy stretch mode, requiring the full number of CRUSH buckets "
+	"to peer and letting all non-tiebreaker monitors be elected leader ",
+	"osd", "rw")
+COMMAND("osd force_recovery_stretch_mode " \
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"try and force a recovery stretch mode, increasing the "
+	"pool size to its non-failure value if currently degraded and "
+	"all monitor buckets are up",
+	"osd", "rw")
+
+
+// tiering
+COMMAND("osd tier add "
+	"name=pool,type=CephPoolname "
+	"name=tierpool,type=CephPoolname "
+	"name=force_nonempty,type=CephChoices,strings=--force-nonempty,req=false",
+	"add the tier <tierpool> (the second one) to base pool <pool> (the first one)",
+	"osd", "rw")
+COMMAND("osd tier rm "
+	"name=pool,type=CephPoolname "
+	"name=tierpool,type=CephPoolname",
+	"remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+	"osd", "rw")
+COMMAND_WITH_FLAG("osd tier remove "
+	"name=pool,type=CephPoolname "
+	"name=tierpool,type=CephPoolname",
+	"remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+	"osd", "rw",
+    FLAG(DEPRECATED))
+COMMAND("osd tier cache-mode "
+	"name=pool,type=CephPoolname "
+	"name=mode,type=CephChoices,strings=writeback|readproxy|readonly|none "
+	"name=yes_i_really_mean_it,type=CephBool,req=false",
+	"specify the caching mode for cache tier <pool>", "osd", "rw")
+COMMAND("osd tier set-overlay "
+	"name=pool,type=CephPoolname "
+	"name=overlaypool,type=CephPoolname",
+	"set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw")
+COMMAND("osd tier rm-overlay "
+	"name=pool,type=CephPoolname ",
+	"remove the overlay pool for base pool <pool>", "osd", "rw")
+COMMAND_WITH_FLAG("osd tier remove-overlay "
+	"name=pool,type=CephPoolname ",
+	"remove the overlay pool for base pool <pool>", "osd", "rw",
+    FLAG(DEPRECATED))
+
+COMMAND("osd tier add-cache "
+	"name=pool,type=CephPoolname "
+	"name=tierpool,type=CephPoolname "
+	"name=size,type=CephInt,range=0",
+	"add a cache <tierpool> (the second one) of size <size> to existing pool <pool> (the first one)",
+	"osd", "rw")
+
+/*
+ * mon/KVMonitor.cc
+ */
+
+COMMAND("config-key get "
+	"name=key,type=CephString",
+	"get <key>", "config-key", "r")
+COMMAND("config-key set "
+	"name=key,type=CephString "
+	"name=val,type=CephString,req=false",
+	"set <key> to value <val>", "config-key", "rw")
+COMMAND_WITH_FLAG("config-key put "
+		  "name=key,type=CephString "
+		  "name=val,type=CephString,req=false",
+		  "put <key>, value <val>", "config-key", "rw",
+		  FLAG(DEPRECATED))
+COMMAND_WITH_FLAG("config-key del "
+	"name=key,type=CephString",
+	"delete <key>", "config-key", "rw",
+    FLAG(DEPRECATED))
+COMMAND("config-key rm "
+	"name=key,type=CephString",
+	"rm <key>", "config-key", "rw")
+COMMAND("config-key exists "
+	"name=key,type=CephString",
+	"check for <key>'s existence", "config-key", "r")
+COMMAND_WITH_FLAG("config-key list ", "list keys", "config-key", "r",
+		  FLAG(DEPRECATED))
+COMMAND("config-key ls ", "list keys", "config-key", "r")
+COMMAND("config-key dump "
+	"name=key,type=CephString,req=false", "dump keys and values (with optional prefix)", "config-key", "r")
+
+
+/*
+ * mon/MgrMonitor.cc
+ */
+COMMAND("mgr stat",
+	"dump basic info about the mgr cluster state",
+	"mgr", "r")
+COMMAND("mgr dump "
+	"name=epoch,type=CephInt,range=0,req=false",
+	"dump the latest MgrMap",
+	"mgr", "r")
+COMMAND("mgr fail name=who,type=CephString,req=false",
+	"treat the named manager daemon as failed", "mgr", "rw")
+COMMAND("mgr module ls",
+	"list active mgr modules", "mgr", "r")
+COMMAND("mgr services",
+	"list service endpoints provided by mgr modules",
+        "mgr", "r")
+COMMAND("mgr module enable "
+	"name=module,type=CephString "
+	"name=force,type=CephChoices,strings=--force,req=false",
+	"enable mgr module", "mgr", "rw")
+COMMAND("mgr module disable "
+	"name=module,type=CephString",
+	"disable mgr module", "mgr", "rw")
+COMMAND("mgr metadata name=who,type=CephString,req=false",
+	"dump metadata for all daemons or a specific daemon",
+	"mgr", "r")
+COMMAND("mgr count-metadata name=property,type=CephString",
+	"count ceph-mgr daemons by metadata field property",
+	"mgr", "r")
+COMMAND("mgr versions",
+	"check running versions of ceph-mgr daemons",
+	"mgr", "r")
+
+// ConfigMonitor
+COMMAND("config set"
+	" name=who,type=CephString"
+	" name=name,type=CephString"
+	" name=value,type=CephString"
+	" name=force,type=CephBool,req=false",
+	"Set a configuration option for one or more entities",
+	"config", "rw")
+COMMAND("config rm"
+	" name=who,type=CephString"
+	" name=name,type=CephString",
+	"Clear a configuration option for one or more entities",
+	"config", "rw")
+COMMAND("config get "
+	"name=who,type=CephString "
+	"name=key,type=CephString,req=false",  // optional; lower-case "false" as in the rest of this table
+	"Show configuration option(s) for an entity",
+	"config", "r")
+COMMAND("config dump",
+	"Show all configuration option(s)",
+	"mon", "r")
+COMMAND("config help "
+	"name=key,type=CephString",
+	"Describe a configuration option",
+	"config", "r")
+COMMAND("config ls",
+	"List available configuration options",
+	"config", "r")
+COMMAND("config assimilate-conf",
+	"Assimilate options from a conf, and return a new, minimal conf file",
+	"config", "rw")
+COMMAND("config log name=num,type=CephInt,req=false",  // optional; lower-case "false" as in the rest of this table
+	"Show recent history of config changes",
+	"config", "r")
+COMMAND("config reset "
+	"name=num,type=CephInt,range=0",
+	"Revert configuration to a historical version specified by <num>",
+	"config", "rw")
+COMMAND("config generate-minimal-conf",
+	"Generate a minimal ceph.conf file",
+	"config", "r")
+
+
+
+
+// these are tell commands that were implemented as CLI commands in
+// the broken pre-octopus way that we want to allow to work when a
+// monitor has upgraded to octopus+ but the monmap min_mon_release is
+// still < octopus.  we exclude things that weren't well supported
+// before and that aren't implemented by the octopus mon anymore.
+//
+// the command set below matches the kludge in Monitor::handle_command
+// that shunts these off to the asok machinery.
+
+COMMAND_WITH_FLAG("injectargs "
+	    "name=injected_args,type=CephString,n=N",
+	    "inject config arguments into monitor", "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("smart name=devid,type=CephString,req=false",
+            "Query health metrics for underlying device",
+	    "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("mon_status",
+	    "report status of monitors",
+	    "mon", "r",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("heap "
+            "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats "
+            "name=value,type=CephString,req=false",
+            "show heap usage info (available only if compiled with tcmalloc)",
+	    "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("connection scores dump",
+		  "show the scores used in connectivity-based elections",
+		  "mon", "rwx",
+		  FLAG(TELL))
+COMMAND_WITH_FLAG("connection scores reset",
+		  "reset the scores used in connectivity-based elections",
+		  "mon", "rwx",
+		  FLAG(TELL))
+COMMAND_WITH_FLAG("sync_force "
+            "name=validate,type=CephChoices,strings=--yes-i-really-mean-it,req=false",
+            "force sync of and clear monitor store",
+            "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("add_bootstrap_peer_hint "
+            "name=addr,type=CephIPAddr",
+            "add peer address as potential bootstrap "
+            "peer for cluster bringup",
+            "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("add_bootstrap_peer_hintv "
+            "name=addrv,type=CephString",
+            "add peer address vector as potential bootstrap "
+            "peer for cluster bringup",
+            "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("quorum enter ",
+            "force monitor back into quorum",
+            "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("quorum exit",
+            "force monitor out of the quorum",
+            "mon", "rw",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("ops",
+            "show the ops currently in flight",
+            "mon", "r",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("sessions",
+            "list existing sessions",
+            "mon", "r",
+            FLAG(TELL))
+COMMAND_WITH_FLAG("dump_historic_ops",
+            "dump_historic_ops",
+            "mon", "r",
+            FLAG(TELL))
diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc
new file mode 100644
index 000000000..2d14578a6
--- /dev/null
+++ b/src/mon/MonMap.cc
@@ -0,0 +1,972 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonMap.h"
+
+#include <algorithm>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/fstream.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/net/dns.hh>
+#include "crimson/common/config_proxy.h"
+#endif
+
+#include "common/Formatter.h"
+
+#include "include/ceph_features.h"
+#include "include/addr_parsing.h"
+#include "common/ceph_argparse.h"
+#include "common/dns_resolve.h"
+#include "common/errno.h"
+#include "common/dout.h"
+#include "common/Clock.h"
+#include "mon/health_check.h"
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::DNSResolver;
+using ceph::Formatter;
+
+/**
+ * Encode this monitor entry into @p bl.
+ *
+ * Peers that lack SERVER_NAUTILUS receive struct version 2 carrying a
+ * single legacy address; all other peers receive version 5 carrying the
+ * full address vector.  name, priority, weight and crush_loc are written
+ * in both cases.
+ *
+ * @param bl       buffer the encoding is appended to
+ * @param features feature bits of the peer that will decode the result
+ */
+void mon_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+  // crush_loc was added in version 5, but the compat version is left at 1
+  // whether or not crush_loc is populated, so that old clients can still
+  // decode the entry.  Monitors are protected from misunderstandings about
+  // location because setting it is blocked on FEATURE_PINGING.
+  const uint8_t min_v = 1;
+  const uint8_t v = HAVE_FEATURE(features, SERVER_NAUTILUS) ? 5 : 2;
+
+  ENCODE_START(v, min_v, bl);
+  encode(name, bl);
+  if (v >= 3) {
+    encode(public_addrs, bl, features);
+  } else {
+    ceph_assert(min_v == 1);
+    const auto legacy = public_addrs.legacy_addr();
+    if (legacy != entity_addr_t()) {
+      encode(legacy, bl, features);
+    } else {
+      // There is no legacy addr to hand out, so lie and present one that
+      // merely looks legacy: old clients at least get a valid-looking
+      // map.  They will not be able to talk to the v2 mons, but that is
+      // better than nothing.
+      encode(public_addrs.as_legacy_addr(), bl, features);
+    }
+  }
+  encode(priority, bl);
+  encode(weight, bl);
+  encode(crush_loc, bl);
+  ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode a monitor entry from @p p.
+ *
+ * name and public_addrs are always read.  The remaining fields are read
+ * only when the encoded struct version is new enough; for older
+ * encodings the corresponding members are left untouched.
+ *
+ * @param p iterator positioned at the start of an encoded mon_info_t
+ */
+void mon_info_t::decode(ceph::buffer::list::const_iterator& p)
+{
+  DECODE_START(5, p);
+  decode(name, p);
+  decode(public_addrs, p);
+  // priority is only read for struct version 2 and later
+  if (struct_v >= 2) {
+    decode(priority, p);
+  }
+  // weight is only read for struct version 4 and later
+  if (struct_v >= 4) {
+    decode(weight, p);
+  }
+  // crush_loc is only read for struct version 5 and later
+  if (struct_v >= 5) {
+    decode(crush_loc, p);
+  }
+  DECODE_FINISH(p);
+}
+
+/// Write a single-line, human-readable description of this monitor
+/// (name, addresses, priority, weight, crush location) to @p out.
+void mon_info_t::print(ostream& out) const
+{
+  out << "mon." << name;
+  out << " addrs " << public_addrs;
+  out << " priority " << priority;
+  out << " weight " << weight;
+  out << " crush location " << crush_loc;
+}
+
+namespace {
+  /// Comparator used to derive legacy ranks: orders monitors by
+  /// public_addrs.legacy_or_front_addr(), breaking ties by name.
+  struct rank_cmp {
+    bool operator()(const mon_info_t &a, const mon_info_t &b) const {
+      const auto lhs = a.public_addrs.legacy_or_front_addr();
+      const auto rhs = b.public_addrs.legacy_or_front_addr();
+      return (lhs == rhs) ? (a.name < b.name) : (lhs < rhs);
+    }
+  };
+}
+
+/**
+ * Rebuild 'ranks' from 'mon_info' using the legacy ordering, i.e. the
+ * order defined by rank_cmp (address first, then name).
+ */
+void MonMap::calc_legacy_ranks()
+{
+  ranks.resize(mon_info.size());
+
+  // Ranks are expected to follow the ordering of the monitors' public
+  // addresses; a different comparator could be plugged in later if some
+  // other criterion is wanted.
+  //
+  // A 'set' is used rather than std::sort() because the ordering needs
+  // the whole mon_info_t while 'ranks' only stores names.  Doing this
+  // with std::sort() would mean copying every mon_info_t into a scratch
+  // vector, sorting that with a custom comparison, and then copying the
+  // names out again; that extra complexity is not considered worth the
+  // memory a 'set' costs.
+  set<mon_info_t, rank_cmp> ordered;
+  for (const auto& entry : mon_info) {
+    ordered.insert(entry.second);
+  }
+
+  // position in the ordered set == rank
+  unsigned rank = 0;
+  for (const auto& info : ordered) {
+    ranks[rank] = info.name;
+    ++rank;
+  }
+}
+
+/**
+ * Encode the monitor map into @p blist in a format chosen from the
+ * peer's feature bits.
+ *
+ * Four layouts are produced, checked in this order:
+ *  - no CEPH_FEATURE_MONNAMES: raw version 1, monitors as a vector of
+ *    entity_inst_t indexed by rank;
+ *  - no MONENC: raw version 2, monitors as a name -> legacy address map;
+ *  - no SERVER_NAUTILUS: ENCODE_START(5, 3), legacy address map plus
+ *    features and mon_info;
+ *  - otherwise: ENCODE_START(9, 6), the full map.
+ *
+ * @param blist        buffer the encoding is appended to
+ * @param con_features feature bits of the peer that will decode the result
+ */
+void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const
+{
+  // oldest format: monitors are identified by rank only
+  if ((con_features & CEPH_FEATURE_MONNAMES) == 0) {
+    using ceph::encode;
+    __u16 v = 1;
+    encode(v, blist);
+    ceph::encode_raw(fsid, blist);
+    encode(epoch, blist);
+    vector<entity_inst_t> mon_inst(ranks.size());
+    for (unsigned n = 0; n < ranks.size(); n++) {
+      mon_inst[n].name = entity_name_t::MON(n);
+      mon_inst[n].addr = get_addrs(n).legacy_addr();
+    }
+    encode(mon_inst, blist, con_features);
+    encode(last_changed, blist);
+    encode(created, blist);
+    return;
+  }
+
+  // both remaining pre-nautilus formats carry a name -> legacy addr map;
+  // build it once here
+  map<string,entity_addr_t> legacy_mon_addr;
+  if (!HAVE_FEATURE(con_features, MONENC) ||
+      !HAVE_FEATURE(con_features, SERVER_NAUTILUS)) {
+    for (auto& [name, info] : mon_info) {
+      legacy_mon_addr[name] = info.public_addrs.legacy_addr();
+    }
+  }
+
+  if (!HAVE_FEATURE(con_features, MONENC)) {
+    /* we keep the mon_addr map when encoding to ensure compatibility
+       * with clients and other monitors that do not yet support the 'mons'
+       * map. This map keeps its original behavior, containing a mapping of
+       * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public
+       * address -- which is obtained from the public address of each entry
+       * in the 'mons' map.
+       */
+    using ceph::encode;
+    __u16 v = 2;
+    encode(v, blist);
+    ceph::encode_raw(fsid, blist);
+    encode(epoch, blist);
+    encode(legacy_mon_addr, blist, con_features);
+    encode(last_changed, blist);
+    encode(created, blist);
+    return;
+  }
+
+  // versioned encoding for peers without SERVER_NAUTILUS: legacy addr
+  // map followed by features and the mon_info map
+  if (!HAVE_FEATURE(con_features, SERVER_NAUTILUS)) {
+    ENCODE_START(5, 3, blist);
+    ceph::encode_raw(fsid, blist);
+    encode(epoch, blist);
+    encode(legacy_mon_addr, blist, con_features);
+    encode(last_changed, blist);
+    encode(created, blist);
+    encode(persistent_features, blist);
+    encode(optional_features, blist);
+    encode(mon_info, blist, con_features);
+    ENCODE_FINISH(blist);
+    return;
+  }
+
+  // current format; the field order here must match MonMap::decode()
+  ENCODE_START(9, 6, blist);
+  ceph::encode_raw(fsid, blist);
+  encode(epoch, blist);
+  encode(last_changed, blist);
+  encode(created, blist);
+  encode(persistent_features, blist);
+  encode(optional_features, blist);
+  encode(mon_info, blist, con_features);
+  encode(ranks, blist);
+  encode(min_mon_release, blist);
+  encode(removed_ranks, blist);
+  // the election strategy goes on the wire as a single byte
+  uint8_t t = strategy;
+  encode(t, blist);
+  encode(disallowed_leaders, blist);
+  encode(stretch_mode_enabled, blist);
+  encode(tiebreaker_mon, blist);
+  encode(stretch_marked_down_mons, blist);
+  ENCODE_FINISH(blist);
+}
+
+/**
+ * Decode a monitor map from @p p, accepting every layout that
+ * MonMap::encode() can produce.
+ *
+ * Fields that an older encoding does not carry are either derived
+ * (mon_info from the legacy address map, ranks via calc_legacy_ranks(),
+ * min_mon_release from the persistent features) or, for the stretch-mode
+ * fields, reset to their empty values.
+ *
+ * @param p iterator positioned at the start of an encoded MonMap
+ */
+void MonMap::decode(ceph::buffer::list::const_iterator& p)
+{
+  // name -> address map used only by the pre-version-6 layouts
+  map<string,entity_addr_t> mon_addr;
+  DECODE_START_LEGACY_COMPAT_LEN_16(9, 3, 3, p);
+  ceph::decode_raw(fsid, p);
+  decode(epoch, p);
+  if (struct_v == 1) {
+    // version 1 has no names: synthesize one from the index
+    vector<entity_inst_t> mon_inst;
+    decode(mon_inst, p);
+    for (unsigned i = 0; i < mon_inst.size(); i++) {
+      // NOTE(review): a one-character name built as '0' + i only yields a
+      // digit for i < 10 -- confirm that is acceptable for this format.
+      char n[2];
+      n[0] = '0' + i;
+      n[1] = 0;
+      string name = n;
+      mon_addr[name] = mon_inst[i].addr;
+    }
+  } else if (struct_v < 6) {
+    decode(mon_addr, p);
+  }
+  decode(last_changed, p);
+  decode(created, p);
+  if (struct_v >= 4) {
+    decode(persistent_features, p);
+    decode(optional_features, p);
+  }
+  if (struct_v < 5) {
+    // generate mon_info from legacy mon_addr
+    for (auto& [name, addr] : mon_addr) {
+      mon_info_t &m = mon_info[name];
+      m.name = name;
+      m.public_addrs = entity_addrvec_t(addr);
+    }
+  } else {
+    decode(mon_info, p);
+  }
+  // ranks are only present from version 6 on; compute them otherwise
+  if (struct_v < 6) {
+    calc_legacy_ranks();
+  } else {
+    decode(ranks, p);
+  }
+  if (struct_v >= 7) {
+    decode(min_mon_release, p);
+  } else {
+    min_mon_release = infer_ceph_release_from_mon_features(persistent_features);
+  }
+  if (struct_v >= 8) {
+    decode(removed_ranks, p);
+    // the election strategy is carried as a single byte
+    uint8_t t;
+    decode(t, p);
+    strategy = static_cast<election_strategy>(t);
+    decode(disallowed_leaders, p);
+  }
+  if (struct_v >= 9) {
+    decode(stretch_mode_enabled, p);
+    decode(tiebreaker_mon, p);
+    decode(stretch_marked_down_mons, p);
+  } else {
+    // no stretch-mode information in this encoding: reset it
+    stretch_mode_enabled = false;
+    tiebreaker_mon = "";
+    stretch_marked_down_mons.clear();
+  }
+  // rebuild the address lookup from the mon_info just decoded
+  calc_addr_mons();
+  DECODE_FINISH(p);
+}
+
+/**
+ * Append three heap-allocated sample maps to @p o: a default-constructed
+ * map, a map with a single monitor that has an empty address vector, and
+ * a map mixing blank addresses (distinguished by nonce), a parsed
+ * address and an empty address vector.
+ */
+void MonMap::generate_test_instances(list<MonMap*>& o)
+{
+  // 1) default-constructed map
+  o.push_back(new MonMap);
+
+  // 2) one monitor, empty address vector
+  MonMap *single = new MonMap;
+  o.push_back(single);
+  single->epoch = 1;
+  single->last_changed = utime_t(123, 456);
+  single->created = utime_t(789, 101112);
+  single->add("one", entity_addrvec_t());
+
+  // 3) several monitors with assorted addresses
+  MonMap *multi = new MonMap;
+  multi->epoch = 1;
+  multi->last_changed = utime_t(123, 456);
+
+  // a blank address that only differs from its siblings by nonce
+  auto blank_with_nonce = [](unsigned nonce) {
+    entity_addrvec_t av = entity_addrvec_t(entity_addr_t());
+    av.v[0].set_nonce(nonce);
+    return av;
+  };
+  multi->add("empty_addr_one", blank_with_nonce(1));
+  multi->add("empty_addr_two", blank_with_nonce(2));
+
+  const char *pub_addr_str = "127.0.1.2";
+  const char *pub_addr_end = pub_addr_str + strlen(pub_addr_str);
+  entity_addrvec_t parsed_pub_addr;
+  parsed_pub_addr.parse(pub_addr_str, &pub_addr_end);
+  multi->add(mon_info_t("filled_pub_addr",
+                        entity_addrvec_t(parsed_pub_addr), 1, 1));
+
+  multi->add("empty_addr_zero", entity_addrvec_t());
+  o.push_back(multi);
+}
+
+// read from/write to a file
+
+/// Encode this map with CEPH_FEATURES_ALL and write the result to file
+/// @p fn; returns whatever the buffer's write_file() returns.
+int MonMap::write(const char *fn)
+{
+  ceph::buffer::list encoded;
+  encode(encoded, CEPH_FEATURES_ALL);
+  return encoded.write_file(fn);
+}
+
+/// Load file @p fn and decode it into this map.  Returns the negative
+/// value from read_file() if the file cannot be read, 0 otherwise.
+int MonMap::read(const char *fn)
+{
+  ceph::buffer::list contents;
+  std::string error;  // handed to read_file(); not reported any further
+  const int r = contents.read_file(fn, &error);
+  if (r >= 0) {
+    decode(contents);
+    return 0;
+  }
+  return r;
+}
+
+/// Print a one-line summary: epoch, monitor count, a name=addrs list and
+/// the removed ranks.
+void MonMap::print_summary(ostream& out) const
+{
+  out << "e" << epoch << ": "
+      << mon_info.size() << " mons at {";
+  // mon_info maps names to mon_info_t rather than straight to an address
+  // as the printed map once did, so emit "name=addrs" pairs by hand to
+  // keep the expected format.
+  bool first = true;
+  for (const auto& [mon_name, info] : mon_info) {
+    if (!first) {
+      out << ",";
+    }
+    first = false;
+    out << mon_name << "=" << info.public_addrs;
+  }
+  out << "}" << " removed_ranks: {" << removed_ranks << "}";
+}
+ 
+/**
+ * Print the map in its multi-line text form: header fields first, then
+ * one line per monitor in rank order.  Stretch-mode details and the
+ * disallowed leaders are only printed when relevant.
+ */
+void MonMap::print(ostream& out) const
+{
+  out << "epoch " << epoch << "\n";
+  out << "fsid " << fsid << "\n";
+  out << "last_changed " << last_changed << "\n";
+  out << "created " << created << "\n";
+  out << "min_mon_release " << to_integer<unsigned>(min_mon_release)
+      << " (" << min_mon_release << ")\n";
+  out << "election_strategy: " << strategy << "\n";
+
+  const bool show_leaders =
+    stretch_mode_enabled || !disallowed_leaders.empty();
+  if (stretch_mode_enabled) {
+    out << "stretch_mode_enabled " << stretch_mode_enabled << "\n";
+    out << "tiebreaker_mon " << tiebreaker_mon << "\n";
+  }
+  if (show_leaders) {
+    out << "disallowed_leaders " << disallowed_leaders << "\n";
+  }
+
+  unsigned rank = 0;
+  for (const auto& mon_name : ranks) {
+    // every ranked name must have a mon_info entry
+    const auto found = mon_info.find(mon_name);
+    ceph_assert(found != mon_info.end());
+    const mon_info_t& info = found->second;
+
+    out << rank << ": " << info.public_addrs << " mon." << mon_name;
+    ++rank;
+    if (!info.crush_loc.empty()) {
+      out << "; crush_location " << info.crush_loc;
+    }
+    out << "\n";
+  }
+}
+
+/**
+ * Dump the whole map through @p f: header fields, a "features" object
+ * and a "mons" array with one object per monitor in rank order.
+ */
+void MonMap::dump(Formatter *f) const
+{
+  f->dump_unsigned("epoch", epoch);
+  f->dump_stream("fsid") <<  fsid;
+  last_changed.gmtime(f->dump_stream("modified"));
+  created.gmtime(f->dump_stream("created"));
+  f->dump_unsigned("min_mon_release", to_integer<unsigned>(min_mon_release));
+  f->dump_string("min_mon_release_name", to_string(min_mon_release));
+  f->dump_int ("election_strategy", strategy);
+  f->dump_stream("disallowed_leaders: ") << disallowed_leaders;
+  f->dump_bool("stretch_mode", stretch_mode_enabled);
+  f->dump_string("tiebreaker_mon", tiebreaker_mon);
+  f->dump_stream("removed_ranks: ") << removed_ranks;
+
+  f->open_object_section("features");
+  persistent_features.dump(f, "persistent");
+  optional_features.dump(f, "optional");
+  f->close_section();
+
+  f->open_array_section("mons");
+  int rank = 0;
+  for (const auto& mon_name : ranks) {
+    f->open_object_section("mon");
+    f->dump_int("rank", rank);
+    f->dump_string("name", mon_name);
+    const auto& addrs = get_addrs(mon_name);
+    f->dump_object("public_addrs", addrs);
+    // compat: make these look like pre-nautilus entity_addr_t
+    f->dump_stream("addr") << addrs.get_legacy_str();
+    f->dump_stream("public_addr") << addrs.get_legacy_str();
+    f->dump_unsigned("priority", get_priority(mon_name));
+    f->dump_unsigned("weight", get_weight(mon_name));
+    // no validity assert on the lookup here: the get_* calls above
+    // already did that for this name
+    const auto found = mon_info.find(mon_name);
+    f->dump_stream("crush_location") << found->second.crush_loc;
+    f->close_section();
+    ++rank;
+  }
+  f->close_section();
+}
+
+/// Dump only the epoch, the name of the minimum monitor release and the
+/// number of ranked monitors through @p f.
+void MonMap::dump_summary(Formatter *f) const
+{
+  const auto release_name = to_string(min_mon_release);
+  const auto num_mons = ranks.size();
+
+  f->dump_unsigned("epoch", epoch);
+  f->dump_string("min_mon_release_name", release_name);
+  f->dump_unsigned("num_mons", num_mons);
+}
+
+// An ambiguous mon addr may be legacy or may be msgr2 -- we aren't sure.
+// When that happens we need to try them both (unless we can reasonably
+// infer from the port number which it is).
+//
+// Addresses already present in the map are never added again.  Outside
+// of mkfs a guessed legacy address is registered under "<name>-legacy";
+// with for_mkfs set, everything is registered under <name> itself.
+void MonMap::_add_ambiguous_addr(const string& name,
+                                 entity_addr_t addr,
+                                 int priority,
+                                 int weight,
+                                 bool for_mkfs)
+{
+  // register a single-address monitor unless the address is already known
+  auto add_if_new = [&](const string& mon_name, const entity_addr_t& a) {
+    if (!contains(a)) {
+      add(mon_name, entity_addrvec_t(a), priority, weight);
+    }
+  };
+
+  const auto type = addr.get_type();
+  if (type != entity_addr_t::TYPE_ANY) {
+    // a v1: or v2: prefix was specified
+    if (addr.get_port() == 0) {
+      // fill in the default port for that protocol
+      if (type == entity_addr_t::TYPE_LEGACY) {
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+      } else if (type == entity_addr_t::TYPE_MSGR2) {
+        addr.set_port(CEPH_MON_PORT_IANA);
+      } else {
+        // neither legacy nor msgr2: nothing sensible to add
+        return;
+      }
+    }
+    add_if_new(name, addr);
+    return;
+  }
+
+  // no v1: or v2: prefix specified; guess from the port
+  const auto port = addr.get_port();
+
+  if (port == CEPH_MON_PORT_LEGACY) {
+    // legacy port implies legacy addr
+    addr.set_type(entity_addr_t::TYPE_LEGACY);
+    add_if_new(for_mkfs ? name : name + "-legacy", addr);
+    return;
+  }
+
+  if (port == CEPH_MON_PORT_IANA) {
+    // iana port implies msgr2 addr
+    addr.set_type(entity_addr_t::TYPE_MSGR2);
+    add_if_new(name, addr);
+    return;
+  }
+
+  if (port == 0) {
+    // no port; include both msgr2 and legacy ports
+    entity_addr_t v2 = addr;
+    v2.set_type(entity_addr_t::TYPE_MSGR2);
+    v2.set_port(CEPH_MON_PORT_IANA);
+    entity_addr_t v1 = addr;
+    v1.set_type(entity_addr_t::TYPE_LEGACY);
+    v1.set_port(CEPH_MON_PORT_LEGACY);
+
+    if (for_mkfs) {
+      // one monitor carrying both addresses
+      entity_addrvec_t av;
+      av.v.push_back(v2);
+      av.v.push_back(v1);
+      if (!contains(av)) {
+        add(name, av, priority, weight);
+      }
+    } else {
+      // two candidate monitors, one per protocol
+      add_if_new(name, v2);
+      add_if_new(name + "-legacy", v1);
+    }
+    return;
+  }
+
+  // some other port: assume msgr2 ...
+  addr.set_type(entity_addr_t::TYPE_MSGR2);
+  add_if_new(name, addr);
+  if (!for_mkfs) {
+    // ... and try legacy on the same port too
+    addr.set_type(entity_addr_t::TYPE_LEGACY);
+    add_if_new(name + "-legacy", addr);
+  }
+}
+
+/**
+ * Add one monitor per entry of @p addrs, naming them @p prefix followed
+ * by a letter that starts at 'a' and is incremented for each entry.
+ */
+void MonMap::init_with_addrs(const std::vector<entity_addrvec_t>& addrs,
+                             bool for_mkfs,
+                             std::string_view prefix)
+{
+  char suffix = 'a';
+  for (const auto& addrv : addrs) {
+    string mon_name{prefix};
+    mon_name += suffix;
+    ++suffix;
+
+    if (addrv.v.size() != 1) {
+      // they specified an addrvec, so let's assume they also specified
+      // the addr *type* and *port*.  (we could possibly improve this?)
+      add(mon_name, addrv, 0);
+      continue;
+    }
+    // a lone address may still need its type/port guessed
+    _add_ambiguous_addr(mon_name, addrv.front(), 0, 0, for_mkfs);
+  }
+}
+
+/**
+ * Parse @p ips as a list of addresses and add a monitor for each.
+ *
+ * @return 0 on success, -EINVAL if the list does not parse, -ENOENT if
+ *         it parses to nothing
+ */
+int MonMap::init_with_ips(const std::string& ips,
+			  bool for_mkfs,
+			  std::string_view prefix)
+{
+  vector<entity_addrvec_t> parsed;
+  const bool ok = parse_ip_port_vec(ips.c_str(), parsed,
+                                    entity_addr_t::TYPE_ANY);
+  if (!ok) {
+    return -EINVAL;
+  }
+  if (parsed.empty()) {
+    return -ENOENT;
+  }
+  init_with_addrs(parsed, for_mkfs, prefix);
+  return 0;
+}
+
+/**
+ * Run @p hostlist through resolve_addrs(), parse the result as a list
+ * of addresses and add a monitor for each, then recompute the legacy
+ * ranks.
+ *
+ * @return 0 on success, -EINVAL if resolution or parsing fails, -ENOENT
+ *         if no address results
+ */
+int MonMap::init_with_hosts(const std::string& hostlist,
+			    bool for_mkfs,
+			    std::string_view prefix)
+{
+  // maybe they passed us a DNS-resolvable name
+  char *resolved = resolve_addrs(hostlist.c_str());
+  if (resolved == nullptr) {
+    return -EINVAL;
+  }
+
+  vector<entity_addrvec_t> parsed;
+  const bool ok = parse_ip_port_vec(resolved, parsed,
+                                    entity_addr_t::TYPE_ANY);
+  // the resolved string is ours to release once parsing is done
+  free(resolved);
+
+  if (!ok) {
+    return -EINVAL;
+  }
+  if (parsed.empty()) {
+    return -ENOENT;
+  }
+  init_with_addrs(parsed, for_mkfs, prefix);
+  calc_legacy_ranks();
+  return 0;
+}
+
+/**
+ * Make the map contain exactly the monitors named in @p initial_members.
+ *
+ * Monitors not in the list are removed (their addresses are recorded in
+ * @p removed when it is non-null).  Listed monitors missing from the map
+ * are added: @p my_name with @p my_addrs, any other with a placeholder
+ * legacy address made unique through its nonce.  Legacy ranks are
+ * recomputed at the end.
+ */
+void MonMap::set_initial_members(CephContext *cct,
+				 list<std::string>& initial_members,
+				 string my_name,
+				 const entity_addrvec_t& my_addrs,
+				 set<entity_addrvec_t> *removed)
+{
+  // remove non-initial members; the index only advances when the
+  // monitor at that rank is kept, since remove() shrinks the map
+  for (unsigned i = 0; i < size(); ) {
+    const string n = get_name(i);
+    const bool is_initial =
+      std::find(initial_members.begin(), initial_members.end(), n)
+      != initial_members.end();
+
+    if (is_initial) {
+      lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addrs(i) << dendl;
+      ++i;
+      continue;
+    }
+
+    lgeneric_dout(cct, 1) << " removing " << n << " " << get_addrs(i)
+			  << dendl;
+    if (removed) {
+      removed->insert(get_addrs(i));
+    }
+    remove(n);
+    ceph_assert(!contains(n));
+  }
+
+  // add missing initial members
+  for (const auto& member : initial_members) {
+    if (contains(member)) {
+      continue;
+    }
+    if (member == my_name) {
+      lgeneric_dout(cct, 1) << " adding self " << member << " " << my_addrs
+			    << dendl;
+      add(member, my_addrs);
+    } else {
+      // placeholder address; bump the nonce until it is not in the map
+      entity_addr_t a;
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+      a.set_family(AF_INET);
+      int nonce = 1;
+      a.set_nonce(nonce);
+      while (contains(a)) {
+	a.set_nonce(++nonce);
+      }
+      lgeneric_dout(cct, 1) << " adding " << member << " " << a << dendl;
+      add(member, entity_addrvec_t(a));
+    }
+    ceph_assert(contains(member));
+  }
+  calc_legacy_ranks();
+}
+
+/**
+ * Add every monitor that has a "mon.<name>" section in the configuration.
+ *
+ * For each such section "mon addr" is looked up in [mon.<name>], [mon]
+ * and [global]; "mon priority" and "mon weight" are optional.  A monitor
+ * whose address is missing or unparsable, or whose priority/weight cannot
+ * be parsed or does not fit in 16 bits, is reported on @p errout and
+ * skipped.  Any existing entry with the same address or name is replaced.
+ *
+ * @param conf   configuration to read the sections from
+ * @param errout receives one diagnostic line per problem
+ * @return 0 on success (even if individual monitors were skipped),
+ *         -ENOENT if the sections cannot be listed
+ */
+int MonMap::init_with_config_file(const ConfigProxy& conf,
+                                  std::ostream& errout)
+{
+  std::vector<std::string> sections;
+  int ret = conf.get_all_sections(sections);
+  if (ret) {
+    errout << "Unable to find any monitors in the configuration "
+         << "file, because there was an error listing the sections. error "
+	 << ret << std::endl;
+    return -ENOENT;
+  }
+  std::vector<std::string> mon_names;
+  for (const auto& section : sections) {
+    // "mon.<name>" with a non-empty <name>
+    if (section.size() > 4 && section.compare(0, 4, "mon.") == 0) {
+      mon_names.push_back(section.substr(4));
+    }
+  }
+
+  // Read an optional 16-bit option.  Returns true when the option is
+  // absent (*out is left alone) or was parsed into *out; returns false
+  // after reporting a value that is unparsable or out of range.
+  auto read_u16 = [&](const std::vector<std::string>& search_path,
+                      const char *key,
+                      const char *label,
+                      const std::string& mon_name,
+                      uint16_t *out) {
+    std::string val;
+    if (conf.get_val_from_conf_file(search_path, key, val, false)) {
+      return true;
+    }
+    try {
+      const unsigned long parsed = std::stoul(val);
+      const uint16_t narrowed = static_cast<uint16_t>(parsed);
+      // reject values that would silently wrap when stored in 16 bits
+      if (narrowed != parsed) {
+        throw std::out_of_range(label);
+      }
+      *out = narrowed;
+      return true;
+    } catch (const std::logic_error&) {
+      errout << "unable to parse " << label << " for mon." << mon_name
+             << ": " << label << "='" << val << "'" << std::endl;
+      return false;
+    }
+  };
+
+  // Find an address for each monitor in the config file.
+  for (const auto& mon_name : mon_names) {
+    // most specific section first
+    std::vector<std::string> search_path;
+    std::string m_name("mon");
+    m_name += ".";
+    m_name += mon_name;
+    search_path.push_back(m_name);
+    search_path.push_back("mon");
+    search_path.push_back("global");
+    std::string val;
+    int res = conf.get_val_from_conf_file(search_path, "mon addr", val, true);
+    if (res) {
+      errout << "failed to get an address for mon." << mon_name
+             << ": error " << res << std::endl;
+      continue;
+    }
+    // the 'mon addr' field is a legacy field, so assume anything
+    // there on a weird port is a v1 address, and do not handle
+    // addrvecs.
+    entity_addr_t addr;
+    if (!addr.parse(val.c_str(), nullptr, entity_addr_t::TYPE_LEGACY)) {
+      errout << "unable to parse address for mon." << mon_name
+             << ": addr='" << val << "'" << std::endl;
+      continue;
+    }
+    if (addr.get_port() == 0) {
+      addr.set_port(CEPH_MON_PORT_LEGACY);
+    }
+    uint16_t priority = 0;
+    if (!read_u16(search_path, "mon priority", "priority", mon_name,
+                  &priority)) {
+      continue;
+    }
+    uint16_t weight = 0;
+    if (!read_u16(search_path, "mon weight", "weight", mon_name, &weight)) {
+      continue;
+    }
+
+    // make sure this mon isn't already in the map
+    if (contains(addr))
+      remove(get_name(addr));
+    if (contains(mon_name))
+      remove(mon_name);
+    _add_ambiguous_addr(mon_name, addr, priority, weight, false);
+  }
+  return 0;
+}
+
+/**
+ * In stretch mode, raise a MON_LOCATION_NOT_SET warning in @p checks
+ * listing every monitor whose crush_loc is empty.  Does nothing when
+ * stretch mode is off or every monitor has a location.
+ */
+void MonMap::check_health(health_check_map_t *checks) const
+{
+  if (!stretch_mode_enabled) {
+    return;
+  }
+
+  // one detail line per monitor without a location
+  list<string> detail;
+  for (const auto& [mon_name, info] : mon_info) {
+    if (!info.crush_loc.empty()) {
+      continue;
+    }
+    ostringstream line;
+    line << "mon " << mon_name << " has no location set while in stretch mode";
+    detail.push_back(line.str());
+  }
+  if (detail.empty()) {
+    return;
+  }
+
+  ostringstream summary;
+  summary << detail.size() << " monitor(s) have no location set while in stretch mode"
+	  << "; this may cause issues with failover, OSD connections, netsplit handling, etc";
+  auto& check = checks->add("MON_LOCATION_NOT_SET", HEALTH_WARN,
+			    summary.str(), detail.size());
+  check.detail.swap(detail);
+}
+
+#ifdef WITH_SEASTAR
+
+using namespace seastar;
+
+/**
+ * Read the file named @p monmap and decode its contents into this map.
+ *
+ * The file is opened read-only, its size is queried, exactly that many
+ * bytes are read through an input stream, and the resulting buffer is
+ * handed to decode().
+ *
+ * @param monmap path of the file to read
+ * @return a future that resolves once the map has been decoded
+ */
+seastar::future<> MonMap::read_monmap(const std::string& monmap)
+{
+  return open_file_dma(monmap, open_flags::ro).then([this] (file f) {
+    // the file handle is moved into the continuation that needs it
+    return f.size().then([this, f = std::move(f)](size_t s) {
+      // do_with() keeps the input stream alive until the read completes
+      return do_with(make_file_input_stream(f), [this, s](input_stream<char>& in) {
+        return in.read_exactly(s).then([this](temporary_buffer<char> buf) {
+          // wrap the bytes read in a bufferlist so decode() can consume them
+          ceph::buffer::list bl;
+          bl.push_back(ceph::buffer::ptr_node::create(
+            ceph::buffer::create(std::move(buf))));
+          decode(bl);
+        });
+      });
+    });
+  });
+}
+
+/**
+ * Look up SRV records for @p name, resolve each target and add it to
+ * the map through _add_ambiguous_addr().
+ *
+ * If @p name contains '_', the part before the first '_' is used as the
+ * service and the remainder as the domain.  std::system_error raised by
+ * the lookup chain is swallowed, so DNS failures simply leave the map
+ * unchanged.  @p for_mkfs is not consulted here; entries are always
+ * added with for_mkfs=false.
+ */
+seastar::future<> MonMap::init_with_dns_srv(bool for_mkfs, const std::string& name)
+{
+  string service = name;
+  string domain;
+  // check if domain is also provided and extract it from srv_name
+  if (const size_t sep = name.find("_"); sep != name.npos) {
+    domain = name.substr(sep + 1);
+    service = name.substr(0, sep);
+  }
+  return seastar::net::dns::get_srv_records(
+      seastar::net::dns_resolver::srv_proto::tcp,
+      service, domain).then([this](seastar::net::dns_resolver::srv_records records) {
+    return parallel_for_each(records, [this](auto record) {
+      return seastar::net::dns::resolve_name(record.target).then(
+          [record,this](seastar::net::inet_address resolved) {
+        // the resolved address does not contain ceph specific info like
+        // nonce or msgr proto (legacy, msgr2), so set entity_addr_t manually
+        entity_addr_t addr;
+        addr.set_type(entity_addr_t::TYPE_ANY);
+        addr.set_family(int(resolved.in_family()));
+        addr.set_port(record.port);
+        switch (resolved.in_family()) {
+        case seastar::net::inet_address::family::INET:
+          addr.in4_addr().sin_addr = resolved;
+          break;
+        case seastar::net::inet_address::family::INET6:
+          addr.in6_addr().sin6_addr = resolved;
+          break;
+        }
+        _add_ambiguous_addr(record.target,
+                            addr,
+                            record.priority,
+                            record.weight,
+                            false);
+      });
+    });
+  }).handle_exception_type([](const std::system_error& e) {
+    // ignore DNS failures
+    return seastar::make_ready_future<>();
+  });
+}
+
+/**
+ * Populate the map from the first source that yields monitors:
+ * the "mon_host" option (as addresses, then as host names), the
+ * "mon.<name>" sections of the configuration, and finally DNS SRV
+ * records.  Throws std::runtime_error when "mon_host" is set but
+ * unusable, when reading the configuration sections fails, or when no
+ * monitor is found at all.
+ */
+seastar::future<> MonMap::build_monmap(const crimson::common::ConfigProxy& conf,
+				       bool for_mkfs)
+{
+  // -m foo?
+  const auto mon_host = conf.get_val<std::string>("mon_host");
+  if (!mon_host.empty()) {
+    if (init_with_ips(mon_host, for_mkfs, "noname-") == 0) {
+      return make_ready_future<>();
+    }
+    // TODO: resolve_addrs() is a blocking call
+    const auto ret = init_with_hosts(mon_host, for_mkfs, "noname-");
+    if (ret != 0) {
+      throw std::runtime_error(cpp_strerror(ret));
+    }
+    return make_ready_future<>();
+  }
+
+  // What monitors are in the config file?
+  ostringstream errout;
+  if (init_with_config_file(conf, errout) < 0) {
+    throw std::runtime_error(errout.str());
+  }
+  if (size() > 0) {
+    return make_ready_future<>();
+  }
+
+  // no info found from conf options lets try use DNS SRV records
+  const string srv_name = conf.get_val<std::string>("mon_dns_srv_name");
+  return init_with_dns_srv(for_mkfs, srv_name).then([this] {
+    if (size() == 0) {
+      throw std::runtime_error("no monitors specified to connect to.");
+    }
+  });
+}
+
+/**
+ * Build the initial map.  If the "monmap" option names a file, the map
+ * is read from it.  Otherwise the fsid is taken from the configuration
+ * when set, the monitors are gathered by build_monmap(), both
+ * timestamps are set to the current time and the legacy ranks are
+ * computed.
+ */
+seastar::future<> MonMap::build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs)
+{
+  // file?
+  const auto monmap = conf.get_val<std::string>("monmap");
+  if (!monmap.empty()) {
+    return read_monmap(monmap);
+  }
+
+  // fsid from conf?
+  const auto new_fsid = conf.get_val<uuid_d>("fsid");
+  if (!new_fsid.is_zero()) {
+    fsid = new_fsid;
+  }
+
+  return build_monmap(conf, for_mkfs).then([this] {
+    created = ceph_clock_now();
+    last_changed = created;
+    calc_legacy_ranks();
+  });
+}
+
+#else  // WITH_SEASTAR
+
+/**
+ * Load the map from the file @p monmap.
+ *
+ * @return 0 on success; otherwise the negative error from read(), or
+ *         -EINVAL if decoding throws ceph::buffer::error.  A diagnostic
+ *         is written to @p errout on failure.
+ */
+int MonMap::init_with_monmap(const std::string& monmap, std::ostream& errout)
+{
+  int r;
+  try {
+    r = read(monmap.c_str());
+  } catch (ceph::buffer::error&) {
+    // the file was readable but its contents did not decode
+    r = -EINVAL;
+  }
+  if (r < 0) {
+    errout << "unable to read/decode monmap from " << monmap
+           << ": " << cpp_strerror(-r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Populate the map from DNS SRV records.
+ *
+ * @param cct      context handed to the resolver
+ * @param srv_name SRV service name; if it contains '_', the part after
+ *                 the first '_' is split off and passed to the resolver
+ *                 as the domain
+ * @param for_mkfs not consulted here; entries are always added with
+ *                 for_mkfs=false
+ * @param errout   receives a diagnostic when the lookup fails
+ * @return 0 on success, -1 if the SRV lookup fails
+ */
+int MonMap::init_with_dns_srv(CephContext* cct,
+                              std::string srv_name,
+			      bool for_mkfs,
+                              std::ostream& errout)
+{
+  string domain;
+  // check if domain is also provided and extract it from srv_name
+  if (const size_t idx = srv_name.find("_"); idx != string::npos) {
+    domain = srv_name.substr(idx + 1);
+    srv_name = srv_name.substr(0, idx);
+  }
+
+  map<string, DNSResolver::Record> records;
+  if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name,
+        DNSResolver::SRV_Protocol::TCP, domain, &records) != 0) {
+    // name the service that was actually queried, not a fixed default
+    errout << "unable to get monitor info from DNS SRV with service name: "
+           << srv_name << std::endl;
+    return -1;
+  }
+
+  for (auto& record : records) {
+    // SRV data says nothing about the messenger protocol, so mark the
+    // address as ambiguous and let _add_ambiguous_addr() sort it out
+    record.second.addr.set_type(entity_addr_t::TYPE_ANY);
+    _add_ambiguous_addr(record.first,
+                        record.second.addr,
+                        record.second.priority,
+                        record.second.weight,
+                        false);
+  }
+  return 0;
+}
+
+/**
+ * Build the initial map from the first source that applies:
+ * "mon_host_override", addresses held by the context, a "monmap" file,
+ * "mon_host", the "mon.<name>" config sections, and finally DNS SRV
+ * records.
+ *
+ * The "mon_host_override", context-address and "monmap" paths return
+ * straight away.  The remaining paths also set the election strategy
+ * and both timestamps and recompute the legacy ranks.
+ *
+ * @return 0 on success, a negative error otherwise (-ENOENT when no
+ *         monitor can be found); diagnostics go to @p errout
+ */
+int MonMap::build_initial(CephContext *cct, bool for_mkfs, ostream& errout)
+{
+  const auto& conf = cct->_conf;
+
+  // try the list as plain addresses first and fall back to resolving
+  // host names only when it does not parse
+  auto init_from_host_list = [&](const std::string& host_list) {
+    auto r = init_with_ips(host_list, for_mkfs, "noname-");
+    if (r == -EINVAL) {
+      r = init_with_hosts(host_list, for_mkfs, "noname-");
+    }
+    if (r < 0) {
+      errout << "unable to parse addrs in '" << host_list << "'"
+	     << std::endl;
+    }
+    return r;
+  };
+
+  // mon_host_override?
+  const auto mon_host_override = conf.get_val<std::string>("mon_host_override");
+  if (!mon_host_override.empty()) {
+    lgeneric_dout(cct, 1) << "Using mon_host_override " << mon_host_override << dendl;
+    return init_from_host_list(mon_host_override);
+  }
+
+  // cct?
+  auto addrs = cct->get_mon_addrs();
+  if (addrs != nullptr && (addrs->size() > 0)) {
+    init_with_addrs(*addrs, for_mkfs, "noname-");
+    return 0;
+  }
+
+  // file?
+  const auto monmap = conf.get_val<std::string>("monmap");
+  if (!monmap.empty()) {
+    return init_with_monmap(monmap, errout);
+  }
+
+  // fsid from conf?
+  const auto new_fsid = conf.get_val<uuid_d>("fsid");
+  if (!new_fsid.is_zero()) {
+    fsid = new_fsid;
+  }
+
+  // -m foo?
+  const auto mon_host = conf.get_val<std::string>("mon_host");
+  if (!mon_host.empty()) {
+    const auto ret = init_from_host_list(mon_host);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  if (size() == 0) {
+    // What monitors are in the config file?
+    const auto ret = init_with_config_file(conf, errout);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+  if (size() == 0) {
+    // no info found from conf options lets try use DNS SRV records
+    string srv_name = conf.get_val<std::string>("mon_dns_srv_name");
+    if (init_with_dns_srv(cct, srv_name, for_mkfs, errout) < 0) {
+      return -ENOENT;
+    }
+  }
+  if (size() == 0) {
+    errout << "no monitors specified to connect to." << std::endl;
+    return -ENOENT;
+  }
+
+  strategy = static_cast<election_strategy>(conf.get_val<uint64_t>("mon_election_default_strategy"));
+  created = ceph_clock_now();
+  last_changed = created;
+  calc_legacy_ranks();
+  return 0;
+}
+#endif	// WITH_SEASTAR
diff --git a/src/mon/MonMap.h b/src/mon/MonMap.h
new file mode 100644
index 000000000..02304edfd
--- /dev/null
+++ b/src/mon/MonMap.h
@@ -0,0 +1,546 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MONMAP_H
+#define CEPH_MONMAP_H
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/future.hh>
+#endif
+
+#include "common/config_fwd.h"
+#include "common/ceph_releases.h"
+
+#include "include/err.h"
+#include "include/types.h"
+
+#include "mon/mon_types.h"
+#include "msg/Message.h"
+
+class health_check_map_t;
+
+#ifdef WITH_SEASTAR
+namespace crimson::common {
+  class ConfigProxy;
+}
+#endif
+
+namespace ceph {
+  class Formatter;
+}
+
+struct mon_info_t {
+  /**
+   * monitor name
+   *
+   * i.e., 'foo' in 'mon.foo'
+   */
+  std::string name;
+  /**
+   * monitor's public address(es)
+   *
+   * public facing address(es), used to communicate with all clients
+   * and with other monitors.
+   */
+  entity_addrvec_t public_addrs;
+  /**
+   * the priority of the mon; the lower the value, the more preferred
+   */
+  uint16_t priority{0};
+  uint16_t weight{0};  ///< defaults to 0; NOTE(review): how consumers use it is not visible here
+
+  /**
+   * The location of the monitor, in CRUSH hierarchy terms
+   */
+  std::map<std::string,std::string> crush_loc;
+
+  // <REMOVE ME> single-address constructor; weight keeps its default of 0
+  mon_info_t(const std::string& n, const entity_addr_t& p_addr, uint16_t p)
+    : name(n), public_addrs(p_addr), priority(p)
+  {}
+  // </REMOVE ME>
+  // address-vector constructors: with and without explicit priority/weight
+  mon_info_t(const std::string& n, const entity_addrvec_t& p_addrs,
+             uint16_t p, uint16_t w)
+    : name(n), public_addrs(p_addrs), priority(p), weight(w)
+  {}
+  mon_info_t(const std::string &n, const entity_addrvec_t& p_addrs)
+    : name(n), public_addrs(p_addrs)
+  { }  // priority and weight keep their defaults (0)
+
+  mon_info_t() { }  // empty name and addrs, priority and weight 0
+
+  // encode() takes the feature bits of the receiver; decode() needs none
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void print(std::ostream& out) const;  // backs operator<< for mon_info_t
+};
+WRITE_CLASS_ENCODER_FEATURES(mon_info_t)
+
+inline std::ostream& operator<<(std::ostream& out, const mon_info_t& mon) {
+  mon.print(out);  // all formatting lives in mon_info_t::print()
+  return out;
+}
+
+class MonMap {
+ public:
+  epoch_t epoch;       // what epoch/version of the monmap
+  uuid_d fsid;
+  utime_t last_changed;
+  utime_t created;
+
+  std::map<std::string, mon_info_t> mon_info;
+  std::map<entity_addr_t, std::string> addr_mons;
+
+  std::vector<std::string> ranks;
+  /* ranks which were removed when this map took effect.
+     There should only be one at a time, but leave support
+     for arbitrary numbers just to be safe. */
+  std::set<int> removed_ranks;
+
+  /**
+   * Persistent Features are all those features that once set on a
+   * monmap cannot, and should not, be removed. These will define the
+   * non-negotiable features that a given monitor must support to
+   * properly operate in a given quorum.
+   *
+   * Should be reserved for features that we really want to make sure
+   * are sticky, and are important enough to tolerate not being able
+   * to downgrade a monitor.
+   */
+  mon_feature_t persistent_features;
+  /**
+   * Optional Features are all those features that can be enabled or
+   * disabled following a given criteria -- e.g., user-mandated via the
+   * cli --, and act much like indicators of what the cluster currently
+   * supports.
+   *
+   * They are by no means "optional" in the sense that monitors can
+   * ignore them. Just that they are not persistent.
+   */
+  mon_feature_t optional_features;
+
+  /**
+   * Returns the set of features required by this monmap.
+   *
+   * The features required by this monmap are the union of all the
+   * currently set persistent features and the currently set optional
+   * features.
+   *
+   * @returns the set of features required by this monmap
+   */
+  mon_feature_t get_required_features() const {
+    return (persistent_features | optional_features);
+  }
+
+  // upgrade gate
+  ceph_release_t min_mon_release{ceph_release_t::unknown};
+
+  void _add_ambiguous_addr(const std::string& name,
+                           entity_addr_t addr,
+                           int priority,
+                           int weight,
+                           bool for_mkfs);
+
+  enum election_strategy {
+    // Keep in sync with ElectionLogic.h!
+    CLASSIC = 1, // the original rank-based one
+    DISALLOW = 2, // disallow a set from being leader
+    CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections
+  };
+  election_strategy strategy = CLASSIC;
+  std::set<std::string> disallowed_leaders; // can't be leader under CONNECTIVITY/DISALLOW
+  bool stretch_mode_enabled = false;
+  std::string tiebreaker_mon; // std:: spelled out: a header must not rely on a using-directive
+  std::set<std::string> stretch_marked_down_mons; // can't be leader until fully recovered
+
+public:
+  void calc_legacy_ranks();
+  void calc_addr_mons() {
+    // rebuild the addr -> name index from scratch out of mon_info
+    addr_mons.clear();
+    for (const auto& [mon_name, info] : mon_info) {
+      for (const auto& addr : info.public_addrs.v) {
+	addr_mons[addr] = mon_name;
+      }
+    }
+  }
+
+  /// a fresh map starts at epoch 0
+  MonMap() : epoch(0) {}
+
+  uuid_d& get_fsid() { return fsid; }
+
+  /// number of monitors known to this map
+  unsigned size() const { return mon_info.size(); }
+
+  /**
+   * Smallest strict majority of @p total_mons monitors.
+   * Passing 0 (the default) means "use the current size()".
+   */
+  unsigned min_quorum_size(unsigned total_mons=0) const {
+    const unsigned n = (total_mons != 0) ? total_mons : size();
+    return n / 2 + 1;
+  }
+
+  epoch_t get_epoch() const { return epoch; }
+  void set_epoch(epoch_t e) { epoch = e; }
+
+  /**
+   * Obtain list of public facing addresses
+   *
+   * @param ls list to populate with the monitors' addresses
+   */
+  void list_addrs(std::list<entity_addr_t>& ls) const {
+    // appends to ls in mon_info order; whatever ls already holds is kept
+    for (const auto& entry : mon_info) {
+      const auto& addrs = entry.second.public_addrs.v;
+      ls.insert(ls.end(), addrs.begin(), addrs.end());
+    }
+  }
+
+  /**
+   * Add new monitor to the monmap
+   *
+   * @param m monitor info of the new monitor
+   */
+  void add(const mon_info_t& m) {
+    ceph_assert(mon_info.count(m.name) == 0);  // names must be unique
+    for (auto& a : m.public_addrs.v) {
+      ceph_assert(addr_mons.count(a) == 0);  // and so must every address
+    }
+    mon_info[m.name] = m;
+    if (get_required_features().contains_all(
+	  ceph::features::mon::FEATURE_NAUTILUS)) {
+      ranks.push_back(m.name);  // the new monitor takes the last rank
+      ceph_assert(ranks.size() == mon_info.size());
+    } else {
+      calc_legacy_ranks();
+    }
+    calc_addr_mons();  // refresh the addr -> name index
+  }
+
+  /**
+   * Add new monitor to the monmap
+   *
+   * @param name Monitor name (i.e., 'foo' in 'mon.foo')
+   * @param addrv Monitor's public address(es); priority and weight default to 0
+   */
+  void add(const std::string &name, const entity_addrvec_t &addrv,
+	   uint16_t priority=0, uint16_t weight=0) {
+    add(mon_info_t(name, addrv, priority, weight));
+  }
+
+  /**
+   * Remove monitor @p name (i.e., 'foo' in 'mon.foo') from the monmap
+   */
+  void remove(const std::string &name) {
+    // this must match what we do in ConnectionTracker::notify_rank_removed
+    ceph_assert(mon_info.count(name));
+    const int rank = get_rank(name);  // look it up before anything is erased
+    mon_info.erase(name);
+    disallowed_leaders.erase(name);
+    ceph_assert(mon_info.count(name) == 0);
+    if (rank >= 0) {
+      removed_ranks.insert(rank);
+    }
+    if (get_required_features().contains_all(
+	  ceph::features::mon::FEATURE_NAUTILUS)) {
+      if (rank >= 0) {  // erase(end()) for an unranked name would be UB
+	ranks.erase(ranks.begin() + rank);
+      }
+      ceph_assert(ranks.size() == mon_info.size());
+    } else {
+      calc_legacy_ranks();
+    }
+    calc_addr_mons();
+  }
+
+  /**
+   * Rename monitor @p oldname (i.e., 'foo' in 'mon.foo') to @p newname.
+   * @p oldname must exist and @p newname must not; both are asserted.
+   */
+  void rename(std::string oldname, std::string newname) {
+    ceph_assert(contains(oldname));
+    ceph_assert(!contains(newname));
+    mon_info[newname] = mon_info[oldname];
+    mon_info.erase(oldname);
+    mon_info[newname].name = newname;
+    if (get_required_features().contains_all(
+	  ceph::features::mon::FEATURE_NAUTILUS)) {
+      auto r = std::find(ranks.begin(), ranks.end(), oldname);
+      ceph_assert(r != ranks.end());  // writing through end() would be UB
+      *r = newname;
+      ceph_assert(ranks.size() == mon_info.size());
+    } else {
+      calc_legacy_ranks();
+    }
+    calc_addr_mons();
+  }
+
+  int set_rank(const std::string& name, int rank) {
+    // move an already-ranked monitor to slot `rank`; the others shift over.
+    // @returns 0 on success, -ENOENT or -EINVAL as noted below
+    const int cur = get_rank(name);
+    if (cur < 0)
+      return -ENOENT;  // name is not ranked
+    if (rank < 0 || rank >= static_cast<int>(ranks.size()))
+      return -EINVAL;  // no such slot
+    if (cur == rank)
+      return 0;  // already in place
+    ranks.erase(ranks.begin() + cur);
+    ranks.insert(ranks.begin() + rank, name);
+    return 0;
+  }
+
+  bool contains(const std::string& name) const {
+    return mon_info.find(name) != mon_info.end();
+  }
+
+  /**
+   * Check if monmap contains a monitor with address @p a
+   *
+   * @note only the monitors' public addresses (public_addrs) are searched.
+   *
+   * @param a monitor address
+   * @param name if non-null, receives the name of the matching monitor
+   * @returns true if monmap contains a monitor with address @p a;
+   *          false otherwise.
+   */
+  bool contains(const entity_addr_t &a, std::string *name=nullptr) const {
+    for (const auto& [mon_name, info] : mon_info) {
+      const auto& addrs = info.public_addrs.v;
+      if (std::find(addrs.begin(), addrs.end(), a) != addrs.end()) {
+	if (name) {
+	  *name = mon_name;
+	}
+	return true;
+      }
+    }
+    return false;
+  }
+  bool contains(const entity_addrvec_t &av, std::string *name=nullptr) const {
+    // a monitor matches as soon as any of its public addrs appears in av
+    for (const auto& [mon_name, info] : mon_info) {
+      for (const auto& mine : info.public_addrs.v) {
+	if (std::find(av.v.begin(), av.v.end(), mine) == av.v.end()) {
+	  continue;
+	}
+	if (name) {
+	  *name = mon_name;
+	}
+	return true;
+      }
+    }
+    return false;
+  }
+
+  /// name of the monitor holding rank @p n (asserts that n is in range)
+  std::string get_name(unsigned n) const {
+    ceph_assert(n < ranks.size());
+    return ranks[n];
+  }
+  /// name of the monitor owning address @p a, or "" if there is none
+  std::string get_name(const entity_addr_t& a) const {
+    if (auto p = addr_mons.find(a); p != addr_mons.end())
+      return p->second;
+    return {};
+  }
+  /// name for the first address in @p av that addr_mons knows, else ""
+  std::string get_name(const entity_addrvec_t& av) const {
+    for (const auto& addr : av.v) {
+      if (auto p = addr_mons.find(addr); p != addr_mons.end())
+	return p->second;
+    }
+    return {};
+  }
+
+  /// rank (index into ranks) of monitor @p n, or -1 if it has none
+  int get_rank(const std::string& n) const {
+    const auto found = std::find(ranks.begin(), ranks.end(), n);
+    if (found == ranks.end()) {
+      return -1;
+    }
+    return std::distance(ranks.begin(), found);
+  }
+  /// rank of the monitor owning address @p a, or -1 if unknown
+  int get_rank(const entity_addr_t& a) const {
+    const std::string n = get_name(a);
+    if (n.empty())
+      return -1;
+    return get_rank(n);
+  }
+  /// rank of the monitor owning any address in @p av, or -1 if unknown
+  int get_rank(const entity_addrvec_t& av) const {
+    const std::string n = get_name(av);
+    if (n.empty())
+      return -1;
+    return get_rank(n);
+  }
+  bool get_addr_name(const entity_addr_t& a, std::string& name) const {
+    // single find(): no double lookup, and callable on a const MonMap
+    const auto p = addr_mons.find(a);
+    if (p != addr_mons.end()) name = p->second;
+    return p != addr_mons.end();
+  }
+
+  const entity_addrvec_t& get_addrs(const std::string& n) const {
+    auto p = mon_info.find(n);  // single lookup; asserts that n is in the map
+    ceph_assert(p != mon_info.end());
+    return p->second.public_addrs;
+  }
+  const entity_addrvec_t& get_addrs(unsigned m) const {
+    ceph_assert(m < ranks.size());  // m is a rank
+    return get_addrs(ranks[m]);
+  }
+  void set_addrvec(const std::string& n, const entity_addrvec_t& a) {
+    ceph_assert(mon_info.count(n));
+    mon_info[n].public_addrs = a;
+    calc_addr_mons();  // keep the addr -> name index in step
+  }
+  uint16_t get_priority(const std::string& n) const {
+    auto it = mon_info.find(n);
+    ceph_assert(it != mon_info.end());  // n must name a monitor in the map
+    return it->second.priority;
+  }
+  uint16_t get_weight(const std::string& n) const {
+    auto it = mon_info.find(n);
+    ceph_assert(it != mon_info.end());  // n must name a monitor in the map
+    return it->second.weight;
+  }
+  void set_weight(const std::string& n, uint16_t v) {
+    auto it = mon_info.find(n);
+    ceph_assert(it != mon_info.end());  // n must name a monitor in the map
+    it->second.weight = v;
+  }
+
+  void encode(ceph::buffer::list& blist, uint64_t con_features) const;
+  void decode(ceph::buffer::list& blist) {  // convenience: decode from the start of blist
+    auto p = std::cbegin(blist);
+    decode(p);
+  }
+  void decode(ceph::buffer::list::const_iterator& p);
+
+  void generate_fsid() {  // replaces fsid via uuid_d::generate_random()
+    fsid.generate_random();
+  }
+
+  // read from/write to the file named fn
+  int write(const char *fn);
+  int read(const char *fn);
+
+  /**
+   * build an initial bootstrap monmap from conf
+   *
+   * Build an initial bootstrap monmap from the config, trying in this order:
+   *   1 monmap   -- an explicitly provided monmap
+   *   2 mon_host -- list of monitors
+   *   3 config [mon.*] sections, and 'mon addr' fields in those sections
+   *   4 DNS SRV records looked up under mon_dns_srv_name
+   *
+   * @param cct context (and associated config)
+   * @param for_mkfs passed through to the init_with_* helpers
+   * @param errout std::ostream to send error messages to
+   */
+#ifdef WITH_SEASTAR
+  seastar::future<> build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs);
+#else
+  int build_initial(CephContext *cct, bool for_mkfs, std::ostream& errout);
+#endif
+  /**
+   * filter monmap given a set of initial members.
+   *
+   * Remove mons that aren't in the initial_members list.  Add missing mons
+   * with dummy IPs (blank IPv4, with a non-zero nonce); if the name matches
+   * my_name, then my_addrs will be used in place of a dummy addr.
+   *
+   * @param cct context; NOTE(review): its use is not visible from this declaration
+   * @param initial_members list of initial member names
+   * @param my_name name of self, can be blank
+   * @param my_addrs my addrs
+   * @param removed optional pointer to set to insert removed mon addrs to
+   */
+  void set_initial_members(CephContext *cct,
+			   std::list<std::string>& initial_members,
+			   std::string my_name,
+			   const entity_addrvec_t& my_addrs,
+			   std::set<entity_addrvec_t> *removed);
+
+  void print(std::ostream& out) const;
+  void print_summary(std::ostream& out) const;
+  void dump(ceph::Formatter *f) const;
+  void dump_summary(ceph::Formatter *f) const;
+
+  void check_health(health_check_map_t *checks) const;
+
+  static void generate_test_instances(std::list<MonMap*>& o);
+protected:
+  /**
+   * build a monmap from a list of entity_addrvec_t's
+   *
+   * Give mons dummy names.
+   * @param addrs  list of entity_addrvec_t's
+   * @param for_mkfs NOTE(review): presumably set when building a map for mkfs -- confirm
+   * @param prefix prefix to prepend to generated mon names
+   */
+  void init_with_addrs(const std::vector<entity_addrvec_t>& addrs,
+                       bool for_mkfs,
+                       std::string_view prefix);
+  /**
+   * build a monmap from a list of ips
+   *
+   * Give mons dummy names.
+   * @param ips    list of ips, space or comma separated
+   * @param for_mkfs NOTE(review): presumably set when building a map for mkfs -- confirm
+   * @param prefix prefix to prepend to generated mon names
+   * @return 0 for success, -errno on error
+   */
+  int init_with_ips(const std::string& ips,
+		    bool for_mkfs,
+		    std::string_view prefix);
+  /**
+   * build a monmap from a list of hostnames
+   *
+   * Give mons dummy names.
+   * @param hostlist list of hostnames, space or comma separated
+   * @param for_mkfs NOTE(review): presumably set when building a map for mkfs -- confirm
+   * @param prefix prefix to prepend to generated mon names
+   * @return 0 for success, -errno on error
+   */
+  int init_with_hosts(const std::string& hostlist,
+		      bool for_mkfs,
+		      std::string_view prefix);
+  int init_with_config_file(const ConfigProxy& conf, std::ostream& errout);  // monitors listed in the config file
+#ifdef WITH_SEASTAR
+  seastar::future<> read_monmap(const std::string& monmap);
+  /// try to build monmap with different settings, like
+  /// mon_host, mon* sections, and mon_dns_srv_name
+  seastar::future<> build_monmap(const crimson::common::ConfigProxy& conf, bool for_mkfs);
+  /// initialize monmap by resolving given service name
+  seastar::future<> init_with_dns_srv(bool for_mkfs, const std::string& name);
+#else
+  /// read from encoded monmap file
+  int init_with_monmap(const std::string& monmap, std::ostream& errout);
+  int init_with_dns_srv(CephContext* cct, std::string srv_name, bool for_mkfs,
+			std::ostream& errout);
+#endif
+};
+WRITE_CLASS_ENCODER_FEATURES(MonMap)
+
+inline std::ostream& operator<<(std::ostream &out, const MonMap &m) {
+  m.print_summary(out);  // streaming a MonMap emits print_summary(), not print()
+  return out;
+}
+
+#endif
diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h
new file mode 100644
index 000000000..73275e81e
--- /dev/null
+++ b/src/mon/MonOpRequest.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat <contact@redhat.com>
+ * Copyright (C) 2015 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef MON_OPREQUEST_H_
+#define MON_OPREQUEST_H_
+#include <iosfwd>
+#include <stdint.h>
+
+#include "common/TrackedOp.h"
+#include "mon/Session.h"
+#include "msg/Message.h"
+
+struct MonOpRequest : public TrackedOp {
+  friend class OpTracker;
+
+  // lifecycle markers: each one records a fixed event string on this op
+  void mark_dispatch() { mark_event("monitor_dispatch"); }
+  void mark_wait_for_quorum() { mark_event("wait_for_quorum"); }
+  void mark_zap() { mark_event("monitor_zap"); }
+  void mark_forwarded() {
+    mark_event("forwarded");
+    forwarded_to_leader = true;  // also reported by _dump()
+  }
+
+  /**
+   * Record @p event on behalf of @p service.
+   *
+   * The event string that gets recorded is "<service>:<event>".
+   */
+  void mark_svc_event(const std::string &service, const std::string &event) {
+    std::string s = service + ":" + event;
+    mark_event(s);
+  }
+
+  // per-service shorthands for mark_svc_event(); the tag is the first argument
+  void mark_logmon_event(const std::string &event) {
+    mark_svc_event("logm", event);
+  }
+  void mark_osdmon_event(const std::string &event) {
+    mark_svc_event("osdmap", event);
+  }
+  void mark_pgmon_event(const std::string &event) {
+    mark_svc_event("pgmap", event);
+  }
+  void mark_mdsmon_event(const std::string &event) {
+    mark_svc_event("mdsmap", event);
+  }
+  void mark_authmon_event(const std::string &event) {
+    mark_svc_event("auth", event);
+  }
+  void mark_paxos_event(const std::string &event) {
+    mark_svc_event("paxos", event);
+  }
+
+
+  enum op_type_t {
+    OP_TYPE_NONE    = 0,      ///< no type defined (default)
+    OP_TYPE_SERVICE,          ///< belongs to a Paxos Service or similar
+    OP_TYPE_MONITOR,          ///< belongs to the Monitor class
+    OP_TYPE_ELECTION,         ///< belongs to the Elector class
+    OP_TYPE_PAXOS,            ///< refers to Paxos messages
+    OP_TYPE_COMMAND,          ///< is a command
+  };
+
+  MonOpRequest(const MonOpRequest &other) = delete;
+  MonOpRequest & operator = (const MonOpRequest &other) = delete;
+
+private:
+  Message *request;
+  utime_t dequeued_time;
+  RefCountedPtr session;
+  ConnectionRef con;
+  bool forwarded_to_leader;
+  op_type_t op_type;
+
+  MonOpRequest(Message *req, OpTracker *tracker) :
+    TrackedOp(tracker,
+      // use the receive stamp when there is one; req is guarded here too,
+      // since the body below treats a null req as a valid input
+      (req && !req->get_recv_stamp().is_zero()) ?
+      req->get_recv_stamp() : ceph_clock_now()),
+    request(req),
+    con(NULL),
+    forwarded_to_leader(false),
+    op_type(OP_TYPE_NONE)
+  {
+    if (req) {
+      con = req->get_connection();
+      if (con) session = con->get_priv();
+    }
+  }
+
+  void _dump(ceph::Formatter *f) const override {
+    // output layout: an "events" array followed by an "info" object
+    f->open_array_section("events");
+    std::lock_guard l(lock);  // stays held until the function returns
+    for (auto ev = events.begin(); ev != events.end(); ++ev) {
+      f->open_object_section("event");
+      f->dump_string("event", ev->str);
+      f->dump_stream("time") << ev->stamp;
+
+      // an event lasts until the next one is recorded; for the final event
+      // the span from op initiation to the last event is reported instead
+      const auto next_ev = ev + 1;
+      if (next_ev < events.end()) {
+	f->dump_float("duration", next_ev->stamp - ev->stamp);
+      } else {
+	f->dump_float("duration", events.rbegin()->stamp - get_initiated());
+      }
+      f->close_section();
+    }
+    f->close_section();
+
+    f->open_object_section("info");
+    f->dump_int("seq", seq);
+    f->dump_bool("src_is_mon", is_src_mon());
+    f->dump_stream("source") << request->get_source_inst();
+    f->dump_bool("forwarded_to_leader", forwarded_to_leader);
+    f->close_section();
+  }
+
+protected:
+  void _dump_op_descriptor_unlocked(std::ostream& stream) const override {
+    get_req()->print(stream);  // the descriptor is whatever the message prints
+  }
+
+public:
+  ~MonOpRequest() override {
+    if (request) request->put();  // null is tolerated, as in get_req_type()
+  }
+
+  /// the session attached to this op, viewed as a MonSession (may be null)
+  MonSession *get_session() const {
+    return static_cast<MonSession*>(session.get());
+  }
+
+  /// the wrapped message, statically cast to T (no runtime type check)
+  template<class T>
+  T *get_req() const { return static_cast<T*>(request); }
+  Message *get_req() const { return get_req<Message>(); }
+
+  /// message type, or 0 when there is no message
+  int get_req_type() const { return request ? request->get_type() : 0; }
+
+  ConnectionRef get_connection() { return con; }
+
+  void set_session(MonSession *s) { session.reset(s); }
+
+  /// true when the peer type of the connection has the MON bit set
+  bool is_src_mon() const {
+    if (!con) {
+      return false;
+    }
+    return (con->get_peer_type() & CEPH_ENTITY_TYPE_MON) != 0;
+  }
+
+  typedef boost::intrusive_ptr<MonOpRequest> Ref;
+
+  void set_op_type(op_type_t t) {
+    op_type = t;
+  }
+  void set_type_service() {
+    set_op_type(OP_TYPE_SERVICE);
+  }
+  void set_type_monitor() {
+    set_op_type(OP_TYPE_MONITOR);
+  }
+  void set_type_paxos() {
+    set_op_type(OP_TYPE_PAXOS);
+  }
+  // pings share OP_TYPE_ELECTION; op_type_t has no separate ping value
+  void set_type_election_or_ping() {
+    set_op_type(OP_TYPE_ELECTION);
+  }
+  void set_type_command() {
+    set_op_type(OP_TYPE_COMMAND);
+  }
+
+  // read-only queries are const so they also work on a const MonOpRequest
+  op_type_t get_op_type() const { return op_type; }
+
+  bool is_type_service() const {
+    return (get_op_type() == OP_TYPE_SERVICE);
+  }
+  bool is_type_monitor() const {
+    return (get_op_type() == OP_TYPE_MONITOR);
+  }
+  bool is_type_paxos() const {
+    return (get_op_type() == OP_TYPE_PAXOS);
+  }
+  bool is_type_election_or_ping() const {
+    return (get_op_type() == OP_TYPE_ELECTION);
+  }
+  bool is_type_command() const {
+    return (get_op_type() == OP_TYPE_COMMAND);
+  }
+};
+
+typedef MonOpRequest::Ref MonOpRequestRef;
+
+// Context that tags its op with the callback outcome before running _finish()
+struct C_MonOp : public Context
+{
+  MonOpRequestRef op;  // may be empty; every use below checks first
+
+  explicit C_MonOp(MonOpRequestRef o) : op(o) { }
+
+  void finish(int r) override {
+    if (op) {
+      switch (r) {
+      case -ECANCELED: op->mark_event("callback canceled"); break;
+      case -EAGAIN:    op->mark_event("callback retry");    break;
+      case 0:          op->mark_event("callback finished"); break;
+      }
+    }
+    _finish(r);
+  }
+
+  void mark_op_event(const std::string &event) {
+    if (op) op->mark_event(event);
+  }
+
+  /// subclass hook; always invoked by finish(), with the same r
+  virtual void _finish(int r) = 0;
+};
+
+#endif /* MON_OPREQUEST_H_ */
diff --git a/src/mon/MonSub.cc b/src/mon/MonSub.cc
new file mode 100644
index 000000000..a2c60ba91
--- /dev/null
+++ b/src/mon/MonSub.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonSub.h"
+
+// true while there are subscriptions that have not been sent yet
+bool MonSub::have_new() const { return !sub_new.empty(); }
+
+// true once the clock has passed renew_after
+bool MonSub::need_renew() const
+{
+  return renew_after < clock::now();
+}
+
+void MonSub::renewed()
+{
+  if (clock::is_zero(renew_sent)) {
+    renew_sent = clock::now();
+  }
+  // fold pending subs into sub_sent; a pending entry wins over a sent one
+  for (const auto& [what, sub] : sub_new)
+    sub_sent[what] = sub;
+  sub_new.clear();
+}
+
+void MonSub::acked(uint32_t interval)
+{
+  if (clock::is_zero(renew_sent)) {
+    return;  // no renewal outstanding
+  }
+  // NOTE: this is only needed for legacy (infernalis or older)
+  // mons; see MonClient::tick().
+  renew_after = renew_sent + ceph::make_timespan(interval / 2.0);
+  renew_sent = clock::zero();
+}
+
+bool MonSub::reload()
+{
+  // put every sent subscription back into the pending set; map::insert
+  // never overwrites, so a subscription that is already pending keeps
+  // its own start/flags
+  sub_new.insert(sub_sent.begin(), sub_sent.end());
+
+  return have_new();
+}
+
+void MonSub::got(const std::string& what, version_t have)
+{
+  // bring the entry for `what` in `subs` up to date with `have`;
+  // returns false when `subs` has no entry for it at all
+  const auto advance = [&what, have](auto& subs) {
+    const auto i = subs.find(what);
+    if (i == subs.end()) {
+      return false;
+    }
+    if (auto& sub = i->second; sub.start <= have) {
+      if (sub.flags & CEPH_SUBSCRIBE_ONETIME) {
+        subs.erase(i);  // a one-time sub is done once it is satisfied
+      } else {
+        sub.start = have + 1;
+      }
+    }
+    return true;
+  };
+  // a pending entry takes precedence: sub_sent is only consulted when
+  // sub_new does not know `what`
+  if (!advance(sub_new)) advance(sub_sent);
+}
+
+bool MonSub::want(const std::string& what, version_t start, unsigned flags)
+{
+  // does `subs` already hold exactly this request for `what`?
+  const auto has_same = [&](const auto& subs) {
+    const auto sub = subs.find(what);
+    return sub != subs.end() &&
+      sub->second.start == start &&
+      sub->second.flags == flags;
+  };
+  if (has_same(sub_new) || has_same(sub_sent)) {
+    return false;  // nothing new to ask for
+  }
+  // otherwise (re)queue it, replacing whatever was pending under that name
+  auto& item = sub_new[what];
+  item.start = start;
+  item.flags = flags;
+  return true;
+}
+
+bool MonSub::inc_want(const std::string& what, version_t start, unsigned flags)
+{
+  if (auto pending = sub_new.find(what); pending != sub_new.end()) {
+    if (pending->second.start >= start) {
+      return false;  // never move a queued start point backwards
+    }
+    pending->second.start = start;
+    pending->second.flags = flags;
+    return true;
+  }
+  // not queued: skip it when what was already sent covers `start`
+  if (auto sent = sub_sent.find(what);
+      sent != sub_sent.end() && sent->second.start >= start) {
+    return false;
+  }
+  auto& item = sub_new[what];
+  item.start = start;
+  item.flags = flags;
+  return true;
+}
+
+void MonSub::unwant(const std::string& what)
+{
+  sub_new.erase(what);   // drop it whether it is still pending ...
+  sub_sent.erase(what);  // ... or has already been sent
+}
diff --git a/src/mon/MonSub.h b/src/mon/MonSub.h
new file mode 100644
index 000000000..8ff5a8f18
--- /dev/null
+++ b/src/mon/MonSub.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "common/ceph_time.h"
+#include "include/types.h"
+
+// mon subscriptions
+// mon subscriptions: tracks which subs are pending ("new") and which were sent
+class MonSub
+{
+public:
+  // @returns true if there are any "new" subscriptions
+  bool have_new() const;
+  auto get_subs() const {  // NB: returns a copy of the "new" (unsent) subs
+    return sub_new;
+  }
+  bool need_renew() const;  // true once the current time is past renew_after
+  // change the status of "new" subscriptions to "sent"
+  void renewed();
+  // the peer acked the subscription request; next renewal is due interval/2 after renew_sent
+  void acked(uint32_t interval);
+  void got(const std::string& what, version_t version);  // advances, or drops a one-time, sub
+  // revert the status of subscriptions from "sent" to "new"
+  // @returns true if there are any pending "new" subscriptions
+  bool reload();
+  // add a new subscription; @returns false if an identical one is already known
+  bool want(const std::string& what, version_t start, unsigned flags);
+  // increment the requested subscription start point. If you do increase
+  // the value, apply the passed-in flags as well; otherwise do nothing.
+  bool inc_want(const std::string& what, version_t start, unsigned flags);
+  // cancel a subscription
+  void unwant(const std::string& what);
+private:
+  // my subs, and current versions
+  std::map<std::string,ceph_mon_subscribe_item> sub_sent;
+  // unsent new subs
+  std::map<std::string,ceph_mon_subscribe_item> sub_new;
+  using time_point = ceph::coarse_mono_time;
+  using clock = typename time_point::clock;
+  time_point renew_sent;   // set by renewed() if zero; reset to zero by acked()
+  time_point renew_after;  // need_renew() turns true after this point
+};
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
new file mode 100644
index 000000000..ce7ec37d9
--- /dev/null
+++ b/src/mon/Monitor.cc
@@ -0,0 +1,6887 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <iterator>
+#include <sstream>
+#include <tuple>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <cstring>
+#include <boost/scope_exit.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "json_spirit/json_spirit_reader.h"
+#include "json_spirit/json_spirit_writer.h"
+
+#include "Monitor.h"
+#include "common/version.h"
+#include "common/blkdev.h"
+#include "common/cmdparse.h"
+#include "common/signal.h"
+
+#include "osd/OSDMap.h"
+
+#include "MonitorDBStore.h"
+
+#include "messages/PaxosServiceMessage.h"
+#include "messages/MMonMap.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MGenericMessage.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonSync.h"
+#include "messages/MMonScrub.h"
+#include "messages/MMonProbe.h"
+#include "messages/MMonJoin.h"
+#include "messages/MMonPaxos.h"
+#include "messages/MRoute.h"
+#include "messages/MForward.h"
+
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "messages/MAuthReply.h"
+
+#include "messages/MTimeCheck2.h"
+#include "messages/MPing.h"
+
+#include "common/strtol.h"
+#include "common/ceph_argparse.h"
+#include "common/Timer.h"
+#include "common/Clock.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/admin_socket.h"
+#include "global/signal_handler.h"
+#include "common/Formatter.h"
+#include "include/stringify.h"
+#include "include/color.h"
+#include "include/ceph_fs.h"
+#include "include/str_list.h"
+
+#include "OSDMonitor.h"
+#include "MDSMonitor.h"
+#include "MonmapMonitor.h"
+#include "LogMonitor.h"
+#include "AuthMonitor.h"
+#include "MgrMonitor.h"
+#include "MgrStatMonitor.h"
+#include "ConfigMonitor.h"
+#include "KVMonitor.h"
+#include "mon/HealthMonitor.h"
+#include "common/config.h"
+#include "common/cmdparse.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/none/AuthNoneClientHandler.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+using namespace TOPNSPC::common;
+
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+
+static ostream& _prefix(std::ostream *_dout, const Monitor *mon) {
+  *_dout << "mon." << mon->name << "@" << mon->rank;  // log prefix: "mon.<name>@<rank>(<state>) e<monmap epoch> "
+  return *_dout << "(" << mon->get_state_name() << ") e" << mon->monmap->get_epoch() << " ";
+}
+
+const string Monitor::MONITOR_NAME = "monitor";  // store prefix used for monitor-wide keys, e.g. store->get(MONITOR_NAME, ...)
+const string Monitor::MONITOR_STORE_PREFIX = "monitor_store";
+// Static command table: every COMMAND()/COMMAND_WITH_FLAG() entry in
+// MonCommands.h expands to one MonCommand initializer; COMMAND() uses FLAG(NONE).
+#undef FLAG
+#undef COMMAND
+#undef COMMAND_WITH_FLAG
+#define FLAG(f) (MonCommand::FLAG_##f)
+#define COMMAND(parsesig, helptext, modulename, req_perms)	\
+  {parsesig, helptext, modulename, req_perms, FLAG(NONE)},
+#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, flags) \
+  {parsesig, helptext, modulename, req_perms, flags},
+MonCommand mon_commands[] = {
+#include <mon/MonCommands.h>
+};
+#undef COMMAND
+#undef COMMAND_WITH_FLAG
+// Construct a monitor: set up log channels, the op tracker, paxos and its services, and the local command tables.
+Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
+		 Messenger *m, Messenger *mgr_m, MonMap *map) :
+  Dispatcher(cct_),
+  AuthServer(cct_),
+  name(nm),
+  rank(-1), 
+  messenger(m),
+  con_self(m ? m->get_loopback_connection() : NULL),
+  timer(cct_, lock),
+  finisher(cct_, "mon_finisher", "fin"),
+  cpu_tp(cct, "Monitor::cpu_tp", "cpu_tp", g_conf()->mon_cpu_threads),
+  has_ever_joined(false),
+  logger(NULL), cluster_logger(NULL), cluster_logger_registered(false),
+  monmap(map),
+  log_client(cct_, messenger, monmap, LogClient::FLAG_MON),
+  key_server(cct, &keyring),
+  auth_cluster_required(cct,
+			cct->_conf->auth_supported.empty() ?
+			cct->_conf->auth_cluster_required : cct->_conf->auth_supported),
+  auth_service_required(cct,
+			cct->_conf->auth_supported.empty() ?
+			cct->_conf->auth_service_required : cct->_conf->auth_supported),
+  mgr_messenger(mgr_m),
+  mgr_client(cct_, mgr_m, monmap),
+  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+  store(s),
+  // election / quorum state
+  elector(this, map->strategy),
+  required_features(0),
+  leader(0),
+  quorum_con_features(0),
+  // scrub
+  scrub_version(0),
+  scrub_event(NULL),
+  scrub_timeout_event(NULL),
+
+  // sync state
+  sync_provider_count(0),
+  sync_cookie(0),
+  sync_full(false),
+  sync_start_version(0),
+  sync_timeout_event(NULL),
+  sync_last_committed_floor(0),
+  // time check state
+  timecheck_round(0),
+  timecheck_acks(0),
+  timecheck_rounds_since_clean(0),
+  timecheck_event(NULL),
+
+  admin_hook(NULL),
+  routed_request_tid(0),
+  op_tracker(cct, g_conf().get_val<bool>("mon_enable_op_tracker"), 1)
+{
+  clog = log_client.create_channel(CLOG_CHANNEL_CLUSTER);
+  audit_clog = log_client.create_channel(CLOG_CHANNEL_AUDIT);
+
+  update_log_clients();
+
+  if (!gss_ktfile_client.empty()) {
+    // Assert we can export the environment variable.
+    /*
+        The default client keytab is used, if it is present and readable,
+        to automatically obtain initial credentials for GSSAPI client
+        applications. The principal name of the first entry in the client
+        keytab is used by default when obtaining initial credentials.
+        1. The KRB5_CLIENT_KTNAME environment variable.
+        2. The default_client_keytab_name profile variable in [libdefaults].
+        3. The hardcoded default, DEFCKTNAME.
+    */
+    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", 
+                                    gss_ktfile_client.c_str(), 1));
+    ceph_assert(set_result == 0);
+  }
+  // configure op tracker complaint/history settings from the mon_op_* options
+  op_tracker.set_complaint_and_threshold(
+      g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count(),
+      g_conf().get_val<int64_t>("mon_op_log_threshold"));
+  op_tracker.set_history_size_and_duration(
+      g_conf().get_val<uint64_t>("mon_op_history_size"),
+      g_conf().get_val<std::chrono::seconds>("mon_op_history_duration").count());
+  op_tracker.set_history_slow_op_size_and_threshold(
+      g_conf().get_val<uint64_t>("mon_op_history_slow_op_size"),
+      g_conf().get_val<std::chrono::seconds>("mon_op_history_slow_op_threshold").count());
+
+  paxos = std::make_unique<Paxos>(*this, "paxos");
+  // one service per PAXOS_* slot, all constructed on the single Paxos instance above
+  paxos_service[PAXOS_MDSMAP].reset(new MDSMonitor(*this, *paxos, "mdsmap"));
+  paxos_service[PAXOS_MONMAP].reset(new MonmapMonitor(*this, *paxos, "monmap"));
+  paxos_service[PAXOS_OSDMAP].reset(new OSDMonitor(cct, *this, *paxos, "osdmap"));
+  paxos_service[PAXOS_LOG].reset(new LogMonitor(*this, *paxos, "logm"));
+  paxos_service[PAXOS_AUTH].reset(new AuthMonitor(*this, *paxos, "auth"));
+  paxos_service[PAXOS_MGR].reset(new MgrMonitor(*this, *paxos, "mgr"));
+  paxos_service[PAXOS_MGRSTAT].reset(new MgrStatMonitor(*this, *paxos, "mgrstat"));
+  paxos_service[PAXOS_HEALTH].reset(new HealthMonitor(*this, *paxos, "health"));
+  paxos_service[PAXOS_CONFIG].reset(new ConfigMonitor(*this, *paxos, "config"));
+  paxos_service[PAXOS_KV].reset(new KVMonitor(*this, *paxos, "kv"));
+  // mon_caps is parsed from the literal "allow *"; parsing must succeed
+  bool r = mon_caps.parse("allow *", NULL);
+  ceph_assert(r);
+
+  exited_quorum = ceph_clock_now();
+
+  // prepare local commands
+  local_mon_commands.resize(std::size(mon_commands));
+  for (unsigned i = 0; i < std::size(mon_commands); ++i) {
+    local_mon_commands[i] = mon_commands[i];
+  }
+  MonCommand::encode_vector(local_mon_commands, local_mon_commands_bl);
+  // second copy of the table with command strings rewritten for pre-nautilus peers
+  prenautilus_local_mon_commands = local_mon_commands;
+  for (auto& i : prenautilus_local_mon_commands) {
+    std::string n = cmddesc_get_prenautilus_compat(i.cmdstring);
+    if (n != i.cmdstring) {
+      dout(20) << " pre-nautilus cmd " << i.cmdstring << " -> " << n << dendl;
+      i.cmdstring = n;
+    }
+  }
+  MonCommand::encode_vector(prenautilus_local_mon_commands, prenautilus_local_mon_commands_bl);
+
+  // assume our commands until we have an election.  this only means
+  // we won't reply with EINVAL before the election; any command that
+  // actually matters will wait until we have quorum etc and then
+  // retry (and revalidate).
+  leader_mon_commands = local_mon_commands;
+}
+// Destructor: notifies the op tracker of shutdown and deletes the "mon" perf counters; all sessions must be gone.
+Monitor::~Monitor()
+{
+  op_tracker.on_shutdown();
+  // NOTE(review): only `logger` is deleted here; cluster_logger is presumably released elsewhere -- confirm
+  delete logger;
+  ceph_assert(session_map.sessions.empty());
+}
+
+
+// AdminSocketHook that forwards every admin socket call to the owning Monitor.
+class AdminHook : public AdminSocketHook {
+  Monitor *mon;
+public:
+  explicit AdminHook(Monitor *m) : mon(m) {}
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+	   Formatter *f, std::ostream& errss, bufferlist& out) override {
+    // the monitor writes text into a stream; hand it back as a bufferlist
+    stringstream text;
+    const int ret = mon->do_admin_command(command, cmdmap, f, errss, text);
+    out.append(text);
+    return ret;
+  }
+};
+
+// Execute one admin socket command while holding the monitor lock.  Structured
+// output goes to `f`, text to `out`, error text to `err`.  Each call is audit
+// logged on dispatch and on completion/abort (debug level for read-only
+// commands, info otherwise).  Returns r: 0 unless a branch sets a negative errno.
+int Monitor::do_admin_command(
+  std::string_view command, const cmdmap_t& cmdmap,
+  Formatter *f, std::ostream& err, std::ostream& out)
+{
+  std::lock_guard l(lock);
+  int r = 0;
+  // render every argument except "prefix" as "[a, b, ...]" for the audit log
+  string args;
+  for (const auto& [key, value] : cmdmap) {
+    if (key == "prefix")
+      continue;
+    if (!args.empty())
+      args += ", ";
+    args += cmd_vartype_stringify(value);
+  }
+  args = "[" + args + "]";
+  // these commands only read state, so they are audited at debug level
+  bool read_only = (command == "mon_status" ||
+                    command == "mon metadata" ||
+                    command == "quorum_status" ||
+                    command == "ops" ||
+                    command == "sessions");
+
+  (read_only ? audit_clog->debug() : audit_clog->info())
+    << "from='admin socket' entity='admin socket' "
+    << "cmd='" << command << "' args=" << args << ": dispatch";
+
+  if (command == "mon_status") {
+    get_mon_status(f);
+  } else if (command == "quorum_status") {
+    _quorum_status(f, out);
+  } else if (command == "sync_force") {
+    string validate;
+    if ((!cmd_getval(cmdmap, "validate", validate)) ||
+	(validate != "--yes-i-really-mean-it")) {
+      err << "are you SURE? this will mean the monitor store will be erased "
+	"the next time the monitor is restarted.  pass "
+	"'--yes-i-really-mean-it' if you really do.";
+      r = -EPERM;
+      goto abort;
+    }
+    sync_force(f);
+  } else if (command.compare(0, 23, "add_bootstrap_peer_hint") == 0 ||
+	     command.compare(0, 24, "add_bootstrap_peer_hintv") == 0) {
+    if (!_add_bootstrap_peer_hint(command, cmdmap, out))
+      goto abort;
+  } else if (command == "quorum enter") {
+    elector.start_participating();
+    start_election();
+    out << "started responding to quorum, initiated new election";
+  } else if (command == "quorum exit") {
+    start_election();
+    elector.stop_participating();
+    out << "stopped responding to quorum, initiated new election";
+  } else if (command == "ops") {
+    (void)op_tracker.dump_ops_in_flight(f);
+  } else if (command == "sessions") {
+    f->open_array_section("sessions");
+    for (auto p : session_map.sessions) {
+      f->dump_object("session", *p);
+    }
+    f->close_section();
+  } else if (command == "dump_historic_ops") {
+    if (!op_tracker.dump_historic_ops(f)) {
+      err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
+  } else if (command == "dump_historic_ops_by_duration" ) {
+    if (!op_tracker.dump_historic_ops(f, true)) {  // report only when the dump failed, as in the branch above
+      err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
+  } else if (command == "dump_historic_slow_ops") {
+    if (!op_tracker.dump_historic_slow_ops(f, {})) {  // report only when the dump failed, as in the branches above
+      err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
+  } else if (command == "quorum") {
+    string quorumcmd;
+    cmd_getval(cmdmap, "quorumcmd", quorumcmd);
+    if (quorumcmd == "exit") {
+      start_election();
+      elector.stop_participating();
+      out << "stopped responding to quorum, initiated new election" << std::endl;
+    } else if (quorumcmd == "enter") {
+      elector.start_participating();
+      start_election();
+      out << "started responding to quorum, initiated new election" << std::endl;
+    } else {
+      err << "needs a valid 'quorum' command" << std::endl;
+    }
+  } else if (command == "connection scores dump") {
+    if (!get_quorum_mon_features().contains_all(
+				   ceph::features::mon::FEATURE_PINGING)) {
+      err << "Not all monitors support changing election strategies; \
+              please upgrade them first!";
+    }
+    elector.dump_connection_scores(f);
+  } else if (command == "connection scores reset") {
+    if (!get_quorum_mon_features().contains_all(
+				   ceph::features::mon::FEATURE_PINGING)) {
+      err << "Not all monitors support changing election strategies; \
+              please upgrade them first!";
+    }
+    elector.notify_clear_peer_state();
+  } else if (command == "smart") {
+    string want_devid;
+    cmd_getval(cmdmap, "devid", want_devid);
+
+    string devname = store->get_devname();
+    if (devname.empty()) {
+      err << "could not determine device name for " << store->get_path();
+      r = -ENOENT;
+      goto abort;
+    }
+    set<string> devnames;
+    get_raw_devices(devname, &devnames);
+    json_spirit::mObject json_map;
+    uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
+      "mon_smart_report_timeout");
+    for (auto& devname : devnames) {
+      string err;
+      string devid = get_device_id(devname, &err);
+      if (want_devid.size() && want_devid != devid) {
+	derr << "get_device_id failed on " << devname << ": " << err << dendl;
+	continue;
+      }
+      json_spirit::mValue smart_json;
+      if (block_device_get_metrics(devname, smart_timeout,
+				   &smart_json)) {
+	dout(10) << "block_device_get_metrics failed for /dev/" << devname
+		 << dendl;
+	continue;
+      }
+      json_map[devid] = smart_json;
+    }
+    json_spirit::write(json_map, out, json_spirit::pretty_print);
+  } else if (command == "heap") {
+    if (!ceph_using_tcmalloc()) {
+      err << "could not issue heap profiler command -- not using tcmalloc!";
+      r = -EOPNOTSUPP;
+      goto abort;
+    }
+    string cmd;
+    if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
+      err << "unable to get value for command \"" << cmd << "\"";
+      r = -EINVAL;
+      goto abort;
+    }
+    std::vector<std::string> cmd_vec;
+    get_str_vec(cmd, cmd_vec);
+    string val;
+    if (cmd_getval(cmdmap, "value", val)) {
+      cmd_vec.push_back(val);
+    }
+    ceph_heap_profiler_handle_command(cmd_vec, out);
+  } else if (command == "compact") {
+    dout(1) << "triggering manual compaction" << dendl;
+    auto start = ceph::coarse_mono_clock::now();
+    store->compact_async();
+    auto end = ceph::coarse_mono_clock::now();
+    auto duration = ceph::to_seconds<double>(end - start);
+    dout(1) << "finished manual compaction in "
+	    << duration << " seconds" << dendl;
+    out << "compacted " << g_conf().get_val<std::string>("mon_keyvaluedb")
+	<< " in " << duration << " seconds";
+  } else {
+    ceph_abort_msg("bad AdminSocket command binding");
+  }
+  (read_only ? audit_clog->debug() : audit_clog->info())
+    << "from='admin socket' "
+    << "entity='admin socket' "
+    << "cmd=" << command << " "
+    << "args=" << args << ": finished";
+  return r;
+
+abort:
+  (read_only ? audit_clog->debug() : audit_clog->info())
+    << "from='admin socket' "
+    << "entity='admin socket' "
+    << "cmd=" << command << " "
+    << "args=" << args << ": aborted";
+  return r;
+}
+
+// Handle SIGINT/SIGTERM (anything else trips the assert): log it and shut down.
+void Monitor::handle_signal(int signum) {
+  ceph_assert(signum == SIGINT || signum == SIGTERM);
+  derr << "*** Got Signal " << sig_str(signum) << " ***" << dendl;
+  shutdown();
+}
+
+// Baseline feature set: empty compat and ro_compat sets, plus the BASE and
+// SINGLE_PAXOS incompat features.
+CompatSet Monitor::get_initial_supported_features()
+{
+  CompatSet::FeatureSet compat_set, ro_compat_set, incompat_set;
+  incompat_set.insert(CEPH_MON_FEATURE_INCOMPAT_BASE);
+  incompat_set.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS);
+  CompatSet initial(compat_set, ro_compat_set, incompat_set);
+  return initial;
+}
+
+// Full feature set: the initial set plus the incompat features listed in
+// the table below, inserted in table order.
+CompatSet Monitor::get_supported_features()
+{
+  CompatSet supported = get_initial_supported_features();
+  const CompatSet::Feature later_incompat[] = {
+    CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES, CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC,
+    CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2, CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3,
+    CEPH_MON_FEATURE_INCOMPAT_KRAKEN, CEPH_MON_FEATURE_INCOMPAT_LUMINOUS,
+    CEPH_MON_FEATURE_INCOMPAT_MIMIC, CEPH_MON_FEATURE_INCOMPAT_NAUTILUS,
+    CEPH_MON_FEATURE_INCOMPAT_OCTOPUS, CEPH_MON_FEATURE_INCOMPAT_PACIFIC,
+  };
+  for (const auto& feature : later_incompat) supported.incompat.insert(feature);
+  return supported;
+}
+
+// Legacy feature set: empty compat and ro_compat sets, with only the BASE
+// incompat feature.
+CompatSet Monitor::get_legacy_features()
+{
+  CompatSet::FeatureSet compat_set, ro_compat_set, incompat_set;
+  incompat_set.insert(CEPH_MON_FEATURE_INCOMPAT_BASE);
+  CompatSet legacy(compat_set, ro_compat_set, incompat_set);
+  return legacy;
+}
+
+// Compare the feature set stored in `store` with what this build supports.
+// Returns 0 when the supported set is writeable() against it, else -EPERM.
+int Monitor::check_features(MonitorDBStore *store)
+{
+  CompatSet ondisk;
+  read_features_off_disk(store, &ondisk);
+
+  CompatSet required = get_supported_features();
+  if (required.writeable(ondisk)) {
+    return 0;
+  }
+  CompatSet diff = required.unsupported(ondisk);
+  generic_derr << "ERROR: on disk data includes unsupported features: " << diff << dendl;
+  return -EPERM;
+}
+
+// Load the CompatSet stored at (MONITOR_NAME, COMPAT_SET_LOC) into *features.
+// If nothing is stored, use the legacy feature set and write it to the store.
+void Monitor::read_features_off_disk(MonitorDBStore *store, CompatSet *features)
+{
+  bufferlist featuresbl;
+  store->get(MONITOR_NAME, COMPAT_SET_LOC, featuresbl);
+  if (featuresbl.length() != 0) {
+    auto it = featuresbl.cbegin();
+    features->decode(it);
+    return;
+  }
+  generic_dout(0) << "WARNING: mon fs missing feature list.\n"
+          << "Assuming it is old-style and introducing one." << dendl;
+  // only the baseline ~v.18 features are assumed to be on disk; if new
+  // features are introduced this code needs to disappear or become smarter.
+  *features = get_legacy_features();
+  features->encode(featuresbl);
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  t->put(MONITOR_NAME, COMPAT_SET_LOC, featuresbl);
+  store->apply_transaction(t);
+}
+
+// Load the stored feature set into `features`, then call
+// calc_quorum_requirements(); both results are logged at debug level 10.
+void Monitor::read_features() {
+  read_features_off_disk(store, &features);
+  dout(10) << "features " << features << dendl;
+  calc_quorum_requirements();
+  dout(10) << "required_features " << required_features << dendl;
+}
+
+// Put the encoded in-memory feature set into transaction `t`.
+void Monitor::write_features(MonitorDBStore::TransactionRef t) {
+  bufferlist encoded;
+  features.encode(encoded);
+  t->put(MONITOR_NAME, COMPAT_SET_LOC, encoded);
+}
+
+// Returns the null-terminated list of config option names this observer tracks.
+const char** Monitor::get_tracked_conf_keys() const {
+  static const char* tracked_keys[] = {
+    "crushtool", // helpful for testing
+    "mon_election_timeout",
+    "mon_lease",
+    "mon_lease_renew_interval_factor",
+    "mon_lease_ack_timeout_factor",
+    "mon_accept_timeout_factor",
+    // cluster log and audit log settings
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_facility",
+    "clog_to_syslog_level",
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "host",
+    "fsid",
+    // periodic health reports to the cluster log
+    "mon_health_to_clog",
+    "mon_health_to_clog_interval",
+    "mon_health_to_clog_tick_interval",
+    // scrub interval
+    "mon_scrub_interval",
+    "mon_allow_pool_delete",
+    // osdmap pruning: observed, not handled
+    "mon_osdmap_full_prune_enabled",
+    "mon_osdmap_full_prune_min",
+    "mon_osdmap_full_prune_interval",
+    "mon_osdmap_full_prune_txsize",
+    // debug options: observed, not handled
+    "mon_debug_extra_checks",
+    "mon_debug_block_osdmap_trim",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+// Config observer callback: re-check options, then react to the keys in `changed`.
+// Health-to-clog and scrub-interval updates are queued on the finisher and run under the monitor lock.
+void Monitor::handle_conf_change(const ConfigProxy& conf,
+                                 const std::set<std::string> &changed)
+{
+  sanitize_options();
+  dout(10) << __func__ << " " << changed << dendl;
+
+  // true if at least one of the given option names is in `changed`
+  auto any_changed = [&changed](std::initializer_list<const char*> names) {
+    for (const char *name : names)
+      if (changed.count(name)) return true;
+    return false;
+  };
+
+  if (any_changed({"clog_to_monitors", "clog_to_syslog", "clog_to_syslog_level",
+		   "clog_to_syslog_facility", "clog_to_graylog",
+		   "clog_to_graylog_host", "clog_to_graylog_port", "host", "fsid"})) {
+    update_log_clients();
+  }
+
+  if (any_changed({"mon_health_to_clog", "mon_health_to_clog_interval",
+		   "mon_health_to_clog_tick_interval"})) {
+    finisher.queue(new C_MonContext{this, [this, changed](int) {
+      std::lock_guard l{lock};
+      health_to_clog_update_conf(changed);
+    }});
+  }
+
+  if (changed.count("mon_scrub_interval")) {
+    auto scrub_interval = conf.get_val<std::chrono::seconds>("mon_scrub_interval");
+    finisher.queue(new C_MonContext{this, [this, scrub_interval](int) {
+      std::lock_guard l{lock};
+      scrub_update_interval(scrub_interval);
+    }});
+  }
+}
+
+// Parse the log-client options from g_ceph_context and apply them to both the cluster
+// and the audit channel.  Nothing is applied if parse_log_client_options() returns non-zero.
+void Monitor::update_log_clients()
+{
+  map<string,string> log_to_monitors;
+  map<string,string> log_to_syslog;
+  map<string,string> log_channel;
+  map<string,string> log_prio;
+  map<string,string> log_to_graylog;
+  map<string,string> log_to_graylog_host;
+  map<string,string> log_to_graylog_port;
+  uuid_d fsid;
+  string host;
+
+  const auto rc = parse_log_client_options(
+    g_ceph_context, log_to_monitors, log_to_syslog, log_channel, log_prio,
+    log_to_graylog, log_to_graylog_host, log_to_graylog_port, fsid, host);
+  if (rc)
+    return;
+
+  // both channels receive the same configuration, cluster channel first
+  for (auto& channel : {clog, audit_clog}) {
+    channel->update_config(log_to_monitors, log_to_syslog,
+			   log_channel, log_prio, log_to_graylog,
+			   log_to_graylog_host, log_to_graylog_port,
+			   fsid, host);
+  }
+}
+
+// Validate the lease-related factors.  Each violation is reported on the
+// cluster log; returns 0 if both checks pass, -EINVAL otherwise.
+int Monitor::sanitize_options()
+{
+  int result = 0;
+
+  // mon_lease must be greater than mon_lease_renewal; otherwise leases
+  // may expire before they are renewed.
+  if (g_conf()->mon_lease_renew_interval_factor >= 1.0) {
+    clog->error() << "mon_lease_renew_interval_factor ("
+		  << g_conf()->mon_lease_renew_interval_factor
+		  << ") must be less than 1.0";
+    result = -EINVAL;
+  }
+
+  // mon_lease_ack_timeout must be greater than mon_lease so there is time
+  // to renew the lease and receive the ack.  With equal, small values an
+  // overloaded (or even normally loaded) monitor could time out.
+  if (g_conf()->mon_lease_ack_timeout_factor <= 1.0) {
+    clog->error() << "mon_lease_ack_timeout_factor ("
+		  << g_conf()->mon_lease_ack_timeout_factor
+		  << ") must be greater than 1.0";
+    result = -EINVAL;
+  }
+
+  return result;
+}
+// Pre-initialization under the monitor lock: validates options, creates perf counters, checks fsid/features/store state, inits paxos, loads the keyring and registers admin socket commands.  Returns 0 or a negative error.
+int Monitor::preinit()
+{
+  std::unique_lock l(lock);
+
+  dout(1) << "preinit fsid " << monmap->fsid << dendl;
+
+  int r = sanitize_options();
+  if (r < 0) {
+    derr << "option sanitization failed!" << dendl;
+    return r;
+  }
+  // "mon" perf counters: session and election statistics
+  ceph_assert(!logger);
+  {
+    PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last);
+    pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess",
+        PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions",
+        "sadd", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions",
+        "srm", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions",
+        "strm", PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in",
+        "ecnt", PerfCountersBuilder::PRIO_USEFUL);
+    pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started",
+        "estt", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won",
+        "ewon", PerfCountersBuilder::PRIO_INTERESTING);
+    pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost",
+        "elst", PerfCountersBuilder::PRIO_INTERESTING);
+    logger = pcb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(logger);
+  }
+  // "cluster" perf counters: created here, but not added to the collection in this function
+  ceph_assert(!cluster_logger);
+  {
+    PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last);
+    pcb.add_u64(l_cluster_num_mon, "num_mon", "Monitors");
+    pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum", "Monitors in quorum");
+    pcb.add_u64(l_cluster_num_osd, "num_osd", "OSDs");
+    pcb.add_u64(l_cluster_num_osd_up, "num_osd_up", "OSDs that are up");
+    pcb.add_u64(l_cluster_num_osd_in, "num_osd_in", "OSD in state \"in\" (they are in cluster)");
+    pcb.add_u64(l_cluster_osd_epoch, "osd_epoch", "Current epoch of OSD map");
+    pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster", NULL, 0, unit_t(UNIT_BYTES));
+    pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space", NULL, 0, unit_t(UNIT_BYTES));
+    pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
+    pcb.add_u64(l_cluster_num_pool, "num_pool", "Pools");
+    pcb.add_u64(l_cluster_num_pg, "num_pg", "Placement groups");
+    pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean", "Placement groups in active+clean state");
+    pcb.add_u64(l_cluster_num_pg_active, "num_pg_active", "Placement groups in active state");
+    pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering", "Placement groups in peering state");
+    pcb.add_u64(l_cluster_num_object, "num_object", "Objects");
+    pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded", "Degraded (missing replicas) objects");
+    pcb.add_u64(l_cluster_num_object_misplaced, "num_object_misplaced", "Misplaced (wrong location in the cluster) objects");
+    pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound", "Unfound objects");
+    pcb.add_u64(l_cluster_num_bytes, "num_bytes", "Size of all objects", NULL, 0, unit_t(UNIT_BYTES));
+    cluster_logger = pcb.create_perf_counters();
+  }
+
+  paxos->init_logger();
+
+  // verify cluster_uuid; -ENOENT from check_fsid() means write_fsid() is tried instead
+  {
+    int r = check_fsid();
+    if (r == -ENOENT)
+      r = write_fsid();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // open compatset
+  read_features();
+
+  // have we ever joined a quorum?
+  has_ever_joined = (store->get(MONITOR_NAME, "joined") != 0);
+  dout(10) << "has_ever_joined = " << (int)has_ever_joined << dendl;
+
+  if (!has_ever_joined) {
+    // impose initial quorum restrictions?
+    list<string> initial_members;
+    get_str_list(g_conf()->mon_initial_members, initial_members);
+
+    if (!initial_members.empty()) {
+      dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl;
+
+      monmap->set_initial_members(
+	g_ceph_context, initial_members, name, messenger->get_myaddrs(),
+	&extra_probe_peers);
+
+      dout(10) << " monmap is " << *monmap << dendl;
+      dout(10) << " extra probe peers " << extra_probe_peers << dendl;
+    }
+  } else if (!monmap->contains(name)) {
+    derr << "not in monmap and have been in a quorum before; "
+         << "must have been removed" << dendl;
+    if (g_conf()->mon_force_quorum_join) {
+      dout(0) << "we should have died but "
+              << "'mon_force_quorum_join' is set -- allowing boot" << dendl;
+    } else {
+      derr << "commit suicide!" << dendl;
+      return -ENOENT;
+    }
+  }
+
+  {
+    // We may have a potentially inconsistent store state on our hands. If so,
+    // get rid of it and start fresh.
+    bool clear_store = false;
+    if (store->exists("mon_sync", "in_sync")) {
+      dout(1) << __func__ << " clean up potentially inconsistent store state"
+	      << dendl;
+      clear_store = true;
+    }
+
+    if (store->get("mon_sync", "force_sync") > 0) {
+      dout(1) << __func__ << " force sync by clearing store state" << dendl;
+      clear_store = true;
+    }
+
+    if (clear_store) {
+      set<string> sync_prefixes = get_sync_targets_names();
+      store->clear(sync_prefixes);
+    }
+  }
+
+  sync_last_committed_floor = store->get("mon_sync", "last_committed_floor");
+  dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl;
+
+  init_paxos();
+
+  if (is_keyring_required()) {
+    // we need to bootstrap authentication keys so we can form an
+    // initial quorum.
+    if (authmon()->get_last_committed() == 0) {
+      dout(10) << "loading initial keyring to bootstrap authentication for mkfs" << dendl;
+      bufferlist bl;
+      int err = store->get("mkfs", "keyring", bl);
+      if (err == 0 && bl.length() > 0) {
+        // Attempt to decode and extract keyring only if it is found.
+        KeyRing keyring;
+        auto p = bl.cbegin();
+        decode(keyring, p);
+        extract_save_mon_key(keyring);
+      }
+    }
+
+    string keyring_loc = g_conf()->mon_data + "/keyring";
+    // load the keyring file; if that fails, fall back to the mon. key held by key_server
+    r = keyring.load(cct, keyring_loc);
+    if (r < 0) {
+      EntityName mon_name;
+      mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+      EntityAuth mon_key;
+      if (key_server.get_auth(mon_name, mon_key)) {
+	dout(1) << "copying mon. key from old db to external keyring" << dendl;
+	keyring.add(mon_name, mon_key);
+	bufferlist bl;
+	keyring.encode_plaintext(bl);
+	write_default_keyring(bl);
+      } else {
+	derr << "unable to load initial keyring " << g_conf()->keyring << dendl;
+	return r;
+      }
+    }
+  }
+
+  admin_hook = new AdminHook(this);
+  AdminSocket* admin_socket = cct->get_admin_socket();
+
+  // unlock while registering to avoid mon_lock -> admin socket lock dependency.
+  l.unlock();
+  // register tell/asock commands
+  for (const auto& command : local_mon_commands) {
+    if (!command.is_tell()) {
+      continue;
+    }
+    const auto prefix = cmddesc_get_prefix(command.cmdstring);
+    if (prefix == "injectargs" ||
+	prefix == "version" ||
+	prefix == "tell") {
+      // not registered by me
+      continue;
+    }
+    r = admin_socket->register_command(command.cmdstring, admin_hook,
+				       command.helpstring);
+    ceph_assert(r == 0);
+  }
+  l.lock();
+
+  // add ourselves as a conf observer
+  g_conf().add_observer(this);
+
+  messenger->set_auth_client(this);
+  messenger->set_auth_server(this);
+  mgr_messenger->set_auth_client(this);
+
+  auth_registry.refresh_config();
+
+  return 0;
+}
+
+// Bring the monitor online: start the finisher, the timer (plus the first
+// tick) and the CPU thread pool, register the messenger dispatchers, then
+// enter STATE_PROBING and call bootstrap().  `lock` is held through a
+// lock_guard for the whole call.  Always returns 0.
+int Monitor::init()
+{
+  dout(2) << "init" << dendl;
+  std::lock_guard l(lock);
+
+  finisher.start();
+
+  // start ticker
+  timer.init();
+  new_tick();
+
+  cpu_tp.start();
+
+  // i'm ready!
+  messenger->add_dispatcher_tail(this);
+
+  // kickstart pet mgrclient
+  mgr_client.init();
+  mgr_messenger->add_dispatcher_tail(&mgr_client);
+  mgr_messenger->add_dispatcher_tail(this);  // for auth ms_* calls
+  mgrmon()->prime_mgr_client();
+
+  state = STATE_PROBING;
+
+  bootstrap();
+
+  // if the elector reports its peer tracker is not clean, clear the peer
+  // state
+  if (!elector.peer_tracker_is_clean()){
+    dout(10) << "peer_tracker looks inconsistent"
+      << " previous bad logic, clearing ..." << dendl;
+    elector.notify_clear_peer_state();
+  }
+
+  // add features of myself into feature_map
+  session_map.feature_map.add_mon(con_self->get_features());
+  return 0;
+}
+
+// Initialize paxos, then every paxos service, and finally pull their state
+// in through refresh_from_paxos().
+void Monitor::init_paxos()
+{
+  dout(10) << __func__ << dendl;
+
+  paxos->init();
+
+  // services are initialized only after paxos itself
+  for (auto& service : paxos_service) {
+    service->init();
+  }
+
+  // no need_bootstrap flag is collected here, hence the null pointer
+  refresh_from_paxos(nullptr);
+}
+
+// Reload the cluster fingerprint from the store (if present), then refresh
+// every paxos service and reload metadata.
+//
+// need_bootstrap is handed unchanged to each service's refresh().
+void Monitor::refresh_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+
+  bufferlist fingerprint_bl;
+  const int rc = store->get(MONITOR_NAME, "cluster_fingerprint", fingerprint_bl);
+  if (rc < 0) {
+    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
+  } else {
+    // a decode failure is logged and otherwise ignored
+    try {
+      auto it = fingerprint_bl.cbegin();
+      decode(fingerprint, it);
+    } catch (ceph::buffer::error& e) {
+      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
+    }
+  }
+
+  // two separate passes: every service is refreshed before any of them
+  // gets its post_refresh() call
+  for (auto& service : paxos_service) {
+    service->refresh(need_bootstrap);
+  }
+  for (auto& service : paxos_service) {
+    service->post_refresh();
+  }
+
+  load_metadata();
+}
+
+// Add cluster_logger to the perf counters collection, unless it has already
+// been registered.
+void Monitor::register_cluster_logger()
+{
+  if (cluster_logger_registered) {
+    dout(10) << "register_cluster_logger - already registered" << dendl;
+    return;
+  }
+
+  dout(10) << "register_cluster_logger" << dendl;
+  cluster_logger_registered = true;
+  cct->get_perfcounters_collection()->add(cluster_logger);
+}
+
+// Remove cluster_logger from the perf counters collection, if it is
+// currently registered.
+void Monitor::unregister_cluster_logger()
+{
+  if (!cluster_logger_registered) {
+    dout(10) << "unregister_cluster_logger - not registered" << dendl;
+    return;
+  }
+
+  dout(10) << "unregister_cluster_logger" << dendl;
+  cluster_logger_registered = false;
+  cct->get_perfcounters_collection()->remove(cluster_logger);
+}
+
+// Publish the monmap size and the current quorum size to cluster_logger.
+void Monitor::update_logger()
+{
+  const auto mon_count = monmap->size();
+  cluster_logger->set(l_cluster_num_mon, mon_count);
+
+  const auto quorum_count = quorum.size();
+  cluster_logger->set(l_cluster_num_mon_quorum, quorum_count);
+}
+
+// Tear the monitor down.
+//
+// Takes `lock`, waits for any pending paxos write, disables authentication
+// and switches to STATE_SHUTDOWN, then stops the subsystems one by one.
+// `lock` is released temporarily around the config-observer removal and
+// around draining the finisher, and for good before the messengers are shut
+// down.  The perf counter loggers are removed last.
+void Monitor::shutdown()
+{
+  dout(1) << "shutdown" << dendl;
+
+  lock.lock();
+
+  wait_for_paxos_write();
+
+  // same "disable authentication" step that _reset() performs
+  {
+    std::lock_guard l(auth_lock);
+    authmon()->_set_mon_num_rank(0, 0);
+  }
+
+  state = STATE_SHUTDOWN;
+
+  // NOTE(review): lock is dropped while removing the observer, presumably to
+  // avoid a lock-order dependency with the config machinery -- confirm
+  lock.unlock();
+  g_conf().remove_observer(this);
+  lock.lock();
+
+  if (admin_hook) {
+    cct->get_admin_socket()->unregister_commands(admin_hook);
+    delete admin_hook;
+    admin_hook = NULL;
+  }
+
+  elector.shutdown();
+
+  mgr_client.shutdown();
+
+  // drain and stop the finisher without holding lock
+  lock.unlock();
+  finisher.wait_for_empty();
+  finisher.stop();
+  lock.lock();
+
+  // clean up
+  paxos->shutdown();
+  for (auto& svc : paxos_service) {
+    svc->shutdown();
+  }
+
+  // complete anything still waiting on a quorum with -ECANCELED
+  finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED);
+  finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED);
+
+  timer.shutdown();
+
+  cpu_tp.stop();
+
+  remove_all_sessions();
+
+  log_client.shutdown();
+
+  // unlock before msgr shutdown...
+  lock.unlock();
+
+  // shutdown messenger before removing logger from perfcounter collection, 
+  // otherwise _ms_dispatch() will try to update deleted logger
+  messenger->shutdown();
+  mgr_messenger->shutdown();
+
+  if (logger) {
+    cct->get_perfcounters_collection()->remove(logger);
+  }
+  if (cluster_logger) {
+    // only remove it from the collection if it was actually registered
+    if (cluster_logger_registered)
+      cct->get_perfcounters_collection()->remove(cluster_logger);
+    delete cluster_logger;
+    cluster_logger = NULL;
+  }
+}
+
+// If paxos reports a write (or a "previous" write) in progress, flush the
+// store.  `lock` is released for the duration of the flush and re-taken
+// afterwards.
+void Monitor::wait_for_paxos_write()
+{
+  if (!paxos->is_writing() && !paxos->is_writing_previous()) {
+    return;
+  }
+
+  dout(10) << __func__ << " flushing pending write" << dendl;
+  lock.unlock();
+  store->flush();
+  lock.lock();
+  dout(10) << __func__ << " flushed pending write" << dendl;
+}
+
+// Replace the current process image with a fresh instance of this daemon,
+// reusing the original argv.
+//
+// When PROCPREFIX is defined and PROCPREFIX "/proc/self/exe" can be read,
+// that path is exec'd; otherwise the function falls back to orig_argv[0].
+// The function never returns: if execv() fails, the error is logged and the
+// process aborts.
+void Monitor::respawn()
+{
+  // --- WARNING TO FUTURE COPY/PASTERS ---
+  // You must also add a call like
+  //
+  //   ceph_pthread_setname(pthread_self(), "ceph-mon");
+  //
+  // to main() so that /proc/$pid/stat field 2 contains "(ceph-mon)"
+  // instead of "(exe)", so that killall (and log rotation) will work.
+
+  dout(0) << __func__ << dendl;
+
+  // argv for the new image: a copy of the original pointers, NULL-terminated
+  char *new_argv[orig_argc+1];
+  dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+  for (int i=0; i<orig_argc; i++) {
+    new_argv[i] = (char *)orig_argv[i];
+    dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
+  }
+  new_argv[orig_argc] = NULL;
+
+  /* Determine the path to our executable, test if Linux /proc/self/exe exists.
+   * This allows us to exec the same executable even if it has since been
+   * unlinked.
+   */
+  // zero-initialized, so the bounded readlink()/strncpy() below (at most
+  // PATH_MAX-1 bytes) always leave a terminating NUL
+  char exe_path[PATH_MAX] = "";
+#ifdef PROCPREFIX
+  if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) {
+    dout(1) << "respawning with exe " << exe_path << dendl;
+    // the resolved target is only logged; we exec through the link itself
+    strcpy(exe_path, PROCPREFIX "/proc/self/exe");
+  } else {
+#else
+  {
+#endif
+    /* Print CWD for the user's interest */
+    char buf[PATH_MAX];
+    char *cwd = getcwd(buf, sizeof(buf));
+    ceph_assert(cwd);
+    dout(1) << " cwd " << cwd << dendl;
+
+    /* Fall back to a best-effort: just running in our CWD */
+    strncpy(exe_path, orig_argv[0], PATH_MAX-1);
+  }
+
+  dout(1) << " exe_path " << exe_path << dendl;
+
+  unblock_all_signals(NULL);
+  execv(exe_path, new_argv);
+
+  // execv() only returns on failure.  Save errno before doing anything else:
+  // the logging below may make library calls that overwrite it.
+  const int exec_errno = errno;
+
+  dout(0) << "respawn execv " << orig_argv[0]
+	  << " failed with " << cpp_strerror(exec_errno) << dendl;
+
+  // We have to assert out here, because suicide() returns, and callers
+  // to respawn expect it never to return.
+  ceph_abort();
+}
+
+// (Re)start the process of finding and joining a quorum.
+//
+// Cancels any in-flight sync request and probe timeout, re-derives our rank
+// from the monmap, resets per-quorum state via _reset() and then either
+// wins a standalone election (single-monitor map where we are rank 0) or
+// sends OP_PROBE to every other monitor in the monmap and to every address
+// in extra_probe_peers.
+//
+// May not return: the process exits if the monmap's min_mon_release cannot
+// be upgraded from, or if we were removed from the monmap after having
+// joined a quorum, and it respawns if the monmap lists different addresses
+// for our rank.
+void Monitor::bootstrap()
+{
+  dout(10) << "bootstrap" << dendl;
+  wait_for_paxos_write();
+
+  sync_reset_requester();
+  unregister_cluster_logger();
+  cancel_probe_timeout();
+
+  if (monmap->get_epoch() == 0) {
+    dout(10) << "reverting to legacy ranks for seed monmap (epoch 0)" << dendl;
+    monmap->calc_legacy_ranks();
+  }
+  dout(10) << "monmap " << *monmap << dendl;
+  {
+    // refuse to run against a monmap whose min_mon_release we cannot
+    // upgrade from
+    auto from_release = monmap->min_mon_release;
+    ostringstream err;
+    if (!can_upgrade_from(from_release, "min_mon_release", err)) {
+      derr << "current monmap has " << err.str() << " stopping." << dendl;
+      exit(0);
+    }
+  }
+  // note my rank
+  int newrank = monmap->get_rank(messenger->get_myaddrs());
+  if (newrank < 0 && rank >= 0) {
+    // was i ever part of the quorum?
+    if (has_ever_joined) {
+      dout(0) << " removed from monmap, suicide." << dendl;
+      exit(0);
+    }
+    elector.notify_clear_peer_state();
+  }
+  // the monmap knows our rank under different addresses: stash the map (if
+  // it has a non-zero epoch) and re-exec
+  if (newrank >= 0 &&
+      monmap->get_addrs(newrank) != messenger->get_myaddrs()) {
+    dout(0) << " monmap addrs for rank " << newrank << " changed, i am "
+	    << messenger->get_myaddrs()
+	    << ", monmap is " << monmap->get_addrs(newrank) << ", respawning"
+	    << dendl;
+
+    if (monmap->get_epoch()) {
+      // store this map in temp mon_sync location so that we use it on
+      // our next startup
+      derr << " stashing newest monmap " << monmap->get_epoch()
+	   << " for next startup" << dendl;
+      bufferlist bl;
+      monmap->encode(bl, -1);
+      auto t(std::make_shared<MonitorDBStore::Transaction>());
+      t->put("mon_sync", "temp_newer_monmap", bl);
+      store->apply_transaction(t);
+    }
+
+    respawn();
+  }
+  if (newrank != rank) {
+    dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl;
+    messenger->set_myname(entity_name_t::MON(newrank));
+    rank = newrank;
+    elector.notify_rank_changed(rank);
+
+    // reset all connections, or else our peers will think we are someone else.
+    messenger->mark_down_all();
+  }
+
+  // reset
+  state = STATE_PROBING;
+
+  _reset();
+
+  // sync store
+  if (g_conf()->mon_compact_on_bootstrap) {
+    dout(10) << "bootstrap -- triggering compaction" << dendl;
+    store->compact();
+    dout(10) << "bootstrap -- finished compaction" << dendl;
+  }
+
+  // stretch mode bits
+  set_elector_disallowed_leaders(false);
+
+  // singleton monitor?
+  if (monmap->size() == 1 && rank == 0) {
+    win_standalone_election();
+    return;
+  }
+
+  reset_probe_timeout();
+
+  // i'm outside the quorum
+  if (monmap->contains(name))
+    outside_quorum.insert(name);
+
+  // probe monitors
+  dout(10) << "probing other monitors" << dendl;
+  // every rank in the monmap except our own
+  for (unsigned i = 0; i < monmap->size(); i++) {
+    if ((int)i != rank)
+      send_mon_message(
+	new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined,
+		      ceph_release()),
+	i);
+  }
+  // plus any hinted peers, skipping our own address vector
+  for (auto& av : extra_probe_peers) {
+    if (av != messenger->get_myaddrs()) {
+      messenger->send_to_mon(
+	new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined,
+		      ceph_release()),
+	av);
+    }
+  }
+}
+
+// Handle the add_bootstrap_peer_hint / add_bootstrap_peer_hintv commands:
+// parse the "addr" or "addrv" argument and add the resulting address vector
+// to extra_probe_peers.
+//
+// cmd:    command name, used for logging only
+// cmdmap: parsed command arguments
+// ss:     receives the human-readable outcome
+//
+// Returns true when the hint was added, or when it was ignored because this
+// monitor is already a leader or peon; false when no address was supplied
+// or it could not be parsed.
+bool Monitor::_add_bootstrap_peer_hint(std::string_view cmd,
+				       const cmdmap_t& cmdmap,
+				       ostream& ss)
+{
+  if (is_leader() || is_peon()) {
+    ss << "mon already active; ignoring bootstrap hint";
+    return true;
+  }
+
+  entity_addrvec_t addrs;
+  string addrstr;
+
+  if (cmd_getval(cmdmap, "addr", addrstr)) {
+    // single-address form
+    dout(10) << "_add_bootstrap_peer_hint '" << cmd << "' addr '"
+	     << addrstr << "'" << dendl;
+
+    entity_addr_t parsed;
+    const char *parse_end = nullptr;
+    if (!parsed.parse(addrstr.c_str(), &parse_end, entity_addr_t::TYPE_ANY)) {
+      ss << "failed to parse addrs '" << addrstr
+	 << "'; syntax is 'add_bootstrap_peer_hint ip[:port]'";
+      return false;
+    }
+
+    addrs.v.push_back(parsed);
+    if (parsed.get_port() == 0) {
+      // no port given: hint both a msgr2 and a legacy address on their
+      // default ports
+      addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2);
+      addrs.v[0].set_port(CEPH_MON_PORT_IANA);
+      addrs.v.push_back(parsed);
+      addrs.v[1].set_type(entity_addr_t::TYPE_LEGACY);
+      addrs.v[1].set_port(CEPH_MON_PORT_LEGACY);
+    } else if (parsed.get_type() == entity_addr_t::TYPE_ANY) {
+      // port given but no protocol: legacy only for the legacy port
+      const bool legacy_port = (parsed.get_port() == CEPH_MON_PORT_LEGACY);
+      if (legacy_port) {
+	addrs.v[0].set_type(entity_addr_t::TYPE_LEGACY);
+      } else {
+	addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2);
+      }
+    }
+  } else if (cmd_getval(cmdmap, "addrv", addrstr)) {
+    // address-vector form
+    dout(10) << "_add_bootstrap_peer_hintv '" << cmd << "' addrv '"
+	     << addrstr << "'" << dendl;
+
+    const char *parse_end = nullptr;
+    if (!addrs.parse(addrstr.c_str(), &parse_end)) {
+      ss << "failed to parse addrs '" << addrstr
+	 << "'; syntax is 'add_bootstrap_peer_hintv v2:ip:port[,v1:ip:port]'";
+      return false;
+    }
+  } else {
+    ss << "no addr or addrv provided";
+    return false;
+  }
+
+  extra_probe_peers.insert(addrs);
+  ss << "adding peer " << addrs << " to list: " << extra_probe_peers;
+  return true;
+}
+
+// called by bootstrap(), or on leader|peon -> electing
+// Drop all per-quorum state: disable authentication, cancel the probe
+// timeout and the timecheck/health/scrub activity, forget the current quorum
+// (recording when we left it), and restart paxos and every paxos service.
+void Monitor::_reset()
+{
+  dout(10) << __func__ << dendl;
+
+  // disable authentication
+  {
+    std::lock_guard l(auth_lock);
+    authmon()->_set_mon_num_rank(0, 0);
+  }
+
+  cancel_probe_timeout();
+  timecheck_finish();
+  health_events_cleanup();
+  health_check_log_times.clear();
+  scrub_event_cancel();
+
+  leader_since = utime_t();
+  quorum_since = {};
+  // only stamp exited_quorum if we actually were in a quorum
+  if (!quorum.empty()) {
+    exited_quorum = ceph_clock_now();
+  }
+  quorum.clear();
+  outside_quorum.clear();
+  quorum_feature_map.clear();
+
+  scrub_reset();
+
+  paxos->restart();
+
+  for (auto& svc : paxos_service) {
+    svc->restart();
+  }
+}
+
+
+// -----------------------------------------------------------
+// sync
+
+// Collect the store prefixes that take part in a sync: paxos' own name plus
+// whatever each paxos service adds through get_store_prefixes().
+set<string> Monitor::get_sync_targets_names()
+{
+  set<string> prefixes{paxos->get_name()};
+  for (auto& service : paxos_service) {
+    service->get_store_prefixes(prefixes);
+  }
+  return prefixes;
+}
+
+
+// Timer callback armed by sync_reset_timeout(): the sync took too long, so
+// give up and bootstrap again.  Asserts that we are in STATE_SYNCHRONIZING.
+void Monitor::sync_timeout()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(state == STATE_SYNCHRONIZING);
+  bootstrap();
+}
+
+// Encode into `bl` the monmap with the highest epoch among three
+// candidates: the one held by the MonmapMonitor, the backup stored under
+// mon_sync/latest_monmap (if any), and our in-memory monmap.
+//
+// Aborts on any store read error other than -ENOENT from the MonmapMonitor.
+void Monitor::sync_obtain_latest_monmap(bufferlist &bl)
+{
+  dout(1) << __func__ << dendl;
+
+  MonMap newest;
+
+  // candidate 1: MonmapMonitor's map; -ENOENT simply leaves `newest` as a
+  // default-constructed map
+  bufferlist committed_bl;
+  const int committed_err = monmon()->get_monmap(committed_bl);
+  if (committed_err >= 0) {
+    newest.decode(committed_bl);
+  } else if (committed_err != -ENOENT) {
+    derr << __func__
+         << " something wrong happened while reading the store: "
+         << cpp_strerror(committed_err) << dendl;
+    ceph_abort_msg("error reading the store");
+  }
+
+  // candidate 2: the last backed-up monmap, taken only if strictly newer
+  if (store->exists("mon_sync", "latest_monmap")) {
+    bufferlist backup_bl;
+    const int backup_err = store->get("mon_sync", "latest_monmap", backup_bl);
+    if (backup_err < 0) {
+      derr << __func__
+           << " something wrong happened while reading the store: "
+           << cpp_strerror(backup_err) << dendl;
+      ceph_abort_msg("error reading the store");
+    }
+    ceph_assert(backup_bl.length() > 0);
+
+    MonMap stashed;
+    stashed.decode(backup_bl);
+    if (stashed.epoch > newest.epoch) {
+      newest = stashed;
+    }
+  }
+
+  // candidate 3: the map we are currently running with
+  if (monmap->epoch > newest.epoch) {
+    newest = *monmap;
+  }
+
+  dout(1) << __func__ << " obtained monmap e" << newest.epoch << dendl;
+
+  newest.encode(bl, CEPH_FEATURES_ALL);
+}
+
+// Forget everything about a sync we may have been requesting: cancel the
+// sync timeout event and clear the provider address, cookie, full-sync flag
+// and start version.
+void Monitor::sync_reset_requester()
+{
+  dout(10) << __func__ << dendl;
+
+  if (sync_timeout_event != nullptr) {
+    timer.cancel_event(sync_timeout_event);
+    sync_timeout_event = nullptr;
+  }
+
+  sync_provider = entity_addrvec_t();
+  sync_cookie = 0;
+  sync_full = false;
+  sync_start_version = 0;
+}
+
+// Drop every sync session we are serving as a provider.
+void Monitor::sync_reset_provider()
+{
+  dout(10) << __func__ << dendl;
+  sync_providers.clear();
+}
+
+// Begin synchronizing our store from the monitor at `addrs`.
+//
+// addrs: address vector of the monitor to sync from (becomes sync_provider)
+// full:  true for a full store sync, in which case the local sync-target
+//        prefixes are cleared first; false to only catch up on recent
+//        paxos versions
+//
+// Must be called in STATE_PROBING or STATE_SYNCHRONIZING (asserted).  Leaves
+// the monitor in STATE_SYNCHRONIZING, arms the sync timeout and sends an
+// OP_GET_COOKIE_FULL or OP_GET_COOKIE_RECENT request to the provider.
+void Monitor::sync_start(entity_addrvec_t &addrs, bool full)
+{
+  dout(10) << __func__ << " " << addrs << (full ? " full" : " recent") << dendl;
+
+  ceph_assert(state == STATE_PROBING ||
+	 state == STATE_SYNCHRONIZING);
+  state = STATE_SYNCHRONIZING;
+
+  // make sure are not a provider for anyone!
+  sync_reset_provider();
+
+  sync_full = full;
+
+  if (sync_full) {
+    // stash key state, and mark that we are syncing
+    auto t(std::make_shared<MonitorDBStore::Transaction>());
+    sync_stash_critical_state(t);
+    t->put("mon_sync", "in_sync", 1);
+
+    // the floor only ever moves up: keep the larger of the stored floor and
+    // our current paxos version
+    sync_last_committed_floor = std::max(sync_last_committed_floor, paxos->get_version());
+    dout(10) << __func__ << " marking sync in progress, storing sync_last_committed_floor "
+	     << sync_last_committed_floor << dendl;
+    t->put("mon_sync", "last_committed_floor", sync_last_committed_floor);
+
+    store->apply_transaction(t);
+
+    // NOTE(review): the mon_sync_requester_kill_at asserts look like
+    // test-only kill points -- confirm
+    ceph_assert(g_conf()->mon_sync_requester_kill_at != 1);
+
+    // clear the underlying store
+    set<string> targets = get_sync_targets_names();
+    dout(10) << __func__ << " clearing prefixes " << targets << dendl;
+    store->clear(targets);
+
+    // make sure paxos knows it has been reset.  this prevents a
+    // bootstrap and then different probe reply order from possibly
+    // deciding a partial or no sync is needed.
+    paxos->init();
+
+    ceph_assert(g_conf()->mon_sync_requester_kill_at != 2);
+  }
+
+  // assume 'other' as the leader. We will update the leader once we receive
+  // a reply to the sync start.
+  sync_provider = addrs;
+
+  sync_reset_timeout();
+
+  MMonSync *m = new MMonSync(sync_full ? MMonSync::OP_GET_COOKIE_FULL : MMonSync::OP_GET_COOKIE_RECENT);
+  // a recent-only sync tells the provider where we currently are
+  if (!sync_full)
+    m->last_committed = paxos->get_version();
+  messenger->send_to_mon(m, sync_provider);
+}
+
+// Queue into transaction `t` a backup of the newest monmap we know of,
+// under mon_sync/latest_monmap.  The transaction is not applied here.
+void Monitor::sync_stash_critical_state(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << dendl;
+  bufferlist backup_monmap;
+  sync_obtain_latest_monmap(backup_monmap);
+  ceph_assert(backup_monmap.length() > 0);
+  t->put("mon_sync", "latest_monmap", backup_monmap);
+}
+
+// (Re)arm the sync timeout: cancel any pending event and schedule
+// sync_timeout() to run after mon_sync_timeout.
+void Monitor::sync_reset_timeout()
+{
+  dout(10) << __func__ << dendl;
+
+  if (sync_timeout_event) {
+    timer.cancel_event(sync_timeout_event);
+  }
+
+  auto *on_timeout = new C_MonContext{this, [this](int) {
+      sync_timeout();
+    }};
+  sync_timeout_event = timer.add_event_after(g_conf()->mon_sync_timeout,
+					      on_timeout);
+}
+
+// Complete a sync as the requester.
+//
+// last_committed: the paxos version the provider synced us through.
+//
+// For a full sync, first replays the paxos transactions from
+// sync_start_version through last_committed and records last_committed.
+// Then erases the mon_sync bookkeeping keys, re-initializes paxos and the
+// services from the store, and bootstraps.
+void Monitor::sync_finish(version_t last_committed)
+{
+  dout(10) << __func__ << " lc " << last_committed << " from " << sync_provider << dendl;
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 7);
+
+  if (sync_full) {
+    // finalize the paxos commits
+    auto tx(std::make_shared<MonitorDBStore::Transaction>());
+    paxos->read_and_prepare_transactions(tx, sync_start_version,
+					 last_committed);
+    tx->put(paxos->get_name(), "last_committed", last_committed);
+
+    // the next five lines form a single log entry: dout opens it, the JSON
+    // dump is written into *_dout, and dendl closes it
+    dout(30) << __func__ << " final tx dump:\n";
+    JSONFormatter f(true);
+    tx->dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    store->apply_transaction(tx);
+  }
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 8);
+
+  // the sync is over: drop the in-progress markers
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  t->erase("mon_sync", "in_sync");
+  t->erase("mon_sync", "force_sync");
+  t->erase("mon_sync", "last_committed_floor");
+  store->apply_transaction(t);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 9);
+
+  init_paxos();
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 10);
+
+  bootstrap();
+}
+
+// Dispatch an incoming MMonSync message to the matching provider-side or
+// requester-side handler.  An unrecognized op aborts the process.
+void Monitor::handle_sync(MonOpRequestRef op)
+{
+  auto msg = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *msg << dendl;
+
+  switch (msg->op) {
+  // --- requests served when we act as the sync provider ---
+  case MMonSync::OP_GET_COOKIE_FULL:
+  case MMonSync::OP_GET_COOKIE_RECENT:
+    handle_sync_get_cookie(op);
+    break;
+
+  case MMonSync::OP_GET_CHUNK:
+    handle_sync_get_chunk(op);
+    break;
+
+  // --- replies received when we act as the sync requester ---
+  case MMonSync::OP_COOKIE:
+    handle_sync_cookie(op);
+    break;
+
+  case MMonSync::OP_CHUNK:
+  case MMonSync::OP_LAST_CHUNK:
+    handle_sync_chunk(op);
+    break;
+
+  case MMonSync::OP_NO_COOKIE:
+    handle_sync_no_cookie(op);
+    break;
+
+  default:
+    dout(0) << __func__ << " unknown op " << msg->op << dendl;
+    ceph_abort_msg("unknown op");
+  }
+}
+
+// leader
+
+// Answer a sync request with OP_NO_COOKIE, echoing the cookie the requester
+// sent, on the connection the request arrived on.
+void Monitor::_sync_reply_no_cookie(MonOpRequestRef op)
+{
+  auto request = op->get_req<MMonSync>();
+  MMonSync *refusal = new MMonSync(MMonSync::OP_NO_COOKIE, request->cookie);
+  request->get_connection()->send_message(refusal);
+}
+
+// Provider side: open a sync session for a requester and hand it a cookie.
+//
+// Replies OP_NO_COOKIE if we are synchronizing ourselves.  Silently ignores
+// the request if the peer's connection lacks any of our required_features.
+// Otherwise records a SyncProvider entry under a new cookie (for a full
+// sync this includes a store synchronizer over all sync-target prefixes)
+// and replies OP_COOKIE with the version the sync will start from.
+void Monitor::handle_sync_get_cookie(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  if (is_synchronizing()) {
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  ceph_assert(g_conf()->mon_sync_provider_kill_at != 1);
+
+  // make sure they can understand us.
+  // (non-zero exactly when a bit set in required_features is clear in the
+  // peer's feature set)
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring peer mon." << m->get_source().num()
+	    << " has features " << std::hex
+	    << m->get_connection()->get_features()
+	    << " but we require " << required_features << std::dec << dendl;
+    return;
+  }
+
+  // make up a unique cookie.  include election epoch (which persists
+  // across restarts for the whole cluster) and a counter for this
+  // process instance.  there is no need to be unique *across*
+  // monitors, though.
+  uint64_t cookie = ((unsigned long long)elector.get_epoch() << 24) + ++sync_provider_count;
+  ceph_assert(sync_providers.count(cookie) == 0);
+
+  dout(10) << __func__ << " cookie " << cookie << " for " << m->get_source_inst() << dendl;
+
+  // operator[] creates the session entry
+  SyncProvider& sp = sync_providers[cookie];
+  sp.cookie = cookie;
+  sp.addrs = m->get_source_addrs();
+  sp.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2);
+
+  set<string> sync_targets;
+  if (m->op == MMonSync::OP_GET_COOKIE_FULL) {
+    // full scan
+    sync_targets = get_sync_targets_names();
+    sp.last_committed = paxos->get_version();
+    sp.synchronizer = store->get_synchronizer(sp.last_key, sync_targets);
+    sp.full = true;
+    dout(10) << __func__ << " will sync prefixes " << sync_targets << dendl;
+  } else {
+    // just catch up paxos
+    sp.last_committed = m->last_committed;
+  }
+  dout(10) << __func__ << " will sync from version " << sp.last_committed << dendl;
+
+  MMonSync *reply = new MMonSync(MMonSync::OP_COOKIE, sp.cookie);
+  reply->last_committed = sp.last_committed;
+  m->get_connection()->send_message(reply);
+}
+
+// Provider side: serve the next chunk of the sync session named by
+// m->cookie.
+//
+// Replies OP_NO_COOKIE if the cookie is unknown, or if the requester's
+// last_committed has fallen behind our first committed paxos version (the
+// session is dropped in that case).  Otherwise packs paxos versions after
+// sp.last_committed and, for full syncs, store keys from the synchronizer
+// into one transaction, bounded by mon_sync_max_payload_size and
+// mon_sync_max_payload_keys.  The transaction goes out as OP_CHUNK, or as
+// OP_LAST_CHUNK (and the session is erased) when nothing is left to send.
+void Monitor::handle_sync_get_chunk(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (sync_providers.count(m->cookie) == 0) {
+    dout(10) << __func__ << " no cookie " << m->cookie << dendl;
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  ceph_assert(g_conf()->mon_sync_provider_kill_at != 2);
+
+  SyncProvider& sp = sync_providers[m->cookie];
+  // each chunk request extends the session's lifetime
+  sp.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2);
+
+  if (sp.last_committed < paxos->get_first_committed() &&
+      paxos->get_first_committed() > 1) {
+    dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed
+	     << " < our fc " << paxos->get_first_committed() << dendl;
+    sync_providers.erase(m->cookie);
+    _sync_reply_no_cookie(op);
+    return;
+  }
+
+  MMonSync *reply = new MMonSync(MMonSync::OP_CHUNK, sp.cookie);
+  auto tx(std::make_shared<MonitorDBStore::Transaction>());
+
+  int bytes_left = g_conf()->mon_sync_max_payload_size;
+  int keys_left = g_conf()->mon_sync_max_payload_keys;
+  // paxos versions first, one key per version, until we catch up with our
+  // own version or exhaust the byte/key budget
+  while (sp.last_committed < paxos->get_version() &&
+	 bytes_left > 0 &&
+	 keys_left > 0) {
+    bufferlist bl;
+    sp.last_committed++;
+
+    int err = store->get(paxos->get_name(), sp.last_committed, bl);
+    ceph_assert(err == 0);
+
+    tx->put(paxos->get_name(), sp.last_committed, bl);
+    bytes_left -= bl.length();
+    --keys_left;
+    dout(20) << __func__ << " including paxos state " << sp.last_committed
+	     << dendl;
+  }
+  reply->last_committed = sp.last_committed;
+
+  // any remaining budget goes to store keys (full sync only)
+  if (sp.full && bytes_left > 0 && keys_left > 0) {
+    sp.synchronizer->get_chunk_tx(tx, bytes_left, keys_left);
+    sp.last_key = sp.synchronizer->get_last_key();
+    reply->last_key = sp.last_key;
+  }
+
+  if ((sp.full && sp.synchronizer->has_next_chunk()) ||
+      sp.last_committed < paxos->get_version()) {
+    dout(10) << __func__ << " chunk, through version " << sp.last_committed
+	     << " key " << sp.last_key << dendl;
+  } else {
+    dout(10) << __func__ << " last chunk, through version " << sp.last_committed
+	     << " key " << sp.last_key << dendl;
+    reply->op = MMonSync::OP_LAST_CHUNK;
+
+    ceph_assert(g_conf()->mon_sync_provider_kill_at != 3);
+
+    // clean up our local state
+    // (sp refers into sync_providers and must not be used after this)
+    sync_providers.erase(sp.cookie);
+  }
+
+  encode(*tx, reply->chunk_bl);
+
+  m->get_connection()->send_message(reply);
+}
+
+// requester
+
+// Requester side: the provider granted us a sync session.  Ignored if we
+// already hold a cookie or the message did not come from sync_provider.
+// Otherwise records the cookie and the starting version, re-arms the sync
+// timeout and requests the first chunk.
+void Monitor::handle_sync_cookie(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+  if (sync_cookie) {
+    dout(10) << __func__ << " already have a cookie, ignoring" << dendl;
+    return;
+  }
+  if (m->get_source_addrs() != sync_provider) {
+    dout(10) << __func__ << " source does not match, discarding" << dendl;
+    return;
+  }
+  sync_cookie = m->cookie;
+  sync_start_version = m->last_committed;
+
+  sync_reset_timeout();
+  sync_get_next_chunk();
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 3);
+}
+
+// Requester side: ask sync_provider for the next chunk of the session
+// identified by sync_cookie.  Sleeps first when
+// mon_inject_sync_get_chunk_delay (in seconds) is positive.
+void Monitor::sync_get_next_chunk()
+{
+  dout(20) << __func__ << " cookie " << sync_cookie << " provider " << sync_provider << dendl;
+
+  const auto inject_delay = g_conf()->mon_inject_sync_get_chunk_delay;
+  if (inject_delay > 0) {
+    dout(20) << __func__ << " injecting delay of " << inject_delay << dendl;
+    // seconds -> microseconds
+    usleep((long long)(inject_delay * 1000000.0));
+  }
+
+  MMonSync *request = new MMonSync(MMonSync::OP_GET_CHUNK, sync_cookie);
+  messenger->send_to_mon(request, sync_provider);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 4);
+}
+
+// Requester side: apply a chunk received from the provider.
+//
+// Chunks carrying a different cookie or coming from a different source than
+// sync_provider are discarded.  The encoded transaction is applied to the
+// store; for a recent-only sync the newly received paxos versions are also
+// replayed straight away.  OP_CHUNK triggers the next request, OP_LAST_CHUNK
+// finishes the sync.
+void Monitor::handle_sync_chunk(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSync>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (m->cookie != sync_cookie) {
+    dout(10) << __func__ << " cookie does not match, discarding" << dendl;
+    return;
+  }
+  if (m->get_source_addrs() != sync_provider) {
+    dout(10) << __func__ << " source does not match, discarding" << dendl;
+    return;
+  }
+
+  ceph_assert(state == STATE_SYNCHRONIZING);
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 5);
+
+  auto tx(std::make_shared<MonitorDBStore::Transaction>());
+  tx->append_from_encoded(m->chunk_bl);
+
+  // the next five lines form a single log entry: dout opens it, the JSON
+  // dump is written into *_dout, and dendl closes it
+  dout(30) << __func__ << " tx dump:\n";
+  JSONFormatter f(true);
+  tx->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  store->apply_transaction(tx);
+
+  ceph_assert(g_conf()->mon_sync_requester_kill_at != 6);
+
+  if (!sync_full) {
+    dout(10) << __func__ << " applying recent paxos transactions as we go" << dendl;
+    // note: this tx shadows the outer one, which has already been applied
+    auto tx(std::make_shared<MonitorDBStore::Transaction>());
+    paxos->read_and_prepare_transactions(tx, paxos->get_version() + 1,
+					 m->last_committed);
+    tx->put(paxos->get_name(), "last_committed", m->last_committed);
+
+    dout(30) << __func__ << " tx dump:\n";
+    JSONFormatter f(true);
+    tx->dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    store->apply_transaction(tx);
+    paxos->init();  // to refresh what we just wrote
+  }
+
+  if (m->op == MMonSync::OP_CHUNK) {
+    sync_reset_timeout();
+    sync_get_next_chunk();
+  } else if (m->op == MMonSync::OP_LAST_CHUNK) {
+    sync_finish(m->last_committed);
+  }
+}
+
+// Requester side: the provider answered OP_NO_COOKIE, so start over by
+// bootstrapping.  The message contents are not inspected.
+void Monitor::handle_sync_no_cookie(MonOpRequestRef op)
+{
+  dout(10) << __func__ << dendl;
+  bootstrap();
+}
+
+// Drop every provider-side sync session whose timeout has passed.
+void Monitor::sync_trim_providers()
+{
+  dout(20) << __func__ << dendl;
+
+  utime_t now = ceph_clock_now();
+  for (auto it = sync_providers.begin(); it != sync_providers.end(); ) {
+    const auto& session = it->second;
+    if (now > session.timeout) {
+      dout(10) << __func__ << " expiring cookie " << session.cookie
+	       << " for " << session.addrs << dendl;
+      // erase() hands back the iterator following the removed entry
+      it = sync_providers.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// ---------------------------------------------------
+// probe
+
+// Cancel the pending probe timeout event, if there is one.
+void Monitor::cancel_probe_timeout()
+{
+  if (!probe_timeout_event) {
+    dout(10) << "cancel_probe_timeout (none scheduled)" << dendl;
+    return;
+  }
+
+  dout(10) << "cancel_probe_timeout " << probe_timeout_event << dendl;
+  timer.cancel_event(probe_timeout_event);
+  probe_timeout_event = nullptr;
+}
+
+// Cancel any pending probe timeout and schedule a new one to fire after
+// mon_probe_timeout seconds.  If the timer declines the event,
+// probe_timeout_event is left null.
+void Monitor::reset_probe_timeout()
+{
+  cancel_probe_timeout();
+
+  probe_timeout_event = new C_MonContext{this, [this](int rc) {
+      probe_timeout(rc);
+    }};
+
+  const double delay = g_conf()->mon_probe_timeout;
+  if (!timer.add_event_after(delay, probe_timeout_event)) {
+    probe_timeout_event = nullptr;
+    return;
+  }
+
+  dout(10) << "reset_probe_timeout " << probe_timeout_event
+	   << " after " << delay << " seconds" << dendl;
+}
+
+// Timer callback armed by reset_probe_timeout(): probing did not finish in
+// time, so bootstrap again.  Asserts that we are probing or synchronizing
+// and that an event was actually pending.  `r` is unused.
+void Monitor::probe_timeout(int r)
+{
+  dout(4) << "probe_timeout " << probe_timeout_event << dendl;
+  ceph_assert(is_probing() || is_synchronizing());
+  ceph_assert(probe_timeout_event);
+  // the event has fired; just forget the pointer
+  probe_timeout_event = NULL;
+  bootstrap();
+}
+
+// Entry point for MMonProbe messages.  Probes carrying a different fsid
+// than our monmap are ignored; otherwise the message is dispatched by op.
+// Ops without a case below fall through the switch and are dropped.
+void Monitor::handle_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+  dout(10) << "handle_probe " << *m << dendl;
+
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    return;
+  }
+
+  switch (m->op) {
+  case MMonProbe::OP_PROBE:
+    handle_probe_probe(op);
+    break;
+
+  case MMonProbe::OP_REPLY:
+    handle_probe_reply(op);
+    break;
+
+  // a peer rejected our probe: log the release and features it reported,
+  // taken from the message it sent us
+  case MMonProbe::OP_MISSING_FEATURES:
+    derr << __func__ << " require release " << (int)m->mon_release << " > "
+	 << (int)ceph_release()
+	 << ", or missing features (have " << CEPH_FEATURES_ALL
+	 << ", required " << m->required_features
+	 << ", missing " << (m->required_features & ~CEPH_FEATURES_ALL) << ")"
+	 << dendl;
+    break;
+  }
+}
+
+// Answer an OP_PROBE from another monitor.
+//
+// - If the peer's release is older than the monmap's min_mon_release, or
+//   its connection lacks any of our required_features, reply
+//   OP_MISSING_FEATURES (carrying the features we require) and stop.
+// - If we are neither probing nor synchronizing and the peer's first
+//   committed paxos version is beyond what we hold, re-bootstrap instead of
+//   replying.
+// - Otherwise reply OP_REPLY with our name, quorum, leader, monmap and
+//   paxos version range; then remember the peer as an extra probe hint if
+//   it is not in the monmap, or start pinging it if it is.
+void Monitor::handle_probe_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+
+  dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
+	   << " features " << m->get_connection()->get_features() << dendl;
+  uint64_t missing = required_features & ~m->get_connection()->get_features();
+  if ((m->mon_release != ceph_release_t::unknown &&
+       m->mon_release < monmap->min_mon_release) ||
+      missing) {
+    dout(1) << " peer " << m->get_source_addr()
+	    << " release " << m->mon_release
+	    << " < min_mon_release " << monmap->min_mon_release
+	    << ", or missing features " << missing << dendl;
+    MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_MISSING_FEATURES,
+				 name, has_ever_joined, monmap->min_mon_release);
+    // The receiver reports required_features from the message it gets (see
+    // the OP_MISSING_FEATURES case in handle_probe()), so the value has to
+    // be set on the outgoing reply, not on the request we were handed.
+    r->required_features = required_features;
+    m->get_connection()->send_message(r);
+    return;
+  }
+
+  if (!is_probing() && !is_synchronizing()) {
+    // If the probing mon is way ahead of us, we need to re-bootstrap.
+    // Normally we capture this case when we initially bootstrap, but
+    // it is possible we pass those checks (we overlap with
+    // quorum-to-be) but fail to join a quorum before it moves past
+    // us.  We need to be kicked back to bootstrap so we can
+    // synchronize, not keep calling elections.
+    if (paxos->get_version() + 1 < m->paxos_first_version) {
+      dout(1) << " peer " << m->get_source_addr() << " has first_committed "
+	      << "ahead of us, re-bootstrapping" << dendl;
+      bootstrap();
+      return;
+    }
+  }
+
+  MMonProbe *r;
+  r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined,
+		    ceph_release());
+  r->name = name;
+  r->quorum = quorum;
+  r->leader = leader;
+  // encode the monmap with the feature set of the peer's connection
+  monmap->encode(r->monmap_bl, m->get_connection()->get_features());
+  r->paxos_first_version = paxos->get_first_committed();
+  r->paxos_last_version = paxos->get_version();
+  m->get_connection()->send_message(r);
+
+  // did we discover a peer here?
+  if (!monmap->contains(m->get_source_addr())) {
+    dout(1) << " adding peer " << m->get_source_addrs()
+	    << " to list of hints" << dendl;
+    extra_probe_peers.insert(m->get_source_addrs());
+  } else {
+    elector.begin_peer_ping(monmap->get_rank(m->get_source_addr()));
+  }
+}
+
+/**
+ * Process an MMonProbe reply received from a peer monitor.
+ *
+ * Replies are only acted on while probing or electing.  In order, this:
+ *  - adopts the peer's monmap when it differs from ours and is either newer
+ *    or comes from a mon that has joined a quorum while we never have, and
+ *    then bootstraps again;
+ *  - with an epoch-0 monmap, renames a "noname-" placeholder entry for the
+ *    peer and fills in a blank address for a named initial mon;
+ *  - while probing, starts a store sync if the peer's paxos versions are too
+ *    far ahead of ours;
+ *  - finally starts an election, asks the leader to add us via MMonJoin, or
+ *    records the peer in outside_quorum.
+ *
+ * @param op  request carrying the MMonProbe reply
+ */
+void Monitor::handle_probe_reply(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+  dout(10) << "handle_probe_reply " << m->get_source_inst()
+           << " " << *m << dendl;
+  dout(10) << " monmap is " << *monmap << dendl;
+
+  // discover name and addrs during probing or electing states.
+  if (!is_probing() && !is_electing()) {
+    return;
+  }
+
+  // newer map, or they've joined a quorum and we haven't?
+  bufferlist mybl;
+  monmap->encode(mybl, m->get_connection()->get_features());
+  // make sure it's actually different; the checks below err toward
+  // taking the other guy's map, which could cause us to loop.
+  if (!mybl.contents_equal(m->monmap_bl)) {
+    // Decode into a stack object: no manual delete on either path, and
+    // nothing is leaked if decode() throws.
+    MonMap newmap;
+    newmap.decode(m->monmap_bl);
+    if (m->has_ever_joined && (newmap.get_epoch() > monmap->get_epoch() ||
+                               !has_ever_joined)) {
+      dout(10) << " got newer/committed monmap epoch " << newmap.get_epoch()
+               << ", mine was " << monmap->get_epoch() << dendl;
+      // must be computed before our own monmap is overwritten below
+      int epoch_diff = newmap.get_epoch() - monmap->get_epoch();
+      monmap->decode(m->monmap_bl);
+      dout(20) << "has_ever_joined: " << has_ever_joined << dendl;
+      if (epoch_diff == 1 && has_ever_joined) {
+        notify_new_monmap(false);
+      } else {
+        notify_new_monmap(false, false);
+        elector.notify_clear_peer_state();
+      }
+      bootstrap();
+      return;
+    }
+  }
+
+  // rename peer?
+  string peer_name = monmap->get_name(m->get_source_addr());
+  if (monmap->get_epoch() == 0 && peer_name.compare(0, 7, "noname-") == 0) {
+    dout(10) << " renaming peer " << m->get_source_addr() << " "
+             << peer_name << " -> " << m->name << " in my monmap"
+             << dendl;
+    monmap->rename(peer_name, m->name);
+
+    if (is_electing()) {
+      bootstrap();
+      return;
+    }
+  } else if (peer_name.size()) {
+    dout(10) << " peer name is " << peer_name << dendl;
+  } else {
+    dout(10) << " peer " << m->get_source_addr() << " not in map" << dendl;
+  }
+
+  // new initial peer?
+  if (monmap->get_epoch() == 0 &&
+      monmap->contains(m->name) &&
+      monmap->get_addrs(m->name).front().is_blank_ip()) {
+    dout(1) << " learned initial mon " << m->name
+            << " addrs " << m->get_source_addrs() << dendl;
+    monmap->set_addrvec(m->name, m->get_source_addrs());
+
+    bootstrap();
+    return;
+  }
+
+  // end discover phase
+  if (!is_probing()) {
+    return;
+  }
+
+  ceph_assert(paxos != NULL);
+
+  if (is_synchronizing()) {
+    dout(10) << " currently syncing" << dendl;
+    return;
+  }
+
+  entity_addrvec_t other = m->get_source_addrs();
+
+  if (m->paxos_last_version < sync_last_committed_floor) {
+    dout(10) << " peer paxos versions [" << m->paxos_first_version
+             << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
+             << sync_last_committed_floor << ", ignoring"
+             << dendl;
+  } else {
+    // our last version is older than the peer's first one
+    if (paxos->get_version() < m->paxos_first_version &&
+        m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
+      dout(10) << " peer paxos first versions [" << m->paxos_first_version
+               << "," << m->paxos_last_version << "]"
+               << " vs my version " << paxos->get_version()
+               << " (too far ahead)"
+               << dendl;
+      cancel_probe_timeout();
+      sync_start(other, true);
+      return;
+    }
+    // we trail the peer by more than paxos_max_join_drift versions
+    if (paxos->get_version() + g_conf()->paxos_max_join_drift < m->paxos_last_version) {
+      dout(10) << " peer paxos last version " << m->paxos_last_version
+               << " vs my version " << paxos->get_version()
+               << " (too far ahead)"
+               << dendl;
+      cancel_probe_timeout();
+      sync_start(other, false);
+      return;
+    }
+  }
+
+  // did the existing cluster complete upgrade to luminous?
+  // NOTE(review): both refusals below call exit(0), i.e. the process
+  // reports success to its parent even though it is refusing to start.
+  if (osdmon()->osdmap.get_epoch()) {
+    if (osdmon()->osdmap.require_osd_release < ceph_release_t::luminous) {
+      derr << __func__ << " existing cluster has not completed upgrade to"
+           << " luminous; 'ceph osd require_osd_release luminous' before"
+           << " upgrading" << dendl;
+      exit(0);
+    }
+    if (!osdmon()->osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) ||
+        !osdmon()->osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
+      derr << __func__ << " existing cluster has not completed a full luminous"
+           << " scrub to purge legacy snapdir objects; please scrub before"
+           << " upgrading beyond luminous." << dendl;
+      exit(0);
+    }
+  }
+
+  // is there an existing quorum?
+  if (m->quorum.size()) {
+    dout(10) << " existing quorum " << m->quorum << dendl;
+
+    dout(10) << " peer paxos version " << m->paxos_last_version
+             << " vs my version " << paxos->get_version()
+             << " (ok)"
+             << dendl;
+    bool in_map = false;
+    const auto my_info = monmap->mon_info.find(name);
+    const map<string,string> *map_crush_loc{nullptr};
+    if (my_info != monmap->mon_info.end()) {
+      in_map = true;
+      map_crush_loc = &my_info->second.crush_loc;
+    }
+    // map_crush_loc is only dereferenced when in_map is true (short-circuit)
+    if (in_map &&
+        !monmap->get_addrs(name).front().is_blank_ip() &&
+        (!need_set_crush_loc || (*map_crush_loc == crush_loc))) {
+      // i'm part of the cluster; just initiate a new election
+      start_election();
+    } else {
+      dout(10) << " ready to join, but i'm not in the monmap/"
+        "my addr is blank/location is wrong, trying to join" << dendl;
+      send_mon_message(new MMonJoin(monmap->fsid, name,
+                                    messenger->get_myaddrs(), crush_loc,
+                                    need_set_crush_loc),
+                       m->leader);
+    }
+  } else {
+    if (monmap->contains(m->name)) {
+      dout(10) << " mon." << m->name << " is outside the quorum" << dendl;
+      outside_quorum.insert(m->name);
+    } else {
+      dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
+      return;
+    }
+
+    unsigned need = monmap->min_quorum_size();
+    dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl;
+    if (outside_quorum.size() >= need) {
+      if (outside_quorum.count(name)) {
+        dout(10) << " that's enough to form a new quorum, calling election" << dendl;
+        start_election();
+      } else {
+        dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl;
+      }
+    } else {
+      dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
+    }
+  }
+}
+
+/**
+ * Switch this monitor into STATE_ELECTING without calling an election
+ * itself (unlike start_election(), this never invokes
+ * elector.call_election()).
+ */
+void Monitor::join_election()
+{
+  dout(10) << "join_election" << dendl;
+
+  // wait_for_paxos_write() and _reset() run before the state changes
+  wait_for_paxos_write();
+  _reset();
+  state = STATE_ELECTING;
+  logger->inc(l_mon_num_elections);
+}
+
+/**
+ * Switch to STATE_ELECTING and ask the elector to call a new election.
+ *
+ * Besides the state change this bumps the l_mon_num_elections and
+ * l_mon_election_call counters and writes a cluster-log info entry.
+ */
+void Monitor::start_election()
+{
+  dout(10) << __func__ << dendl;
+  wait_for_paxos_write();
+  _reset();
+  state = STATE_ELECTING;
+
+  // count the election itself, and the fact that we were the caller
+  logger->inc(l_mon_num_elections);
+  logger->inc(l_mon_election_call);
+
+  clog->info() << "mon." << name << " calling monitor election";
+  elector.call_election();
+}
+
+/**
+ * Declare victory in an election in which we are the only participant.
+ *
+ * Asserts that our rank in the monmap is 0, then calls win_election() with
+ * a quorum containing only that rank, all connection features, the locally
+ * supported mon features and our own release.
+ */
+void Monitor::win_standalone_election()
+{
+  dout(1) << __func__ << dendl;
+
+  // bump election epoch, in case the previous epoch included other
+  // monitors; we need to be able to make the distinction.
+  elector.declare_standalone_victory();
+
+  rank = monmap->get_rank(name);
+  ceph_assert(rank == 0);
+
+  // a quorum of one: just us
+  const set<int> solo_quorum{rank};
+
+  // our own metadata, keyed by rank 0
+  map<int,Metadata> my_metadata;
+  collect_metadata(&my_metadata[0]);
+
+  win_election(elector.get_epoch(),
+               solo_quorum,
+               CEPH_FEATURES_ALL,
+               ceph::features::mon::get_supported(),
+               ceph_release(),
+               my_metadata);
+}
+
+/**
+ * @return reference to the leader_since timestamp.
+ *
+ * Asserts that the monitor is currently in STATE_LEADER.
+ */
+const utime_t& Monitor::get_leader_since() const
+{
+  ceph_assert(state == STATE_LEADER);
+  const utime_t& since = leader_since;
+  return since;
+}
+
+/**
+ * @return the election epoch as reported by the elector.
+ */
+epoch_t Monitor::get_epoch()
+{
+  const epoch_t election_epoch = elector.get_epoch();
+  return election_epoch;
+}
+
+/**
+ * Tell every paxos service that the election has finished.
+ *
+ * Must be called as leader or peon.  When we are the leader, monmon() is
+ * skipped: win_election() notifies it explicitly before calling this.
+ */
+void Monitor::_finish_svc_election()
+{
+  ceph_assert(state == STATE_LEADER || state == STATE_PEON);
+
+  for (auto& service : paxos_service) {
+    // we already called election_finished() on monmon(); avoid calling twice
+    if (state != STATE_LEADER || service.get() != monmon()) {
+      service->election_finished();
+    }
+  }
+}
+/**
+ * Become leader after winning an election.
+ *
+ * Records the new quorum and its features, initializes paxos as leader,
+ * notifies the paxos services (monmon first), stores the merged monitor
+ * metadata under "last_metadata" in the pending paxos transaction, and then
+ * runs finish_election().  When the monmap has more than one mon and a
+ * non-zero epoch it also starts timechecks, the health tick and scrub
+ * events, and arranges for the health status to be written to the cluster
+ * log once healthmon is active.
+ *
+ * @param epoch            election epoch (only logged here)
+ * @param active           ranks forming the new quorum
+ * @param features         stored as quorum_con_features
+ * @param mon_features     stored as quorum_mon_features
+ * @param min_mon_release  stored as quorum_min_mon_release
+ * @param metadata         per-rank metadata; stored as pending_metadata
+ */
+void Monitor::win_election(epoch_t epoch, const set<int>& active, uint64_t features,
+                           const mon_feature_t& mon_features,
+			   ceph_release_t min_mon_release,
+			   const map<int,Metadata>& metadata)
+{
+  dout(10) << __func__ << " epoch " << epoch << " quorum " << active
+	   << " features " << features
+           << " mon_features " << mon_features
+	   << " min_mon_release " << min_mon_release
+           << dendl;
+  ceph_assert(is_electing());
+  state = STATE_LEADER;
+  leader_since = ceph_clock_now();
+  quorum_since = mono_clock::now();
+  leader = rank;
+  quorum = active;
+  quorum_con_features = features;
+  quorum_mon_features = mon_features;
+  quorum_min_mon_release = min_mon_release;
+  pending_metadata = metadata;
+  outside_quorum.clear();
+
+  clog->info() << "mon." << name << " is new leader, mons " << get_quorum_names()
+      << " in quorum (ranks " << quorum << ")";
+
+  set_leader_commands(get_local_commands(mon_features));
+
+  paxos->leader_init();
+  // NOTE: tell monmap monitor first.  This is important for the
+  // bootstrap case to ensure that the very first paxos proposal
+  // codifies the monmap.  Otherwise any manner of chaos can ensue
+  // when monitors are calling elections or participating in a paxos
+  // round without agreeing on who the participants are.
+  monmon()->election_finished();
+  _finish_svc_election();  // skips monmon() while we are leader
+
+  logger->inc(l_mon_election_win);
+
+  // inject new metadata in first transaction.
+  {
+    // include previous metadata for missing mons (that aren't part of
+    // the current quorum).
+    map<int,Metadata> m = metadata;
+    for (unsigned rank = 0; rank < monmap->size(); ++rank) {  // NB: this local shadows the member 'rank'
+      if (m.count(rank) == 0 &&
+	  mon_metadata.count(rank)) {
+	m[rank] = mon_metadata[rank];
+      }
+    }
+
+    // FIXME: This is a bit sloppy because we aren't guaranteed to submit
+    // a new transaction immediately after the election finishes.  We should
+    // do that anyway for other reasons, though.
+    MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+    bufferlist bl;
+    encode(m, bl);
+    t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
+  }
+
+  finish_election();
+  if (monmap->size() > 1 &&
+      monmap->get_epoch() > 0) {
+    timecheck_start();
+    health_tick_start();
+
+    // Freshen the health status before doing health_to_clog in case
+    // our just-completed election changed the health
+    healthmon()->wait_for_active_ctx(new LambdaContext([this](int r){
+      dout(20) << "healthmon now active" << dendl;
+      healthmon()->tick();
+      if (healthmon()->is_proposing()) {
+        dout(20) << __func__ << " healthmon proposing, waiting" << dendl;  // NB: inside a lambda __func__ names the lambda's call operator, not win_election
+        healthmon()->wait_for_finished_proposal(nullptr, new C_MonContext{this,
+              [this](int r){
+                ceph_assert(ceph_mutex_is_locked_by_me(lock));
+                do_health_to_clog_interval();
+              }});
+
+      } else {
+        do_health_to_clog_interval();
+      }
+    }));
+
+    scrub_event_start();
+  }
+}
+/**
+ * Become a peon after losing an election.
+ *
+ * Records the leader, the quorum and its features, initializes paxos as
+ * peon, notifies all paxos services and then runs finish_election().
+ *
+ * @param epoch            election epoch (only logged here)
+ * @param q                ranks forming the quorum; copied into 'quorum'
+ * @param l                rank of the elected leader
+ * @param features         stored as quorum_con_features
+ * @param mon_features     stored as quorum_mon_features
+ * @param min_mon_release  stored as quorum_min_mon_release
+ */
+void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
+                            uint64_t features,
+                            const mon_feature_t& mon_features,
+			    ceph_release_t min_mon_release)
+{
+  state = STATE_PEON;
+  leader_since = utime_t();  // default-constructed: we are not the leader
+  quorum_since = mono_clock::now();
+  leader = l;
+  quorum = q;
+  outside_quorum.clear();
+  quorum_con_features = features;
+  quorum_mon_features = mon_features;
+  quorum_min_mon_release = min_mon_release;
+  dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader
+	   << " quorum is " << quorum << " features are " << quorum_con_features
+           << " mon_features are " << quorum_mon_features
+	   << " min_mon_release " << min_mon_release
+           << dendl;
+
+  paxos->peon_init();
+  _finish_svc_election();  // as a peon this notifies every service, monmon() included
+
+  logger->inc(l_mon_election_lose);
+
+  finish_election();
+}
+
+namespace {
+/**
+ * Join the names of all entries in Compressor::compression_algorithms into
+ * a single ", "-separated string, in the container's iteration order.
+ *
+ * @return the joined names; empty when the container has no entries
+ */
+std::string collect_compression_algorithms()
+{
+  ostringstream os;
+  bool first = true;
+  // Bind by const reference: a by-value structured binding copies the whole
+  // (name, value) pair on every iteration.
+  for (const auto& [name, key] : Compressor::compression_algorithms) {
+    std::ignore = key;  // only the name is emitted
+    if (!first) {
+      os << ", ";
+    }
+    first = false;
+    os << name;
+  }
+  return os.str();
+}
+}
+
+/**
+ * Fill @p m with this monitor's metadata.
+ *
+ * Adds the output of collect_sys_info(), our messenger addresses
+ * ("addrs"), the compression algorithm list ("compression_algorithms") and
+ * whatever get_device_metadata() reports for the devices backing the store.
+ * Errors reported by get_device_metadata() are only logged.
+ *
+ * @param m  metadata map to populate
+ */
+void Monitor::collect_metadata(Metadata *m)
+{
+  collect_sys_info(m, g_ceph_context);
+
+  Metadata& md = *m;
+  md["addrs"] = stringify(messenger->get_myaddrs());
+  md["compression_algorithms"] = collect_compression_algorithms();
+
+  // infer storage device
+  string store_dev = store->get_devname();
+  set<string> raw_devs;
+  get_raw_devices(store_dev, &raw_devs);
+
+  map<string,string> dev_errors;
+  get_device_metadata(raw_devs, m, &dev_errors);
+  for (auto& failure : dev_errors) {
+    dout(1) << __func__ << " " << failure.first << ": " << failure.second << dendl;
+  }
+}
+
+/**
+ * Common tail of win_election() and lose_election().
+ *
+ * Applies quorum and monmap features to the compat set, completes waiters
+ * blocked on a quorum, resends routed requests, refreshes the loggers and
+ * passes the monmap size and our rank to the auth monitor.  If the monmap
+ * entry for our addresses has a different name, or (when
+ * need_set_crush_loc is set) a different crush location, an MMonJoin is
+ * sent to the leader and the stretch-mode election work is skipped.
+ */
+void Monitor::finish_election()
+{
+  apply_quorum_to_compatset_features();
+  apply_monmap_to_compatset_features();
+  timecheck_finish();
+  exited_quorum = utime_t();
+  finish_contexts(g_ceph_context, waitfor_quorum);
+  finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+  resend_routed_requests();
+  update_logger();
+  register_cluster_logger();
+
+  // enable authentication
+  {
+    std::lock_guard l(auth_lock);
+    authmon()->_set_mon_num_rank(monmap->size(), rank);
+  }
+
+  // am i named and located properly?
+  string cur_name = monmap->get_name(messenger->get_myaddrs());
+  const auto my_infop = monmap->mon_info.find(cur_name);
+  // If cur_name is not a key of mon_info, find() returns end(), which must
+  // not be dereferenced; fall back to an empty location in that case.
+  const map<string,string> no_crush_loc;
+  const map<string,string>& map_crush_loc =
+    (my_infop != monmap->mon_info.end()) ? my_infop->second.crush_loc
+                                         : no_crush_loc;
+
+  if (cur_name != name ||
+      (need_set_crush_loc && map_crush_loc != crush_loc)) {
+    dout(10) << " renaming/moving myself from " << cur_name << "/"
+             << map_crush_loc <<" -> " << name << "/" << crush_loc << dendl;
+    send_mon_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddrs(),
+                                  crush_loc, need_set_crush_loc),
+                     leader);
+    return;
+  }
+  do_stretch_mode_election_work();
+}
+
+/**
+ * Adopt @p new_features as our compat set if it differs from the current
+ * one: store it, write it to the store in its own transaction, and
+ * recompute the quorum feature requirements.
+ *
+ * @param new_features  candidate compat set
+ */
+void Monitor::_apply_compatset_features(CompatSet &new_features)
+{
+  if (new_features.compare(features) == 0) {
+    // identical to what we already have; nothing to persist
+    return;
+  }
+
+  CompatSet newly_enabled = features.unsupported(new_features);
+  dout(1) << __func__ << " enabling new quorum features: " << newly_enabled << dendl;
+  features = new_features;
+
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  write_features(txn);
+  store->apply_transaction(txn);
+
+  calc_quorum_requirements();
+}
+
+/**
+ * Extend the compat set with the incompat features implied by the quorum.
+ *
+ * The erasure-code features are always added; OSDMAP_ENC is added only if
+ * quorum_con_features has CEPH_FEATURE_OSDMAP_ENC set.
+ */
+void Monitor::apply_quorum_to_compatset_features()
+{
+  dout(5) << __func__ << dendl;
+
+  CompatSet updated(features);
+  auto& incompat = updated.incompat;
+
+  // unconditional features
+  incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES);
+  incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2);
+  incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3);
+
+  // only when every quorum member speaks the new osdmap encoding
+  if (quorum_con_features & CEPH_FEATURE_OSDMAP_ENC) {
+    incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC);
+  }
+
+  _apply_compatset_features(updated);
+}
+/**
+ * Extend the compat set with one incompat feature per release feature
+ * (KRAKEN through PACIFIC) that the monmap lists as required.
+ *
+ * For each such feature this asserts that it is in the persistent mon
+ * feature set and that quorum_con_features has the matching SERVER_* bit,
+ * then hands the result to _apply_compatset_features().
+ */
+void Monitor::apply_monmap_to_compatset_features()
+{
+  CompatSet new_features(features);
+  mon_feature_t monmap_features = monmap->get_required_features();
+
+  /* persistent monmap features may go into the compatset.
+   * optional monmap features may not - why?
+   *   because optional monmap features may be set/unset by the admin,
+   *   and possibly by other means that haven't yet been thought out,
+   *   so we can't make the monitor enforce them on start - because they
+   *   may go away.
+   *   this, of course, does not invalidate setting a compatset feature
+   *   for an optional feature - as long as you make sure to clean it up
+   *   once you unset it.
+   */
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_KRAKEN)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_KRAKEN));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_KRAKEN));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN);
+  }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_LUMINOUS));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS);
+  }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_MIMIC)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_MIMIC));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_MIMIC));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_MIMIC);
+  }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_NAUTILUS));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_NAUTILUS));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS);
+  }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_OCTOPUS)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_OCTOPUS));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_OCTOPUS));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS);
+  }
+  if (monmap_features.contains_all(ceph::features::mon::FEATURE_PACIFIC)) {
+    ceph_assert(ceph::features::mon::get_persistent().contains_all(
+           ceph::features::mon::FEATURE_PACIFIC));
+    // this feature should only ever be set if the quorum supports it.
+    ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_PACIFIC));
+    new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_PACIFIC);
+  }
+
+  dout(5) << __func__ << dendl;
+  _apply_compatset_features(new_features);  // no-op when nothing was added
+}
+
+/**
+ * Recompute required_features from scratch.
+ *
+ * The mask is the union of the connection feature bits implied by the
+ * incompat entries of our compat set and by the monmap's required
+ * features.
+ */
+void Monitor::calc_quorum_requirements()
+{
+  required_features = 0;
+
+  // compatset
+  auto& incompat = features.incompat;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC))
+    required_features |= CEPH_FEATURE_OSDMAP_ENC;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_KRAKEN))
+    required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS))
+    required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_MIMIC))
+    required_features |= CEPH_FEATUREMASK_SERVER_MIMIC;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS))
+    required_features |= (CEPH_FEATUREMASK_SERVER_NAUTILUS |
+                          CEPH_FEATUREMASK_CEPHX_V2);
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS))
+    required_features |= CEPH_FEATUREMASK_SERVER_OCTOPUS;
+  if (incompat.contains(CEPH_MON_FEATURE_INCOMPAT_PACIFIC))
+    required_features |= CEPH_FEATUREMASK_SERVER_PACIFIC;
+
+  // monmap
+  // NOTE(review): unlike the compatset section above, there is no
+  // OCTOPUS/PACIFIC check here -- confirm whether that is intentional.
+  mon_feature_t monmap_required = monmap->get_required_features();
+  if (monmap_required.contains_all(ceph::features::mon::FEATURE_KRAKEN))
+    required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN;
+  if (monmap_required.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
+    required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS;
+  if (monmap_required.contains_all(ceph::features::mon::FEATURE_MIMIC))
+    required_features |= CEPH_FEATUREMASK_SERVER_MIMIC;
+  if (monmap_required.contains_all(ceph::features::mon::FEATURE_NAUTILUS))
+    required_features |= (CEPH_FEATUREMASK_SERVER_NAUTILUS |
+                          CEPH_FEATUREMASK_CEPHX_V2);
+
+  dout(10) << __func__ << " required_features " << required_features << dendl;
+}
+
+/**
+ * Add our own session feature map, plus the stored feature maps of every
+ * other quorum member, to @p fm.
+ *
+ * @param fm  accumulator; existing contents are kept and added to
+ */
+void Monitor::get_combined_feature_map(FeatureMap *fm)
+{
+  FeatureMap& combined = *fm;
+  combined += session_map.feature_map;
+
+  for (auto member : quorum) {
+    if (member == rank) {
+      continue;  // our own sessions were added above
+    }
+    combined += quorum_feature_map[member];
+  }
+}
+
+/**
+ * Write the "force_sync" marker (under the "mon_sync" prefix) to the store,
+ * together with the stashed critical state, and report the result through
+ * the formatter.
+ *
+ * @param f  formatter receiving a "sync_force" object with "ret" and "msg"
+ */
+void Monitor::sync_force(Formatter *f)
+{
+  // stash critical state and the marker in one transaction
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  sync_stash_critical_state(txn);
+  txn->put("mon_sync", "force_sync", 1);
+  store->apply_transaction(txn);
+
+  f->open_object_section("sync_force");
+  f->dump_int("ret", 0);
+  f->dump_stream("msg") << "forcing store sync the next time the monitor starts";
+  f->close_section(); // sync_force
+}
+
+/**
+ * Dump the quorum status (election epoch, quorum ranks and names, leader
+ * name, quorum age, features and the monmap) and flush it to @p ss.
+ *
+ * @param f   formatter to use; when null a temporary JSONFormatter is used
+ * @param ss  stream the formatted output is flushed to
+ */
+void Monitor::_quorum_status(Formatter *f, ostream& ss)
+{
+  // lousy/lazy hack: default to json if no formatter has been defined.
+  // The fallback is owned by a unique_ptr so it is released on every exit
+  // path, including exceptions thrown while dumping.
+  std::unique_ptr<Formatter> fallback;
+  if (!f) {
+    fallback = std::make_unique<JSONFormatter>();
+    f = fallback.get();
+  }
+  f->open_object_section("quorum_status");
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (const int member : quorum)
+    f->dump_int("mon", member);
+  f->close_section(); // quorum
+
+  list<string> quorum_names = get_quorum_names();
+  f->open_array_section("quorum_names");
+  for (const auto& mon_name : quorum_names)
+    f->dump_string("mon", mon_name);
+  f->close_section(); // quorum_names
+
+  f->dump_string("quorum_leader_name", quorum.empty() ? string() : monmap->get_name(leader));
+
+  if (!quorum.empty()) {
+    f->dump_int(
+      "quorum_age",
+      quorum_age());
+  }
+
+  f->open_object_section("features");
+  f->dump_stream("quorum_con") << quorum_con_features;
+  quorum_mon_features.dump(f, "quorum_mon");
+  f->close_section();
+
+  f->open_object_section("monmap");
+  monmap->dump(f);
+  f->close_section(); // monmap
+
+  f->close_section(); // quorum_status
+  f->flush(ss);
+}
+
+/**
+ * Dump this monitor's status into @p f as a "mon_status" object: identity
+ * and state, quorum, features, probe/sync bookkeeping, the monmap, the
+ * session feature map and the stretch-mode flag.
+ *
+ * @param f  formatter to write to; must not be null
+ */
+void Monitor::get_mon_status(Formatter *f)
+{
+  f->open_object_section("mon_status");
+  f->dump_string("name", name);
+  f->dump_int("rank", rank);
+  f->dump_string("state", get_state_name());
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (const int member : quorum) {
+    f->dump_int("mon", member);
+  }
+  f->close_section(); // quorum
+
+  if (!quorum.empty()) {
+    f->dump_int("quorum_age", quorum_age());
+  }
+
+  f->open_object_section("features");
+  f->dump_stream("required_con") << required_features;
+  mon_feature_t required_mon = get_required_mon_features();
+  required_mon.dump(f, "required_mon");
+  f->dump_stream("quorum_con") << quorum_con_features;
+  quorum_mon_features.dump(f, "quorum_mon");
+  f->close_section(); // features
+
+  f->open_array_section("outside_quorum");
+  for (const auto& mon_name : outside_quorum)
+    f->dump_string("mon", mon_name);
+  f->close_section(); // outside_quorum
+
+  f->open_array_section("extra_probe_peers");
+  for (const auto& peer_addrs : extra_probe_peers) {
+    f->dump_object("peer", peer_addrs);
+  }
+  f->close_section(); // extra_probe_peers
+
+  f->open_array_section("sync_provider");
+  for (const auto& entry : sync_providers) {
+    const auto& provider = entry.second;
+    f->dump_unsigned("cookie", provider.cookie);
+    f->dump_object("addrs", provider.addrs);
+    f->dump_stream("timeout") << provider.timeout;
+    f->dump_unsigned("last_committed", provider.last_committed);
+    f->dump_stream("last_key") << provider.last_key;
+  }
+  f->close_section(); // sync_provider
+
+  // only present while we are syncing from another mon
+  if (is_synchronizing()) {
+    f->open_object_section("sync");
+    f->dump_stream("sync_provider") << sync_provider;
+    f->dump_unsigned("sync_cookie", sync_cookie);
+    f->dump_unsigned("sync_start_version", sync_start_version);
+    f->close_section(); // sync
+  }
+
+  if (g_conf()->mon_sync_provider_kill_at > 0)
+    f->dump_int("provider_kill_at", g_conf()->mon_sync_provider_kill_at);
+  if (g_conf()->mon_sync_requester_kill_at > 0)
+    f->dump_int("requester_kill_at", g_conf()->mon_sync_requester_kill_at);
+
+  f->open_object_section("monmap");
+  monmap->dump(f);
+  f->close_section(); // monmap
+
+  f->dump_object("feature_map", session_map.feature_map);
+  f->dump_bool("stretch_mode", stretch_mode_engaged);
+  f->close_section(); // mon_status
+}
+
+
+// health status to clog
+
+/**
+ * (Re)arm the health tick timer.
+ *
+ * Does nothing when mon_health_to_clog is off or the tick interval is not
+ * positive.  Otherwise any existing tick event is cancelled and a new one
+ * is scheduled; when it fires with a non-negative result it re-arms itself
+ * by calling this function again.
+ */
+void Monitor::health_tick_start()
+{
+  const bool disabled =
+    !cct->_conf->mon_health_to_clog ||
+    cct->_conf->mon_health_to_clog_tick_interval <= 0;
+  if (disabled) {
+    return;
+  }
+
+  dout(15) << "health_tick_start" << dendl;
+
+  health_tick_stop();
+  health_tick_event = timer.add_event_after(
+    cct->_conf->mon_health_to_clog_tick_interval,
+    new C_MonContext{this, [this](int r) {
+        // a negative result means the event did not complete normally
+        if (r >= 0) {
+          health_tick_start();
+        }
+      }});
+}
+
+/**
+ * Cancel the pending health tick event, if there is one.
+ */
+void Monitor::health_tick_stop()
+{
+  dout(15) << __func__ << dendl;
+
+  if (!health_tick_event) {
+    return;
+  }
+  timer.cancel_event(health_tick_event);
+  health_tick_event = nullptr;
+}
+
+/**
+ * Compute the next wall-clock time that is a whole multiple of
+ * mon_health_to_clog_interval seconds since the epoch and lies strictly
+ * after the current second.
+ *
+ * The interval is used as a divisor, so it must be non-zero; the caller in
+ * this file (health_interval_start()) only calls this when it is positive.
+ *
+ * @return the computed time point
+ */
+ceph::real_clock::time_point Monitor::health_interval_calc_next_update()
+{
+  using std::chrono::seconds;
+
+  auto now = ceph::real_clock::now();
+  auto since_epoch = std::chrono::duration_cast<seconds>(now.time_since_epoch());
+
+  // seconds already elapsed in the current interval, and seconds left in it
+  int elapsed = since_epoch.count() % cct->_conf->mon_health_to_clog_interval;
+  int remaining = cct->_conf->mon_health_to_clog_interval - elapsed;
+  auto next = since_epoch + seconds(remaining);
+
+  dout(20) << __func__
+    << " now: " << now << ","
+    << " next: " << next << ","
+    << " interval: " << cct->_conf->mon_health_to_clog_interval
+    << dendl;
+
+  return ceph::real_clock::time_point{next};
+}
+
+/**
+ * (Re)schedule the periodic health-to-clog event.
+ *
+ * Does nothing (beyond logging) when mon_health_to_clog is off or the
+ * interval is not positive.  Otherwise the existing event is cancelled and
+ * a new one is scheduled at health_interval_calc_next_update(); if the
+ * timer refuses the event, health_interval_event is cleared.
+ */
+void Monitor::health_interval_start()
+{
+  dout(15) << "health_interval_start" << dendl;
+
+  const bool disabled =
+    !cct->_conf->mon_health_to_clog ||
+    cct->_conf->mon_health_to_clog_interval <= 0;
+  if (disabled) {
+    return;
+  }
+
+  health_interval_stop();
+  auto when = health_interval_calc_next_update();
+  health_interval_event = new C_MonContext{this, [this](int r) {
+      // a negative result means the event did not complete normally
+      if (r >= 0) {
+        do_health_to_clog_interval();
+      }
+    }};
+  if (!timer.add_event_at(when, health_interval_event)) {
+    // not scheduled: forget the pointer
+    health_interval_event = nullptr;
+  }
+}
+
+/**
+ * Cancel the pending health-to-clog interval event, if any, and clear the
+ * stored pointer.
+ */
+void Monitor::health_interval_stop()
+{
+  dout(15) << "health_interval_stop" << dendl;
+
+  if (health_interval_event != nullptr) {
+    timer.cancel_event(health_interval_event);
+  }
+  // cleared unconditionally, whether or not an event was pending
+  health_interval_event = nullptr;
+}
+
+/**
+ * Stop both health timers (tick and interval) and reset the cached health
+ * status.
+ */
+void Monitor::health_events_cleanup()
+{
+  // timers first, then the cache they feed
+  health_tick_stop();
+  health_interval_stop();
+
+  health_status_cache.reset();
+}
+
+/**
+ * React to changes of the health-to-clog configuration options.
+ *
+ * @param changed  names of the options that changed.  Handled keys are
+ *                 "mon_health_to_clog", "mon_health_to_clog_interval" and
+ *                 "mon_health_to_clog_tick_interval".
+ */
+void Monitor::health_to_clog_update_conf(const std::set<std::string> &changed)
+{
+  dout(20) << __func__ << dendl;
+
+  const auto touched = [&changed](const char *option) {
+    return changed.count(option) != 0;
+  };
+
+  if (touched("mon_health_to_clog")) {
+    if (!cct->_conf->mon_health_to_clog) {
+      // switched off: stop both timers, reset the cache, and ignore any
+      // interval changes that arrived in the same update
+      health_events_cleanup();
+      return;
+    }
+    // switched on: arm whichever timers are not already pending
+    if (!health_tick_event) {
+      health_tick_start();
+    }
+    if (!health_interval_event) {
+      health_interval_start();
+    }
+  }
+
+  if (touched("mon_health_to_clog_interval")) {
+    if (cct->_conf->mon_health_to_clog_interval <= 0) {
+      health_interval_stop();
+    } else {
+      health_interval_start();
+    }
+  }
+
+  if (touched("mon_health_to_clog_tick_interval")) {
+    if (cct->_conf->mon_health_to_clog_tick_interval <= 0) {
+      health_tick_stop();
+    } else {
+      health_tick_start();
+    }
+  }
+}
+
+/**
+ * Periodic health-to-clog handler: force a health entry into the cluster
+ * log and schedule the next interval event.
+ */
+void Monitor::do_health_to_clog_interval()
+{
+  // outputting to clog may have been disabled in the conf
+  // since we were scheduled.
+  const bool disabled =
+    !cct->_conf->mon_health_to_clog ||
+    cct->_conf->mon_health_to_clog_interval <= 0;
+  if (disabled) {
+    return;
+  }
+
+  dout(10) << "do_health_to_clog_interval" << dendl;
+
+  // always log on the interval, even if nothing changed, then re-arm
+  do_health_to_clog(true);
+  health_interval_start();
+}
+
+/**
+ * Write the current health status to the cluster log and remember it in
+ * health_status_cache.
+ *
+ * @param force  when false, nothing is logged if both the summary and the
+ *               overall level match the cached values
+ */
+void Monitor::do_health_to_clog(bool force)
+{
+  // outputting to clog may have been disabled in the conf
+  // since we were scheduled.
+  const bool disabled =
+    !cct->_conf->mon_health_to_clog ||
+    cct->_conf->mon_health_to_clog_interval <= 0;
+  if (disabled) {
+    return;
+  }
+
+  dout(10) << __func__ << (force ? " (force)" : "") << dendl;
+
+  string summary;
+  health_status_t level = healthmon()->get_health_status(false, nullptr, &summary);
+
+  const bool same_summary = (summary == health_status_cache.summary);
+  if (!force && same_summary && level == health_status_cache.overall) {
+    return;
+  }
+
+  // detail is only logged when enabled, the summary changed, and we are
+  // not HEALTH_OK; otherwise the one-line summary is logged
+  const bool want_detail =
+    g_conf()->mon_health_detail_to_clog && !same_summary && level != HEALTH_OK;
+  if (want_detail) {
+    string details;
+    level = healthmon()->get_health_status(true, nullptr, &details);
+    clog->health(level) << "Health detail: " << details;
+  } else {
+    clog->health(level) << "overall " << summary;
+  }
+
+  health_status_cache.summary = summary;
+  health_status_cache.overall = level;
+}
+
+/**
+ * Write cluster-log entries describing how a set of health checks changed.
+ *
+ * New checks are logged as "Health check failed", checks whose summary or
+ * severity changed as "Health check update" (rate-limited per check by
+ * mon_health_log_update_period when the severity is unchanged), and checks
+ * that disappeared as cleared.  If @p updated is empty, @p previous was
+ * not, and no other paxos service reports any checks, "Cluster is now
+ * healthy" is logged.  Does nothing when mon_health_to_clog is off.
+ *
+ * @param updated   the checks after the change
+ * @param previous  the checks before the change
+ * @param t         unused here (see FIXME below)
+ */
+void Monitor::log_health(
+  const health_check_map_t& updated,
+  const health_check_map_t& previous,
+  MonitorDBStore::TransactionRef t)
+{
+  if (!g_conf()->mon_health_to_clog) {
+    return;
+  }
+
+  const utime_t now = ceph_clock_now();
+
+  // FIXME: log atomically as part of @t instead of using clog.
+  dout(10) << __func__ << " updated " << updated.checks.size()
+           << " previous " << previous.checks.size()
+           << dendl;
+  const auto min_log_period = g_conf().get_val<int64_t>(
+      "mon_health_log_update_period");
+  for (auto& p : updated.checks) {
+    auto q = previous.checks.find(p.first);
+    bool logged = false;
+    if (q == previous.checks.end()) {
+      // new
+      clog->health(p.second.severity)
+        << "Health check failed: " << p.second.summary << " ("
+        << p.first << ")";
+      logged = true;
+    } else if (p.second.summary != q->second.summary ||
+               p.second.severity != q->second.severity) {
+      auto status_iter = health_check_log_times.find(p.first);
+      if (status_iter != health_check_log_times.end() &&
+          p.second.severity == q->second.severity &&
+          now - status_iter->second.updated_at < min_log_period) {
+        // We already logged this recently and the severity is unchanged,
+        // so skip emitting an update of the summary string.
+        // We'll get an update out of tick() later if the check
+        // is still failing.
+        continue;
+      }
+
+      // summary or severity changed (ignore detail changes at this level)
+      clog->health(p.second.severity)
+        << "Health check update: " << p.second.summary << " ("
+        << p.first << ")";
+      logged = true;
+    }
+    // Record the time at which we last logged, so that we can check this
+    // when considering whether/when to print update messages.
+    if (logged) {
+      health_check_log_times.insert_or_assign(
+        p.first,
+        HealthCheckLogStatus(p.second.severity, p.second.summary, now));
+    }
+  }
+  for (auto& p : previous.checks) {
+    if (!updated.checks.count(p.first)) {
+      // cleared
+      if (p.first == "DEGRADED_OBJECTS") {
+        clog->info() << "All degraded objects recovered";
+      } else if (p.first == "OSD_FLAGS") {
+        clog->info() << "OSD flags cleared";
+      } else {
+        clog->info() << "Health check cleared: " << p.first << " (was: "
+                     << p.second.summary << ")";
+      }
+
+      // erase-by-key is a no-op when the key is absent
+      health_check_log_times.erase(p.first);
+    }
+  }
+
+  if (previous.checks.size() && updated.checks.size() == 0) {
+    // We might be going into a fully healthy state, check
+    // other subsystems
+    bool any_checks = false;
+    for (auto& svc : paxos_service) {
+      if (&(svc->get_health_checks()) == &(previous)) {
+        // Ignore the ones we're clearing right now
+        continue;
+      }
+
+      if (svc->get_health_checks().checks.size() > 0) {
+        any_checks = true;
+        break;
+      }
+    }
+    if (!any_checks) {
+      clog->info() << "Cluster is now healthy";
+    }
+  }
+}
+
+/**
+ * Push freshly collected metadata to the mgr client if the running
+ * "ceph_version_short" is newer than the one recorded for our rank in
+ * mon_metadata.
+ */
+void Monitor::update_pending_metadata()
+{
+  Metadata metadata;
+  collect_metadata(&metadata);
+  // NB: operator[] creates empty entries when our rank or the key is missing
+  const std::string current_version = mon_metadata[rank]["ceph_version_short"];
+  const std::string pending_version = metadata["ceph_version_short"];
+
+  // Compare the leading dot-separated numeric components numerically.  A
+  // plain string comparison orders "16.2.9" after "16.2.11" and would
+  // therefore miss that upgrade.  Whatever follows the common numeric
+  // prefix (e.g. a "-<n>-g<sha>" suffix) is compared as plain text.
+  const auto version_less = [](const std::string& a, const std::string& b) {
+    const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };
+    size_t i = 0;
+    size_t j = 0;
+    while (i < a.size() && j < b.size() && is_digit(a[i]) && is_digit(b[j])) {
+      unsigned long long x = 0;
+      unsigned long long y = 0;
+      while (i < a.size() && is_digit(a[i]))
+        x = x * 10 + static_cast<unsigned>(a[i++] - '0');
+      while (j < b.size() && is_digit(b[j]))
+        y = y * 10 + static_cast<unsigned>(b[j++] - '0');
+      if (x != y)
+        return x < y;
+      // step over the separator only when both sides have one
+      if (i < a.size() && j < b.size() && a[i] == '.' && b[j] == '.') {
+        ++i;
+        ++j;
+      } else {
+        break;
+      }
+    }
+    return a.compare(i, std::string::npos, b, j, std::string::npos) < 0;
+  };
+
+  if (version_less(current_version, pending_version)) {
+    mgr_client.update_daemon_metadata("mon", name, metadata);
+  }
+}
+
+/**
+ * Produce the cluster status report, either structured or as plain text.
+ *
+ * @param ss       stream that receives the plain-text report; it is only
+ *                 written when @p f is null
+ * @param f        formatter for structured output, or nullptr for plain text
+ * @param session  requesting session; its allowed filesystem names, when
+ *                 non-empty, filter the FSMap copy that is reported.  The
+ *                 pointer is dereferenced unconditionally.
+ */
+void Monitor::get_cluster_status(stringstream &ss, Formatter *f,
+                                 MonSession *session)
+{
+  if (f)
+    f->open_object_section("status");
+
+  const auto& allowed_fs = session->get_allowed_fs_names();
+
+  if (f) {
+    f->dump_stream("fsid") << monmap->get_fsid();
+    healthmon()->get_health_status(false, f, nullptr);
+    f->dump_unsigned("election_epoch", get_epoch());
+
+    f->open_array_section("quorum");
+    for (const int member : quorum)
+      f->dump_int("rank", member);
+    f->close_section();
+
+    f->open_array_section("quorum_names");
+    for (const int member : quorum)
+      f->dump_string("id", monmap->get_name(member));
+    f->close_section();
+
+    f->dump_int("quorum_age", quorum_age());
+
+    f->open_object_section("monmap");
+    monmap->dump_summary(f);
+    f->close_section();
+
+    f->open_object_section("osdmap");
+    osdmon()->osdmap.print_summary(f, cout, string(12, ' '));
+    f->close_section();
+
+    f->open_object_section("pgmap");
+    mgrstatmon()->print_summary(f, nullptr);
+    f->close_section();
+
+    f->open_object_section("fsmap");
+    // Work on a private copy so the filter never touches the live map.
+    FSMap visible_fsmap = mdsmon()->get_fsmap();
+    if (!allowed_fs.empty()) {
+      visible_fsmap.filter(allowed_fs);
+    }
+    const FSMap &fsmap_view = visible_fsmap;
+    fsmap_view.print_summary(f, nullptr);
+    f->close_section();
+
+    f->open_object_section("mgrmap");
+    mgrmon()->get_map().print_summary(f, nullptr);
+    f->close_section();
+
+    f->dump_object("servicemap", mgrstatmon()->get_service_map());
+
+    f->open_object_section("progress_events");
+    for (auto& event : mgrstatmon()->get_progress_events()) {
+      f->dump_object(event.first.c_str(), event.second);
+    }
+    f->close_section();
+
+    // closes the "status" section opened at the top
+    f->close_section();
+  } else {
+    ss << "  cluster:\n";
+    ss << "    id:     " << monmap->get_fsid() << "\n";
+
+    string health;
+    healthmon()->get_health_status(false, nullptr, &health,
+                                   "\n            ", "\n            ");
+    ss << "    health: " << health << "\n";
+
+    ss << "\n \n  services:\n";
+    {
+      auto& service_map = mgrstatmon()->get_service_map();
+
+      // Widest service name (never less than 3) sets the column the
+      // values are aligned to.
+      size_t maxlen = 3;
+      for (auto& p : service_map.services) {
+        maxlen = std::max(maxlen, p.first.size());
+      }
+      const string pad(maxlen - 3, ' ');
+
+      const auto quorum_names = get_quorum_names();
+      const auto mon_count = monmap->mon_info.size();
+      const auto mnow = ceph::mono_clock::now();
+      ss << "    mon: " << pad << mon_count << " daemons, quorum "
+         << quorum_names << " (age " << timespan_str(mnow - quorum_since) << ")";
+      if (quorum_names.size() != mon_count) {
+        std::list<std::string> out_of_q;
+        for (size_t i = 0; i < monmap->ranks.size(); ++i) {
+          if (quorum.count(i) == 0) {
+            out_of_q.push_back(monmap->ranks[i]);
+          }
+        }
+        ss << ", out of quorum: " << joinify(out_of_q.begin(),
+                                             out_of_q.end(), std::string(", "));
+      }
+      ss << "\n";
+
+      if (mgrmon()->in_use()) {
+        ss << "    mgr: " << pad;
+        mgrmon()->get_map().print_summary(nullptr, &ss);
+        ss << "\n";
+      }
+
+      FSMap visible_fsmap = mdsmon()->get_fsmap();
+      if (!allowed_fs.empty()) {
+        visible_fsmap.filter(allowed_fs);
+      }
+      const FSMap &fsmap_view = visible_fsmap;
+
+      if (fsmap_view.filesystem_count() > 0 && mdsmon()->should_print_status()) {
+        ss << "    mds: " << pad;
+        fsmap_view.print_daemon_summary(ss);
+        ss << "\n";
+      }
+
+      ss << "    osd: " << pad;
+      osdmon()->osdmap.print_summary(nullptr, ss, string(maxlen + 6, ' '));
+      ss << "\n";
+
+      for (auto& p : service_map.services) {
+        const std::string &service = p.first;
+        // filter out normal ceph entity types
+        if (ServiceMap::is_normal_ceph_entity(service)) {
+          continue;
+        }
+        ss << "    " << p.first << ": " << string(maxlen - p.first.size(), ' ')
+           << p.second.get_summary() << "\n";
+      }
+    }
+
+    {
+      auto& service_map = mgrstatmon()->get_service_map();
+      const bool tasks_running =
+        std::any_of(service_map.services.begin(),
+                    service_map.services.end(),
+                    [](auto& service) {
+                      return service.second.has_running_tasks();
+                    });
+      if (tasks_running) {
+        ss << "\n \n  task status:\n";
+        for (auto& [name, service] : service_map.services) {
+          ss << service.get_task_summary(name);
+        }
+      }
+    }
+
+    ss << "\n \n  data:\n";
+    mdsmon()->print_fs_summary(ss);
+    mgrstatmon()->print_summary(nullptr, &ss);
+
+    auto& pem = mgrstatmon()->get_progress_events();
+    if (!pem.empty()) {
+      ss << "\n \n  progress:\n";
+      for (auto& event : pem) {
+        if (event.second.add_to_ceph_s) {
+          ss << "    " << event.second.message << "\n";
+        }
+      }
+    }
+    ss << "\n ";
+  }
+}
+
+/**
+ * Flatten a parsed command map into string key/value pairs.
+ *
+ * The "prefix" entry is skipped.  A "caps" entry that reads back as a string
+ * vector of even length is expanded pairwise into "caps_<name>" -> <value>
+ * entries.  Every other entry -- including a "caps" entry that does not
+ * satisfy that -- is stored under its own key via cmd_vartype_stringify().
+ *
+ * @param cmdmap         parsed command arguments
+ * @param param_str_map  output map; entries are added or overwritten
+ */
+void Monitor::_generate_command_map(cmdmap_t& cmdmap,
+                                    map<string,string> &param_str_map)
+{
+  for (const auto& [key, value] : cmdmap) {
+    if (key == "prefix") {
+      continue;
+    }
+
+    if (key == "caps") {
+      vector<string> caps;
+      const bool pairable =
+        cmd_getval(cmdmap, "caps", caps) && caps.size() % 2 == 0;
+      if (pairable) {
+        for (unsigned i = 0; i < caps.size(); i += 2) {
+          param_str_map[string("caps_") + caps[i]] = caps[i + 1];
+        }
+        continue;
+      }
+      // not a usable name/value list: fall through to the generic path
+    }
+
+    param_str_map[key] = cmd_vartype_stringify(value);
+  }
+}
+
+/**
+ * Find the first command in @p cmds whose cmdstring starts with
+ * @p cmd_prefix.
+ *
+ * @return pointer to the matching element inside @p cmds, or nullptr when
+ *         no command string begins with the prefix
+ */
+const MonCommand *Monitor::_get_moncommand(
+  const string &cmd_prefix,
+  const vector<MonCommand>& cmds)
+{
+  const auto hit = std::find_if(
+    cmds.begin(), cmds.end(),
+    [&cmd_prefix](const MonCommand& candidate) {
+      return candidate.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0;
+    });
+  if (hit == cmds.end()) {
+    return nullptr;
+  }
+  return &*hit;
+}
+
+/**
+ * Check whether session @p s holds the caps needed to run @p this_cmd.
+ *
+ * The r/w/x requirements are read from @p this_cmd and passed, together with
+ * the module, prefix, flattened parameters and the peer socket address, to
+ * the session's caps.  @p cmdmap is not used.  Both @p s and @p this_cmd are
+ * dereferenced unconditionally.
+ *
+ * @return true when the session is capable, false otherwise
+ */
+bool Monitor::_allowed_command(MonSession *s, const string &module,
+                               const string &prefix, const cmdmap_t& cmdmap,
+                               const map<string,string>& param_str_map,
+                               const MonCommand *this_cmd) {
+  const bool needs_r = this_cmd->requires_perm('r');
+  const bool needs_w = this_cmd->requires_perm('w');
+  const bool needs_x = this_cmd->requires_perm('x');
+
+  const bool capable = s->caps.is_capable(
+    g_ceph_context,
+    s->entity_name,
+    module, prefix, param_str_map,
+    needs_r, needs_w, needs_x,
+    s->get_peer_socket_addr());
+
+  const char *negation = capable ? "" : "not ";
+  dout(10) << __func__ << " " << negation << "capable" << dendl;
+  return capable;
+}
+
+/**
+ * Dump the description of every command in @p commands into one
+ * "command_descriptions" object and flush the formatter into @p rdata.
+ *
+ * Each command becomes a section named "cmdNNN", where NNN is its index in
+ * @p commands, zero-padded to three digits.
+ *
+ * @param commands  commands to describe, in output order
+ * @param f         formatter to write to; must not be null
+ * @param features  feature bits passed through to dump_cmddesc_to_json()
+ * @param rdata     receives the flushed formatter output; must not be null
+ */
+void Monitor::format_command_descriptions(const std::vector<MonCommand> &commands,
+                                          Formatter *f,
+                                          uint64_t features,
+                                          bufferlist *rdata)
+{
+  f->open_object_section("command_descriptions");
+  int index = 0;
+  for (const auto &cmd : commands) {
+    ostringstream section;
+    section << "cmd" << setfill('0') << std::setw(3) << index;
+    ++index;
+
+    // Flags are deliberately routed through an `unsigned`, as before.
+    const unsigned flags = cmd.flags;
+    dump_cmddesc_to_json(f, features, section.str(),
+                         cmd.cmdstring, cmd.helpstring, cmd.module,
+                         cmd.req_perms, flags);
+  }
+  f->close_section();	// command_descriptions
+
+  f->flush(*rdata);
+}
+
+/**
+ * @return true when either the cluster-required or the service-required
+ *         auth methods include CEPH_AUTH_CEPHX or CEPH_AUTH_GSS
+ */
+bool Monitor::is_keyring_required()
+{
+  // Evaluation order matches a flat ||-chain: cluster then service for
+  // CEPHX, then cluster then service for GSS.
+  const auto required_anywhere = [this](int auth_method) {
+    return auth_cluster_required.is_supported_auth(auth_method) ||
+           auth_service_required.is_supported_auth(auth_method);
+  };
+  return required_anywhere(CEPH_AUTH_CEPHX) ||
+         required_anywhere(CEPH_AUTH_GSS);
+}
+
+/**
+ * Completion context for a command that was proxied to the mgr.
+ *
+ * `outbl` and `outs` are filled in by whoever runs the command; finish()
+ * subtracts `size` from Monitor::mgr_proxy_bytes and sends the result back
+ * to the requester, both under the monitor lock.
+ */
+struct C_MgrProxyCommand : public Context {
+  Monitor *mon;
+  MonOpRequestRef op;
+  uint64_t size;      // byte count to give back to mon->mgr_proxy_bytes
+  bufferlist outbl;   // command output data
+  string outs;        // command status string
+
+  C_MgrProxyCommand(Monitor *mon, MonOpRequestRef op, uint64_t s)
+    : mon(mon), op(op), size(s) { }
+
+  void finish(int r) override {
+    std::lock_guard locker(mon->lock);
+    mon->mgr_proxy_bytes -= size;
+    mon->reply_command(op, r, outs, outbl, 0);
+  }
+};
+
+/**
+ * Handle an MCommand ("tell") message addressed to this monitor.
+ *
+ * The request is rejected with an error reply when the fsid does not match,
+ * the command cannot be parsed, it has no prefix, the matching local command
+ * is obsolete (or deprecated while mon_debug_deprecated_as_obsolete is set),
+ * or the session lacks r/w/x caps on "mon" for it.  A message without a
+ * session is dropped silently.  Everything else is queued on the admin
+ * socket.
+ *
+ * NOTE(review): _generate_command_map() is called without a try/catch here;
+ * confirm whether it can throw bad_cmd_get on this path.
+ */
+void Monitor::handle_tell_command(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_command());
+  MCommand *m = static_cast<MCommand*>(op->get_req());
+
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    reply_tell_command(op, -EACCES, "wrong fsid");
+    return;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    dout(5) << __func__ << " dropping stray message " << *m << dendl;
+    return;
+  }
+
+  cmdmap_t cmdmap;
+  {
+    stringstream parse_err;
+    if (!cmdmap_from_json(m->cmd, &cmdmap, parse_err)) {
+      reply_tell_command(op, -EINVAL, parse_err.str());
+      return;
+    }
+  }
+
+  map<string,string> param_str_map;
+  _generate_command_map(cmdmap, param_str_map);
+
+  string prefix;
+  if (!cmd_getval(cmdmap, "prefix", prefix)) {
+    reply_tell_command(op, -EINVAL, "no prefix");
+    return;
+  }
+
+  // A prefix unknown to the local command table is not rejected here.
+  const MonCommand *known =
+    _get_moncommand(prefix, get_local_commands(quorum_mon_features));
+  if (known &&
+      (known->is_obsolete() ||
+       (cct->_conf->mon_debug_deprecated_as_obsolete &&
+        known->is_deprecated()))) {
+    reply_tell_command(op, -ENOTSUP,
+                       "command is obsolete; "
+                       "please check usage and/or man page");
+    return;
+  }
+
+  // see if command is allowed
+  const bool permitted = session->caps.is_capable(
+    g_ceph_context,
+    session->entity_name,
+    "mon", prefix, param_str_map,
+    true, true, true,
+    session->get_peer_socket_addr());
+  if (!permitted) {
+    reply_tell_command(op, -EACCES, "insufficient caps");
+    return;
+  }
+
+  // pass it to asok
+  cct->get_admin_socket()->queue_tell_command(m);
+}
+
+/**
+ * Entry point for MMonCommand messages.
+ *
+ * In order, the function:
+ *  - validates the request (matching fsid, a session, a non-empty command
+ *    that parses and carries a non-empty "prefix");
+ *  - answers "get_command_descriptions" directly;
+ *  - resolves the prefix against the leader's, the mgr's and the local
+ *    command tables, forwarding to the leader when this monitor is not the
+ *    leader and cannot serve the command itself;
+ *  - rejects obsolete commands and forwarded noforward commands;
+ *  - checks the session's caps and writes an audit log entry;
+ *  - hands the command to the admin socket, the mgr, or the owning
+ *    PaxosService, or handles it inline further down.
+ *
+ * Inline handlers set `r`, `rs` and `rdata` and fall through to `out`, where
+ * the reply is sent unless the message came from another monitor.
+ */
+void Monitor::handle_command(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_command());
+  auto m = op->get_req<MMonCommand>();
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid
+	    << dendl;
+    reply_command(op, -EPERM, "wrong fsid", 0);
+    return;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    dout(5) << __func__ << " dropping stray message " << *m << dendl;
+    return;
+  }
+
+  if (m->cmd.empty()) {
+    reply_command(op, -EINVAL, "no command specified", 0);
+    return;
+  }
+
+  string prefix;
+  vector<string> fullcmd;
+  cmdmap_t cmdmap;
+  stringstream ss, ds;
+  bufferlist rdata;
+  string rs;
+  // defaults used when no inline handler below recognizes the prefix
+  int r = -EINVAL;
+  rs = "unrecognized command";
+
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    // ss has reason for failure
+    r = -EINVAL;
+    rs = ss.str();
+    if (!m->get_source().is_mon())  // don't reply to mon->mon commands
+      reply_command(op, r, rs, 0);
+    return;
+  }
+
+  // check return value. If no prefix parameter provided,
+  // return value will be false, then return error info.
+  if (!cmd_getval(cmdmap, "prefix", prefix)) {
+    reply_command(op, -EINVAL, "command prefix not found", 0);
+    return;
+  }
+
+  // check prefix is empty
+  if (prefix.empty()) {
+    reply_command(op, -EINVAL, "command prefix must not be empty", 0);
+    return;
+  }
+
+  if (prefix == "get_command_descriptions") {
+    bufferlist rdata;
+    Formatter *f = Formatter::create("json");
+
+    std::vector<MonCommand> commands = static_cast<MgrMonitor*>(
+        paxos_service[PAXOS_MGR].get())->get_command_descs();
+
+    for (auto& c : leader_mon_commands) {
+      commands.push_back(c);
+    }
+
+    auto features = m->get_connection()->get_features();
+    format_command_descriptions(commands, f, features, &rdata);
+    delete f;
+    reply_command(op, 0, "", rdata, 0);
+    return;
+  }
+
+  dout(0) << "handle_command " << *m << dendl;
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  // several handlers below test `f` for null and fall back to plain text
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  get_str_vec(prefix, fullcmd);
+
+  // make sure fullcmd is not empty.
+  // invalid prefix will cause empty vector fullcmd.
+  // such as, prefix=";,,;"
+  if (fullcmd.empty()) {
+    reply_command(op, -EINVAL, "command requires a prefix to be valid", 0);
+    return;
+  }
+
+  std::string_view module = fullcmd[0];
+
+  // validate command is in leader map
+
+  const MonCommand *leader_cmd;
+  const auto& mgr_cmds = mgrmon()->get_command_descs();
+  const MonCommand *mgr_cmd = nullptr;
+  if (!mgr_cmds.empty()) {
+    mgr_cmd = _get_moncommand(prefix, mgr_cmds);
+  }
+  leader_cmd = _get_moncommand(prefix, leader_mon_commands);
+  if (!leader_cmd) {
+    leader_cmd = mgr_cmd;
+    if (!leader_cmd) {
+      reply_command(op, -EINVAL, "command not known", 0);
+      return;
+    }
+  }
+  // validate command is in our map & matches, or forward if it is allowed
+  const MonCommand *mon_cmd = _get_moncommand(
+    prefix,
+    get_local_commands(quorum_mon_features));
+  if (!mon_cmd) {
+    mon_cmd = mgr_cmd;
+  }
+  if (!is_leader()) {
+    if (!mon_cmd) {
+      if (leader_cmd->is_noforward()) {
+	reply_command(op, -EINVAL,
+		      "command not locally supported and not allowed to forward",
+		      0);
+	return;
+      }
+      dout(10) << "Command not locally supported, forwarding request "
+	       << m << dendl;
+      forward_request_leader(op);
+      return;
+    } else if (!mon_cmd->is_compat(leader_cmd)) {
+      if (mon_cmd->is_noforward()) {
+	reply_command(op, -EINVAL,
+		      "command not compatible with leader and not allowed to forward",
+		      0);
+	return;
+      }
+      dout(10) << "Command not compatible with leader, forwarding request "
+	       << m << dendl;
+      forward_request_leader(op);
+      return;
+    }
+  }
+
+  // NOTE(review): mon_cmd is dereferenced without a null check from here on;
+  // this presumes the leader always finds the command locally -- confirm.
+  if (mon_cmd->is_obsolete() ||
+      (cct->_conf->mon_debug_deprecated_as_obsolete
+       && mon_cmd->is_deprecated())) {
+    reply_command(op, -ENOTSUP,
+                  "command is obsolete; please check usage and/or man page",
+                  0);
+    return;
+  }
+
+  if (session->proxy_con && mon_cmd->is_noforward()) {
+    dout(10) << "Got forward for noforward command " << m << dendl;
+    reply_command(op, -EINVAL, "forward for noforward command", rdata, 0);
+    return;
+  }
+
+  /* what we perceive as being the service the command falls under */
+  string service(mon_cmd->module);
+
+  dout(25) << __func__ << " prefix='" << prefix
+           << "' module='" << module
+           << "' service='" << service << "'" << dendl;
+
+  bool cmd_is_rw =
+    (mon_cmd->requires_perm('w') || mon_cmd->requires_perm('x'));
+
+  // validate user's permissions for requested command
+  map<string,string> param_str_map;
+
+  // Catch bad_cmd_get exception if _generate_command_map() throws it
+  try {
+    _generate_command_map(cmdmap, param_str_map);
+  } catch (const bad_cmd_get& e) {
+    // Stop here once the error has been sent: carrying on would evaluate
+    // caps against a partially built param_str_map and reply to the same
+    // op a second time.
+    reply_command(op, -EINVAL, e.what(), 0);
+    return;
+  }
+
+  if (!_allowed_command(session, service, prefix, cmdmap,
+                        param_str_map, mon_cmd)) {
+    dout(1) << __func__ << " access denied" << dendl;
+    // "config set" and "config-key set" are kept out of the audit log
+    if (prefix != "config set" && prefix != "config-key set")
+      (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+        << "from='" << session->name << " " << session->addrs << "' "
+        << "entity='" << session->entity_name << "' "
+        << "cmd=" << m->cmd << ":  access denied";
+    reply_command(op, -EACCES, "access denied", 0);
+    return;
+  }
+
+  if (prefix != "config set" && prefix != "config-key set")
+    (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+        << "from='" << session->name << " " << session->addrs << "' "
+        << "entity='" << session->entity_name << "' "
+        << "cmd=" << m->cmd << ": dispatch";
+
+  // compat kludge for legacy clients trying to tell commands that are
+  // new.  see bottom of MonCommands.h.  we need to handle both (1)
+  // pre-octopus clients and (2) octopus clients with a mix of pre-octopus
+  // and octopus mons.
+  if ((!HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS) ||
+       monmap->min_mon_release < ceph_release_t::octopus) &&
+      (prefix == "injectargs" ||
+       prefix == "smart" ||
+       prefix == "mon_status" ||
+       prefix == "heap")) {
+    if (m->get_connection()->get_messenger() == 0) {
+      // Prior to octopus, monitors might forward these messages
+      // around. that was broken at baseline, and if we try to process
+      // this message now, it will assert out when we try to send a
+      // message in reply from the asok/tell worker (see
+      // AnonConnection).  Just reply with an error.
+      dout(5) << __func__ << " failing forwarded command from a (presumably) "
+	      << "pre-octopus peer" << dendl;
+      reply_command(
+	op, -EBUSY,
+	"failing forwarded tell command in mixed-version mon cluster", 0);
+      return;
+    }
+    dout(5) << __func__ << " passing command to tell/asok" << dendl;
+    cct->get_admin_socket()->queue_tell_command(m);
+    return;
+  }
+
+  if (mon_cmd->is_mgr()) {
+    // Proxy to the mgr, bounded by a byte budget derived from
+    // mon_client_bytes * mon_mgr_proxy_client_bytes_ratio.
+    const auto& hdr = m->get_header();
+    uint64_t size = hdr.front_len + hdr.middle_len + hdr.data_len;
+    uint64_t max = g_conf().get_val<Option::size_t>("mon_client_bytes")
+                 * g_conf().get_val<double>("mon_mgr_proxy_client_bytes_ratio");
+    if (mgr_proxy_bytes + size > max) {
+      dout(10) << __func__ << " current mgr proxy bytes " << mgr_proxy_bytes
+	       << " + " << size << " > max " << max << dendl;
+      reply_command(op, -EAGAIN, "hit limit on proxied mgr commands", rdata, 0);
+      return;
+    }
+    mgr_proxy_bytes += size;
+    dout(10) << __func__ << " proxying mgr command (+" << size
+	     << " -> " << mgr_proxy_bytes << ")" << dendl;
+    // C_MgrProxyCommand::finish() subtracts `size` again and sends the reply
+    C_MgrProxyCommand *fin = new C_MgrProxyCommand(this, op, size);
+    mgr_client.start_command(m->cmd,
+			     m->get_data(),
+			     &fin->outbl,
+			     &fin->outs,
+			     new C_OnFinisher(fin, &finisher));
+    return;
+  }
+
+  if ((module == "mds" || module == "fs")  &&
+      prefix != "fs authorize") {
+    mdsmon()->dispatch(op);
+    return;
+  }
+  if ((module == "osd" ||
+       prefix == "pg map" ||
+       prefix == "pg repeer") &&
+      prefix != "osd last-stat-seq") {
+    osdmon()->dispatch(op);
+    return;
+  }
+  if (module == "config") {
+    configmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "mon" &&
+      /* Let the Monitor class handle the following commands:
+       *  'mon scrub'
+       */
+      prefix != "mon scrub" &&
+      prefix != "mon metadata" &&
+      prefix != "mon versions" &&
+      prefix != "mon count-metadata" &&
+      prefix != "mon ok-to-stop" &&
+      prefix != "mon ok-to-add-offline" &&
+      prefix != "mon ok-to-rm") {
+    monmon()->dispatch(op);
+    return;
+  }
+  if (module == "health" && prefix != "health") {
+    healthmon()->dispatch(op);
+    return;
+  }
+  if (module == "auth" || prefix == "fs authorize") {
+    authmon()->dispatch(op);
+    return;
+  }
+  if (module == "log") {
+    logmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "config-key") {
+    kvmon()->dispatch(op);
+    return;
+  }
+
+  if (module == "mgr") {
+    mgrmon()->dispatch(op);
+    return;
+  }
+
+  if (prefix == "fsid") {
+    if (f) {
+      f->open_object_section("fsid");
+      f->dump_stream("fsid") << monmap->fsid;
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ds << monmap->fsid;
+      rdata.append(ds);
+    }
+    reply_command(op, 0, "", rdata, 0);
+    return;
+  }
+
+  if (prefix == "mon scrub") {
+    wait_for_paxos_write();
+    if (is_leader()) {
+      int r = scrub_start();
+      reply_command(op, r, "", rdata, 0);
+    } else if (is_peon()) {
+      forward_request_leader(op);
+    } else {
+      reply_command(op, -EAGAIN, "no quorum", rdata, 0);
+    }
+    return;
+  }
+
+  if (prefix == "time-sync-status") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("time_sync");
+    if (!timecheck_skews.empty()) {
+      f->open_object_section("time_skew_status");
+      for (auto& i : timecheck_skews) {
+	double skew = i.second;
+	double latency = timecheck_latencies[i.first];
+	string name = monmap->get_name(i.first);
+	ostringstream tcss;
+	health_status_t tcstatus = timecheck_status(tcss, skew, latency);
+	f->open_object_section(name.c_str());
+	f->dump_float("skew", skew);
+	f->dump_float("latency", latency);
+	f->dump_stream("health") << tcstatus;
+	if (tcstatus != HEALTH_OK) {
+	  f->dump_stream("details") << tcss.str();
+	}
+	f->close_section();
+      }
+      f->close_section();
+    }
+    f->open_object_section("timechecks");
+    f->dump_unsigned("epoch", get_epoch());
+    f->dump_int("round", timecheck_round);
+    f->dump_stream("round_status") << ((timecheck_round%2) ?
+				       "on-going" : "finished");
+    f->close_section();
+    f->close_section();
+    f->flush(rdata);
+    r = 0;
+    rs = "";
+  } else if (prefix == "status" ||
+	     prefix == "health" ||
+	     prefix == "df") {
+    string detail;
+    cmd_getval(cmdmap, "detail", detail);
+
+    if (prefix == "status") {
+      // get_cluster_status handles f == NULL
+      get_cluster_status(ds, f.get(), session);
+
+      if (f) {
+        f->flush(ds);
+        ds << '\n';
+      }
+      rdata.append(ds);
+    } else if (prefix == "health") {
+      string plain;
+      healthmon()->get_health_status(detail == "detail", f.get(), f ? nullptr : &plain);
+      if (f) {
+	f->flush(rdata);
+      } else {
+	rdata.append(plain);
+      }
+    } else if (prefix == "df") {
+      bool verbose = (detail == "detail");
+      if (f)
+        f->open_object_section("stats");
+
+      mgrstatmon()->dump_cluster_stats(&ds, f.get(), verbose);
+      if (!f) {
+	ds << "\n \n";
+      }
+      mgrstatmon()->dump_pool_stats(osdmon()->osdmap, &ds, f.get(), verbose);
+
+      if (f) {
+        f->close_section();
+        f->flush(ds);
+        ds << '\n';
+      }
+    } else {
+      ceph_abort_msg("We should never get here!");
+      return;
+    }
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "report") {
+
+    // this must be formatted, in its current form
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    f->open_object_section("report");
+    f->dump_stream("cluster_fingerprint") << fingerprint;
+    f->dump_string("version", ceph_version_to_str());
+    f->dump_string("commit", git_version_to_str());
+    f->dump_stream("timestamp") << ceph_clock_now();
+
+    vector<string> tagsvec;
+    cmd_getval(cmdmap, "tags", tagsvec);
+    string tagstr = str_join(tagsvec, " ");
+    // NOTE(review): this cuts the string at its last space.  If str_join()
+    // emits no trailing separator, that drops the final tag whenever more
+    // than one is given -- confirm against str_join().
+    if (!tagstr.empty())
+      tagstr = tagstr.substr(0, tagstr.find_last_of(' '));
+    f->dump_string("tag", tagstr);
+
+    healthmon()->get_health_status(true, f.get(), nullptr);
+
+    monmon()->dump_info(f.get());
+    osdmon()->dump_info(f.get());
+    mdsmon()->dump_info(f.get());
+    authmon()->dump_info(f.get());
+    mgrstatmon()->dump_info(f.get());
+
+    paxos->dump_info(f.get());
+
+    f->close_section();
+    f->flush(rdata);
+
+    ostringstream ss2;
+    ss2 << "report " << rdata.crc32c(CEPH_MON_PORT_LEGACY);
+    rs = ss2.str();
+    r = 0;
+  } else if (prefix == "osd last-stat-seq") {
+    int64_t osd = 0;
+    cmd_getval(cmdmap, "id", osd);
+    uint64_t seq = mgrstatmon()->get_last_osd_stat_seq(osd);
+    if (f) {
+      // NOTE(review): the formatted output is flushed into ds but ds is
+      // never appended to rdata on this path -- looks unintended; confirm.
+      f->dump_unsigned("seq", seq);
+      f->flush(ds);
+    } else {
+      ds << seq;
+      rdata.append(ds);
+    }
+    rs = "";
+    r = 0;
+  } else if (prefix == "node ls") {
+    string node_type("all");
+    cmd_getval(cmdmap, "type", node_type);
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    if (node_type == "all") {
+      f->open_object_section("nodes");
+      print_nodes(f.get(), ds);
+      osdmon()->print_nodes(f.get());
+      mdsmon()->print_nodes(f.get());
+      mgrmon()->print_nodes(f.get());
+      f->close_section();
+    } else if (node_type == "mon") {
+      print_nodes(f.get(), ds);
+    } else if (node_type == "osd") {
+      osdmon()->print_nodes(f.get());
+    } else if (node_type == "mds") {
+      mdsmon()->print_nodes(f.get());
+    } else if (node_type == "mgr") {
+      mgrmon()->print_nodes(f.get());
+    }
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "features") {
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
+      return;
+    }
+    if (!is_leader()) {
+      forward_request_leader(op);
+      return;
+    }
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    FeatureMap fm;
+    get_combined_feature_map(&fm);
+    f->dump_object("features", fm);
+    f->flush(rdata);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+
+    string name;
+    bool all = !cmd_getval(cmdmap, "id", name);
+    if (!all) {
+      // Dump a single mon's metadata
+      int mon = monmap->get_rank(name);
+      if (mon < 0) {
+        rs = "requested mon not found";
+        r = -ENOENT;
+        goto out;
+      }
+      f->open_object_section("mon_metadata");
+      r = get_mon_metadata(mon, f.get(), ds);
+      f->close_section();
+    } else {
+      // Dump all mons' metadata
+      r = 0;
+      f->open_array_section("mon_metadata");
+      for (unsigned int rank = 0; rank < monmap->size(); ++rank) {
+        std::ostringstream get_err;
+        f->open_object_section("mon");
+        f->dump_string("name", monmap->get_name(rank));
+        r = get_mon_metadata(rank, f.get(), get_err);
+        f->close_section();
+        if (r == -ENOENT || r == -EINVAL) {
+          dout(1) << get_err.str() << dendl;
+          // Drop error, list what metadata we do have
+          r = 0;
+        } else if (r != 0) {
+          derr << "Unexpected error from get_mon_metadata: "
+               << cpp_strerror(r) << dendl;
+          ds << get_err.str();
+          break;
+        }
+      }
+      f->close_section();
+    }
+
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+  } else if (prefix == "mon versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    count_metadata("ceph_version", f.get());
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon count-metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    string field;
+    cmd_getval(cmdmap, "property", field);
+    count_metadata(field, f.get());
+    f->flush(ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "quorum_status") {
+    // make sure our map is readable and up to date
+    if (!is_leader() && !is_peon()) {
+      dout(10) << " waiting for quorum" << dendl;
+      waitfor_quorum.push_back(new C_RetryMessage(this, op));
+      return;
+    }
+    _quorum_status(f.get(), ds);
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "mon ok-to-stop") {
+    vector<string> ids;
+    if (!cmd_getval(cmdmap, "ids", ids)) {
+      // NOTE(review): rs still holds the default "unrecognized command"
+      // text on this path -- confirm that is the intended message.
+      r = -EINVAL;
+      goto out;
+    }
+    // names of the quorum members that would remain after stopping `ids`
+    set<string> wouldbe;
+    for (auto rank : quorum) {
+      wouldbe.insert(monmap->get_name(rank));
+    }
+    for (auto& n : ids) {
+      if (monmap->contains(n)) {
+	wouldbe.erase(n);
+      }
+    }
+    if (wouldbe.size() < monmap->min_quorum_size()) {
+      r = -EBUSY;
+      rs = "not enough monitors would be available (" + stringify(wouldbe) +
+	") after stopping mons " + stringify(ids);
+      goto out;
+    }
+    r = 0;
+    rs = "quorum should be preserved (" + stringify(wouldbe) +
+      ") after stopping " + stringify(ids);
+  } else if (prefix == "mon ok-to-add-offline") {
+    if (quorum.size() < monmap->min_quorum_size(monmap->size() + 1)) {
+      rs = "adding a monitor may break quorum (until that monitor starts)";
+      r = -EBUSY;
+      goto out;
+    }
+    rs = "adding another mon that is not yet online will not break quorum";
+    r = 0;
+  } else if (prefix == "mon ok-to-rm") {
+    string id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      r = -EINVAL;
+      rs = "must specify a monitor id";
+      goto out;
+    }
+    if (!monmap->contains(id)) {
+      r = 0;
+      rs = "mon." + id + " does not exist";
+      goto out;
+    }
+    int rank = monmap->get_rank(id);
+    if (quorum.count(rank) &&
+	quorum.size() - 1 < monmap->min_quorum_size(monmap->size() - 1)) {
+      r = -EBUSY;
+      rs = "removing mon." + id + " would break quorum";
+      goto out;
+    }
+    r = 0;
+    rs = "safe to remove mon." + id;
+  } else if (prefix == "version") {
+    if (f) {
+      f->open_object_section("version");
+      f->dump_string("version", pretty_version_to_str());
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << pretty_version_to_str();
+    }
+    rdata.append(ds);
+    rs = "";
+    r = 0;
+  } else if (prefix == "versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    // version string -> daemon count, summed over every section below
+    map<string,int> overall;
+    f->open_object_section("version");
+    map<string,int> mon, mgr, osd, mds;
+
+    count_metadata("ceph_version", &mon);
+    f->open_object_section("mon");
+    for (auto& p : mon) {
+      f->dump_int(p.first.c_str(), p.second);
+      overall[p.first] += p.second;
+    }
+    f->close_section();
+
+    mgrmon()->count_metadata("ceph_version", &mgr);
+    f->open_object_section("mgr");
+    for (auto& p : mgr) {
+      f->dump_int(p.first.c_str(), p.second);
+      overall[p.first] += p.second;
+    }
+    f->close_section();
+
+    osdmon()->count_metadata("ceph_version", &osd);
+    f->open_object_section("osd");
+    for (auto& p : osd) {
+      f->dump_int(p.first.c_str(), p.second);
+      overall[p.first] += p.second;
+    }
+    f->close_section();
+
+    mdsmon()->count_metadata("ceph_version", &mds);
+    f->open_object_section("mds");
+    for (auto& p : mds) {
+      f->dump_int(p.first.c_str(), p.second);
+      overall[p.first] += p.second;
+    }
+    f->close_section();
+
+    for (auto& p : mgrstatmon()->get_service_map().services) {
+      auto &service = p.first;
+      if (ServiceMap::is_normal_ceph_entity(service)) {
+        continue;
+      }
+      f->open_object_section(service.c_str());
+      map<string,int> m;
+      p.second.count_metadata("ceph_version", &m);
+      for (auto& q : m) {
+	f->dump_int(q.first.c_str(), q.second);
+	overall[q.first] += q.second;
+      }
+      f->close_section();
+    }
+
+    f->open_object_section("overall");
+    for (auto& p : overall) {
+      f->dump_int(p.first.c_str(), p.second);
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(rdata);
+    rs = "";
+    r = 0;
+  }
+
+ out:
+  if (!m->get_source().is_mon())  // don't reply to mon->mon commands
+    reply_command(op, r, rs, rdata, 0);
+}
+
+/**
+ * Reply to a mon command with a status only; forwards to the payload
+ * overload with an empty bufferlist.
+ */
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version)
+{
+  bufferlist no_payload;
+  reply_command(op, rc, rs, no_payload, version);
+}
+
+/**
+ * Build an MMonCommandAck for the MMonCommand carried by @p op and send it.
+ *
+ * @param op       request being answered; must carry a MSG_MON_COMMAND
+ * @param rc       return code for the ack
+ * @param rs       status string for the ack
+ * @param rdata    payload set as the ack's data
+ * @param version  version passed to the ack
+ */
+void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs,
+                            bufferlist& rdata, version_t version)
+{
+  auto request = op->get_req<MMonCommand>();
+  ceph_assert(request->get_type() == MSG_MON_COMMAND);
+
+  // echo the command and the tid so the ack can be matched to the request
+  auto ack = new MMonCommandAck(request->cmd, rc, rs, version);
+  ack->set_tid(request->get_tid());
+  ack->set_data(rdata);
+  send_reply(op, ack);
+}
+
+/**
+ * Answer the MCommand carried by @p op with an MCommandReply holding @p rc
+ * and @p rs.  The reply is sent straight over the request's own connection.
+ */
+void Monitor::reply_tell_command(
+  MonOpRequestRef op, int rc, const string &rs)
+{
+  MCommand *request = static_cast<MCommand*>(op->get_req());
+  ceph_assert(request->get_type() == MSG_COMMAND);
+
+  auto answer = new MCommandReply(rc, rs);
+  answer->set_tid(request->get_tid());
+  request->get_connection()->send_message(answer);
+}
+
+
+// ------------------------
+// request/reply routing
+//
+// a client/mds/osd will connect to a random monitor.  we need to forward any
+// messages requiring state updates to the leader, and then route any replies
+// back via the correct monitor and back to them.  (the monitor will not
+// initiate any connections.)
+
+/**
+ * Forward the request in @p op to the current leader, recording it in
+ * routed_requests under a fresh tid.
+ *
+ * Nothing is forwarded (only a debug line is logged) when the request comes
+ * from a monitor at a different address than ours, when the session already
+ * has a proxy_con, or when the session is closed.
+ *
+ * NOTE(review): the session pointer is dereferenced without a null check;
+ * callers presumably guarantee a session -- confirm.
+ */
+void Monitor::forward_request_leader(MonOpRequestRef op)
+{
+  op->mark_event(__func__);
+
+  int mon = get_leader();
+  MonSession *session = op->get_session();
+  PaxosServiceMessage *req = op->get_req<PaxosServiceMessage>();
+
+  if (req->get_source().is_mon() && req->get_source_addrs() != messenger->get_myaddrs()) {
+    dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl;
+    return;
+  }
+  if (session->proxy_con) {
+    dout(10) << "forward_request won't double fwd request " << *req << dendl;
+    return;
+  }
+  if (session->closed) {
+    dout(10) << "forward_request no session for request " << *req << dendl;
+    return;
+  }
+
+  // Record everything stored for this request under a new tid.
+  RoutedRequest *rr = new RoutedRequest;
+  rr->tid = ++routed_request_tid;
+  rr->con = req->get_connection();
+  rr->con_features = rr->con->get_features();
+  encode_message(req, CEPH_FEATURES_ALL, rr->request_bl);   // for my use only; use all features
+  rr->session = static_cast<MonSession *>(session->get());
+  rr->op = op;
+  routed_requests[rr->tid] = rr;
+  session->routed_request_tids.insert(rr->tid);
+
+  dout(10) << "forward_request " << rr->tid << " request " << *req
+           << " features " << rr->con_features << dendl;
+
+  MForward *forward = new MForward(rr->tid,
+                                   req,
+                                   rr->con_features,
+                                   rr->session->caps);
+  forward->set_priority(req->get_priority());
+  if (session->auth_handler) {
+    forward->entity_name = session->entity_name;
+  } else if (req->get_source().is_mon()) {
+    forward->entity_name.set_type(CEPH_ENTITY_TYPE_MON);
+  }
+  send_mon_message(forward, mon);
+  op->mark_forwarded();
+  ceph_assert(op->get_req()->get_type() != 0);
+}
+
+// fake connection attached to forwarded messages; it refuses to send anything itself
+struct AnonConnection : public Connection {
+  entity_addr_t socket_addr;  // value handed back by get_peer_socket_addr()
+
+  int send_message(Message *m) override {
+    ceph_assert(!"send_message on anonymous connection");  // the asserted expression is always false
+  }
+  void send_keepalive() override {
+    ceph_assert(!"send_keepalive on anonymous connection");  // the asserted expression is always false
+  }
+  void mark_down() override {
+    // silently ignore
+  }
+  void mark_disposable() override {
+    // silently ignore
+  }
+  bool is_connected() override { return false; }  // never reports as connected
+  entity_addr_t get_peer_socket_addr() const override {
+    return socket_addr;
+  }
+
+private:
+  FRIEND_MAKE_REF(AnonConnection);  // presumably lets ceph::make_ref reach the private constructor -- TODO confirm
+  explicit AnonConnection(CephContext *cct, const entity_addr_t& sa)
+    : Connection(cct, nullptr),  // second argument is nullptr; handle_forward() relies on "c->msgr == NULL"
+      socket_addr(sa) {}
+};
+
+// extract the original message from an MForward and feed it to the regular dispatch path (_ms_dispatch)
+void Monitor::handle_forward(MonOpRequestRef op)
+{
+  auto m = op->get_req<MForward>();
+  dout(10) << "received forwarded message from "
+	   << ceph_entity_type_name(m->client_type)
+	   << " " << m->client_addrs
+	   << " via " << m->get_source_inst() << dendl;
+  MonSession *session = op->get_session();
+  ceph_assert(session);
+
+  if (!session->is_capable("mon", MON_CAP_X)) {  // the forwarding peer needs MON_CAP_X on "mon"; otherwise the message is only logged
+    dout(0) << "forward from entity with insufficient caps! " 
+	    << session->caps << dendl;
+  } else {
+    // see PaxosService::dispatch(); we rely on this being anon
+    // (c->msgr == NULL)
+    PaxosServiceMessage *req = m->claim_message();
+    ceph_assert(req != NULL);
+
+    auto c = ceph::make_ref<AnonConnection>(cct, m->client_socket_addr);
+    MonSession *s = new MonSession(static_cast<Connection*>(c.get()));  // temporary session standing in for the original client
+    s->_ident(req->get_source(),
+	      req->get_source_addrs());
+    c->set_priv(RefCountedPtr{s, false});
+    c->set_peer_addrs(m->client_addrs);
+    c->set_peer_type(m->client_type);
+    c->set_features(m->con_features);
+
+    s->authenticated = true;  // identity and caps are taken from the MForward as-is
+    s->caps = m->client_caps;
+    dout(10) << " caps are " << s->caps << dendl;
+    s->entity_name = m->entity_name;
+    dout(10) << " entity name '" << s->entity_name << "' type "
+             << s->entity_name.get_type() << dendl;
+    s->proxy_con = m->get_connection();  // send_reply()/no_reply() answer over this connection
+    s->proxy_tid = m->tid;  // tid placed in the MRoute that carries the reply
+
+    req->set_connection(c);
+
+    // not super accurate, but better than nothing.
+    req->set_recv_stamp(m->get_recv_stamp());
+
+    /*
+     * note which election epoch this is; we will drop the message if
+     * there is a future election since our peers will resend routed
+     * requests in that case.
+     */
+    req->rx_election_epoch = get_epoch();
+
+    dout(10) << " mesg " << req << " from " << m->get_source_addr() << dendl;
+    _ms_dispatch(req);
+
+    // break the session <-> con ref loop by removing the con->session
+    // reference, which is no longer needed once the MonOpRequest is
+    // set up.
+    c->set_priv(NULL);
+  }
+}
+
+void Monitor::send_reply(MonOpRequestRef op, Message *reply)  // deliver reply for op: directly, or wrapped in an MRoute via the proxying monitor
+{
+  op->mark_event(__func__);
+
+  MonSession *session = op->get_session();
+  ceph_assert(session);
+  Message *req = op->get_req();
+  ConnectionRef con = op->get_connection();
+
+  reply->set_cct(g_ceph_context);
+  dout(2) << __func__ << " " << op << " " << reply << " " << *reply << dendl;
+
+  if (!con) {  // the op itself has no connection
+    dout(2) << "send_reply no connection, dropping reply " << *reply
+	    << " to " << req << " " << *req << dendl;
+    reply->put();  // the reply is never sent, so release it here
+    op->mark_event("reply: no connection");
+    return;
+  }
+
+  if (!session->con && !session->proxy_con) {  // the session has neither a direct nor a proxy connection
+    dout(2) << "send_reply no connection, dropping reply " << *reply
+	    << " to " << req << " " << *req << dendl;
+    reply->put();  // the reply is never sent, so release it here
+    op->mark_event("reply: no connection");
+    return;
+  }
+
+  if (session->proxy_con) {  // request was forwarded to us: route the reply back through the forwarding monitor
+    dout(15) << "send_reply routing reply to " << con->get_peer_addr()
+	     << " via " << session->proxy_con->get_peer_addr()
+	     << " for request " << *req << dendl;
+    session->proxy_con->send_message(new MRoute(session->proxy_tid, reply));
+    op->mark_event("reply: send routed request");
+  } else {
+    session->con->send_message(reply);
+    op->mark_event("reply: send");
+  }
+}
+
+void Monitor::no_reply(MonOpRequestRef op)  // record that op gets no reply; for proxied ops, tell the forwarding monitor via an empty MRoute
+{
+  MonSession *session = op->get_session();  // NOTE(review): dereferenced without a null check -- confirm callers guarantee a session
+  Message *req = op->get_req();
+
+  if (session->proxy_con) {
+    dout(10) << "no_reply to " << req->get_source_inst()
+	     << " via " << session->proxy_con->get_peer_addr()
+	     << " for request " << *req << dendl;
+    session->proxy_con->send_message(new MRoute(session->proxy_tid, NULL));  // MRoute with a null message; handle_route() then only drops the routed request
+    op->mark_event("no_reply: send routed request");
+  } else {
+    dout(10) << "no_reply to " << req->get_source_inst()
+             << " " << *req << dendl;
+    op->mark_event("no_reply");  // nothing is sent for direct sessions
+  }
+}
+
+void Monitor::handle_route(MonOpRequestRef op)  // relay the reply carried by an MRoute to the connection that issued the routed request
+{
+  auto m = op->get_req<MRoute>();
+  MonSession *session = op->get_session();
+  //check privileges
+  if (!session->is_capable("mon", MON_CAP_X)) {
+    dout(0) << "MRoute received from entity without appropriate perms! "
+	    << dendl;
+    return;
+  }
+  if (m->msg)
+    dout(10) << "handle_route tid " << m->session_mon_tid << " " << *m->msg
+	     << dendl;
+  else
+    dout(10) << "handle_route tid " << m->session_mon_tid << " null" << dendl;
+  // a zero tid means the MRoute does not refer to a routed request
+  // look it up
+  if (!m->session_mon_tid) {
+    dout(10) << " not a routed request, ignoring" << dendl;
+    return;
+  }
+  auto found = routed_requests.find(m->session_mon_tid);
+  if (found == routed_requests.end()) {
+    dout(10) << " don't have routed request tid " << m->session_mon_tid << dendl;
+    return;
+  }
+  std::unique_ptr<RoutedRequest> rr{found->second};  // take ownership: the RoutedRequest is deleted when this function returns
+  // reset payload, in case encoding is dependent on target features
+  if (m->msg) {
+    m->msg->clear_payload();
+    rr->con->send_message(m->msg);
+    m->msg = NULL;  // cleared after handing the message to send_message()
+  }
+  if (m->send_osdmap_first) {
+    dout(10) << " sending osdmaps from " << m->send_osdmap_first << dendl;
+    osdmon()->send_incremental(m->send_osdmap_first, rr->session,
+			       true, MonOpRequestRef());
+  }
+  ceph_assert(rr->tid == m->session_mon_tid && rr->session->routed_request_tids.count(m->session_mon_tid));
+  routed_requests.erase(found);  // unregister from both the global map and the session's tid set
+  rr->session->routed_request_tids.erase(m->session_mon_tid);
+}
+
+void Monitor::resend_routed_requests()  // re-forward every pending routed request to the leader, or requeue them locally if we are the leader
+{
+  dout(10) << "resend_routed_requests" << dendl;
+  int mon = get_leader();
+  list<Context*> retry;  // only filled when we are the leader
+  for (map<uint64_t, RoutedRequest*>::iterator p = routed_requests.begin();
+       p != routed_requests.end();
+       ++p) {
+    RoutedRequest *rr = p->second;
+
+    if (mon == rank) {  // we are the leader: retry the op here instead of forwarding
+      dout(10) << " requeue for self tid " << rr->tid << dendl;
+      rr->op->mark_event("retry routed request");
+      retry.push_back(new C_RetryMessage(this, rr->op));
+      if (rr->session) {
+        ceph_assert(rr->session->routed_request_tids.count(p->first));
+        rr->session->routed_request_tids.erase(p->first);
+      }
+      delete rr;  // the map entry itself is removed by routed_requests.clear() after the loop
+    } else {
+      auto q = rr->request_bl.cbegin();  // request_bl holds the request as encoded in forward_request_leader()
+      PaxosServiceMessage *req =
+	(PaxosServiceMessage *)decode_message(cct, 0, q);
+      rr->op->mark_event("resend forwarded message to leader");
+      dout(10) << " resend to mon." << mon << " tid " << rr->tid << " " << *req
+	       << dendl;
+      MForward *forward = new MForward(rr->tid,
+				       req,
+				       rr->con_features,
+				       rr->session->caps);
+      req->put();  // forward takes its own ref; drop ours.
+      forward->client_type = rr->con->get_peer_type();  // client identity comes from the saved connection; presumably the re-decoded req has none -- TODO confirm
+      forward->client_addrs = rr->con->get_peer_addrs();
+      forward->client_socket_addr = rr->con->get_peer_socket_addr();
+      forward->set_priority(req->get_priority());
+      send_mon_message(forward, mon);
+    }
+  }
+  if (mon == rank) {
+    routed_requests.clear();  // every entry was deleted in the loop above
+    finish_contexts(g_ceph_context, retry);  // completes the queued C_RetryMessage contexts
+  }
+}
+
+void Monitor::remove_session(MonSession *s)  // drop s's routed requests, detach it from its connection and unregister it
+{
+  dout(10) << "remove_session " << s << " " << s->name << " " << s->addrs
+	   << " features 0x" << std::hex << s->con_features << std::dec << dendl;
+  ceph_assert(s->con);
+  ceph_assert(!s->closed);
+  // every tid recorded on the session must still be present in routed_requests
+  for (const uint64_t tid : s->routed_request_tids) {
+    auto pending = routed_requests.find(tid);
+    ceph_assert(pending != routed_requests.end());
+    RoutedRequest *dropped = pending->second;
+    dout(10) << " dropping routed request " << dropped->tid << dendl;
+    delete dropped;
+    routed_requests.erase(pending);
+  }
+  s->routed_request_tids.clear();
+  s->con->set_priv(nullptr);  // clear the connection's priv pointer that referred to this session
+  session_map.remove_session(s);
+  logger->set(l_mon_num_sessions, session_map.get_size());
+  logger->inc(l_mon_session_rm);  // one removal counted per call
+}
+
+void Monitor::remove_all_sessions()  // remove every session in session_map while holding session_map_lock
+{
+  std::lock_guard l(session_map_lock);
+  while (!session_map.sessions.empty()) {
+    MonSession *s = session_map.sessions.front();
+    remove_session(s);
+    // remove_session() already bumps l_mon_session_rm; incrementing here too would count each removal twice
+  }
+  if (logger)
+    logger->set(l_mon_num_sessions, session_map.get_size());
+}
+
+void Monitor::send_mon_message(Message *m, int rank)  // send m to the monitor with the given rank (this parameter shadows the member `rank`)
+{
+  messenger->send_to_mon(m, monmap->get_addrs(rank));  // addresses are looked up in the current monmap
+}
+
+void Monitor::waitlist_or_zap_client(MonOpRequestRef op)  // either queue op until quorum, or mark its connection down and drop the session
+{
+  /**
+   * Wait list the new session until we're in the quorum, assuming it's
+   * sufficiently new.
+   * tick() will periodically send them back through so we can send
+   * the client elsewhere if we don't think we're getting back in.
+   *
+   * But we allow a few sorts of messages:
+   * 1) Monitors can talk to us at any time, of course.
+   * 2) auth messages. It's unlikely to go through much faster, but
+   * it's possible we've just lost our quorum status and we want to take...
+   * 3) command messages. We want to accept these under all possible
+   * circumstances.
+   */
+  Message *m = op->get_req();
+  MonSession *s = op->get_session();
+  ConnectionRef con = op->get_connection();
+  utime_t too_old = ceph_clock_now();
+  too_old -= g_ceph_context->_conf->mon_lease;  // cutoff: messages received more than mon_lease ago are discarded
+  if (m->get_recv_stamp() > too_old &&
+      con->is_connected()) {
+    dout(5) << "waitlisting message " << *m << dendl;
+    maybe_wait_for_quorum.push_back(new C_RetryMessage(this, op));  // retried later through C_RetryMessage
+    op->mark_wait_for_quorum();
+  } else {
+    dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl;
+    con->mark_down();
+    // proxied sessions aren't registered and don't have a con; don't remove
+    // those.
+    if (!s->proxy_con) {
+      std::lock_guard l(session_map_lock);  // remove_session() is called with session_map_lock held
+      remove_session(s);
+    }
+    op->mark_zap();
+  }
+}
+
+void Monitor::_ms_dispatch(Message *m)  // wrap m in a MonOpRequest, find or create its session, then waitlist, zap or dispatch it
+{
+  if (is_shutdown()) {
+    m->put();  // shutting down: release the message without processing it
+    return;
+  }
+
+  MonOpRequestRef op = op_tracker.create_request<MonOpRequest>(m);
+  bool src_is_mon = op->is_src_mon();
+  op->mark_event("mon:_ms_dispatch");
+  MonSession *s = op->get_session();
+  if (s && s->closed) {  // messages on an already-closed session are dropped silently
+    return;
+  }
+
+  if (src_is_mon && s) {
+    ConnectionRef con = m->get_connection();
+    if (con->get_messenger() && con->get_features() != s->con_features) {
+      // only update features if this is a non-anonymous connection
+      dout(10) << __func__ << " feature change for " << m->get_source_inst()
+               << " (was " << s->con_features
+               << ", now " << con->get_features() << ")" << dendl;
+      // connection features changed - recreate session.
+      if (s->con && s->con != con) {
+        dout(10) << __func__ << " connection for " << m->get_source_inst()
+                 << " changed from session; mark down and replace" << dendl;
+        s->con->mark_down();
+      }
+      if (s->item.is_on_list()) {
+        // forwarded messages' sessions are not in the sessions map and
+        // exist only while the op is being handled.
+        std::lock_guard l(session_map_lock);
+        remove_session(s);
+      }
+      s = nullptr;  // forces creation of a fresh session below
+    }
+  }
+
+  if (!s) {
+    // if the sender is not a monitor, make sure their first message for a
+    // session is an MAuth.  If it is not, assume it's a stray message,
+    // and considering that we are creating a new session it is safe to
+    // assume that the sender hasn't authenticated yet, so we have no way
+    // of assessing whether we should handle it or not.
+    if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+			m->get_type() != CEPH_MSG_MON_GET_MAP &&
+			m->get_type() != CEPH_MSG_PING)) {  // besides MAuth, MON_GET_MAP and PING may also open a session
+      dout(1) << __func__ << " dropping stray message " << *m
+	      << " from " << m->get_source_inst() << dendl;
+      return;
+    }
+
+    ConnectionRef con = m->get_connection();
+    {
+      std::lock_guard l(session_map_lock);  // lock held only for the session_map insertion
+      s = session_map.new_session(m->get_source(),
+				  m->get_source_addrs(),
+				  con.get());
+    }
+    ceph_assert(s);
+    con->set_priv(RefCountedPtr{s, false});
+    dout(10) << __func__ << " new session " << s << " " << *s
+	     << " features 0x" << std::hex
+	     << s->con_features << std::dec << dendl;
+    op->set_session(s);
+
+    logger->set(l_mon_num_sessions, session_map.get_size());
+    logger->inc(l_mon_session_add);
+
+    if (src_is_mon) {
+      // give it monitor caps; the peer type has been authenticated
+      dout(5) << __func__ << " setting monitor caps on this connection" << dendl;
+      if (!s->caps.is_allow_all()) // but no need to repeatedly copy
+        s->caps = mon_caps;
+      s->authenticated = true;
+    }
+  } else {
+    dout(20) << __func__ << " existing session " << s << " for " << s->name
+	     << dendl;
+  }
+
+  ceph_assert(s);
+
+  s->session_timeout = ceph_clock_now();  // session_timeout = now + mon_session_timeout, refreshed on every message
+  s->session_timeout += g_conf()->mon_session_timeout;
+
+  if (s->auth_handler) {  // copy identity details from the auth handler onto the session
+    s->entity_name = s->auth_handler->get_entity_name();
+    s->global_id = s->auth_handler->get_global_id();
+    s->global_id_status = s->auth_handler->get_global_id_status();
+  }
+  dout(20) << " entity_name " << s->entity_name
+	   << " global_id " << s->global_id
+	   << " (" << s->global_id_status
+	   << ") caps " << s->caps.get_str() << dendl;
+
+  if (!session_stretch_allowed(s, op)) {
+    return;
+  }
+  if ((is_synchronizing() ||
+       (!s->authenticated && !exited_quorum.is_zero())) &&
+      !src_is_mon &&
+      m->get_type() != CEPH_MSG_PING) {  // monitors and pings are never waitlisted or zapped here
+    waitlist_or_zap_client(op);
+  } else {
+    dispatch_op(op);
+  }
+  return;
+}
+
+void Monitor::dispatch_op(MonOpRequestRef op)  // route op to its handler by message type, applying authentication and caps checks in stages
+{
+  op->mark_event("mon:dispatch_op");
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+  if (s->closed) {
+    dout(10) << " session closed, dropping " << op->get_req() << dendl;
+    return;
+  }
+
+  /* we will consider the default type as being 'monitor' until proven wrong */
+  op->set_type_monitor();
+  /* deal with all messages that do not necessarily need caps */
+  switch (op->get_req()->get_type()) {
+    // auth
+    case MSG_MON_GLOBAL_ID:
+    case CEPH_MSG_AUTH:
+      op->set_type_service();
+      /* no need to check caps here */
+      paxos_service[PAXOS_AUTH]->dispatch(op);
+      return;
+
+    case CEPH_MSG_PING:
+      handle_ping(op);
+      return;
+    case MSG_COMMAND:
+      op->set_type_command();
+      handle_tell_command(op);
+      return;
+  }
+
+  if (!op->get_session()->authenticated) {  // everything below requires an authenticated session
+    dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+            << " is not authenticated, dropping " << *(op->get_req())
+            << dendl;
+    return;
+  }
+
+  // global_id_status == NONE: all sessions for auth_none and krb,
+  // mon <-> mon sessions (including proxied sessions) for cephx
+  ceph_assert(s->global_id_status == global_id_status_t::NONE ||
+              s->global_id_status == global_id_status_t::NEW_OK ||
+              s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED ||
+              s->global_id_status == global_id_status_t::RECLAIM_OK ||
+              s->global_id_status == global_id_status_t::RECLAIM_INSECURE);
+
+  // let mon_getmap through for "ping" (which doesn't reconnect)
+  // and "tell" (which reconnects but doesn't attempt to preserve
+  // its global_id and stays in NEW_NOT_EXPOSED, retrying until
+  // ->send_attempts reaches 0)
+  if (cct->_conf->auth_expose_insecure_global_id_reclaim &&
+      s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED &&
+      op->get_req()->get_type() != CEPH_MSG_MON_GET_MAP) {
+    dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+            << " may omit old_ticket on reconnects, discarding "
+            << *op->get_req() << " and forcing reconnect" << dendl;
+    ceph_assert(s->con && !s->proxy_con);
+    s->con->mark_down();
+    {
+      std::lock_guard l(session_map_lock);  // remove_session() is called with session_map_lock held
+      remove_session(s);
+    }
+    op->mark_zap();
+    return;
+  }
+
+  switch (op->get_req()->get_type()) {
+    case CEPH_MSG_MON_GET_MAP:
+      handle_mon_get_map(op);
+      return;
+
+    case MSG_GET_CONFIG:
+      configmon()->handle_get_config(op);
+      return;
+
+    case CEPH_MSG_MON_SUBSCRIBE:
+      /* FIXME: check what's being subscribed, filter accordingly */
+      handle_subscribe(op);
+      return;
+  }
+
+  /* well, maybe the op belongs to a service... */
+  op->set_type_service();
+  /* deal with all messages which caps should be checked somewhere else */
+  switch (op->get_req()->get_type()) {
+
+    // OSDs
+    case CEPH_MSG_MON_GET_OSDMAP:
+    case CEPH_MSG_POOLOP:
+    case MSG_OSD_BEACON:
+    case MSG_OSD_MARK_ME_DOWN:
+    case MSG_OSD_MARK_ME_DEAD:
+    case MSG_OSD_FULL:
+    case MSG_OSD_FAILURE:
+    case MSG_OSD_BOOT:
+    case MSG_OSD_ALIVE:
+    case MSG_OSD_PGTEMP:
+    case MSG_OSD_PG_CREATED:
+    case MSG_REMOVE_SNAPS:
+    case MSG_MON_GET_PURGED_SNAPS:
+    case MSG_OSD_PG_READY_TO_MERGE:
+      paxos_service[PAXOS_OSDMAP]->dispatch(op);
+      return;
+
+    // MDSs
+    case MSG_MDS_BEACON:
+    case MSG_MDS_OFFLOAD_TARGETS:
+      paxos_service[PAXOS_MDSMAP]->dispatch(op);
+      return;
+
+    // Mgrs
+    case MSG_MGR_BEACON:
+      paxos_service[PAXOS_MGR]->dispatch(op);
+      return;
+
+    // MgrStat
+    case MSG_MON_MGR_REPORT:
+    case CEPH_MSG_STATFS:
+    case MSG_GETPOOLSTATS:
+      paxos_service[PAXOS_MGRSTAT]->dispatch(op);
+      return;
+
+      // log
+    case MSG_LOG:
+      paxos_service[PAXOS_LOG]->dispatch(op);
+      return;
+
+    // handle_command() does its own caps checking
+    case MSG_MON_COMMAND:
+      op->set_type_command();
+      handle_command(op);
+      return;
+  }
+
+  /* nop, looks like it's not a service message; revert back to monitor */
+  op->set_type_monitor();
+
+  /* messages we, the Monitor class, need to deal with
+   * but may be sent by clients. */
+
+  if (!op->get_session()->is_capable("mon", MON_CAP_R)) {  // remaining message types need at least MON_CAP_R on "mon"
+    dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+            << " not enough caps for " << *(op->get_req()) << " -- dropping"
+            << dendl;
+    return;
+  }
+
+  switch (op->get_req()->get_type()) {
+    // misc
+    case CEPH_MSG_MON_GET_VERSION:
+      handle_get_version(op);
+      return;
+  }
+
+  if (!op->is_src_mon()) {  // everything past this point is mon <-> mon only
+    dout(1) << __func__ << " unexpected monitor message from"
+            << " non-monitor entity " << op->get_req()->get_source_inst()
+            << " " << *(op->get_req()) << " -- dropping" << dendl;
+    return;
+  }
+
+  /* messages that should only be sent by another monitor */
+  switch (op->get_req()->get_type()) {
+
+    case MSG_ROUTE:
+      handle_route(op);
+      return;
+
+    case MSG_MON_PROBE:
+      handle_probe(op);
+      return;
+
+    // Sync (i.e., the new slurp, but on steroids)
+    case MSG_MON_SYNC:
+      handle_sync(op);
+      return;
+    case MSG_MON_SCRUB:
+      handle_scrub(op);
+      return;
+
+    /* log acks are sent from a monitor we sent the MLog to, and are
+       never sent by clients to us. */
+    case MSG_LOGACK:
+      log_client.handle_log_ack((MLogAck*)op->get_req());
+      return;
+
+    // monmap
+    case MSG_MON_JOIN:
+      op->set_type_service();
+      paxos_service[PAXOS_MONMAP]->dispatch(op);
+      return;
+
+    // paxos
+    case MSG_MON_PAXOS:
+      {
+        op->set_type_paxos();
+        auto pm = op->get_req<MMonPaxos>();
+        if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
+          //can't send these!
+          return;
+        }
+
+        if (state == STATE_SYNCHRONIZING) {
+          // we are synchronizing. These messages would do us no
+          // good, thus just drop them and ignore them.
+          dout(10) << __func__ << " ignore paxos msg from "
+            << pm->get_source_inst() << dendl;
+          return;
+        }
+
+        // sanitize
+        if (pm->epoch > get_epoch()) {  // peer is on a newer election epoch than we are: bootstrap
+          bootstrap();
+          return;
+        }
+        if (pm->epoch != get_epoch()) {  // message from an older epoch: drop it
+          return;
+        }
+
+        paxos->dispatch(op);
+      }
+      return;
+
+    // elector messages
+    case MSG_MON_ELECTION:
+      op->set_type_election_or_ping();
+      //check privileges here for simplicity
+      if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
+        dout(0) << "MMonElection received from entity without enough caps! "
+          << op->get_session()->caps << dendl;
+        return;
+      }
+      if (!is_probing() && !is_synchronizing()) {  // election messages are ignored while probing or synchronizing
+        elector.dispatch(op);
+      }
+      return;
+
+    case MSG_MON_PING:
+      op->set_type_election_or_ping();
+      elector.dispatch(op);
+      return;
+
+    case MSG_FORWARD:
+      handle_forward(op);
+      return;
+
+    case MSG_TIMECHECK:
+      dout(5) << __func__ << " ignoring " << op << dendl;
+      return;
+    case MSG_TIMECHECK2:
+      handle_timecheck(op);
+      return;
+
+    case MSG_MON_HEALTH:
+      dout(5) << __func__ << " dropping deprecated message: "
+	      << *op->get_req() << dendl;
+      break;  // NOTE(review): break (not return) also reaches the "dropping unexpected" log below -- confirm the double log is intended
+    case MSG_MON_HEALTH_CHECKS:
+      op->set_type_service();
+      paxos_service[PAXOS_HEALTH]->dispatch(op);
+      return;
+  }
+  dout(1) << "dropping unexpected " << *(op->get_req()) << dendl;
+  return;
+}
+
+void Monitor::handle_ping(MonOpRequestRef op)  // answer an MPing with an MPing whose payload is a JSON "pong" object (health + mon status)
+{
+  auto m = op->get_req<MPing>();
+  dout(10) << __func__ << " " << *m << dendl;
+  MPing *reply = new MPing;
+  bufferlist payload;
+  boost::scoped_ptr<Formatter> f(new JSONFormatter(true));  // formatter is freed when f goes out of scope
+  f->open_object_section("pong");
+
+  healthmon()->get_health_status(false, f.get(), nullptr);
+  get_mon_status(f.get());
+
+  f->close_section();
+  stringstream ss;
+  f->flush(ss);
+  encode(ss.str(), payload);  // the JSON text is encoded as a string into the payload
+  reply->set_payload(payload);
+  dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl;
+  m->get_connection()->send_message(reply);  // sent straight on the request's connection, not through send_reply()
+}
+
+void Monitor::timecheck_start()  // reset all timecheck state and start a round if the quorum has FEATURE_NAUTILUS
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();
+  if (get_quorum_mon_features().contains_all(
+	ceph::features::mon::FEATURE_NAUTILUS)) {  // without the feature no round is started
+    timecheck_start_round();
+  }
+}
+
+void Monitor::timecheck_finish()  // stop timechecks: only resets state via timecheck_cleanup()
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();  // also cancels any scheduled timecheck_event
+}
+
+void Monitor::timecheck_start_round()  // leader only: begin a new timecheck round (or keep a recent one going) and schedule the next event
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  ceph_assert(is_leader());
+
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; this shouldn't have been scheduled!");
+    return;
+  }
+
+  if (timecheck_round % 2) {  // odd round number: a round is still in progress
+    dout(10) << __func__ << " there's a timecheck going on" << dendl;
+    utime_t curr_time = ceph_clock_now();
+    double max = g_conf()->mon_timecheck_interval*3;  // a round may run for up to three timecheck intervals
+    if (curr_time - timecheck_round_start < max) {
+      dout(10) << __func__ << " keep current round going" << dendl;
+      goto out;
+    } else {
+      dout(10) << __func__
+               << " finish current timecheck and start new" << dendl;
+      timecheck_cancel_round();  // bumps timecheck_round back to an even value
+    }
+  }
+
+  ceph_assert(timecheck_round % 2 == 0);
+  timecheck_acks = 0;
+  timecheck_round ++;  // now odd: round in progress
+  timecheck_round_start = ceph_clock_now();
+  dout(10) << __func__ << " new " << timecheck_round << dendl;
+
+  timecheck();  // send the pings for this round
+out:
+  dout(10) << __func__ << " setting up next event" << dendl;
+  timecheck_reset_event();
+}
+
+void Monitor::timecheck_finish_round(bool success)  // close the in-progress round; on success report and check skews, otherwise discard waiting peers
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  ceph_assert(timecheck_round % 2);  // must be called with a round in progress (odd number)
+  timecheck_round ++;  // back to even: no round in progress
+  timecheck_round_start = utime_t();
+
+  if (success) {
+    ceph_assert(timecheck_waiting.empty());
+    ceph_assert(timecheck_acks == quorum.size());  // every quorum member has acked
+    timecheck_report();
+    timecheck_check_skews();
+    return;
+  }
+
+  dout(10) << __func__ << " " << timecheck_waiting.size()
+           << " peers still waiting:";  // no dendl here: the log entry is continued via *_dout below
+  for (auto& p : timecheck_waiting) {
+    *_dout << " mon." << p.first;
+  }
+  *_dout << dendl;
+  timecheck_waiting.clear();
+
+  dout(10) << __func__ << " finished to " << timecheck_round << dendl;
+}
+
+void Monitor::timecheck_cancel_round()  // abandon the in-progress round without reporting
+{
+  timecheck_finish_round(false);  // success=false: skips the report and skew check, clears timecheck_waiting
+}
+
+void Monitor::timecheck_cleanup()  // reset every piece of timecheck state and cancel the pending timer event
+{
+  timecheck_round = 0;
+  timecheck_acks = 0;
+  timecheck_round_start = utime_t();
+
+  if (timecheck_event) {
+    timer.cancel_event(timecheck_event);
+    timecheck_event = NULL;  // cleared after cancelling so it is not cancelled again
+  }
+  timecheck_waiting.clear();
+  timecheck_skews.clear();
+  timecheck_latencies.clear();
+
+  timecheck_rounds_since_clean = 0;
+}
+
+void Monitor::timecheck_reset_event()  // (re)schedule the timer event that starts the next timecheck round
+{
+  if (timecheck_event) {
+    timer.cancel_event(timecheck_event);  // replace any previously scheduled event
+    timecheck_event = NULL;
+  }
+
+  double delay =
+    cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean;  // delay grows with the number of consecutive skewed rounds
+
+  if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) {  // no skewed rounds, or over the cap: use the regular interval
+    delay = cct->_conf->mon_timecheck_interval;
+  }
+
+  dout(10) << __func__ << " delay " << delay
+           << " rounds_since_clean " << timecheck_rounds_since_clean
+           << dendl;
+
+  timecheck_event = timer.add_event_after(
+    delay,
+    new C_MonContext{this, [this](int) {
+	timecheck_start_round();
+      }});
+}
+
+void Monitor::timecheck_check_skews()  // leader only, after a finished round: adjust the timecheck schedule depending on whether any skew was seen
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  ceph_assert((timecheck_round % 2) == 0);  // even round number: no round in progress
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+  ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+
+  bool found_skew = false;
+  for (auto& p : timecheck_skews) {
+    double abs_skew;
+    if (timecheck_has_skew(p.second, &abs_skew)) {
+      dout(10) << __func__
+               << " " << p.first << " skew " << abs_skew << dendl;
+      found_skew = true;
+    }
+  }
+
+  if (found_skew) {
+    ++timecheck_rounds_since_clean;  // timecheck_reset_event() derives its delay from this counter
+    timecheck_reset_event();
+  } else if (timecheck_rounds_since_clean > 0) {
+    dout(1) << __func__
+      << " no clock skews found after " << timecheck_rounds_since_clean
+      << " rounds" << dendl;
+    // make sure the skews are really gone and not just a transient success
+    // this will run just once if not in the presence of skews again.
+    timecheck_rounds_since_clean = 1;  // temporarily 1 so the next event is scheduled using the skew interval
+    timecheck_reset_event();
+    timecheck_rounds_since_clean = 0;  // then mark the state as clean
+  }
+
+}
+
+void Monitor::timecheck_report()  // leader only: send the collected skews and latencies to every other quorum member
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  ceph_assert((timecheck_round % 2) == 0);
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+
+  ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+  bool do_output = true; // only output report once
+  for (const int peer : quorum) {
+    if (monmap->get_name(peer) == name)
+      continue;
+
+    MTimeCheck2 *report = new MTimeCheck2(MTimeCheck2::OP_REPORT);
+    report->epoch = get_epoch();
+    report->round = timecheck_round;
+
+    for (auto& entry : timecheck_skews) {
+      double skew = entry.second;
+      double latency = timecheck_latencies[entry.first];
+
+      report->skews[entry.first] = skew;
+      report->latencies[entry.first] = latency;
+
+      if (do_output) {
+        dout(25) << __func__ << " mon." << entry.first
+                 << " latency " << latency
+                 << " skew " << skew << dendl;
+      }
+    }
+    do_output = false;
+    dout(10) << __func__ << " send report to mon." << peer << dendl;
+    send_mon_message(report, peer);
+  }
+}
+
+void Monitor::timecheck()  // leader only: send an OP_PING to every other quorum member and record when it was sent
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+  ceph_assert(timecheck_round % 2 != 0);
+
+  timecheck_acks = 1; // we ack ourselves
+
+  dout(10) << __func__ << " start timecheck epoch " << get_epoch()
+           << " round " << timecheck_round << dendl;
+
+  // we are at the eye of the storm; the point of reference
+  timecheck_skews[rank] = 0.0;
+  timecheck_latencies[rank] = 0.0;
+
+  for (const int peer : quorum) {
+    if (monmap->get_name(peer) == name)
+      continue;
+
+    utime_t sent_at = ceph_clock_now();
+    timecheck_waiting[peer] = sent_at;
+    MTimeCheck2 *ping = new MTimeCheck2(MTimeCheck2::OP_PING);
+    ping->epoch = get_epoch();
+    ping->round = timecheck_round;
+    dout(10) << __func__ << " send " << *ping << " to mon." << peer << dendl;
+    send_mon_message(ping, peer);
+  }
+}
+
+health_status_t Monitor::timecheck_status(ostringstream &ss,
+                                          const double skew_bound,
+                                          const double latency)  // returns HEALTH_WARN and writes a message to ss if skew_bound counts as a skew, else HEALTH_OK
+{
+  health_status_t status = HEALTH_OK;
+  ceph_assert(latency >= 0);  // latency is only sanity-checked; it does not affect the result
+
+  double abs_skew;
+  if (timecheck_has_skew(skew_bound, &abs_skew)) {
+    status = HEALTH_WARN;
+    ss << "clock skew " << abs_skew << "s"
+       << " > max " << g_conf()->mon_clock_drift_allowed << "s";  // the message reports mon_clock_drift_allowed as the limit
+  }
+
+  return status;
+}
+
+/**
+ * Leader-side handler for a timecheck PONG.
+ *
+ * Messages from an older epoch or an older round are discarded.  If the
+ * local clock reads earlier than the moment the ping was sent, the whole
+ * round is cancelled.  Otherwise the peer's latency and bounded skew are
+ * recorded, and once every quorum member has answered the round is
+ * finished.
+ *
+ * @param op  request wrapping an MTimeCheck2; its op must be OP_PONG
+ */
+void Monitor::handle_timecheck_leader(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+  /* handles PONG's */
+  ceph_assert(m->op == MTimeCheck2::OP_PONG);
+
+  const int peer = m->get_source().num();
+  if (m->epoch < get_epoch()) {
+    dout(1) << __func__ << " got old timecheck epoch " << m->epoch
+            << " from " << peer
+            << " curr " << get_epoch()
+            << " -- severely lagged? discard" << dendl;
+    return;
+  }
+  ceph_assert(m->epoch == get_epoch());
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " from " << peer
+            << " curr " << timecheck_round << " -- discard" << dendl;
+    return;
+  }
+
+  const utime_t received_at = ceph_clock_now();
+
+  // there must be an outstanding ping for this peer; consume it
+  auto pending = timecheck_waiting.find(peer);
+  ceph_assert(pending != timecheck_waiting.end());
+  const utime_t sent_at = pending->second;
+  timecheck_waiting.erase(pending);
+
+  if (received_at < sent_at) {
+    // our clock was readjusted -- drop everything until it all makes sense.
+    dout(1) << __func__ << " our clock was readjusted --"
+            << " bump round and drop current check"
+            << dendl;
+    timecheck_cancel_round();
+    return;
+  }
+
+  /* update peer latencies */
+  const double latency = static_cast<double>(received_at - sent_at);
+
+  auto known = timecheck_latencies.find(peer);
+  if (known == timecheck_latencies.end()) {
+    // first sample for this peer
+    timecheck_latencies[peer] = latency;
+  } else {
+    // weighted average: 80% history, 20% new sample
+    known->second = ((known->second*0.8)+(latency*0.2));
+  }
+
+  /*
+   * update skews
+   *
+   * some nasty thing goes on if we were to do 'a - b' between two utime_t,
+   * and 'a' happens to be lower than 'b'; so we use double instead.
+   *
+   * latency is always expected to be >= 0.
+   *
+   * delta, the difference between their timestamp and ours, may either be
+   * lower or higher than 0; will hardly ever be 0.
+   *
+   * The absolute skew is the absolute delta minus the latency, which is
+   * taken as a whole instead of an rtt given that there is some queueing
+   * and dispatch times involved and it's hard to assess how long exactly
+   * it took for the message to travel to the other side and be handled. So
+   * we call it a bounded skew, the worst case scenario.
+   *
+   * Now, to math!
+   *
+   * Given that the latency is always positive, we can establish that the
+   * bounded skew will be:
+   *
+   *  1. positive if the absolute delta is higher than the latency and
+   *     delta is positive
+   *  2. negative if the absolute delta is higher than the latency and
+   *     delta is negative.
+   *  3. zero if the absolute delta is lower than the latency.
+   *
+   * On 3. we make a judgement call and treat the skew as non-existent.
+   * This is because, if the absolute delta is lower than the latency,
+   * then the apparently existing skew is nothing more than a side-effect
+   * of the high latency at work.
+   *
+   * This may not be entirely true though, as a severely skewed clock
+   * may be masked by an even higher latency, but with high latencies
+   * we probably have worse issues to deal with than just skewed clocks.
+   */
+  ceph_assert(latency >= 0);
+
+  const double delta =
+    static_cast<double>(m->timestamp) - static_cast<double>(received_at);
+  const double abs_delta = (delta > 0 ? delta : -delta);
+  double skew_bound = abs_delta - latency;
+  if (skew_bound < 0) {
+    skew_bound = 0;
+  } else if (delta < 0) {
+    skew_bound = -skew_bound;
+  }
+
+  ostringstream ss;
+  const health_status_t status = timecheck_status(ss, skew_bound, latency);
+  if (status != HEALTH_OK) {
+    clog->health(status) << peer << " " << ss.str();
+  }
+
+  dout(10) << __func__ << " from " << peer << " ts " << m->timestamp
+           << " delta " << delta << " skew_bound " << skew_bound
+           << " latency " << latency << dendl;
+
+  timecheck_skews[peer] = skew_bound;
+
+  ++timecheck_acks;
+  if (timecheck_acks == quorum.size()) {
+    dout(10) << __func__ << " got pongs from everybody ("
+             << timecheck_acks << " total)" << dendl;
+    ceph_assert(timecheck_skews.size() == timecheck_acks);
+    ceph_assert(timecheck_waiting.empty());
+    // everyone has acked, so bump the round to finish it.
+    timecheck_finish_round();
+  }
+}
+
+/**
+ * Peon-side handler for timecheck messages from the leader.
+ *
+ * Messages whose epoch differs from ours, or whose round is older than
+ * the one we already know, are discarded.  An OP_REPORT replaces our
+ * latency and skew maps with the ones carried by the message (by
+ * swapping).  An OP_PING is answered with an OP_PONG stamped with our
+ * current clock.
+ *
+ * @param op  request wrapping an MTimeCheck2 (OP_PING or OP_REPORT)
+ */
+void Monitor::handle_timecheck_peon(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  ceph_assert(is_peon());
+  ceph_assert(m->op == MTimeCheck2::OP_PING || m->op == MTimeCheck2::OP_REPORT);
+
+  if (m->epoch != get_epoch()) {
+    dout(1) << __func__ << " got wrong epoch "
+            << "(ours " << get_epoch()
+            << " theirs: " << m->epoch << ") -- discarding" << dendl;
+    return;
+  }
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " current " << timecheck_round
+            << " (epoch " << get_epoch() << ") -- discarding" << dendl;
+    return;
+  }
+
+  timecheck_round = m->round;
+
+  if (m->op == MTimeCheck2::OP_REPORT) {
+    // reports are only expected on even rounds
+    ceph_assert((timecheck_round % 2) == 0);
+    timecheck_latencies.swap(m->latencies);
+    timecheck_skews.swap(m->skews);
+    return;
+  }
+
+  // pings are only expected on odd rounds
+  ceph_assert((timecheck_round % 2) != 0);
+  MTimeCheck2 *reply = new MTimeCheck2(MTimeCheck2::OP_PONG);
+  utime_t curr_time = ceph_clock_now();
+  reply->timestamp = curr_time;
+  reply->epoch = m->epoch;
+  reply->round = m->round;
+  // log the message we are actually sending (the pong), not the ping
+  dout(10) << __func__ << " send " << *reply
+           << " to " << m->get_source_inst() << dendl;
+  m->get_connection()->send_message(reply);
+}
+
+/**
+ * Entry point for MTimeCheck2 messages: route to the leader or peon
+ * handler depending on our role, dropping messages whose op does not
+ * match that role (or any message when we are neither).
+ */
+void Monitor::handle_timecheck(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (is_leader()) {
+    // the leader only handles pongs
+    if (m->op == MTimeCheck2::OP_PONG) {
+      handle_timecheck_leader(op);
+    } else {
+      dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl;
+    }
+    return;
+  }
+
+  if (!is_peon()) {
+    dout(1) << __func__ << " drop unexpected msg" << dendl;
+    return;
+  }
+
+  // peons handle pings and reports
+  const bool ping_or_report =
+    (m->op == MTimeCheck2::OP_PING || m->op == MTimeCheck2::OP_REPORT);
+  if (ping_or_report) {
+    handle_timecheck_peon(op);
+  } else {
+    dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl;
+  }
+}
+
+/**
+ * Handle an MMonSubscribe request.
+ *
+ * For every requested subscription: check capabilities ("monmap" and
+ * "config" need none, everything else needs mon read caps), drop any
+ * other log-channel subscription held by the session when a log-channel
+ * subscription is requested, register/update the subscription in the
+ * session map, and give the owning service a chance to act on it.
+ *
+ * An MMonSubscribeAck is sent only if at least one accepted subscription
+ * is not one-time and the connection lacks
+ * CEPH_FEATURE_MON_STATEFUL_SUB.
+ */
+void Monitor::handle_subscribe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSubscribe>();
+  dout(10) << "handle_subscribe " << *m << dendl;
+
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+
+  if (!m->hostname.empty()) {
+    s->remote_host = m->hostname;
+  }
+
+  // becomes true once an accepted subscription is not one-time
+  bool need_ack = false;
+
+  for (auto& entry : m->what) {
+    const auto& what = entry.first;
+    const auto& item = entry.second;
+
+    // "monmap" and "config" require no caps
+    const bool capless = (what == "monmap" || what == "config");
+    if (!capless && !s->is_capable("mon", MON_CAP_R)) {
+      dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+              << " not enough caps for " << *(op->get_req()) << " -- dropping"
+              << dendl;
+      continue;
+    }
+
+    // if there are any non-onetime subscriptions, we need to reply to start the resubscribe timer
+    if ((item.flags & CEPH_SUBSCRIBE_ONETIME) == 0) {
+      need_ack = true;
+    }
+
+    // remove conflicting subscribes
+    if (logmon()->sub_name_to_id(what) >= 0) {
+      auto it = s->sub_map.begin();
+      while (it != s->sub_map.end()) {
+        const bool conflicting =
+          (it->first != what && logmon()->sub_name_to_id(it->first) >= 0);
+        if (!conflicting) {
+          ++it;
+          continue;
+        }
+        std::lock_guard l(session_map_lock);
+        // step past the entry before it is removed
+        Subscription *stale = it->second;
+        ++it;
+        session_map.remove_sub(stale);
+      }
+    }
+
+    {
+      std::lock_guard l(session_map_lock);
+      session_map.add_update_sub(s, what, item.start,
+                                 item.flags & CEPH_SUBSCRIBE_ONETIME,
+                                 m->get_connection()->has_feature(CEPH_FEATURE_INCSUBOSDMAP));
+    }
+
+    // let the owning service look at the new/updated subscription
+    if (what.compare(0, 6, "mdsmap") == 0 || what.compare(0, 5, "fsmap") == 0) {
+      dout(10) << __func__ << ": MDS sub '" << what << "'" << dendl;
+      if (s->is_capable("mds", MON_CAP_R)) {
+        Subscription *sub = s->sub_map[what];
+        ceph_assert(sub != nullptr);
+        mdsmon()->check_sub(sub);
+      }
+    } else if (what == "osdmap") {
+      if (s->is_capable("osd", MON_CAP_R)) {
+        if (s->osd_epoch > item.start) {
+          // client needs earlier osdmaps on purpose, so reset the sent epoch
+          s->osd_epoch = 0;
+        }
+        osdmon()->check_osdmap_sub(s->sub_map["osdmap"]);
+      }
+    } else if (what == "osd_pg_creates") {
+      if (s->is_capable("osd", MON_CAP_W)) {
+        osdmon()->check_pg_creates_sub(s->sub_map["osd_pg_creates"]);
+      }
+    } else if (what == "monmap") {
+      monmon()->check_sub(s->sub_map[what]);
+    } else if (logmon()->sub_name_to_id(what) >= 0) {
+      logmon()->check_sub(s->sub_map[what]);
+    } else if (what == "mgrmap" || what == "mgrdigest") {
+      mgrmon()->check_sub(s->sub_map[what]);
+    } else if (what == "servicemap") {
+      mgrstatmon()->check_sub(s->sub_map[what]);
+    } else if (what == "config") {
+      configmon()->check_sub(s);
+    } else if (what.find("kv:") == 0) {
+      kvmon()->check_sub(s->sub_map[what]);
+    }
+  }
+
+  if (need_ack) {
+    // we only need to reply if the client is old enough to think it
+    // has to send renewals.
+    ConnectionRef con = m->get_connection();
+    if (!con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) {
+      con->send_message(new MMonSubscribeAck(
+        monmap->get_fsid(), (int)g_conf()->mon_subscribe_interval));
+    }
+  }
+}
+
+/**
+ * Answer an MMonGetVersion request with the first and last committed
+ * versions of the requested map ("mdsmap", "fsmap", "osdmap" or
+ * "monmap").
+ *
+ * The request is queued for retry while we are out of quorum or while
+ * the owning service is not readable.  An unknown map type is logged
+ * and no reply is sent.
+ */
+void Monitor::handle_get_version(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonGetVersion>();
+  dout(10) << "handle_get_version " << *m << dendl;
+
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+
+  if (!is_leader() && !is_peon()) {
+    dout(10) << " waiting for quorum" << dendl;
+    waitfor_quorum.push_back(new C_RetryMessage(this, op));
+    return;
+  }
+
+  // pick the paxos service that owns the requested map
+  PaxosService *svc = nullptr;
+  if (m->what == "mdsmap" || m->what == "fsmap") {
+    svc = mdsmon();
+  } else if (m->what == "osdmap") {
+    svc = osdmon();
+  } else if (m->what == "monmap") {
+    svc = monmon();
+  } else {
+    derr << "invalid map type " << m->what << dendl;
+  }
+
+  if (!svc) {
+    return;
+  }
+
+  if (!svc->is_readable()) {
+    svc->wait_for_readable(op, new C_RetryMessage(this, op));
+    return;
+  }
+
+  MMonGetVersionReply *reply = new MMonGetVersionReply();
+  reply->handle = m->handle;
+  reply->version = svc->get_last_committed();
+  reply->oldest_version = svc->get_first_committed();
+  reply->set_tid(m->get_tid());
+
+  m->get_connection()->send_message(reply);
+}
+
+/**
+ * Messenger callback for a reset connection.
+ *
+ * Connections to other monitors are ignored.  For any other peer that
+ * has a session attached, the connection's private pointer is cleared
+ * and, unless we are shutting down, the session is removed if it is
+ * still open and linked.
+ *
+ * @return true if a session was found and we are not shut down,
+ *         false otherwise
+ */
+bool Monitor::ms_handle_reset(Connection *con)
+{
+  dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl;
+
+  // ignore lossless monitor sessions
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+    return false;
+  }
+
+  auto priv = con->get_priv();
+  auto *session = static_cast<MonSession*>(priv.get());
+  if (session == nullptr) {
+    return false;
+  }
+
+  // break any con <-> session ref cycle
+  session->con->set_priv(nullptr);
+
+  if (is_shutdown()) {
+    return false;
+  }
+
+  std::lock_guard mon_guard(lock);
+
+  dout(10) << "reset/close on session " << session->name << " "
+           << session->addrs << dendl;
+  if (!session->closed && session->item.is_on_list()) {
+    std::lock_guard map_guard(session_map_lock);
+    remove_session(session);
+  }
+  return true;
+}
+
+/**
+ * Messenger callback for a refused connection.  Only logs the event.
+ *
+ * @return always false
+ */
+bool Monitor::ms_handle_refused(Connection *con)
+{
+  // just log for now...
+  dout(10) << "ms_handle_refused " << con
+           << " " << con->get_peer_addr()
+           << dendl;
+  return false;
+}
+
+// -----
+
+/**
+ * Encode the current monmap with the connection's feature bits and send
+ * it to the peer as an MMonMap.
+ */
+void Monitor::send_latest_monmap(Connection *con)
+{
+  bufferlist encoded;
+  monmap->encode(encoded, con->get_features());
+
+  auto *msg = new MMonMap(encoded);
+  con->send_message(msg);
+}
+
+/**
+ * Handle an MMonGetMap request by sending the latest monmap back on the
+ * connection the request arrived on.
+ */
+void Monitor::handle_mon_get_map(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonGetMap>();
+  dout(10) << "handle_mon_get_map" << dendl;
+
+  Connection *requester = m->get_connection().get();
+  send_latest_monmap(requester);
+}
+
+/**
+ * Load the "last_metadata" record from the monitor store into
+ * mon_metadata and mirror it into pending_metadata.
+ *
+ * @return 0 on success, otherwise the non-zero result of the store read
+ */
+int Monitor::load_metadata()
+{
+  bufferlist stored;
+  if (int r = store->get(MONITOR_STORE_PREFIX, "last_metadata", stored);
+      r != 0) {
+    return r;
+  }
+
+  auto cursor = stored.cbegin();
+  decode(mon_metadata, cursor);
+
+  pending_metadata = mon_metadata;
+  return 0;
+}
+
+/**
+ * Dump the metadata recorded for one monitor.
+ *
+ * @param mon  key into mon_metadata
+ * @param f    formatter receiving one string field per metadata entry;
+ *             must not be null
+ * @param err  receives an explanation when the monitor is unknown
+ * @return 0 on success, -EINVAL if no metadata is recorded for @p mon
+ */
+int Monitor::get_mon_metadata(int mon, Formatter *f, ostream& err)
+{
+  ceph_assert(f);
+
+  auto found = mon_metadata.find(mon);
+  if (found == mon_metadata.end()) {
+    err << "mon." << mon << " not found";
+    return -EINVAL;
+  }
+
+  for (const auto& kv : found->second) {
+    f->dump_string(kv.first.c_str(), kv.second);
+  }
+  return 0;
+}
+
+/**
+ * Tally, across all monitors in mon_metadata, how many report each
+ * value of @p field.  Monitors lacking the field are counted under
+ * "unknown".
+ *
+ * @param field  metadata key to look up
+ * @param out    counters to increment, keyed by value
+ */
+void Monitor::count_metadata(const string& field, map<string,int> *out)
+{
+  for (auto& entry : mon_metadata) {
+    auto& meta = entry.second;
+    auto hit = meta.find(field);
+    if (hit != meta.end()) {
+      ++(*out)[hit->second];
+    } else {
+      ++(*out)["unknown"];
+    }
+  }
+}
+
+/**
+ * Formatter flavour of count_metadata(): emits an object section named
+ * after @p field holding one integer per distinct value.
+ */
+void Monitor::count_metadata(const string& field, Formatter *f)
+{
+  map<string,int> tally;
+  count_metadata(field, &tally);
+
+  f->open_object_section(field.c_str());
+  for (auto& bucket : tally) {
+    f->dump_int(bucket.first.c_str(), bucket.second);
+  }
+  f->close_section();
+}
+
+/**
+ * Collect version information from the monitors and from the osd, mgr
+ * and mds services (in that order) into @p versions.
+ */
+void Monitor::get_all_versions(std::map<string, list<string> > &versions)
+{
+  get_versions(versions);            // mon
+  osdmon()->get_versions(versions);  // osd
+  mgrmon()->get_versions(versions);  // mgr
+  mdsmon()->get_versions(versions);  // mds
+
+  dout(20) << __func__ << " all versions=" << versions << dendl;
+}
+
+/**
+ * Append "mon.<name>" to @p versions under each monitor's
+ * "ceph_version_short" metadata value.  Monitors without that field are
+ * skipped.
+ */
+void Monitor::get_versions(std::map<string, list<string> > &versions)
+{
+  for (auto& [mon_rank, meta] : mon_metadata) {
+    auto ver = meta.find("ceph_version_short");
+    if (ver != meta.end()) {
+      versions[ver->second].push_back(
+        string("mon.") + monmap->get_name(mon_rank));
+    }
+    // else: not likely
+  }
+}
+
+/**
+ * Group monitor names by the "hostname" found in their metadata and
+ * hand the result to dump_services() under the "mon" type.  Monitors
+ * without a hostname entry are skipped.
+ *
+ * @param f    formatter passed on to dump_services()
+ * @param err  not written to by this function
+ * @return always 0
+ */
+int Monitor::print_nodes(Formatter *f, ostream& err)
+{
+  map<string, list<string> > by_host;	// hostname => mon
+
+  for (auto& [mon_rank, meta] : mon_metadata) {
+    auto host = meta.find("hostname");
+    if (host != meta.end()) {
+      by_host[host->second].push_back(monmap->get_name(mon_rank));
+    }
+    // else: not likely though
+  }
+
+  dump_services(f, by_host, "mon");
+  return 0;
+}
+
+// ----------------------------------------------
+// scrub
+
+/**
+ * Begin a new scrub.  Must be called on the leader.
+ *
+ * @return -EBUSY if scrub results are already being collected,
+ *         0 once the first scrub round has been kicked off
+ */
+int Monitor::scrub_start()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+
+  const bool in_progress = !scrub_result.empty();
+  if (in_progress) {
+    clog->info() << "scrub already in progress";
+    return -EBUSY;
+  }
+
+  // drop any scheduled scrub event and start from a clean state
+  scrub_event_cancel();
+  scrub_result.clear();
+  scrub_state.reset(new ScrubState);
+
+  scrub();
+  return 0;
+}
+
+/**
+ * Run one scrub round on the leader: ask every other quorum member to
+ * scrub the next chunk of keys, scrub the same chunk locally, and arm
+ * the scrub timeout.  When we are the only quorum member the whole store
+ * is scrubbed at once and the scrub is finished immediately.
+ *
+ * @return always 0
+ */
+int Monitor::scrub()
+{
+  ceph_assert(is_leader());
+  ceph_assert(scrub_state);
+
+  scrub_cancel_timeout();
+  wait_for_paxos_write();
+  scrub_version = paxos->get_version();
+
+  // scrub all keys if we're the only monitor in the quorum
+  const bool alone = (quorum.size() == 1);
+  int32_t num_keys = (alone ? -1 : cct->_conf->mon_scrub_max_keys);
+
+  for (int peer : quorum) {
+    if (peer == rank) {
+      continue;
+    }
+    MMonScrub *req = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version,
+                                   num_keys);
+    req->key = scrub_state->last_key;
+    send_mon_message(req, peer);
+  }
+
+  // scrub my keys
+  const bool more_keys = _scrub(&scrub_result[rank],
+                                &scrub_state->last_key,
+                                &num_keys);
+  scrub_state->finished = !more_keys;
+
+  // only after we got our scrub results do we really care whether the
+  // other monitors are late on their results.  Also, this way we avoid
+  // triggering the timeout if we end up getting stuck in _scrub() for
+  // longer than the duration of the timeout.
+  scrub_reset_timeout();
+
+  if (quorum.size() == 1) {
+    ceph_assert(scrub_state->finished == true);
+    scrub_finish();
+  }
+  return 0;
+}
+
+/**
+ * Handle an MMonScrub message.
+ *
+ * OP_SCRUB (peons only): scrub the requested chunk locally and send the
+ * result back, provided the requested version matches our paxos version.
+ *
+ * OP_RESULT (leader only): record the peer's result for the current
+ * scrub version; once results from the whole quorum are in, compare
+ * them and either finish the scrub or start the next round.
+ *
+ * Any other op, or a message that fails the role/version checks, is
+ * silently ignored.
+ */
+void Monitor::handle_scrub(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonScrub>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (m->op == MMonScrub::OP_SCRUB) {
+    if (!is_peon()) {
+      return;
+    }
+
+    wait_for_paxos_write();
+
+    if (m->version != paxos->get_version()) {
+      return;
+    }
+
+    MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT,
+                                     m->version,
+                                     m->num_keys);
+
+    reply->key = m->key;
+    _scrub(&reply->result, &reply->key, &reply->num_keys);
+    m->get_connection()->send_message(reply);
+    return;
+  }
+
+  if (m->op == MMonScrub::OP_RESULT) {
+    if (!is_leader()) {
+      return;
+    }
+    if (m->version != scrub_version) {
+      return;
+    }
+    // reset the timeout each time we get a result
+    scrub_reset_timeout();
+
+    int from = m->get_source().num();
+    ceph_assert(scrub_result.count(from) == 0);
+    scrub_result[from] = m->result;
+
+    if (scrub_result.size() != quorum.size()) {
+      // still waiting on other monitors
+      return;
+    }
+
+    scrub_check_results();
+    scrub_result.clear();
+    if (scrub_state->finished) {
+      scrub_finish();
+    } else {
+      scrub();
+    }
+  }
+}
+
+/**
+ * Scrub a chunk of the local store.
+ *
+ * Walks the store from @p start over all sync-target prefixes except
+ * "paxos", accumulating a per-prefix key count and running crc32c into
+ * @p r.  The mon_scrub_inject_* options can randomly skip keys or
+ * perturb a crc.
+ *
+ * @param r         result accumulator; must not be null
+ * @param start     in: key to start from; out: last key scrubbed
+ * @param num_keys  in: maximum number of keys to scrub (values <= 0 mean
+ *                  no limit); out: number of keys actually scrubbed
+ * @return true if the store iterator still has more keys to offer
+ */
+bool Monitor::_scrub(ScrubResult *r,
+                     pair<string,string> *start,
+                     int *num_keys)
+{
+  ceph_assert(r != nullptr);
+  ceph_assert(start != nullptr);
+  ceph_assert(num_keys != nullptr);
+
+  set<string> prefixes = get_sync_targets_names();
+  prefixes.erase("paxos");  // exclude paxos, as this one may have extra states for proposals, etc.
+
+  dout(10) << __func__ << " start (" << *start << ")"
+           << " num_keys " << *num_keys << dendl;
+
+  MonitorDBStore::Synchronizer it = store->get_synchronizer(*start, prefixes);
+
+  const int limit = *num_keys;
+  int scrubbed_keys = 0;
+  pair<string,string> last_key;
+
+  while (it->has_next_chunk()) {
+    // stop once the (positive) key budget is used up
+    if (limit > 0 && scrubbed_keys == limit) {
+      break;
+    }
+
+    pair<string,string> k = it->get_next_key();
+    if (prefixes.count(k.first) == 0) {
+      continue;
+    }
+
+    if (cct->_conf->mon_scrub_inject_missing_keys > 0.0 &&
+        (rand() % 10000 < cct->_conf->mon_scrub_inject_missing_keys*10000.0)) {
+      dout(10) << __func__ << " inject missing key, skipping (" << k << ")"
+               << dendl;
+      continue;
+    }
+
+    bufferlist bl;
+    int err = store->get(k.first, k.second, bl);
+    ceph_assert(err == 0);
+
+    uint32_t key_crc = bl.crc32c(0);
+    dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes"
+             << " crc " << key_crc << dendl;
+
+    ++r->prefix_keys[k.first];
+    // operator[] starts a new prefix at crc 0; fold this value into it
+    auto& running_crc = r->prefix_crc[k.first];
+    running_crc = bl.crc32c(running_crc);
+
+    if (cct->_conf->mon_scrub_inject_crc_mismatch > 0.0 &&
+        (rand() % 10000 < cct->_conf->mon_scrub_inject_crc_mismatch*10000.0)) {
+      dout(10) << __func__ << " inject failure at (" << k << ")" << dendl;
+      running_crc += 1;
+    }
+
+    ++scrubbed_keys;
+    last_key = k;
+  }
+
+  dout(20) << __func__ << " last_key (" << last_key << ")"
+           << " scrubbed_keys " << scrubbed_keys
+           << " has_next " << it->has_next_chunk() << dendl;
+
+  *start = last_key;
+  *num_keys = scrubbed_keys;
+
+  return it->has_next_chunk();
+}
+
+/**
+ * Compare every collected scrub result against our own and report
+ * mismatches to the cluster log; log a debug "scrub ok" line when all
+ * results agree.
+ */
+void Monitor::scrub_check_results()
+{
+  dout(10) << __func__ << dendl;
+
+  // compare
+  ScrubResult& mine = scrub_result[rank];
+  int mismatches = 0;
+
+  for (auto& [peer, theirs] : scrub_result) {
+    if (peer == rank) {
+      continue;
+    }
+    if (theirs != mine) {
+      ++mismatches;
+      clog->error() << "scrub mismatch";
+      clog->error() << " mon." << rank << " " << mine;
+      clog->error() << " mon." << peer << " " << theirs;
+    }
+  }
+
+  if (mismatches == 0) {
+    clog->debug() << "scrub ok on " << quorum << ": " << mine;
+  }
+}
+
+/**
+ * Scrub timeout handler: discard the current scrub state and start a
+ * fresh scrub.
+ */
+inline void Monitor::scrub_timeout()
+{
+  dout(1) << __func__ << " restarting scrub" << dendl;
+
+  scrub_reset();   // throw away whatever was collected so far
+  scrub_start();   // and begin again
+}
+
+/**
+ * Wrap up a completed scrub: clear the scrub state and schedule the next
+ * scrub event.
+ */
+void Monitor::scrub_finish()
+{
+  dout(10) << __func__ << dendl;
+
+  scrub_reset();
+  scrub_event_start();
+}
+
+/**
+ * Drop all in-flight scrub state: cancel the timeout, zero the scrub
+ * version, and clear collected results and the scrub cursor.
+ */
+void Monitor::scrub_reset()
+{
+  dout(10) << __func__ << dendl;
+
+  scrub_cancel_timeout();
+
+  scrub_version = 0;
+  scrub_result.clear();
+  scrub_state.reset();
+}
+
+/**
+ * React to a change of the scrub interval by rescheduling the scrub
+ * event.  Does nothing when we are not the leader or when a scrub is
+ * currently running.
+ *
+ * @param interval  the new interval; only used for logging here
+ */
+inline void Monitor::scrub_update_interval(ceph::timespan interval)
+{
+  // we don't care about changes if we are not the leader.
+  // changes will be visible if we become the leader.
+  if (!is_leader()) {
+    return;
+  }
+
+  dout(1) << __func__ << " new interval = " << interval << dendl;
+
+  // if scrub already in progress, all changes will already be visible during
+  // the next round.  Nothing to do.
+  if (scrub_state) {
+    return;
+  }
+
+  // reschedule so the new interval takes effect
+  scrub_event_cancel();
+  scrub_event_start();
+}
+
+/**
+ * Schedule the next scrub after "mon_scrub_interval".  Any previously
+ * scheduled event is cancelled first.  An interval of zero disables the
+ * scrub event.
+ */
+void Monitor::scrub_event_start()
+{
+  dout(10) << __func__ << dendl;
+
+  if (scrub_event) {
+    scrub_event_cancel();
+  }
+
+  const auto interval =
+    cct->_conf.get_val<std::chrono::seconds>("mon_scrub_interval");
+  if (interval == std::chrono::seconds::zero()) {
+    dout(1) << __func__ << " scrub event is disabled"
+            << " (mon_scrub_interval = " << interval
+            << ")" << dendl;
+    return;
+  }
+
+  auto *on_fire = new C_MonContext{this, [this](int) {
+      scrub_start();
+    }};
+  scrub_event = timer.add_event_after(interval, on_fire);
+}
+
+/**
+ * Cancel the scheduled scrub event, if there is one.
+ */
+void Monitor::scrub_event_cancel()
+{
+  dout(10) << __func__ << dendl;
+
+  if (!scrub_event) {
+    return;
+  }
+  timer.cancel_event(scrub_event);
+  scrub_event = nullptr;
+}
+
+/**
+ * Cancel the pending scrub timeout event, if there is one.
+ */
+inline void Monitor::scrub_cancel_timeout()
+{
+  if (!scrub_timeout_event) {
+    return;
+  }
+  timer.cancel_event(scrub_timeout_event);
+  scrub_timeout_event = nullptr;
+}
+
+/**
+ * (Re)arm the scrub timeout: cancel any pending timeout event and
+ * schedule scrub_timeout() to run after mon_scrub_timeout.
+ */
+void Monitor::scrub_reset_timeout()
+{
+  dout(15) << __func__ << " reset timeout event" << dendl;
+  scrub_cancel_timeout();
+
+  auto *on_timeout = new C_MonContext{this, [this](int) {
+      scrub_timeout();
+    }};
+  scrub_timeout_event =
+    timer.add_event_after(g_conf()->mon_scrub_timeout, on_timeout);
+}
+
+/************ TICK ***************/
+/**
+ * Schedule the next call to tick() after mon_tick_interval.
+ */
+void Monitor::new_tick()
+{
+  auto *next_tick = new C_MonContext{this, [this](int) {
+      tick();
+    }};
+  timer.add_event_after(g_conf()->mon_tick_interval, next_tick);
+}
+
+/**
+ * Periodic housekeeping, re-armed at the end of every run:
+ *  - on the leader, emit delayed "Health check update" log messages;
+ *  - tick and trim every paxos service;
+ *  - trim client sessions that timed out, or all client sessions when
+ *    we have been out of quorum for more than two lease periods;
+ *  - trim sync providers and wake contexts waiting on quorum;
+ *  - on the leader, propose a cluster fingerprint if none exists;
+ *  - push our health metrics to the mgr client.
+ */
+void Monitor::tick()
+{
+  // ok go.
+  dout(11) << "tick" << dendl;
+  const utime_t now = ceph_clock_now();
+
+  // Check if we need to emit any delayed health check updated messages
+  if (is_leader()) {
+    const auto min_period = g_conf().get_val<int64_t>(
+                              "mon_health_log_update_period");
+    for (auto& svc : paxos_service) {
+      auto health = svc->get_health_checks();
+
+      for (const auto& [code, check] : health.checks) {
+        auto logged = health_check_log_times.find(code);
+        if (logged == health_check_log_times.end()) {
+          // never logged this check; nothing to update
+          continue;
+        }
+
+        auto& log_status = logged->second;
+        const bool changed = log_status.last_message != check.summary
+                             || log_status.severity != check.severity;
+        if (!changed) {
+          continue;
+        }
+
+        // rate-limit updates for the same check
+        if (now - log_status.updated_at > min_period) {
+          log_status.last_message = check.summary;
+          log_status.updated_at = now;
+          log_status.severity = check.severity;
+
+          ostringstream ss;
+          ss << "Health check update: " << check.summary
+             << " (" << code << ")";
+          clog->health(check.severity) << ss.str();
+        }
+      }
+    }
+  }
+
+  for (auto& svc : paxos_service) {
+    svc->tick();
+    svc->maybe_trim();
+  }
+
+  // trim sessions
+  {
+    std::lock_guard l(session_map_lock);
+
+    const bool out_for_too_long =
+      (!exited_quorum.is_zero() &&
+       now > (exited_quorum + 2*g_conf()->mon_lease));
+
+    for (auto p = session_map.sessions.begin(); !p.end(); ) {
+      MonSession *s = *p;
+      // advance first: the session may be removed below
+      ++p;
+
+      // don't trim monitors
+      if (s->name.is_mon()) {
+        continue;
+      }
+
+      if (s->session_timeout < now && s->con) {
+        // check keepalive, too
+        s->session_timeout = s->con->get_last_keepalive();
+        s->session_timeout += g_conf()->mon_session_timeout;
+      }
+
+      if (s->session_timeout < now) {
+        dout(10) << " trimming session " << s->con << " " << s->name
+                 << " " << s->addrs
+                 << " (timeout " << s->session_timeout
+                 << " < now " << now << ")" << dendl;
+      } else if (out_for_too_long) {
+        // boot the client Session because we've taken too long getting back in
+        dout(10) << " trimming session " << s->con << " " << s->name
+                 << " because we've been out of quorum too long" << dendl;
+      } else {
+        continue;
+      }
+
+      s->con->mark_down();
+      remove_session(s);
+      logger->inc(l_mon_session_trim);
+    }
+  }
+  sync_trim_providers();
+
+  if (!maybe_wait_for_quorum.empty()) {
+    finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+  }
+
+  if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
+    // this is only necessary on upgraded clusters.
+    MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+    prepare_new_fingerprint(t);
+    paxos->trigger_propose();
+  }
+
+  mgr_client.update_daemon_health(get_health_metrics());
+  new_tick();
+}
+
+/**
+ * Build the daemon health metrics reported to the mgr.
+ *
+ * Counts in-flight ops initiated before (now - mon_op_complaint_time)
+ * as slow, remembering the oldest one for the log message.
+ *
+ * @return a single SLOW_OPS metric (zeroed when the op tracker visit
+ *         returns false)
+ */
+vector<DaemonHealthMetric> Monitor::get_health_metrics() 
+{
+  vector<DaemonHealthMetric> metrics;
+
+  utime_t oldest_secs;
+  const utime_t now = ceph_clock_now();
+
+  // ops initiated before this point in time are considered slow
+  const auto complaint_secs =
+    g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count();
+  auto too_old = now;
+  too_old -= complaint_secs;
+
+  int slow = 0;
+  TrackedOpRef oldest_op;
+  auto count_slow_ops = [&](TrackedOp& op) {
+    if (!(op.get_initiated() < too_old)) {
+      return false;
+    }
+    slow++;
+    if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
+      oldest_op = &op;
+    }
+    return true;
+  };
+
+  if (!op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
+    metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
+    return metrics;
+  }
+
+  if (slow) {
+    derr << __func__ << " reporting " << slow << " slow ops, oldest is "
+         << oldest_op->get_desc() << dendl;
+  }
+  metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
+  return metrics;
+}
+
+/**
+ * Generate a random uuid and stage it in @p t as the
+ * "cluster_fingerprint" key under MONITOR_NAME.
+ */
+void Monitor::prepare_new_fingerprint(MonitorDBStore::TransactionRef t)
+{
+  uuid_d fresh;
+  fresh.generate_random();
+  dout(10) << __func__ << " proposing cluster_fingerprint " << fresh << dendl;
+
+  bufferlist encoded;
+  encode(fresh, encoded);
+  t->put(MONITOR_NAME, "cluster_fingerprint", encoded);
+}
+
+/**
+ * Verify that the "cluster_uuid" stored on disk matches the fsid in our
+ * monmap.
+ *
+ * @return 0 if they match, -ENOENT if no cluster_uuid is stored,
+ *         -EINVAL if the stored value cannot be parsed as a uuid,
+ *         -EEXIST if it differs from the monmap's fsid
+ */
+int Monitor::check_fsid()
+{
+  bufferlist stored;
+  const int r = store->get(MONITOR_NAME, "cluster_uuid", stored);
+  if (r == -ENOENT) {
+    return r;
+  }
+  ceph_assert(r == 0);
+
+  string text(stored.c_str(), stored.length());
+
+  // only keep the first line
+  const size_t eol = text.find('\n');
+  if (eol != string::npos) {
+    text.erase(eol);
+  }
+
+  dout(10) << "check_fsid cluster_uuid contains '" << text << "'" << dendl;
+
+  uuid_d ondisk;
+  if (!ondisk.parse(text.c_str())) {
+    derr << "error: unable to parse uuid" << dendl;
+    return -EINVAL;
+  }
+
+  if (monmap->get_fsid() != ondisk) {
+    derr << "error: cluster_uuid file exists with value " << ondisk
+         << ", != our uuid " << monmap->get_fsid() << dendl;
+    return -EEXIST;
+  }
+
+  return 0;
+}
+
+/**
+ * Write the monmap's fsid to the store in its own transaction.
+ *
+ * @return the result of applying the transaction
+ */
+int Monitor::write_fsid()
+{
+  auto txn = std::make_shared<MonitorDBStore::Transaction>();
+  write_fsid(txn);
+  return store->apply_transaction(txn);
+}
+
+/**
+ * Stage the monmap's fsid, followed by a newline, as the "cluster_uuid"
+ * key under MONITOR_NAME in @p t.  The transaction is not applied here.
+ *
+ * @return always 0
+ */
+int Monitor::write_fsid(MonitorDBStore::TransactionRef t)
+{
+  ostringstream formatted;
+  formatted << monmap->get_fsid() << "\n";
+
+  bufferlist value;
+  value.append(formatted.str());
+
+  t->put(MONITOR_NAME, "cluster_uuid", value);
+  return 0;
+}
+
+/*
+ * this is the closest thing to a traditional 'mkfs' for ceph.
+ * initialize the monitor state machines to their initial values.
+ *
+ * Everything is staged into a single transaction: the on-disk magic,
+ * the initial feature set, the encoded monmap, an optional osdmap, the
+ * seed keyring (when a keyring is required) and the cluster fsid.
+ *
+ * @param osdmapbl  optional encoded OSDMap; when non-empty it must decode
+ *                  cleanly or -EINVAL is returned
+ * @return 0 on success, a negative error code otherwise (including a
+ *         failure to apply the final transaction)
+ */
+int Monitor::mkfs(bufferlist& osdmapbl)
+{
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+
+  // verify cluster fsid
+  int r = check_fsid();
+  if (r < 0 && r != -ENOENT)
+    return r;
+
+  bufferlist magicbl;
+  magicbl.append(CEPH_MON_ONDISK_MAGIC);
+  magicbl.append("\n");
+  t->put(MONITOR_NAME, "magic", magicbl);
+
+
+  features = get_initial_supported_features();
+  write_features(t);
+
+  // save monmap, osdmap, keyring.
+  bufferlist monmapbl;
+  monmap->encode(monmapbl, CEPH_FEATURES_ALL);
+  monmap->set_epoch(0);     // must be 0 to avoid confusing first MonmapMonitor::update_from_paxos()
+  t->put("mkfs", "monmap", monmapbl);
+
+  if (osdmapbl.length()) {
+    // make sure it's a valid osdmap
+    try {
+      OSDMap om;
+      om.decode(osdmapbl);
+    }
+    catch (ceph::buffer::error& e) {
+      derr << "error decoding provided osdmap: " << e.what() << dendl;
+      return -EINVAL;
+    }
+    t->put("mkfs", "osdmap", osdmapbl);
+  }
+
+  if (is_keyring_required()) {
+    KeyRing keyring;
+    string keyring_filename;
+
+    r = ceph_resolve_file_search(g_conf()->keyring, keyring_filename);
+    if (r) {
+      // no keyring file found: fall back to the configured key, if any
+      if (g_conf()->key != "") {
+        string keyring_plaintext = "[mon.]\n\tkey = " + g_conf()->key +
+          "\n\tcaps mon = \"allow *\"\n";
+        bufferlist bl;
+        bl.append(keyring_plaintext);
+        try {
+          auto i = bl.cbegin();
+          keyring.decode_plaintext(i);
+        }
+        catch (const ceph::buffer::error& e) {
+          derr << "error decoding keyring " << keyring_plaintext
+               << ": " << e.what() << dendl;
+          return -EINVAL;
+        }
+      } else {
+        derr << "unable to find a keyring on " << g_conf()->keyring
+             << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else {
+      r = keyring.load(g_ceph_context, keyring_filename);
+      if (r < 0) {
+        derr << "unable to load initial keyring " << g_conf()->keyring << dendl;
+        return r;
+      }
+    }
+
+    // put mon. key in external keyring; seed with everything else.
+    extract_save_mon_key(keyring);
+
+    bufferlist keyringbl;
+    keyring.encode_plaintext(keyringbl);
+    t->put("mkfs", "keyring", keyringbl);
+  }
+  write_fsid(t);
+
+  // do not report success if the store could not commit the mkfs state
+  r = store->apply_transaction(t);
+  if (r < 0) {
+    derr << __func__ << " failed to apply mkfs transaction: "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Write @p bl to "<mon_data>/keyring", replacing any previous content.
+ *
+ * The file is created with mode 0600 if missing and truncated if it
+ * already exists, so that a shorter keyring never leaves stale bytes
+ * from an older, longer one behind.  The data is fsync'ed before the
+ * descriptor is closed.
+ *
+ * @param bl  keyring contents to write
+ * @return 0 on success, a negative errno-style code if opening, writing
+ *         or syncing the file fails
+ */
+int Monitor::write_default_keyring(bufferlist& bl)
+{
+  ostringstream os;
+  os << g_conf()->mon_data << "/keyring";
+  const string path = os.str();
+
+  int err = 0;
+  int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0600);
+  if (fd < 0) {
+    err = -errno;
+    dout(0) << __func__ << " failed to open " << path
+            << ": " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  err = bl.write_fd(fd);
+  if (err) {
+    dout(0) << __func__ << " failed to write " << path
+            << ": " << cpp_strerror(err) << dendl;
+  } else if (::fsync(fd) < 0) {
+    // the data may not have reached stable storage; report it
+    err = -errno;
+    dout(0) << __func__ << " failed to fsync " << path
+            << ": " << cpp_strerror(err) << dendl;
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+  return err;
+}
+
+/**
+ * If @p keyring holds an entry for the monitor entity, write that entry
+ * to the default keyring file via write_default_keyring() and remove it
+ * from @p keyring.  A failure to write the file is logged; the entry is
+ * removed from @p keyring either way.
+ *
+ * @param keyring  keyring to extract the monitor entry from (modified)
+ */
+void Monitor::extract_save_mon_key(KeyRing& keyring)
+{
+  EntityName mon_name;
+  mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+  EntityAuth mon_key;
+  if (keyring.get_auth(mon_name, mon_key)) {
+    dout(10) << "extract_save_mon_key moving mon. key to separate keyring" << dendl;
+    KeyRing pkey;
+    pkey.add(mon_name, mon_key);
+    bufferlist bl;
+    pkey.encode_plaintext(bl);
+    int r = write_default_keyring(bl);
+    if (r < 0) {
+      // don't lose the failure silently
+      derr << __func__ << " failed to write mon. key to default keyring: "
+           << cpp_strerror(r) << dendl;
+    }
+    keyring.remove(mon_name);
+  }
+}
+
+// AuthClient methods -- for mon <-> mon communication
+/**
+ * Build the initial auth request for an outgoing connection.
+ *
+ * Only connections to monitors and mgrs are supported.  On success the
+ * authorizer is stored in @p auth_meta and its protocol, preferred
+ * connection modes and payload are returned through the out parameters.
+ *
+ * @return 0 on success, -EACCES for any other peer type or when no
+ *         authorizer could be obtained
+ */
+int Monitor::get_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t *method,
+  vector<uint32_t> *preferred_modes,
+  bufferlist *out)
+{
+  std::scoped_lock l(auth_lock);
+
+  const bool to_mon = (con->get_peer_type() == CEPH_ENTITY_TYPE_MON);
+  if (!to_mon && con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
+    return -EACCES;
+  }
+
+  AuthAuthorizer *auth;
+  const bool have_auth = get_authorizer(con->get_peer_type(), &auth);
+  if (!have_auth) {
+    return -EACCES;
+  }
+
+  // auth_meta takes ownership of the authorizer
+  auth_meta->authorizer.reset(auth);
+  auth_registry.get_supported_modes(con->get_peer_type(),
+                                    auth->protocol,
+                                    preferred_modes);
+  *method = auth->protocol;
+  *out = auth->bl;
+  return 0;
+}
+
+/**
+ * Handle an auth "more" reply: feed the received challenge to the
+ * connection's authorizer and return the authorizer's payload as the
+ * next message to send.
+ *
+ * @return 0 on success, -EACCES if the connection has no authorizer
+ */
+int Monitor::handle_auth_reply_more(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  const bufferlist& bl,
+  bufferlist *reply)
+{
+  std::scoped_lock l(auth_lock);
+
+  auto& authorizer = auth_meta->authorizer;
+  if (!authorizer) {
+    derr << __func__ << " no authorizer?" << dendl;
+    return -EACCES;
+  }
+
+  authorizer->add_challenge(cct, bl);
+  *reply = authorizer->bl;
+  return 0;
+}
+
+/**
+ * Handle the final auth reply for an outgoing connection: verify the
+ * peer's authorizer reply and, on success, copy the authorizer's session
+ * key into @p auth_meta.
+ *
+ * @param con                connection being authenticated (unused here)
+ * @param auth_meta          per-connection auth state; must carry the
+ *                           authorizer set up by get_auth_request()
+ * @param global_id          unused here
+ * @param con_mode           unused here
+ * @param bl                 authorizer reply payload
+ * @param session_key        unused here
+ * @param connection_secret  passed through to verify_reply()
+ * @return 0 on success, -EACCES if there is no authorizer or the reply
+ *         fails verification
+ */
+int Monitor::handle_auth_done(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint64_t global_id,
+  uint32_t con_mode,
+  const bufferlist& bl,
+  CryptoKey *session_key,
+  std::string *connection_secret)
+{
+  std::scoped_lock l(auth_lock);
+  // guard against a missing authorizer, as handle_auth_reply_more() does
+  if (!auth_meta->authorizer) {
+    derr << __func__ << " no authorizer?" << dendl;
+    return -EACCES;
+  }
+  // verify authorizer reply
+  auto p = bl.begin();
+  if (!auth_meta->authorizer->verify_reply(p, connection_secret)) {
+    dout(0) << __func__ << " failed verifying authorizer reply" << dendl;
+    return -EACCES;
+  }
+  auth_meta->session_key = auth_meta->authorizer->session_key;
+  return 0;
+}
+
+/**
+ * Handle a peer's rejection of our auth method.  The rejection is only
+ * logged; no alternative method is attempted.
+ *
+ * @return always -EACCES
+ */
+int Monitor::handle_auth_bad_method(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t old_auth_method,
+  int result,
+  const std::vector<uint32_t>& allowed_methods,
+  const std::vector<uint32_t>& allowed_modes)
+{
+  derr << __func__
+       << " hmm, they didn't like " << old_auth_method
+       << " result " << cpp_strerror(result)
+       << dendl;
+  return -EACCES;
+}
+
+/**
+ * Build an authorizer for a connection to a monitor or a manager.
+ *
+ * @param service_id  CEPH_ENTITY_TYPE_* of the peer; only MON and MGR are
+ *                    accepted
+ * @param authorizer  receives the newly built authorizer on success
+ * @return true if *authorizer was set; false if we are shut down, the
+ *         peer type is not MON/MGR, no mon secret could be found, or
+ *         building the session info / ticket failed
+ */
+bool Monitor::get_authorizer(int service_id, AuthAuthorizer **authorizer)
+{
+  dout(10) << "get_authorizer for " << ceph_entity_type_name(service_id)
+           << dendl;
+
+  if (is_shutdown()) {
+    return false;
+  }
+
+  // we only connect to other monitors and mgrs; everyone else connects to us.
+  const bool to_mon = (service_id == CEPH_ENTITY_TYPE_MON);
+  const bool to_mgr = (service_id == CEPH_ENTITY_TYPE_MGR);
+  if (!(to_mon || to_mgr)) {
+    return false;
+  }
+
+  if (!auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) {
+    // cephx is not among the cluster auth methods: use auth_none
+    dout(20) << __func__ << " building auth_none authorizer" << dendl;
+    AuthNoneClientHandler none_handler{g_ceph_context};
+    none_handler.set_global_id(0);
+    *authorizer = none_handler.build_authorizer(service_id);
+    return true;
+  }
+
+  // the ticket names a mon-typed entity and carries global_id 0
+  EntityName mon_name;
+  mon_name.set_type(CEPH_ENTITY_TYPE_MON);
+  CephXServiceTicketInfo ticket_info;
+  ticket_info.ticket.name = mon_name;
+  ticket_info.ticket.global_id = 0;
+
+  CephXSessionAuthInfo session_info;
+  if (to_mon) {
+    // mon to mon authentication uses the private monitor shared key and
+    // not the rotating key
+    CryptoKey mon_secret;
+    const bool found = keyring.get_secret(mon_name, mon_secret) ||
+                       key_server.get_secret(mon_name, mon_secret);
+    if (!found) {
+      dout(0) << " couldn't get secret for mon service from keyring or keyserver"
+              << dendl;
+      // dump whatever the key server knows to help diagnose the failure
+      stringstream listing;
+      const char *header = (key_server.list_secrets(listing) < 0)
+        ? "no installed auth entries!"
+        : "installed auth entries:";
+      dout(0) << header << "\n" << listing.str() << dendl;
+      return false;
+    }
+
+    const int rc = key_server.build_session_auth_info(
+      service_id, ticket_info.ticket, mon_secret, (uint64_t)-1, session_info);
+    if (rc < 0) {
+      dout(0) << __func__ << " failed to build mon session_auth_info "
+              << cpp_strerror(rc) << dendl;
+      return false;
+    }
+  } else if (to_mgr) {
+    // mgr: let the key server pick the service secret
+    const int rc = key_server.build_session_auth_info(
+      service_id, ticket_info.ticket, session_info);
+    if (rc < 0) {
+      derr << __func__ << " failed to build mgr service session_auth_info "
+           << cpp_strerror(rc) << dendl;
+      return false;
+    }
+  } else {
+    ceph_abort();  // excluded by the peer type check above
+  }
+
+  CephXTicketBlob ticket_blob;
+  if (!cephx_build_service_ticket_blob(cct, session_info, ticket_blob)) {
+    dout(0) << "get_authorizer failed to build service ticket" << dendl;
+    return false;
+  }
+
+  // round-trip the blob through a bufferlist into a ticket handler
+  bufferlist encoded;
+  encode(ticket_blob, encoded);
+
+  CephXTicketHandler ticket_handler(g_ceph_context, service_id);
+  auto it = encoded.cbegin();
+  decode(ticket_handler.ticket, it);
+  ticket_handler.session_key = session_info.session_key;
+
+  *authorizer = ticket_handler.build_authorizer(0);
+  return true;
+}
+
+/**
+ * Handle one round of an incoming authentication exchange.
+ *
+ * Two families of auth mode are handled (the mode is the first payload
+ * byte of the initial request and is remembered in auth_meta->auth_mode):
+ *  - AUTH_MODE_AUTHORIZER..AUTH_MODE_AUTHORIZER_MAX: the payload is an
+ *    authorizer that is checked with an AuthAuthorizeHandler.
+ *  - AUTH_MODE_MON..AUTH_MODE_MON_MAX: a MonSession with an
+ *    AuthServiceHandler is created on the first round and driven on
+ *    subsequent rounds.
+ *
+ * @param con          connection being authenticated (not fully negotiated)
+ * @param auth_meta    per-connection auth state
+ * @param more         false for the initial request, true for follow-ups
+ * @param auth_method  auth method requested by the peer
+ * @param payload      request bytes
+ * @param reply        filled with the bytes to send back
+ * @return 1 if the authorizer is valid (or a v1 non-mon peer sent no
+ *         authorizer); 0 if an authorizer challenge was issued on the
+ *         first round; -EACCES, -EOPNOTSUPP or -EBUSY on failure;
+ *         otherwise the AuthServiceHandler's result
+ */
+int Monitor::handle_auth_request(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  bool more,
+  uint32_t auth_method,
+  const bufferlist &payload,
+  bufferlist *reply)
+{
+  std::scoped_lock l(auth_lock);
+
+  // NOTE: be careful, the Connection hasn't fully negotiated yet, so
+  // e.g., peer_features, peer_addrs, and others are still unknown.
+
+  dout(10) << __func__ << " con " << con << (more ? " (more)":" (start)")
+           << " method " << auth_method
+           << " payload " << payload.length()
+           << dendl;
+  if (!payload.length()) {
+    if (!con->is_msgr2() &&
+        con->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
+      // for v1 connections, we tolerate no authorizer (from
+      // non-monitors), because authentication happens via MAuth
+      // messages.
+      return 1;
+    }
+    return -EACCES;
+  }
+  if (!more) {
+    // the first byte of the initial payload selects the auth mode
+    auth_meta->auth_mode = payload[0];
+  }
+
+  if (auth_meta->auth_mode >= AUTH_MODE_AUTHORIZER &&
+      auth_meta->auth_mode <= AUTH_MODE_AUTHORIZER_MAX) {
+    AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(),
+                                                          auth_method);
+    if (!ah) {
+      lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method "
+                 << auth_method << dendl;
+      return -EOPNOTSUPP;
+    }
+    // remember whether a challenge already existed so we can tell if
+    // verify_authorizer() created one during this call
+    bool was_challenge = (bool)auth_meta->authorizer_challenge;
+    bool isvalid = ah->verify_authorizer(
+      cct,
+      keyring,
+      payload,
+      auth_meta->get_connection_secret_length(),
+      reply,
+      &con->peer_name,
+      &con->peer_global_id,
+      &con->peer_caps_info,
+      &auth_meta->session_key,
+      &auth_meta->connection_secret,
+      &auth_meta->authorizer_challenge);
+    if (isvalid) {
+      ms_handle_authentication(con);
+      return 1;
+    }
+    if (!more && !was_challenge && auth_meta->authorizer_challenge) {
+      // a fresh challenge was issued on the first round; ask for more
+      return 0;
+    }
+    dout(10) << __func__ << " bad authorizer on " << con << dendl;
+    return -EACCES;
+  } else if (auth_meta->auth_mode < AUTH_MODE_MON ||
+             auth_meta->auth_mode > AUTH_MODE_MON_MAX) {
+    derr << __func__ << " unrecognized auth mode " << auth_meta->auth_mode
+         << dendl;
+    return -EACCES;
+  }
+
+  // wait until we've formed an initial quorum on mkfs so that we have
+  // the initial keys (e.g., client.admin).
+  if (authmon()->get_last_committed() == 0) {
+    dout(10) << __func__ << " haven't formed initial quorum, EBUSY" << dendl;
+    return -EBUSY;
+  }
+
+  RefCountedPtr priv;
+  MonSession *s;
+  int32_t r = 0;
+  auto p = payload.begin();
+  if (!more) {
+    if (con->get_priv()) {
+      // an initial request must not find a session already attached
+      return -EACCES; // wtf
+    }
+
+    // handler?
+    unique_ptr<AuthServiceHandler> auth_handler{get_auth_service_handler(
+      auth_method, g_ceph_context, &key_server)};
+    if (!auth_handler) {
+      dout(1) << __func__ << " auth_method " << auth_method << " not supported"
+              << dendl;
+      return -EOPNOTSUPP;
+    }
+
+    uint8_t mode;
+    EntityName entity_name;
+
+    try {
+      decode(mode, p);
+      if (mode < AUTH_MODE_MON ||
+          mode > AUTH_MODE_MON_MAX) {
+        dout(1) << __func__ << " invalid mode " << (int)mode << dendl;
+        return -EACCES;
+      }
+      assert(mode >= AUTH_MODE_MON && mode <= AUTH_MODE_MON_MAX);
+      decode(entity_name, p);
+      decode(con->peer_global_id, p);
+    } catch (ceph::buffer::error& e) {
+      dout(1) << __func__ << " failed to decode, " << e.what() << dendl;
+      return -EACCES;
+    }
+
+    // supported method?  daemons are checked against the cluster
+    // requirements, everything else against the service requirements.
+    if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+      if (!auth_cluster_required.is_supported_auth(auth_method)) {
+        dout(10) << __func__ << " entity " << entity_name << " method "
+                 << auth_method << " not among supported "
+                 << auth_cluster_required.get_supported_set() << dendl;
+        return -EOPNOTSUPP;
+      }
+    } else {
+      if (!auth_service_required.is_supported_auth(auth_method)) {
+        // report the set that was actually consulted
+        dout(10) << __func__ << " entity " << entity_name << " method "
+                 << auth_method << " not among supported "
+                 << auth_service_required.get_supported_set() << dendl;
+        return -EOPNOTSUPP;
+      }
+    }
+
+    // for msgr1 we would do some weirdness here to ensure signatures
+    // are supported by the client if we require it.  for msgr2 that
+    // is not necessary.
+
+    bool is_new_global_id = false;
+    if (!con->peer_global_id) {
+      con->peer_global_id = authmon()->_assign_global_id();
+      if (!con->peer_global_id) {
+        dout(1) << __func__ << " failed to assign global_id" << dendl;
+        return -EBUSY;
+      }
+      is_new_global_id = true;
+    }
+
+    // set up partial session
+    s = new MonSession(con);
+    s->auth_handler = auth_handler.release();
+    con->set_priv(RefCountedPtr{s, false});
+
+    r = s->auth_handler->start_session(
+      entity_name,
+      con->peer_global_id,
+      is_new_global_id,
+      reply,
+      &con->peer_caps_info);
+  } else {
+    priv = con->get_priv();
+    if (!priv) {
+      // this can happen if the async ms_handle_reset event races with
+      // the unlocked call into handle_auth_request
+      return -EACCES;
+    }
+    s = static_cast<MonSession*>(priv.get());
+    r = s->auth_handler->handle_request(
+      p,
+      auth_meta->get_connection_secret_length(),
+      reply,
+      &con->peer_caps_info,
+      &auth_meta->session_key,
+      &auth_meta->connection_secret);
+  }
+  if (r > 0 &&
+      !s->authenticated) {
+    ms_handle_authentication(con);
+  }
+
+  dout(30) << " r " << r << " reply:\n";
+  reply->hexdump(*_dout);
+  *_dout << dendl;
+  return r;
+}
+
+/**
+ * A connection has been accepted: make sure its MonSession (if any) is
+ * registered in session_map.
+ *
+ * Connections without a session are ignored, as are sessions already on
+ * the list.  If the monitor is shutting down the connection is marked
+ * down instead of being registered.
+ */
+void Monitor::ms_handle_accept(Connection *con)
+{
+  auto ref = con->get_priv();
+  auto *session = static_cast<MonSession*>(ref.get());
+  if (!session) {
+    // legacy protocol v1?
+    dout(10) << __func__ << " con " << con << " no session" << dendl;
+    return;
+  }
+
+  if (session->item.is_on_list()) {
+    dout(10) << __func__ << " con " << con << " session " << session
+             << " already on list" << dendl;
+    return;
+  }
+
+  std::lock_guard locker(session_map_lock);
+  if (state == STATE_SHUTDOWN) {
+    dout(10) << __func__ << " ignoring new con " << con << " (shutdown)" << dendl;
+    con->mark_down();
+    return;
+  }
+  dout(10) << __func__ << " con " << con << " session " << session
+           << " registering session for "
+           << con->get_peer_addrs() << dendl;
+  const entity_name_t peer(con->get_peer_type(), con->get_peer_id());
+  session->_ident(peer, con->get_peer_addrs());
+  session_map.add_session(session);
+}
+
+/**
+ * Apply the peer's capabilities to its MonSession once it has
+ * authenticated, creating the session first if the connection has none.
+ *
+ * @return 1 if the session is now marked authenticated (always 1 for
+ *         mon peers, which get no session); 0 if the peer has neither
+ *         allow_all nor any cap data; -EACCES if the cap data cannot be
+ *         decoded or parsed
+ */
+int Monitor::ms_handle_authentication(Connection *con)
+{
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+    // mon <-> mon connections need no Session, and setting one up
+    // creates an awkward ref cycle between Session and Connection.
+    return 1;
+  }
+
+  auto ref = con->get_priv();
+  auto *session = static_cast<MonSession*>(ref.get());
+  if (session == nullptr) {
+    // must be msgr2, otherwise dispatch would have set up the session.
+    const entity_name_t unknown_peer(con->get_peer_type(), -1);  // we don't know yet
+    session = session_map.new_session(unknown_peer, con->get_peer_addrs(), con);
+    assert(session);
+    dout(10) << __func__ << " adding session " << session << " to con " << con
+             << dendl;
+    con->set_priv(session);
+    logger->set(l_mon_num_sessions, session_map.get_size());
+    logger->inc(l_mon_session_add);
+  }
+  dout(10) << __func__ << " session " << session << " con " << con
+           << " addr " << session->con->get_peer_addr()
+           << " " << *session << dendl;
+
+  AuthCapsInfo &peer_caps = con->get_peer_caps_info();
+  if (peer_caps.allow_all) {
+    session->caps.set_allow_all();
+    session->authenticated = true;
+    return 1;
+  }
+  if (!peer_caps.caps.length()) {
+    // nothing to apply; the session stays unauthenticated
+    return 0;
+  }
+
+  // the cap blob holds an encoded string that MonCap knows how to parse
+  string cap_text;
+  bufferlist::const_iterator cursor = peer_caps.caps.cbegin();
+  try {
+    decode(cap_text, cursor);
+  } catch (const ceph::buffer::error &) {
+    derr << __func__ << " corrupt cap data for " << con->get_peer_entity_name()
+         << " in auth db" << dendl;
+    return -EACCES;
+  }
+
+  if (!session->caps.parse(cap_text, NULL)) {
+    derr << __func__ << " unparseable caps '" << cap_text << "' for "
+         << con->get_peer_entity_name() << dendl;
+    return -EACCES;
+  }
+  session->authenticated = true;
+  return 1;
+}
+
+/**
+ * Record the CRUSH location this monitor should have.
+ *
+ * A non-empty @p loc is parsed into crush_loc and need_set_crush_loc is
+ * raised; an empty @p loc is a no-op.
+ */
+void Monitor::set_mon_crush_location(const string& loc)
+{
+  if (!loc.empty()) {
+    // parse_loc_map() takes a list of location strings; we have just one
+    vector<string> locations(1, loc);
+    CrushWrapper::parse_loc_map(locations, &crush_loc);
+    need_set_crush_loc = true;
+  }
+}
+
+/**
+ * React to a new monmap.
+ *
+ * Clears need_set_crush_loc once the map carries our requested CRUSH
+ * location, forwards strategy and (optionally) removed ranks to the
+ * elector, updates stretch-mode state, and refreshes the elector's
+ * disallowed-leader set.
+ *
+ * @param can_change_external_state  forwarded to
+ *        set_elector_disallowed_leaders() as its allow_election flag
+ * @param remove_rank_elector  if true, tell the elector about every rank
+ *        listed in monmap->removed_ranks
+ */
+void Monitor::notify_new_monmap(bool can_change_external_state, bool remove_rank_elector)
+{
+  if (need_set_crush_loc) {
+    const auto self = monmap->mon_info.find(name);
+    const bool loc_applied = self != monmap->mon_info.end() &&
+      self->second.crush_loc == crush_loc;
+    if (loc_applied) {
+      need_set_crush_loc = false;
+    }
+  }
+  elector.notify_strategy_maybe_changed(monmap->strategy);
+  if (remove_rank_elector){
+    dout(10) << __func__ << " we have " << monmap->ranks.size()<< " ranks" << dendl;
+    dout(10) << __func__ << " we have " << monmap->removed_ranks.size() << " removed ranks" << dendl;
+    // walk the removed ranks back to front
+    for (auto rit = monmap->removed_ranks.rbegin();
+         rit != monmap->removed_ranks.rend(); ++rit) {
+      const int remove_rank = *rit;
+      dout(10) << __func__ << " removing rank " << remove_rank << dendl;
+      if (rank == remove_rank) {
+        dout(5) << "We are removing our own rank, probably we"
+          << " are removed from monmap before we shutdown ... dropping." << dendl;
+      } else if (const int new_rank = monmap->get_rank(messenger->get_myaddrs());
+                 new_rank == -1) {
+        dout(5) << "We no longer exists in the monmap! ... dropping." << dendl;
+      } else {
+        elector.notify_rank_removed(remove_rank, new_rank);
+      }
+    }
+  }
+
+  if (monmap->stretch_mode_enabled) {
+    try_engage_stretch_mode();
+  }
+
+  if (is_stretch_mode() && !monmap->stretch_marked_down_mons.empty()) {
+    set_degraded_stretch_mode();
+  }
+  set_elector_disallowed_leaders(can_change_external_state);
+}
+
+/**
+ * Recompute the set of ranks that may not become leader and hand it to
+ * the elector.
+ *
+ * The set contains the ranks of monmap->disallowed_leaders and, in
+ * stretch mode, also the stretch-marked-down monitors and the
+ * tiebreaker monitor.
+ *
+ * @param allow_election  if true and the set changed, call an election
+ */
+void Monitor::set_elector_disallowed_leaders(bool allow_election)
+{
+  set<int> dl;
+  // iterate by const reference: avoids copying each entry and avoids
+  // shadowing the Monitor::name member
+  for (const auto& mon_name : monmap->disallowed_leaders) {
+    dl.insert(monmap->get_rank(mon_name));
+  }
+  if (is_stretch_mode()) {
+    for (const auto& mon_name : monmap->stretch_marked_down_mons) {
+      dl.insert(monmap->get_rank(mon_name));
+    }
+    dl.insert(monmap->get_rank(monmap->tiebreaker_mon));
+  }
+
+  bool disallowed_changed = elector.set_disallowed_leaders(dl);
+  if (disallowed_changed && allow_election) {
+    elector.call_election();
+  }
+}
+
+/// Completion callback that retries Monitor::try_engage_stretch_mode().
+struct CMonEnableStretchMode : public Context {
+  Monitor *m;  ///< monitor to call back into
+  CMonEnableStretchMode(Monitor *mon)
+    : m{mon}
+  {}
+  /// The completion code @p r is ignored.
+  void finish(int r) override
+  {
+    m->try_engage_stretch_mode();
+  }
+};
+/**
+ * Engage stretch mode in memory once both the osdmap and the monmap say
+ * it is enabled.
+ *
+ * If the OSDMonitor is not readable yet, a CMonEnableStretchMode
+ * callback is queued and the attempt is retried when it fires.  Does
+ * nothing if stretch mode is already engaged.
+ */
+void Monitor::try_engage_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (stretch_mode_engaged) return;
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonEnableStretchMode(this));
+    // do not look at the osdmap until it is readable; the callback
+    // re-enters this function
+    return;
+  }
+  if (osdmon()->osdmap.stretch_mode_enabled &&
+      monmap->stretch_mode_enabled) {
+    dout(10) << "Engaging stretch mode!" << dendl;
+    stretch_mode_engaged = true;
+    int32_t stretch_divider_id = osdmon()->osdmap.stretch_mode_bucket;
+    stretch_bucket_divider = osdmon()->osdmap.
+      crush->get_type_name(stretch_divider_id);
+    disconnect_disallowed_stretch_sessions();
+  }
+}
+
+/**
+ * Leader-only, stretch-mode-only: recompute which CRUSH buckets (at the
+ * stretch_bucket_divider level) have live monitors and which have none.
+ *
+ * Rebuilds up_mon_buckets and dead_mon_buckets from the current quorum
+ * and calls maybe_go_degraded_stretch_mode() when the dead set changed
+ * without shrinking.
+ */
+void Monitor::do_stretch_mode_election_work()
+{
+  dout(20) << __func__ << dendl;
+  if (!is_stretch_mode() ||
+      !is_leader()) return;
+  dout(20) << "checking for degraded stretch mode" << dendl;
+  // keep the previous result for comparison and start from scratch
+  map<string, set<string>> old_dead_buckets;
+  old_dead_buckets.swap(dead_mon_buckets);
+  up_mon_buckets.clear();
+  // identify if we've lost a CRUSH bucket, request OSDMonitor check for death
+  map<string,set<string>> down_mon_buckets;
+  for (unsigned i = 0; i < monmap->size(); ++i) {
+    const auto &mi = monmap->mon_info[monmap->get_name(i)];
+    auto ci = mi.crush_loc.find(stretch_bucket_divider);
+    ceph_assert(ci != mi.crush_loc.end());
+    if (quorum.count(i)) {
+      up_mon_buckets.insert(ci->second);
+    } else {
+      down_mon_buckets[ci->second].insert(mi.name);
+    }
+  }
+  dout(20) << "prior dead_mon_buckets: " << old_dead_buckets
+           << "; down_mon_buckets: " << down_mon_buckets
+           << "; up_mon_buckets: " << up_mon_buckets << dendl;
+  // a bucket is dead only if none of its monitors is in quorum;
+  // bind by reference so the name/set pair is not copied per iteration
+  for (const auto& [bucket, down_mons] : down_mon_buckets) {
+    if (!up_mon_buckets.count(bucket)) {
+      dead_mon_buckets[bucket] = down_mons;
+    }
+  }
+  dout(20) << "new dead_mon_buckets " << dead_mon_buckets << dendl;
+
+  if (dead_mon_buckets != old_dead_buckets &&
+      dead_mon_buckets.size() >= old_dead_buckets.size()) {
+    maybe_go_degraded_stretch_mode();
+  }
+}
+
+/// Completion callback that retries Monitor::maybe_go_degraded_stretch_mode().
+struct CMonGoDegraded : public Context {
+  Monitor *m;  ///< monitor to call back into
+  CMonGoDegraded(Monitor *mon)
+    : m{mon}
+  {}
+  /// The completion code @p r is ignored.
+  void finish(int r) override
+  {
+    m->maybe_go_degraded_stretch_mode();
+  }
+};
+
+/// Completion callback that retries Monitor::go_recovery_stretch_mode().
+struct CMonGoRecovery : public Context {
+  Monitor *m;  ///< monitor to call back into
+  CMonGoRecovery(Monitor *mon)
+    : m{mon}
+  {}
+  /// The completion code @p r is ignored.
+  void finish(int r) override
+  {
+    m->go_recovery_stretch_mode();
+  }
+};
+/**
+ * Leader-only: ask the OSDMonitor to move from degraded to recovering
+ * stretch mode.
+ *
+ * No-op unless we are the leader, currently degraded, and not already
+ * recovering.  If the OSDMonitor is not yet readable or writeable, a
+ * CMonGoRecovery callback is queued and the attempt is retried when it
+ * fires.
+ */
+void Monitor::go_recovery_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (!is_leader()) return;
+  if (!is_degraded_stretch_mode()) return;
+  if (is_recovering_stretch_mode()) return;
+
+  if (dead_mon_buckets.size()) {
+    ceph_assert( 0 == "how did we try and do stretch recovery while we have dead monitor buckets?");
+    // we can't recover if we are missing monitors in a zone!
+    return;
+  }
+  
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonGoRecovery(this));
+    return;
+  }
+
+  if (!osdmon()->is_writeable()) {
+    osdmon()->wait_for_writeable_ctx(new CMonGoRecovery(this));
+    // do not trigger the transition until the OSDMonitor is writeable;
+    // the callback re-enters this function
+    return;
+  }
+  osdmon()->trigger_recovery_stretch_mode();
+}
+
+/// Record "degraded and recovering" in memory, then tell the OSDMonitor.
+void Monitor::set_recovery_stretch_mode()
+{
+  // both flags are raised while recovering
+  degraded_stretch_mode = recovering_stretch_mode = true;
+  osdmon()->set_recovery_stretch_mode();
+}
+
+/**
+ * Leader-only: decide whether to enter degraded stretch mode.
+ *
+ * No-op if already degraded, not leader, or there are no dead monitor
+ * buckets.  Otherwise the tiebreaker's bucket is filtered out and the
+ * OSDMonitor is asked whether the remaining dead buckets are also dead
+ * from the OSDs' point of view; if so the degraded transition is
+ * triggered.  Whenever a required PaxosService is not readable or
+ * writeable yet, a CMonGoDegraded callback is queued and the whole
+ * check is retried when it fires.
+ */
+void Monitor::maybe_go_degraded_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (is_degraded_stretch_mode()) return;
+  if (!is_leader()) return;
+  if (dead_mon_buckets.empty()) return;
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonGoDegraded(this));
+    return;
+  }
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  // filter out the tiebreaker zone and check if remaining sites are down by OSDs too
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  // same invariant as do_stretch_mode_election_work(): every monitor
+  // has a location at the divider level
+  ceph_assert(ci != mi.crush_loc.end());
+  map<string, set<string>> filtered_dead_buckets = dead_mon_buckets;
+  filtered_dead_buckets.erase(ci->second);
+
+  set<int> matched_down_buckets;
+  set<string> matched_down_mons;
+  bool dead = osdmon()->check_for_dead_crush_zones(filtered_dead_buckets,
+                                                   &matched_down_buckets,
+                                                   &matched_down_mons);
+  if (dead) {
+    // trigger_degraded_stretch_mode() asserts that both services are
+    // writeable, so wait (one callback at a time) instead of falling
+    // through
+    if (!osdmon()->is_writeable()) {
+      osdmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    if (!monmon()->is_writeable()) {
+      monmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    trigger_degraded_stretch_mode(matched_down_mons, matched_down_buckets);
+  }
+}
+
+/**
+ * Start the transition to degraded stretch mode.
+ *
+ * Requires both the OSDMonitor and the MonmapMonitor to be writeable
+ * (asserted).  The surviving zone is derived from up_mon_buckets minus
+ * the tiebreaker's bucket; exactly one zone must remain.
+ *
+ * @param dead_mons     monitors to hand to the MonmapMonitor
+ * @param dead_buckets  bucket ids to hand to the OSDMonitor
+ */
+void Monitor::trigger_degraded_stretch_mode(const set<string>& dead_mons,
+                                            const set<int>& dead_buckets)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(osdmon()->is_writeable());
+  ceph_assert(monmon()->is_writeable());
+
+  // figure out which OSD zone(s) remains alive by removing
+  // tiebreaker mon from up_mon_buckets
+  set<string> live_zones = up_mon_buckets;
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  // never dereference end(): the tiebreaker must have a location at
+  // the divider level
+  ceph_assert(ci != mi.crush_loc.end());
+  live_zones.erase(ci->second);
+  ceph_assert(live_zones.size() == 1); // only support 2 zones right now
+  
+  osdmon()->trigger_degraded_stretch_mode(dead_buckets, live_zones);
+  monmon()->trigger_degraded_stretch_mode(dead_mons);
+  set_degraded_stretch_mode();
+}
+
+/// Record "degraded, not recovering" in memory, then tell the OSDMonitor.
+void Monitor::set_degraded_stretch_mode()
+{
+  recovering_stretch_mode = false;
+  degraded_stretch_mode = true;
+  osdmon()->set_degraded_stretch_mode();
+}
+
+/// Completion callback that retries Monitor::trigger_healthy_stretch_mode().
+struct CMonGoHealthy : public Context {
+  Monitor *m;  ///< monitor to call back into
+  CMonGoHealthy(Monitor *mon)
+    : m{mon}
+  {}
+  /// The completion code @p r is ignored.
+  void finish(int r) override
+  {
+    m->trigger_healthy_stretch_mode();
+  }
+};
+
+
+/**
+ * Leader-only: start the transition back to healthy stretch mode.
+ *
+ * No-op unless we are degraded and the leader.  If the OSDMonitor or
+ * the MonmapMonitor is not writeable yet, a CMonGoHealthy callback is
+ * queued and the attempt is retried when it fires.  The osdmap must
+ * already be in recovering stretch mode (asserted).
+ */
+void Monitor::trigger_healthy_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (!is_degraded_stretch_mode()) return;
+  if (!is_leader()) return;
+  // wait for one service at a time; returning here avoids triggering
+  // the transition on a non-writeable service and avoids queuing two
+  // callbacks for the same attempt
+  if (!osdmon()->is_writeable()) {
+    osdmon()->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+  if (!monmon()->is_writeable()) {
+    monmon()->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+
+  ceph_assert(osdmon()->osdmap.recovering_stretch_mode);
+  osdmon()->trigger_healthy_stretch_mode();
+  monmon()->trigger_healthy_stretch_mode();
+}
+
+/// Clear the degraded and recovering flags, then tell the OSDMonitor.
+void Monitor::set_healthy_stretch_mode()
+{
+  degraded_stretch_mode = recovering_stretch_mode = false;
+  osdmon()->set_healthy_stretch_mode();
+}
+
+/**
+ * Check whether session @p s may stay connected to this monitor while
+ * stretch mode is engaged.
+ *
+ * Sessions are always allowed outside stretch mode, when proxied, when
+ * already validated, or when they have no connection.  An OSD session
+ * is allowed only if the OSD's bucket at the stretch_bucket_divider
+ * level matches this monitor's bucket; otherwise the connection is
+ * marked down, the session removed, and @p op (if set) is zapped.
+ *
+ * @return true if the session is allowed (and is now marked validated),
+ *         false if it was discarded
+ */
+bool Monitor::session_stretch_allowed(MonSession *s, MonOpRequestRef& op)
+{
+  if (!is_stretch_mode()) return true;
+  if (s->proxy_con) return true;
+  if (s->validated_stretch_connection) return true;
+  if (!s->con) return true;
+  if (s->con->peer_is_osd()) {
+    // separators added so the function name, text and pointer no longer
+    // run together in the log line
+    dout(20) << __func__ << " checking OSD session " << s << dendl;
+    // okay, check the crush location
+    int barrier_id;
+    int retval = osdmon()->osdmap.crush->get_validated_type_id(stretch_bucket_divider,
+                                                               &barrier_id);
+    ceph_assert(retval >= 0);
+    int osd_bucket_id = osdmon()->osdmap.crush->get_parent_of_type(s->con->peer_id,
+                                                                   barrier_id);
+    // look up our own bucket at the divider level
+    const auto &mi = monmap->mon_info.find(name);
+    ceph_assert(mi != monmap->mon_info.end());
+    auto ci = mi->second.crush_loc.find(stretch_bucket_divider);
+    ceph_assert(ci != mi->second.crush_loc.end());
+    int mon_bucket_id = osdmon()->osdmap.crush->get_item_id(ci->second);
+    
+    if (osd_bucket_id != mon_bucket_id) {
+      dout(5) << "discarding session " << *s
+              << " and sending OSD to matched zone" << dendl;
+      s->con->mark_down();
+      std::lock_guard l(session_map_lock);
+      remove_session(s);
+      if (op) {
+        op->mark_zap();
+      }
+      return false;
+    }
+  }
+
+  s->validated_stretch_connection = true;
+  return true;
+}
+
+/**
+ * Run session_stretch_allowed() over every session in session_map,
+ * which drops the sessions that are not allowed.
+ */
+void Monitor::disconnect_disallowed_stretch_sessions()
+{
+  dout(20) << __func__ << dendl;
+  MonOpRequestRef no_op;
+  for (auto next = session_map.sessions.begin();
+       next != session_map.sessions.end(); ) {
+    // advance first: the check may remove the current session
+    auto current = next;
+    ++next;
+    session_stretch_allowed(*current, no_op);
+  }
+}
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
new file mode 100644
index 000000000..1093649bb
--- /dev/null
+++ b/src/mon/Monitor.h
@@ -0,0 +1,1148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * This is the top level monitor. It runs on each machine in the Monitor
+ * Cluster. The election of a leader for the paxos algorithm only happens
+ * once per machine via the elector. There is a separate paxos instance (state)
+ * kept for each of the system components: Object Store Device (OSD) Monitor,
+ * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
+ */
+
+#ifndef CEPH_MONITOR_H
+#define CEPH_MONITOR_H
+
+#include <errno.h>
+#include <cmath>
+#include <string>
+#include <array>
+
+#include "include/types.h"
+#include "include/health.h"
+#include "msg/Messenger.h"
+
+#include "common/Timer.h"
+
+#include "health_check.h"
+#include "MonMap.h"
+#include "Elector.h"
+#include "Paxos.h"
+#include "Session.h"
+#include "MonCommand.h"
+
+
+#include "common/config_obs.h"
+#include "common/LogClient.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "auth/AuthMethodList.h"
+#include "auth/KeyRing.h"
+#include "include/common_fwd.h"
+#include "messages/MMonCommand.h"
+#include "mon/MonitorDBStore.h"
+#include "mgr/MgrClient.h"
+
+#include "mon/MonOpRequest.h"
+#include "common/WorkQueue.h"
+
+using namespace TOPNSPC::common;
+
+#define CEPH_MON_PROTOCOL     13 /* cluster internal */
+
+
+// Cluster-wide counter identifiers, numbered consecutively from 555000.
+// l_cluster_first and l_cluster_last are range sentinels, not counters.
+// NOTE(review): presumably these index the cluster_logger PerfCounters
+// instance -- confirm against register_cluster_logger().
+enum {
+  l_cluster_first = 555000,
+  l_cluster_num_mon,
+  l_cluster_num_mon_quorum,
+  l_cluster_num_osd,
+  l_cluster_num_osd_up,
+  l_cluster_num_osd_in,
+  l_cluster_osd_epoch,
+  l_cluster_osd_bytes,
+  l_cluster_osd_bytes_used,
+  l_cluster_osd_bytes_avail,
+  l_cluster_num_pool,
+  l_cluster_num_pg,
+  l_cluster_num_pg_active_clean,
+  l_cluster_num_pg_active,
+  l_cluster_num_pg_peering,
+  l_cluster_num_object,
+  l_cluster_num_object_degraded,
+  l_cluster_num_object_misplaced,
+  l_cluster_num_object_unfound,
+  l_cluster_num_bytes,
+  l_cluster_last,
+};
+
+// Per-monitor counter identifiers, numbered consecutively from 456000.
+// They are used as keys into Monitor::logger (e.g. logger->set() on
+// l_mon_num_sessions and logger->inc() on l_mon_session_add).
+// l_mon_first and l_mon_last are range sentinels, not counters.
+enum {
+  l_mon_first = 456000,
+  l_mon_num_sessions,
+  l_mon_session_add,
+  l_mon_session_rm,
+  l_mon_session_trim,
+  l_mon_num_elections,
+  l_mon_election_call,
+  l_mon_election_win,
+  l_mon_election_lose,
+  l_mon_last,
+};
+
+class PaxosService;
+
+class AdminSocketHook;
+
+#define COMPAT_SET_LOC "feature_set"
+
+class Monitor : public Dispatcher,
+		public AuthClient,
+		public AuthServer,
+                public md_config_obs_t {
+public:
+  int orig_argc = 0;
+  const char **orig_argv = nullptr;
+
+  // me
+  std::string name;
+  int rank;
+  Messenger *messenger;
+  ConnectionRef con_self;
+  ceph::mutex lock = ceph::make_mutex("Monitor::lock");
+  SafeTimer timer;
+  Finisher finisher;
+  ThreadPool cpu_tp;  ///< threadpool for CPU intensive work
+
+  ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
+
+  /// true if we have ever joined a quorum.  if false, we are either a
+  /// new cluster, a newly joining monitor, or a just-upgraded
+  /// monitor.
+  bool has_ever_joined;
+
+  PerfCounters *logger, *cluster_logger;
+  bool cluster_logger_registered;
+
+  void register_cluster_logger();
+  void unregister_cluster_logger();
+
+  MonMap *monmap;
+  uuid_d fingerprint;
+
+  std::set<entity_addrvec_t> extra_probe_peers;
+
+  LogClient log_client;
+  LogChannelRef clog;
+  LogChannelRef audit_clog;
+  KeyRing keyring;
+  KeyServer key_server;
+
+  AuthMethodList auth_cluster_required;
+  AuthMethodList auth_service_required;
+
+  CompatSet features;
+
+  std::vector<MonCommand> leader_mon_commands; // quorum leader's commands
+  std::vector<MonCommand> local_mon_commands;  // commands i support
+  ceph::buffer::list local_mon_commands_bl;       // encoded version of above
+
+  std::vector<MonCommand> prenautilus_local_mon_commands;
+  ceph::buffer::list prenautilus_local_mon_commands_bl;
+
+  Messenger *mgr_messenger;
+  MgrClient mgr_client;
+  uint64_t mgr_proxy_bytes = 0;  // in-flight proxied mgr command message bytes
+  std::string gss_ktfile_client{};
+
+private:
+  void new_tick();
+
+  // -- local storage --
+public:
+  MonitorDBStore *store;
+  static const std::string MONITOR_NAME;
+  static const std::string MONITOR_STORE_PREFIX;
+
+  // -- monitor state --
+private:
+  enum {
+    STATE_INIT = 1,
+    STATE_PROBING,
+    STATE_SYNCHRONIZING,
+    STATE_ELECTING,
+    STATE_LEADER,
+    STATE_PEON,
+    STATE_SHUTDOWN
+  };
+  int state = STATE_INIT;
+
+public:
+  static const char *get_state_name(int s) {
+    switch (s) {
+    case STATE_PROBING: return "probing";
+    case STATE_SYNCHRONIZING: return "synchronizing";
+    case STATE_ELECTING: return "electing";
+    case STATE_LEADER: return "leader";
+    case STATE_PEON: return "peon";
+    case STATE_SHUTDOWN: return "shutdown";
+    default: return "???";
+    }
+  }
+  const char *get_state_name() const {
+    return get_state_name(state);
+  }
+
+  bool is_init() const { return state == STATE_INIT; }
+  bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+  bool is_probing() const { return state == STATE_PROBING; }
+  bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
+  bool is_electing() const { return state == STATE_ELECTING; }
+  bool is_leader() const { return state == STATE_LEADER; }
+  bool is_peon() const { return state == STATE_PEON; }
+
+  const utime_t &get_leader_since() const;
+
+  void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
+
+  std::vector<DaemonHealthMetric> get_health_metrics();
+
+  int quorum_age() const {
+    auto age = std::chrono::duration_cast<std::chrono::seconds>(
+      ceph::mono_clock::now() - quorum_since);
+    return age.count();
+  }
+
+  bool is_mon_down() const {
+    int max = monmap->size();
+    int actual = get_quorum().size();
+    auto now = ceph::real_clock::now();
+    return actual < max && now > monmap->created.to_real_time();
+  }
+
+  // -- elector --
+private:
+  std::unique_ptr<Paxos> paxos;
+  Elector elector;
+  friend class Elector;
+
+  /// features we require of peers (based on on-disk compatset)
+  uint64_t required_features;
+  
+  int leader;            // current leader (to best of knowledge)
+  std::set<int> quorum;       // current active set of monitors (if !starting)
+  ceph::mono_clock::time_point quorum_since;  // when quorum formed
+  utime_t leader_since;  // when this monitor became the leader, if it is the leader
+  utime_t exited_quorum; // time detected as not in quorum; 0 if in
+
+  // map of counts of connected clients, by type and features, for
+  // each quorum mon
+  std::map<int,FeatureMap> quorum_feature_map;
+
+  /**
+   * Intersection of quorum member's connection feature bits.
+   */
+  uint64_t quorum_con_features;
+  /**
+   * Intersection of quorum members mon-specific feature bits
+   */
+  mon_feature_t quorum_mon_features;
+
+  ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
+
+  std::set<std::string> outside_quorum;
+
+  bool stretch_mode_engaged{false};
+  bool degraded_stretch_mode{false};
+  bool recovering_stretch_mode{false};
+  string stretch_bucket_divider;
+  map<string, set<string>> dead_mon_buckets; // bucket->mon ranks, locations with no live mons
+  set<string> up_mon_buckets; // locations with a live mon
+  void do_stretch_mode_election_work();
+
+  bool session_stretch_allowed(MonSession *s, MonOpRequestRef& op);
+  void disconnect_disallowed_stretch_sessions();
+  void set_elector_disallowed_leaders(bool allow_election);
+
+  map <string,string> crush_loc;
+  bool need_set_crush_loc{false};
+public:
+  bool is_stretch_mode() { return stretch_mode_engaged; }
+  bool is_degraded_stretch_mode() { return degraded_stretch_mode; }
+  bool is_recovering_stretch_mode() { return recovering_stretch_mode; }
+
+  /**
+   * This set of functions maintains the in-memory stretch state
+   * and sets up transitions of the map states by calling in to
+   * MonmapMonitor and OSDMonitor.
+   *
+   * The [maybe_]go_* functions are called on the leader to
+   * decide if transitions should happen; the trigger_* functions
+   * set up the map transitions; and the set_* functions actually
+   * change the memory state -- but these are only called
+   * via OSDMonitor::update_from_paxos, to guarantee consistent
+   * updates across the entire cluster.
+   */
+  void try_engage_stretch_mode();
+  void maybe_go_degraded_stretch_mode();
+  void trigger_degraded_stretch_mode(const set<string>& dead_mons,
+				     const set<int>& dead_buckets);
+  void set_degraded_stretch_mode();
+  void go_recovery_stretch_mode();
+  void set_recovery_stretch_mode();
+  void trigger_healthy_stretch_mode();
+  void set_healthy_stretch_mode();
+  void enable_stretch_mode();
+  void set_mon_crush_location(const string& loc);
+
+  
+private:
+
+  /**
+   * @defgroup Monitor_h_scrub
+   * @{
+   */
+  version_t scrub_version;            ///< paxos version we are scrubbing
+  std::map<int,ScrubResult> scrub_result;  ///< results so far
+
+  /**
+   * trigger a cross-mon scrub
+   *
+   * Verify all mons are storing identical content
+   */
+  int scrub_start();
+  int scrub();
+  void handle_scrub(MonOpRequestRef op);
+  bool _scrub(ScrubResult *r,
+              std::pair<std::string,std::string> *start,
+              int *num_keys);
+  void scrub_check_results();
+  void scrub_timeout();
+  void scrub_finish();
+  void scrub_reset();
+  void scrub_update_interval(ceph::timespan interval);
+
+  Context *scrub_event;       ///< periodic event to trigger scrub (leader)
+  Context *scrub_timeout_event;  ///< scrub round timeout (leader)
+  void scrub_event_start();
+  void scrub_event_cancel();
+  void scrub_reset_timeout();
+  void scrub_cancel_timeout();
+
+  struct ScrubState {
+    std::pair<std::string,std::string> last_key; ///< last scrubbed key
+    bool finished;
+
+    ScrubState() : finished(false) { }
+    virtual ~ScrubState() { }
+  };
+  std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
+
+  /**
+   * @defgroup Monitor_h_sync Synchronization
+   * @{
+   */
+  /**
+   * @} // provider state
+   */
+  /// State kept for one peer that is syncing from us.
+  struct SyncProvider {
+    entity_addrvec_t addrs;
+    uint64_t cookie = 0;           ///< unique cookie for this sync attempt
+    utime_t timeout;               ///< when we give up and expire this attempt
+    version_t last_committed = 0;  ///< last paxos version on peer
+    std::pair<std::string,std::string> last_key; ///< last key sent to (or on) peer
+    bool full = false;             ///< full scan?
+    MonitorDBStore::Synchronizer synchronizer;   ///< iterator
+
+    SyncProvider() {}
+
+    /// Move the expiry deadline to the current clock reading plus `grace`.
+    /// Note: `cct` is accepted but not used by this function.
+    void reset_timeout(CephContext *cct, int grace) {
+      utime_t deadline = ceph_clock_now();
+      deadline += grace;
+      timeout = deadline;
+    }
+  };
+
+  std::map<std::uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
+  uint64_t sync_provider_count;   ///< counter for issued cookies to keep them unique
+
+  /**
+   * Requester state: bookkeeping for the sync we are performing from a peer.
+   */
+  entity_addrvec_t sync_provider;  ///< who we are syncing from
+  uint64_t sync_cookie;          ///< 0 if we are starting, non-zero otherwise
+  bool sync_full;                ///< true if we are a full sync, false for recent catch-up
+  version_t sync_start_version;  ///< last_committed at sync start
+  Context *sync_timeout_event;   ///< timeout event
+
+  /**
+   * floor for sync source
+   *
+   * When we sync we forget about our old last_committed value which
+   * can be dangerous.  For example, if we have a cluster of:
+   *
+   *   mon.a: lc 100
+   *   mon.b: lc 80
+   *   mon.c: lc 100 (us)
+   *
+   * If something forces us to sync (say, corruption, or manual
+   * intervention, or bug), we forget last_committed, and might abort.
+   * If mon.a happens to be down when we come back, we will see:
+   *
+   *   mon.b: lc 80
+   *   mon.c: lc 0 (us)
+   *
+   * and sync from mon.b, at which point a+b will both have lc 80 and
+   * come online with a majority holding out of date commits.
+   *
+   * Avoid this by preserving our old last_committed value prior to
+   * sync and never going backwards.
+   */
+  version_t sync_last_committed_floor;
+
+  /**
+   * Obtain the synchronization target prefixes in set form.
+   *
+   * We consider a target prefix all those that are relevant when
+   * synchronizing two stores. That is, all those that hold paxos service's
+   * versions, as well as paxos versions, or any control keys such as the
+   * first or last committed version.
+   *
+   * Given the current design, this function should return the name of all and
+   * any available paxos service, plus the paxos name.
+   *
+   * @returns a set of strings referring to the prefixes being synchronized
+   */
+  std::set<std::string> get_sync_targets_names();
+
+  /**
+   * Reset the monitor's sync-related data structures for syncing *from* a peer
+   */
+  void sync_reset_requester();
+
+  /**
+   * Reset sync state related to allowing others to sync from us
+   */
+  void sync_reset_provider();
+
+  /**
+   * Called when a sync attempt times out (requester-side)
+   */
+  void sync_timeout();
+
+  /**
+   * Get the latest monmap for backup purposes during sync
+   */
+  void sync_obtain_latest_monmap(ceph::buffer::list &bl);
+
+  /**
+   * Start sync process
+   *
+   * Start pulling committed state from another monitor.
+   *
+   * @param addrs where to pull committed state from
+   * @param full whether to do a full sync or just catch up on recent paxos
+   */
+  void sync_start(entity_addrvec_t &addrs, bool full);
+
+public:
+  /**
+   * force a sync on next mon restart
+   */
+  void sync_force(ceph::Formatter *f);
+
+private:
+  /**
+   * store critical state for safekeeping during sync
+   *
+   * We store a few things on the side that we don't want to get clobbered by sync.  This
+   * includes the latest monmap and a lower bound on last_committed.
+   */
+  void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
+
+  /**
+   * reset the sync timeout
+   *
+   * This is used on the client to restart if things aren't progressing
+   */
+  void sync_reset_timeout();
+
+  /**
+   * trim stale sync provider state
+   *
+   * If someone is syncing from us and hasn't talked to us recently, expire their state.
+   */
+  void sync_trim_providers();
+
+  /**
+   * Complete a sync
+   *
+   * Finish up a sync after we've gotten all of the chunks.
+   *
+   * @param last_committed final last_committed value from provider
+   */
+  void sync_finish(version_t last_committed);
+
+  /**
+   * request the next chunk from the provider
+   */
+  void sync_get_next_chunk();
+
+  /**
+   * handle sync message
+   *
+   * @param op Sync message with operation type MMonSync::OP_START_CHUNKS
+   */
+  void handle_sync(MonOpRequestRef op);
+
+  void _sync_reply_no_cookie(MonOpRequestRef op);
+
+  void handle_sync_get_cookie(MonOpRequestRef op);
+  void handle_sync_get_chunk(MonOpRequestRef op);
+  void handle_sync_finish(MonOpRequestRef op);
+
+  void handle_sync_cookie(MonOpRequestRef op);
+  void handle_sync_forward(MonOpRequestRef op);
+  void handle_sync_chunk(MonOpRequestRef op);
+  void handle_sync_no_cookie(MonOpRequestRef op);
+
+  /**
+   * @} // Synchronization
+   */
+
+  std::list<Context*> waitfor_quorum;
+  std::list<Context*> maybe_wait_for_quorum;
+
+  /**
+   * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
+   * @{
+   *
+   * We use time checks to keep track of any clock drifting going on in the
+   * cluster. This is accomplished by periodically pinging each monitor in the
+   * quorum and registering its response time on a map, assessing how much its
+   * clock has drifted. We also take this opportunity to assess the latency
+   * on response.
+   *
+   * This mechanism works as follows:
+   *
+   *  - Leader sends out a 'PING' message to each other monitor in the quorum.
+   *    The message is timestamped with the leader's current time. The leader's
+   *    current time is recorded in a map, associated with each peon's
+   *    instance.
+   *  - The peon replies to the leader with a timestamped 'PONG' message.
+   *  - The leader calculates a delta between the peon's timestamp and its
+   *    current time and stashes it.
+   *  - The leader also calculates the time it took to receive the 'PONG'
+   *    since the 'PING' was sent, and stashes an approximate latency estimate.
+   *  - Once all the quorum members have pong'ed, the leader will share the
+   *    clock skew and latency maps with all the monitors in the quorum.
+   */
+  std::map<int, utime_t> timecheck_waiting;
+  std::map<int, double> timecheck_skews;
+  std::map<int, double> timecheck_latencies;
+  // odd value means we are mid-round; even value means the round has
+  // finished.
+  version_t timecheck_round;
+  unsigned int timecheck_acks;
+  utime_t timecheck_round_start;
+  friend class HealthMonitor;
+  /* When we hit a skew we will start a new round based off of
+   * 'mon_timecheck_skew_interval'. Each new round will be backed off
+   * until we hit 'mon_timecheck_interval' -- which is the typical
+   * interval when not in the presence of a skew.
+   *
+   * This variable tracks the number of rounds with skews since last clean
+   * so that we can report to the user and properly adjust the backoff.
+   */
+  uint64_t timecheck_rounds_since_clean;
+  /**
+   * Time Check event.
+   */
+  Context *timecheck_event;
+
+  void timecheck_start();
+  void timecheck_finish();
+  void timecheck_start_round();
+  void timecheck_finish_round(bool success = true);
+  void timecheck_cancel_round();
+  void timecheck_cleanup();
+  void timecheck_reset_event();
+  void timecheck_check_skews();
+  void timecheck_report();
+  void timecheck();
+  health_status_t timecheck_status(std::ostringstream &ss,
+                                   const double skew_bound,
+                                   const double latency);
+  void handle_timecheck_leader(MonOpRequestRef op);
+  void handle_timecheck_peon(MonOpRequestRef op);
+  void handle_timecheck(MonOpRequestRef op);
+
+  /**
+   * Returns 'true' if this is considered to be a skew; 'false' otherwise.
+   */
+  bool timecheck_has_skew(const double skew_bound, double *abs) const {
+    // Work on the magnitude only; the sign of the skew is irrelevant here.
+    const double magnitude = std::fabs(skew_bound);
+    // `abs` is an optional out-parameter: filled in only when non-null.
+    if (abs != nullptr) {
+      *abs = magnitude;
+    }
+    // Strictly greater than the configured allowance counts as a skew.
+    return magnitude > g_conf()->mon_clock_drift_allowed;
+  }
+
+  /**
+   * @}
+   */
+  /**
+   * Handle ping messages from others.
+   */
+  void handle_ping(MonOpRequestRef op);
+
+  Context *probe_timeout_event = nullptr;  // for probing
+
+  void reset_probe_timeout();
+  void cancel_probe_timeout();
+  void probe_timeout(int r);
+
+  void _apply_compatset_features(CompatSet &new_features);
+
+public:
+  epoch_t get_epoch();
+  /// Return the stored `leader` value.
+  int get_leader() const {
+    return leader;
+  }
+  /// Name of the leader as known to the monmap, or an empty string when the
+  /// quorum set is empty.
+  std::string get_leader_name() {
+    if (quorum.empty()) {
+      return std::string();
+    }
+    return monmap->get_name(leader);
+  }
+  /// Read-only view of the current quorum set.
+  const std::set<int>& get_quorum() const {
+    return quorum;
+  }
+  /// Translate every entry of `quorum` into its monmap name, preserving the
+  /// set's iteration order.
+  std::list<std::string> get_quorum_names() {
+    std::list<std::string> names;
+    for (const auto& member : quorum) {
+      names.push_back(monmap->get_name(member));
+    }
+    return names;
+  }
+  /// Return the stored `quorum_con_features` value.
+  uint64_t get_quorum_con_features() const { return quorum_con_features; }
+  /// Return (by value) the stored `quorum_mon_features`.
+  mon_feature_t get_quorum_mon_features() const { return quorum_mon_features; }
+  /// Return the stored `required_features` value.
+  uint64_t get_required_features() const { return required_features; }
+  /// Required monitor features, as reported by the monmap (not cached here).
+  mon_feature_t get_required_mon_features() const { return monmap->get_required_features(); }
+  void apply_quorum_to_compatset_features();
+  void apply_monmap_to_compatset_features();
+  void calc_quorum_requirements();
+
+  void get_combined_feature_map(FeatureMap *fm);
+
+private:
+  void _reset();   ///< called from bootstrap, start_, or join_election
+  void wait_for_paxos_write();
+  void _finish_svc_election(); ///< called by {win,lose}_election
+  void respawn();
+public:
+  void bootstrap();
+  void join_election();
+  void start_election();
+  void win_standalone_election();
+  // end election (called by Elector)
+  void win_election(epoch_t epoch, const std::set<int>& q,
+		    uint64_t features,
+                    const mon_feature_t& mon_features,
+		    ceph_release_t min_mon_release,
+		    const std::map<int,Metadata>& metadata);
+  void lose_election(epoch_t epoch, std::set<int>& q, int l,
+		     uint64_t features,
+                     const mon_feature_t& mon_features,
+		     ceph_release_t min_mon_release);
+  // end election (called by Elector)
+  void finish_election();
+
+  void update_logger();
+
+  /**
+   * Array holding the PaxosServices served by this Monitor, one slot per
+   * PAXOS_* index (PAXOS_NUM slots in total).
+   */
+  std::array<std::unique_ptr<PaxosService>, PAXOS_NUM> paxos_service;
+
+  /// The PAXOS_MDSMAP slot of paxos_service, viewed as an MDSMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether MDSMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class MDSMonitor *mdsmon() {
+    PaxosService *svc = paxos_service[PAXOS_MDSMAP].get();
+    return (class MDSMonitor *)svc;
+  }
+
+  /// The PAXOS_MONMAP slot of paxos_service, viewed as a MonmapMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether MonmapMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class MonmapMonitor *monmon() {
+    PaxosService *svc = paxos_service[PAXOS_MONMAP].get();
+    return (class MonmapMonitor *)svc;
+  }
+
+  /// The PAXOS_OSDMAP slot of paxos_service, viewed as an OSDMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether OSDMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class OSDMonitor *osdmon() {
+    PaxosService *svc = paxos_service[PAXOS_OSDMAP].get();
+    return (class OSDMonitor *)svc;
+  }
+
+  /// The PAXOS_AUTH slot of paxos_service, viewed as an AuthMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether AuthMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class AuthMonitor *authmon() {
+    PaxosService *svc = paxos_service[PAXOS_AUTH].get();
+    return (class AuthMonitor *)svc;
+  }
+
+  /// The PAXOS_LOG slot of paxos_service, viewed as a LogMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether LogMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class LogMonitor *logmon() {
+    PaxosService *svc = paxos_service[PAXOS_LOG].get();
+    return (class LogMonitor*) svc;
+  }
+
+  /// The PAXOS_MGR slot of paxos_service, viewed as a MgrMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether MgrMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class MgrMonitor *mgrmon() {
+    PaxosService *svc = paxos_service[PAXOS_MGR].get();
+    return (class MgrMonitor*) svc;
+  }
+
+  /// The PAXOS_MGRSTAT slot of paxos_service, viewed as a MgrStatMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether MgrStatMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class MgrStatMonitor *mgrstatmon() {
+    PaxosService *svc = paxos_service[PAXOS_MGRSTAT].get();
+    return (class MgrStatMonitor*) svc;
+  }
+
+  /// The PAXOS_HEALTH slot of paxos_service, viewed as a HealthMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether HealthMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class HealthMonitor *healthmon() {
+    PaxosService *svc = paxos_service[PAXOS_HEALTH].get();
+    return (class HealthMonitor*) svc;
+  }
+
+  /// The PAXOS_CONFIG slot of paxos_service, viewed as a ConfigMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether ConfigMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class ConfigMonitor *configmon() {
+    PaxosService *svc = paxos_service[PAXOS_CONFIG].get();
+    return (class ConfigMonitor*) svc;
+  }
+
+  /// The PAXOS_KV slot of paxos_service, viewed as a KVMonitor.
+  /// NOTE(review): C-style cast kept on purpose; whether KVMonitor is a
+  /// complete type at this point is not visible from this header chunk.
+  class KVMonitor *kvmon() {
+    PaxosService *svc = paxos_service[PAXOS_KV].get();
+    return (class KVMonitor*) svc;
+  }
+
+  friend class Paxos;
+  friend class OSDMonitor;
+  friend class MDSMonitor;
+  friend class MonmapMonitor;
+  friend class LogMonitor;
+  friend class KVMonitor;
+
+  // -- sessions --
+  MonSessionMap session_map;
+  ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
+  AdminSocketHook *admin_hook;
+
+  /// Invoke `func(session_map)` while holding session_map_lock for the
+  /// duration of the call.  The `Args` pack is declared but not used by the
+  /// body.
+  template<typename Func, typename...Args>
+  void with_session_map(Func&& func) {
+    std::lock_guard guard{session_map_lock};
+    std::forward<Func>(func)(session_map);
+  }
+  void send_latest_monmap(Connection *con);
+
+  // messages
+  void handle_get_version(MonOpRequestRef op);
+  void handle_subscribe(MonOpRequestRef op);
+  void handle_mon_get_map(MonOpRequestRef op);
+
+  static void _generate_command_map(cmdmap_t& cmdmap,
+                                    std::map<std::string,std::string> &param_str_map);
+  static const MonCommand *_get_moncommand(
+    const std::string &cmd_prefix,
+    const std::vector<MonCommand>& cmds);
+  bool _allowed_command(MonSession *s, const std::string& module,
+			const std::string& prefix,
+                        const cmdmap_t& cmdmap,
+                        const std::map<std::string,std::string>& param_str_map,
+                        const MonCommand *this_cmd);
+  void get_mon_status(ceph::Formatter *f);
+  void _quorum_status(ceph::Formatter *f, std::ostream& ss);
+  bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
+				std::ostream& ss);
+  void handle_tell_command(MonOpRequestRef op);
+  void handle_command(MonOpRequestRef op);
+  void handle_route(MonOpRequestRef op);
+
+  int get_mon_metadata(int mon, ceph::Formatter *f, std::ostream& err);
+  int print_nodes(ceph::Formatter *f, std::ostream& err);
+
+  // track metadata reported by win_election()
+  std::map<int, Metadata> mon_metadata;
+  std::map<int, Metadata> pending_metadata;
+
+  /**
+   * Cached health summary: an overall status value plus a summary string.
+   * The single instance is the `health_status_cache` member declared with
+   * the struct.
+   */
+  struct health_cache_t {
+    health_status_t overall;
+    std::string summary;
+
+    /// Clear the cached summary text; `overall` is left untouched.
+    void reset() {
+      // health_status_t doesn't really have a NONE value and we're not
+      // okay with setting something else (say, HEALTH_ERR).  so just
+      // leave it be.
+      summary.clear();
+    }
+  } health_status_cache;
+
+  Context *health_tick_event = nullptr;
+  Context *health_interval_event = nullptr;
+
+  void health_tick_start();
+  void health_tick_stop();
+  ceph::real_clock::time_point health_interval_calc_next_update();
+  void health_interval_start();
+  void health_interval_stop();
+  void health_events_cleanup();
+
+  void health_to_clog_update_conf(const std::set<std::string> &changed);
+
+  void do_health_to_clog_interval();
+  void do_health_to_clog(bool force = false);
+
+  void log_health(
+    const health_check_map_t& updated,
+    const health_check_map_t& previous,
+    MonitorDBStore::TransactionRef t);
+
+  void update_pending_metadata();
+
+protected:
+
+  /// Severity, last message and update time bundled together; instances are
+  /// stored by string key in health_check_log_times.
+  class HealthCheckLogStatus {
+  public:
+    health_status_t severity;
+    std::string last_message;
+    utime_t updated_at = 0;
+
+    /// All three fields are taken verbatim from the arguments.
+    HealthCheckLogStatus(health_status_t severity_,
+                         const std::string &last_message_,
+                         utime_t updated_at_)
+      : severity(severity_), last_message(last_message_), updated_at(updated_at_)
+    {}
+  };
+  std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
+  void get_cluster_status(std::stringstream &ss, ceph::Formatter *f,
+			  MonSession *session);
+
+  void reply_command(MonOpRequestRef op, int rc, const std::string &rs, version_t version);
+  void reply_command(MonOpRequestRef op, int rc, const std::string &rs, ceph::buffer::list& rdata, version_t version);
+
+  void reply_tell_command(MonOpRequestRef op, int rc, const std::string &rs);
+
+
+
+  void handle_probe(MonOpRequestRef op);
+  /**
+   * Handle a Probe Operation, replying with our name, quorum and known versions.
+   *
+   * We use the MMonProbe message class for anything and everything related with
+   * Monitor probing. One of the operations relates directly with the probing
+   * itself, in which we receive a probe request and to which we reply with
+   * our name, our quorum and the known versions for each Paxos service. Thus the
+   * redundant function name. This reply will obviously be sent to the one
+   * probing/requesting these infos.
+   *
+   * @todo Add @pre and @post
+   *
+   * @param op A Probe message, with an operation of type Probe.
+   */
+  void handle_probe_probe(MonOpRequestRef op);
+  void handle_probe_reply(MonOpRequestRef op);
+
+  // request routing
+  /// Bookkeeping for one request tracked in routed_requests.
+  struct RoutedRequest {
+    uint64_t tid = 0;
+    ceph::buffer::list request_bl;
+    MonSession *session = nullptr;
+    ConnectionRef con;
+    uint64_t con_features = 0;
+    MonOpRequestRef op;
+
+    RoutedRequest() {}
+
+    /// Calls put() on the session, if one was attached.
+    ~RoutedRequest() {
+      if (session != nullptr) {
+        session->put();
+      }
+    }
+  };
+  uint64_t routed_request_tid;
+  std::map<uint64_t, RoutedRequest*> routed_requests;
+
+  void forward_request_leader(MonOpRequestRef op);
+  void handle_forward(MonOpRequestRef op);
+  void send_reply(MonOpRequestRef op, Message *reply);
+  void no_reply(MonOpRequestRef op);
+  void resend_routed_requests();
+  void remove_session(MonSession *s);
+  void remove_all_sessions();
+  void waitlist_or_zap_client(MonOpRequestRef op);
+
+  void send_mon_message(Message *m, int rank);
+  /** can_change_external_state if we can do things like
+   *  call elections as a result of the new map.
+   */
+  void notify_new_monmap(bool can_change_external_state=false, bool remove_rank_elector=true);
+
+public:
+  /// Completion context for a monitor command: on success it writes an audit
+  /// log entry and sends the stored reply; on -EAGAIN it re-dispatches the op.
+  struct C_Command : public C_MonOp {
+    Monitor &mon;
+    int rc;                    ///< return code passed to reply_command()
+    std::string rs;            ///< status string passed to reply_command()
+    ceph::buffer::list rdata;  ///< reply payload (left empty by the first ctor)
+    version_t version;         ///< version passed to reply_command()
+
+    C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, version_t v)
+      : C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v)
+    {}
+    C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, ceph::buffer::list rd, version_t v)
+      : C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v)
+    {}
+
+    /**
+     * @param r  >= 0: log and reply; -ECANCELED: do nothing;
+     *           -EAGAIN: re-dispatch the op; anything else aborts.
+     */
+    void _finish(int r) override {
+      auto m = op->get_req<MMonCommand>();
+
+      // Deal with the failure codes first so the success path reads straight.
+      if (r < 0) {
+        if (r == -EAGAIN)
+          mon.dispatch_op(op);
+        else if (r != -ECANCELED)
+          ceph_abort_msg("bad C_Command return value");
+        return;
+      }
+
+      // Describe who issued the command, as far as we still know.
+      std::ostringstream ss;
+      if (!op->get_req()->get_connection()) {
+        ss << "connection dropped for command ";
+      } else if (MonSession *s = op->get_session()) {
+        ss << "from='" << s->name << " " << s->addrs << "' "
+           << "entity='" << s->entity_name << "' ";
+      } else {
+        // if client drops we may not have a session to draw information from.
+        ss << "session dropped for command ";
+      }
+
+      // The command text is left out of the audit line for the two
+      // "set" prefixes below.
+      cmdmap_t cmdmap;
+      std::ostringstream ds;
+      std::string prefix;
+      cmdmap_from_json(m->cmd, &cmdmap, ds);
+      cmd_getval(cmdmap, "prefix", prefix);
+      const bool omit_cmd = (prefix == "config set" || prefix == "config-key set");
+      if (!omit_cmd)
+        ss << "cmd='" << m->cmd << "': finished";
+
+      mon.audit_clog->info() << ss.str();
+      mon.reply_command(op, rc, rs, rdata, version);
+    }
+  };
+
+ private:
+  /// Completion context that feeds its op back into Monitor::dispatch_op().
+  class C_RetryMessage : public C_MonOp {
+    Monitor *mon;
+  public:
+    C_RetryMessage(Monitor *m, MonOpRequestRef op)
+      : C_MonOp(op), mon(m)
+    {}
+
+    /**
+     * @param r  >= 0 or -EAGAIN: re-dispatch the op; -ECANCELED: do nothing;
+     *           any other value aborts.
+     */
+    void _finish(int r) override {
+      if (r >= 0 || r == -EAGAIN) {
+        mon->dispatch_op(op);
+        return;
+      }
+      if (r != -ECANCELED)
+        ceph_abort_msg("bad C_RetryMessage return value");
+    }
+  };
+
+  //ms_dispatch handles a lot of logic and we want to reuse it
+  //on forwarded messages, so we create a non-locking version for this class
+  void _ms_dispatch(Message *m);
+  /// Locking entry point: takes `lock`, hands the message to _ms_dispatch(),
+  /// and always returns true.
+  bool ms_dispatch(Message *m) override {
+    std::lock_guard guard{lock};
+    _ms_dispatch(m);
+    return true;
+  }
+  void dispatch_op(MonOpRequestRef op);
+  //mon_caps is used for un-connected messages from monitors
+  MonCap mon_caps;
+  bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
+public: // for AuthMonitor msgr1:
+  int ms_handle_authentication(Connection *con) override;
+private:
+  void ms_handle_accept(Connection *con) override;
+  bool ms_handle_reset(Connection *con) override;
+  /// No-op: nothing is done here on a remote reset.
+  void ms_handle_remote_reset(Connection *con) override {
+  }
+  bool ms_handle_refused(Connection *con) override;
+
+  // AuthClient
+  int get_auth_request(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t *method,
+    std::vector<uint32_t> *preferred_modes,
+    ceph::buffer::list *out) override;
+  int handle_auth_reply_more(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+   const ceph::buffer::list& bl,
+    ceph::buffer::list *reply) override;
+  int handle_auth_done(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint64_t global_id,
+    uint32_t con_mode,
+    const ceph::buffer::list& bl,
+    CryptoKey *session_key,
+    std::string *connection_secret) override;
+  int handle_auth_bad_method(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    uint32_t old_auth_method,
+    int result,
+    const std::vector<uint32_t>& allowed_methods,
+    const std::vector<uint32_t>& allowed_modes) override;
+  // /AuthClient
+  // AuthServer
+  int handle_auth_request(
+    Connection *con,
+    AuthConnectionMeta *auth_meta,
+    bool more,
+    uint32_t auth_method,
+    const ceph::buffer::list& bl,
+    ceph::buffer::list *reply) override;
+  // /AuthServer
+
+  int write_default_keyring(ceph::buffer::list& bl);
+  void extract_save_mon_key(KeyRing& keyring);
+
+  void collect_metadata(Metadata *m);
+  int load_metadata();
+  void count_metadata(const std::string& field, ceph::Formatter *f);
+  void count_metadata(const std::string& field, std::map<std::string,int> *out);
+  // get_all_versions() gathers version information from daemons for health check
+  void get_all_versions(std::map<string, std::list<std::string>> &versions);
+  void get_versions(std::map<string, std::list<std::string>> &versions);
+
+  // features
+  static CompatSet get_initial_supported_features();
+  static CompatSet get_supported_features();
+  static CompatSet get_legacy_features();
+  /// read the ondisk features into the CompatSet pointed to by read_features
+  static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
+  void read_features();
+  void write_features(MonitorDBStore::TransactionRef t);
+
+  OpTracker op_tracker;
+
+ public:
+  Monitor(CephContext *cct_, std::string nm, MonitorDBStore *s,
+	  Messenger *m, Messenger *mgr_m, MonMap *map);
+  ~Monitor() override;
+
+  static int check_features(MonitorDBStore *store);
+
+  // config observer
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+
+  void update_log_clients();
+  int sanitize_options();
+  int preinit();
+  int init();
+  void init_paxos();
+  void refresh_from_paxos(bool *need_bootstrap);
+  void shutdown();
+  void tick();
+
+  void handle_signal(int sig);
+
+  int mkfs(ceph::buffer::list& osdmapbl);
+
+  /**
+   * check cluster_fsid file
+   *
+   * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
+   */
+  int check_fsid();
+
+  /**
+   * write cluster_fsid file
+   *
+   * @return 0 on success, or negative error code
+   */
+  int write_fsid();
+  int write_fsid(MonitorDBStore::TransactionRef t);
+
+  int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
+		       ceph::Formatter *f,
+		       std::ostream& err,
+		       std::ostream& out);
+
+private:
+  // don't allow copying: both copy operations are deleted, so an attempted
+  // copy is rejected at compile time rather than at link time
+  Monitor(const Monitor& rhs) = delete;
+  Monitor& operator=(const Monitor &rhs) = delete;
+
+public:
+  static void format_command_descriptions(const std::vector<MonCommand> &commands,
+					  ceph::Formatter *f,
+					  uint64_t features,
+					  ceph::buffer::list *rdata);
+
+  /// Pick the local command table matching feature set `f`: the regular
+  /// table when FEATURE_NAUTILUS is fully contained, the pre-nautilus table
+  /// otherwise.
+  const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
+    if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+      return prenautilus_local_mon_commands;
+    }
+    return local_mon_commands;
+  }
+  /// Buffer-list counterpart of get_local_commands(): same FEATURE_NAUTILUS
+  /// test, returning the matching *_bl member.
+  const ceph::buffer::list& get_local_commands_bl(mon_feature_t f) {
+    if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
+      return prenautilus_local_mon_commands_bl;
+    }
+    return local_mon_commands_bl;
+  }
+  /// Replace leader_mon_commands with a copy of `cmds`.
+  void set_leader_commands(const std::vector<MonCommand>& cmds) { leader_mon_commands = cmds; }
+
+  bool is_keyring_required();
+};
+
+// Monitor "incompat" feature descriptors.  Each macro expands to a
+// CompatSet::Feature built from a numeric id and a human-readable
+// description; the ids below run consecutively from 1 to 13.
+#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
+#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
+#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
+#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
+#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
+#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout")
+#define CEPH_MON_FEATURE_INCOMPAT_PACIFIC CompatSet::Feature(13, "pacific ondisk layout")
+// make sure you add your feature to Monitor::get_supported_features
+
+
+/* Callers use:
+ *
+ *      new C_MonContext{...}
+ *
+ * instead of
+ *
+ *      new C_MonContext(...)
+ *
+ * because of gcc bug [1].
+ *
+ * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
+ */
+/// LambdaContext wrapper that skips the wrapped callable once the monitor
+/// reports is_shutdown().
+template<typename T>
+class C_MonContext : public LambdaContext<T> {
+  const Monitor* mon;  ///< monitor whose shutdown state gates finish()
+public:
+  C_MonContext(const Monitor* m, T&& f)
+    : LambdaContext<T>(std::forward<T>(f)), mon(m)
+  {}
+
+  /// Forward to LambdaContext<T>::finish(r) unless the monitor is shut down.
+  void finish(int r) override {
+    if (!mon->is_shutdown()) {
+      LambdaContext<T>::finish(r);
+    }
+  }
+};
+
+#endif
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
new file mode 100644
index 000000000..c33d35e48
--- /dev/null
+++ b/src/mon/MonitorDBStore.h
@@ -0,0 +1,814 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#ifndef CEPH_MONITOR_DB_STORE_H
+#define CEPH_MONITOR_DB_STORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include <set>
+#include <map>
+#include <string>
+#include <boost/scoped_ptr.hpp>
+#include <sstream>
+#include <fstream>
+#include "kv/KeyValueDB.h"
+
+#include "include/ceph_assert.h"
+#include "common/Formatter.h"
+#include "common/Finisher.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/safe_io.h"
+#include "common/blkdev.h"
+#include "common/PriorityCache.h"
+
+#define dout_context g_ceph_context
+
+class MonitorDBStore
+{
+  std::string path;
+  boost::scoped_ptr<KeyValueDB> db;
+  bool do_dump;
+  int dump_fd_binary;
+  std::ofstream dump_fd_json;
+  ceph::JSONFormatter dump_fmt;
+  
+
+  Finisher io_work;
+
+  bool is_open;
+
+ public:
+
+  /// Ask get_device_by_path() for the device backing `path` and return the
+  /// device-name buffer as a string.  The helper's return value is ignored;
+  /// the buffer starts zero-filled, so if it is left untouched the result is
+  /// an empty string.
+  std::string get_devname() {
+    char partition[4096];
+    char devname[4096] = {0};
+    get_device_by_path(path.c_str(), partition, devname, sizeof(devname));
+    return devname;
+  }
+
+  /// Return a copy of the store path.  Const-qualified (it only reads
+  /// `path`) so it can be called through a const MonitorDBStore, matching
+  /// get_priority_cache().
+  std::string get_path() const {
+    return path;
+  }
+
+  /// Hand out whatever priority cache the underlying `db` exposes.
+  std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const {
+    std::shared_ptr<PriorityCache::PriCache> cache = db->get_priority_cache();
+    return cache;
+  }
+
+  /// One queued store operation: a type code plus the prefix, key(s) and
+  /// value buffer it applies to.
+  struct Op {
+    uint8_t type = 0;
+    std::string prefix;
+    std::string key, endkey;
+    ceph::buffer::list bl;
+
+    Op() {}
+    /// Operation on a single key, no value.
+    Op(int t, const std::string& p, const std::string& k)
+      : type(t), prefix(p), key(k)
+    {}
+    /// Operation on a single key carrying a value buffer.
+    Op(int t, const std::string& p, const std::string& k, const ceph::buffer::list& b)
+      : type(t), prefix(p), key(k), bl(b)
+    {}
+    /// Operation on a key range: `start` is stored in `key`, `end` in `endkey`.
+    Op(int t, const std::string& p, const std::string& start, const std::string& end)
+      : type(t), prefix(p), key(start), endkey(end)
+    {}
+
+    /// Field order on the wire: type, prefix, key, bl, endkey.
+    void encode(ceph::buffer::list& encode_bl) const {
+      ENCODE_START(2, 1, encode_bl);
+      encode(type, encode_bl);
+      encode(prefix, encode_bl);
+      encode(key, encode_bl);
+      encode(bl, encode_bl);
+      encode(endkey, encode_bl);
+      ENCODE_FINISH(encode_bl);
+    }
+
+    /// Mirror of encode(); `endkey` is read only when struct_v >= 2.
+    void decode(ceph::buffer::list::const_iterator& decode_bl) {
+      DECODE_START(2, decode_bl);
+      decode(type, decode_bl);
+      decode(prefix, decode_bl);
+      decode(key, decode_bl);
+      decode(bl, decode_bl);
+      if (struct_v >= 2) {
+        decode(endkey, decode_bl);
+      }
+      DECODE_FINISH(decode_bl);
+    }
+
+    /// Dump type, prefix and key; `endkey` is emitted only when non-empty.
+    /// The value buffer is not dumped.
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("type", type);
+      f->dump_string("prefix", prefix);
+      f->dump_string("key", key);
+      if (!endkey.empty()) {
+        f->dump_string("endkey", endkey);
+      }
+    }
+
+    /// Rough size estimate: a fixed part plus the lengths of the three
+    /// strings and the value buffer.
+    /// NOTE(review): the 6 + 1 and the four 4s look like the encoding header,
+    /// the type byte and per-field length prefixes -- confirm against the
+    /// encoding macros.
+    int approx_size() const {
+      const auto variable_part =
+        prefix.size() + key.size() + endkey.size() + bl.length();
+      return 6 + 1 + 4 * 4 + variable_part;
+    }
+
+    /// Provide a single default-constructed instance.
+    static void generate_test_instances(std::list<Op*>& ls) {
+      // we get coverage here from the Transaction instances
+      ls.push_back(new Op);
+    }
+  };
+
+  struct Transaction;
+  typedef std::shared_ptr<Transaction> TransactionRef;
+  struct Transaction {
+    std::list<Op> ops;
+    uint64_t bytes, keys;
+
+    /// Empty transaction: no keys, and `bytes` starting at the fixed
+    /// baseline 6 + 4 + 8*2.
+    Transaction()
+      : bytes(6 + 4 + 8*2),
+        keys(0)
+    {}
+
+    /// Operation type codes stored in Op::type.
+    enum {
+      OP_PUT	= 1,         // queued by put()
+      OP_ERASE	= 2,         // queued by erase()
+      OP_COMPACT = 3,        // queued by compact_prefix() / compact_range()
+      OP_ERASE_RANGE = 4,    // queued by erase_range()
+    };
+
+    /// Queue an OP_PUT of `bl` under (prefix, key), counting one key and the
+    /// op's approximate size.
+    void put(const std::string& prefix, const std::string& key, const ceph::buffer::list& bl) {
+      ops.emplace_back(OP_PUT, prefix, key, bl);
+      bytes += ops.back().approx_size();
+      ++keys;
+    }
+
+    /// Same as the string-key put(), using the stream-formatted `ver` as key.
+    void put(const std::string& prefix, version_t ver, const ceph::buffer::list& bl) {
+      std::ostringstream key_text;
+      key_text << ver;
+      put(prefix, key_text.str(), bl);
+    }
+
+    /// Queue a put whose value is the encoded form of `ver`.
+    void put(const std::string& prefix, const std::string& key, version_t ver) {
+      using ceph::encode;
+      ceph::buffer::list encoded_ver;
+      encode(ver, encoded_ver);
+      put(prefix, key, encoded_ver);
+    }
+
+    /// Queue an OP_ERASE of (prefix, key), counting one key and the op's
+    /// approximate size.
+    void erase(const std::string& prefix, const std::string& key) {
+      ops.emplace_back(OP_ERASE, prefix, key);
+      bytes += ops.back().approx_size();
+      ++keys;
+    }
+
+    /// Same as the string-key erase(), using the stream-formatted `ver` as key.
+    void erase(const std::string& prefix, version_t ver) {
+      std::ostringstream key_text;
+      key_text << ver;
+      erase(prefix, key_text.str());
+    }
+
+    /// Queue an OP_ERASE_RANGE over [begin, end) -- NOTE(review): the exact
+    /// bound semantics are decided where the op is applied; confirm there.
+    /// Counts as one key plus the op's approximate size.
+    void erase_range(const std::string& prefix, const std::string& begin,
+                     const std::string& end) {
+      ops.emplace_back(OP_ERASE_RANGE, prefix, begin, end);
+      bytes += ops.back().approx_size();
+      ++keys;
+    }
+
+    /// Queue an OP_COMPACT for `prefix` with an empty key.  Unlike put/erase,
+    /// this does not touch the key or byte counters.
+    void compact_prefix(const std::string& prefix) {
+      ops.emplace_back(OP_COMPACT, prefix, std::string());
+    }
+
+    /// Queue an OP_COMPACT for `prefix` between `start` and `end`.  Unlike
+    /// put/erase, this does not touch the key or byte counters.
+    void compact_range(const std::string& prefix, const std::string& start,
+                       const std::string& end) {
+      ops.emplace_back(OP_COMPACT, prefix, start, end);
+    }
+
+    /// Serialize the transaction into `bl`: the op list, then `bytes`, then
+    /// `keys`, framed by ENCODE_START(2, 1, ...) / ENCODE_FINISH.
+    void encode(ceph::buffer::list& bl) const {
+      ENCODE_START(2, 1, bl);
+      encode(ops, bl);
+      encode(bytes, bl);
+      encode(keys, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    /// Deserialize from `bl`.  The op list is always read; `bytes` and `keys`
+    /// are read only when struct_v >= 2, otherwise they keep their current
+    /// values.
+    void decode(ceph::buffer::list::const_iterator& bl) {
+      DECODE_START(2, bl);
+      decode(ops, bl);
+      if (struct_v >= 2) {
+	decode(bytes, bl);
+	decode(keys, bl);
+      }
+      DECODE_FINISH(bl);
+    }
+
+    /// Append two heap-allocated sample transactions to @p ls: an empty one
+    /// and one that contains every kind of operation.
+    static void generate_test_instances(std::list<Transaction*>& ls) {
+      // first instance: nothing queued
+      ls.push_back(new Transaction);
+
+      // second instance: one op of each flavour
+      Transaction *populated = new Transaction;
+      ls.push_back(populated);
+      ceph::buffer::list payload;
+      payload.append("value");
+      populated->put("prefix", "key", payload);
+      populated->erase("prefix2", "key2");
+      populated->erase_range("prefix3", "key3", "key4");
+      populated->compact_prefix("prefix3");
+      populated->compact_range("prefix4", "from", "to");
+    }
+
+    /// Move every op of @p other to the end of this transaction and add its
+    /// key and byte counters to ours.  @p other is left with an empty op
+    /// list; its own counters are not reset.
+    void append(TransactionRef other) {
+      keys += other->keys;
+      bytes += other->bytes;
+      ops.splice(ops.end(), other->ops);
+    }
+
+    /// Decode a transaction from the start of @p bl and append it to this
+    /// one.
+    void append_from_encoded(ceph::buffer::list& bl) {
+      auto decoded = std::make_shared<Transaction>();
+      auto cursor = bl.cbegin();
+      decoded->decode(cursor);
+      append(decoded);
+    }
+
+    /// True when no operations have been queued.  Declared const, like
+    /// size(), so that it can also be called on a const transaction.
+    bool empty() const {
+      return size() == 0;
+    }
+
+    // Number of queued operations (compactions included).
+    size_t size() const {
+      return ops.size();
+    }
+    // Key counter: incremented once per put/erase/erase_range and summed
+    // by append().
+    uint64_t get_keys() const {
+      return keys;
+    }
+    // Byte counter: the constructor's initial value plus the approx_size()
+    // of every put/erase/erase_range op, summed by append().
+    uint64_t get_bytes() const {
+      return bytes;
+    }
+
+    /// Describe the transaction through @p f: an "ops" array with one
+    /// object per operation, followed by the key and byte counters.
+    ///
+    /// @param f         formatter receiving the description
+    /// @param dump_val  when true, PUT ops also carry a hex dump of their
+    ///                  value under "bl"
+    void dump(ceph::Formatter *f, bool dump_val=false) const {
+      f->open_object_section("transaction");
+      f->open_array_section("ops");
+      int index = 0;
+      for (const Op& op : ops) {
+        f->open_object_section("op");
+        f->dump_int("op_num", index);
+        ++index;
+        if (op.type == OP_PUT) {
+          f->dump_string("type", "PUT");
+          f->dump_string("prefix", op.prefix);
+          f->dump_string("key", op.key);
+          f->dump_unsigned("length", op.bl.length());
+          if (dump_val) {
+            std::ostringstream hex;
+            op.bl.hexdump(hex);
+            f->dump_string("bl", hex.str());
+          }
+        } else if (op.type == OP_ERASE) {
+          f->dump_string("type", "ERASE");
+          f->dump_string("prefix", op.prefix);
+          f->dump_string("key", op.key);
+        } else if (op.type == OP_ERASE_RANGE) {
+          f->dump_string("type", "ERASE_RANGE");
+          f->dump_string("prefix", op.prefix);
+          f->dump_string("start", op.key);
+          f->dump_string("end", op.endkey);
+        } else if (op.type == OP_COMPACT) {
+          f->dump_string("type", "COMPACT");
+          f->dump_string("prefix", op.prefix);
+          f->dump_string("start", op.key);
+          f->dump_string("end", op.endkey);
+        } else {
+          // op code we do not know how to describe
+          f->dump_string("type", "unknown");
+          f->dump_unsigned("op_code", op.type);
+        }
+        f->close_section();  // op
+      }
+      f->close_section();  // ops
+      f->dump_unsigned("num_keys", keys);
+      f->dump_unsigned("num_bytes", bytes);
+      f->close_section();  // transaction
+    }
+  };
+
+  /// Apply @p t to the backing store synchronously.
+  ///
+  /// When transaction dumping is enabled the transaction is first written
+  /// to the dump target (JSON or binary).  PUT/ERASE/ERASE_RANGE ops are
+  /// translated into one KeyValueDB transaction; COMPACT ops are collected
+  /// and only scheduled (asynchronously) after the write succeeded.  An
+  /// unknown op type or a failed write aborts the process.
+  ///
+  /// @returns the result of submit_transaction_sync()
+  int apply_transaction(MonitorDBStore::TransactionRef t) {
+    KeyValueDB::Transaction dbt = db->get_transaction();
+
+    if (do_dump) {
+      if (g_conf()->mon_debug_dump_json) {
+        t->dump(&dump_fmt, true);
+        dump_fmt.flush(dump_fd_json);
+        dump_fd_json.flush();
+      } else {
+        ceph::buffer::list encoded;
+        t->encode(encoded);
+        encoded.write_fd(dump_fd_binary);
+      }
+    }
+
+    // (prefix, (start, end)) of every compaction requested by the ops
+    using KeyRange = std::pair<std::string, std::string>;
+    std::list<std::pair<std::string, KeyRange>> pending_compactions;
+    for (const Op& op : t->ops) {
+      switch (op.type) {
+      case Transaction::OP_PUT:
+        dbt->set(op.prefix, op.key, op.bl);
+        break;
+      case Transaction::OP_ERASE:
+        dbt->rmkey(op.prefix, op.key);
+        break;
+      case Transaction::OP_ERASE_RANGE:
+        dbt->rm_range_keys(op.prefix, op.key, op.endkey);
+        break;
+      case Transaction::OP_COMPACT:
+        pending_compactions.emplace_back(op.prefix, KeyRange(op.key, op.endkey));
+        break;
+      default:
+        derr << __func__ << " unknown op type " << op.type << dendl;
+        ceph_abort();
+        break;
+      }
+    }
+
+    const int r = db->submit_transaction_sync(dbt);
+    if (r >= 0) {
+      for (const auto& entry : pending_compactions) {
+        const std::string& prefix = entry.first;
+        const KeyRange& range = entry.second;
+        // an empty range means "compact the whole prefix"
+        if (range.first.empty() && range.second.empty())
+          db->compact_prefix_async(prefix);
+        else
+          db->compact_range_async(prefix, range.first, range.second);
+      }
+    } else {
+      ceph_abort_msg("failed to write to db");
+    }
+    return r;
+  }
+
+  /// Completion that applies a transaction to the store and then completes
+  /// the caller-supplied context with the result.
+  struct C_DoTransaction : public Context {
+    MonitorDBStore *store;
+    MonitorDBStore::TransactionRef t;
+    Context *oncommit;
+    C_DoTransaction(MonitorDBStore *s, MonitorDBStore::TransactionRef t,
+                    Context *f)
+      : store(s), t(t), oncommit(f)
+    {}
+    void finish(int r) override {
+      /* The store serializes writes.  Each transaction is handled
+       * sequentially by the io_work Finisher, so while one transaction is
+       * being applied to permanent storage no other one is handled.
+       *
+       * For testing, a random delay may be injected here.  Sleeping before
+       * the transaction is applied is safe: it does not break that model.
+       */
+      const double delay_prob =
+        g_conf()->mon_inject_transaction_delay_probability;
+      const bool inject_delay =
+        delay_prob && (rand() % 10000 < delay_prob * 10000.0);
+      if (inject_delay) {
+        const double delay_max = g_conf()->mon_inject_transaction_delay_max;
+        utime_t delay;
+        delay.set_from_double(delay_max * (double)(rand() % 10000) / 10000.0);
+        lsubdout(g_ceph_context, mon, 1)
+          << "apply_transaction will be delayed for " << delay
+          << " seconds" << dendl;
+        delay.sleep();
+      }
+      const int ret = store->apply_transaction(t);
+      oncommit->complete(ret);
+    }
+  };
+
+  /**
+   * queue transaction
+   *
+   * Queue a transaction to commit asynchronously.  Trigger a context
+   * on completion (without any locks held).
+   */
+  void queue_transaction(MonitorDBStore::TransactionRef t,
+                         Context *oncommit) {
+    // wrap the transaction so the io_work finisher applies it and then
+    // completes oncommit with the result
+    Context *work = new C_DoTransaction(this, t, oncommit);
+    io_work.queue(work);
+  }
+
+  /**
+   * block and flush all io activity
+   */
+  void flush() {
+    // io_work is the finisher that queue_transaction() feeds
+    io_work.wait_for_empty();
+  }
+
+  // Abstract base of the store synchronizers: an iterator that hands out
+  // the store contents in chunks, each expressed as a transaction.
+  class StoreIteratorImpl {
+  protected:
+    // set by the implementation once there is nothing left to hand out
+    bool done;
+    // (prefix, key) recorded by the implementation; see get_last_key()
+    std::pair<std::string,std::string> last_key;
+    // data over which crc() is computed
+    ceph::buffer::list crc_bl;
+
+    StoreIteratorImpl() : done(false) { }
+    virtual ~StoreIteratorImpl() { }
+
+    // whether the underlying iterator still points at an entry
+    virtual bool _is_valid() = 0;
+
+  public:
+    // crc32c of crc_bl when mon_sync_debug is enabled, 0 otherwise
+    __u32 crc() {
+      if (g_conf()->mon_sync_debug)
+	return crc_bl.crc32c(0);
+      return 0;
+    }
+    std::pair<std::string,std::string> get_last_key() {
+      return last_key;
+    }
+    // true while the iterator is neither marked done nor invalid
+    virtual bool has_next_chunk() {
+      return !done && _is_valid();
+    }
+    // append the next chunk to tx, bounded by max_bytes and max_keys
+    virtual void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+			      uint64_t max_keys) = 0;
+    virtual std::pair<std::string,std::string> get_next_key() = 0;
+  };
+  typedef std::shared_ptr<StoreIteratorImpl> Synchronizer;
+
+  /// Synchronizer that walks the whole key space and only hands out keys
+  /// whose prefix is in a given set.
+  class WholeStoreIteratorImpl : public StoreIteratorImpl {
+    KeyValueDB::WholeSpaceIterator iter;
+    std::set<std::string> sync_prefixes;
+
+  public:
+    WholeStoreIteratorImpl(KeyValueDB::WholeSpaceIterator iter,
+                           std::set<std::string> &prefixes)
+      : StoreIteratorImpl(),
+        iter(iter),
+        sync_prefixes(prefixes)
+    { }
+
+    ~WholeStoreIteratorImpl() override { }
+
+    /**
+     * Obtain a chunk of the store
+     *
+     * Starting at the iterator's current position, a PUT is appended to
+     * @p tx for every entry whose prefix is being synchronized.  An entry
+     * is always accepted while @p tx is still empty; afterwards it is only
+     * accepted while the transaction stays below both limits.  The first
+     * entry that does not fit is recorded in last_key and the iterator is
+     * left positioned on it.  When the iterator runs out of entries the
+     * synchronizer is marked as done.
+     *
+     * @param tx         Transaction receiving the chunk
+     * @param max_bytes  Byte limit for the transaction
+     * @param max_keys   Key limit for the transaction
+     */
+    void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+                      uint64_t max_keys) override {
+      using ceph::encode;
+      ceph_assert(done == false);
+      ceph_assert(iter->valid() == true);
+
+      for (; iter->valid(); iter->next()) {
+        const std::pair<std::string,std::string> raw = iter->raw_key();
+        const std::string& prefix = raw.first;
+        const std::string& key = raw.second;
+        if (sync_prefixes.count(prefix) == 0)
+          continue;  // not a prefix we were asked to sync
+
+        ceph::buffer::list value = iter->value();
+        const bool accept =
+          tx->empty() ||
+          (tx->get_bytes() + value.length() + key.size() +
+           prefix.size() < max_bytes &&
+           tx->get_keys() < max_keys);
+        if (!accept) {
+          // chunk is full; remember where we stopped
+          last_key = std::make_pair(prefix, key);
+          return;
+        }
+
+        // NOTE: putting every key in a separate transaction is
+        // questionable as far as efficiency goes
+        auto single_put(std::make_shared<Transaction>());
+        single_put->put(prefix, key, value);
+        tx->append(single_put);
+        if (g_conf()->mon_sync_debug) {
+          encode(prefix, crc_bl);
+          encode(key, crc_bl);
+          encode(value, crc_bl);
+        }
+      }
+      ceph_assert(iter->valid() == false);
+      done = true;
+    }
+
+    /// Return the next key whose prefix is being synchronized and step the
+    /// iterator past it; an empty pair when no such key is left.
+    std::pair<std::string,std::string> get_next_key() override {
+      ceph_assert(iter->valid());
+
+      while (iter->valid()) {
+        std::pair<std::string,std::string> candidate = iter->raw_key();
+        iter->next();
+        if (sync_prefixes.count(candidate.first) > 0)
+          return candidate;
+      }
+      return std::pair<std::string,std::string>();
+    }
+
+    bool _is_valid() override {
+      return iter->valid();
+    }
+  };
+
+  /// Build a synchronizer over the whole store, restricted to @p prefixes.
+  /// Iteration resumes just past @p key when both of its halves are set,
+  /// and starts at the first entry otherwise.
+  Synchronizer get_synchronizer(std::pair<std::string,std::string> &key,
+                                std::set<std::string> &prefixes) {
+    KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+
+    const bool start_at_beginning = key.first.empty() || key.second.empty();
+    if (start_at_beginning)
+      iter->seek_to_first();
+    else
+      iter->upper_bound(key.first, key.second);
+
+    return Synchronizer(new WholeStoreIteratorImpl(iter, prefixes));
+  }
+
+  /// Iterator over the keys of @p prefix (which must not be empty),
+  /// positioned on the first entry.
+  KeyValueDB::Iterator get_iterator(const std::string &prefix) {
+    ceph_assert(!prefix.empty());
+    KeyValueDB::Iterator prefix_iter = db->get_iterator(prefix);
+    prefix_iter->seek_to_first();
+    return prefix_iter;
+  }
+
+  /// Iterator over the whole key space, positioned on the first entry.
+  KeyValueDB::WholeSpaceIterator get_iterator() {
+    KeyValueDB::WholeSpaceIterator whole_iter = db->get_wholespace_iterator();
+    whole_iter->seek_to_first();
+    return whole_iter;
+  }
+
+  /// Read the value stored under (@p prefix, @p key) into @p bl, which
+  /// must be empty on entry.  Returns the backing store's result code.
+  int get(const std::string& prefix, const std::string& key, ceph::buffer::list& bl) {
+    ceph_assert(bl.length() == 0);
+    const int result = db->get(prefix, key, &bl);
+    return result;
+  }
+
+  /// Convenience overload: the key is @p ver written out through an output
+  /// string stream.
+  int get(const std::string& prefix, const version_t ver, ceph::buffer::list& bl) {
+    std::ostringstream key_stream;
+    key_stream << ver;
+    const std::string key = key_stream.str();
+    return get(prefix, key, bl);
+  }
+
+  /// Read the version number stored under (@p prefix, @p key).
+  ///
+  /// A missing key reads as 0.  Any other read error aborts, because a
+  /// version_t return value cannot carry a negative error code.
+  version_t get(const std::string& prefix, const std::string& key) {
+    using ceph::decode;
+    ceph::buffer::list bl;
+    const int err = get(prefix, key, bl);
+    if (err == -ENOENT) {
+      // if key doesn't exist, assume its value is 0
+      return 0;
+    }
+    if (err < 0) {
+      // we're not expecting any other negative return value
+      generic_dout(0) << "MonitorDBStore::get() error obtaining"
+                      << " (" << prefix << ":" << key << "): "
+                      << cpp_strerror(err) << dendl;
+      ceph_abort_msg("error obtaining key");
+    }
+
+    ceph_assert(bl.length());
+    auto p = bl.cbegin();
+    version_t ver;
+    decode(ver, p);
+    return ver;
+  }
+
+  /// Whether (@p prefix, @p key) is present: seek to the first key not
+  /// less than @p key and check that it is exactly @p key.
+  bool exists(const std::string& prefix, const std::string& key) {
+    KeyValueDB::Iterator it = db->get_iterator(prefix);
+    if (it->lower_bound(key) < 0)
+      return false;
+    if (!it->valid())
+      return false;
+    return it->key() == key;
+  }
+
+  /// Convenience overload: the key is @p ver written out through an output
+  /// string stream.
+  bool exists(const std::string& prefix, version_t ver) {
+    std::ostringstream key_stream;
+    key_stream << ver;
+    const std::string key = key_stream.str();
+    return exists(prefix, key);
+  }
+
+  /// Join @p prefix and @p value with an underscore: "<prefix>_<value>".
+  std::string combine_strings(const std::string& prefix, const std::string& value) {
+    return prefix + '_' + value;
+  }
+
+  /// Convenience overload: the value is @p ver written out through an
+  /// output string stream.
+  std::string combine_strings(const std::string& prefix, const version_t ver) {
+    std::ostringstream ver_stream;
+    ver_stream << ver;
+    const std::string value = ver_stream.str();
+    return combine_strings(prefix, value);
+  }
+
+  /// Remove every key under each of @p prefixes in one synchronous
+  /// transaction; asserts that the write succeeded.
+  void clear(std::set<std::string>& prefixes) {
+    KeyValueDB::Transaction dbt = db->get_transaction();
+    for (const std::string& prefix : prefixes) {
+      dbt->rmkeys_by_prefix(prefix);
+    }
+    int r = db->submit_transaction_sync(dbt);
+    ceph_assert(r >= 0);
+  }
+
+  /// Create the KeyValueDB backend of type @p kv_type under
+  /// "<path>/store.db", set up transaction dumping if configured, and
+  /// initialise the backend.  Aborts when the backend cannot be created.
+  ///
+  /// Transaction dumping in binary mode is only enabled when the dump file
+  /// could actually be opened; on failure the error is logged and no dump
+  /// is attempted, so that neither apply_transaction() nor the destructor
+  /// ever touches an invalid descriptor.
+  void _open(const std::string& kv_type) {
+    // count the trailing '/' characters of the store path so that they can
+    // be dropped before "/store.db" is appended
+    int pos = 0;
+    for (auto rit = path.rbegin(); rit != path.rend(); ++rit, ++pos) {
+      if (*rit != '/')
+        break;
+    }
+    std::ostringstream os;
+    os << path.substr(0, path.size() - pos) << "/store.db";
+    std::string full_path = os.str();
+
+    KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context,
+                                            kv_type,
+                                            full_path);
+    if (!db_ptr) {
+      derr << __func__ << " error initializing "
+           << kv_type << " db back storage in "
+           << full_path << dendl;
+      ceph_abort_msg("MonitorDBStore: error initializing keyvaluedb back storage");
+    }
+    db.reset(db_ptr);
+
+    if (g_conf()->mon_debug_dump_transactions) {
+      if (!g_conf()->mon_debug_dump_json) {
+        dump_fd_binary = ::open(
+          g_conf()->mon_debug_dump_location.c_str(),
+          O_CREAT|O_APPEND|O_WRONLY|O_CLOEXEC, 0644);
+        if (dump_fd_binary < 0) {
+          dump_fd_binary = -errno;
+          derr << "Could not open log file, got "
+               << cpp_strerror(dump_fd_binary) << dendl;
+        } else {
+          do_dump = true;
+        }
+      } else {
+        dump_fmt.reset();
+        dump_fmt.open_array_section("dump");
+        dump_fd_json.open(g_conf()->mon_debug_dump_location.c_str());
+        do_dump = true;
+      }
+    }
+    if (kv_type == "rocksdb")
+      db->init(g_conf()->mon_rocksdb_options);
+    else
+      db->init();
+  }
+
+  /// Open an existing store.
+  ///
+  /// The backend type comes from the "kv_backend" meta file; when that is
+  /// missing or empty the store is taken to be leveldb and the meta file
+  /// is written.  On success the io_work finisher is started and the store
+  /// is marked open.
+  ///
+  /// @param out  stream handed to the backend's open()
+  /// @returns 0 on success, a negative error code otherwise
+  int open(std::ostream &out) {
+    std::string kv_type;
+    int r = read_meta("kv_backend", &kv_type);
+    const bool backend_unrecorded = (r < 0 || kv_type.empty());
+    if (backend_unrecorded) {
+      // assume old monitors that did not mark the type were leveldb.
+      kv_type = "leveldb";
+      r = write_meta("kv_backend", kv_type);
+      if (r < 0)
+        return r;
+    }
+    _open(kv_type);
+    r = db->open(out);
+    if (r < 0)
+      return r;
+
+    // Monitors are few in number, so the resource cost of exposing
+    // very detailed stats is low: ramp up the priority of all the
+    // KV store's perf counters.  Do this after open, because backend may
+    // not have constructed PerfCounters earlier.
+    auto counters = db->get_perf_counters();
+    if (counters) {
+      counters->set_prio_adjust(
+          PerfCountersBuilder::PRIO_USEFUL - PerfCountersBuilder::PRIO_DEBUGONLY);
+    }
+
+    io_work.start();
+    is_open = true;
+    return 0;
+  }
+
+  /// Create a new store and open it.
+  ///
+  /// The backend type is taken from the "kv_backend" meta file; when that
+  /// is missing or empty the configured mon_keyvaluedb type is used and
+  /// recorded.  On success the io_work finisher is started and the store
+  /// is marked open.
+  ///
+  /// @param out  stream handed to the backend's create_and_open()
+  /// @returns 0 on success, a negative error code otherwise
+  int create_and_open(std::ostream &out) {
+    // record the type before open
+    std::string kv_type;
+    int r = read_meta("kv_backend", &kv_type);
+    // Treat an empty meta value like a missing one, as open() does;
+    // otherwise _open() would be asked for a backend with no type.
+    if (r < 0 || kv_type.empty()) {
+      kv_type = g_conf()->mon_keyvaluedb;
+      r = write_meta("kv_backend", kv_type);
+      if (r < 0)
+        return r;
+    }
+    _open(kv_type);
+    r = db->create_and_open(out);
+    if (r < 0)
+      return r;
+    io_work.start();
+    is_open = true;
+    return 0;
+  }
+
+  /// Stop the io_work finisher, mark the store closed and release the
+  /// backend.
+  void close() {
+    // there should be no work queued!
+    io_work.stop();
+    is_open = false;
+    db.reset();
+  }
+
+  // Forward to the backend's compact().
+  void compact() {
+    db->compact();
+  }
+
+  // Forward to the backend's compact_async().
+  void compact_async() {
+    db->compact_async();
+  }
+
+  // Forward to the backend's compact_prefix() for the given prefix.
+  void compact_prefix(const std::string& prefix) {
+    db->compact_prefix(prefix);
+  }
+
+  // Forward to the backend's get_estimated_size(); extras is passed through
+  // for the backend to use.
+  uint64_t get_estimated_size(std::map<std::string, uint64_t> &extras) {
+    return db->get_estimated_size(extras);
+  }
+
+  /**
+   * write_meta - write a simple configuration key out-of-band
+   *
+   * Write a simple key/value pair for basic store configuration
+   * (e.g., a uuid or magic number) to an unopened/unmounted store.
+   * The default implementation writes this to a plaintext file in the
+   * path.
+   *
+   * A newline is appended.
+   *
+   * @param key key name (e.g., "fsid")
+   * @param value value (e.g., a uuid rendered as a string)
+   * @returns 0 for success, or an error code
+   */
+  int write_meta(const std::string& key,
+                 const std::string& value) const {
+    // the file holds the value followed by a single newline
+    const std::string line = value + "\n";
+    const int r = safe_write_file(path.c_str(), key.c_str(),
+                                  line.c_str(), line.length(),
+                                  0600);
+    return r < 0 ? r : 0;
+  }
+
+  /**
+   * read_meta - read a simple configuration key out-of-band
+   *
+   * Read a simple key value from an unopened/mounted store.
+   *
+   * Trailing whitespace is stripped off.
+   *
+   * @param key key name
+   * @param value pointer to value string
+   * @returns 0 for success, or an error code
+   */
+  int read_meta(const std::string& key,
+                std::string *value) const {
+    char buf[4096];
+    int r = safe_read_file(path.c_str(), key.c_str(),
+                           buf, sizeof(buf));
+    // errors are passed through; a result of 0 is also returned as is,
+    // in which case *value is left untouched
+    if (r <= 0)
+      return r;
+    // drop trailing whitespace.  isspace() is only defined for values
+    // representable as unsigned char (or EOF), so convert before the call
+    // to stay well-defined for bytes >= 0x80 where char is signed.
+    while (r && isspace(static_cast<unsigned char>(buf[r-1]))) {
+      --r;
+    }
+    *value = std::string(buf, r);
+    return 0;
+  }
+
+  // Construct a closed store rooted at path.  No backend is created and
+  // transaction dumping is disabled until one of the open calls runs.
+  explicit MonitorDBStore(const std::string& path)
+    : path(path),
+      db(0),
+      do_dump(false),
+      dump_fd_binary(-1),
+      dump_fmt(true),
+      io_work(g_ceph_context, "monstore", "fn_monstore"),
+      is_open(false) {
+  }
+  /// The store must have been closed before it is destroyed.  Finishes the
+  /// transaction dump, if one was set up: closes the binary dump file, or
+  /// terminates and flushes the JSON dump.
+  ~MonitorDBStore() {
+    ceph_assert(!is_open);
+    if (do_dump) {
+      if (!g_conf()->mon_debug_dump_json) {
+        // _open() stores a negative errno in dump_fd_binary when the dump
+        // file could not be opened; only close a real descriptor.
+        if (dump_fd_binary >= 0)
+          ::close(dump_fd_binary);
+      } else {
+        dump_fmt.close_section();
+        dump_fmt.flush(dump_fd_json);
+        dump_fd_json.flush();
+        dump_fd_json.close();
+      }
+    }
+  }
+
+};
+
+WRITE_CLASS_ENCODER(MonitorDBStore::Op)
+WRITE_CLASS_ENCODER(MonitorDBStore::Transaction)
+
+#endif /* CEPH_MONITOR_DB_STORE_H */
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
new file mode 100644
index 000000000..91d9021c2
--- /dev/null
+++ b/src/mon/MonmapMonitor.cc
@@ -0,0 +1,1470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "MonmapMonitor.h"
+#include "Monitor.h"
+#include "OSDMonitor.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonJoin.h"
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include <sstream>
+#include "common/config.h"
+#include "common/cmdparse.h"
+
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon)
+using namespace TOPNSPC::common;
+
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+// Log-line prefix for this file: monitor name, rank, state and the epoch
+// of the current monmap.
+static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name();
+  out << ").monmap v" << mon.monmap->epoch << " ";
+  return out;
+}
+
+// Seed the pending map from the monitor's current monmap at epoch 1.
+// Unless disabled by a debug option, new clusters start with the default
+// persistent features and the current release as min_mon_release.
+void MonmapMonitor::create_initial()
+{
+  dout(10) << __func__ << " using current monmap" << dendl;
+  pending_map = *mon.monmap;
+  pending_map.epoch = 1;
+
+  if (!g_conf()->mon_debug_no_initial_persistent_features) {
+    // initialize with default persistent features for new clusters
+    pending_map.persistent_features = ceph::features::mon::get_persistent();
+    pending_map.min_mon_release = ceph_release();
+    return;
+  }
+
+  derr << __func__ << " mon_debug_no_initial_persistent_features=true"
+       << dendl;
+}
+
+// Refresh the in-memory monmap from the latest committed version.
+//
+// Nothing happens when the committed version is not newer than the epoch
+// of the map already held.  Otherwise the new map is decoded in place, a
+// leftover "mkfs"/"monmap" key is erased, subscriptions are checked, the
+// "min_mon_release" meta file is brought up to date and the Monitor is
+// notified of the new map.
+//
+// need_bootstrap is an optional out-flag; when given it is set to true
+// whenever a newer map is about to be loaded.
+void MonmapMonitor::update_from_paxos(bool *need_bootstrap)
+{
+  version_t version = get_last_committed();
+  if (version <= mon.monmap->get_epoch())
+    return;
+
+  dout(10) << __func__ << " version " << version
+	   << ", my v " << mon.monmap->epoch << dendl;
+  
+  // NOTE(review): the version comparison is always true here, since the
+  // early return above already established version > epoch.
+  if (need_bootstrap && version != mon.monmap->get_epoch()) {
+    dout(10) << " signaling that we need a bootstrap" << dendl;
+    *need_bootstrap = true;
+  }
+
+  // read and decode
+  monmap_bl.clear();
+  int ret = get_version(version, monmap_bl);
+  ceph_assert(ret == 0);
+  ceph_assert(monmap_bl.length());
+
+  dout(10) << __func__ << " got " << version << dendl;
+  mon.monmap->decode(monmap_bl);
+
+  // a committed map supersedes the one stored under "mkfs"
+  if (mon.store->exists("mkfs", "monmap")) {
+    auto t(std::make_shared<MonitorDBStore::Transaction>());
+    t->erase("mkfs", "monmap");
+    mon.store->apply_transaction(t);
+  }
+
+  check_subs();
+
+  // make sure we've recorded min_mon_release
+  // (rewritten when unreadable, empty, or different from ceph_release();
+  // the result of write_meta() is not checked)
+  string val;
+  if (mon.store->read_meta("min_mon_release", &val) < 0 ||
+      val.size() == 0 ||
+      atoi(val.c_str()) != (int)ceph_release()) {
+    dout(10) << __func__ << " updating min_mon_release meta" << dendl;
+    mon.store->write_meta("min_mon_release",
+			   stringify(ceph_release()));
+  }
+
+  mon.notify_new_monmap(true);
+}
+
+// Start a new pending map: a copy of the current monmap with the epoch
+// bumped, a fresh change timestamp and no removed ranks.
+void MonmapMonitor::create_pending()
+{
+  pending_map = *mon.monmap;
+  ++pending_map.epoch;
+  pending_map.removed_ranks.clear();
+  pending_map.last_changed = ceph_clock_now();
+}
+
+// Encode the pending map into transaction t: the map itself under its
+// epoch, the last-committed marker, a new cluster fingerprint for the very
+// first epoch, and the health checks derived from the pending map.
+void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " epoch " << pending_map.epoch << dendl;
+
+  ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch ||
+	 pending_map.epoch == 1);  // special case mkfs!
+  bufferlist bl;
+  // encode with the features of the current quorum's connections
+  pending_map.encode(bl, mon.get_quorum_con_features());
+
+  put_version(t, pending_map.epoch, bl);
+  put_last_committed(t, pending_map.epoch);
+
+  // generate a cluster fingerprint, too?
+  if (pending_map.epoch == 1) {
+    mon.prepare_new_fingerprint(t);
+  }
+
+  //health
+  health_check_map_t next;
+  pending_map.check_health(&next);
+  encode_health(next, t);
+}
+
+// Deferred call to MonmapMonitor::apply_mon_features(), used while the
+// service is not yet writeable.
+class C_ApplyFeatures : public Context {
+  MonmapMonitor *svc;
+  mon_feature_t features;
+  ceph_release_t min_mon_release;
+public:
+  C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) :
+    svc(s), features(f), min_mon_release(mmr) { }
+  void finish(int r) override {
+    if (r == -EAGAIN || r == -ECANCELED) {
+      // discard features if we're no longer on the quorum that
+      // established them in the first place.
+      return;
+    }
+    if (r < 0) {
+      ceph_abort_msg("bad C_ApplyFeatures return value");
+    }
+    svc->apply_mon_features(features, min_mon_release);
+  }
+};
+
+// Fold the quorum's monitor features and minimum release into the pending
+// map and propose it.
+//
+// When the service is not writeable the call is re-queued through
+// C_ApplyFeatures.  Nothing is done unless every monitor in the monmap is
+// in quorum.  The given features must contain all persistent features
+// already in the pending map and must all be supported by this build.
+void MonmapMonitor::apply_mon_features(const mon_feature_t& features,
+				       ceph_release_t min_mon_release)
+{
+  if (!is_writeable()) {
+    dout(5) << __func__ << " wait for service to be writeable" << dendl;
+    wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release));
+    return;
+  }
+
+  // do nothing here unless we have a full quorum
+  if (mon.get_quorum().size() < mon.monmap->size()) {
+    return;
+  }
+
+  ceph_assert(is_writeable());
+  ceph_assert(features.contains_all(pending_map.persistent_features));
+  // we should never hit this because `features` should be the result
+  // of the quorum's supported features. But if it happens, die.
+  ceph_assert(ceph::features::mon::get_supported().contains_all(features));
+
+  // persistent features offered by the quorum that the pending map does
+  // not have yet (the assert above guarantees none would be dropped)
+  mon_feature_t new_features =
+    (pending_map.persistent_features ^
+     (features & ceph::features::mon::get_persistent()));
+
+  if (new_features.empty() &&
+      pending_map.min_mon_release == min_mon_release) {
+    dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release
+	     << ") and features (" << features << ") match" << dendl;
+    return;
+  }
+
+  if (!new_features.empty()) {
+    dout(1) << __func__ << " applying new features "
+	    << new_features << ", had " << pending_map.persistent_features
+	    << ", will have "
+	    << (new_features | pending_map.persistent_features)
+	    << dendl;
+    pending_map.persistent_features |= new_features;
+  }
+  if (min_mon_release > pending_map.min_mon_release) {
+    dout(1) << __func__ << " increasing min_mon_release to "
+	    << to_integer<int>(min_mon_release) << " (" << min_mon_release
+	    << ")" << dendl;
+    pending_map.min_mon_release = min_mon_release;
+  }
+
+  // NOTE(review): when there are no new features and min_mon_release is
+  // lower than the pending value, nothing was modified above yet a
+  // proposal is still made -- confirm that this is intended.
+  propose_pending();
+}
+
+// Called when the service becomes active: persist the "joined" marker the
+// first time this monitor is part of an active quorum, log the monmap on
+// the leader, apply the quorum's features and refresh pending metadata.
+void MonmapMonitor::on_active()
+{
+  const bool first_time_in_quorum =
+    get_last_committed() >= 1 && !mon.has_ever_joined;
+  if (first_time_in_quorum) {
+    // make note of the fact that i was, once, part of the quorum.
+    dout(10) << "noting that i was, once, part of an active quorum." << dendl;
+
+    /* The MonmapMonitor writes state that really belongs to the Monitor
+       itself.  We should find a way to get rid of this coupling given our
+       new architecture.  Until then, stick with it since we are a
+       single-threaded process and, truth be told, no one else relies on
+       this thing besides us.
+     */
+    auto joined_tx = std::make_shared<MonitorDBStore::Transaction>();
+    joined_tx->put(Monitor::MONITOR_NAME, "joined", 1);
+    mon.store->apply_transaction(joined_tx);
+    mon.has_ever_joined = true;
+  }
+
+  if (mon.is_leader()) {
+    mon.clog->debug() << "monmap " << *mon.monmap;
+  }
+
+  apply_mon_features(mon.get_quorum_mon_features(),
+                     mon.quorum_min_mon_release);
+
+  mon.update_pending_metadata();
+}
+
+// Dispatch the read-only stage of a request by message type.  A malformed
+// command argument is answered with -EINVAL; any message type other than
+// a command or a join aborts.
+bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  const auto msg_type = m->get_type();
+
+  if (msg_type == MSG_MON_JOIN)
+    return preprocess_join(op);
+
+  if (msg_type != MSG_MON_COMMAND) {
+    ceph_abort();
+    return true;
+  }
+
+  // READs
+  try {
+    return preprocess_command(op);
+  } catch (const bad_cmd_get& e) {
+    bufferlist bl;
+    mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+    return true;
+  }
+}
+
+// Dump the committed version range, the current monmap and the ranks of
+// the current quorum through f.
+void MonmapMonitor::dump_info(Formatter *f)
+{
+  f->dump_unsigned("monmap_first_committed", get_first_committed());
+  f->dump_unsigned("monmap_last_committed", get_last_committed());
+
+  f->open_object_section("monmap");
+  mon.monmap->dump(f);
+  f->close_section();
+
+  f->open_array_section("quorum");
+  for (const int rank : mon.get_quorum())
+    f->dump_int("mon", rank);
+  f->close_section();
+}
+
+// Handle the read-only monmap commands: "mon stat", "mon getmap",
+// "mon dump" and "mon feature ls".
+//
+// Returns true when a reply has been sent (including error replies) and
+// false when the command is not one of the above, so that it moves on to
+// the update stage.
+//
+// A map decoded for a past epoch is owned by a unique_ptr, so it is
+// released on every exit path, including when decoding, encoding or
+// dumping throws.
+bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  int r = -1;  // -1 means "not handled here"
+  bufferlist rdata;
+  stringstream ss;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "mon stat") {
+    if (f) {
+      f->open_object_section("monmap");
+      mon.monmap->dump_summary(f.get());
+      f->dump_string("leader", mon.get_leader_name());
+      f->open_array_section("quorum");
+      for (auto rank: mon.get_quorum()) {
+        std::string name = mon.monmap->get_name(rank);
+        f->open_object_section("mon");
+        f->dump_int("rank", rank);
+        f->dump_string("name", name);
+        f->close_section();  // mon
+      }
+      f->close_section();  // quorum
+      f->close_section();  // monmap
+      f->flush(ss);
+    } else {
+      mon.monmap->print_summary(ss);
+      ss << ", election epoch " << mon.get_epoch() << ", leader "
+         << mon.get_leader() << " " << mon.get_leader_name()
+         << ", quorum " << mon.get_quorum()
+         << " " << mon.get_quorum_names();
+    }
+
+    rdata.append(ss);
+    ss.str("");
+    r = 0;
+
+  } else if (prefix == "mon getmap" ||
+             prefix == "mon dump") {
+
+    epoch_t epoch;
+    int64_t epochnum;
+    cmd_getval(cmdmap, "epoch", epochnum, (int64_t)0);
+    epoch = epochnum;
+
+    // p is the map to report: the current one unless a past epoch was
+    // requested, in which case historic_map owns the decoded copy.
+    MonMap *p = mon.monmap;
+    std::unique_ptr<MonMap> historic_map;
+    if (epoch) {
+      bufferlist bl;
+      r = get_version(epoch, bl);
+      if (r == -ENOENT) {
+        ss << "there is no map for epoch " << epoch;
+        goto reply;
+      }
+      ceph_assert(r == 0);
+      ceph_assert(bl.length() > 0);
+      historic_map = std::make_unique<MonMap>();
+      historic_map->decode(bl);
+      p = historic_map.get();
+    }
+
+    ceph_assert(p);
+
+    if (prefix == "mon getmap") {
+      p->encode(rdata, m->get_connection()->get_features());
+      r = 0;
+      ss << "got monmap epoch " << p->get_epoch();
+    } else if (prefix == "mon dump") {
+      stringstream ds;
+      if (f) {
+        f->open_object_section("monmap");
+        p->dump(f.get());
+        f->open_array_section("quorum");
+        for (auto rank : mon.get_quorum()) {
+          f->dump_int("mon", rank);
+        }
+        f->close_section();
+        f->close_section();
+        f->flush(ds);
+        r = 0;
+      } else {
+        p->print(ds);
+        r = 0;
+      }
+      rdata.append(ds);
+      ss << "dumped monmap epoch " << p->get_epoch();
+    }
+
+  } else if (prefix == "mon feature ls") {
+
+    bool list_with_value = false;
+    string with_value;
+    if (cmd_getval(cmdmap, "with_value", with_value) &&
+        with_value == "--with-value") {
+      list_with_value = true;
+    }
+
+    MonMap *p = mon.monmap;
+
+    // list features
+    mon_feature_t supported = ceph::features::mon::get_supported();
+    mon_feature_t persistent = ceph::features::mon::get_persistent();
+    mon_feature_t required = p->get_required_features();
+
+    stringstream ds;
+    // emit one feature set, through the formatter when there is one and
+    // as plain text into ds otherwise
+    auto print_feature = [&](mon_feature_t& m_features, const char* m_str) {
+      if (f) {
+        if (list_with_value)
+          m_features.dump_with_value(f.get(), m_str);
+        else
+          m_features.dump(f.get(), m_str);
+      } else {
+        if (list_with_value)
+          m_features.print_with_value(ds);
+        else
+          m_features.print(ds);
+      }
+    };
+
+    if (f) {
+      f->open_object_section("features");
+
+      f->open_object_section("all");
+      print_feature(supported, "supported");
+      print_feature(persistent, "persistent");
+      f->close_section(); // all
+
+      f->open_object_section("monmap");
+      print_feature(p->persistent_features, "persistent");
+      print_feature(p->optional_features, "optional");
+      print_feature(required, "required");
+      f->close_section(); // monmap
+
+      f->close_section(); // features
+      f->flush(ds);
+
+    } else {
+      ds << "all features" << std::endl
+        << "\tsupported: ";
+      print_feature(supported, nullptr);
+      ds << std::endl
+        << "\tpersistent: ";
+      print_feature(persistent, nullptr);
+      ds << std::endl
+        << std::endl;
+
+      ds << "on current monmap (epoch "
+         << p->get_epoch() << ")" << std::endl
+         << "\tpersistent: ";
+      print_feature(p->persistent_features, nullptr);
+      ds << std::endl
+        // omit optional features in plain-text
+        // makes it easier to read, and they're, currently, empty.
+         << "\trequired: ";
+      print_feature(required, nullptr);
+      ds << std::endl;
+    }
+    rdata.append(ds);
+    r = 0;
+  }
+
+reply:
+  if (r != -1) {
+    // only the first line of the status text goes into the reply
+    string rs;
+    getline(ss, rs);
+
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return true;
+  } else
+    return false;
+}
+
+
+// Dispatch the update stage of a request by message type.  A malformed
+// command argument is answered with -EINVAL; any message type other than
+// a command or a join aborts.
+bool MonmapMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl;
+
+  const auto msg_type = m->get_type();
+  if (msg_type == MSG_MON_JOIN)
+    return prepare_join(op);
+
+  if (msg_type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+  }
+
+  ceph_abort();
+  return false;
+}
+
+/**
+ * Handle a 'mon ...' command that modifies the monmap.
+ *
+ * The command is parsed from the MMonCommand carried by @p op, checked
+ * against the committed monmap and/or pending_map, and any resulting
+ * change is applied to pending_map.  Except for the two branches that
+ * park the op waiting on the OSDMonitor, a reply is always sent to the
+ * client before returning.
+ *
+ * @param op the request carrying the command
+ * @return true on the two early-exit paths (unparseable command, missing
+ *         session); false when the op was queued for retry; otherwise
+ *         the value of 'propose', i.e. whether pending_map was changed
+ */
+bool MonmapMonitor::prepare_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  stringstream ss;
+  string rs;
+  int err = -EINVAL;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  /* We should follow the following rules:
+   *
+   * - 'monmap' is the current, consistent version of the monmap
+   * - 'pending_map' is the uncommitted version of the monmap
+   *
+   * All checks for the current state must be made against 'monmap'.
+   * All changes are made against 'pending_map'.
+   *
+   * If there are concurrent operations modifying 'pending_map', please
+   * follow the following rules.
+   *
+   * - if pending_map has already been changed, the second operation must
+   *   wait for the proposal to finish and be run again; This is the easiest
+   *   path to guarantee correctness but may impact performance (i.e., it
+   *   will take longer for the user to get a reply).
+   *
+   * - if the result of the second operation can be guaranteed to be
+   *   idempotent, the operation may reply to the user once the proposal
+   *   finishes; still needs to wait for the proposal to finish.
+   *
+   * - An operation _NEVER_ returns to the user based on pending state.
+   *
+   * If an operation does not modify current stable monmap, it may be
+   * serialized before current pending map, regardless of any change that
+   * has been made to the pending map -- remember, pending is uncommitted
+   * state, thus we are not bound by it.
+   */
+
+  ceph_assert(mon.monmap);
+  MonMap &monmap = *mon.monmap;
+
+
+  /* Please note:
+   *
+   * Adding or removing monitors may lead to loss of quorum.
+   *
+   * Because quorum may be lost, it's important to reply something
+   * to the user, lest she end up waiting forever for a reply. And
+   * no reply will ever be sent until quorum is formed again.
+   *
+   * On the other hand, this means we're leaking uncommitted state
+   * to the user. As such, please be mindful of the reply message.
+   *
+   * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
+   * operation and conveys its not-yet-permanent nature); whereas
+   * 'added monitor mon.foo' presumes the action has successfully
+   * completed and state has been committed, which may not be true.
+   */
+
+
+  bool propose = false;
+  if (prefix == "mon add") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    string addrstr;
+    cmd_getval(cmdmap, "addr", addrstr);
+    entity_addr_t addr;
+
+    if (!addr.parse(addrstr.c_str())) {
+      err = -EINVAL;
+      ss << "addr " << addrstr << " does not parse";
+      goto reply;
+    }
+
+    vector<string> locationvec;
+    map<string, string> loc;
+    cmd_getval(cmdmap, "location", locationvec);
+    CrushWrapper::parse_loc_map(locationvec, &loc);
+    if (locationvec.size() &&
+        !mon.get_quorum_mon_features().contains_all(
+          ceph::features::mon::FEATURE_PINGING)) {
+      err = -ENOTSUP;
+      ss << "Not all monitors support adding monitors with a location; please upgrade first!";
+      goto reply;
+    }
+    if (locationvec.size() && !loc.size()) {
+      ss << "We could not parse your input location to anything real; " << locationvec
+         << " turned into an empty map!";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
+
+    // TODO: validate location in crush map
+    if (monmap.stretch_mode_enabled && !loc.size()) {
+      ss << "We are in stretch mode and new monitors must have a location, but "
+         << "could not parse your input location to anything real; " << locationvec
+         << " turned into an empty map!";
+      err = -EINVAL;
+      goto reply;
+    }
+    // TODO: validate location against any existing stretch config
+
+    // Expand the single user-supplied address into the address vector
+    // the new monitor will be registered with.
+    entity_addrvec_t addrs;
+    if (monmap.persistent_features.contains_all(
+          ceph::features::mon::FEATURE_NAUTILUS)) {
+      if (addr.get_port() == CEPH_MON_PORT_IANA) {
+        addr.set_type(entity_addr_t::TYPE_MSGR2);
+      }
+      if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
+        // if they specified the *old* default they probably don't care
+        addr.set_port(0);
+      }
+      if (addr.get_port()) {
+        addrs.v.push_back(addr);
+      } else {
+        // no explicit port: register both a msgr2 and a legacy address
+        addr.set_type(entity_addr_t::TYPE_MSGR2);
+        addr.set_port(CEPH_MON_PORT_IANA);
+        addrs.v.push_back(addr);
+        addr.set_type(entity_addr_t::TYPE_LEGACY);
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+        addrs.v.push_back(addr);
+      }
+    } else {
+      if (addr.get_port() == 0) {
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+      }
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      addrs.v.push_back(addr);
+    }
+    dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
+
+    /**
+     * If we have a monitor with the same name and different addr, then EEXIST
+     * If we have a monitor with the same addr and different name, then EEXIST
+     * If we have a monitor with the same addr and same name, then reply
+     * with success right away (nothing needs to change).
+     * If we don't have the monitor, add it.
+     */
+
+    err = 0;
+    if (!ss.str().empty())
+      ss << "; ";
+
+    if (monmap.contains(name)) {
+      if (monmap.get_addrs(name) == addrs) {
+        // stable map contains monitor with the same name at the same address.
+        // serialize before current pending map; err is already 0.
+        ss << "mon." << name << " at " << addrs << " already exists";
+        goto reply;
+      }
+      ss << "mon." << name
+         << " already exists at address " << monmap.get_addrs(name);
+      err = -EEXIST;
+      goto reply;
+    } else if (monmap.contains(addrs)) {
+      // we established on the previous branch that name is different
+      ss << "mon." << monmap.get_name(addrs)
+         << " already exists at address " << addr;
+      err = -EEXIST;
+      goto reply;
+    }
+
+    /* Given there's no delay between proposals on the MonmapMonitor (see
+     * MonmapMonitor::should_propose()), there is no point in checking for
+     * a mismatch between name and addr on pending_map.
+     *
+     * Once we established the monitor does not exist in the committed state,
+     * we can simply go ahead and add the monitor.
+     */
+
+    pending_map.add(name, addrs);
+    pending_map.mon_info[name].crush_loc = loc;
+    pending_map.last_changed = ceph_clock_now();
+    ss << "adding mon." << name << " at " << addrs;
+    propose = true;
+    dout(0) << __func__ << " proposing new mon." << name << dendl;
+
+  } else if (prefix == "mon remove" ||
+             prefix == "mon rm") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    if (!monmap.contains(name)) {
+      err = 0;
+      ss << "mon." << name << " does not exist or has already been removed";
+      goto reply;
+    }
+
+    if (monmap.size() == 1) {
+      err = -EINVAL;
+      ss << "error: refusing removal of last monitor " << name;
+      goto reply;
+    }
+
+    if (pending_map.stretch_mode_enabled &&
+        name == pending_map.tiebreaker_mon) {
+      err = -EINVAL;
+      ss << "you cannot remove stretch mode's tiebreaker monitor";
+      goto reply;
+    }
+    /* At the time of writing, there is no risk of races when multiple clients
+     * attempt to use the same name. The reason is simple but may not be
+     * obvious.
+     *
+     * In a nutshell, we do not collate proposals on the MonmapMonitor. As
+     * soon as we return 'true' below, PaxosService::dispatch() will check if
+     * the service should propose, and - if so - the service will be marked as
+     * 'proposing' and a proposal will be triggered. The PaxosService class
+     * guarantees that once a service is marked 'proposing' no further writes
+     * will be handled.
+     *
+     * The decision on whether the service should propose or not is, in this
+     * case, made by MonmapMonitor::should_propose(), which always considers
+     * the proposal delay being 0.0 seconds. This is key for PaxosService to
+     * trigger the proposal immediately.
+     *
+     * From the above, there's no point in performing further checks on the
+     * pending_map, as we don't ever have multiple proposals in-flight in
+     * this service. As we've established the committed state contains the
+     * monitor, we can simply go ahead and remove it.
+     *
+     * Please note that the code hinges on all of the above to be true. It
+     * has been true since time immemorial and we don't see a good reason
+     * to make it sturdier at this time - mainly because we don't think it's
+     * going to change any time soon, lest for any bug that may be unwillingly
+     * introduced.
+     */
+
+    // grab the addresses before the entry disappears so we can report them
+    entity_addrvec_t addrs = pending_map.get_addrs(name);
+    pending_map.remove(name);
+    pending_map.disallowed_leaders.erase(name);
+    pending_map.last_changed = ceph_clock_now();
+    // 'removing' (not 'removed'): the change is not committed yet
+    ss << "removing mon." << name << " at " << addrs
+       << ", there will be " << pending_map.size() << " monitors";
+    propose = true;
+    err = 0;
+
+  } else if (prefix == "mon feature set") {
+
+    /* PLEASE NOTE:
+     *
+     * We currently only support setting/unsetting persistent features.
+     * This is by design, given at the moment we still don't have optional
+     * features, and, as such, there is no point introducing an interface
+     * to manipulate them. This allows us to provide a cleaner, more
+     * intuitive interface to the user, modifying solely persistent
+     * features.
+     *
+     * In the future we should consider adding another interface to handle
+     * optional features/flags; e.g., 'mon feature flag set/unset', or
+     * 'mon flag set/unset'.
+     */
+    string feature_name;
+    if (!cmd_getval(cmdmap, "feature_name", feature_name)) {
+      ss << "missing required feature name";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    mon_feature_t feature;
+    feature = ceph::features::mon::get_feature_by_name(feature_name);
+    if (feature == ceph::features::mon::FEATURE_NONE) {
+      ss << "unknown feature '" << feature_name << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "please specify '--yes-i-really-mean-it' if you "
+         << "really, **really** want to set feature '"
+         << feature << "' in the monmap.";
+      err = -EPERM;
+      goto reply;
+    }
+
+    if (!mon.get_quorum_mon_features().contains_all(feature)) {
+      ss << "current quorum does not support feature '" << feature
+         << "'; supported features: "
+         << mon.get_quorum_mon_features();
+      err = -EINVAL;
+      goto reply;
+    }
+
+    ss << "setting feature '" << feature << "'";
+
+    err = 0;
+    if (monmap.persistent_features.contains_all(feature)) {
+      dout(10) << __func__ << " feature '" << feature
+               << "' already set on monmap; no-op." << dendl;
+      goto reply;
+    }
+
+    pending_map.persistent_features.set_feature(feature);
+    pending_map.last_changed = ceph_clock_now();
+    propose = true;
+
+    dout(1) << __func__ << " " << ss.str() << "; new features will be: "
+            << "persistent = " << pending_map.persistent_features
+            // output optional nevertheless, for auditing purposes.
+            << ", optional = " << pending_map.optional_features << dendl;
+
+  } else if (prefix == "mon set-rank") {
+    string name;
+    int64_t rank;
+    if (!cmd_getval(cmdmap, "name", name) ||
+        !cmd_getval(cmdmap, "rank", rank)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    int oldrank = pending_map.get_rank(name);
+    if (oldrank < 0) {
+      ss << "mon." << name << " does not exist in monmap";
+      err = -ENOENT;
+      goto reply;
+    }
+    err = 0;
+    pending_map.set_rank(name, rank);
+    pending_map.last_changed = ceph_clock_now();
+    propose = true;
+  } else if (prefix == "mon set-addrs") {
+    string name;
+    string addrs;
+    if (!cmd_getval(cmdmap, "name", name) ||
+        !cmd_getval(cmdmap, "addrs", addrs)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!pending_map.contains(name)) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    entity_addrvec_t av;
+    if (!av.parse(addrs.c_str(), nullptr)) {
+      ss << "failed to parse addrs '" << addrs << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    for (auto& a : av.v) {
+      a.set_nonce(0);
+      if (!a.get_port()) {
+        ss << "monitor must bind to a non-zero port, not " << a;
+        err = -EINVAL;
+        goto reply;
+      }
+    }
+    err = 0;
+    pending_map.set_addrvec(name, av);
+    pending_map.last_changed = ceph_clock_now();
+    propose = true;
+  } else if (prefix == "mon set-weight") {
+    string name;
+    int64_t weight;
+    if (!cmd_getval(cmdmap, "name", name) ||
+        !cmd_getval(cmdmap, "weight", weight)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!pending_map.contains(name)) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    err = 0;
+    pending_map.set_weight(name, weight);
+    pending_map.last_changed = ceph_clock_now();
+    propose = true;
+  } else if (prefix == "mon enable-msgr2") {
+    if (!monmap.get_required_features().contains_all(
+          ceph::features::mon::FEATURE_NAUTILUS)) {
+      err = -EACCES;
+      ss << "all monitors must be running nautilus to enable v2";
+      goto reply;
+    }
+    // Only monitors that still have a single legacy address on the legacy
+    // default port are rewritten; each gets a msgr2 address prepended.
+    for (auto& i : pending_map.mon_info) {
+      if (i.second.public_addrs.v.size() == 1 &&
+          i.second.public_addrs.front().is_legacy() &&
+          i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
+        entity_addrvec_t av;
+        entity_addr_t a = i.second.public_addrs.front();
+        a.set_type(entity_addr_t::TYPE_MSGR2);
+        a.set_port(CEPH_MON_PORT_IANA);
+        av.v.push_back(a);
+        av.v.push_back(i.second.public_addrs.front());
+        dout(10) << " setting mon." << i.first
+                 << " addrs " << i.second.public_addrs
+                 << " -> " << av << dendl;
+        pending_map.set_addrvec(i.first, av);
+        propose = true;
+        pending_map.last_changed = ceph_clock_now();
+      }
+    }
+    err = 0;
+  } else if (prefix == "mon set election_strategy") {
+    if (!mon.get_quorum_mon_features().contains_all(
+          ceph::features::mon::FEATURE_PINGING)) {
+      err = -ENOTSUP;
+      ss << "Not all monitors support changing election strategies; please upgrade first!";
+      goto reply;
+    }
+    string strat;
+    MonMap::election_strategy strategy;
+    if (!cmd_getval(cmdmap, "strategy", strat)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (strat == "classic") {
+      strategy = MonMap::CLASSIC;
+    } else if (strat == "disallow") {
+      strategy = MonMap::DISALLOW;
+    } else if (strat == "connectivity") {
+      strategy = MonMap::CONNECTIVITY;
+    } else {
+      // tell the user why the command was rejected
+      ss << "unknown election strategy '" << strat << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    err = 0;
+    pending_map.strategy = strategy;
+    pending_map.last_changed = ceph_clock_now();
+    propose = true;
+  } else if (prefix == "mon add disallowed_leader") {
+    if (!mon.get_quorum_mon_features().contains_all(
+          ceph::features::mon::FEATURE_PINGING)) {
+      err = -ENOTSUP;
+      ss << "Not all monitors support changing election strategies; please upgrade first!";
+      goto reply;
+    }
+    string name;
+    if (!cmd_getval(cmdmap, "name", name)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (pending_map.strategy != MonMap::DISALLOW &&
+        pending_map.strategy != MonMap::CONNECTIVITY) {
+      ss << "You cannot disallow monitors in your current election mode";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!pending_map.contains(name)) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (pending_map.disallowed_leaders.count(name)) {
+      ss << "mon." << name << " is already disallowed";
+      err = 0;
+      goto reply;
+    }
+    if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) {
+      ss << "mon." << name << " is the only remaining allowed leader!";
+      err = -EINVAL;
+      goto reply;
+    }
+    pending_map.disallowed_leaders.insert(name);
+    pending_map.last_changed = ceph_clock_now();
+    err = 0;
+    propose = true;
+  } else if (prefix == "mon rm disallowed_leader") {
+    if (!mon.get_quorum_mon_features().contains_all(
+          ceph::features::mon::FEATURE_PINGING)) {
+      err = -ENOTSUP;
+      ss << "Not all monitors support changing election strategies; please upgrade first!";
+      goto reply;
+    }
+    string name;
+    if (!cmd_getval(cmdmap, "name", name)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (pending_map.strategy != MonMap::DISALLOW &&
+        pending_map.strategy != MonMap::CONNECTIVITY) {
+      ss << "You cannot disallow monitors in your current election mode";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!pending_map.contains(name)) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (!pending_map.disallowed_leaders.count(name)) {
+      ss << "mon." << name << " is already allowed";
+      err = 0;
+      goto reply;
+    }
+    pending_map.disallowed_leaders.erase(name);
+    pending_map.last_changed = ceph_clock_now();
+    err = 0;
+    propose = true;
+  } else if (prefix == "mon set_location") {
+    if (!mon.get_quorum_mon_features().contains_all(
+          ceph::features::mon::FEATURE_PINGING)) {
+      err = -ENOTSUP;
+      ss << "Not all monitors support monitor locations; please upgrade first!";
+      goto reply;
+    }
+    string name;
+    if (!cmd_getval(cmdmap, "name", name)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!pending_map.contains(name)) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+
+    vector<string> argvec;
+    map<string, string> loc;
+    cmd_getval(cmdmap, "args", argvec);
+    CrushWrapper::parse_loc_map(argvec, &loc);
+
+    dout(10) << "mon set_location for " << name << " to " << loc << dendl;
+
+    // TODO: validate location in crush map
+    if (!loc.size()) {
+      ss << "We could not parse your input location to anything real; " << argvec
+         << " turned into an empty map!";
+      err = -EINVAL;
+      goto reply;
+    }
+    // TODO: validate location against any existing stretch config
+    pending_map.mon_info[name].crush_loc = loc;
+    pending_map.last_changed = ceph_clock_now();
+    err = 0;
+    propose = true;
+  } else if (prefix == "mon set_new_tiebreaker") {
+    if (!pending_map.stretch_mode_enabled) {
+      err = -EINVAL;
+      ss << "Stretch mode is not enabled, so there is no tiebreaker";
+      goto reply;
+    }
+    string name;
+    if (!cmd_getval(cmdmap, "name", name)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+    const auto new_tiebreaker_info_i = pending_map.mon_info.find(name);
+    if (new_tiebreaker_info_i == pending_map.mon_info.end()) {
+      ss << "mon." << name << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    const auto& new_info = new_tiebreaker_info_i->second;
+    if (new_info.crush_loc.empty()) {
+      ss << "mon." << name << " does not have a location specified";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (!mon.osdmon()->is_readable()) {
+      dout(10) << __func__
+               << ": waiting for osdmon readable to inspect crush barrier"
+               << dendl;
+      mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
+      return false;
+    }
+    int32_t stretch_divider_id = mon.osdmon()->osdmap.stretch_mode_bucket;
+    string stretch_bucket_divider = mon.osdmon()->osdmap.crush->
+      get_type_name(stretch_divider_id);
+
+    const auto new_loc_i = new_info.crush_loc.find(stretch_bucket_divider);
+    if (new_loc_i == new_info.crush_loc.end()) {
+      ss << "mon." << name << " has a specified location, but not a "
+         << stretch_bucket_divider << ", which is the stretch divider";
+      err = -EINVAL;
+      goto reply;
+    }
+    const string& new_loc = new_loc_i->second;
+
+    // Collect every other monitor (the current tiebreaker excepted) that
+    // sits in the same dividing bucket as the proposed tiebreaker.  The
+    // current tiebreaker is identified by name so that no iterator needs
+    // to be dereferenced if it is missing from mon_info.
+    set<string> matching_mons;
+    for (const auto& mii : pending_map.mon_info) {
+      if (mii.first == name ||
+          mii.first == pending_map.tiebreaker_mon) {
+        continue;
+      }
+      const auto other_loc_i = mii.second.crush_loc.find(stretch_bucket_divider);
+      if (other_loc_i == mii.second.crush_loc.end()) { // huh
+        continue;
+      }
+      if (other_loc_i->second == new_loc) {
+        matching_mons.insert(mii.first);
+      }
+    }
+    // The error text tells the user to pass --yes-i-really-mean-it to
+    // override, so the flag has to be honoured here.
+    if (!matching_mons.empty() && !sure) {
+      ss << "mon." << name << " has location " << new_loc_i->second
+         << ", which matches mons " << matching_mons << " on the "
+         << stretch_bucket_divider << " dividing bucket for stretch mode. "
+        "Pass --yes-i-really-mean-it if you're sure you want to do this. "
+        "(You really don't.)";
+      err = -EINVAL;
+      goto reply;
+    }
+    pending_map.tiebreaker_mon = name;
+    pending_map.disallowed_leaders.insert(name);
+    pending_map.last_changed = ceph_clock_now();
+    err = 0;
+    propose = true;
+  } else if (prefix == "mon enable_stretch_mode") {
+    if (!mon.osdmon()->is_writeable()) {
+      dout(10) << __func__
+               << ":  waiting for osdmon writeable for stretch mode" << dendl;
+      mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
+      return false;
+    }
+    {
+      if (monmap.stretch_mode_enabled) {
+        ss << "stretch mode is already engaged";
+        err = -EINVAL;
+        goto reply;
+      }
+      if (pending_map.stretch_mode_enabled) {
+        ss << "stretch mode currently committing";
+        err = 0;
+        goto reply;
+      }
+      string tiebreaker_mon;
+      if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
+        ss << "must specify a tiebreaker monitor";
+        err = -EINVAL;
+        goto reply;
+      }
+      string new_crush_rule;
+      if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
+        ss << "must specify a new crush rule that spreads out copies over multiple sites";
+        err = -EINVAL;
+        goto reply;
+      }
+      string dividing_bucket;
+      if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
+        ss << "must specify a dividing bucket";
+        err = -EINVAL;
+        goto reply;
+      }
+      //okay, initial arguments make sense, check pools and cluster state
+      err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
+      if (err)
+        goto reply;
+      // plug paxos for the rest of this scope; unplugged on every exit path
+      struct Plugger {
+        Paxos &p;
+        Plugger(Paxos &p) : p(p) { p.plug(); }
+        ~Plugger() { p.unplug(); }
+      } plugger(paxos);
+
+      set<pg_pool_t*> pools;
+      bool okay = false;
+      int errcode = 0;
+
+      // dry-run every check (commit == false) before changing anything
+      mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
+                                                   &pools, new_crush_rule);
+      if (!okay) {
+        err = errcode;
+        goto reply;
+      }
+      try_enable_stretch_mode(ss, &okay, &errcode, false,
+                              tiebreaker_mon, dividing_bucket);
+      if (!okay) {
+        err = errcode;
+        goto reply;
+      }
+      mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false,
+                                             dividing_bucket, 2, pools, new_crush_rule);
+      if (!okay) {
+        err = errcode;
+        goto reply;
+      }
+      // everything looks good, actually commit the changes!
+      try_enable_stretch_mode(ss, &okay, &errcode, true,
+                              tiebreaker_mon, dividing_bucket);
+      mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
+                                             dividing_bucket,
+                                             2, // right now we only support 2 sites
+                                             pools, new_crush_rule);
+      ceph_assert(okay == true);
+    }
+    request_proposal(mon.osdmon());
+    err = 0;
+    propose = true;
+  } else {
+    ss << "unknown command " << prefix;
+    err = -EINVAL;
+  }
+
+reply:
+  getline(ss, rs);
+  mon.reply_command(op, err, rs, get_last_committed());
+  // The reply has already gone out (see the note on quorum loss above);
+  // 'propose' tells the caller whether pending_map was modified.
+  return propose;
+}
+
+/**
+ * Check whether the monitors are laid out suitably for stretch mode and,
+ * if @p commit is set, record the change in pending_map.
+ *
+ * The checks are: the pending election strategy is CONNECTIVITY, the
+ * tiebreaker exists in pending_map, every monitor in the committed monmap
+ * has a crush location entry for @p dividing_bucket, the non-tiebreaker
+ * monitors span exactly two distinct buckets, and the tiebreaker is in
+ * neither of them.
+ *
+ * Every failure path asserts !commit, so callers are expected to have
+ * done a successful dry run before calling with commit == true.
+ *
+ * @param ss              receives a human-readable reason on failure
+ * @param okay            set to true on success, false otherwise
+ * @param errcode         set to a negative errno on failure; untouched on
+ *                        success
+ * @param commit          when true, apply the change to pending_map
+ * @param tiebreaker_mon  name of the monitor to act as tiebreaker
+ * @param dividing_bucket crush location key that separates the sites
+ */
+void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+                                            int *errcode, bool commit,
+                                            const string& tiebreaker_mon,
+                                            const string& dividing_bucket)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  if (pending_map.strategy != MonMap::CONNECTIVITY) {
+    ss << "Monitors must use the connectivity strategy to enable stretch mode";
+    *errcode = -EINVAL;
+    ceph_assert(!commit);
+    return;
+  }
+  if (!pending_map.contains(tiebreaker_mon)) {
+    ss << "mon " << tiebreaker_mon << " does not seem to exist";
+    *errcode = -ENOENT;
+    ceph_assert(!commit);
+    return;
+  }
+  // monitor name -> value of its dividing_bucket location entry
+  map<string,string> buckets;
+  for (const auto& mii : mon.monmap->mon_info) {
+    const auto& mi = mii.second;
+    const auto bi = mi.crush_loc.find(dividing_bucket);
+    if (bi == mi.crush_loc.end()) {
+      ss << "Could not find location entry for " << dividing_bucket
+         << " on monitor " << mi.name;
+      *errcode = -EINVAL;
+      ceph_assert(!commit);
+      return;
+    }
+    buckets[mii.first] = bi->second;
+  }
+  string bucket1, bucket2, tiebreaker_bucket;
+  for (auto& i : buckets) {
+    if (i.first == tiebreaker_mon) {
+      tiebreaker_bucket = i.second;
+      continue;
+    }
+    if (bucket1.empty()) {
+      bucket1 = i.second;
+    }
+    if (bucket1 != i.second &&
+        bucket2.empty()) {
+      bucket2 = i.second;
+    }
+    if (bucket1 != i.second &&
+        bucket2 != i.second) {
+      ss << "There are too many monitor buckets for stretch mode, found "
+         << bucket1 << "," << bucket2 << "," << i.second;
+      *errcode = -EINVAL;
+      ceph_assert(!commit);
+      return;
+    }
+  }
+  if (bucket1.empty() || bucket2.empty()) {
+    // bucket2 is only ever filled after bucket1, so bucket1 is the only
+    // bucket that can have been found at this point (if any).
+    ss << "There are not enough monitor buckets for stretch mode;"
+       << " must have at least 2 plus the tiebreaker but only found "
+       << (bucket1.empty() ? string("none") : bucket1);
+    *errcode = -EINVAL;
+    ceph_assert(!commit);
+    return;
+  }
+  if (tiebreaker_bucket == bucket1 ||
+      tiebreaker_bucket == bucket2) {
+    ss << "The named tiebreaker monitor " << tiebreaker_mon
+       << " is in the same CRUSH bucket " << tiebreaker_bucket
+       << " as other monitors";
+    *errcode = -EINVAL;
+    ceph_assert(!commit);
+    return;
+  }
+  if (commit) {
+    // the tiebreaker is also barred from becoming leader
+    pending_map.disallowed_leaders.insert(tiebreaker_mon);
+    pending_map.tiebreaker_mon = tiebreaker_mon;
+    pending_map.stretch_mode_enabled = true;
+  }
+  *okay = true;
+}
+
+/**
+ * Add @p dead_mons to the pending map's stretch_marked_down_mons and
+ * propose the pending map.
+ */
+void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
+{
+  dout(20) << __func__ << dendl;
+  auto& marked_down = pending_map.stretch_marked_down_mons;
+  for (const auto& mon_name : dead_mons) {
+    marked_down.insert(mon_name);
+  }
+  propose_pending();
+}
+
+/**
+ * Empty the pending map's stretch_marked_down_mons and propose the
+ * pending map.
+ */
+void MonmapMonitor::trigger_healthy_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  auto& marked_down = pending_map.stretch_marked_down_mons;
+  marked_down.clear();
+  propose_pending();
+}
+
+/**
+ * Read-only screening of an MMonJoin request.
+ *
+ * @param op the join request
+ * @return true when the request needs no further handling (insufficient
+ *         caps, the monitor is already known under this name or address,
+ *         or stretch mode is on and no crush location is available);
+ *         false when it should go on to prepare_join()
+ */
+bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
+{
+  auto join = op->get_req<MMonJoin>();
+  dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
+
+  MonSession *session = op->get_session();
+  const bool capable =
+    session && session->is_capable("mon", MON_CAP_W | MON_CAP_X);
+  if (!capable) {
+    dout(10) << " insufficient caps" << dendl;
+    return true;
+  }
+
+  // Is the name already registered with a real address and an acceptable
+  // location?
+  const auto known = pending_map.mon_info.find(join->name);
+  if (known != pending_map.mon_info.end()) {
+    const auto& info = known->second;
+    const bool has_real_addr = !info.public_addrs.front().is_blank_ip();
+    const bool loc_acceptable =
+      !join->force_loc || join->crush_loc == info.crush_loc;
+    if (has_real_addr && loc_acceptable) {
+      dout(10) << " already have " << join->name << dendl;
+      return true;
+    }
+  }
+
+  // Which monitor (if any) currently owns the joining addresses?
+  string addr_name;
+  if (pending_map.contains(join->addrs)) {
+    addr_name = pending_map.get_name(join->addrs);
+  }
+  const bool addr_known = !addr_name.empty();
+
+  if (addr_known && addr_name == join->name) {
+    if (!join->force_loc ||
+        join->crush_loc.empty() ||
+        pending_map.mon_info[addr_name].crush_loc == join->crush_loc) {
+      dout(10) << " already have " << join->addrs << dendl;
+      return true;
+    }
+  }
+
+  if (pending_map.stretch_mode_enabled && join->crush_loc.empty()) {
+    // neither the request nor an existing entry supplies a location
+    if (!addr_known ||
+        pending_map.mon_info[addr_name].crush_loc.empty()) {
+      dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
+      mon.clog->info() << join->name << " attempted to join from " << join->name
+                        << ' ' << join->addrs
+                        << "; but lacks a crush_location for stretch mode";
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Apply an MMonJoin request to pending_map.
+ *
+ * Any existing entry that uses the joining addresses, and any existing
+ * entry with the joining name, is removed before the monitor is added.
+ * The new entry takes the request's crush location when force_loc is set
+ * or when the entry previously at those addresses had none; otherwise it
+ * keeps that previous location.
+ *
+ * @param op the join request
+ * @return always true
+ */
+bool MonmapMonitor::prepare_join(MonOpRequestRef op)
+{
+  auto join = op->get_req<MMonJoin>();
+  dout(0) << "adding/updating " << join->name
+          << " at " << join->addrs << " to monitor cluster" << dendl;
+
+  // remember the location of whoever held these addresses before
+  map<string,string> prior_loc;
+  if (pending_map.contains(join->addrs)) {
+    const string prior_name = pending_map.get_name(join->addrs);
+    prior_loc = pending_map.mon_info[prior_name].crush_loc;
+    pending_map.remove(prior_name);
+  }
+  if (pending_map.contains(join->name)) {
+    pending_map.remove(join->name);
+  }
+
+  pending_map.add(join->name, join->addrs);
+  auto& joined = pending_map.mon_info[join->name];
+  if (join->force_loc || prior_loc.empty()) {
+    joined.crush_loc = join->crush_loc;
+  } else {
+    joined.crush_loc = prior_loc;
+  }
+  pending_map.last_changed = ceph_clock_now();
+  return true;
+}
+
+/**
+ * Always ask for a proposal, with no delay.
+ *
+ * @param delay set to 0.0
+ * @return always true
+ */
+bool MonmapMonitor::should_propose(double& delay)
+{
+  constexpr double no_delay = 0.0;
+  delay = no_delay;
+  return true;
+}
+
+/**
+ * Read the encoded monmap of the last committed version into @p bl.
+ *
+ * @param bl receives the encoded map
+ * @return 0 on success, -ENOENT when the store has no entry for that
+ *         version, or the negative value returned by get_version()
+ */
+int MonmapMonitor::get_monmap(bufferlist &bl)
+{
+  const version_t ver = get_last_committed();
+  dout(10) << __func__ << " ver " << ver << dendl;
+
+  const bool present = mon.store->exists(get_service_name(), stringify(ver));
+  if (!present) {
+    return -ENOENT;
+  }
+
+  const int r = get_version(ver, bl);
+  if (r >= 0) {
+    return 0;
+  }
+  dout(1) << __func__ << " error obtaining monmap: "
+          << cpp_strerror(r) << dendl;
+  return r;
+}
+
+/**
+ * Run check_sub() on every subscription registered under "monmap".
+ */
+void MonmapMonitor::check_subs()
+{
+  const string type = "monmap";
+  mon.with_session_map([this, &type](const MonSessionMap& session_map) {
+      const auto found = session_map.subs.find(type);
+      if (found != session_map.subs.end()) {
+        for (auto sub : *found->second) {
+          check_sub(sub);
+        }
+      }
+    });
+}
+
+/**
+ * Send the latest monmap to a subscriber whose 'next' epoch has been
+ * reached.
+ *
+ * After sending, a one-time subscription is removed from the session
+ * map; any other subscription has its 'next' advanced past the current
+ * epoch.
+ *
+ * @param sub the subscription to check
+ */
+void MonmapMonitor::check_sub(Subscription *sub)
+{
+  const auto epoch = mon.monmap->get_epoch();
+  dout(10) << __func__
+           << " monmap next " << sub->next
+           << " have " << epoch << dendl;
+  if (sub->next > epoch) {
+    // the subscriber is waiting for a later epoch
+    return;
+  }
+
+  mon.send_latest_monmap(sub->session->con.get());
+  if (!sub->onetime) {
+    sub->next = epoch + 1;
+    return;
+  }
+  mon.with_session_map([sub](MonSessionMap& session_map) {
+      session_map.remove_sub(sub);
+    });
+}
+
+/**
+ * Periodic work: if the committed monmap has a zero 'created' stamp,
+ * fill one in and propose it.
+ *
+ * Only runs when this service is active and this monitor is the leader.
+ * The stamp is the last_changed of the earliest readable committed
+ * monmap version that has a non-zero last_changed, or the current time
+ * when no such version is found.
+ */
+void MonmapMonitor::tick()
+{
+  if (!(is_active() && mon.is_leader())) {
+    return;
+  }
+  if (!mon.monmap->created.is_zero()) {
+    return;
+  }
+
+  dout(10) << __func__ << " detected empty created stamp" << dendl;
+  utime_t stamp;
+  for (version_t v = 1; v <= get_last_committed(); ++v) {
+    bufferlist bl;
+    if (get_version(v, bl) < 0) {
+      // this version could not be read; try the next one
+      continue;
+    }
+    MonMap past;
+    auto it = bl.cbegin();
+    decode(past, it);
+    if (past.last_changed.is_zero()) {
+      continue;
+    }
+    dout(10) << __func__ << " first monmap with last_changed is "
+             << v << " with " << past.last_changed << dendl;
+    stamp = past.last_changed;
+    break;
+  }
+  if (stamp.is_zero()) {
+    stamp = ceph_clock_now();
+  }
+  dout(10) << __func__ << " updating created stamp to " << stamp << dendl;
+  pending_map.created = stamp;
+  propose_pending();
+}
diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h
new file mode 100644
index 000000000..cf22ae9f8
--- /dev/null
+++ b/src/mon/MonmapMonitor.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+/*
+ * The Monmap Monitor is used to track the monitors in the cluster.
+ */
+
+#ifndef CEPH_MONMAPMONITOR_H
+#define CEPH_MONMAPMONITOR_H
+
+#include <map>
+#include <set>
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "PaxosService.h"
+#include "MonMap.h"
+#include "MonitorDBStore.h"
+
+/**
+ * PaxosService that owns the monitor map (MonMap) and the pending edits
+ * to it.
+ *
+ * Standard-library names are spelled with an explicit std:: qualifier so
+ * that this header does not depend on using-declarations made by whoever
+ * includes it.
+ */
+class MonmapMonitor : public PaxosService {
+ public:
+  MonmapMonitor(Monitor &mn, Paxos &p, const std::string& service_name)
+    : PaxosService(mn, p, service_name)
+  {
+  }
+  MonMap pending_map; //the pending map awaiting passage
+
+  void create_initial() override;
+
+  void update_from_paxos(bool *need_bootstrap) override;
+
+  void create_pending() override;
+
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  // we always encode the full map; we have no use for full versions
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+
+  void on_active() override;
+  void apply_mon_features(const mon_feature_t& features,
+			  ceph_release_t min_mon_release);
+
+  void dump_info(ceph::Formatter *f);
+
+  bool preprocess_query(MonOpRequestRef op) override;
+  bool prepare_update(MonOpRequestRef op) override;
+
+  bool preprocess_join(MonOpRequestRef op);
+  bool prepare_join(MonOpRequestRef op);
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+
+  /**
+   * Read a stored monmap version into bl.
+   * @return 0 on success, a negative error code if the version could not
+   *         be read
+   */
+  int get_monmap(ceph::buffer::list &bl);
+
+  /*
+   * Since monitors are pretty
+   * important, this implementation will just write 0.0.
+   */
+  bool should_propose(double& delay) override;
+
+  /**
+   * Serve one "monmap" subscription: send the latest monmap when the
+   * subscriber's wanted epoch is not newer than ours, then drop a onetime
+   * subscription or advance the wanted epoch.
+   */
+  void check_sub(Subscription *sub);
+
+  /**
+   * Periodic hook; on an active leader, fills in a missing monmap
+   * 'created' stamp and proposes the change.
+   */
+  void tick() override;
+
+private:
+  /// Run check_sub() on every registered "monmap" subscription.
+  void check_subs();
+  ceph::buffer::list monmap_bl;
+  /**
+   * Check validity of inputs and monitor state to
+   * engage stretch mode. Designed to be used with
+   * OSDMonitor::try_enable_stretch_mode() where we call both twice,
+   * first with commit=false to validate.
+   * @param ss: a stringstream to write errors into
+   * @param okay: Filled to true if okay, false if validation fails
+   * @param errcode: filled with -errno if there's a problem
+   * @param commit: true if we should commit the change, false if just testing
+   * @param tiebreaker_mon: the name of the monitor to declare tiebreaker
+   * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
+   */
+  void try_enable_stretch_mode(std::stringstream& ss, bool *okay,
+			       int *errcode, bool commit,
+			       const std::string& tiebreaker_mon,
+			       const std::string& dividing_bucket);
+
+public:
+  /**
+   * Set us to degraded stretch mode. Put the dead_mons in
+   * the MonMap.
+   */
+  void trigger_degraded_stretch_mode(const std::set<std::string>& dead_mons);
+  /**
+   * Set us to healthy stretch mode: clear out the
+   * down list to allow any non-tiebreaker mon to be the leader again.
+   */
+  void trigger_healthy_stretch_mode();
+};
+
+
+#endif
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
new file mode 100644
index 000000000..3191ed5bf
--- /dev/null
+++ b/src/mon/OSDMonitor.cc
@@ -0,0 +1,14832 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <boost/algorithm/string.hpp>
+#include <experimental/iterator>
+#include <locale>
+#include <sstream>
+
+#include "mon/OSDMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/MgrStatMonitor.h"
+#include "mon/AuthMonitor.h"
+#include "mon/KVMonitor.h"
+
+#include "mon/MonitorDBStore.h"
+#include "mon/Session.h"
+
+#include "crush/CrushWrapper.h"
+#include "crush/CrushTester.h"
+#include "crush/CrushTreeDumper.h"
+
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+#include "messages/MMonCommand.h"
+#include "messages/MRemoveSnaps.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MRoute.h"
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/TextTable.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "common/strtol.h"
+#include "common/numa.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+
+#include "include/compat.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/util.h"
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/scope_guard.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodePluginRegistry;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+
+#define dout_subsys ceph_subsys_mon
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");  // store prefix; update_from_paxos() reads the "creating" key under it
+static const string OSD_METADATA_PREFIX("osd_metadata");  // store prefix, listed by get_store_prefixes()
+static const string OSD_SNAP_PREFIX("osd_snap");  // store prefix, listed by get_store_prefixes()
+
+/*
+
+  OSD snapshot metadata
+  ---------------------
+
+  -- starting with mimic, removed in octopus --
+
+  "removed_epoch_%llu_%08lx" % (pool, epoch)
+   -> interval_set<snapid_t>
+
+  "removed_snap_%llu_%016llx" % (pool, last_snap)
+   -> { first_snap, end_snap, epoch }   (last_snap = end_snap - 1)
+
+
+  -- starting with mimic --
+
+  "purged_snap_%llu_%016llx" % (pool, last_snap)
+   -> { first_snap, end_snap, epoch }   (last_snap = end_snap - 1)
+
+  - note that the {removed,purged}_snap put the last snap in the key so
+    that we can use forward iteration only to search for an epoch in an
+    interval.  e.g., to test if epoch N is removed/purged, we'll find a key
+    >= N that either does or doesn't contain the given snap.
+
+
+  -- starting with octopus --
+
+  "purged_epoch_%08lx" % epoch
+  -> map<int64_t,interval_set<snapid_t>>
+
+  */
+using namespace TOPNSPC::common;
+namespace {
+
+// Shared PriorityCache::PriCache bookkeeping for the monitor's OSDMap
+// caches.  Concrete caches only supply their current byte usage and a
+// display name.
+struct OSDMemCache : public PriorityCache::PriCache {
+  OSDMonitor *osdmon;
+  // bytes assigned per priority level
+  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+  // value computed by the most recent commit_cache_size()
+  int64_t committed_bytes = 0;
+  double cache_ratio = 0;
+
+  OSDMemCache(OSDMonitor *m) : osdmon(m) {}
+
+  // Bytes currently consumed by the concrete cache.
+  virtual uint64_t _get_used_bytes() const = 0;
+
+  // Extra bytes wanted at priority `pri`: the shortfall between what is in
+  // use and what is already assigned, or -EOPNOTSUPP for any priority other
+  // than PRI1.
+  virtual int64_t request_cache_bytes(
+      PriorityCache::Priority pri, uint64_t total_cache) const {
+    const int64_t assigned = get_cache_bytes(pri);
+
+    // All cache items are currently set to have PRI1 priority
+    if (pri != PriorityCache::Priority::PRI1) {
+      return -EOPNOTSUPP;
+    }
+
+    const int64_t wanted = _get_used_bytes();
+    if (wanted > assigned) {
+      return wanted - assigned;
+    }
+    return 0;
+  }
+
+  // Bytes assigned to one priority level.
+  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+    return cache_bytes[pri];
+  }
+
+  // Bytes assigned across all priority levels.
+  virtual int64_t get_cache_bytes() const {
+    int64_t sum = 0;
+    for (int idx = 0; idx <= PriorityCache::Priority::LAST; ++idx) {
+      sum += get_cache_bytes(static_cast<PriorityCache::Priority>(idx));
+    }
+    return sum;
+  }
+
+  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+    cache_bytes[pri] = bytes;
+  }
+
+  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+    cache_bytes[pri] += bytes;
+  }
+
+  // Record and return the chunk size PriorityCache::get_chunk() yields for
+  // the currently assigned total.
+  virtual int64_t commit_cache_size(uint64_t total_cache) {
+    const int64_t assigned_total = get_cache_bytes();
+    committed_bytes = PriorityCache::get_chunk(assigned_total, total_cache);
+    return committed_bytes;
+  }
+
+  virtual int64_t get_committed_size() const {
+    return committed_bytes;
+  }
+
+  virtual double get_cache_ratio() const {
+    return cache_ratio;
+  }
+
+  virtual void set_cache_ratio(double ratio) {
+    cache_ratio = ratio;
+  }
+
+  // Human-readable cache name.
+  virtual string get_cache_name() const = 0;
+};
+
+// OSDMemCache adapter reporting on OSDMonitor::inc_osd_cache.
+struct IncCache : public OSDMemCache {
+  IncCache(OSDMonitor *m) : OSDMemCache(m) {}
+
+  virtual string get_cache_name() const {
+    return "OSDMap Inc Cache";
+  }
+
+  // Byte usage as reported by the underlying cache.
+  virtual uint64_t _get_used_bytes() const {
+    const auto& cache = osdmon->inc_osd_cache;
+    return cache.get_bytes();
+  }
+
+  // Entry count as reported by the underlying cache.
+  uint64_t _get_num_osdmaps() const {
+    const auto& cache = osdmon->inc_osd_cache;
+    return cache.get_size();
+  }
+};
+
+// OSDMemCache adapter reporting on OSDMonitor::full_osd_cache.
+struct FullCache : public OSDMemCache {
+  FullCache(OSDMonitor *m) : OSDMemCache(m) {}
+
+  virtual string get_cache_name() const {
+    return "OSDMap Full Cache";
+  }
+
+  // Byte usage as reported by the underlying cache.
+  virtual uint64_t _get_used_bytes() const {
+    const auto& cache = osdmon->full_osd_cache;
+    return cache.get_bytes();
+  }
+
+  // Entry count as reported by the underlying cache.
+  uint64_t _get_num_osdmaps() const {
+    const auto& cache = osdmon->full_osd_cache;
+    return cache.get_size();
+  }
+};
+
+std::shared_ptr<IncCache> inc_cache;  // file-local; assigned in the OSDMonitor constructor
+std::shared_ptr<FullCache> full_cache;  // file-local; assigned in the OSDMonitor constructor
+
+const uint32_t MAX_POOL_APPLICATIONS = 4;  // NOTE(review): presumably a cap on pool application metadata; users are outside this chunk
+const uint32_t MAX_POOL_APPLICATION_KEYS = 64;  // NOTE(review): see above, confirm at the point of use
+const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;  // NOTE(review): see above, confirm at the point of use
+
+// True when `grant` carries the OSD write bit and applies either to
+// everything or to exactly the pool named by `pool_name` (which may be
+// null when no pool name is available).
+bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
+  // Note: this doesn't include support for the application tag match
+  const bool has_write = (grant.spec.allow & OSD_CAP_W) != 0;
+  if (!has_write) {
+    return false;
+  }
+
+  auto& match = grant.match;
+  if (match.is_match_all()) {
+    return true;
+  }
+
+  if (pool_name == nullptr) {
+    return false;
+  }
+  auto& granted_pool = match.pool_namespace.pool_name;
+  return !granted_pool.empty() && granted_pool == *pool_name;
+}
+
+// Decide whether `entity_name` may run unmanaged-snapshot pool operations.
+// Permission comes from either
+//   - a mon cap allowing "osd pool op unmanaged-snap" (restricted to
+//     `pool_name` when one is given), or
+//   - OSD caps in the auth db that allow everything, or that grant write
+//     access to all pools or to `pool_name`.
+// Missing, corrupt or unparsable OSD cap data denies the request.
+bool is_unmanaged_snap_op_permitted(CephContext* cct,
+                                    const KeyServer& key_server,
+                                    const EntityName& entity_name,
+                                    const MonCap& mon_caps,
+				    const entity_addr_t& peer_socket_addr,
+                                    const std::string* pool_name)
+{
+  using CommandArgs = std::map<std::string, std::string>;
+
+  // pool DNE, require unrestricted cap
+  CommandArgs cmd_args;
+  if (pool_name != nullptr) {
+    cmd_args.emplace("poolname", *pool_name);
+  }
+
+  const bool mon_allows = mon_caps.is_capable(
+    cct, entity_name, "osd",
+    "osd pool op unmanaged-snap",
+    cmd_args,
+    false, true, false,
+    peer_socket_addr);
+  if (mon_allows) {
+    return true;
+  }
+
+  AuthCapsInfo caps_info;
+  const bool have_caps = key_server.get_service_caps(
+    entity_name, CEPH_ENTITY_TYPE_OSD, caps_info);
+  if (!have_caps) {
+    dout(10) << "unable to locate OSD cap data for " << entity_name
+             << " in auth db" << dendl;
+    return false;
+  }
+
+  // An empty cap blob leaves caps_str empty and is handed to the parser
+  // as-is.
+  string caps_str;
+  if (caps_info.caps.length() > 0) {
+    auto it = caps_info.caps.cbegin();
+    try {
+      decode(caps_str, it);
+    } catch (const ceph::buffer::error &err) {
+      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
+           << dendl;
+      return false;
+    }
+  }
+
+  OSDCap osd_cap;
+  if (!osd_cap.parse(caps_str, nullptr)) {
+    dout(10) << "unable to parse OSD cap data for " << entity_name
+             << " in auth db" << dendl;
+    return false;
+  }
+
+  // if the entity has write permissions in one or all pools, permit
+  // usage of unmanaged-snapshots
+  if (osd_cap.allow_all()) {
+    return true;
+  }
+
+  for (auto& grant : osd_cap.grants) {
+    if (!grant.profile.is_valid()) {
+      // plain grant: judge it directly
+      if (is_osd_writable(grant, pool_name)) {
+        return true;
+      }
+      continue;
+    }
+    // profile grant: judge each grant listed under the profile
+    for (auto& profile_grant : grant.profile_grants) {
+      if (is_osd_writable(profile_grant, pool_name)) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+} // anonymous namespace
+
+// Record that placement seed `ps` of a pool with `pg_num` PGs was last
+// clean at `last_epoch_clean`, keeping `floor` (lowest recorded epoch) and
+// `next_missing` (first slot still at 0) up to date.  Reports for seeds
+// beyond pg_num and reports that do not advance the stored epoch are
+// ignored.
+void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
+				 epoch_t last_epoch_clean)
+{
+  if (ps >= pg_num) {
+    // removed PG
+    return;
+  }
+
+  epoch_by_pg.resize(pg_num, 0);
+  const auto previous = epoch_by_pg[ps];
+  if (previous >= last_epoch_clean) {
+    // stale lec
+    return;
+  }
+  epoch_by_pg[ps] = last_epoch_clean;
+
+  if (last_epoch_clean < floor) {
+    floor = last_epoch_clean;
+  } else if (last_epoch_clean > floor && previous == floor) {
+    // probably should increase floor?
+    floor = *std::min_element(std::begin(epoch_by_pg),
+			      std::end(epoch_by_pg));
+  }
+
+  // Only a report for the first unreported slot can move the cursor.
+  if (ps == next_missing) {
+    while (next_missing < epoch_by_pg.size() &&
+	   epoch_by_pg[next_missing] != 0) {
+      next_missing++;
+    }
+  }
+}
+
+void LastEpochClean::remove_pool(uint64_t pool)  // forget all per-PG reports kept for `pool`
+{
+  report_by_pool.erase(pool);  // no-op when the pool was never reported
+}
+
+// Forward a PG's last-epoch-clean report to the record of the pool that
+// owns it.
+void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
+			    epoch_t last_epoch_clean)
+{
+  // operator[] yields the pool's record, creating it if this is the first
+  // report for the pool
+  report_by_pool[pg.pool()].report(pg_num, pg.ps(), last_epoch_clean);
+}
+
+// Lowest last-epoch-clean across all pools of `latest`, capped at the
+// map's own epoch.  Returns 0 as soon as any pool has no record at all or
+// still has PGs that have not reported.
+epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
+{
+  auto lowest = latest.get_epoch();
+  for (auto& [pool_id, pool_info] : latest.get_pools()) {
+    const auto rec = report_by_pool.find(pool_id);
+    if (rec == report_by_pool.end() ||
+        rec->second.next_missing < pool_info.get_pg_num()) {
+      return 0;
+    }
+    if (rec->second.floor < lowest) {
+      lowest = rec->second.floor;
+    }
+  }
+  return lowest;
+}
+
+// Emit a "per_pool" array with one {poolid, floor} object per tracked pool.
+void LastEpochClean::dump(Formatter *f) const
+{
+  f->open_array_section("per_pool");
+  for (const auto& entry : report_by_pool) {
+    f->open_object_section("pool");
+    f->dump_unsigned("poolid", entry.first);
+    f->dump_unsigned("floor", entry.second.floor);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Completion context: on success it logs how long the mapping for `epoch`
+// took (measured from construction) and then refreshes the creating-PG
+// state and the pg-creates subscriptions.  Negative results are ignored.
+class C_UpdateCreatingPGs : public Context {
+public:
+  OSDMonitor *osdmon;
+  utime_t start;
+  epoch_t epoch;
+  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
+    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
+  void finish(int r) override {
+    if (r < 0) {
+      return;
+    }
+    const utime_t finished = ceph_clock_now();
+    dout(10) << "osdmap epoch " << epoch << " mapping took "
+	     << (finished - start) << " seconds" << dendl;
+    osdmon->update_creating_pgs();
+    osdmon->check_pg_creates_subs();
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, osdmap)
+// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
+  std::ostream& out = *_dout;
+  out << "mon." << mon.name << "@" << mon.rank;
+  out << "(" << mon.get_state_name() << ").osd e" << osdmap.get_epoch() << " ";
+  return out;
+}
+
+// Construct the OSD monitor service: size both OSDMap caches from
+// mon_osd_cache_size, create the file-local cache adapters, register for
+// config-change notifications and load the initial cache-size settings.
+// A failure to load those settings is only logged.
+OSDMonitor::OSDMonitor(
+  CephContext *cct,
+  Monitor &mn,
+  Paxos &p,
+  const string& service_name)
+ : PaxosService(mn, p, service_name),
+   cct(cct),
+   inc_osd_cache(g_conf()->mon_osd_cache_size),
+   full_osd_cache(g_conf()->mon_osd_cache_size),
+   has_osdmap_manifest(false),
+   mapper(mn.cct, &mn.cpu_tp)
+{
+  inc_cache = std::make_shared<IncCache>(this);
+  full_cache = std::make_shared<FullCache>(this);
+
+  cct->_conf.add_observer(this);
+
+  if (_set_cache_sizes() < 0) {
+    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
+         << g_conf()->mon_osd_cache_size
+         << ") without priority cache management"
+         << dendl;
+  }
+}
+
+// Null-terminated list of the config options this observer reacts to in
+// handle_conf_change().
+const char **OSDMonitor::get_tracked_conf_keys() const
+{
+  static const char* tracked_keys[] = {
+    "mon_memory_target",
+    "mon_memory_autotune",
+    "rocksdb_cache_size",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+// Config observer callback.  A change to mon_memory_autotune toggles cache
+// autotuning; a change to mon_memory_target or rocksdb_cache_size
+// recomputes the cache settings, and a failure there is logged.
+void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  dout(10) << __func__ << " " << changed << dendl;
+
+  if (changed.count("mon_memory_autotune") > 0) {
+    _set_cache_autotuning();
+  }
+
+  const bool sizes_changed =
+    changed.count("mon_memory_target") > 0 ||
+    changed.count("rocksdb_cache_size") > 0;
+  if (!sizes_changed) {
+    return;
+  }
+
+  if (_update_mon_cache_settings() >= 0) {
+    return;
+  }
+  derr << __func__ << " mon_memory_target:"
+       << g_conf()->mon_memory_target
+       << " rocksdb_cache_size:"
+       << g_conf()->rocksdb_cache_size
+       << ". Unable to update cache size."
+       << dendl;
+}
+
+// Bring the priority cache manager (pcm) in line with the
+// mon_memory_autotune option: drop it when autotuning is switched off,
+// register the caches when it is switched on and no manager exists yet.
+// NOTE(review): the disable path resets pcm but leaves the
+// mon_memory_autotune member untouched, and update_from_paxos() registers
+// again whenever that member is true and pcm is null -- confirm this is
+// intended.
+void OSDMonitor::_set_cache_autotuning()
+{
+  if (pcm != nullptr && !g_conf()->mon_memory_autotune) {
+    // Disable cache autotuning
+    std::lock_guard l(balancer_lock);
+    pcm = nullptr;
+  }
+
+  if (pcm == nullptr && g_conf()->mon_memory_autotune) {
+    const int rc = register_cache_with_pcm();
+    if (rc < 0) {
+      dout(10) << __func__
+               << " Error while registering osdmon caches with pcm."
+               << " Cache auto tuning not enabled."
+               << dendl;
+    }
+    // autotuning is on only if registration succeeded
+    mon_memory_autotune = (rc >= 0);
+  }
+}
+
+int OSDMonitor::_update_mon_cache_settings()  // apply changed mon_memory_target / rocksdb_cache_size; returns 0 or -EINVAL
+{
+  if (g_conf()->mon_memory_target <= 0 ||
+      g_conf()->mon_memory_target < mon_memory_min ||
+      g_conf()->rocksdb_cache_size <= 0) {
+    return -EINVAL;  // reject before touching any cached setting
+  }
+
+  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
+    derr << __func__ << " not using pcm and rocksdb" << dendl;
+    return -EINVAL;
+  }
+
+  uint64_t old_mon_memory_target = mon_memory_target;  // saved for rollback if the ratios cannot be set
+  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;  // saved for rollback if the ratios cannot be set
+
+  // Set the new pcm memory cache sizes
+  mon_memory_target = g_conf()->mon_memory_target;
+  rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+
+  uint64_t base = mon_memory_base;
+  double fragmentation = mon_memory_fragmentation;
+  uint64_t target = mon_memory_target;
+  uint64_t min = mon_memory_min;
+  uint64_t max = min;  // stays at min unless the discounted target leaves room above base + min
+
+  uint64_t ltarget = (1.0 - fragmentation) * target;  // target reduced by the expected fragmentation share
+  if (ltarget > base + min) {
+    max = ltarget - base;
+  }
+
+  int r = _set_cache_ratios();
+  if (r < 0) {
+    derr << __func__ << " Cache ratios for pcm could not be set."
+         << " Review the kv (rocksdb) and mon_memory_target sizes."
+         << dendl;
+    mon_memory_target = old_mon_memory_target;  // roll back to the previous values
+    rocksdb_cache_size = old_rocksdb_cache_size;
+    return -EINVAL;
+  }
+
+  if (mon_memory_autotune && pcm != nullptr) {
+    std::lock_guard l(balancer_lock);  // held for the rest of this branch
+    // set pcm cache levels
+    pcm->set_target_memory(target);
+    pcm->set_min_memory(min);
+    pcm->set_max_memory(max);
+    // tune memory based on new values
+    pcm->tune_memory();
+    pcm->balance();
+    _set_new_cache_sizes();
+    dout(1) << __func__ << " Updated mon cache setting."
+             << " target: " << target
+             << " min: " << min
+             << " max: " << max
+             << dendl;
+  }
+  return 0;  // also reached when autotuning is off: the new sizes are stored but pcm is not retuned
+}
+
+// Load the cache sizing options into the corresponding members when
+// mon_memory_autotune is enabled, and give both OSDMap caches their
+// minimum byte budget.  Does nothing (and returns 0) when autotuning is
+// off; returns -EINVAL when the memory target or the minimum is not
+// positive.
+int OSDMonitor::_set_cache_sizes()
+{
+  if (!g_conf()->mon_memory_autotune) {
+    return 0;
+  }
+
+  // set the new osdmon cache targets to be managed by pcm
+  mon_osd_cache_size = g_conf()->mon_osd_cache_size;
+  rocksdb_cache_size = g_conf()->rocksdb_cache_size;
+  mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+  mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+  mon_memory_target = g_conf()->mon_memory_target;
+  mon_memory_min = g_conf()->mon_osd_cache_size_min;
+
+  const bool sizes_valid = mon_memory_target > 0 && mon_memory_min > 0;
+  if (!sizes_valid) {
+    derr << __func__ << " mon_memory_target:" << mon_memory_target
+         << " mon_memory_min:" << mon_memory_min
+         << ". Invalid size option(s) provided."
+         << dendl;
+    return -EINVAL;
+  }
+
+  // Set the initial inc and full LRU cache sizes
+  inc_osd_cache.set_bytes(mon_memory_min);
+  full_osd_cache.set_bytes(mon_memory_min);
+  mon_memory_autotune = g_conf()->mon_memory_autotune;
+  return 0;
+}
+
+bool OSDMonitor::_have_pending_crush()  // true when the pending incremental carries an encoded crush map
+{
+  return pending_inc.crush.length() > 0;
+}
+
+CrushWrapper &OSDMonitor::_get_stable_crush()  // crush map of the current in-memory osdmap, ignoring pending changes
+{
+  return *osdmap.crush;
+}
+
+// Decode into `newcrush` the crush map that is currently pending: the one
+// staged in pending_inc if there is one, otherwise a re-encoded copy of the
+// current osdmap's crush map.
+void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
+{
+  bufferlist encoded;
+  if (pending_inc.crush.length() == 0) {
+    osdmap.crush->encode(encoded, CEPH_FEATURES_SUPPORTED_DEFAULT);
+  } else {
+    encoded = pending_inc.crush;
+  }
+
+  auto it = encoded.cbegin();
+  newcrush.decode(it);
+}
+
+void OSDMonitor::create_initial()  // build the epoch-1 osdmap and stage it as a full map in pending_inc
+{
+  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;
+
+  OSDMap newmap;
+
+  bufferlist bl;
+  mon.store->get("mkfs", "osdmap", bl);  // optional osdmap stored under the "mkfs" prefix; bl stays empty if absent
+
+  if (bl.length()) {
+    newmap.decode(bl);
+    newmap.set_fsid(mon.monmap->fsid);  // the stored map is forced to this cluster's fsid
+  } else {
+    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
+  }
+  newmap.set_epoch(1);
+  newmap.created = newmap.modified = ceph_clock_now();
+
+  // new clusters should sort bitwise by default.
+  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
+
+  newmap.flags |=
+    CEPH_OSDMAP_RECOVERY_DELETES |
+    CEPH_OSDMAP_PURGED_SNAPDIRS |
+    CEPH_OSDMAP_PGLOG_HARDLIMIT;
+  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
+  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;  // values above 1.0 are taken as percentages
+  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
+  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;  // same percentage handling
+  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
+  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;  // same percentage handling
+
+  // new cluster should require latest by default
+  if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
+    if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
+      derr << __func__ << " mon_debug_no_require_pacific and octopus=true" << dendl;
+      newmap.require_osd_release = ceph_release_t::nautilus;  // both debug switches set: fall back two releases
+    } else {
+      derr << __func__ << " mon_debug_no_require_pacific=true" << dendl;
+      newmap.require_osd_release = ceph_release_t::octopus;
+    }
+  } else {
+    newmap.require_osd_release = ceph_release_t::pacific;
+  }
+
+  if (newmap.require_osd_release >= ceph_release_t::octopus) {
+    ceph_release_t r = ceph_release_from_name(
+      g_conf()->mon_osd_initial_require_min_compat_client);
+    if (!r) {
+      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");  // an unrecognised release name aborts
+    }
+    newmap.require_min_compat_client = r;
+  }
+
+  // encode into pending incremental
+  uint64_t features = newmap.get_encoding_features();
+  newmap.encode(pending_inc.fullmap,
+                features | CEPH_FEATURE_RESERVED);
+  pending_inc.full_crc = newmap.get_crc();  // crc recorded alongside the encoded full map
+  dout(20) << " full crc " << pending_inc.full_crc << dendl;
+}
+
+void OSDMonitor::get_store_prefixes(std::set<string>& s) const  // add every store prefix this service uses to `s`
+{
+  s.insert(service_name);  // the service's own prefix
+  s.insert(OSD_PG_CREATING_PREFIX);
+  s.insert(OSD_METADATA_PREFIX);
+  s.insert(OSD_SNAP_PREFIX);
+}
+
+void OSDMonitor::update_from_paxos(bool *need_bootstrap)  // catch the in-memory osdmap up to the last committed version; need_bootstrap is not used in this body
+{
+  // we really don't care if the version has been updated, because we may
+  // have trimmed without having increased the last committed; yet, we may
+  // need to update the in-memory manifest.
+  load_osdmap_manifest();
+
+  version_t version = get_last_committed();
+  if (version == osdmap.epoch)
+    return;  // already current
+  ceph_assert(version > osdmap.epoch);
+
+  dout(15) << "update_from_paxos paxos e " << version
+	   << ", my e " << osdmap.epoch << dendl;
+
+  int prev_num_up_osd = osdmap.num_up_osd;  // compared against the new count in the stretch-mode recovery check below
+
+  if (mapping_job) {
+    if (!mapping_job->is_done()) {
+      dout(1) << __func__ << " mapping job "
+	      << mapping_job.get() << " did not complete, "
+	      << mapping_job->shards << " left, canceling" << dendl;
+      mapping_job->abort();
+    }
+    mapping_job.reset();  // any previous job is dropped before the map changes
+  }
+
+  load_health();
+
+  /*
+   * We will possibly have a stashed latest that *we* wrote, and we will
+   * always be sure to have the oldest full map in the first..last range
+   * due to encode_trim_extra(), which includes the oldest full map in the trim
+   * transaction.
+   *
+   * encode_trim_extra() does not however write the full map's
+   * version to 'full_latest'.  This is only done when we are building the
+   * full maps from the incremental versions.  But don't panic!  We make sure
+   * that the following conditions find whichever full map version is newer.
+   */
+  version_t latest_full = get_version_latest_full();
+  if (latest_full == 0 && get_first_committed() > 1)
+    latest_full = get_first_committed();
+
+  if (get_first_committed() > 1 &&
+      latest_full < get_first_committed()) {
+    // the monitor could be just sync'ed with its peer, and the latest_full key
+    // is not encoded in the paxos commits in encode_pending(), so we need to
+    // make sure we get it pointing to a proper version.
+    version_t lc = get_last_committed();
+    version_t fc = get_first_committed();
+
+    dout(10) << __func__ << " looking for valid full map in interval"
+	     << " [" << fc << ", " << lc << "]" << dendl;
+
+    latest_full = 0;
+    for (version_t v = lc; v >= fc; v--) {  // newest first; fc > 1 in this branch, so v never reaches 0
+      string full_key = "full_" + stringify(v);
+      if (mon.store->exists(get_service_name(), full_key)) {
+        dout(10) << __func__ << " found latest full map v " << v << dendl;
+        latest_full = v;
+        break;
+      }
+    }
+
+    ceph_assert(latest_full > 0);  // at least one full map must exist in [fc, lc]
+    auto t(std::make_shared<MonitorDBStore::Transaction>());
+    put_version_latest_full(t, latest_full);
+    mon.store->apply_transaction(t);
+    dout(10) << __func__ << " updated the on-disk full map version to "
+             << latest_full << dendl;
+  }
+
+  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
+    bufferlist latest_bl;
+    get_version_full(latest_full, latest_bl);
+    ceph_assert(latest_bl.length() != 0);
+    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
+    osdmap = OSDMap();  // start from a fresh map before decoding the stored full map
+    osdmap.decode(latest_bl);
+  }
+
+  bufferlist bl;
+  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {  // get() returned 0
+    auto p = bl.cbegin();
+    std::lock_guard<std::mutex> l(creating_pgs_lock);
+    creating_pgs.decode(p);
+    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
+	    << creating_pgs.last_scan_epoch
+	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
+  } else {
+    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
+	    << dendl;
+  }
+
+  // walk through incrementals
+  MonitorDBStore::TransactionRef t;  // created lazily; flushed in batches below
+  size_t tx_size = 0;  // bytes of full maps encoded since the last flush
+  while (version > osdmap.epoch) {
+    bufferlist inc_bl;
+    int err = get_version(osdmap.epoch+1, inc_bl);
+    ceph_assert(err == 0);
+    ceph_assert(inc_bl.length());
+    // set priority cache manager levels if the osdmap is
+    // being populated for the first time.
+    if (mon_memory_autotune && pcm == nullptr) {
+      int r = register_cache_with_pcm();
+      if (r < 0) {
+        dout(10) << __func__
+                 << " Error while registering osdmon caches with pcm."
+                 << " Proceeding without cache auto tuning."
+                 << dendl;
+      }
+    }
+
+    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
+	    << dendl;
+    OSDMap::Incremental inc(inc_bl);
+    err = osdmap.apply_incremental(inc);
+    ceph_assert(err == 0);
+
+    if (!t)
+      t.reset(new MonitorDBStore::Transaction);
+
+    // Write out the full map for all past epochs.  Encode the full
+    // map with the same features as the incremental.  If we don't
+    // know, use the quorum features.  If we don't know those either,
+    // encode with all features.
+    uint64_t f = inc.encode_features;
+    if (!f)
+      f = mon.get_quorum_con_features();
+    if (!f)
+      f = -1;  // all bits set
+    bufferlist full_bl;
+    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
+    tx_size += full_bl.length();
+
+    bufferlist orig_full_bl;
+    get_version_full(osdmap.epoch, orig_full_bl);  // empty unless a full map for this epoch is already stored
+    if (orig_full_bl.length()) {
+      // the primary provided the full map
+      ceph_assert(inc.have_crc);
+      if (inc.full_crc != osdmap.crc) {
+	// This will happen if the mons were running mixed versions in
+	// the past or some other circumstance made the full encoded
+	// maps divergent.  Reloading here will bring us back into
+	// sync with the primary for this and all future maps.  OSDs
+	// will also be brought back into sync when they discover the
+	// crc mismatch and request a full map from a mon.
+	derr << __func__ << " full map CRC mismatch, resetting to canonical"
+	     << dendl;
+
+	dout(20) << __func__ << " my (bad) full osdmap:\n";
+	JSONFormatter jf(true);
+	jf.dump_object("osdmap", osdmap);
+	jf.flush(*_dout);
+	*_dout << "\nhexdump:\n";
+	full_bl.hexdump(*_dout);
+	*_dout << dendl;
+
+	osdmap = OSDMap();
+	osdmap.decode(orig_full_bl);  // adopt the stored full map in place of our own result
+
+	dout(20) << __func__ << " canonical full osdmap:\n";
+	JSONFormatter jf(true);
+	jf.dump_object("osdmap", osdmap);
+	jf.flush(*_dout);
+	*_dout << "\nhexdump:\n";
+	orig_full_bl.hexdump(*_dout);
+	*_dout << dendl;
+      }
+    } else {
+      ceph_assert(!inc.have_crc);
+      put_version_full(t, osdmap.epoch, full_bl);  // no stored full map for this epoch: persist the one we just encoded
+    }
+    put_version_latest_full(t, osdmap.epoch);
+
+    // share
+    dout(1) << osdmap << dendl;
+
+    if (osdmap.epoch == 1) {
+      t->erase("mkfs", "osdmap");  // the mkfs osdmap is erased once epoch 1 is applied
+    }
+
+    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
+      mon.store->apply_transaction(t);  // flush early so a single transaction does not grow without bound
+      t = MonitorDBStore::TransactionRef();
+      tx_size = 0;
+    }
+    for (const auto [osd, state] : inc.new_state) {
+      if (state & CEPH_OSD_UP) {
+	// could be marked up *or* down, but we're too lazy to check which
+	last_osd_report.erase(osd);
+      }
+      if (state & CEPH_OSD_OUT) {
+        // could be marked in *or* out, but we can safely drop it
+        osd_epochs.erase(osd);
+      }
+    }
+    for (const auto [osd, weight] : inc.new_weight) {
+      if (weight == CEPH_OSD_OUT) {
+        // manually marked out, so drop it
+        osd_epochs.erase(osd);
+      }
+    }
+  }
+
+  if (t) {
+    mon.store->apply_transaction(t);  // apply whatever is left from the last batch
+  }
+
+  bool marked_osd_down = false;  // set when an OSD is newly added to down_pending_out
+  for (int o = 0; o < osdmap.get_max_osd(); o++) {
+    if (osdmap.is_out(o))
+      continue;
+    auto found = down_pending_out.find(o);
+    if (osdmap.is_down(o)) {
+      // populate down -> out map
+      if (found == down_pending_out.end()) {
+        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
+        down_pending_out[o] = ceph_clock_now();
+	marked_osd_down = true;
+      }
+    } else {
+      if (found != down_pending_out.end()) {
+        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
+        down_pending_out.erase(found);
+      }
+    }
+  }
+  // XXX: need to trim MonSession connected with a osd whose id > max_osd?
+
+  check_osdmap_subs();
+  check_pg_creates_subs();
+
+  share_map_with_random_osd();
+  update_logger();
+  process_failures();
+
+  // make sure our feature bits reflect the latest map
+  update_msgr_features();
+
+  if (!mon.is_leader()) {
+    // will be called by on_active() on the leader, avoid doing so twice
+    start_mapping();
+  }
+  if (osdmap.stretch_mode_enabled) {
+    dout(20) << "Stretch mode enabled in this map" << dendl;
+    mon.try_engage_stretch_mode();
+    if (osdmap.degraded_stretch_mode) {
+      dout(20) << "Degraded stretch mode set in this map" << dendl;
+      if (!osdmap.recovering_stretch_mode) {
+	mon.set_degraded_stretch_mode();
+	if (prev_num_up_osd < osdmap.num_up_osd &&
+	    (osdmap.num_up_osd / (double)osdmap.num_osd) >
+	    cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
+	  // TODO: This works for 2-site clusters when the OSD maps are appropriately
+	  // trimmed and everything is "normal" but not if you have a lot of out OSDs
+	  // you're ignoring or in some really degenerate failure cases
+	  dout(10) << "Enabling recovery stretch mode in this map" << dendl;
+	  mon.go_recovery_stretch_mode();
+	}
+      } else {
+	mon.set_recovery_stretch_mode();
+      }
+    } else {
+      mon.set_healthy_stretch_mode();
+    }
+    if (marked_osd_down &&
+	(!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
+      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
+      mon.maybe_go_degraded_stretch_mode();
+    }
+  }
+}
+
+/**
+ * Create the priority cache manager (pcm) and register the mon caches.
+ *
+ * Checks the configured memory sizes, derives the min/max cache budget,
+ * obtains the store's priority cache, applies the cache ratios and then
+ * builds the PriorityCache::Manager with the "kv", "inc" and "full"
+ * caches inserted.
+ *
+ * @return 0 on success, -EINVAL when the memory sizes are not positive,
+ *         the store has no priority cache, or the ratios cannot be set.
+ */
+int OSDMonitor::register_cache_with_pcm()
+{
+  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
+    derr << __func__ << " Invalid memory size specified for mon caches."
+         << " Caches will not be auto-tuned."
+         << dendl;
+    return -EINVAL;
+  }
+
+  const uint64_t reserved_base = mon_memory_base;
+  const double fragmentation = mon_memory_fragmentation;
+  // For calculating total target memory, consider rocksdb cache size.
+  const uint64_t target = mon_memory_target;
+  const uint64_t cache_min = mon_memory_min;
+
+  // Same logic as bluestore for the upper bound: discount the expected
+  // fragmentation from the target, then set aside the base memory assumed
+  // for OSDMaps.  If nothing is left above the minimum, max == min.
+  const uint64_t usable_target = (1.0 - fragmentation) * target;
+  const uint64_t cache_max =
+    (usable_target > reserved_base + cache_min)
+      ? usable_target - reserved_base
+      : cache_min;
+
+  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
+  if (!rocksdb_binned_kv_cache) {
+    derr << __func__ << " not using rocksdb" << dendl;
+    return -EINVAL;
+  }
+
+  if (const int r = _set_cache_ratios(); r < 0) {
+    derr << __func__ << " Cache ratios for pcm could not be set."
+         << " Review the kv (rocksdb) and mon_memory_target sizes."
+         << dendl;
+    return -EINVAL;
+  }
+
+  pcm = std::make_shared<PriorityCache::Manager>(
+      cct, cache_min, cache_max, target, true);
+  pcm->insert("kv", rocksdb_binned_kv_cache, true);
+  pcm->insert("inc", inc_cache, true);
+  pcm->insert("full", full_cache, true);
+  dout(1) << __func__ << " pcm target: " << target
+           << " pcm max: " << cache_max
+           << " pcm min: " << cache_min
+           << " inc_osd_cache size: " << inc_osd_cache.get_size()
+           << dendl;
+  return 0;
+}
+
+/**
+ * Recompute and apply the cache ratios of the kv (rocksdb), inc and full
+ * caches.
+ *
+ * The kv ratio is rocksdb_cache_size / mon_memory_target; whatever remains
+ * is split evenly between the inc and full caches.
+ *
+ * @return 0 on success, -EINVAL if the kv ratio would be >= 1.0 (in which
+ *         case the previous cache_kv_ratio is restored and nothing is
+ *         applied).
+ */
+int OSDMonitor::_set_cache_ratios()
+{
+  // remember the current value so a rejected ratio can be rolled back
+  const double previous_kv_ratio = cache_kv_ratio;
+
+  // Set the cache ratios for kv(rocksdb), inc and full caches
+  cache_kv_ratio = static_cast<double>(rocksdb_cache_size) /
+                   static_cast<double>(mon_memory_target);
+  if (cache_kv_ratio >= 1.0) {
+    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
+         << ") must be in range [0,<1.0]."
+         << dendl;
+    cache_kv_ratio = previous_kv_ratio;
+    return -EINVAL;
+  }
+  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
+
+  // inc and full share the remainder equally
+  cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
+  cache_inc_ratio = cache_full_ratio;
+  inc_cache->set_cache_ratio(cache_inc_ratio);
+  full_cache->set_cache_ratio(cache_full_ratio);
+
+  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
+           << " inc ratio " << cache_inc_ratio
+           << " full ratio " << cache_full_ratio
+           << dendl;
+  return 0;
+}
+
+/**
+ * (Re)start the background mapping job for the current osdmap.
+ *
+ * Any job still referenced by mapping_job is aborted first.  When the map
+ * has no pools no new job is started and mapping_job is reset to nullptr;
+ * otherwise a new update is started and a C_UpdateCreatingPGs is attached
+ * as its finish event.
+ */
+void OSDMonitor::start_mapping()
+{
+  // initiate mapping job
+  if (mapping_job) {
+    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+             << dendl;
+    mapping_job->abort();
+  }
+
+  if (osdmap.get_pools().empty()) {
+    dout(10) << __func__ << " no pools, no mapping job" << dendl;
+    mapping_job = nullptr;
+    return;
+  }
+
+  // the finish event is created before the job so its start stamp can be
+  // logged together with the job pointer
+  auto on_finish = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
+  mapping_job = mapping.start_update(osdmap, mapper,
+                                     g_conf()->mon_osd_mapping_pgs_per_chunk);
+  dout(10) << __func__ << " started mapping job " << mapping_job.get()
+           << " at " << on_finish->start << dendl;
+  mapping_job->set_finish_event(on_finish);
+}
+
+/**
+ * Bring the messenger's required feature bits in line with the osdmap.
+ *
+ * For each of the OSD, CLIENT, MDS and MON entity types, the features the
+ * osdmap reports (within the mask it returns) are compared against the
+ * messenger policy; when they differ the masked bits of
+ * features_required are replaced and the policy is stored back.
+ */
+void OSDMonitor::update_msgr_features()
+{
+  const int entity_types[] = {
+    entity_name_t::TYPE_OSD,
+    entity_name_t::TYPE_CLIENT,
+    entity_name_t::TYPE_MDS,
+    entity_name_t::TYPE_MON
+  };
+  for (int entity_type : entity_types) {
+    uint64_t mask;
+    const uint64_t wanted = osdmap.get_features(entity_type, &mask);
+    const bool up_to_date =
+      (mon.messenger->get_policy(entity_type).features_required & mask) ==
+      wanted;
+    if (up_to_date) {
+      continue;
+    }
+    dout(0) << "crush map has features " << wanted << ", adjusting msgr requires" << dendl;
+    // only the bits covered by the mask are rewritten
+    ceph::net::Policy policy = mon.messenger->get_policy(entity_type);
+    policy.features_required = (policy.features_required & ~mask) | wanted;
+    mon.messenger->set_policy(entity_type, policy);
+  }
+}
+
+/**
+ * Hook run when this service becomes active.
+ *
+ * Refreshes the logger values, then either (leader) logs the osdmap to the
+ * cluster log and performs the one-time pool priority conversion, or
+ * (non-leader) takes all queued failure ops and dispatches them again.
+ * In both cases a mapping job is started afterwards.
+ */
+void OSDMonitor::on_active()
+{
+  update_logger();
+
+  if (!mon.is_leader()) {
+    // take every queued failure op and run it through dispatch() again
+    list<MonOpRequestRef> failure_ops;
+    take_all_failures(failure_ops);
+    for (; !failure_ops.empty(); failure_ops.pop_front()) {
+      MonOpRequestRef op = failure_ops.front();
+      op->mark_osdmon_event(__func__);
+      dispatch(op);
+    }
+  } else {
+    mon.clog->debug() << "osdmap " << osdmap;
+    if (!priority_convert) {
+      // Only do this once at start-up
+      convert_pool_priorities();
+      priority_convert = true;
+    }
+  }
+  start_mapping();
+}
+
+/**
+ * Restart hook: forget every entry recorded in last_osd_report.
+ */
+void OSDMonitor::on_restart()
+{
+  last_osd_report.clear();
+}
+
+/**
+ * Shutdown hook: abort any mapping job that is still referenced and drop
+ * all pending failure ops without dispatching them.
+ */
+void OSDMonitor::on_shutdown()
+{
+  dout(10) << __func__ << dendl;
+  if (mapping_job) {
+    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
+             << dendl;
+    mapping_job->abort();
+  }
+
+  // discard failure info, waiters
+  {
+    list<MonOpRequestRef> discarded;
+    take_all_failures(discarded);
+    discarded.clear();
+  }
+}
+
+/**
+ * Publish the osdmap's OSD counts (total / up / in) and its epoch to the
+ * monitor's cluster logger.
+ */
+void OSDMonitor::update_logger()
+{
+  dout(10) << "update_logger" << dendl;
+
+  auto& logger = mon.cluster_logger;
+  logger->set(l_cluster_num_osd, osdmap.get_num_osds());
+  logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
+  logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
+  logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
+}
+
+/**
+ * Start a fresh pending incremental for epoch osdmap.epoch + 1.
+ *
+ * Resets the pending metadata/purged-snap bookkeeping, fills in any
+ * backfillfull/full/nearfull ratio that is unset (<= 0) in the current
+ * map from the configuration, and, when the crush map still uses legacy
+ * rule ids, rewrites every pool's crush_rule and renumbers the rules in a
+ * new crush map that is encoded into the incremental.
+ */
+void OSDMonitor::create_pending()
+{
+  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
+  pending_inc.fsid = mon.monmap->fsid;
+  pending_metadata.clear();
+  pending_metadata_rm.clear();
+  pending_pseudo_purged_snaps.clear();
+
+  dout(10) << "create_pending e " << pending_inc.epoch << dendl;
+
+  // safety checks (this shouldn't really happen): a ratio <= 0 in the
+  // current map is replaced by the configured value; configured values
+  // above 1.0 are divided by 100.
+  if (osdmap.backfillfull_ratio <= 0) {
+    auto& ratio = pending_inc.new_backfillfull_ratio;
+    ratio = g_conf()->mon_osd_backfillfull_ratio;
+    if (ratio > 1.0) {
+      ratio /= 100;
+    }
+    dout(1) << __func__ << " setting backfillfull_ratio = "
+            << ratio << dendl;
+  }
+  if (osdmap.full_ratio <= 0) {
+    auto& ratio = pending_inc.new_full_ratio;
+    ratio = g_conf()->mon_osd_full_ratio;
+    if (ratio > 1.0) {
+      ratio /= 100;
+    }
+    dout(1) << __func__ << " setting full_ratio = "
+            << ratio << dendl;
+  }
+  if (osdmap.nearfull_ratio <= 0) {
+    auto& ratio = pending_inc.new_nearfull_ratio;
+    ratio = g_conf()->mon_osd_nearfull_ratio;
+    if (ratio > 1.0) {
+      ratio /= 100;
+    }
+    dout(1) << __func__ << " setting nearfull_ratio = "
+            << ratio << dendl;
+  }
+
+  // Nothing more to do unless the CRUSH rules still use the legacy
+  // "ruleset" structure.
+  if (!osdmap.crush->has_legacy_rule_ids()) {
+    return;
+  }
+
+  CrushWrapper newcrush;
+  _get_pending_crush(newcrush);
+
+  // First, for all pools, work out which rule they really used
+  // by resolving ruleset to rule.
+  for (const auto& [pool_id, pool] : osdmap.get_pools()) {
+    const int new_rule_id = newcrush.find_rule(pool.crush_rule,
+                                               pool.type, pool.size);
+
+    dout(1) << __func__ << " rewriting pool "
+            << osdmap.get_pool_name(pool_id) << " crush ruleset "
+            << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
+    // seed the pending entry from the current pool before patching it
+    if (!pending_inc.new_pools.count(pool_id)) {
+      pending_inc.new_pools[pool_id] = pool;
+    }
+    pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
+  }
+
+  // Now, go ahead and renumber all the rules so that their
+  // rule_id field corresponds to their position in the array
+  auto old_to_new = newcrush.renumber_rules();
+  dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
+  for (const auto& [old_id, new_id] : old_to_new) {
+    dout(1) << __func__ << " " << old_id << " -> " << new_id << dendl;
+  }
+  pending_inc.crush.clear();
+  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+}
+
+/**
+ * Compute the creating_pgs_t that goes along with a pending incremental.
+ *
+ * Works on a copy of creating_pgs (copied under creating_pgs_lock) and
+ *  1. if the copy has not been scanned for inc.epoch yet, queues PGs of
+ *     new pools, drops deleted pools and PGs reported as created;
+ *  2. drops PGs that do not exist in @p nextmap;
+ *  3. moves PGs from the per-pool queue into the creating set, bounded by
+ *     mon_osd_max_creating_pgs (at least 1);
+ *  4. when min_mon_release >= octopus, initializes or advances the
+ *     history / past_intervals of every creating PG.
+ *
+ * @param inc      incremental being prepared; its epoch, modified stamp,
+ *                 new_pools and old_pools are consulted
+ * @param nextmap  map used for PG existence and up/acting lookups
+ *                 (encode_pending passes the current map with @p inc
+ *                 applied)
+ * @return the updated copy; creating_pgs itself is not modified here
+ */
+creating_pgs_t
+OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
+                               const OSDMap& nextmap)
+{
+  dout(10) << __func__ << dendl;
+  creating_pgs_t creatings;
+  {
+    std::lock_guard<std::mutex> guard(creating_pgs_lock);
+    creatings = creating_pgs;
+  }
+
+  // check for new or old pools
+  if (creatings.last_scan_epoch < inc.epoch) {
+    unsigned queued = scan_for_creating_pgs(osdmap.get_pools(),
+                                            inc.old_pools,
+                                            inc.modified,
+                                            &creatings);
+    queued += scan_for_creating_pgs(inc.new_pools,
+                                    inc.old_pools,
+                                    inc.modified,
+                                    &creatings);
+    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
+    for (auto gone_pool : inc.old_pools) {
+      auto dropped = creatings.remove_pool(gone_pool);
+      dout(10) << __func__ << " " << dropped
+               << " pg removed because containing pool deleted: "
+               << gone_pool << dendl;
+      last_epoch_clean.remove_pool(gone_pool);
+    }
+    // pgmon updates its creating_pgs in check_osd_map(), which is called by
+    // on_active() and may be delayed if the lease expires; its creating_pgs
+    // can therefore be stale compared with osdmon's.  Trim the created pgs
+    // here, otherwise they would be added back after being erased.
+    unsigned created_removed = 0;
+    for (auto& created_pg : pending_created_pgs) {
+      dout(20) << __func__ << " noting created pg " << created_pg << dendl;
+      creatings.created_pools.insert(created_pg.pool());
+      created_removed += creatings.pgs.erase(created_pg);
+    }
+    pending_created_pgs.clear();
+    dout(10) << __func__ << " " << created_removed
+             << " pgs removed because they're created" << dendl;
+    creatings.last_scan_epoch = osdmap.get_epoch();
+  }
+
+  // filter out any pgs that shouldn't exist.
+  for (auto it = creatings.pgs.begin(); it != creatings.pgs.end(); ) {
+    if (nextmap.pg_exists(it->first)) {
+      ++it;
+      continue;
+    }
+    dout(10) << __func__ << " removing pg " << it->first
+             << " which should not exist" << dendl;
+    it = creatings.pgs.erase(it);
+  }
+
+  // process queue: always work on the first queued pool and take as many
+  // of its pgs as still fit under the limit
+  unsigned max_creating =
+    std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
+  const auto total = creatings.pgs.size();
+  while (creatings.pgs.size() < max_creating &&
+         !creatings.queue.empty()) {
+    auto head = creatings.queue.begin();
+    int64_t poolid = head->first;
+    auto& pending_range = head->second;
+    dout(10) << __func__ << " pool " << poolid
+             << " created " << pending_range.created
+             << " modified " << pending_range.modified
+             << " [" << pending_range.start << "-" << pending_range.end << ")"
+             << dendl;
+    int64_t n = std::min<int64_t>(max_creating - creatings.pgs.size(),
+                                  pending_range.end - pending_range.start);
+    ps_t first = pending_range.start;
+    ps_t end = first + n;
+    for (ps_t ps = first; ps < end; ++ps) {
+      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
+      // NOTE: use the *current* epoch as the PG creation epoch so that the
+      // OSD does not have to generate a long set of PastIntervals.
+      creatings.pgs.emplace(
+        pgid,
+        creating_pgs_t::pg_create_info(inc.epoch,
+                                       pending_range.modified));
+      dout(10) << __func__ << " adding " << pgid << dendl;
+    }
+    pending_range.start = end;
+    if (pending_range.done()) {
+      dout(10) << __func__ << " done with queue for " << poolid << dendl;
+      // pending_range refers into *head; it is not used past this erase
+      creatings.queue.erase(head);
+    } else {
+      dout(10) << __func__ << " pool " << poolid
+               << " now [" << pending_range.start << "-"
+               << pending_range.end << ")"
+               << dendl;
+    }
+  }
+  dout(10) << __func__ << " queue remaining: " << creatings.queue.size()
+           << " pools" << dendl;
+
+  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
+    // walk creating pgs' history and past_intervals forward
+    for (auto& entry : creatings.pgs) {
+      // this mirrors PG::start_peering_interval()
+      pg_t pgid = entry.first;
+      auto& info = entry.second;
+
+      // this is a bit imprecise, but sufficient?
+      struct min_size_predicate_t : public IsPGRecoverablePredicate {
+        const pg_pool_t *pi;
+        bool operator()(const set<pg_shard_t> &have) const {
+          return have.size() >= pi->min_size;
+        }
+        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));
+
+      vector<int> up, acting;
+      int up_primary, acting_primary;
+      nextmap.pg_to_up_acting_osds(
+        pgid, &up, &up_primary, &acting, &acting_primary);
+
+      if (info.history.epoch_created == 0) {
+        // new pg entry, set it up
+        info.up = up;
+        info.acting = acting;
+        info.up_primary = up_primary;
+        info.acting_primary = acting_primary;
+        info.history = pg_history_t(info.create_epoch,
+                                    info.create_stamp);
+        dout(10) << __func__ << "  pg " << pgid << " just added, "
+                 << " up " << info.up
+                 << " p " << info.up_primary
+                 << " acting " << info.acting
+                 << " p " << info.acting_primary
+                 << " history " << info.history
+                 << " past_intervals " << info.past_intervals
+                 << dendl;
+        continue;
+      }
+
+      // existing entry: compare the stored mapping against nextmap's
+      std::stringstream debug;
+      const bool new_interval = PastIntervals::check_new_interval(
+        info.acting_primary, acting_primary,
+        info.acting, acting,
+        info.up_primary, up_primary,
+        info.up, up,
+        info.history.same_interval_since,
+        info.history.last_epoch_clean,
+        &nextmap,
+        &osdmap,
+        pgid,
+        min_size_predicate,
+        &info.past_intervals,
+        &debug);
+      if (!new_interval) {
+        continue;
+      }
+
+      epoch_t e = inc.epoch;
+      info.history.same_interval_since = e;
+      if (info.up != up) {
+        info.history.same_up_since = e;
+      }
+      if (info.acting_primary != acting_primary) {
+        info.history.same_primary_since = e;
+      }
+      if (pgid.is_split(
+            osdmap.get_pg_num(pgid.pool()),
+            nextmap.get_pg_num(pgid.pool()),
+            nullptr)) {
+        info.history.last_epoch_split = e;
+      }
+      // log old -> new before the stored mapping is overwritten below
+      dout(10) << __func__ << "  pg " << pgid << " new interval,"
+               << " up " << info.up << " -> " << up
+               << " p " << info.up_primary << " -> " << up_primary
+               << " acting " << info.acting << " -> " << acting
+               << " p " << info.acting_primary << " -> "
+               << acting_primary
+               << " history " << info.history
+               << " past_intervals " << info.past_intervals
+               << dendl;
+      dout(20) << "  debug: " << debug.str() << dendl;
+      info.up = up;
+      info.acting = acting;
+      info.up_primary = up_primary;
+      info.acting_primary = acting_primary;
+    }
+  }
+  dout(10) << __func__
+           << " " << (creatings.pgs.size() - total)
+           << "/" << creatings.pgs.size()
+           << " pgs added from queued pools" << dendl;
+  return creatings;
+}
+
+/**
+ * Prime pg_temp entries for the pending incremental where that looks
+ * worthwhile.
+ *
+ * Either all PGs are considered (new crush map, new up OSDs, a weight
+ * change that is not a reduction, or an estimate exceeding
+ * mon_osd_prime_pg_temp_max_estimate of all PGs) or only the PGs of the
+ * "interesting" OSDs collected from new_state / new_weight.  Priming is
+ * bounded by mon_osd_prime_pg_temp_max_time in both modes and is skipped
+ * when the resulting map has no pools.
+ */
+void OSDMonitor::maybe_prime_pg_temp()
+{
+  bool all = false;
+  if (pending_inc.crush.length()) {
+    dout(10) << __func__ << " new crush map, all" << dendl;
+    all = true;
+  }
+
+  if (!pending_inc.new_up_client.empty()) {
+    dout(10) << __func__ << " new up osds, all" << dendl;
+    all = true;
+  }
+
+  // check for interesting OSDs (pointless once "all" has been decided)
+  set<int> osds;
+  if (!all) {
+    for (const auto& [osd, state] : pending_inc.new_state) {
+      // UP bit flagged in new_state for an osd that is currently up
+      if ((state & CEPH_OSD_UP) && osdmap.is_up(osd)) {
+        osds.insert(osd);
+      }
+    }
+    for (const auto& [osd, weight] : pending_inc.new_weight) {
+      if (osdmap.exists(osd) && weight < osdmap.get_weight(osd)) {
+        // weight reduction
+        osds.insert(osd);
+        continue;
+      }
+      dout(10) << __func__ << " osd." << osd << " weight increase, all"
+               << dendl;
+      all = true;
+      break;
+    }
+  }
+
+  if (!all) {
+    if (osds.empty())
+      return;
+
+    // rough guess: pg count of the first osd times the number of osds
+    unsigned estimate =
+      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
+    if (estimate > mapping.get_num_pgs() *
+        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
+      dout(10) << __func__ << " estimate " << estimate << " pgs on "
+               << osds.size() << " osds >= "
+               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
+               << mapping.get_num_pgs() << " pgs, all"
+               << dendl;
+      all = true;
+    } else {
+      dout(10) << __func__ << " estimate " << estimate << " pgs on "
+               << osds.size() << " osds" << dendl;
+    }
+  }
+
+  // the map as it will look once pending_inc is applied
+  OSDMap next;
+  next.deepish_copy_from(osdmap);
+  next.apply_incremental(pending_inc);
+
+  if (next.get_pools().empty()) {
+    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
+    return;
+  }
+
+  if (all) {
+    // hand everything to the mapper and wait for at most the configured time
+    PrimeTempJob job(next, this);
+    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
+    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
+      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
+    } else {
+      dout(10) << __func__ << " did not finish in "
+               << g_conf()->mon_osd_prime_pg_temp_max_time
+               << ", stopping" << dendl;
+      job.abort();
+    }
+    return;
+  }
+
+  dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
+  utime_t stop = ceph_clock_now();
+  stop += g_conf()->mon_osd_prime_pg_temp_max_time;
+  // the clock is only consulted once per "chunk" primed pgs
+  const int chunk = 1000;
+  int until_clock_check = chunk;
+  std::unordered_set<pg_t> did_pgs;
+  for (auto osd : osds) {
+    auto& pgs = mapping.get_osd_acting_pgs(osd);
+    dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
+    for (auto pgid : pgs) {
+      if (!did_pgs.insert(pgid).second) {
+        // already handled via another osd
+        continue;
+      }
+      prime_pg_temp(next, pgid);
+      if (--until_clock_check > 0) {
+        continue;
+      }
+      until_clock_check = chunk;
+      if (ceph_clock_now() > stop) {
+        dout(10) << __func__ << " consumed more than "
+                 << g_conf()->mon_osd_prime_pg_temp_max_time
+                 << " seconds, stopping"
+                 << dendl;
+        return;
+      }
+    }
+  }
+}
+
+/**
+ * Possibly add a pg_temp entry for @p pgid to pending_inc.
+ *
+ * Compares the up/acting sets recorded in `mapping` with the ones @p next
+ * yields.  When they call for it, the previous acting set (or an empty
+ * vector if next_up == next_acting) is emplaced into
+ * pending_inc.new_pg_temp under prime_pg_temp_lock; an entry already
+ * present for the pg is left alone.
+ *
+ * @param next  map to compute the upcoming up/acting sets from
+ * @param pgid  placement group to examine
+ */
+void OSDMonitor::prime_pg_temp(
+  const OSDMap& next,
+  pg_t pgid)
+{
+  // TODO: remove this creating_pgs direct access?
+  if (creating_pgs.pgs.count(pgid) || !osdmap.pg_exists(pgid)) {
+    return;
+  }
+
+  vector<int> up, acting;
+  mapping.get(pgid, &up, nullptr, &acting, nullptr);
+
+  vector<int> next_up, next_acting;
+  int next_up_primary, next_acting_primary;
+  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
+                            &next_acting, &next_acting_primary);
+
+  // acting unchanged, and not the case "up differed from acting before but
+  // next_up matches next_acting"
+  const bool unchanged =
+    acting == next_acting &&
+    (up == acting || next_up != next_acting);
+  if (unchanged)
+    return;  // no change since last epoch
+
+  if (acting.empty())
+    return;  // if previously empty now we can be no worse off
+  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
+  if (pool && acting.size() < pool->min_size)
+    return;  // can be no worse off than before
+
+  if (next_up == next_acting) {
+    acting.clear();
+    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
+             << dendl;
+  }
+
+  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
+           << " -> " << next_up << "/" << next_acting
+           << ", priming " << acting
+           << dendl;
+  {
+    std::lock_guard l(prime_pg_temp_lock);
+    // do not touch a mapping if a change is pending (emplace keeps an
+    // existing entry)
+    pending_inc.new_pg_temp.emplace(
+      pgid,
+      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
+  }
+}
+
+/**
+ * @note receiving a transaction in this function gives a fair amount of
+ * freedom to the service implementation if it does need it. It shouldn't.
+ */
+void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << "encode_pending e " << pending_inc.epoch
+	   << dendl;
+
+  if (do_prune(t)) {
+    dout(1) << __func__ << " osdmap full prune encoded e"
+            << pending_inc.epoch << dendl;
+  }
+
+  // finalize up pending_inc
+  pending_inc.modified = ceph_clock_now();
+
+  int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
+  ceph_assert(r == 0);
+
+  if (mapping_job) {
+    if (!mapping_job->is_done()) {
+      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+	      << mapping_job.get() << " did not complete, "
+	      << mapping_job->shards << " left" << dendl;
+      mapping_job->abort();
+    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
+      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+	      << mapping_job.get() << " is prior epoch "
+	      << mapping.get_epoch() << dendl;
+    } else {
+      if (g_conf()->mon_osd_prime_pg_temp) {
+	maybe_prime_pg_temp();
+      }
+    } 
+  } else if (g_conf()->mon_osd_prime_pg_temp) {
+    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
+	    << dendl;
+  }
+  mapping_job.reset();
+
+  // ensure we don't have blank new_state updates.  these are interpreted as
+  // CEPH_OSD_UP (and almost certainly not what we want!).
+  auto p = pending_inc.new_state.begin();
+  while (p != pending_inc.new_state.end()) {
+    if (p->second == 0) {
+      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
+      p = pending_inc.new_state.erase(p);
+    } else {
+      if (p->second & CEPH_OSD_UP) {
+	pending_inc.new_last_up_change = pending_inc.modified;
+      }
+      ++p;
+    }
+  }
+  if (!pending_inc.new_up_client.empty()) {
+    pending_inc.new_last_up_change = pending_inc.modified;
+  }
+  for (auto& i : pending_inc.new_weight) {
+    if (i.first >= osdmap.max_osd) {
+      if (i.second) {
+	// new osd is already marked in
+	pending_inc.new_last_in_change = pending_inc.modified;
+        break;
+      }
+    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
+      // existing osd marked in or out
+      pending_inc.new_last_in_change = pending_inc.modified;
+      break;
+    }
+  }
+
+  {
+    OSDMap tmp;
+    tmp.deepish_copy_from(osdmap);
+    tmp.apply_incremental(pending_inc);
+
+    // clean pg_temp mappings
+    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
+
+    // clean inappropriate pg_upmap/pg_upmap_items (if any)
+    {
+      // check every upmapped pg for now
+      // until we could reliably identify certain cases to ignore,
+      // which is obviously the hard part TBD..
+      vector<pg_t> pgs_to_check;
+      tmp.get_upmap_pgs(&pgs_to_check);
+      if (pgs_to_check.size() <
+	  static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
+        // not enough pgs, do it inline
+        tmp.clean_pg_upmaps(cct, &pending_inc);
+      } else {
+        CleanUpmapJob job(cct, tmp, pending_inc);
+        mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
+        job.wait();
+      }
+    }
+
+    // update creating pgs first so that we can remove the created pgid and
+    // process the pool flag removal below in the same osdmap epoch.
+    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
+    bufferlist creatings_bl;
+    uint64_t features = CEPH_FEATURES_ALL;
+    if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
+      dout(20) << __func__ << " encoding pending pgs without octopus features"
+	       << dendl;
+      features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+    }
+    encode(pending_creatings, creatings_bl, features);
+    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
+
+    // remove any old (or incompat) POOL_CREATING flags
+    for (auto& i : tmp.get_pools()) {
+      if (tmp.require_osd_release < ceph_release_t::nautilus) {
+	// pre-nautilus OSDMaps shouldn't get this flag.
+	if (pending_inc.new_pools.count(i.first)) {
+	  pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+	}
+      }
+      if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
+	  !pending_creatings.still_creating_pool(i.first)) {
+	dout(10) << __func__ << " done creating pool " << i.first
+		 << ", clearing CREATING flag" << dendl;
+	if (pending_inc.new_pools.count(i.first) == 0) {
+	  pending_inc.new_pools[i.first] = i.second;
+	}
+	pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
+      }
+    }
+
+    // collect which pools are currently affected by
+    // the near/backfill/full osd(s),
+    // and set per-pool near/backfill/full flag instead
+    set<int64_t> full_pool_ids;
+    set<int64_t> backfillfull_pool_ids;
+    set<int64_t> nearfull_pool_ids;
+    tmp.get_full_pools(cct,
+		       &full_pool_ids,
+		       &backfillfull_pool_ids,
+                         &nearfull_pool_ids);
+    if (full_pool_ids.empty() ||
+	backfillfull_pool_ids.empty() ||
+	nearfull_pool_ids.empty()) {
+      // normal case - no nearfull, backfillfull or full osds
+        // try cancel any improper nearfull/backfillfull/full pool
+        // flags first
+      for (auto &pool: tmp.get_pools()) {
+	auto p = pool.first;
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
+	    nearfull_pool_ids.empty()) {
+	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		   << "'s nearfull flag" << dendl;
+	  if (pending_inc.new_pools.count(p) == 0) {
+	    // load original pool info first!
+	    pending_inc.new_pools[p] = pool.second;
+	  }
+	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+	    backfillfull_pool_ids.empty()) {
+	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		   << "'s backfillfull flag" << dendl;
+	  if (pending_inc.new_pools.count(p) == 0) {
+	    pending_inc.new_pools[p] = pool.second;
+	  }
+	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+	    full_pool_ids.empty()) {
+	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+	    // set by EQUOTA, skipping
+	    continue;
+	  }
+	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		   << "'s full flag" << dendl;
+	  if (pending_inc.new_pools.count(p) == 0) {
+	    pending_inc.new_pools[p] = pool.second;
+	  }
+	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+	}
+      }
+    }
+    if (!full_pool_ids.empty()) {
+      dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+	       << " as full" << dendl;
+      for (auto &p: full_pool_ids) {
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+	  continue;
+	}
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = tmp.pools[p];
+	}
+	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+      }
+      // cancel FLAG_FULL for pools which are no longer full too
+      for (auto &pool: tmp.get_pools()) {
+	auto p = pool.first;
+	if (full_pool_ids.count(p)) {
+	  // skip pools we have just marked as full above
+	  continue;
+	}
+	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+	    tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+	  // don't touch if currently is not full
+	  // or is running out of quota (and hence considered as full)
+	  continue;
+	}
+	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		 << "'s full flag" << dendl;
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = pool.second;
+	}
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+      }
+    }
+    if (!backfillfull_pool_ids.empty()) {
+      for (auto &p: backfillfull_pool_ids) {
+	if (full_pool_ids.count(p)) {
+	  // skip pools we have already considered as full above
+	  continue;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+	  // make sure FLAG_FULL is truly set, so we are safe not
+	  // to set a extra (redundant) FLAG_BACKFILLFULL flag
+	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+	  continue;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+	  // don't bother if pool is already marked as backfillfull
+	  continue;
+	}
+	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+		 << "'s as backfillfull" << dendl;
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = tmp.pools[p];
+	}
+	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+      }
+      // cancel FLAG_BACKFILLFULL for pools
+      // which are no longer backfillfull too
+      for (auto &pool: tmp.get_pools()) {
+	auto p = pool.first;
+	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+	  // skip pools we have just marked as backfillfull/full above
+	  continue;
+	}
+	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+	  // and don't touch if currently is not backfillfull
+	  continue;
+	}
+	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		 << "'s backfillfull flag" << dendl;
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = pool.second;
+	}
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+      }
+    }
+    if (!nearfull_pool_ids.empty()) {
+      for (auto &p: nearfull_pool_ids) {
+	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+	  continue;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+	  // make sure FLAG_FULL is truly set, so we are safe not
+	  // to set a extra (redundant) FLAG_NEARFULL flag
+	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+	  continue;
+	}
+	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+	  // don't bother if pool is already marked as nearfull
+	  continue;
+	}
+	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+		 << "'s as nearfull" << dendl;
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = tmp.pools[p];
+	}
+	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+      }
+      // cancel FLAG_NEARFULL for pools
+      // which are no longer nearfull too
+      for (auto &pool: tmp.get_pools()) {
+	auto p = pool.first;
+	if (full_pool_ids.count(p) ||
+	    backfillfull_pool_ids.count(p) ||
+	    nearfull_pool_ids.count(p)) {
+	  // skip pools we have just marked as
+	  // nearfull/backfillfull/full above
+	  continue;
+	}
+	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+	  // and don't touch if currently is not nearfull
+	  continue;
+	}
+	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+		 << "'s nearfull flag" << dendl;
+	if (pending_inc.new_pools.count(p) == 0) {
+	  pending_inc.new_pools[p] = pool.second;
+	}
+	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+      }
+    }
+
+    // min_compat_client?
+    if (!tmp.require_min_compat_client) {
+      auto mv = tmp.get_min_compat_client();
+      dout(1) << __func__ << " setting require_min_compat_client to currently "
+	      << "required " << mv << dendl;
+      mon.clog->info() << "setting require_min_compat_client to currently "
+			<< "required " << mv;
+      pending_inc.new_require_min_compat_client = mv;
+    }
+
+    if (osdmap.require_osd_release < ceph_release_t::nautilus &&
+	tmp.require_osd_release >= ceph_release_t::nautilus) {
+      dout(10) << __func__ << " first nautilus+ epoch" << dendl;
+      // add creating flags?
+      for (auto& i : tmp.get_pools()) {
+	if (pending_creatings.still_creating_pool(i.first)) {
+	  dout(10) << __func__ << " adding CREATING flag to pool " << i.first
+		   << dendl;
+	  if (pending_inc.new_pools.count(i.first) == 0) {
+	    pending_inc.new_pools[i.first] = i.second;
+	  }
+	  pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
+	}
+      }
+      // adjust blocklist items to all be TYPE_ANY
+      for (auto& i : tmp.blocklist) {
+	auto a = i.first;
+	a.set_type(entity_addr_t::TYPE_ANY);
+	pending_inc.new_blocklist[a] = i.second;
+	pending_inc.old_blocklist.push_back(i.first);
+      }
+    }
+
+    if (osdmap.require_osd_release < ceph_release_t::octopus &&
+	tmp.require_osd_release >= ceph_release_t::octopus) {
+      dout(10) << __func__ << " first octopus+ epoch" << dendl;
+
+      // adjust obsoleted cache modes
+      for (auto& [poolid, pi] : tmp.pools) {
+	if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
+	  if (pending_inc.new_pools.count(poolid) == 0) {
+	    pending_inc.new_pools[poolid] = pi;
+	  }
+	  dout(10) << __func__ << " switching pool " << poolid
+		   << " cachemode from forward -> proxy" << dendl;
+	  pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
+	}
+	if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
+	  if (pending_inc.new_pools.count(poolid) == 0) {
+	    pending_inc.new_pools[poolid] = pi;
+	  }
+	  dout(10) << __func__ << " switching pool " << poolid
+		   << " cachemode from readforward -> readproxy" << dendl;
+	  pending_inc.new_pools[poolid].cache_mode =
+	    pg_pool_t::CACHEMODE_READPROXY;
+	}
+      }
+
+      // clear removed_snaps for every pool
+      for (auto& [poolid, pi] : tmp.pools) {
+	if (pi.removed_snaps.empty()) {
+	  continue;
+	}
+	if (pending_inc.new_pools.count(poolid) == 0) {
+	  pending_inc.new_pools[poolid] = pi;
+	}
+	dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
+		 << dendl;
+	pending_inc.new_pools[poolid].removed_snaps.clear();
+      }
+
+      // create a combined purged snap epoch key for all purged snaps
+      // prior to this epoch, and store it in the current epoch (i.e.,
+      // the last pre-octopus epoch, just prior to the one we're
+      // encoding now).
+      auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+      it->lower_bound("purged_snap_");
+      map<int64_t,snap_interval_set_t> combined;
+      while (it->valid()) {
+	if (it->key().find("purged_snap_") != 0) {
+	  break;
+	}
+	string k = it->key();
+	long long unsigned pool;
+	int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
+	if (n != 1) {
+	  derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
+	} else {
+	  bufferlist v = it->value();
+	  auto p = v.cbegin();
+	  snapid_t begin, end;
+	  ceph::decode(begin, p);
+	  ceph::decode(end, p);
+	  combined[pool].insert(begin, end - begin);
+	}
+	it->next();
+      }
+      if (!combined.empty()) {
+	string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
+	bufferlist v;
+	ceph::encode(combined, v);
+	t->put(OSD_SNAP_PREFIX, k, v);
+	dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
+		 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
+		 << dendl;
+      } else {
+	dout(10) << __func__ << " there were no pre-octopus purged snaps"
+		 << dendl;
+      }
+
+      // clean out the old removed_snap_ and removed_epoch keys
+      // ('`' is ASCII '_' + 1)
+      t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
+      t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
+    }
+  }
+
+  // tell me about it
+  for (auto i = pending_inc.new_state.begin();
+       i != pending_inc.new_state.end();
+       ++i) {
+    int s = i->second ? i->second : CEPH_OSD_UP;
+    if (s & CEPH_OSD_UP) {
+      dout(2) << " osd." << i->first << " DOWN" << dendl;
+      // Reset laggy parameters if failure interval exceeds a threshold.
+      const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
+      if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
+        int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
+        if (grace_interval_threshold_exceeded(last_failure_interval)) {
+          set_default_laggy_params(i->first);
+        }
+      }
+    }
+    if (s & CEPH_OSD_EXISTS)
+      dout(2) << " osd." << i->first << " DNE" << dendl;
+  }
+  for (auto i = pending_inc.new_up_client.begin();
+       i != pending_inc.new_up_client.end();
+       ++i) {
+    //FIXME: insert cluster addresses too
+    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
+  }
+  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
+       i != pending_inc.new_weight.end();
+       ++i) {
+    if (i->second == CEPH_OSD_OUT) {
+      dout(2) << " osd." << i->first << " OUT" << dendl;
+    } else if (i->second == CEPH_OSD_IN) {
+      dout(2) << " osd." << i->first << " IN" << dendl;
+    } else {
+      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
+    }
+  }
+
+  // features for osdmap and its incremental
+  uint64_t features;
+
+  // encode full map and determine its crc
+  OSDMap tmp;
+  {
+    tmp.deepish_copy_from(osdmap);
+    tmp.apply_incremental(pending_inc);
+
+    // determine appropriate features
+    features = tmp.get_encoding_features();
+    dout(10) << __func__ << " encoding full map with "
+	     << tmp.require_osd_release
+	     << " features " << features << dendl;
+
+    // the features should be a subset of the mon quorum's features!
+    ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
+
+    bufferlist fullbl;
+    encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
+    pending_inc.full_crc = tmp.get_crc();
+
+    // include full map in the txn.  note that old monitors will
+    // overwrite this.  new ones will now skip the local full map
+    // encode and reload from this.
+    put_version_full(t, pending_inc.epoch, fullbl);
+  }
+
+  // encode
+  ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
+  bufferlist bl;
+  encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
+
+  dout(20) << " full_crc " << tmp.get_crc()
+	   << " inc_crc " << pending_inc.inc_crc << dendl;
+
+  /* put everything in the transaction */
+  put_version(t, pending_inc.epoch, bl);
+  put_last_committed(t, pending_inc.epoch);
+
+  // metadata, too!
+  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
+       p != pending_metadata.end();
+       ++p)
+    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
+  for (set<int>::iterator p = pending_metadata_rm.begin();
+       p != pending_metadata_rm.end();
+       ++p)
+    t->erase(OSD_METADATA_PREFIX, stringify(*p));
+  pending_metadata.clear();
+  pending_metadata_rm.clear();
+
+  // purged_snaps
+  if (tmp.require_osd_release >= ceph_release_t::octopus &&
+      !pending_inc.new_purged_snaps.empty()) {
+    // all snaps purged this epoch (across all pools)
+    string k = make_purged_snap_epoch_key(pending_inc.epoch);
+    bufferlist v;
+    encode(pending_inc.new_purged_snaps, v);
+    t->put(OSD_SNAP_PREFIX, k, v);
+  }
+  for (auto& i : pending_inc.new_purged_snaps) {
+    for (auto q = i.second.begin();
+	 q != i.second.end();
+	 ++q) {
+      insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
+				pending_inc.epoch,
+				t);
+    }
+  }
+  for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
+    for (auto snap : snaps) {
+      insert_purged_snap_update(pool, snap, snap + 1,
+				pending_inc.epoch,
+				t);
+    }
+  }
+
+  // health
+  health_check_map_t next;
+  tmp.check_health(cct, &next);
+  encode_health(next, t);
+}
+
+// Fetch the metadata blob stored for `osd` under OSD_METADATA_PREFIX and
+// decode it into `m`.
+//
+// Returns 0 on success, the negative value returned by the store if the
+// read fails, or -EIO if the blob cannot be decoded.  On a decode failure a
+// short message is appended to `err` when `err` is non-null.
+int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
+{
+  bufferlist raw;
+  if (const int rc = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), raw);
+      rc < 0) {
+    return rc;
+  }
+
+  try {
+    auto cursor = raw.cbegin();
+    decode(m, cursor);
+  } catch (const ceph::buffer::error&) {
+    if (err) {
+      *err << "osd." << osd << " metadata is corrupt";
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// For every OSD that is currently up, look up metadata key `field` and
+// increment the counter for its value in `*out`.  OSDs whose metadata has
+// no such key (including those whose metadata could not be loaded) are
+// counted under "unknown".
+void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
+{
+  const int max_osd = osdmap.get_max_osd();
+  for (int id = 0; id < max_osd; ++id) {
+    if (!osdmap.is_up(id)) {
+      continue;
+    }
+    map<string,string> meta;
+    // a failed load leaves `meta` empty, which lands in the "unknown" bucket
+    load_metadata(id, meta, nullptr);
+    const auto hit = meta.find(field);
+    if (hit != meta.end()) {
+      (*out)[hit->second]++;
+    } else {
+      (*out)["unknown"]++;
+    }
+  }
+}
+
+// Formatter flavour of count_metadata(): emits one object section named
+// after `field`, holding one integer per distinct value seen among the up
+// OSDs.
+void OSDMonitor::count_metadata(const string& field, Formatter *f)
+{
+  map<string,int> tally;
+  count_metadata(field, &tally);
+
+  f->open_object_section(field.c_str());
+  for (const auto& [value, count] : tally) {
+    f->dump_int(value.c_str(), count);
+  }
+  f->close_section();
+}
+
+// Append "osd.<id>" to versions[<ceph_version_short>] for every up OSD whose
+// metadata carries a "ceph_version_short" entry; OSDs without one are
+// skipped.
+void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
+{
+  const int max_osd = osdmap.get_max_osd();
+  for (int id = 0; id < max_osd; ++id) {
+    if (!osdmap.is_up(id)) {
+      continue;
+    }
+    map<string,string> meta;
+    load_metadata(id, meta, nullptr);
+    const auto ver = meta.find("ceph_version_short");
+    if (ver != meta.end()) {
+      versions[ver->second].push_back(string("osd.") + stringify(id));
+    }
+  }
+}
+
+// Look up the "osd_objectstore" metadata entry of `osd` and copy it into
+// `*type`.
+//
+// Returns 0 on success, the negative error from load_metadata() if the
+// metadata cannot be loaded, or -ENOENT if the entry is absent.  `*type` is
+// only written on success.
+int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
+{
+  map<string, string> meta;
+  if (const int rc = load_metadata(osd, meta, nullptr); rc < 0) {
+    return rc;
+  }
+
+  const auto entry = meta.find("osd_objectstore");
+  if (entry != meta.end()) {
+    *type = entry->second;
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// Spot-check whether the OSDs currently serving `pool` all use bluestore.
+//
+// Only the up sets of the first few PGs (at most 8) are examined.  An OSD
+// whose objectstore type cannot be determined is tolerated.
+//
+// Returns false as soon as an OSD reporting a different objectstore is
+// found, describing it on `err` when `err` is non-null; returns true
+// otherwise.
+bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
+						 const pg_pool_t &pool,
+						 ostream *err)
+{
+  // just check a few pgs for efficiency - this can't give a guarantee anyway,
+  // since filestore osds could always join the pool later
+  set<int> checked_osds;
+  const unsigned pgs_to_check = std::min(8u, pool.get_pg_num());
+  for (unsigned ps = 0; ps < pgs_to_check; ++ps) {
+    vector<int> up, acting;
+    pg_t pgid(ps, pool_id);
+    osdmap.pg_to_up_acting_osds(pgid, up, acting);
+    for (int osd : up) {
+      if (checked_osds.count(osd)) {
+	continue;
+      }
+      string objectstore_type;
+      int r = get_osd_objectstore_type(osd, &objectstore_type);
+      // allow with missing metadata, e.g. due to an osd never booting yet
+      if (r < 0 || objectstore_type == "bluestore") {
+	checked_osds.insert(osd);
+	continue;
+      }
+      // `err` is an optional sink, as in load_metadata(); do not dereference
+      // it blindly
+      if (err) {
+	*err << "osd." << osd << " uses " << objectstore_type;
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+// Dump every metadata key/value pair of `osd` to `f` as strings.
+//
+// Returns 0 on success; if load_metadata() returns non-zero, that value is
+// returned unchanged and nothing is dumped (`err` is forwarded to it).
+int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+{
+  map<string,string> meta;
+  const int rc = load_metadata(osd, meta, err);
+  if (rc != 0) {
+    return rc;
+  }
+  for (const auto& [key, value] : meta) {
+    f->dump_string(key.c_str(), value);
+  }
+  return 0;
+}
+
+// Group OSD ids by the "hostname" entry of their metadata and hand the
+// result to dump_services() under the service name "osd".  OSDs whose
+// metadata cannot be loaded, or has no hostname, are left out.
+void OSDMonitor::print_nodes(Formatter *f)
+{
+  map<string, list<int> > by_host; // hostname => osd ids
+  const int max_osd = osdmap.get_max_osd();
+  for (int id = 0; id < max_osd; id++) {
+    map<string, string> meta;
+    if (load_metadata(id, meta, nullptr) != 0) {
+      continue;
+    }
+    const auto host = meta.find("hostname");
+    if (host != meta.end()) {
+      by_host[host->second].push_back(id);
+    }
+    // else: no hostname recorded (not likely though)
+  }
+
+  dump_services(f, by_host, "osd");
+}
+
+// Send the most recent incremental map to one randomly chosen OSD session,
+// if there is any up OSD and any matching session.
+void OSDMonitor::share_map_with_random_osd()
+{
+  if (osdmap.get_num_up_osds() == 0) {
+    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
+    return;
+  }
+
+  MonSession *peer = mon.session_map.get_random_osd_session(&osdmap);
+  if (peer == nullptr) {
+    dout(10) << __func__ << " no up osd on our session map" << dendl;
+    return;
+  }
+
+  dout(10) << "committed, telling random " << peer->name
+	   << " all about it" << dendl;
+
+  // encode with the peer's features; an anonymous connection has none
+  // recorded, in which case fall back to the quorum's features.
+  uint64_t features = peer->con_features;
+  if (!features) {
+    features = mon.get_quorum_con_features();
+  }
+
+  // only the latest epoch is sent; whatev, they'll request more if they
+  // need it
+  const auto newest = osdmap.get_epoch();
+  MOSDMap *m = build_incremental(newest - 1, newest, features);
+  peer->con->send_message(m);
+  // NOTE: do *not* record osd has up to this epoch (as we do
+  // elsewhere) as they may still need to request older values.
+}
+
+// Compute the osdmap epoch up to which old maps may be trimmed.
+//
+// Returns 0 (do not trim) when there is no quorum, when PGs are still being
+// created, when 'mon_debug_block_osdmap_trim' is set, or when the computed
+// floor does not exceed the first committed version.  Otherwise returns the
+// floor: the minimum last-epoch-clean (optionally overridden by
+// 'mon_osd_force_trim_to'), capped so that at least 'mon_min_osdmap_epochs'
+// epochs are kept.
+version_t OSDMonitor::get_trim_to() const
+{
+  if (mon.get_quorum().empty()) {
+    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
+    return 0;
+  }
+
+  {
+    std::lock_guard<std::mutex> l(creating_pgs_lock);
+    if (!creating_pgs.pgs.empty()) {
+      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
+      return 0;
+    }
+  }
+
+  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
+    dout(0) << __func__
+            << " blocking osdmap trim"
+            << " ('mon_debug_block_osdmap_trim' set to 'true')"
+            << " trim_to = 0" << dendl;
+    return 0;
+  }
+
+  epoch_t floor = get_min_last_epoch_clean();
+  dout(10) << " min_last_epoch_clean " << floor << dendl;
+
+  const version_t last = get_last_committed();
+
+  // an explicit override wins, provided it is below the last committed map
+  const auto forced = g_conf()->mon_osd_force_trim_to;
+  if (forced > 0 && forced < static_cast<int>(last)) {
+    floor = forced;
+    dout(10) << __func__
+             << " explicit mon_osd_force_trim_to = " << floor << dendl;
+  }
+
+  // always keep at least `keep` epochs behind the last committed one
+  const unsigned keep = g_conf()->mon_min_osdmap_epochs;
+  if (floor + keep > last) {
+    if (keep < last) {
+      floor = last - keep;
+    } else {
+      floor = 0;
+    }
+  }
+
+  if (floor > get_first_committed()) {
+    dout(10) << __func__ << " trim_to = " << floor << dendl;
+    return floor;
+  }
+
+  dout(10) << __func__ << " trim_to = 0" << dendl;
+  return 0;
+}
+
+// Smallest of: the lower bound reported by `last_epoch_clean` for the
+// current osdmap, and every epoch recorded in `osd_epochs`.
+epoch_t OSDMonitor::get_min_last_epoch_clean() const
+{
+  auto lowest = last_epoch_clean.get_lower_bound(osdmap);
+  // also scan osd epochs:
+  // don't trim past the oldest reported osd epoch
+  for (const auto& reported : osd_epochs) {
+    if (reported.second < lowest) {
+      lowest = reported.second;
+    }
+  }
+  return lowest;
+}
+
+// Add to `tx` the full map for version `first`, and, when an osdmap manifest
+// is loaded and `first` lies beyond its first pinned version, the matching
+// manifest update.
+void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
+				   version_t first)
+{
+  dout(10) << __func__ << " including full map for e " << first << dendl;
+
+  bufferlist full_bl;
+  get_version_full(first, full_bl);
+  put_version_full(tx, first, full_bl);
+
+  if (!has_osdmap_manifest) {
+    return;
+  }
+  if (first > osdmap_manifest.get_first_pinned()) {
+    _prune_update_trimmed(tx, first);
+  }
+}
+
+
+/* full osdmap prune
+ *
+ * for more information, please refer to doc/dev/mon-osdmap-prune.rst
+ */
+
+// Bring the in-memory osdmap manifest in line with the store: drop it if
+// the store no longer has an "osdmap_manifest" key, otherwise (re)load and
+// decode it.  Aborts if the key exists but cannot be read.
+void OSDMonitor::load_osdmap_manifest()
+{
+  if (!mon.store->exists(get_service_name(), "osdmap_manifest")) {
+    // nothing on disk; forget whatever we may still hold in memory
+    if (has_osdmap_manifest) {
+      dout(20) << __func__
+               << " dropping osdmap manifest from memory." << dendl;
+      osdmap_manifest = osdmap_manifest_t();
+      has_osdmap_manifest = false;
+    }
+    return;
+  }
+
+  dout(20) << __func__
+           << " osdmap manifest detected in store; reload." << dendl;
+
+  bufferlist raw;
+  if (get_value("osdmap_manifest", raw) < 0) {
+    derr << __func__ << " unable to read osdmap version manifest" << dendl;
+    ceph_abort_msg("error reading manifest");
+  }
+  osdmap_manifest.decode(raw);
+  has_osdmap_manifest = true;
+
+  dout(10) << __func__ << " store osdmap manifest pinned ("
+           << osdmap_manifest.get_first_pinned()
+           << " .. "
+           << osdmap_manifest.get_last_pinned()
+           << ")"
+           << dendl;
+}
+
+// Decide whether a full-osdmap prune pass should run now.
+//
+// Each guard below is a make-it-or-break-it constraint: if any of them
+// fails we do not prune, regardless of whether an on-disk manifest with an
+// on-going pruning state exists.
+bool OSDMonitor::should_prune() const
+{
+  const version_t first = get_first_committed();
+  const version_t last = get_last_committed();
+  const version_t min_osdmap_epochs =
+    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
+  const version_t prune_min =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+  const version_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  const version_t last_pinned = osdmap_manifest.get_last_pinned();
+  const version_t last_to_pin = last - min_osdmap_epochs;
+
+  // between the first and last committed epochs, we don't have
+  // enough epochs to trim, much less to prune.
+  const version_t held = last - first;
+  if (held <= min_osdmap_epochs) {
+    dout(10) << __func__
+             << " currently holding only " << held
+             << " epochs (min osdmap epochs: " << min_osdmap_epochs
+             << "); do not prune."
+             << dendl;
+    return false;
+  }
+
+  // between the first committed epoch and the last epoch we would prune,
+  // we simply don't have enough versions over the minimum to prune maps.
+  const version_t prunable = last_to_pin - first;
+  if (prunable < prune_min) {
+    dout(10) << __func__
+             << " could only prune " << prunable
+             << " epochs (" << first << ".." << last_to_pin << "), which"
+                " is less than the required minimum (" << prune_min << ")"
+             << dendl;
+    return false;
+  }
+
+  // an existing manifest already reaches the last epoch we would pin
+  if (has_osdmap_manifest && last_pinned >= last_to_pin) {
+    dout(10) << __func__
+             << " we have pruned as far as we can; do not prune."
+             << dendl;
+    return false;
+  }
+
+  // the next pin would land beyond the last epoch we may pin
+  if (last_pinned + prune_interval > last_to_pin) {
+    dout(10) << __func__
+             << " not enough epochs to form an interval (last pinned: "
+             << last_pinned << ", last to pin: "
+             << last_to_pin << ", interval: " << prune_interval << ")"
+             << dendl;
+    return false;
+  }
+
+  dout(15) << __func__
+           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
+           << " lc (" << first << ".." << last << ")"
+           << dendl;
+  return true;
+}
+
+// Queue in `tx` the manifest change that goes with trimming up to `first`:
+// work on a copy of the manifest, make `first` its first pinned version,
+// and either store the updated copy or erase the stored manifest when no
+// useful pinned range remains.  The in-memory manifest is not modified here.
+void OSDMonitor::_prune_update_trimmed(
+    MonitorDBStore::TransactionRef tx,
+    version_t first)
+{
+  dout(10) << __func__
+           << " first " << first
+           << " last_pinned " << osdmap_manifest.get_last_pinned()
+           << dendl;
+
+  osdmap_manifest_t updated = osdmap_manifest;
+
+  if (!updated.is_pinned(first)) {
+    updated.pin(first);
+  }
+
+  // drop every pinned version that sorts before `first`
+  updated.pinned.erase(updated.pinned.begin(), updated.pinned.find(first));
+  ceph_assert(updated.get_first_pinned() == first);
+
+  const bool exhausted =
+    updated.get_last_pinned() == first+1 ||
+    updated.pinned.size() == 1;
+  if (exhausted) {
+    // we reached the end of the line, as pinned maps go; clean up our
+    // manifest, and let `should_prune()` decide whether we should prune
+    // again.
+    tx->erase(get_service_name(), "osdmap_manifest");
+    return;
+  }
+
+  bufferlist encoded;
+  updated.encode(encoded);
+  tx->put(get_service_name(), "osdmap_manifest", encoded);
+}
+
+// Seed `manifest` with the version a prune pass starts from, after
+// asserting that the stable in-memory state is consistent: the first
+// committed version when no manifest is loaded, otherwise the last version
+// pinned by the loaded manifest.
+void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
+{
+  dout(1) << __func__ << dendl;
+
+  version_t start;
+
+  // verify constraints on stable in-memory state
+  if (has_osdmap_manifest) {
+    // we must have pruned in the past AND its state is still relevant
+    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
+    // and thus we still hold a manifest in the store).
+    ceph_assert(!osdmap_manifest.pinned.empty());
+    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
+    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
+
+    dout(10) << __func__
+             << " first_pinned " << osdmap_manifest.get_first_pinned()
+             << " last_pinned " << osdmap_manifest.get_last_pinned()
+             << dendl;
+
+    start = osdmap_manifest.get_last_pinned();
+  } else {
+    // we must have never pruned, OR if we pruned the state must no longer
+    // be relevant (i.e., the state must have been removed alongside with
+    // the trim that *must* have removed past the last pinned map in a
+    // previous prune).
+    ceph_assert(osdmap_manifest.pinned.empty());
+    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
+    start = get_first_committed();
+  }
+
+  manifest.pin(start);
+}
+
+// Validate the full-osdmap-prune configuration options.
+//
+// Every violated constraint is logged via derr; returns true only when all
+// of the following hold:
+//   - 'mon_osdmap_full_prune_interval' is greater than 1
+//   - 'mon_osdmap_full_prune_min' is non-zero
+//   - the interval does not exceed the minimum
+//   - 'mon_osdmap_full_prune_txsize' is at least interval - 1
+bool OSDMonitor::_prune_sanitize_options() const
+{
+  uint64_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  uint64_t prune_min =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+  uint64_t txsize =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+  bool r = true;
+
+  if (prune_interval == 0) {
+    derr << __func__
+         << " prune is enabled BUT prune interval is zero; abort."
+         << dendl;
+    r = false;
+  } else if (prune_interval == 1) {
+    derr << __func__
+         << " prune interval is equal to one, which essentially means"
+            " no pruning; abort."
+         << dendl;
+    r = false;
+  }
+  if (prune_min == 0) {
+    derr << __func__
+         << " prune is enabled BUT prune min is zero; abort."
+         << dendl;
+    r = false;
+  }
+  if (prune_interval > prune_min) {
+    derr << __func__
+         << " impossible to ascertain proper prune interval because"
+         << " it is greater than the minimum prune epochs"
+         << " (min: " << prune_min << ", interval: " << prune_interval << ")"
+         << dendl;
+    r = false;
+  }
+
+  // prune_interval is unsigned: `prune_interval - 1` wraps around to 2^64-1
+  // when the interval is zero, which would log a bogus txsize complaint on
+  // top of the zero-interval error already reported above.  Only compare
+  // when the subtraction is well defined.
+  if (prune_interval > 0 && txsize < prune_interval - 1) {
+    derr << __func__
+         << " 'mon_osdmap_full_prune_txsize' (" << txsize
+         << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
+         << "); abort." << dendl;
+    r = false;
+  }
+  return r;
+}
+
+// True when the 'mon_osdmap_full_prune_enabled' option is set.
+bool OSDMonitor::is_prune_enabled() const {
+  const bool enabled = g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
+  return enabled;
+}
+
+// True when the required monitor features include FEATURE_OSDMAP_PRUNE.
+bool OSDMonitor::is_prune_supported() const {
+  const auto required = mon.get_required_mon_features();
+  return required.contains_any(ceph::features::mon::FEATURE_OSDMAP_PRUNE);
+}
+
+/** do_prune
+ *
+ * Prune full osdmaps: pin one full map every 'prune interval' versions
+ * and queue the removal of the full maps in between, then queue the updated
+ * manifest. Nothing is done unless pruning is enabled, the options are
+ * sane, and `should_prune()` agrees.
+ *
+ * @param tx transaction receiving the erasures and the updated manifest.
+ * @returns true if has side-effects; false otherwise.
+ */
+bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
+{
+  bool enabled = is_prune_enabled();
+
+  dout(1) << __func__ << " osdmap full prune "
+          << (enabled ? "enabled" : "disabled")
+          << dendl;
+
+  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
+    return false;
+  }
+
+  // we are beyond the minimum prune versions, we need to remove maps because
+  // otherwise the store will grow unbounded and we may end up having issues
+  // with available disk space or store hangs.
+
+  // we will not pin all versions. We will leave a buffer number of versions.
+  // this allows the monitor to trim maps without caring too much about
+  // pinned maps, and then allows us to use another ceph-mon without these
+  // capabilities, without having to repair the store.
+
+  // work on a copy; the in-memory manifest is left untouched here.
+  osdmap_manifest_t manifest = osdmap_manifest;
+
+  version_t first = get_first_committed();
+  version_t last = get_last_committed();
+
+  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
+  version_t last_pinned = manifest.get_last_pinned();
+  uint64_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  uint64_t txsize =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+  prune_init(manifest);
+
+  // we need to get rid of some osdmaps
+
+  dout(5) << __func__
+          << " lc (" << first << " .. " << last << ")"
+          << " last_pinned " << last_pinned
+          << " interval " << prune_interval
+          << " last_to_pin " << last_to_pin
+          << dendl;
+
+  // We will be erasing maps as we go.
+  //
+  // We will erase all maps between `last_pinned` and the `next_to_pin`.
+  //
+  // If `next_to_pin` happens to be greater than `last_to_pin`, then
+  // we stop pruning. We could prune the maps between `next_to_pin` and
+  // `last_to_pin`, but by not doing it we end up with neater pruned
+  // intervals, aligned with `prune_interval`. Besides, this should not be a
+  // problem as long as `prune_interval` is set to a sane value, instead of
+  // hundreds or thousands of maps.
+
+  auto map_exists = [this](version_t v) {
+    string k = mon.store->combine_strings("full", v);
+    return mon.store->exists(get_service_name(), k);
+  };
+
+  // 'interval' represents the number of maps from the last pinned
+  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
+  // version 11 next; all intermediate versions will be removed.
+  //
+  // 'txsize' represents the maximum number of versions we'll be removing in
+  // this iteration. If 'txsize' is large enough to perform multiple passes
+  // pinning and removing maps, we will do so; if not, we'll do at least one
+  // pass. We are quite relaxed about honouring 'txsize', but we'll always
+  // ensure that we never go *over* the maximum.
+
+  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
+  uint64_t removal_interval = prune_interval - 1;
+  // check the invariant before the value is used for anything
+  ceph_assert(removal_interval > 0);
+
+  if (txsize < removal_interval) {
+    dout(5) << __func__
+	    << " setting txsize to removal interval size ("
+	    << removal_interval << " versions)"
+	    << dendl;
+    txsize = removal_interval;
+  }
+
+  uint64_t num_pruned = 0;
+  // only start a pass if a whole interval still fits within txsize
+  while (num_pruned + removal_interval <= txsize) {
+    last_pinned = manifest.get_last_pinned();
+
+    if (last_pinned + prune_interval > last_to_pin) {
+      break;
+    }
+    ceph_assert(last_pinned < last_to_pin);
+
+    version_t next_pinned = last_pinned + prune_interval;
+    ceph_assert(next_pinned <= last_to_pin);
+    manifest.pin(next_pinned);
+
+    dout(20) << __func__
+	     << " last_pinned " << last_pinned
+	     << " next_pinned " << next_pinned
+	     << " num_pruned " << num_pruned
+	     << " removal interval (" << (last_pinned+1)
+	     << ".." << (next_pinned-1) << ")"
+	     << " txsize " << txsize << dendl;
+
+    // both ends of the interval must still have their full map in the store
+    ceph_assert(map_exists(last_pinned));
+    ceph_assert(map_exists(next_pinned));
+
+    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
+      ceph_assert(!manifest.is_pinned(v));
+
+      dout(20) << __func__ << "   pruning full osdmap e" << v << dendl;
+      string full_key = mon.store->combine_strings("full", v);
+      tx->erase(get_service_name(), full_key);
+      ++num_pruned;
+    }
+  }
+
+  ceph_assert(num_pruned > 0);
+
+  bufferlist bl;
+  manifest.encode(bl);
+  tx->put(get_service_name(), "osdmap_manifest", bl);
+
+  return true;
+}
+
+
+// -------------
+
+// Dispatch the read-side (preprocess) handling of an incoming message to
+// the handler for its type and return that handler's result.  A bad_cmd_get
+// thrown while preprocessing a command is answered with -EINVAL.  Unknown
+// message types abort.
+bool OSDMonitor::preprocess_query(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  Message *req = op->get_req();
+  dout(10) << "preprocess_query " << *req << " from " << req->get_orig_source_inst() << dendl;
+
+  const auto type = req->get_type();
+
+  // READs
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return preprocess_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return true;
+    }
+  }
+
+  switch (type) {
+  case CEPH_MSG_MON_GET_OSDMAP:
+    return preprocess_get_osdmap(op);
+
+    // damp updates
+  case MSG_OSD_MARK_ME_DOWN:
+    return preprocess_mark_me_down(op);
+  case MSG_OSD_MARK_ME_DEAD:
+    return preprocess_mark_me_dead(op);
+  case MSG_OSD_FULL:
+    return preprocess_full(op);
+  case MSG_OSD_FAILURE:
+    return preprocess_failure(op);
+  case MSG_OSD_BOOT:
+    return preprocess_boot(op);
+  case MSG_OSD_ALIVE:
+    return preprocess_alive(op);
+  case MSG_OSD_PG_CREATED:
+    return preprocess_pg_created(op);
+  case MSG_OSD_PG_READY_TO_MERGE:
+    return preprocess_pg_ready_to_merge(op);
+  case MSG_OSD_PGTEMP:
+    return preprocess_pgtemp(op);
+  case MSG_OSD_BEACON:
+    return preprocess_beacon(op);
+
+  case CEPH_MSG_POOLOP:
+    return preprocess_pool_op(op);
+
+  case MSG_REMOVE_SNAPS:
+    return preprocess_remove_snaps(op);
+
+  case MSG_MON_GET_PURGED_SNAPS:
+    return preprocess_get_purged_snaps(op);
+
+  default:
+    ceph_abort();
+    return true;
+  }
+}
+
+// Dispatch the write-side (prepare) handling of an incoming message to the
+// handler for its type and return that handler's result.  A bad_cmd_get
+// thrown while preparing a command is answered with -EINVAL.  Unknown
+// message types abort.
+bool OSDMonitor::prepare_update(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  Message *req = op->get_req();
+  dout(7) << "prepare_update " << *req << " from " << req->get_orig_source_inst() << dendl;
+
+  const auto type = req->get_type();
+
+  if (type == MSG_MON_COMMAND) {
+    try {
+      return prepare_command(op);
+    } catch (const bad_cmd_get& e) {
+      bufferlist empty;
+      mon.reply_command(op, -EINVAL, e.what(), empty, get_last_committed());
+      return true;
+    }
+  }
+
+  switch (type) {
+    // damp updates
+  case MSG_OSD_MARK_ME_DOWN:
+    return prepare_mark_me_down(op);
+  case MSG_OSD_MARK_ME_DEAD:
+    return prepare_mark_me_dead(op);
+  case MSG_OSD_FULL:
+    return prepare_full(op);
+  case MSG_OSD_FAILURE:
+    return prepare_failure(op);
+  case MSG_OSD_BOOT:
+    return prepare_boot(op);
+  case MSG_OSD_ALIVE:
+    return prepare_alive(op);
+  case MSG_OSD_PG_CREATED:
+    return prepare_pg_created(op);
+  case MSG_OSD_PGTEMP:
+    return prepare_pgtemp(op);
+  case MSG_OSD_PG_READY_TO_MERGE:
+    return prepare_pg_ready_to_merge(op);
+  case MSG_OSD_BEACON:
+    return prepare_beacon(op);
+
+  case CEPH_MSG_POOLOP:
+    return prepare_pool_op(op);
+
+  case MSG_REMOVE_SNAPS:
+    return prepare_remove_snaps(op);
+
+  default:
+    ceph_abort();
+  }
+
+  return false;
+}
+
+// Decide whether the pending change should be proposed now.
+//
+// Returns true right away when a full map is pending, or when a complete
+// set of osd weights (one per osd slot) is waiting to be applied; in the
+// latter case the weights are applied to the pending incremental, `delay`
+// is set to 0 and the weight set is cleared.  Otherwise defers to
+// PaxosService::should_propose().
+bool OSDMonitor::should_propose(double& delay)
+{
+  dout(10) << "should_propose" << dendl;
+
+  // if full map, propose immediately!  any subsequent changes will be clobbered.
+  if (pending_inc.fullmap.length() != 0) {
+    return true;
+  }
+
+  // adjust osd weights?
+  const bool have_all_weights =
+    !osd_weight.empty() &&
+    osd_weight.size() == static_cast<unsigned>(osdmap.get_max_osd());
+  if (!have_all_weights) {
+    return PaxosService::should_propose(delay);
+  }
+
+  dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
+  osdmap.adjust_osd_weights(osd_weight, pending_inc);
+  delay = 0.0;
+  osd_weight.clear();
+  return true;
+}
+
+
+
+// ---------------------------
+// READs
+
+// Answer an MMonGetOSDMap request with the requested full and incremental
+// maps, clamped to the committed range and limited by
+// 'osd_map_message_max' (entries, shared by both kinds) and
+// 'osd_map_message_max_bytes'.  Maps are encoded with the session's
+// connection features when known, otherwise with the quorum's.
+// Always returns true.
+bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonGetOSDMap>();
+
+  uint64_t features = mon.get_quorum_con_features();
+  if (MonSession *session = op->get_session();
+      session && session->con_features) {
+    features = session->con_features;
+  }
+
+  dout(10) << __func__ << " " << *m << dendl;
+  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
+  const epoch_t first = get_first_committed();
+  const epoch_t last = osdmap.get_epoch();
+
+  // remaining budget, in map entries and in encoded bytes
+  int entries_left = g_conf()->osd_map_message_max;
+  ssize_t bytes_left = g_conf()->osd_map_message_max_bytes;
+
+  // full maps
+  const epoch_t full_end = std::min(last, m->get_full_last());
+  for (epoch_t e = std::max(first, m->get_full_first());
+       e <= full_end && entries_left > 0 && bytes_left > 0;
+       ++e, --entries_left) {
+    bufferlist& encoded = reply->maps[e];
+    int r = get_version_full(e, features, encoded);
+    ceph_assert(r >= 0);
+    bytes_left -= encoded.length();
+  }
+
+  // incrementals
+  const epoch_t inc_end = std::min(last, m->get_inc_last());
+  for (epoch_t e = std::max(first, m->get_inc_first());
+       e <= inc_end && entries_left > 0 && bytes_left > 0;
+       ++e, --entries_left) {
+    bufferlist& encoded = reply->incremental_maps[e];
+    int r = get_version(e, features, encoded);
+    ceph_assert(r >= 0);
+    bytes_left -= encoded.length();
+  }
+
+  reply->oldest_map = first;
+  reply->newest_map = last;
+  mon.send_reply(op, reply);
+  return true;
+}
+
+
+// ---------------------------
+// UPDATEs
+
+// failure --
+
+/**
+ * Check whether a message has to be rejected because of where it came from.
+ *
+ * @param op    the request; its session supplies the caps that are checked
+ * @param fsid  cluster fsid carried by the message
+ * @return true if the message must NOT be processed (no session, the
+ *         session lacks "osd" MON_CAP_X, or @p fsid differs from the
+ *         monmap's fsid); false if the source is acceptable.
+ */
+bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
+  MonSession *sess = op->get_session();
+  if (sess == nullptr) {
+    return true;
+  }
+
+  // permissions
+  const bool capable = sess->is_capable("osd", MON_CAP_X);
+  if (!capable) {
+    dout(0) << "got MOSDFailure from entity with insufficient caps "
+            << sess->caps << dendl;
+    return true;
+  }
+
+  // must belong to this cluster
+  if (fsid != mon.monmap->fsid) {
+    dout(0) << "check_source: on fsid " << fsid
+            << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+
+  return false;
+}
+
+
+/**
+ * Screen an incoming MOSDFailure report.
+ *
+ * Returns true (after mon.no_reply()) when the report is dropped here: the
+ * sender fails check_source(); the reporting osd is unknown, has different
+ * addresses, or is itself down while reporting a failure; the target is
+ * already down or has different addresses than the report names; the target
+ * came up after the report's epoch; or can_mark_down() refuses.  Several of
+ * those paths also send the sender incremental maps starting at its
+ * epoch + 1.
+ *
+ * Returns false when the report looks new and still needs to be applied
+ * (presumably by prepare_failure()).
+ */
+bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFailure>();
+  // who is target_osd
+  int badboy = m->get_target_osd();
+
+  // check permissions
+  // (check_source() returns true when the message must be rejected)
+  if (check_source(op, m->fsid))
+    goto didit;
+
+  // first, verify the reporting host is valid
+  if (m->get_orig_source().is_osd()) {
+    int from = m->get_orig_source().num();
+    // reject reporters that are unknown, have different addresses than the
+    // map records, or are down while claiming somebody else failed
+    if (!osdmap.exists(from) ||
+	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
+	(osdmap.is_down(from) && m->if_osd_failed())) {
+      dout(5) << "preprocess_failure from dead osd." << from
+	      << ", ignoring" << dendl;
+      send_incremental(op, m->get_epoch()+1);
+      goto didit;
+    }
+  }
+
+
+  // weird?
+  // the target is already down in the committed map
+  if (osdmap.is_down(badboy)) {
+    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
+	    << " " << m->get_target_addrs()
+	    << ", from " << m->get_orig_source() << dendl;
+    // only send maps if the sender is behind
+    if (m->get_epoch() < osdmap.get_epoch())
+      send_incremental(op, m->get_epoch()+1);
+    goto didit;
+  }
+  // the report names different addresses than the map has for this osd id
+  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
+    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
+	    << " " << m->get_target_addrs()
+	    << " != map's " << osdmap.get_addrs(badboy)
+	    << ", from " << m->get_orig_source() << dendl;
+    if (m->get_epoch() < osdmap.get_epoch())
+      send_incremental(op, m->get_epoch()+1);
+    goto didit;
+  }
+
+  // already reported?
+  // (is_down() was already handled by the "weird?" check above, so only the
+  // up_from test can actually fire here: the osd came up after the epoch
+  // the report was made in)
+  if (osdmap.is_down(badboy) ||
+      osdmap.get_up_from(badboy) > m->get_epoch()) {
+    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
+	    << " " << m->get_target_addrs()
+	    << ", from " << m->get_orig_source() << dendl;
+    if (m->get_epoch() < osdmap.get_epoch())
+      send_incremental(op, m->get_epoch()+1);
+    goto didit;
+  }
+
+  // nodown flag or min-up-ratio guard (see can_mark_down())
+  if (!can_mark_down(badboy)) {
+    dout(5) << "preprocess_failure ignoring report of osd."
+	    << m->get_target_osd() << " " << m->get_target_addrs()
+	    << " from " << m->get_orig_source() << dendl;
+    goto didit;
+  }
+
+  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
+	   << " " << m->get_target_addrs()
+	   << ", from " << m->get_orig_source() << dendl;
+  return false;
+
+  // common exit for every dropped report: no reply goes back for the op
+ didit:
+  mon.no_reply(op);
+  return true;
+}
+
+/**
+ * Completion that acknowledges an MOSDMarkMeDown request.
+ *
+ * On r == 0 it replies to the op with a new MOSDMarkMeDown echoing the
+ * request's fsid, target osd, target addrs and epoch; on -EAGAIN it hands
+ * the op back to OSDMonitor::dispatch(); any other result aborts.
+ */
+class C_AckMarkedDown : public C_MonOp {
+  OSDMonitor *osdmon;
+
+public:
+  C_AckMarkedDown(OSDMonitor *osdmon, MonOpRequestRef op)
+    : C_MonOp(op),
+      osdmon(osdmon)
+  {}
+
+  ~C_AckMarkedDown() override = default;
+
+  void _finish(int r) override {
+    switch (r) {
+    case 0: {
+      auto req = op->get_req<MOSDMarkMeDown>();
+      // ACK itself does not request an ack
+      auto *ack = new MOSDMarkMeDown(req->fsid,
+                                     req->target_osd,
+                                     req->target_addrs,
+                                     req->get_epoch(),
+                                     false);
+      osdmon->mon.send_reply(op, ack);
+      break;
+    }
+    case -EAGAIN:
+      osdmon->dispatch(op);
+      break;
+    default:
+      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
+    }
+  }
+};
+
+/**
+ * Screen an MOSDMarkMeDown request.
+ *
+ * The request is finished here (return true) when check_source() rejects
+ * it, the sender is not an osd, the target does not exist / is already down
+ * / has other addresses than the message claims, or can_mark_down()
+ * refuses.  In all of those cases an ack is still sent if the sender asked
+ * for one.  Returns false when the request should go on to be applied.
+ */
+bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDown>();
+  const int from = m->target_osd;
+
+  // permissions first, then: is the sender an osd at all?
+  bool done_here =
+    check_source(op, m->fsid) ||
+    !m->get_orig_source().is_osd();
+
+  // the osd must exist, be up, and match the addresses it claims
+  if (!done_here &&
+      (!osdmap.exists(from) ||
+       osdmap.is_down(from) ||
+       osdmap.get_addrs(from) != m->target_addrs)) {
+    dout(5) << "preprocess_mark_me_down from dead osd."
+            << from << ", ignoring" << dendl;
+    send_incremental(op, m->get_epoch()+1);
+    done_here = true;
+  }
+
+  // no down might be set
+  done_here = done_here || !can_mark_down(from);
+
+  if (!done_here) {
+    dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
+             << " " << m->target_addrs << dendl;
+    return false;
+  }
+
+  // nothing will be proposed, so any requested ack is completed right now
+  if (m->request_ack) {
+    Context *ack(new C_AckMarkedDown(this, op));
+    ack->complete(0);
+  }
+  return true;
+}
+
+/**
+ * Queue the down-marking requested by an MOSDMarkMeDown message.
+ *
+ * Asserts that the target is up with the addresses named in the message,
+ * sets CEPH_OSD_UP in pending_inc.new_state for it, and, for a
+ * "down and dead" request, records the message epoch as dead_epoch in the
+ * pending xinfo.  If an ack was requested it is sent once the proposal
+ * finishes.
+ *
+ * @return always true.
+ */
+bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDown>();
+  const int target_osd = m->target_osd;
+
+  ceph_assert(osdmap.is_up(target_osd));
+  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
+
+  const char *how = m->down_and_dead ? "down and dead" : "down";
+  mon.clog->info() << "osd." << target_osd << " marked itself " << how;
+  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+  if (m->down_and_dead) {
+    // seed the pending xinfo from the committed one before editing it
+    if (pending_inc.new_xinfo.find(target_osd) ==
+        pending_inc.new_xinfo.end()) {
+      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+    }
+    osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
+    xi.dead_epoch = m->get_epoch();
+  }
+
+  if (m->request_ack) {
+    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
+  }
+  return true;
+}
+
+/**
+ * Screen an MOSDMarkMeDead request.
+ *
+ * The request is dropped (mon.no_reply(), return true) when check_source()
+ * rejects it, the sender is not an osd, or the target osd does not exist
+ * or is not down; in the last case the sender is first sent incremental
+ * maps from its epoch + 1.  Returns false when the request should go on to
+ * be applied.
+ */
+bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDead>();
+  const int from = m->target_osd;
+
+  // permissions first, then: is the sender an osd at all?
+  bool drop =
+    check_source(op, m->fsid) ||
+    !m->get_orig_source().is_osd();
+
+  // only an existing, down osd can be marked dead
+  if (!drop && !(osdmap.exists(from) && osdmap.is_down(from))) {
+    dout(5) << __func__ << " from nonexistent or up osd." << from
+            << ", ignoring" << dendl;
+    send_incremental(op, m->get_epoch()+1);
+    drop = true;
+  }
+
+  if (!drop) {
+    return false;
+  }
+  mon.no_reply(op);
+  return true;
+}
+
+/**
+ * Record the dead_epoch announced by an MOSDMarkMeDead message.
+ *
+ * Asserts that the target is down, writes the message epoch into the
+ * pending xinfo's dead_epoch, and calls mon.no_reply() for the op once the
+ * proposal has finished successfully.
+ *
+ * @return always true.
+ */
+bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDMarkMeDead>();
+  const int target_osd = m->target_osd;
+
+  ceph_assert(osdmap.is_down(target_osd));
+
+  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
+                   << m->get_epoch();
+
+  // seed the pending xinfo from the committed one before editing it
+  if (pending_inc.new_xinfo.find(target_osd) == pending_inc.new_xinfo.end()) {
+    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+  }
+  osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
+  xi.dead_epoch = m->get_epoch();
+
+  auto *on_commit = new LambdaContext(
+    [op, this] (int r) {
+      // ignore on success; nothing is done for r < 0
+      if (r >= 0) {
+        mon.no_reply(op);
+      }
+    });
+  wait_for_finished_proposal(op, on_commit);
+  return true;
+}
+
+/**
+ * Tell whether osd @p i may be marked down right now.
+ *
+ * @return false if the osd carries the nodown flag, if the map has no osds,
+ *         or if the fraction of up osds (with pending down-markings already
+ *         subtracted) is below mon_osd_min_up_ratio; true otherwise.
+ */
+bool OSDMonitor::can_mark_down(int i)
+{
+  if (osdmap.is_nodown(i)) {
+    dout(5) << __func__ << " osd." << i << " is marked as nodown, "
+            << "will not mark it down" << dendl;
+    return false;
+  }
+
+  const int total = osdmap.get_num_osds();
+  if (total == 0) {
+    dout(5) << __func__ << " no osds" << dendl;
+    return false;
+  }
+
+  // up count as it will be once the pending incremental is applied
+  const int up =
+    osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
+  const float up_ratio = static_cast<float>(up) / static_cast<float>(total);
+  if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
+    dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
+            << g_conf()->mon_osd_min_up_ratio
+            << ", will not mark osd." << i << " down" << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Tell whether osd @p i may be marked up.
+ *
+ * @return false (and log it) if the osd carries the noup flag, else true.
+ */
+bool OSDMonitor::can_mark_up(int i)
+{
+  if (!osdmap.is_noup(i)) {
+    return true;
+  }
+
+  dout(5) << __func__ << " osd." << i << " is marked as noup, "
+          << "will not mark it up" << dendl;
+  return false;
+}
+
+/**
+ * Tell whether osd @p i may be marked out right now.
+ *
+ * @param i  osd id; used for the per-osd noout check and in log messages.
+ *           When it is negative the ratio message is logged without an
+ *           osd id.
+ * @return false if the osd carries the noout flag, if the map has no osds,
+ *         or if the fraction of in osds (with pending out-markings already
+ *         subtracted) is below mon_osd_min_in_ratio; true otherwise.
+ */
+bool OSDMonitor::can_mark_out(int i)
+{
+  if (osdmap.is_noout(i)) {
+    dout(5) << __func__ << " osd." << i << " is marked as noout, "
+            << "will not mark it out" << dendl;
+    return false;
+  }
+
+  const int total = osdmap.get_num_osds();
+  if (total == 0) {
+    dout(5) << __func__ << " no osds" << dendl;
+    return false;
+  }
+
+  // in count as it will be once the pending incremental is applied
+  const int in =
+    osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
+  const float in_ratio = static_cast<float>(in) / static_cast<float>(total);
+  if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
+    if (i < 0) {
+      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+              << g_conf()->mon_osd_min_in_ratio
+              << ", will not mark osds out" << dendl;
+    } else {
+      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
+              << g_conf()->mon_osd_min_in_ratio
+              << ", will not mark osd." << i << " out" << dendl;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Tell whether osd @p i may be marked in.
+ *
+ * @return false (and log it) if the osd carries the noin flag, else true.
+ */
+bool OSDMonitor::can_mark_in(int i)
+{
+  if (!osdmap.is_noin(i)) {
+    return true;
+  }
+
+  dout(5) << __func__ << " osd." << i << " is marked as noin, "
+          << "will not mark it in" << dendl;
+  return false;
+}
+
+/**
+ * Re-evaluate every osd that has outstanding failure reports.
+ *
+ * For each entry in failure_info: if the osd may be marked down and
+ * check_failure() reports a pending failure, remember that; otherwise drop
+ * the entry when is_failure_stale() says its reports are too old.
+ *
+ * @return true if check_failure() returned true for at least one osd.
+ */
+bool OSDMonitor::check_failures(utime_t now)
+{
+  bool found_failure = false;
+  for (auto it = failure_info.begin(); it != failure_info.end(); ) {
+    const int osd = it->first;
+    failure_info_t& info = it->second;
+
+    if (can_mark_down(osd) && check_failure(now, osd, info)) {
+      found_failure = true;
+    } else if (is_failure_stale(now, info)) {
+      dout(10) << " dropping stale failure_info for osd." << osd
+               << " from " << info.reporters.size() << " reporters"
+               << dendl;
+      it = failure_info.erase(it);
+      continue;
+    }
+    ++it;
+  }
+  return found_failure;
+}
+
+/**
+ * Compute how long @p target_osd must have been reported failed before it
+ * is marked down.
+ *
+ * The base is osd_heartbeat_grace.  Unless mon_osd_adjust_heartbeat_grace is
+ * off, two terms are added: the target's own laggy_interval *
+ * laggy_probability, decayed by how long it has been failed, and the
+ * average of the same product over the reporters (reporters missing from
+ * the osdmap contribute 0), each decayed by the time since that reporter's
+ * down_stamp.  The decay half-life is mon_osd_laggy_halflife.
+ *
+ * @param now         current time
+ * @param target_osd  osd the reports are about
+ * @param fi          its failure reports
+ * @return the grace period to compare against the time the osd has been
+ *         failed.
+ */
+utime_t OSDMonitor::get_grace_time(utime_t now,
+                                   int target_osd,
+                                   failure_info_t& fi) const
+{
+  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
+  if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
+    return orig_grace;
+  }
+  utime_t grace = orig_grace;
+  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+  double decay_k = ::log(.5) / halflife;
+
+  // scale grace period based on historical probability of 'lagginess'
+  // (false positive failures due to slowness).
+  const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
+  const utime_t failed_for = now - fi.get_failed_since();
+  double decay = exp((double)failed_for * decay_k);
+  dout(20) << " halflife " << halflife << " decay_k " << decay_k
+           << " failed_for " << failed_for << " decay " << decay << dendl;
+  double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+  grace += my_grace;
+
+  // consider the peers reporting a failure a proxy for a potential
+  // 'subcluster' over the overall cluster that is similarly
+  // laggy.  this is clearly not true in all cases, but will sometimes
+  // help us localize the grace correction to a subset of the system
+  // (say, a rack with a bad switch) that is unhappy.
+  double peer_grace = 0;
+  for (const auto& entry : fi.reporters) {
+    const auto reporter = entry.first;
+    if (osdmap.exists(reporter)) {
+      const osd_xinfo_t& peer_xi = osdmap.get_xinfo(reporter);
+      utime_t elapsed = now - peer_xi.down_stamp;
+      double peer_decay = exp((double)elapsed * decay_k);
+      peer_grace +=
+        peer_decay * (double)peer_xi.laggy_interval * peer_xi.laggy_probability;
+    }
+  }
+  // average over all reporters; with none left the sum is 0 and dividing
+  // would turn it into NaN (0.0 / 0.0), so leave it at 0.
+  if (!fi.reporters.empty()) {
+    peer_grace /= (double)fi.reporters.size();
+  }
+  grace += peer_grace;
+  dout(10) << " osd." << target_osd << " has "
+           << fi.reporters.size() << " reporters, "
+           << grace << " grace (" << orig_grace << " + " << my_grace
+           << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
+           << dendl;
+
+  return grace;
+}
+
+/**
+ * Decide whether the reports collected in @p fi are enough to mark
+ * @p target_osd down, and if so queue that in pending_inc.
+ *
+ * Reporters that no longer exist in the osdmap are dropped from @p fi.  The
+ * rest are grouped by their CRUSH ancestor at
+ * mon_osd_reporter_subtree_level (falling back to "osd.N" when a reporter
+ * has no such level).  The osd is failed only if at least
+ * mon_osd_min_down_reporters distinct groups reported it and it has been
+ * failed for at least get_grace_time().
+ *
+ * @param now         current time
+ * @param target_osd  osd the reports are about
+ * @param fi          its failure reports; may be pruned
+ * @return true if a down-marking for the osd is pending (already, or as of
+ *         this call); false otherwise.
+ */
+bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
+{
+  // already pending failure?
+  if (auto pending = pending_inc.new_state.find(target_osd);
+      pending != pending_inc.new_state.end() &&
+      (pending->second & CEPH_OSD_UP)) {
+    dout(10) << " already pending failure" << dendl;
+    return true;
+  }
+
+  set<string> reporters_by_subtree;
+  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
+  ceph_assert(fi.reporters.size());
+  for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
+    const auto reporter = p->first;
+    // Step past this entry before anything can remove it: cancel_report()
+    // appears to erase the reporter itself (prepare_failure() tests
+    // fi.reporters.empty() right after calling it), which would leave an
+    // iterator to the entry dangling.  Assumes reporters is a node-based
+    // map, where erasing one element keeps other iterators valid.
+    ++p;
+
+    if (!osdmap.exists(reporter)) {
+      fi.cancel_report(reporter);
+      // no-op if cancel_report() already removed the entry
+      fi.reporters.erase(reporter);
+      continue;
+    }
+
+    // get the parent bucket whose type matches with "reporter_subtree_level".
+    // fall back to OSD if the level doesn't exist.
+    auto reporter_loc = osdmap.crush->get_full_location(reporter);
+    if (auto iter = reporter_loc.find(reporter_subtree_level);
+        iter == reporter_loc.end()) {
+      reporters_by_subtree.insert("osd." + to_string(reporter));
+    } else {
+      reporters_by_subtree.insert(iter->second);
+    }
+  }
+  if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
+    return false;
+  }
+  const utime_t failed_for = now - fi.get_failed_since();
+  const utime_t grace = get_grace_time(now, target_osd, fi);
+  if (failed_for >= grace) {
+    dout(1) << " we have enough reporters to mark osd." << target_osd
+            << " down" << dendl;
+    pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+    mon.clog->info() << "osd." << target_osd << " failed ("
+                     << osdmap.crush->get_full_location_ordered_string(
+                          target_osd)
+                     << ") ("
+                     << (int)reporters_by_subtree.size()
+                     << " reporters from different "
+                     << reporter_subtree_level << " after "
+                     << failed_for << " >= grace " << grace << ")";
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Tell whether the failure reports in @p fi are old enough to be forgotten.
+ *
+ * If it takes too long to either cancel the reports or mark the osd down,
+ * some reporters must have failed to cancel theirs, so the reports are
+ * treated as stale.
+ *
+ * @return true once the osd has been reported failed for at least
+ *         osd_heartbeat_grace + osd_heartbeat_stale.
+ */
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+  const utime_t failed_for = now - fi.get_failed_since();
+  const auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+  const auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+  const auto stale_after = grace + stale;
+  return failed_for >= stale_after;
+}
+
+/**
+ * Queue an immediate down-marking of @p target_osd, skipping the usual
+ * reporter-count and grace checks.
+ *
+ * Does nothing if pending_inc.new_state already has CEPH_OSD_UP set for the
+ * osd.  Otherwise sets it and records the pending epoch as dead_epoch in
+ * the pending xinfo.
+ *
+ * @param target_osd  osd to fail
+ * @param by          reporting osd; only used in the cluster log message
+ */
+void OSDMonitor::force_failure(int target_osd, int by)
+{
+  // already pending failure?
+  const auto pending = pending_inc.new_state.find(target_osd);
+  if (pending != pending_inc.new_state.end() &&
+      (pending->second & CEPH_OSD_UP)) {
+    dout(10) << " already pending failure" << dendl;
+    return;
+  }
+
+  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
+  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+  // seed the pending xinfo from the committed one before editing it
+  if (pending_inc.new_xinfo.find(target_osd) == pending_inc.new_xinfo.end()) {
+    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+  }
+  osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
+  xi.dead_epoch = pending_inc.epoch;
+
+  mon.clog->info() << "osd." << target_osd << " failed ("
+                   << osdmap.crush->get_full_location_ordered_string(target_osd)
+                   << ") (connection refused reported by osd." << by << ")";
+}
+
+/**
+ * Apply an MOSDFailure message: either record a new failure report or
+ * cancel an earlier one.
+ *
+ * Asserts that the target is up with the addresses named in the message
+ * and never replies to the op.  An "immediate" report goes straight to
+ * force_failure(); an ordinary report is added to failure_info and
+ * evaluated with check_failure(); a cancellation removes the reporter (and
+ * the whole failure_info entry once it has no reporters left).
+ *
+ * @return true for an immediate report, the result of check_failure() for
+ *         an ordinary report, false for a cancellation.
+ */
+bool OSDMonitor::prepare_failure(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFailure>();
+  dout(1) << "prepare_failure osd." << m->get_target_osd()
+          << " " << m->get_target_addrs()
+          << " from " << m->get_orig_source()
+          << " is reporting failure:" << m->if_osd_failed() << dendl;
+
+  const int target_osd = m->get_target_osd();
+  const int reporter = m->get_orig_source().num();
+  ceph_assert(osdmap.is_up(target_osd));
+  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());
+
+  mon.no_reply(op);
+
+  if (!m->if_osd_failed()) {
+    // remove the report
+    mon.clog->debug() << "osd." << m->get_target_osd()
+                      << " failure report canceled by "
+                      << m->get_orig_source();
+    auto entry = failure_info.find(target_osd);
+    if (entry == failure_info.end()) {
+      dout(10) << " no failure_info for osd." << target_osd << dendl;
+      return false;
+    }
+
+    failure_info_t& fi = entry->second;
+    fi.cancel_report(reporter);
+    if (fi.reporters.empty()) {
+      dout(10) << " removing last failure_info for osd." << target_osd
+               << dendl;
+      failure_info.erase(entry);
+    } else {
+      dout(10) << " failure_info for osd." << target_osd << " now "
+               << fi.reporters.size() << " reporters" << dendl;
+    }
+    return false;
+  }
+
+  // calculate failure time
+  utime_t now = ceph_clock_now();
+  utime_t failed_since =
+    m->get_recv_stamp() - utime_t(m->failed_for, 0);
+
+  if (m->is_immediate()) {
+    mon.clog->debug() << "osd." << m->get_target_osd()
+                      << " reported immediately failed by "
+                      << m->get_orig_source();
+    force_failure(target_osd, reporter);
+    return true;
+  }
+
+  // add a report
+  mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
+                    << m->get_orig_source();
+
+  failure_info_t& fi = failure_info[target_osd];
+  fi.add_report(reporter, failed_since, op);
+  return check_failure(now, target_osd, fi);
+}
+
+/**
+ * Drop the failure reports of every osd that is no longer up.
+ *
+ * For each such osd the queued report ops are taken out of failure_info,
+ * the entry is erased, and each reporter is sent the latest map (starting
+ * at the epoch of its report) with no further reply.
+ */
+void OSDMonitor::process_failures()
+{
+  for (auto p = failure_info.begin(); p != failure_info.end(); ) {
+    if (osdmap.is_up(p->first)) {
+      ++p;
+      continue;
+    }
+
+    dout(10) << "process_failures osd." << p->first << dendl;
+    list<MonOpRequestRef> reports;
+    p->second.take_report_messages(reports);
+    p = failure_info.erase(p);
+
+    while (!reports.empty()) {
+      MonOpRequestRef o = reports.front();
+      reports.pop_front();
+      if (!o) {
+        continue;
+      }
+      o->mark_event(__func__);
+      MOSDFailure *m = o->get_req<MOSDFailure>();
+      send_latest(o, m->get_epoch());
+      mon.no_reply(o);
+    }
+  }
+}
+
+/**
+ * Move the report ops of every tracked osd into @p ls and forget all
+ * failure_info.
+ *
+ * @param ls  list the ops are handed to via take_report_messages()
+ */
+void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
+{
+  dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
+
+  for (auto& entry : failure_info) {
+    entry.second.take_report_messages(ls);
+  }
+  failure_info.clear();
+}
+
+/**
+ * @return mon_osd_laggy_halflife scaled by a fixed factor of 48.
+ */
+int OSDMonitor::get_grace_interval_threshold()
+{
+  // Scale the halflife period (default: 1_hr) by
+  // a factor (48) to calculate the threshold.
+  constexpr int grace_threshold_factor = 48;
+  const int halflife = g_conf()->mon_osd_laggy_halflife;
+  return halflife * grace_threshold_factor;
+}
+
+/**
+ * @param last_failed_interval  interval to test, in the same unit as
+ *                              get_grace_interval_threshold()
+ * @return true (and log it) if @p last_failed_interval is strictly greater
+ *         than get_grace_interval_threshold().
+ */
+bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
+{
+  const int grace_interval_threshold_secs = get_grace_interval_threshold();
+  const bool exceeded = last_failed_interval > grace_interval_threshold_secs;
+  if (exceeded) {
+    dout(1) << " last_failed_interval " << last_failed_interval
+            << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
+            << dendl;
+  }
+  return exceeded;
+}
+
+/**
+ * Reset the laggy statistics of @p target_osd in the pending incremental.
+ *
+ * The pending xinfo is seeded from the committed one if needed; then
+ * down_stamp is set to pending_inc.modified and laggy_probability and
+ * laggy_interval are zeroed.
+ */
+void OSDMonitor::set_default_laggy_params(int target_osd)
+{
+  if (pending_inc.new_xinfo.find(target_osd) == pending_inc.new_xinfo.end()) {
+    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+  }
+
+  osd_xinfo_t& pending_xi = pending_inc.new_xinfo[target_osd];
+  pending_xi.down_stamp = pending_inc.modified;
+  pending_xi.laggy_probability = 0.0;
+  pending_xi.laggy_interval = 0;
+  dout(20) << __func__ << " reset laggy, now xi " << pending_xi << dendl;
+}
+
+
+// boot --
+
+/**
+ * Screen an MOSDBoot message before it is allowed to change the map.
+ *
+ * Returns true when the message is fully handled here:
+ *  - it is silently ignored (no session, insufficient caps, wrong cluster
+ *    fsid, blank address, missing required features, the release-span,
+ *    pglog_hardlimit or stretch-mode feature checks fail, or the osd's fsid
+ *    clashes with the uuid recorded in the map);
+ *  - it is a duplicate boot from an osd that is already up at the same
+ *    addresses (handled via _booted());
+ *  - it predates the osd's last up_from, or the osd is flagged noup (the
+ *    sender is sent the latest map in both cases).
+ *
+ * Returns false when the boot looks legitimate and still needs to be
+ * applied.
+ */
+bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDBoot>();
+  int from = m->get_orig_source_inst().name.num();
+
+  // check permissions, ignore if failed (no response expected)
+  MonSession *session = op->get_session();
+  if (!session)
+    goto ignore;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "got preprocess_boot message from entity with insufficient caps"
+	    << session->caps << dendl;
+    goto ignore;
+  }
+
+  // the osd must belong to this cluster
+  if (m->sb.cluster_fsid != mon.monmap->fsid) {
+    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
+	    << " != " << mon.monmap->fsid << dendl;
+    goto ignore;
+  }
+
+  if (m->get_orig_source_inst().addr.is_blank_ip()) {
+    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
+    goto ignore;
+  }
+
+  ceph_assert(m->get_orig_source_inst().name.is_osd());
+
+  // force all osds to have gone through luminous prior to upgrade to nautilus
+  {
+    // collect the names of all required feature bits the osd is missing so
+    // they can be reported in a single cluster log line
+    vector<string> missing;
+    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
+      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
+    }
+    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
+      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
+    }
+    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
+      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
+    }
+    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
+      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
+    }
+
+    if (!missing.empty()) {
+      using std::experimental::make_ostream_joiner;
+
+      // join the missing feature names with ';'
+      stringstream ss;
+      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));
+
+      mon.clog->info() << "disallowing boot of OSD "
+			<< m->get_orig_source_inst()
+			<< " because the osd lacks " << ss.str();
+      goto ignore;
+    }
+  }
+
+  // make sure osd versions do not span more than 3 releases
+  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
+      osdmap.require_osd_release < ceph_release_t::mimic) {
+    mon.clog->info() << "disallowing boot of octopus+ OSD "
+		      << m->get_orig_source_inst()
+		      << " because require_osd_release < mimic";
+    goto ignore;
+  }
+  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
+      osdmap.require_osd_release < ceph_release_t::nautilus) {
+    mon.clog->info() << "disallowing boot of pacific+ OSD "
+		      << m->get_orig_source_inst()
+		      << " because require_osd_release < nautilus";
+    goto ignore;
+  }
+
+  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+  // we are reusing a jewel feature bit that was retired in luminous.
+  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
+      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
+      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
+    mon.clog->info() << "disallowing boot of OSD "
+		      << m->get_orig_source_inst()
+		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
+    goto ignore;
+  }
+
+  // with stretch mode on, every booting osd has to support it
+  if (osdmap.stretch_mode_enabled &&
+      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
+    mon.clog->info() << "disallowing boot of OSD "
+		      << m->get_orig_source_inst()
+		      << " because stretch mode is on and OSD lacks support";
+    goto ignore;
+  }
+
+  // already booted?
+  // (up in the map with the same public and cluster addresses)
+  if (osdmap.is_up(from) &&
+      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
+      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
+    // yup.
+    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
+	    << " " << m->get_orig_source_addrs()
+	    << " =~ " << osdmap.get_addrs(from) << dendl;
+    _booted(op, false);
+    return true;
+  }
+
+  // the id is taken by an osd with a different (non-zero) uuid
+  if (osdmap.exists(from) &&
+      !osdmap.get_uuid(from).is_zero() &&
+      osdmap.get_uuid(from) != m->sb.osd_fsid) {
+    dout(7) << __func__ << " from " << m->get_orig_source_inst()
+            << " clashes with existing osd: different fsid"
+            << " (ours: " << osdmap.get_uuid(from)
+            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
+    goto ignore;
+  }
+
+  // boot message is older than the osd's last up_from and comes from its
+  // most recent addresses: stale, just bring the sender up to date
+  if (osdmap.exists(from) &&
+      osdmap.get_info(from).up_from > m->version &&
+      osdmap.get_most_recent_addrs(from).legacy_equals(
+	m->get_orig_source_addrs())) {
+    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
+    send_latest(op, m->sb.current_epoch+1);
+    return true;
+  }
+
+  // noup?
+  if (!can_mark_up(from)) {
+    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
+    send_latest(op, m->sb.current_epoch+1);
+    return true;
+  }
+
+  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
+  return false;
+
+  // rejected boots end here; nothing is sent back from this function
+ ignore:
+  return true;
+}
+
+/**
+ * Apply an MOSDBoot message to pending_inc.
+ *
+ * Three cases:
+ *  - the osd is still up in the committed map: queue a down-marking for
+ *    the old instance and retry the message once the proposal finishes;
+ *  - an up-marking for it is already pending: just retry later;
+ *  - otherwise: queue its addresses, uuid, metadata, last-clean interval,
+ *    laggy statistics, features and possibly an in-weight, then run
+ *    C_Booted once the proposal finishes.
+ *
+ * @return false if the osd id is >= max_osd (nothing is queued), true
+ *         otherwise.
+ */
+bool OSDMonitor::prepare_boot(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDBoot>();
+  dout(7) << __func__ << " from " << m->get_source()
+	  << " sb " << m->sb
+	  << " client_addrs" << m->get_connection()->get_peer_addrs()
+	  << " cluster_addrs " << m->cluster_addrs
+	  << " hb_back_addrs " << m->hb_back_addrs
+	  << " hb_front_addrs " << m->hb_front_addrs
+	  << dendl;
+
+  ceph_assert(m->get_orig_source().is_osd());
+  int from = m->get_orig_source().num();
+
+  // does this osd exist?
+  if (from >= osdmap.get_max_osd()) {
+    dout(1) << "boot from osd." << from << " >= max_osd "
+	    << osdmap.get_max_osd() << dendl;
+    return false;
+  }
+
+  // state the osd has before this boot: committed state (or NEW if it is
+  // not in the map yet) with any pending new_state bits XORed in
+  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
+  if (pending_inc.new_state.count(from))
+    oldstate ^= pending_inc.new_state[from];
+
+  // already up?  mark down first?
+  if (osdmap.is_up(from)) {
+    dout(7) << __func__ << " was up, first marking down osd." << from << " "
+	    << osdmap.get_addrs(from) << dendl;
+    // preprocess should have caught these;  if not, assert.
+    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
+		  m->get_orig_source_addrs()) ||
+		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
+    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
+
+    // only queue the down-marking if the UP bit is not already pending
+    if (pending_inc.new_state.count(from) == 0 ||
+	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
+      // mark previous guy down
+      pending_inc.new_state[from] = CEPH_OSD_UP;
+    }
+    // re-run this message after the down-marking has been proposed
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+  } else if (pending_inc.new_up_client.count(from)) {
+    // already prepared, just wait
+    dout(7) << __func__ << " already prepared, waiting on "
+	    << m->get_orig_source_addr() << dendl;
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+  } else {
+    // mark new guy up.
+    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
+    pending_inc.new_up_cluster[from] = m->cluster_addrs;
+    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
+    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;
+
+    down_pending_out.erase(from);  // if any
+
+    // remember a non-zero superblock weight; should_propose() applies the
+    // collected osd_weight entries
+    if (m->sb.weight)
+      osd_weight[from] = m->sb.weight;
+
+    // set uuid?
+    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
+	     << dendl;
+    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
+      // preprocess should have caught this;  if not, assert.
+      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
+      pending_inc.new_uuid[from] = m->sb.osd_fsid;
+    }
+
+    // fresh osd?
+    // (its superblock has no maps at all although the id already exists)
+    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
+      const osd_info_t& i = osdmap.get_info(from);
+      if (i.up_from > i.lost_at) {
+	dout(10) << " fresh osd; marking lost_at too" << dendl;
+	pending_inc.new_lost[from] = osdmap.get_epoch();
+      }
+    }
+
+    // metadata
+    // (encoded copy is staged; any pending removal for this osd is undone)
+    bufferlist osd_metadata;
+    encode(m->metadata, osd_metadata);
+    pending_metadata[from] = osd_metadata;
+    pending_metadata_rm.erase(from);
+
+    // adjust last clean unmount epoch?
+    // only when the superblock's [mounted, clean_thru) interval is newer
+    // than the one recorded in the map
+    const osd_info_t& info = osdmap.get_info(from);
+    dout(10) << " old osd_info: " << info << dendl;
+    if (m->sb.mounted > info.last_clean_begin ||
+	(m->sb.mounted == info.last_clean_begin &&
+	 m->sb.clean_thru > info.last_clean_end)) {
+      epoch_t begin = m->sb.mounted;
+      epoch_t end = m->sb.clean_thru;
+
+      dout(10) << __func__ << " osd." << from << " last_clean_interval "
+	       << "[" << info.last_clean_begin << "," << info.last_clean_end
+	       << ") -> [" << begin << "-" << end << ")"
+	       << dendl;
+      pending_inc.new_last_clean_interval[from] =
+	pair<epoch_t,epoch_t>(begin, end);
+    }
+
+    // seed the pending xinfo from the committed one before editing it
+    if (pending_inc.new_xinfo.count(from) == 0)
+      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
+    if (m->boot_epoch == 0) {
+      // boot_epoch == 0 is treated as "not laggy": decay both laggy
+      // statistics by (1 - mon_osd_laggy_weight)
+      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
+      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
+      dout(10) << " not laggy, new xi " << xi << dendl;
+    } else {
+      // laggy: blend the observed down interval (capped by
+      // mon_osd_laggy_max_interval when that is non-zero) and a
+      // probability of 1 into the running averages, weighted by
+      // mon_osd_laggy_weight
+      if (xi.down_stamp.sec()) {
+        int interval = ceph_clock_now().sec() -
+	  xi.down_stamp.sec();
+        if (g_conf()->mon_osd_laggy_max_interval &&
+	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
+          interval =  g_conf()->mon_osd_laggy_max_interval;
+        }
+        xi.laggy_interval =
+	  interval * g_conf()->mon_osd_laggy_weight +
+	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
+      }
+      xi.laggy_probability =
+	g_conf()->mon_osd_laggy_weight +
+	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
+      dout(10) << " laggy, now xi " << xi << dendl;
+    }
+
+    // set features shared by the osd
+    // (fall back to the connection's features if the message carries none)
+    if (m->osd_features)
+      xi.features = m->osd_features;
+    else
+      xi.features = m->get_connection()->get_features();
+
+    // mark in?
+    // any one of the three auto-mark-in options can trigger this, subject
+    // to the noin check in can_mark_in()
+    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
+	 (oldstate & CEPH_OSD_AUTOOUT)) ||
+	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
+	(g_conf()->mon_osd_auto_mark_in)) {
+      if (can_mark_in(from)) {
+	// restore the remembered weight if there is one, else fully in
+	if (xi.old_weight > 0) {
+	  pending_inc.new_weight[from] = xi.old_weight;
+	  xi.old_weight = 0;
+	} else {
+	  pending_inc.new_weight[from] = CEPH_OSD_IN;
+	}
+      } else {
+	dout(7) << __func__ << " NOIN set, will not mark in "
+		<< m->get_orig_source_addr() << dendl;
+      }
+    }
+
+    // wait
+    wait_for_finished_proposal(op, new C_Booted(this, op));
+  }
+  return true;
+}
+
+/**
+ * Finish handling an MOSDBoot: optionally note the boot in the cluster log
+ * and send the osd the latest map, starting at the epoch after the one in
+ * its superblock.
+ *
+ * @param op     the boot request
+ * @param logit  whether to write the "boot" line to the cluster log
+ */
+void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDBoot>();
+  const auto& sb = m->sb;
+  dout(7) << "_booted " << m->get_orig_source_inst()
+          << " w " << sb.weight << " from " << sb.current_epoch << dendl;
+
+  if (logit) {
+    mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
+                     << " boot";
+  }
+
+  send_latest(op, sb.current_epoch+1);
+}
+
+
+// -------------
+// full
+
+/**
+ * Screen an MOSDFull message.
+ *
+ * Returns true when the message is fully handled here: it is ignored (no
+ * session, insufficient caps, unknown osd, or sent by an instance the map
+ * no longer considers current), or the osd's fullness bits in the
+ * committed map already equal the requested state (in which case
+ * _reply_map() is called).  Returns false when the requested state differs
+ * and still needs to be applied.
+ */
+bool OSDMonitor::preprocess_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFull>();
+  int from = m->get_orig_source().num();
+  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+
+  // check permissions, ignore if failed
+  MonSession *session = op->get_session();
+  if (!session)
+    return true;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "MOSDFull from entity with insufficient privileges:"
+            << session->caps << dendl;
+    return true;
+  }
+
+  // ignore a full message from the osd instance that already went down
+  if (!osdmap.exists(from)) {
+    dout(7) << __func__ << " ignoring full message from nonexistent "
+            << m->get_orig_source_inst() << dendl;
+    return true;
+  }
+  if ((!osdmap.is_up(from) &&
+       osdmap.get_most_recent_addrs(from).legacy_equals(
+         m->get_orig_source_addrs())) ||
+      (osdmap.is_up(from) &&
+       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
+    dout(7) << __func__ << " ignoring full message from down "
+            << m->get_orig_source_inst() << dendl;
+    return true;
+  }
+
+  // nothing to change: the committed fullness bits already match
+  if ((osdmap.get_state(from) & mask) == m->state) {
+    set<string> state;
+    OSDMap::calc_state_set(osdmap.get_state(from), state);
+    dout(7) << __func__ << " state already " << state << " for osd." << from
+            << " " << m->get_orig_source_inst() << dendl;
+    _reply_map(op, m->version);
+    return true;
+  }
+
+  // log the state the osd is asking for (same masking as prepare_full()),
+  // not the state it currently has in the map
+  set<string> want_state;
+  OSDMap::calc_state_set(m->state & mask, want_state);
+  dout(10) << __func__ << " want state " << want_state << " for osd." << from
+           << " " << m->get_orig_source_inst() << dendl;
+  return false;
+}
+
+/**
+ * Apply an MOSDFull message to pending_inc.
+ *
+ * The osd's effective fullness bits (committed state with any pending
+ * new_state XORed in, masked to NEARFULL | BACKFILLFULL | FULL) are
+ * compared with the requested ones.  If they differ, the fullness part of
+ * pending new_state is replaced with the bits that turn the committed
+ * state into the requested one.  Either way the sender gets a map reply
+ * once the proposal finishes.
+ *
+ * @return always true.
+ */
+bool OSDMonitor::prepare_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDFull>();
+  const int from = m->get_orig_source().num();
+
+  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
+  const unsigned want_state = m->state & mask;  // safety first
+
+  // effective state = committed state with pending flips applied
+  unsigned cur_state = osdmap.get_state(from);
+  const auto pending = pending_inc.new_state.find(from);
+  if (pending != pending_inc.new_state.end()) {
+    cur_state ^= pending->second;
+  }
+  cur_state &= mask;
+
+  set<string> want_state_set, cur_state_set;
+  OSDMap::calc_state_set(want_state, want_state_set);
+  OSDMap::calc_state_set(cur_state, cur_state_set);
+
+  if (cur_state == want_state) {
+    dout(7) << __func__ << " osd." << from << " " << cur_state_set
+            << " = wanted " << want_state_set << ", just waiting" << dendl;
+  } else {
+    // operator[] creates a zeroed entry when nothing is pending yet
+    auto& flips = pending_inc.new_state[from];
+    flips &= ~mask;
+    flips |= (osdmap.get_state(from) & mask) ^ want_state;
+    dout(7) << __func__ << " osd." << from << " " << cur_state_set
+            << " -> " << want_state_set << dendl;
+  }
+
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+  return true;
+}
+
+// -------------
+// alive
+
+// Read-only first pass over an MOSDAlive.
+//
+// Returns true (handled) when the sender has no session, lacks the "osd"
+// MON_CAP_X capability, is not the up osd at the addrs recorded in the
+// osdmap, or already has an up_thru at least as new as requested (the last
+// case is answered through _reply_map()).  Returns false when the request
+// must go on to prepare_alive().
+bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDAlive>();
+  const int osd = m->get_orig_source().num();
+
+  // check permissions, ignore if failed
+  MonSession *session = op->get_session();
+  if (!session) {
+    return true;
+  }
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
+            << session->caps << dendl;
+    return true;
+  }
+
+  const bool is_current_instance =
+    osdmap.is_up(osd) &&
+    osdmap.get_addrs(osd).legacy_equals(m->get_orig_source_addrs());
+  if (!is_current_instance) {
+    dout(7) << "preprocess_alive ignoring alive message from down "
+            << m->get_orig_source() << " " << m->get_orig_source_addrs()
+            << dendl;
+    return true;
+  }
+
+  if (osdmap.get_up_thru(osd) >= m->want) {
+    // yup.
+    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
+    _reply_map(op, m->version);
+    return true;
+  }
+
+  dout(10) << "preprocess_alive want up_thru " << m->want
+           << " from " << m->get_orig_source_inst() << dendl;
+  return false;
+}
+
+// Stage an up_thru update for the sender of an MOSDAlive and defer the map
+// reply until the proposal finishes.  Always returns true.
+bool OSDMonitor::prepare_alive(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto alive = op->get_req<MOSDAlive>();
+  const int osd = alive->get_orig_source().num();
+
+  if (0) {  // we probably don't care much about these
+    mon.clog->debug() << alive->get_orig_source_inst() << " alive";
+  }
+
+  dout(7) << "prepare_alive want up_thru " << alive->want << " have " << alive->version
+          << " from " << alive->get_orig_source_inst() << dendl;
+
+  // set to the latest map the OSD has
+  update_up_thru(osd, alive->version);
+
+  auto *on_commit = new C_ReplyMap(this, op, alive->version);
+  wait_for_finished_proposal(op, on_commit);
+  return true;
+}
+
+// Answer the requester with osdmaps via send_latest(op, e), after logging
+// the epoch and the request's origin.
+void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
+{
+  op->mark_osdmon_event(__func__);
+  const auto requester = op->get_req()->get_orig_source_inst();
+  dout(7) << "_reply_map " << e << " from " << requester << dendl;
+  send_latest(op, e);
+}
+
+// pg_created
+// Read-only first pass over an MOSDPGCreated.  The op is marked no-reply up
+// front.  Returns true (dropped) when there is no session or the session
+// lacks the "osd" MON_CAP_X capability; otherwise returns false so that the
+// message always continues to prepare_pg_created().
+bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto created = op->get_req<MOSDPGCreated>();
+  dout(10) << __func__ << " " << *created << dendl;
+
+  auto session = op->get_session();
+  mon.no_reply(op);
+
+  if (!session) {
+    dout(10) << __func__ << ": no monitor session!" << dendl;
+    return true;
+  }
+
+  const bool allowed = session->is_capable("osd", MON_CAP_X);
+  if (!allowed) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return true;
+  }
+
+  // always forward the "created!" to the leader
+  return false;
+}
+
+// Record the pgid reported by an MOSDPGCreated in pending_created_pgs.
+//
+// Returns false (nothing staged) when the sender is not an osd, is not up in
+// the osdmap, or its addrs do not match the osdmap's addrs for that id;
+// returns true once the pgid has been queued.
+bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGCreated>();
+  dout(10) << __func__ << " " << *m << dendl;
+  auto src = m->get_orig_source();
+  auto from = src.num();
+  if (!src.is_osd() ||
+      !mon.osdmon()->osdmap.is_up(from) ||
+      !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
+        m->get_orig_source_addrs())) {
+    // name the message type and the actual sender: the previous text talked
+    // about "stats" and ended in "osd." without an id
+    dout(1) << __func__ << " ignoring pg created message from non-active "
+            << src << dendl;
+    return false;
+  }
+  pending_created_pgs.push_back(m->pgid);
+  return true;
+}
+
+// Read-only first pass over an MOSDPGReadyToMerge.
+//
+// The message is dropped (op marked no-reply, returns true) when the sender
+// has no session or lacks the "osd" MON_CAP_X capability, the pool is not in
+// the osdmap, or the pgid is not the last pg of the pool with pg_num_pending
+// already lowered to or below it.  Returns false when the message must go on
+// to prepare_pg_ready_to_merge().
+bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGReadyToMerge>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  // common exit for every rejected message
+  const auto drop = [&] {
+    mon.no_reply(op);
+    return true;
+  };
+
+  auto session = op->get_session();
+  if (!session) {
+    dout(10) << __func__ << ": no monitor session!" << dendl;
+    return drop();
+  }
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return drop();
+  }
+
+  const pg_pool_t *pool = osdmap.get_pg_pool(m->pgid.pool());
+  if (!pool) {
+    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
+    return drop();
+  }
+  if (pool->get_pg_num() <= m->pgid.ps()) {
+    dout(20) << " pg_num " << pool->get_pg_num() << " already < " << m->pgid << dendl;
+    return drop();
+  }
+  if (pool->get_pg_num() != m->pgid.ps() + 1) {
+    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
+    return drop();
+  }
+  if (pool->get_pg_num_pending() > m->pgid.ps()) {
+    dout(20) << " pg_num_pending " << pool->get_pg_num_pending() << " > " << m->pgid << dendl;
+    return drop();
+  }
+  return false;
+}
+
+// Apply an MOSDPGReadyToMerge to the pending pool definition.
+//
+// Works on a copy of the pool taken from pending_inc.new_pools when present,
+// else from the committed osdmap.  If pg_num / pg_num_pending no longer match
+// the pgid, the op is retried after the current proposal.  Otherwise the pool
+// either has its pg_num decremented (m->ready) or has pg_num_pending reset to
+// pg_num, and the copy is stored back in pending_inc.new_pools.  Always
+// returns true.
+bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGReadyToMerge>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  const auto pool_id = m->pgid.pool();
+
+  pg_pool_t pool;
+  if (auto staged = pending_inc.new_pools.find(pool_id);
+      staged != pending_inc.new_pools.end()) {
+    pool = staged->second;
+  } else {
+    pool = *osdmap.get_pg_pool(pool_id);
+  }
+
+  const bool still_last_pg = pool.get_pg_num() == m->pgid.ps() + 1;
+  if (!still_last_pg || pool.get_pg_num_pending() > m->pgid.ps()) {
+    dout(10) << __func__
+             << " race with concurrent pg_num[_pending] update, will retry"
+             << dendl;
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+    return true;
+  }
+
+  if (!m->ready) {
+    // back off the merge attempt!
+    pool.set_pg_num_pending(pool.get_pg_num());
+  } else {
+    pool.dec_pg_num(m->pgid,
+                    pending_inc.epoch,
+                    m->source_version,
+                    m->target_version,
+                    m->last_epoch_started,
+                    m->last_epoch_clean);
+    pool.last_change = pending_inc.epoch;
+  }
+
+  // force pre-nautilus clients to resend their ops, since they
+  // don't understand pg_num_pending changes form a new interval
+  pool.last_force_op_resend_prenautilus = pending_inc.epoch;
+
+  pending_inc.new_pools[pool_id] = pool;
+
+  // test hook: with the configured probability, follow a successful merge
+  // with an injected "osd pool set ... pg_num_actual" command
+  const auto bounce_probability =
+    g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
+  const bool inject_bounce =
+    m->ready &&
+    bounce_probability > 0 &&
+    bounce_probability > static_cast<double>(rand() % 1000) / 1000.0;
+
+  if (!inject_bounce) {
+    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
+    return true;
+  }
+
+  derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
+  auto bounce = new MMonCommand(mon.monmap->get_fsid());
+  bounce->set_connection(m->get_connection());
+  bounce->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
+                  osdmap.get_pool_name(pool_id) +
+                  "\", \"var\": \"pg_num_actual\", \"val\": \"" +
+                  stringify(m->pgid.ps() + 1) + "\"}" };
+  MonOpRequestRef bounce_op =
+    mon.op_tracker.create_request<MonOpRequest>(bounce);
+  bounce_op->set_type_service();
+  wait_for_finished_proposal(op, new C_RetryMessage(this, bounce_op));
+  return true;
+}
+
+
+// -------------
+// pg_temp changes
+
+// Read-only first pass over an MOSDPGTemp.
+//
+// Returns false as soon as the message would change the committed osdmap
+// (or unconditionally when m->forced is set), so it goes on to
+// prepare_pgtemp().  Otherwise returns true: either the message is dropped
+// without a reply (no session, missing "osd" MON_CAP_X capability, sender is
+// not the up osd at the recorded addrs, or every entry was ignored) or the
+// sender is answered through _reply_map() because nothing would change.
+bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
+{
+  // record the op-tracker event like every other handler in this file
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGTemp>();
+  dout(10) << "preprocess_pgtemp " << *m << dendl;
+  mempool::osdmap::vector<int> empty;
+  int from = m->get_orig_source().num();
+  size_t ignore_cnt = 0;
+
+  // check caps
+  MonSession *session = op->get_session();
+  if (!session)
+    goto ignore;
+  if (!session->is_capable("osd", MON_CAP_X)) {
+    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
+            << session->caps << dendl;
+    goto ignore;
+  }
+
+  if (!osdmap.is_up(from) ||
+      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
+    dout(7) << "ignoring pgtemp message from down "
+            << m->get_orig_source() << " " << m->get_orig_source_addrs()
+            << dendl;
+    goto ignore;
+  }
+
+  if (m->forced) {
+    return false;
+  }
+
+  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
+    dout(20) << " " << p->first
+             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
+             << " -> " << p->second << dendl;
+
+    // does the pool exist?
+    if (!osdmap.have_pg_pool(p->first.pool())) {
+      /*
+       * 1. If the osdmap does not have the pool, it means the pool has been
+       *    removed in-between the osd sending this message and us handling it.
+       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
+       *    not exist in the pending either, as the osds would not send a
+       *    message about a pool they know nothing about (yet).
+       * 3. However, if the pool does exist in the pending, then it must be a
+       *    new pool, and not relevant to this message (see 1).
+       */
+      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+               << ": pool has been removed" << dendl;
+      ignore_cnt++;
+      continue;
+    }
+
+    int acting_primary = -1;
+    osdmap.pg_to_up_acting_osds(
+      p->first, nullptr, nullptr, nullptr, &acting_primary);
+    if (acting_primary != from) {
+      /* If the source isn't the primary based on the current osdmap, we know
+       * that the interval changed and that we can discard this message.
+       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
+       * which of two pg temp mappings on the same pg is more recent.
+       */
+      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
+               << ": primary has changed" << dendl;
+      ignore_cnt++;
+      continue;
+    }
+
+    // removal?
+    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
+                              osdmap.primary_temp->count(p->first)))
+      return false;
+    // change?
+    //  NOTE: we assume that this will clear pg_primary, so consider
+    //        an existing pg_primary field to imply a change
+    if (p->second.size() &&
+        (osdmap.pg_temp->count(p->first) == 0 ||
+         osdmap.pg_temp->get(p->first) != p->second ||
+         osdmap.primary_temp->count(p->first)))
+      return false;
+  }
+
+  // should we ignore all the pgs?  (also true for an empty pg_temp map)
+  if (ignore_cnt == m->pg_temp.size())
+    goto ignore;
+
+  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
+  _reply_map(op, m->map_epoch);
+  return true;
+
+ ignore:
+  mon.no_reply(op);
+  return true;
+}
+
+// Stage up_thru for osd `from` in pending_inc.new_up_thru, but only when it
+// is newer than the value already pending (if any) or else the committed one.
+void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
+{
+  // a pending value takes precedence over the committed one
+  epoch_t known = osdmap.get_up_thru(from);
+  if (auto staged = pending_inc.new_up_thru.find(from);
+      staged != pending_inc.new_up_thru.end()) {
+    known = staged->second;
+  }
+
+  if (up_thru <= known) {
+    return;
+  }
+  // set up_thru too, so the osd doesn't have to ask again
+  pending_inc.new_up_thru[from] = up_thru;
+}
+
+// Stage the pg_temp mappings carried by an MOSDPGTemp in pending_inc.
+//
+// Entries for pools that are pending removal or absent from the osdmap are
+// skipped.  For each staged entry, an existing (committed or pending)
+// primary_temp for the same pg is cleared by staging -1.  The sender's
+// up_thru is bumped to the message's map epoch and the map reply is deferred
+// until the proposal finishes.  Always returns true.
+bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MOSDPGTemp>();
+  const int from = m->get_orig_source().num();
+  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
+
+  for (const auto& [pgid, osds] : m->pg_temp) {
+    const uint64_t pool = pgid.pool();
+    if (pending_inc.old_pools.count(pool)) {
+      dout(10) << __func__ << " ignore " << pgid << " -> " << osds
+               << ": pool pending removal" << dendl;
+      continue;
+    }
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << __func__ << " ignore " << pgid << " -> " << osds
+               << ": pool has been removed" << dendl;
+      continue;
+    }
+
+    pending_inc.new_pg_temp[pgid] =
+      mempool::osdmap::vector<int>(osds.begin(), osds.end());
+
+    // unconditionally clear pg_primary (until this message can encode
+    // a change for that, too.. at which point we need to also fix
+    // preprocess_pg_temp)
+    const bool has_primary_temp =
+      osdmap.primary_temp->count(pgid) ||
+      pending_inc.new_primary_temp.count(pgid);
+    if (has_primary_temp) {
+      pending_inc.new_primary_temp[pgid] = -1;
+    }
+  }
+
+  // set up_thru too, so the osd doesn't have to ask again
+  update_up_thru(from, m->map_epoch);
+
+  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
+  return true;
+}
+
+
+// ---
+
+// Read-only first pass over an MRemoveSnaps.
+//
+// The op is marked no-reply up front.  Returns true without further action
+// when the sender has no session or is not allowed to run "osd pool rmsnap".
+// Returns false as soon as one listed snap (in a pool that exists) is newer
+// than the pool's snap_seq or is not yet recorded as removed, so the message
+// goes on to prepare_remove_snaps().  If every snap is already handled, the
+// snap list is echoed back to peers advertising SERVER_OCTOPUS and true is
+// returned.
+bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MRemoveSnaps>();
+  dout(7) << "preprocess_remove_snaps " << *m << dendl;
+
+  // check privilege, ignore if failed
+  MonSession *session = op->get_session();
+  mon.no_reply(op);
+  if (!session) {
+    return true;
+  }
+  const bool may_rmsnap = session->caps.is_capable(
+    cct,
+    session->entity_name,
+    "osd", "osd pool rmsnap", {}, true, true, false,
+    session->get_peer_socket_addr());
+  if (!may_rmsnap) {
+    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
+            << session->caps << dendl;
+    return true;
+  }
+
+  for (const auto& [pool, snaps] : m->snaps) {
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << " ignoring removed_snaps " << snaps
+               << " on non-existent pool " << pool << dendl;
+      continue;
+    }
+    const pg_pool_t *pi = osdmap.get_pg_pool(pool);
+    for (auto snap : snaps) {
+      if (snap > pi->get_snap_seq() ||
+          !_is_removed_snap(pool, snap)) {
+        return false;
+      }
+    }
+  }
+
+  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+    auto ack = make_message<MRemoveSnaps>();
+    ack->snaps = m->snaps;
+    mon.send_reply(op, ack.detach());
+  }
+  return true;
+}
+
+// Stage the snap removals requested by an MRemoveSnaps in pending_inc.
+//
+// A snap is skipped when its pool is not in the osdmap, when
+// _is_removed_snap() already reports it, or when it is already staged in
+// pending_inc (either in the pending pool's removed_snaps or in
+// new_removed_snaps).  For the rest, the pending pool is flagged
+// FLAG_SELFMANAGED_SNAPS, its snap_seq is raised if needed, its snap_epoch is
+// set to the pending epoch, and the snap is added to new_removed_snaps.
+// Peers advertising SERVER_OCTOPUS get the snap list echoed back once the
+// proposal finishes.  Always returns true.
+bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MRemoveSnaps>();
+  dout(7) << "prepare_remove_snaps " << *m << dendl;
+
+  for (auto& [pool, snaps] : m->snaps) {
+    if (!osdmap.have_pg_pool(pool)) {
+      dout(10) << " ignoring removed_snaps " << snaps
+               << " on non-existent pool " << pool << dendl;
+      continue;
+    }
+
+    pg_pool_t& committed_pool = osdmap.pools[pool];
+    for (auto s : snaps) {
+      if (_is_removed_snap(pool, s)) {
+        continue;
+      }
+      if (auto staged_pool = pending_inc.new_pools.find(pool);
+          staged_pool != pending_inc.new_pools.end() &&
+          staged_pool->second.removed_snaps.contains(s)) {
+        continue;
+      }
+      if (auto staged_snaps = pending_inc.new_removed_snaps.find(pool);
+          staged_snaps != pending_inc.new_removed_snaps.end() &&
+          staged_snaps->second.contains(s)) {
+        continue;
+      }
+
+      pg_pool_t *newpi = pending_inc.get_new_pool(pool, &committed_pool);
+      // the per-pool removed_snaps set is only maintained before octopus
+      if (osdmap.require_osd_release < ceph_release_t::octopus) {
+        newpi->removed_snaps.insert(s);
+        dout(10) << " pool " << pool << " removed_snaps added " << s
+                 << " (now " << newpi->removed_snaps << ")" << dendl;
+      }
+      newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
+      if (s > newpi->get_snap_seq()) {
+        dout(10) << " pool " << pool << " snap_seq "
+                 << newpi->get_snap_seq() << " -> " << s << dendl;
+        newpi->set_snap_seq(s);
+      }
+      newpi->set_snap_epoch(pending_inc.epoch);
+      dout(10) << " added pool " << pool << " snap " << s
+               << " to removed_snaps queue" << dendl;
+      pending_inc.new_removed_snaps[pool].insert(s);
+    }
+  }
+
+  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
+    auto ack = make_message<MRemoveSnaps>();
+    ack->snaps = m->snaps;
+    wait_for_finished_proposal(op, new C_ReplyOp(this, op, ack));
+  }
+
+  return true;
+}
+
+// Answer an MMonGetPurgedSnaps from the monitor store.
+//
+// Walks the "purged_epoch_<hex>" keys under OSD_SNAP_PREFIX, starting after
+// the key built from m->start, and collects the decoded per-pool interval
+// sets for every epoch up to m->last.  Collection also stops once the
+// estimated size of the reply exceeds a fixed cap.  The reply carries
+// m->start and the last epoch value parsed from a key (or m->last if none
+// was parsed).  Always returns true.
+bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonGetPurgedSnaps>();
+  dout(7) << __func__ << " " << *m << dendl;
+
+  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
+
+  // semi-arbitrary limit on the (estimated) message size.  The running total
+  // has to live outside the loop: it used to share a variable with the
+  // per-key sscanf() result and was therefore reset on every iteration, so
+  // the limit never accumulated.
+  constexpr size_t max_reply_bytes = 1048576;
+  size_t reply_bytes = 0;
+
+  string k = make_purged_snap_epoch_key(m->start);
+  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+  it->upper_bound(k);
+  unsigned long epoch = m->last;
+  while (it->valid()) {
+    const string key = it->key();
+    if (key.find("purged_epoch_") != 0) {
+      break;
+    }
+    if (sscanf(key.c_str(), "purged_epoch_%lx", &epoch) != 1) {
+      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
+    } else if (epoch > m->last) {
+      break;
+    } else {
+      bufferlist bl = it->value();
+      auto p = bl.cbegin();
+      auto &v = r[epoch];
+      try {
+        ceph::decode(v, p);
+      } catch (ceph::buffer::error& e) {
+        derr << __func__ << " unable to parse value for key '" << it->key()
+             << "': \n";
+        bl.hexdump(*_dout);
+        *_dout << dendl;
+      }
+      // rough per-epoch cost estimate: fixed overhead plus 16 bytes per pool
+      reply_bytes += 4 + v.size() * 16;
+    }
+    if (reply_bytes > max_reply_bytes) {
+      break;
+    }
+    it->next();
+  }
+
+  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
+  reply->purged_snaps.swap(r);
+  mon.send_reply(op, reply.detach());
+
+  return true;
+}
+
+// osd beacon
+// Read-only first pass over an MOSDBeacon.  The op is marked no-reply up
+// front.  Returns true (dropped) when there is no session or the session
+// lacks the "osd" MON_CAP_X capability; otherwise returns false so that the
+// beacon always continues to prepare_beacon().
+bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+
+  // check caps
+  auto session = op->get_session();
+  mon.no_reply(op);
+
+  if (!session) {
+    dout(10) << __func__ << " no monitor session!" << dendl;
+    return true;
+  }
+
+  const bool allowed = session->is_capable("osd", MON_CAP_X);
+  if (!allowed) {
+    derr << __func__ << " received from entity "
+         << "with insufficient privileges " << session->caps << dendl;
+    return true;
+  }
+
+  // Always forward the beacon to the leader, even if they are the same as
+  // the old one. The leader will mark as down osds that haven't sent
+  // beacon for a few minutes.
+  return false;
+}
+
+// Process an MOSDBeacon from an osd.
+//
+// Beacons from senders that are not the up osd at the recorded addrs are
+// ignored (returns false); if the sender is an osd that the map marks down,
+// it is first sent newer maps.  For accepted beacons the report time, report
+// interval and osd epoch are recorded, the reported pgs are fed to
+// last_epoch_clean, and a newer last_purged_snaps_scrub is staged in
+// pending_inc.new_xinfo.  Returns true only when such an xinfo change was
+// staged.
+bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  const auto beacon = op->get_req<MOSDBeacon>();
+  const auto src = beacon->get_orig_source();
+  dout(10) << __func__ << " " << *beacon
+           << " from " << src << dendl;
+  const int from = src.num();
+
+  const bool active =
+    src.is_osd() &&
+    osdmap.is_up(from) &&
+    osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs());
+  if (!active) {
+    if (src.is_osd() && !osdmap.is_up(from)) {
+      // share some new maps with this guy in case it may not be
+      // aware of its own deadness...
+      send_latest(op, beacon->version+1);
+    }
+    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
+    return false;
+  }
+
+  auto& report = last_osd_report[from];
+  report.first = ceph_clock_now();
+  report.second = beacon->osd_beacon_report_interval;
+  osd_epochs[from] = beacon->version;
+
+  for (const auto& pg : beacon->pgs) {
+    auto* pool = osdmap.get_pg_pool(pg.pool());
+    if (pool == nullptr) {
+      continue;
+    }
+    unsigned pg_num = pool->get_pg_num();
+    last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
+  }
+
+  const bool scrub_advanced =
+    osdmap.osd_xinfo[from].last_purged_snaps_scrub <
+    beacon->last_purged_snaps_scrub;
+  if (!scrub_advanced) {
+    return false;
+  }
+
+  // seed the pending xinfo from the committed one before touching it
+  if (pending_inc.new_xinfo.count(from) == 0) {
+    pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
+  }
+  pending_inc.new_xinfo[from].last_purged_snaps_scrub =
+    beacon->last_purged_snaps_scrub;
+  return true;
+}
+
+// ---------------
+// map helpers
+
+// Reply to `op` with osdmaps: a full map when start is 0, otherwise
+// incrementals beginning at `start`.
+void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
+{
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
+          << " start " << start << dendl;
+  if (start != 0) {
+    send_incremental(op, start);
+    return;
+  }
+  send_full(op);
+}
+
+
+// Allocate an MOSDMap holding the full map of the current osdmap epoch,
+// fetched for the given feature set.  oldest_map / newest_map are set to the
+// first committed version and the current epoch.  The caller receives the
+// raw pointer.
+MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
+{
+  MOSDMap *msg = new MOSDMap(mon.monmap->fsid, features);
+  const epoch_t latest = osdmap.get_epoch();
+  get_version_full(latest, features, msg->maps[latest]);
+  msg->oldest_map = get_first_committed();
+  msg->newest_map = osdmap.get_epoch();
+  return msg;
+}
+
+// Allocate an MOSDMap covering epochs [from, to] (epoch 0 excluded).
+//
+// For each epoch the incremental map is used when get_version() succeeds;
+// when it reports -ENOENT the full map for that epoch is used instead.  Any
+// other error, or a missing full map, aborts.  The caller receives the raw
+// pointer.
+MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
+{
+  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
+           << std::hex << features << std::dec << dendl;
+  MOSDMap *msg = new MOSDMap(mon.monmap->fsid, features);
+  msg->oldest_map = get_first_committed();
+  msg->newest_map = osdmap.get_epoch();
+
+  // walk newest to oldest
+  for (epoch_t e = to; e >= from && e > 0; e--) {
+    bufferlist bl;
+    const int err = get_version(e, features, bl);
+    if (err == 0) {
+      ceph_assert(bl.length());
+      dout(20) << "build_incremental    inc " << e << " "
+               << bl.length() << " bytes" << dendl;
+      msg->incremental_maps[e] = bl;
+      continue;
+    }
+
+    // no incremental for this epoch: fall back to the full map
+    ceph_assert(err == -ENOENT);
+    ceph_assert(!bl.length());
+    get_version_full(e, features, bl);
+    if (bl.length() == 0) {
+      ceph_abort();  // we should have all maps.
+    }
+    dout(20) << "build_incremental   full " << e << " "
+             << bl.length() << " bytes" << dendl;
+    msg->maps[e] = bl;
+  }
+  return msg;
+}
+
+// Reply to `op` with the latest full osdmap, built for the features of the
+// op's session.
+void OSDMonitor::send_full(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
+  MOSDMap *full = build_latest_full(op->get_session()->con_features);
+  mon.send_reply(op, full);
+}
+
+// Reply to `op` with incremental osdmaps starting at `first`.
+//
+// When the session has a proxy_con, an MRoute with send_osdmap_first set is
+// sent over that connection instead; otherwise the maps are sent from here
+// through the session-based overload.  Asserts that the op has a session.
+void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
+{
+  op->mark_osdmon_event(__func__);
+
+  MonSession *session = op->get_session();
+  ceph_assert(session);
+
+  if (!session->proxy_con) {
+    // do it ourselves
+    send_incremental(first, session, false, op);
+    return;
+  }
+
+  // oh, we can tell the other mon to do it
+  dout(10) << __func__ << " asking proxying mon to send_incremental from "
+           << first << dendl;
+  MRoute *route = new MRoute(session->proxy_tid, NULL);
+  route->send_osdmap_first = first;
+  session->proxy_con->send_message(route);
+  op->mark_event("reply: send routed send_osdmap_first reply");
+}
+
+// Send osdmaps from `first` up to the current epoch to `session`.
+//
+// - Epochs the session is already recorded as having (session->osd_epoch)
+//   are skipped.
+// - If `first` is older than the first committed version, a full map of the
+//   first committed version is sent as a base.
+// - Incrementals then follow in batches of at most osd_map_message_max
+//   epochs.
+// With a non-null `req` every message goes out as a reply to it and only one
+// message is sent in total; otherwise messages go to session->con, and only
+// one incremental batch is sent when `onetime` is set.  session->osd_epoch
+// is updated to the newest epoch sent.
+void OSDMonitor::send_incremental(epoch_t first,
+                                  MonSession *session,
+                                  bool onetime,
+                                  MonOpRequestRef req)
+{
+  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
+          << " to " << session->name << dendl;
+
+  // get feature of the peer
+  // use quorum_con_features, if it's an anonymous connection.
+  uint64_t features = session->con_features;
+  if (!features) {
+    features = mon.get_quorum_con_features();
+  }
+
+  if (first <= session->osd_epoch) {
+    dout(10) << __func__ << " " << session->name << " should already have epoch "
+             << session->osd_epoch << dendl;
+    first = session->osd_epoch + 1;
+  }
+
+  if (first < get_first_committed()) {
+    MOSDMap *base = new MOSDMap(osdmap.get_fsid(), features);
+    base->oldest_map = get_first_committed();
+    base->newest_map = osdmap.get_epoch();
+
+    first = get_first_committed();
+    bufferlist bl;
+    const int err = get_version_full(first, features, bl);
+    ceph_assert(err == 0);
+    ceph_assert(bl.length());
+    dout(20) << "send_incremental starting with base full "
+             << first << " " << bl.length() << " bytes" << dendl;
+    base->maps[first] = bl;
+
+    if (req) {
+      // a reply can carry only this one message
+      mon.send_reply(req, base);
+      session->osd_epoch = first;
+      return;
+    }
+    session->con->send_message(base);
+    session->osd_epoch = first;
+    first++;
+  }
+
+  while (first <= osdmap.get_epoch()) {
+    const epoch_t last =
+      std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
+                        osdmap.get_epoch());
+    MOSDMap *batch = build_incremental(first, last, features);
+
+    if (req) {
+      // send some maps.  it may not be all of them, but it will get them
+      // started.
+      mon.send_reply(req, batch);
+      session->osd_epoch = last;
+      break;
+    }
+
+    session->con->send_message(batch);
+    first = last + 1;
+    session->osd_epoch = last;
+    if (onetime) {
+      break;
+    }
+  }
+}
+
+// Fetch the incremental map for `ver`, using the quorum's connection
+// features.
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+  uint64_t quorum_features = mon.get_quorum_con_features();
+  return get_version(ver, quorum_features, bl);
+}
+
+// Re-encode, in place, the incremental osdmap held in `bl`, restricted to
+// the intersection of `features` and the features the incremental was
+// originally encoded with.  An embedded full map or crush map, if present,
+// is decoded and re-encoded as well.
+void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap::Incremental inc;
+  auto src = bl.cbegin();
+  inc.decode(src);
+
+  // always encode with subset of osdmap's canonical features
+  const uint64_t f = features & inc.encode_features;
+  dout(20) << __func__ << " " << inc.epoch << " with features " << f
+           << dendl;
+  bl.clear();
+
+  // embedded full map?
+  if (inc.fullmap.length()) {
+    OSDMap embedded;
+    embedded.decode(inc.fullmap);
+    inc.fullmap.clear();
+    embedded.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
+  }
+
+  // embedded crush map
+  if (inc.crush.length()) {
+    CrushWrapper crush;
+    auto crush_src = inc.crush.cbegin();
+    crush.decode(crush_src);
+    inc.crush.clear();
+    crush.encode(inc.crush, f);
+  }
+
+  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+// Re-encode, in place, the full osdmap held in `bl`, restricted to the
+// intersection of `features` and the map's own encoding features.
+void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
+{
+  OSDMap decoded;
+  auto src = bl.cbegin();
+  decoded.decode(src);
+
+  // always encode with subset of osdmap's canonical features
+  const uint64_t f = features & decoded.get_encoding_features();
+  dout(20) << __func__ << " " << decoded.get_epoch() << " with features " << f
+           << dendl;
+
+  bl.clear();
+  decoded.encode(bl, f | CEPH_FEATURE_RESERVED);
+}
+
+// Fetch the incremental map for `ver` encoded for `features`.
+//
+// Served from inc_osd_cache when present under (ver, significant features).
+// Otherwise the stored version is loaded, re-encoded if the requested
+// significant features differ from the quorum's, and added to the cache.
+// Returns 0 on success or the negative error from PaxosService::get_version.
+int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
+{
+  const uint64_t wanted_sig = OSDMap::get_significant_features(features);
+  if (inc_osd_cache.lookup({ver, wanted_sig}, &bl)) {
+    return 0;
+  }
+
+  const int r = PaxosService::get_version(ver, bl);
+  if (r < 0) {
+    return r;
+  }
+
+  // NOTE: this check is imprecise; the OSDMap encoding features may
+  // be a subset of the latest mon quorum features, but worst case we
+  // reencode once and then cache the (identical) result under both
+  // feature masks.
+  const uint64_t quorum_sig =
+    OSDMap::get_significant_features(mon.get_quorum_con_features());
+  if (wanted_sig != quorum_sig) {
+    reencode_incremental_map(bl, features);
+  }
+  inc_osd_cache.add_bytes({ver, wanted_sig}, bl);
+  return 0;
+}
+
+// Load and decode the incremental map for `ver` into `inc`.  Asserts that
+// the version can be fetched and is non-empty; always returns 0.
+int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
+{
+  bufferlist encoded;
+  const int err = get_version(ver, encoded);
+  ceph_assert(err == 0);
+  ceph_assert(encoded.length());
+
+  auto cursor = encoded.cbegin();
+  inc.decode(cursor);
+
+  dout(10) << __func__ << "     "
+           << " epoch " << inc.epoch
+           << " inc_crc " << inc.inc_crc
+           << " full_crc " << inc.full_crc
+           << " encode_features " << inc.encode_features << dendl;
+  return 0;
+}
+
+// Rebuild the full osdmap for `ver` from an older full map plus
+// incrementals.
+//
+// The base is the closest pinned full map at or below `ver` according to
+// osdmap_manifest, or a newer full map found in full_osd_cache between that
+// pin and ver-1.  Every incremental from base+1 to `ver` is applied on top
+// and the result is encoded into `bl` with the encode_features of the last
+// incremental (falling back to the quorum's features, or all features).
+// With mon_debug_extra_checks set, each step's crc is verified against the
+// incremental's full_crc.
+//
+// Returns -ENOENT when no pinned map is available, 0 otherwise; any other
+// failure trips an assertion.
+int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
+{
+  dout(10) << __func__ << " ver " << ver << dendl;
+
+  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
+  if (closest_pinned == 0) {
+    return -ENOENT;
+  }
+  if (closest_pinned > ver) {
+    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
+  }
+  ceph_assert(closest_pinned <= ver);
+
+  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
+
+  // get osdmap incremental maps and apply on top of this one.
+  bufferlist osdm_bl;
+  bool has_cached_osdmap = false;
+  // full_osd_cache is populated by get_version_full() under
+  // (version, significant features) keys, so the lookup has to use the
+  // significant subset of the quorum features as well; the raw feature mask
+  // would not match those keys.
+  const uint64_t cache_features =
+    OSDMap::get_significant_features(mon.get_quorum_con_features());
+  for (version_t v = ver-1; v >= closest_pinned; --v) {
+    if (full_osd_cache.lookup({v, cache_features}, &osdm_bl)) {
+      dout(10) << __func__ << " found map in cache ver " << v << dendl;
+      closest_pinned = v;
+      has_cached_osdmap = true;
+      break;
+    }
+  }
+
+  if (!has_cached_osdmap) {
+    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
+    if (err != 0) {
+      derr << __func__ << " closest pinned map ver " << closest_pinned
+           << " not available! error: " << cpp_strerror(err) << dendl;
+    }
+    ceph_assert(err == 0);
+  }
+
+  ceph_assert(osdm_bl.length());
+
+  OSDMap osdm;
+  osdm.decode(osdm_bl);
+
+  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
+           << " e" << osdm.epoch
+           << " crc " << osdm.get_crc()
+           << " -- applying incremental maps." << dendl;
+
+  uint64_t encode_features = 0;
+  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
+    dout(20) << __func__ << "    applying inc epoch " << v << dendl;
+
+    OSDMap::Incremental inc;
+    int err = get_inc(v, inc);
+    ceph_assert(err == 0);
+
+    encode_features = inc.encode_features;
+
+    err = osdm.apply_incremental(inc);
+    ceph_assert(err == 0);
+
+    // this block performs paranoid checks on map retrieval
+    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
+        inc.full_crc != 0) {
+
+      uint64_t f = encode_features;
+      if (!f) {
+        f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
+      }
+
+      // encode osdmap to force calculating crcs
+      bufferlist tbl;
+      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
+      // decode osdmap to compare crcs with what's expected by incremental
+      OSDMap tosdm;
+      tosdm.decode(tbl);
+
+      if (tosdm.get_crc() != inc.full_crc) {
+        derr << __func__
+             << "    osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
+             << ", expected " << inc.full_crc << ")" << dendl;
+        ceph_abort_msg("osdmap crc mismatch");
+      }
+    }
+
+    // note: we cannot add the recently computed map to the cache, as is,
+    // because we have not encoded the map into a bl.
+  }
+
+  if (!encode_features) {
+    dout(10) << __func__
+             << " last incremental map didn't have features;"
+             << " defaulting to quorum's or all" << dendl;
+    encode_features =
+      (mon.quorum_con_features ? mon.quorum_con_features : -1);
+  }
+  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
+
+  return 0;
+}
+
+// Fetch the full map for `ver`, using the quorum's connection features.
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+  uint64_t quorum_features = mon.get_quorum_con_features();
+  return get_version_full(ver, quorum_features, bl);
+}
+
+// Fetch the full map for `ver` encoded for `features`.
+//
+// Served from full_osd_cache when present under (ver, significant features).
+// Otherwise the stored full map is loaded, or rebuilt from a pinned map when
+// the store reports -ENOENT; the result is re-encoded if the requested
+// significant features differ from the quorum's, and added to the cache.
+// Returns 0 on success or a negative error.
+int OSDMonitor::get_version_full(version_t ver, uint64_t features,
+                                 bufferlist& bl)
+{
+  const uint64_t wanted_sig = OSDMap::get_significant_features(features);
+  if (full_osd_cache.lookup({ver, wanted_sig}, &bl)) {
+    return 0;
+  }
+
+  int r = PaxosService::get_version_full(ver, bl);
+  if (r == -ENOENT) {
+    // build map?
+    r = get_full_from_pinned_map(ver, bl);
+  }
+  if (r < 0) {
+    return r;
+  }
+
+  // NOTE: this check is imprecise; the OSDMap encoding features may
+  // be a subset of the latest mon quorum features, but worst case we
+  // reencode once and then cache the (identical) result under both
+  // feature masks.
+  const uint64_t quorum_sig =
+    OSDMap::get_significant_features(mon.get_quorum_con_features());
+  if (wanted_sig != quorum_sig) {
+    reencode_full_map(bl, features);
+  }
+  full_osd_cache.add_bytes({ver, wanted_sig}, bl);
+  return 0;
+}
+
+// Stage a blocklist entry, expiring at `until`, for every address in `av`.
+// Each address is stored with type ANY when require_osd_release is at least
+// nautilus and with type LEGACY otherwise.  Returns the pending epoch.
+epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
+{
+  dout(10) << "blocklist " << av << " until " << until << dendl;
+  // iterate by value: each address is retyped before being used as a key
+  for (auto addr : av.v) {
+    const bool any_type =
+      osdmap.require_osd_release >= ceph_release_t::nautilus;
+    if (any_type) {
+      addr.set_type(entity_addr_t::TYPE_ANY);
+    } else {
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+    }
+    pending_inc.new_blocklist[addr] = until;
+  }
+  return pending_inc.epoch;
+}
+
+// Stage a blocklist entry for address `a`, expiring at `until`.  The address
+// is stored with type ANY when require_osd_release is at least nautilus and
+// with type LEGACY otherwise.  Returns the pending epoch.
+epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
+{
+  const bool any_type =
+    osdmap.require_osd_release >= ceph_release_t::nautilus;
+  if (any_type) {
+    a.set_type(entity_addr_t::TYPE_ANY);
+  } else {
+    a.set_type(entity_addr_t::TYPE_LEGACY);
+  }
+
+  dout(10) << "blocklist " << a << " until " << until << dendl;
+  pending_inc.new_blocklist[a] = until;
+  return pending_inc.epoch;
+}
+
+
+// Run check_osdmap_sub() on every "osdmap" subscription.  Does nothing while
+// the osdmap epoch is 0 or when there are no such subscriptions.
+void OSDMonitor::check_osdmap_subs()
+{
+  dout(10) << __func__ << dendl;
+  if (!osdmap.get_epoch()) {
+    return;
+  }
+
+  auto subs = mon.session_map.subs.find("osdmap");
+  if (subs == mon.session_map.subs.end()) {
+    return;
+  }
+
+  for (auto it = subs->second->begin(); !it.end(); ) {
+    // step past the entry before handling it: check_osdmap_sub() removes
+    // onetime subscriptions (presumably this keeps the iterator usable)
+    auto sub = *it;
+    ++it;
+    check_osdmap_sub(sub);
+  }
+}
+
+// Bring one osdmap subscriber up to date if it is behind.
+//
+// A subscriber whose `next` is at or below the current epoch is sent
+// incrementals starting at `next` (when next >= 1) or the latest full map
+// (when next is 0).  Afterwards a onetime subscription is removed and an
+// ongoing one is advanced to the epoch after the current one.
+void OSDMonitor::check_osdmap_sub(Subscription *sub)
+{
+  dout(10) << __func__ << " " << sub << " next " << sub->next
+           << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
+
+  if (sub->next > osdmap.get_epoch()) {
+    // subscriber is already waiting for a future epoch
+    return;
+  }
+
+  if (sub->next >= 1) {
+    send_incremental(sub->next, sub->session, sub->incremental_onetime);
+  } else {
+    sub->session->con->send_message(
+      build_latest_full(sub->session->con_features));
+  }
+
+  if (sub->onetime) {
+    mon.session_map.remove_sub(sub);
+  } else {
+    sub->next = osdmap.get_epoch() + 1;
+  }
+}
+
+// Run check_pg_creates_sub() on every "osd_pg_creates" subscription.
+//
+// Does nothing when no OSD is up.  Asserts that the up OSDs advertise
+// CEPH_FEATURE_MON_STATEFUL_SUB.
+void OSDMonitor::check_pg_creates_subs()
+{
+  if (!osdmap.get_num_up_osds()) {
+    return;
+  }
+  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
+
+  // the session map is only touched inside the with_session_map() callback
+  mon.with_session_map([this](const MonSessionMap& session_map) {
+    auto found = session_map.subs.find("osd_pg_creates");
+    if (found != session_map.subs.end()) {
+      for (auto sub : *found->second) {
+        check_pg_creates_sub(sub);
+      }
+    }
+  });
+}
+
+// Send pending pg creates to one "osd_pg_creates" subscriber.
+//
+// Nothing is sent unless the subscriber is an OSD that is currently up;
+// otherwise sub->next is updated with the value returned by
+// send_pg_creates().
+void OSDMonitor::check_pg_creates_sub(Subscription *sub)
+{
+  dout(20) << __func__ << " .. " << sub->session->name << dendl;
+  ceph_assert(sub->type == "osd_pg_creates");
+
+  // only send these if the OSD is up.  we will check_subs() when they do
+  // come up so they will get the creates then.
+  if (!sub->session->name.is_osd() ||
+      !mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
+    return;
+  }
+
+  const auto osd = sub->session->name.num();
+  sub->next = send_pg_creates(osd, sub->session->con.get(), sub->next);
+}
+
+// Stage an application tag (and optionally one key/value) on a pool.
+//
+// Requires a plugged, writeable paxos and require_osd_release >= luminous,
+// and the pool must exist in the current osdmap (all asserted).
+//
+// @param pool_id   pool to modify
+// @param app_name  application to enable
+// @param app_key   optional metadata key; when empty only the application
+//                  entry itself is inserted
+// @param app_value value stored under app_key
+// @param force     when true, app_key is set even if app_name already has
+//                  metadata; when false, nothing changes if app_name is
+//                  already present
+void OSDMonitor::do_application_enable(int64_t pool_id,
+                                       const std::string &app_name,
+                                       const std::string &app_key,
+                                       const std::string &app_value,
+                                       bool force)
+{
+  ceph_assert(paxos.is_plugged() && is_writeable());
+
+  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
+           << dendl;
+
+  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
+
+  auto committed = osdmap.get_pg_pool(pool_id);
+  ceph_assert(committed != nullptr);
+
+  // start from the pending copy of the pool when one is already staged
+  pg_pool_t pool = *committed;
+  auto staged = pending_inc.new_pools.find(pool_id);
+  if (staged != pending_inc.new_pools.end()) {
+    pool = staged->second;
+  }
+
+  if (app_key.empty()) {
+    pool.application_metadata.insert({app_name, {}});
+  } else if (force) {
+    pool.application_metadata[app_name][app_key] = app_value;
+  } else {
+    // insert() leaves an existing app_name entry untouched
+    pool.application_metadata.insert({app_name, {{app_key, app_value}}});
+  }
+
+  pool.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool_id] = pool;
+}
+
+// Stage a pool option change in the pending incremental.
+//
+// If the pool already has an entry in pending_inc.new_pools, that entry is
+// updated in place.  Otherwise the pool is copied from the committed osdmap
+// first; in that case the pool must exist there (asserted).
+//
+// @param pool_id pool to modify
+// @param opt     option key to set
+// @param val     new value for the option
+void OSDMonitor::do_set_pool_opt(int64_t pool_id,
+                                 pool_opts_t::key_t opt,
+                                 pool_opts_t::value_t val)
+{
+  dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
+           << " val: " << val << dendl;
+
+  auto staged = pending_inc.new_pools.find(pool_id);
+  if (staged == pending_inc.new_pools.end()) {
+    // Only dereference the committed pool when we actually need to copy
+    // it; get_pg_pool() is checked against nullptr here the same way
+    // do_application_enable() checks it.
+    auto committed = osdmap.get_pg_pool(pool_id);
+    ceph_assert(committed != nullptr);
+    staged = pending_inc.new_pools.try_emplace(pool_id, *committed).first;
+  }
+  staged->second.opts.set(opt, val);
+}
+
+// Queue pool creations for pools that changed since the last scan.
+//
+// A pool in `pools` is skipped when it is already in
+// creating_pgs->created_pools, when no usable crush rule is found for it,
+// when its last_change is not newer than creating_pgs->last_scan_epoch
+// (and that epoch is non-zero), or when it is listed in `removed_pools`.
+// Every remaining pool is passed to creating_pgs->create_pool().
+//
+// @return number of pools queued
+unsigned OSDMonitor::scan_for_creating_pgs(
+  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+  const mempool::osdmap::set<int64_t>& removed_pools,
+  utime_t modified,
+  creating_pgs_t* creating_pgs) const
+{
+  unsigned num_queued = 0;
+  for (auto& entry : pools) {
+    const int64_t poolid = entry.first;
+    if (creating_pgs->created_pools.count(poolid)) {
+      dout(10) << __func__ << " already created " << poolid << dendl;
+      continue;
+    }
+
+    const pg_pool_t& pool = entry.second;
+    const int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
+                                               pool.get_type(),
+                                               pool.get_size());
+    const bool have_rule = ruleno >= 0 && osdmap.crush->rule_exists(ruleno);
+    if (!have_rule) {
+      continue;
+    }
+
+    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
+    const auto created = pool.get_last_change();
+    if (last_scan_epoch && created <= last_scan_epoch) {
+      dout(10) << __func__ << " no change in pool " << poolid
+               << " " << pool << dendl;
+      continue;
+    }
+    if (removed_pools.count(poolid)) {
+      dout(10) << __func__ << " pool is being removed: " << poolid
+               << " " << pool << dendl;
+      continue;
+    }
+
+    dout(10) << __func__ << " queueing pool create for " << poolid
+             << " " << pool << dendl;
+    creating_pgs->create_pool(poolid, pool.get_pg_num(), created, modified);
+    ++num_queued;
+  }
+  return num_queued;
+}
+
+// Rebuild creating_pgs_by_osd_epoch from creating_pgs and the current
+// mapping.
+//
+// For every creating pg that still exists in the osdmap, look up its
+// acting primary and shard, decide the epoch under which the create should
+// be sent, and file it under [acting_primary][epoch].  The rebuilt index
+// replaces creating_pgs_by_osd_epoch, and creating_pgs_epoch is set to the
+// mapping's epoch.
+void OSDMonitor::update_creating_pgs()
+{
+  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
+	   << creating_pgs.queue.size() << " pools in queue" << dendl;
+  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
+  // held for the rest of the function
+  std::lock_guard<std::mutex> l(creating_pgs_lock);
+  for (const auto& pg : creating_pgs.pgs) {
+    int acting_primary = -1;
+    auto pgid = pg.first;
+    if (!osdmap.pg_exists(pgid)) {
+      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
+	       << dendl;
+      continue;
+    }
+    // default: the epoch the pg was created in
+    auto mapped = pg.second.create_epoch;
+    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
+    spg_t spgid(pgid);
+    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
+    // check the previous creating_pgs, look for the target to whom the pg was
+    // previously mapped
+    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
+      const auto last_acting_primary = pgs_by_epoch.first;
+      for (auto& pgs: pgs_by_epoch.second) {
+	if (pgs.second.count(spgid)) {
+	  if (last_acting_primary == acting_primary) {
+	    // same target as before: keep the epoch it was filed under
+	    mapped = pgs.first;
+	  } else {
+	    dout(20) << __func__ << " " << pgid << " "
+		     << " acting_primary:" << last_acting_primary
+		     << " -> " << acting_primary << dendl;
+	    // note epoch if the target of the create message changed.
+	    mapped = mapping.get_epoch();
+          }
+	  // NOTE: this only leaves the inner (per-epoch) loop; the outer
+	  // loop over OSDs keeps going and may assign `mapped` again.
+          break;
+        } else {
+	  // newly creating
+	  mapped = mapping.get_epoch();
+	}
+      }
+    }
+    dout(10) << __func__ << " will instruct osd." << acting_primary
+	     << " to create " << pgid << "@" << mapped << dendl;
+    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
+  }
+  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
+  creating_pgs_epoch = mapping.get_epoch();
+}
+
+// Send pg create messages to one OSD for all creates filed at or after
+// epoch `next`.
+//
+// @param osd   id of the target OSD
+// @param con   connection the message is sent on
+// @param next  first epoch the subscriber has not seen yet
+// @return `next` unchanged when creating_pgs_epoch is not newer than
+//         creating_pgs.last_scan_epoch, when nothing is filed for this
+//         OSD, or when there is nothing to send from `next` onwards;
+//         otherwise the last epoch sent plus one.
+epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
+{
+  // Take the lock before anything reads creating_pgs_by_osd_epoch,
+  // including the debug output below: update_creating_pgs() replaces that
+  // map while holding creating_pgs_lock.
+  std::lock_guard<std::mutex> l(creating_pgs_lock);
+  dout(30) << __func__ << " osd." << osd << " next=" << next
+           << " " << creating_pgs_by_osd_epoch << dendl;
+  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
+    dout(20) << __func__
+             << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
+    // the subscribers will be updated when the mapping is completed anyway
+    return next;
+  }
+  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
+  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
+    return next;
+  ceph_assert(!creating_pgs_by_epoch->second.empty());
+
+  // Exactly one of the two message types is built, depending on `old`.
+  MOSDPGCreate *oldm = nullptr; // legacy message, used when `old` is true
+  MOSDPGCreate2 *m = nullptr;
+
+  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
+
+  epoch_t last = 0;
+  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
+       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
+    auto epoch = epoch_pgs->first;
+    auto& pgs = epoch_pgs->second;
+    dout(20) << __func__ << " osd." << osd << " from " << next
+             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
+    last = epoch;
+    for (auto& pg : pgs) {
+      // Need the create time from the monitor using its clock to set
+      // last_scrub_stamp upon pg creation.
+      auto create = creating_pgs.pgs.find(pg.pgid);
+      ceph_assert(create != creating_pgs.pgs.end());
+      if (old) {
+        if (!oldm) {
+          oldm = new MOSDPGCreate(creating_pgs_epoch);
+        }
+        oldm->mkpg.emplace(pg.pgid,
+                           pg_create_t{create->second.create_epoch, pg.pgid, 0});
+        oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
+      } else {
+        if (!m) {
+          m = new MOSDPGCreate2(creating_pgs_epoch);
+        }
+        m->pgs.emplace(pg, make_pair(create->second.create_epoch,
+                                     create->second.create_stamp));
+        // history/past_intervals are only attached when a creation epoch
+        // has been recorded in the history
+        if (create->second.history.epoch_created) {
+          dout(20) << __func__ << "   " << pg << " " << create->second.history
+                   << " " << create->second.past_intervals << dendl;
+          m->pg_extra.emplace(pg, make_pair(create->second.history,
+                                            create->second.past_intervals));
+        }
+      }
+      dout(20) << __func__ << " will create " << pg
+               << " at " << create->second.create_epoch << dendl;
+    }
+  }
+  if (m) {
+    con->send_message(m);
+  } else if (oldm) {
+    con->send_message(oldm);
+  } else {
+    dout(20) << __func__ << " osd." << osd << " from " << next
+             << " has nothing to send" << dendl;
+    return next;
+  }
+
+  // sub is current through last + 1
+  return last + 1;
+}
+
+// TICK
+
+
+// Periodic maintenance for the OSD monitor.
+//
+// On every active monitor this reloads the osdmap manifest and, when
+// tcmalloc is in use and mon_memory_autotune is set, retunes the osdmap
+// cache sizes.  The rest runs on the leader only: it gathers pending
+// changes (OSDs that timed out or failed, pruning, down OSDs to mark out,
+// expired blocklist entries, purged snaps, pool status) into pending_inc
+// and proposes them if anything changed or pg_temp was adjusted.
+void OSDMonitor::tick()
+{
+  if (!is_active()) return;
+
+  dout(10) << osdmap << dendl;
+
+  // always update osdmap manifest, regardless of being the leader.
+  load_osdmap_manifest();
+
+  // always tune priority cache manager memory on leader and peons
+  if (ceph_using_tcmalloc() && mon_memory_autotune) {
+    std::lock_guard l(balancer_lock);
+    if (pcm != nullptr) {
+      pcm->tune_memory();
+      pcm->balance();
+      _set_new_cache_sizes();
+      dout(10) << "tick balancer "
+               << " inc cache_bytes: " << inc_cache->get_cache_bytes()
+               << " inc comtd_bytes: " << inc_cache->get_committed_size()
+               << " inc used_bytes: " << inc_cache->_get_used_bytes()
+               << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
+               << dendl;
+      dout(10) << "tick balancer "
+               << " full cache_bytes: " << full_cache->get_cache_bytes()
+               << " full comtd_bytes: " << full_cache->get_committed_size()
+               << " full used_bytes: " << full_cache->_get_used_bytes()
+               << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
+               << dendl;
+    }
+  }
+
+  // everything below modifies pending_inc and is leader-only
+  if (!mon.is_leader()) return;
+
+  bool do_propose = false;
+  utime_t now = ceph_clock_now();
+
+  if (handle_osd_timeouts(now, last_osd_report)) {
+    do_propose = true;
+  }
+
+  // mark osds down?
+  if (check_failures(now)) {
+    do_propose = true;
+  }
+
+  // Force a proposal if we need to prune; pruning is performed on
+  // ``encode_pending()``, hence why we need to regularly trigger a proposal
+  // even if there's nothing going on.
+  if (is_prune_enabled() && should_prune()) {
+    do_propose = true;
+  }
+
+  // mark down osds out?
+
+  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
+   * influence at all. The decision is made based on the ratio of "in" osds,
+   * and the function returns false if this ratio is lower than the minimum
+   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
+   */
+  if (can_mark_out(-1)) {
+    string down_out_subtree_limit = g_conf().get_val<string>(
+      "mon_osd_down_out_subtree_limit");
+    set<int> down_cache;  // quick cache of down subtrees
+
+    map<int,utime_t>::iterator i = down_pending_out.begin();
+    while (i != down_pending_out.end()) {
+      int o = i->first;
+      // how long osd `o` has been down
+      utime_t down = now;
+      down -= i->second;
+      // advance now: the body below uses `continue` and may erase the
+      // entry for `o`
+      ++i;
+
+      if (osdmap.is_down(o) &&
+	  osdmap.is_in(o) &&
+	  can_mark_out(o)) {
+	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
+	utime_t grace = orig_grace;
+	double my_grace = 0.0;
+
+	if (g_conf()->mon_osd_adjust_down_out_interval) {
+	  // scale grace period the same way we do the heartbeat grace.
+	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
+	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+	  double decay_k = ::log(.5) / halflife;
+	  double decay = exp((double)down * decay_k);
+	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
+		   << " down for " << down << " decay " << decay << dendl;
+	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+	  grace += my_grace;
+	}
+
+	// is this an entire large subtree down?
+	if (down_out_subtree_limit.length()) {
+	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
+	  if (type > 0) {
+	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
+	      dout(10) << "tick entire containing " << down_out_subtree_limit
+		       << " subtree for osd." << o
+		       << " is down; resetting timer" << dendl;
+	      // reset timer, too.
+	      down_pending_out[o] = now;
+	      continue;
+	    }
+	  }
+	}
+
+        bool down_out = !osdmap.is_destroyed(o) &&
+          g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
+        bool destroyed_out = osdmap.is_destroyed(o) &&
+          g_conf()->mon_osd_destroyed_out_interval > 0 &&
+        // this is not precise enough as we did not make a note when this osd
+        // was marked as destroyed, but let's not bother with that
+        // complexity for now.
+          down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
+        if (down_out || destroyed_out) {
+	  dout(10) << "tick marking osd." << o << " OUT after " << down
+		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
+	  pending_inc.new_weight[o] = CEPH_OSD_OUT;
+
+	  // set the AUTOOUT bit.
+	  if (pending_inc.new_state.count(o) == 0)
+	    pending_inc.new_state[o] = 0;
+	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
+
+	  // remember previous weight
+	  if (pending_inc.new_xinfo.count(o) == 0)
+	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
+	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
+
+	  do_propose = true;
+
+	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
+                            << int(down.sec()) << " seconds)";
+	} else
+	  // grace not reached yet: keep the entry and look again next tick
+	  continue;
+      }
+
+      // reached when the osd was just marked out, or when it is no longer
+      // down+in (or cannot be marked out): stop tracking it
+      down_pending_out.erase(o);
+    }
+  } else {
+    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
+  }
+
+  // expire blocklisted items?
+  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+       p != osdmap.blocklist.end();
+       ++p) {
+    if (p->second < now) {
+      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
+      pending_inc.old_blocklist.push_back(p->first);
+      do_propose = true;
+    }
+  }
+  // same expiry check for the range blocklist
+  for (auto p = osdmap.range_blocklist.begin();
+       p != osdmap.range_blocklist.end();
+       ++p) {
+    if (p->second < now) {
+      dout(10) << "expiring range_blocklist item " << p->first
+	       << " expired " << p->second << " < now " << now << dendl;
+      pending_inc.old_range_blocklist.push_back(p->first);
+      do_propose = true;
+    }
+  }
+
+  if (try_prune_purged_snaps()) {
+    do_propose = true;
+  }
+
+  if (update_pools_status())
+    do_propose = true;
+
+  if (do_propose ||
+      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
+    propose_pending();
+}
+
+// Apply the committed cache sizes to the incremental and full osdmap
+// caches.
+//
+// When either pcm or rocksdb_binned_kv_cache is null, every size is
+// treated as 0 and both osdmap caches are set to 0 bytes.
+void OSDMonitor::_set_new_cache_sizes()
+{
+  const bool have_caches =
+    pcm != nullptr && rocksdb_binned_kv_cache != nullptr;
+
+  uint64_t cache_size = have_caches ? pcm->get_tuned_mem() : 0;
+  int64_t inc_alloc = have_caches ? inc_cache->get_committed_size() : 0;
+  int64_t full_alloc = have_caches ? full_cache->get_committed_size() : 0;
+  // kv_alloc is only reported in the log line below
+  int64_t kv_alloc =
+    have_caches ? rocksdb_binned_kv_cache->get_committed_size() : 0;
+
+  inc_osd_cache.set_bytes(inc_alloc);
+  full_osd_cache.set_bytes(full_alloc);
+
+  dout(1) << __func__ << " cache_size:" << cache_size
+          << " inc_alloc: " << inc_alloc
+          << " full_alloc: " << full_alloc
+          << " kv_alloc: " << kv_alloc
+          << dendl;
+}
+
+// Mark up OSDs down in pending_inc when their last report is too old.
+//
+// Nothing is done until this monitor has been leader for at least
+// mon_osd_report_timeout seconds.  Entries for OSD ids that no longer
+// exist are dropped from `last_osd_report`; up OSDs without an entry get
+// one stamped `now`.
+//
+// @param now              current time
+// @param last_osd_report  per-OSD (time of last report, interval) pairs
+// @return true if at least one OSD was marked down in pending_inc
+bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
+                                     std::map<int, std::pair<utime_t, int>> &last_osd_report)
+{
+  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
+  if (now - mon.get_leader_since() < timeo) {
+    // We haven't been the leader for long enough to consider OSD timeouts
+    return false;
+  }
+
+  bool marked_any = false;
+  const int max_osd = osdmap.get_max_osd();
+
+  for (int osd = 0; osd < max_osd; ++osd) {
+    dout(30) << __func__ << ": checking up on osd " << osd << dendl;
+    if (!osdmap.exists(osd)) {
+      last_osd_report.erase(osd); // if any
+      continue;
+    }
+    if (!osdmap.is_up(osd)) {
+      continue;
+    }
+
+    const auto report = last_osd_report.find(osd);
+    if (report == last_osd_report.end()) {
+      // it wasn't in the map; start the timer.
+      auto& fresh = last_osd_report[osd];
+      fresh.first = now;
+      fresh.second = 0;
+      continue;
+    }
+    if (!can_mark_down(osd)) {
+      continue;
+    }
+
+    utime_t diff = now - report->second.first;
+    // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
+    // to allow for the osd to miss a beacon.
+    int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
+    utime_t max_timeout(
+      std::max(mon_osd_report_timeout, 2 * report->second.second), 0);
+    if (diff > max_timeout) {
+      mon.clog->info() << "osd." << osd << " marked down after no beacon for "
+                       << diff << " seconds";
+      derr << "no beacon from osd." << osd << " since " << report->second.first
+           << ", " << diff << " seconds ago.  marking down" << dendl;
+      pending_inc.new_state[osd] = CEPH_OSD_UP;
+      marked_any = true;
+    }
+  }
+  return marked_any;
+}
+
+// Dump a cpu list string as an array section called `name`, with one
+// "cpu" integer per cpu.  Nothing is emitted if `strlist` fails to parse.
+static void dump_cpu_list(Formatter *f, const char *name,
+                          const string& strlist)
+{
+  cpu_set_t mask;
+  size_t mask_size;
+  const int r = parse_cpu_set_list(strlist.c_str(), &mask_size, &mask);
+  if (r < 0) {
+    return;
+  }
+
+  set<int> cpu_ids = cpu_set_to_set(mask_size, &mask);
+  f->open_array_section(name);
+  for (auto it = cpu_ids.begin(); it != cpu_ids.end(); ++it) {
+    f->dump_int("cpu", *it);
+  }
+  f->close_section();
+}
+
+// Dump the OSD monitor's state to `f`: the osdmap, per-OSD metadata, the
+// clean-epoch bookkeeping, the first/last committed versions, the crush
+// map and, when present, the osdmap manifest.
+void OSDMonitor::dump_info(Formatter *f)
+{
+  // current map
+  f->open_object_section("osdmap");
+  osdmap.dump(f);
+  f->close_section();
+
+  // metadata for every osd id that exists in the map
+  f->open_array_section("osd_metadata");
+  for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
+    if (!osdmap.exists(osd)) {
+      continue;
+    }
+    f->open_object_section("osd");
+    f->dump_unsigned("id", osd);
+    dump_osd_metadata(osd, f, NULL);
+    f->close_section();
+  }
+  f->close_section();
+
+  // clean epochs
+  f->open_object_section("osdmap_clean_epochs");
+  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
+
+  f->open_object_section("last_epoch_clean");
+  last_epoch_clean.dump(f);
+  f->close_section();
+
+  f->open_array_section("osd_epochs");
+  for (auto& entry : osd_epochs) {
+    f->open_object_section("osd");
+    f->dump_unsigned("id", entry.first);
+    f->dump_unsigned("epoch", entry.second);
+    f->close_section();
+  }
+  f->close_section(); // osd_epochs
+
+  f->close_section(); // osd_clean_epochs
+
+  // committed version range
+  f->dump_unsigned("osdmap_first_committed", get_first_committed());
+  f->dump_unsigned("osdmap_last_committed", get_last_committed());
+
+  f->open_object_section("crushmap");
+  osdmap.crush->dump(f);
+  f->close_section();
+
+  if (!has_osdmap_manifest) {
+    return;
+  }
+  f->open_object_section("osdmap_manifest");
+  osdmap_manifest.dump(f);
+  f->close_section();
+}
+
+namespace {
+  // Identifiers for pool properties; used as elements of the choice sets
+  // handled by subtract_second_from_first() below.
+  enum osd_pool_get_choices {
+    SIZE, MIN_SIZE,
+    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
+    NODELETE, NOPGCHANGE, NOSIZECHANGE,
+    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
+    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
+    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+    CACHE_TARGET_FULL_RATIO,
+    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
+    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
+    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
+    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
+    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
+    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
+    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
+    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
+    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
+    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
+    DEDUP_CDC_CHUNK_SIZE, PG_NUM_MAX, BULK };
+
+  // Return the elements of `first` that are not in `second`.
+  std::set<osd_pool_get_choices>
+    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
+                               const std::set<osd_pool_get_choices>& second)
+    {
+      // start from a copy of `first` and drop whatever `second` contains
+      std::set<osd_pool_get_choices> remaining(first);
+      for (const auto choice : second) {
+        remaining.erase(choice);
+      }
+      return remaining;
+    }
+}
+
+
+bool OSDMonitor::preprocess_command(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonCommand>();
+  int r = 0;
+  bufferlist rdata;
+  stringstream ss, ds;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    derr << __func__ << " no session" << dendl;
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "osd stat") {
+    if (f) {
+      f->open_object_section("osdmap");
+      osdmap.print_summary(f.get(), ds, "", true);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      osdmap.print_summary(nullptr, ds, "", true);
+      rdata.append(ds);
+    }
+  }
+  else if (prefix == "osd dump" ||
+	   prefix == "osd tree" ||
+	   prefix == "osd tree-from" ||
+	   prefix == "osd ls" ||
+	   prefix == "osd getmap" ||
+	   prefix == "osd getcrushmap" ||
+	   prefix == "osd ls-tree" ||
+	   prefix == "osd info") {
+
+    epoch_t epoch = 0;
+    int64_t epochnum;
+    cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
+    epoch = epochnum;
+    
+    bufferlist osdmap_bl;
+    int err = get_version_full(epoch, osdmap_bl);
+    if (err == -ENOENT) {
+      r = -ENOENT;
+      ss << "there is no map for epoch " << epoch;
+      goto reply;
+    }
+    ceph_assert(err == 0);
+    ceph_assert(osdmap_bl.length());
+
+    OSDMap *p;
+    if (epoch == osdmap.get_epoch()) {
+      p = &osdmap;
+    } else {
+      p = new OSDMap;
+      p->decode(osdmap_bl);
+    }
+
+    auto sg = make_scope_guard([&] {
+      if (p != &osdmap) {
+        delete p;
+      }
+    });
+
+    if (prefix == "osd dump") {
+      stringstream ds;
+      if (f) {
+	f->open_object_section("osdmap");
+	p->dump(f.get());
+	f->close_section();
+	f->flush(ds);
+      } else {
+	p->print(ds);
+      }
+      rdata.append(ds);
+      if (!f)
+	ds << " ";
+    } else if (prefix == "osd ls") {
+      if (f) {
+	f->open_array_section("osds");
+	for (int i = 0; i < osdmap.get_max_osd(); i++) {
+	  if (osdmap.exists(i)) {
+	    f->dump_int("osd", i);
+	  }
+	}
+	f->close_section();
+	f->flush(ds);
+      } else {
+	bool first = true;
+	for (int i = 0; i < osdmap.get_max_osd(); i++) {
+	  if (osdmap.exists(i)) {
+	    if (!first)
+	      ds << "\n";
+	    first = false;
+	    ds << i;
+	  }
+	}
+      }
+      rdata.append(ds);
+    } else if (prefix == "osd info") {
+      int64_t osd_id;
+      bool do_single_osd = true;
+      if (!cmd_getval(cmdmap, "id", osd_id)) {
+	do_single_osd = false;
+      }
+
+      if (do_single_osd && !osdmap.exists(osd_id)) {
+	ss << "osd." << osd_id << " does not exist";
+	r = -EINVAL;
+	goto reply;
+      }
+
+      if (f) {
+	if (do_single_osd) {
+	  osdmap.dump_osd(osd_id, f.get());
+	} else {
+	  osdmap.dump_osds(f.get());
+	}
+	f->flush(ds);
+      } else {
+	if (do_single_osd) {
+	  osdmap.print_osd(osd_id, ds);
+	} else {
+	  osdmap.print_osds(ds);
+	}
+      }
+      rdata.append(ds);
+    } else if (prefix == "osd tree" || prefix == "osd tree-from") {
+      string bucket;
+      if (prefix == "osd tree-from") {
+        cmd_getval(cmdmap, "bucket", bucket);
+        if (!osdmap.crush->name_exists(bucket)) {
+          ss << "bucket '" << bucket << "' does not exist";
+          r = -ENOENT;
+          goto reply;
+        }
+        int id = osdmap.crush->get_item_id(bucket);
+        if (id >= 0) {
+          ss << "\"" << bucket << "\" is not a bucket";
+          r = -EINVAL;
+          goto reply;
+        }
+      }
+
+      vector<string> states;
+      cmd_getval(cmdmap, "states", states);
+      unsigned filter = 0;
+      for (auto& s : states) {
+	if (s == "up") {
+	  filter |= OSDMap::DUMP_UP;
+	} else if (s == "down") {
+	  filter |= OSDMap::DUMP_DOWN;
+	} else if (s == "in") {
+	  filter |= OSDMap::DUMP_IN;
+	} else if (s == "out") {
+	  filter |= OSDMap::DUMP_OUT;
+	} else if (s == "destroyed") {
+	  filter |= OSDMap::DUMP_DESTROYED;
+	} else {
+	  ss << "unrecognized state '" << s << "'";
+	  r = -EINVAL;
+	  goto reply;
+	}
+      }
+      if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
+	  (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
+        ss << "cannot specify both 'in' and 'out'";
+        r = -EINVAL;
+        goto reply;
+      }
+      if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
+	   (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
+           ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
+           (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
+           ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
+           (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
+	ss << "can specify only one of 'up', 'down' and 'destroyed'";
+	r = -EINVAL;
+	goto reply;
+      }
+      if (f) {
+	f->open_object_section("tree");
+	p->print_tree(f.get(), NULL, filter, bucket);
+	f->close_section();
+	f->flush(ds);
+      } else {
+	p->print_tree(NULL, &ds, filter, bucket);
+      }
+      rdata.append(ds);
+    } else if (prefix == "osd getmap") {
+      rdata.append(osdmap_bl);
+      ss << "got osdmap epoch " << p->get_epoch();
+    } else if (prefix == "osd getcrushmap") {
+      p->crush->encode(rdata, mon.get_quorum_con_features());
+      ss << p->get_crush_version();
+    } else if (prefix == "osd ls-tree") {
+      string bucket_name;
+      cmd_getval(cmdmap, "name", bucket_name);
+      set<int> osds;
+      r = p->get_osds_by_bucket_name(bucket_name, &osds);
+      if (r == -ENOENT) {
+        ss << "\"" << bucket_name << "\" does not exist";
+        goto reply;
+      } else if (r < 0) {
+        ss << "can not parse bucket name:\"" << bucket_name << "\"";
+        goto reply;
+      }
+
+      if (f) {
+        f->open_array_section("osds");
+        for (auto &i : osds) {
+          if (osdmap.exists(i)) {
+            f->dump_int("osd", i);
+          }
+        }
+        f->close_section();
+        f->flush(ds);
+      } else {
+        bool first = true;
+        for (auto &i : osds) {
+          if (osdmap.exists(i)) {
+            if (!first)
+              ds << "\n";
+            first = false;
+            ds << i;
+          }
+        }
+      }
+
+      rdata.append(ds);
+    }
+  } else if (prefix == "osd getmaxosd") {
+    if (f) {
+      f->open_object_section("getmaxosd");
+      f->dump_unsigned("epoch", osdmap.get_epoch());
+      f->dump_int("max_osd", osdmap.get_max_osd());
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
+      rdata.append(ds);
+    }
+  } else if (prefix == "osd utilization") {
+    string out;
+    osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
+    if (f)
+      f->flush(rdata);
+    else
+      rdata.append(out);
+    r = 0;
+    goto reply;
+  } else if (prefix  == "osd find") {
+    int64_t osd;
+    if (!cmd_getval(cmdmap, "id", osd)) {
+      ss << "unable to parse osd id value '"
+         << cmd_vartype_stringify(cmdmap["id"]) << "'";
+      r = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.exists(osd)) {
+      ss << "osd." << osd << " does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    string format;
+    cmd_getval(cmdmap, "format", format);
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    f->open_object_section("osd_location");
+    f->dump_int("osd", osd);
+    f->dump_object("addrs", osdmap.get_addrs(osd));
+    f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
+
+    // try to identify host, pod/container name, etc.
+    map<string,string> m;
+    load_metadata(osd, m, nullptr);
+    if (auto p = m.find("hostname"); p != m.end()) {
+      f->dump_string("host", p->second);
+    }
+    for (auto& k : {
+	"pod_name", "pod_namespace", // set by rook
+	"container_name"             // set by cephadm, ceph-ansible
+	}) {
+      if (auto p = m.find(k); p != m.end()) {
+	f->dump_string(k, p->second);
+      }
+    }
+
+    // crush is helpful too
+    f->open_object_section("crush_location");
+    map<string,string> loc = osdmap.crush->get_full_location(osd);
+    for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
+      f->dump_string(p->first.c_str(), p->second);
+    f->close_section();
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "osd metadata") {
+    int64_t osd = -1;
+    if (cmd_vartype_stringify(cmdmap["id"]).size() &&
+        !cmd_getval(cmdmap, "id", osd)) {
+      ss << "unable to parse osd id value '"
+         << cmd_vartype_stringify(cmdmap["id"]) << "'";
+      r = -EINVAL;
+      goto reply;
+    }
+    if (osd >= 0 && !osdmap.exists(osd)) {
+      ss << "osd." << osd << " does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    string format;
+    cmd_getval(cmdmap, "format", format);
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    if (osd >= 0) {
+      f->open_object_section("osd_metadata");
+      f->dump_unsigned("id", osd);
+      r = dump_osd_metadata(osd, f.get(), &ss);
+      if (r < 0)
+        goto reply;
+      f->close_section();
+    } else {
+      r = 0;
+      f->open_array_section("osd_metadata");
+      for (int i=0; i<osdmap.get_max_osd(); ++i) {
+        if (osdmap.exists(i)) {
+          f->open_object_section("osd");
+          f->dump_unsigned("id", i);
+          r = dump_osd_metadata(i, f.get(), NULL);
+          if (r == -EINVAL || r == -ENOENT) {
+            // Drop error, continue to get other daemons' metadata
+            dout(4) << "No metadata for osd." << i << dendl;
+            r = 0;
+          } else if (r < 0) {
+            // Unexpected error
+            goto reply;
+          }
+          f->close_section();
+        }
+      }
+      f->close_section();
+    }
+    f->flush(rdata);
+  } else if (prefix == "osd versions") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    count_metadata("ceph_version", f.get());
+    f->flush(rdata);
+    r = 0;
+  } else if (prefix == "osd count-metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+    string field;
+    cmd_getval(cmdmap, "property", field);
+    count_metadata(field, f.get());
+    f->flush(rdata);
+    r = 0;
+  } else if (prefix == "osd numa-status") {
+    TextTable tbl;
+    if (f) {
+      f->open_array_section("osds");
+    } else {
+      tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
+      tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
+      tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
+      tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
+      tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
+    }
+    for (int i=0; i<osdmap.get_max_osd(); ++i) {
+      if (osdmap.exists(i)) {
+	map<string,string> m;
+	ostringstream err;
+	if (load_metadata(i, m, &err) < 0) {
+	  continue;
+	}
+	string host;
+	auto p = m.find("hostname");
+	if (p != m.end()) {
+	  host = p->second;
+	}
+	if (f) {
+	  f->open_object_section("osd");
+	  f->dump_int("osd", i);
+	  f->dump_string("host", host);
+	  for (auto n : { "network_numa_node", "objectstore_numa_node",
+		"numa_node" }) {
+	    p = m.find(n);
+	    if (p != m.end()) {
+	      f->dump_int(n, atoi(p->second.c_str()));
+	    }
+	  }
+	  for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
+	    p = m.find(n);
+	    if (p != m.end()) {
+	      list<string> ls = get_str_list(p->second, ",");
+	      f->open_array_section(n);
+	      for (auto node : ls) {
+		f->dump_int("node", atoi(node.c_str()));
+	      }
+	      f->close_section();
+	    }
+	  }
+	  for (auto n : { "numa_node_cpus" }) {
+	    p = m.find(n);
+	    if (p != m.end()) {
+	      dump_cpu_list(f.get(), n, p->second);
+	    }
+	  }
+	  f->close_section();
+	} else {
+	  tbl << i;
+	  tbl << host;
+	  p = m.find("network_numa_nodes");
+	  if (p != m.end()) {
+	    tbl << p->second;
+	  } else {
+	    tbl << "-";
+	  }
+	  p = m.find("objectstore_numa_nodes");
+	  if (p != m.end()) {
+	    tbl << p->second;
+	  } else {
+	    tbl << "-";
+	  }
+	  p = m.find("numa_node");
+	  auto q = m.find("numa_node_cpus");
+	  if (p != m.end() && q != m.end()) {
+	    tbl << p->second;
+	    tbl << q->second;
+	  } else {
+	    tbl << "-";
+	    tbl << "-";
+	  }
+	  tbl << TextTable::endrow;
+	}
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      rdata.append(stringify(tbl));
+    }
+  } else if (prefix == "osd map") {
+    string poolstr, objstr, namespacestr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    cmd_getval(cmdmap, "object", objstr);
+    cmd_getval(cmdmap, "nspace", namespacestr);
+
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {
+      ss << "pool " << poolstr << " does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    object_locator_t oloc(pool, namespacestr);
+    object_t oid(objstr);
+    pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
+    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+    vector<int> up, acting;
+    int up_p, acting_p;
+    osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
+
+    string fullobjname;
+    if (!namespacestr.empty())
+      fullobjname = namespacestr + string("/") + oid.name;
+    else
+      fullobjname = oid.name;
+    if (f) {
+      f->open_object_section("osd_map");
+      f->dump_unsigned("epoch", osdmap.get_epoch());
+      f->dump_string("pool", poolstr);
+      f->dump_int("pool_id", pool);
+      f->dump_stream("objname") << fullobjname;
+      f->dump_stream("raw_pgid") << pgid;
+      f->dump_stream("pgid") << mpgid;
+      f->open_array_section("up");
+      for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
+        f->dump_int("osd", *p);
+      f->close_section();
+      f->dump_int("up_primary", up_p);
+      f->open_array_section("acting");
+      for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
+        f->dump_int("osd", *p);
+      f->close_section();
+      f->dump_int("acting_primary", acting_p);
+      f->close_section(); // osd_map
+      f->flush(rdata);
+    } else {
+      ds << "osdmap e" << osdmap.get_epoch()
+        << " pool '" << poolstr << "' (" << pool << ")"
+        << " object '" << fullobjname << "' ->"
+        << " pg " << pgid << " (" << mpgid << ")"
+        << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
+        << pg_vector_string(acting) << ", p" << acting_p << ")";
+      rdata.append(ds);
+    }
+
+  } else if (prefix == "pg map") {
+    pg_t pgid;
+    string pgidstr;
+    cmd_getval(cmdmap, "pgid", pgidstr);
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      r = -EINVAL;
+      goto reply;
+    }
+    vector<int> up, acting;
+    if (!osdmap.have_pg_pool(pgid.pool())) {
+      ss << "pg '" << pgidstr << "' does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
+    osdmap.pg_to_up_acting_osds(pgid, up, acting);
+    if (f) {
+      f->open_object_section("pg_map");
+      f->dump_unsigned("epoch", osdmap.get_epoch());
+      f->dump_stream("raw_pgid") << pgid;
+      f->dump_stream("pgid") << mpgid;
+      f->open_array_section("up");
+      for (auto osd : up) {
+	f->dump_int("up_osd", osd);
+      }
+      f->close_section();
+      f->open_array_section("acting");
+      for (auto osd : acting) {
+	f->dump_int("acting_osd", osd);
+      }
+      f->close_section();
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ds << "osdmap e" << osdmap.get_epoch()
+         << " pg " << pgid << " (" << mpgid << ")"
+         << " -> up " << up << " acting " << acting;
+      rdata.append(ds);
+    }
+    goto reply;
+
+  } else if (prefix == "osd lspools") {
+    if (f)
+      f->open_array_section("pools");
+    for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
+	 p != osdmap.pools.end();
+	 ++p) {
+      if (f) {
+	f->open_object_section("pool");
+	f->dump_int("poolnum", p->first);
+	f->dump_string("poolname", osdmap.pool_name[p->first]);
+	f->close_section();
+      } else {
+	ds << p->first << ' ' << osdmap.pool_name[p->first];
+	if (next(p) != osdmap.pools.end()) {
+	  ds << '\n';
+	}
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(ds);
+    }
+    rdata.append(ds);
+  } else if (prefix == "osd blocklist ls" ||
+	     prefix == "osd blacklist ls") {
+    if (f)
+      f->open_array_section("blocklist");
+
+    for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
+	 p != osdmap.blocklist.end();
+	 ++p) {
+      if (f) {
+	f->open_object_section("entry");
+	f->dump_string("addr", p->first.get_legacy_str());
+	f->dump_stream("until") << p->second;
+	f->close_section();
+      } else {
+	stringstream ss;
+	string s;
+	ss << p->first << " " << p->second;
+	getline(ss, s);
+	s += "\n";
+	rdata.append(s);
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    }
+    if (f)
+      f->open_array_section("range_blocklist");
+
+    for (auto p = osdmap.range_blocklist.begin();
+	 p != osdmap.range_blocklist.end();
+	 ++p) {
+      if (f) {
+	f->open_object_section("entry");
+	f->dump_string("range", p->first.get_legacy_str());
+	f->dump_stream("until") << p->second;
+	f->close_section();
+      } else {
+	stringstream ss;
+	string s;
+	ss << p->first << " " << p->second;
+	getline(ss, s);
+	s += "\n";
+	rdata.append(s);
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    }
+    ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
+
+  } else if (prefix == "osd pool ls") {
+    string detail;
+    cmd_getval(cmdmap, "detail", detail);
+    if (!f && detail == "detail") {
+      ostringstream ss;
+      osdmap.print_pools(ss);
+      rdata.append(ss.str());
+    } else {
+      if (f)
+	f->open_array_section("pools");
+      for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
+	   it != osdmap.get_pools().end();
+	   ++it) {
+	if (f) {
+	  if (detail == "detail") {
+	    f->open_object_section("pool");
+	    f->dump_int("pool_id", it->first);
+	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
+	    it->second.dump(f.get());
+	    f->close_section();
+	  } else {
+	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
+	  }
+	} else {
+	  rdata.append(osdmap.get_pool_name(it->first) + "\n");
+	}
+      }
+      if (f) {
+	f->close_section();
+	f->flush(rdata);
+      }
+    }
+
+  } else if (prefix == "osd crush get-tunable") {
+    string tunable;
+    cmd_getval(cmdmap, "tunable", tunable);
+    ostringstream rss;
+    if (f)
+      f->open_object_section("tunable");
+    if (tunable == "straw_calc_version") {
+      if (f)
+	f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
+      else
+	rss << osdmap.crush->get_straw_calc_version() << "\n";
+    } else {
+      r = -EINVAL;
+      goto reply;
+    }
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      rdata.append(rss.str());
+    }
+    r = 0;
+
+  } else if (prefix == "osd pool get") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+
+    const pg_pool_t *p = osdmap.get_pg_pool(pool);
+    string var;
+    cmd_getval(cmdmap, "var", var);
+
+    typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
+    const choices_map_t ALL_CHOICES = {
+      {"size", SIZE},
+      {"min_size", MIN_SIZE},
+      {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
+      {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
+      {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
+      {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
+      {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
+      {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
+      {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
+      {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
+      {"use_gmt_hitset", USE_GMT_HITSET},
+      {"target_max_objects", TARGET_MAX_OBJECTS},
+      {"target_max_bytes", TARGET_MAX_BYTES},
+      {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
+      {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
+      {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
+      {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
+      {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
+      {"erasure_code_profile", ERASURE_CODE_PROFILE},
+      {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
+      {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
+      {"fast_read", FAST_READ},
+      {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
+      {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
+      {"scrub_min_interval", SCRUB_MIN_INTERVAL},
+      {"scrub_max_interval", SCRUB_MAX_INTERVAL},
+      {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
+      {"recovery_priority", RECOVERY_PRIORITY},
+      {"recovery_op_priority", RECOVERY_OP_PRIORITY},
+      {"scrub_priority", SCRUB_PRIORITY},
+      {"compression_mode", COMPRESSION_MODE},
+      {"compression_algorithm", COMPRESSION_ALGORITHM},
+      {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
+      {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
+      {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
+      {"csum_type", CSUM_TYPE},
+      {"csum_max_block", CSUM_MAX_BLOCK},
+      {"csum_min_block", CSUM_MIN_BLOCK},
+      {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
+      {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
+      {"pg_num_min", PG_NUM_MIN},
+      {"pg_num_max", PG_NUM_MAX},
+      {"target_size_bytes", TARGET_SIZE_BYTES},
+      {"target_size_ratio", TARGET_SIZE_RATIO},
+      {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
+      {"dedup_tier", DEDUP_TIER},
+      {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
+      {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
+      {"bulk", BULK}
+    };
+
+    typedef std::set<osd_pool_get_choices> choices_set_t;
+
+    const choices_set_t ONLY_TIER_CHOICES = {
+      HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+      TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
+      CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+      CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+      MIN_READ_RECENCY_FOR_PROMOTE,
+      MIN_WRITE_RECENCY_FOR_PROMOTE,
+      HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
+    };
+    const choices_set_t ONLY_ERASURE_CHOICES = {
+      EC_OVERWRITES, ERASURE_CODE_PROFILE
+    };
+
+    choices_set_t selected_choices;
+    if (var == "all") {
+      for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
+	  it != ALL_CHOICES.end(); ++it) {
+	selected_choices.insert(it->second);
+      }
+
+      if(!p->is_tier()) {
+	selected_choices = subtract_second_from_first(selected_choices,
+						      ONLY_TIER_CHOICES);
+      }
+
+      if(!p->is_erasure()) {
+	selected_choices = subtract_second_from_first(selected_choices,
+						      ONLY_ERASURE_CHOICES);
+      }
+    } else /* var != "all" */  {
+      choices_map_t::const_iterator found = ALL_CHOICES.find(var);
+      if (found == ALL_CHOICES.end()) {
+        ss << "pool '" << poolstr
+	       << "': invalid variable: '" << var << "'";
+        r = -EINVAL;
+        goto reply;
+      }
+
+      osd_pool_get_choices selected = found->second;
+
+      if (!p->is_tier() &&
+	  ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
+	ss << "pool '" << poolstr
+	   << "' is not a tier pool: variable not applicable";
+	r = -EACCES;
+	goto reply;
+      }
+
+      if (!p->is_erasure() &&
+	  ONLY_ERASURE_CHOICES.find(selected)
+	  != ONLY_ERASURE_CHOICES.end()) {
+	ss << "pool '" << poolstr
+	   << "' is not a erasure pool: variable not applicable";
+	r = -EACCES;
+	goto reply;
+      }
+
+      if (pool_opts_t::is_opt_name(var) &&
+	  !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
+	ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
+	r = -ENOENT;
+	goto reply;
+      }
+
+      selected_choices.insert(selected);
+    }
+
+    if (f) {
+      f->open_object_section("pool");
+      f->dump_string("pool", poolstr);
+      f->dump_int("pool_id", pool);
+      for(choices_set_t::const_iterator it = selected_choices.begin();
+	  it != selected_choices.end(); ++it) {
+	choices_map_t::const_iterator i;
+        for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+          if (i->second == *it) {
+            break;
+          }
+        }
+        ceph_assert(i != ALL_CHOICES.end());
+	switch(*it) {
+	  case PG_NUM:
+	    f->dump_int("pg_num", p->get_pg_num());
+	    break;
+	  case PGP_NUM:
+	    f->dump_int("pgp_num", p->get_pgp_num());
+	    break;
+	  case SIZE:
+	    f->dump_int("size", p->get_size());
+	    break;
+	  case MIN_SIZE:
+	    f->dump_int("min_size", p->get_min_size());
+	    break;
+	  case CRUSH_RULE:
+	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+	      f->dump_string("crush_rule", osdmap.crush->get_rule_name(
+			       p->get_crush_rule()));
+	    } else {
+	      f->dump_string("crush_rule", stringify(p->get_crush_rule()));
+	    }
+	    break;
+	  case EC_OVERWRITES:
+	    f->dump_bool("allow_ec_overwrites",
+                         p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
+	    break;
+	  case PG_AUTOSCALE_MODE:
+	    f->dump_string("pg_autoscale_mode",
+			   pg_pool_t::get_pg_autoscale_mode_name(
+			     p->pg_autoscale_mode));
+	    break;
+	  case HASHPSPOOL:
+	  case NODELETE:
+	  case BULK:
+	  case NOPGCHANGE:
+	  case NOSIZECHANGE:
+	  case WRITE_FADVISE_DONTNEED:
+	  case NOSCRUB:
+	  case NODEEP_SCRUB:
+	    f->dump_bool(i->first.c_str(),
+			   p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
+	    break;
+	  case HIT_SET_PERIOD:
+	    f->dump_int("hit_set_period", p->hit_set_period);
+	    break;
+	  case HIT_SET_COUNT:
+	    f->dump_int("hit_set_count", p->hit_set_count);
+	    break;
+	  case HIT_SET_TYPE:
+	    f->dump_string("hit_set_type",
+			   HitSet::get_type_name(p->hit_set_params.get_type()));
+	    break;
+	  case HIT_SET_FPP:
+	    {
+	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+		BloomHitSet::Params *bloomp =
+		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+		f->dump_float("hit_set_fpp", bloomp->get_fpp());
+	      } else if(var != "all") {
+		f->close_section();
+		ss << "hit set is not of type Bloom; " <<
+		  "invalid to get a false positive rate!";
+		r = -EINVAL;
+		goto reply;
+	      }
+	    }
+	    break;
+	  case USE_GMT_HITSET:
+	    f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
+	    break;
+	  case TARGET_MAX_OBJECTS:
+	    f->dump_unsigned("target_max_objects", p->target_max_objects);
+	    break;
+	  case TARGET_MAX_BYTES:
+	    f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+	    break;
+	  case CACHE_TARGET_DIRTY_RATIO:
+	    f->dump_unsigned("cache_target_dirty_ratio_micro",
+			     p->cache_target_dirty_ratio_micro);
+	    f->dump_float("cache_target_dirty_ratio",
+			  ((float)p->cache_target_dirty_ratio_micro/1000000));
+	    break;
+	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
+	    f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+			     p->cache_target_dirty_high_ratio_micro);
+	    f->dump_float("cache_target_dirty_high_ratio",
+			  ((float)p->cache_target_dirty_high_ratio_micro/1000000));
+	    break;
+	  case CACHE_TARGET_FULL_RATIO:
+	    f->dump_unsigned("cache_target_full_ratio_micro",
+			     p->cache_target_full_ratio_micro);
+	    f->dump_float("cache_target_full_ratio",
+			  ((float)p->cache_target_full_ratio_micro/1000000));
+	    break;
+	  case CACHE_MIN_FLUSH_AGE:
+	    f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+	    break;
+	  case CACHE_MIN_EVICT_AGE:
+	    f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+	    break;
+	  case ERASURE_CODE_PROFILE:
+	    f->dump_string("erasure_code_profile", p->erasure_code_profile);
+	    break;
+	  case MIN_READ_RECENCY_FOR_PROMOTE:
+	    f->dump_int("min_read_recency_for_promote",
+			p->min_read_recency_for_promote);
+	    break;
+	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
+	    f->dump_int("min_write_recency_for_promote",
+			p->min_write_recency_for_promote);
+	    break;
+          case FAST_READ:
+            f->dump_int("fast_read", p->fast_read);
+            break;
+	  case HIT_SET_GRADE_DECAY_RATE:
+	    f->dump_int("hit_set_grade_decay_rate",
+			p->hit_set_grade_decay_rate);
+	    break;
+	  case HIT_SET_SEARCH_LAST_N:
+	    f->dump_int("hit_set_search_last_n",
+			p->hit_set_search_last_n);
+	    break;
+	  case SCRUB_MIN_INTERVAL:
+	  case SCRUB_MAX_INTERVAL:
+	  case DEEP_SCRUB_INTERVAL:
+          case RECOVERY_PRIORITY:
+          case RECOVERY_OP_PRIORITY:
+          case SCRUB_PRIORITY:
+	  case COMPRESSION_MODE:
+	  case COMPRESSION_ALGORITHM:
+	  case COMPRESSION_REQUIRED_RATIO:
+	  case COMPRESSION_MAX_BLOB_SIZE:
+	  case COMPRESSION_MIN_BLOB_SIZE:
+	  case CSUM_TYPE:
+	  case CSUM_MAX_BLOCK:
+	  case CSUM_MIN_BLOCK:
+	  case FINGERPRINT_ALGORITHM:
+	  case PG_NUM_MIN:
+	  case PG_NUM_MAX:
+	  case TARGET_SIZE_BYTES:
+	  case TARGET_SIZE_RATIO:
+	  case PG_AUTOSCALE_BIAS:
+	  case DEDUP_TIER:
+	  case DEDUP_CHUNK_ALGORITHM:
+	  case DEDUP_CDC_CHUNK_SIZE:
+            pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+            if (p->opts.is_set(key)) {
+              if(*it == CSUM_TYPE) {
+                int64_t val;
+                p->opts.get(pool_opts_t::CSUM_TYPE, &val);
+                f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
+              } else {
+                p->opts.dump(i->first, f.get());
+              }
+	    }
+            break;
+	}
+      }
+      f->close_section();
+      f->flush(rdata);
+    } else /* !f */ {
+      for(choices_set_t::const_iterator it = selected_choices.begin();
+	  it != selected_choices.end(); ++it) {
+	choices_map_t::const_iterator i;
+	switch(*it) {
+	  case PG_NUM:
+	    ss << "pg_num: " << p->get_pg_num() << "\n";
+	    break;
+	  case PGP_NUM:
+	    ss << "pgp_num: " << p->get_pgp_num() << "\n";
+	    break;
+	  case SIZE:
+	    ss << "size: " << p->get_size() << "\n";
+	    break;
+	  case MIN_SIZE:
+	    ss << "min_size: " << p->get_min_size() << "\n";
+	    break;
+	  case CRUSH_RULE:
+	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
+	      ss << "crush_rule: " << osdmap.crush->get_rule_name(
+		p->get_crush_rule()) << "\n";
+	    } else {
+	      ss << "crush_rule: " << p->get_crush_rule() << "\n";
+	    }
+	    break;
+	  case PG_AUTOSCALE_MODE:
+	    ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
+	      p->pg_autoscale_mode) <<"\n";
+	    break;
+	  case HIT_SET_PERIOD:
+	    ss << "hit_set_period: " << p->hit_set_period << "\n";
+	    break;
+	  case HIT_SET_COUNT:
+	    ss << "hit_set_count: " << p->hit_set_count << "\n";
+	    break;
+	  case HIT_SET_TYPE:
+	    ss << "hit_set_type: " <<
+	      HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
+	    break;
+	  case HIT_SET_FPP:
+	    {
+	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+		BloomHitSet::Params *bloomp =
+		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+		ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
+	      } else if(var != "all") {
+		ss << "hit set is not of type Bloom; " <<
+		  "invalid to get a false positive rate!";
+		r = -EINVAL;
+		goto reply;
+	      }
+	    }
+	    break;
+	  case USE_GMT_HITSET:
+	    ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
+	    break;
+	  case TARGET_MAX_OBJECTS:
+	    ss << "target_max_objects: " << p->target_max_objects << "\n";
+	    break;
+	  case TARGET_MAX_BYTES:
+	    ss << "target_max_bytes: " << p->target_max_bytes << "\n";
+	    break;
+	  case CACHE_TARGET_DIRTY_RATIO:
+	    ss << "cache_target_dirty_ratio: "
+	       << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
+	    ss << "cache_target_dirty_high_ratio: "
+	       << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_TARGET_FULL_RATIO:
+	    ss << "cache_target_full_ratio: "
+	       << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
+	    break;
+	  case CACHE_MIN_FLUSH_AGE:
+	    ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
+	    break;
+	  case CACHE_MIN_EVICT_AGE:
+	    ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
+	    break;
+	  case ERASURE_CODE_PROFILE:
+	    ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
+	    break;
+	  case MIN_READ_RECENCY_FOR_PROMOTE:
+	    ss << "min_read_recency_for_promote: " <<
+	      p->min_read_recency_for_promote << "\n";
+	    break;
+	  case HIT_SET_GRADE_DECAY_RATE:
+	    ss << "hit_set_grade_decay_rate: " <<
+	      p->hit_set_grade_decay_rate << "\n";
+	    break;
+	  case HIT_SET_SEARCH_LAST_N:
+	    ss << "hit_set_search_last_n: " <<
+	      p->hit_set_search_last_n << "\n";
+	    break;
+	  case EC_OVERWRITES:
+	    ss << "allow_ec_overwrites: " <<
+	      (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
+	      "\n";
+	    break;
+	  case HASHPSPOOL:
+	  case NODELETE:
+	  case BULK:
+	  case NOPGCHANGE:
+	  case NOSIZECHANGE:
+	  case WRITE_FADVISE_DONTNEED:
+	  case NOSCRUB:
+	  case NODEEP_SCRUB:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    ceph_assert(i != ALL_CHOICES.end());
+	    ss << i->first << ": " <<
+	      (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
+	       "true" : "false") << "\n";
+	    break;
+	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
+	    ss << "min_write_recency_for_promote: " <<
+	      p->min_write_recency_for_promote << "\n";
+	    break;
+          case FAST_READ:
+            ss << "fast_read: " << p->fast_read << "\n";
+            break;
+	  case SCRUB_MIN_INTERVAL:
+	  case SCRUB_MAX_INTERVAL:
+	  case DEEP_SCRUB_INTERVAL:
+          case RECOVERY_PRIORITY:
+          case RECOVERY_OP_PRIORITY:
+          case SCRUB_PRIORITY:
+	  case COMPRESSION_MODE:
+	  case COMPRESSION_ALGORITHM:
+	  case COMPRESSION_REQUIRED_RATIO:
+	  case COMPRESSION_MAX_BLOB_SIZE:
+	  case COMPRESSION_MIN_BLOB_SIZE:
+	  case CSUM_TYPE:
+	  case CSUM_MAX_BLOCK:
+	  case CSUM_MIN_BLOCK:
+	  case FINGERPRINT_ALGORITHM:
+	  case PG_NUM_MIN:
+	  case PG_NUM_MAX:
+	  case TARGET_SIZE_BYTES:
+	  case TARGET_SIZE_RATIO:
+	  case PG_AUTOSCALE_BIAS:
+	  case DEDUP_TIER:
+	  case DEDUP_CHUNK_ALGORITHM:
+	  case DEDUP_CDC_CHUNK_SIZE:
+	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
+	      if (i->second == *it)
+		break;
+	    }
+	    ceph_assert(i != ALL_CHOICES.end());
+	    {
+	      pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
+	      if (p->opts.is_set(key)) {
+                if(key == pool_opts_t::CSUM_TYPE) {
+                  int64_t val;
+                  p->opts.get(key, &val);
+  		  ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
+                } else {
+  		  ss << i->first << ": " << p->opts.get(key) << "\n";
+                }
+	      }
+	    }
+	    break;
+	}
+	rdata.append(ss.str());
+	ss.str("");
+      }
+    }
+    r = 0;
+  } else if (prefix == "osd pool get-quota") {
+    string pool_name;
+    cmd_getval(cmdmap, "pool", pool_name);
+
+    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
+    if (poolid < 0) {
+      ceph_assert(poolid == -ENOENT);
+      ss << "unrecognized pool '" << pool_name << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(poolid);
+    const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
+    if (!pstat) {
+      ss << "no stats for pool '" << pool_name << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+    const object_stat_sum_t& sum = pstat->stats.sum;
+    if (f) {
+      f->open_object_section("pool_quotas");
+      f->dump_string("pool_name", pool_name);
+      f->dump_unsigned("pool_id", poolid);
+      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
+      f->dump_int("current_num_objects", sum.num_objects);
+      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
+      f->dump_int("current_num_bytes", sum.num_bytes);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      stringstream rs;
+      rs << "quotas for pool '" << pool_name << "':\n"
+         << "  max objects: ";
+      if (p->quota_max_objects == 0)
+        rs << "N/A";
+      else {
+        rs << si_u_t(p->quota_max_objects) << " objects";
+        rs << "  (current num objects: " << sum.num_objects << " objects)";
+      }
+      rs << "\n"
+         << "  max bytes  : ";
+      if (p->quota_max_bytes == 0)
+        rs << "N/A";
+      else {
+        rs << byte_u_t(p->quota_max_bytes);
+        rs << "  (current num bytes: " << sum.num_bytes << " bytes)";
+      }
+      rdata.append(rs.str());
+    }
+    rdata.append("\n");
+    r = 0;
+  } else if (prefix == "osd crush rule list" ||
+	     prefix == "osd crush rule ls") {
+    if (f) {
+      f->open_array_section("rules");
+      osdmap.crush->list_rules(f.get());
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ostringstream ss;
+      osdmap.crush->list_rules(&ss);
+      rdata.append(ss.str());
+    }
+  } else if (prefix == "osd crush rule ls-by-class") {
+    string class_name;
+    cmd_getval(cmdmap, "class", class_name);
+    if (class_name.empty()) {
+      ss << "no class specified";
+      r = -EINVAL;
+      goto reply;
+    }
+    set<int> rules;
+    r = osdmap.crush->get_rules_by_class(class_name, &rules);
+    if (r < 0) {
+      ss << "failed to get rules by class '" << class_name << "'";
+      goto reply;
+    }
+    if (f) {
+      f->open_array_section("rules");
+      for (auto &rule: rules) {
+        f->dump_string("name", osdmap.crush->get_rule_name(rule));
+      }
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ostringstream rs;
+      for (auto &rule: rules) {
+        rs << osdmap.crush->get_rule_name(rule) << "\n";
+      }
+      rdata.append(rs.str());
+    }
+  } else if (prefix == "osd crush rule dump") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    string format;
+    cmd_getval(cmdmap, "format", format);
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    if (name == "") {
+      f->open_array_section("rules");
+      osdmap.crush->dump_rules(f.get());
+      f->close_section();
+    } else {
+      int ruleno = osdmap.crush->get_rule_id(name);
+      if (ruleno < 0) {
+	ss << "unknown crush rule '" << name << "'";
+	r = ruleno;
+	goto reply;
+      }
+      osdmap.crush->dump_rule(ruleno, f.get());
+    }
+    ostringstream rs;
+    f->flush(rs);
+    rs << "\n";
+    rdata.append(rs.str());
+  } else if (prefix == "osd crush dump") {
+    string format;
+    cmd_getval(cmdmap, "format", format);
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    f->open_object_section("crush_map");
+    osdmap.crush->dump(f.get());
+    f->close_section();
+    ostringstream rs;
+    f->flush(rs);
+    rs << "\n";
+    rdata.append(rs.str());
+  } else if (prefix == "osd crush show-tunables") {
+    string format;
+    cmd_getval(cmdmap, "format", format);
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    f->open_object_section("crush_map_tunables");
+    osdmap.crush->dump_tunables(f.get());
+    f->close_section();
+    ostringstream rs;
+    f->flush(rs);
+    rs << "\n";
+    rdata.append(rs.str());
+  } else if (prefix == "osd crush tree") {
+    string shadow;
+    cmd_getval(cmdmap, "shadow", shadow);
+    bool show_shadow = shadow == "--show-shadow";
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+    if (f) {
+      f->open_object_section("crush_tree");
+      osdmap.crush->dump_tree(nullptr,
+                              f.get(),
+                              osdmap.get_pool_names(),
+                              show_shadow);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ostringstream ss;
+      osdmap.crush->dump_tree(&ss,
+                              nullptr,
+                              osdmap.get_pool_names(),
+                              show_shadow);
+      rdata.append(ss.str());
+    }
+  } else if (prefix == "osd crush ls") {
+    string name;
+    if (!cmd_getval(cmdmap, "node", name)) {
+      ss << "no node specified";
+      r = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.crush->name_exists(name)) {
+      ss << "node '" << name << "' does not exist";
+      r = -ENOENT;
+      goto reply;
+    }
+    int id = osdmap.crush->get_item_id(name);
+    list<int> result;
+    if (id >= 0) {
+      result.push_back(id);
+    } else {
+      int num = osdmap.crush->get_bucket_size(id);
+      for (int i = 0; i < num; ++i) {
+	result.push_back(osdmap.crush->get_bucket_item(id, i));
+      }
+    }
+    if (f) {
+      f->open_array_section("items");
+      for (auto i : result) {
+	f->dump_string("item", osdmap.crush->get_item_name(i));
+      }
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ostringstream ss;
+      for (auto i : result) {
+	ss << osdmap.crush->get_item_name(i) << "\n";
+      }
+      rdata.append(ss.str());
+    }
+    r = 0;
+  } else if (prefix == "osd crush class ls") {
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
+    f->open_array_section("crush_classes");
+    for (auto i : osdmap.crush->class_name)
+      f->dump_string("class", i.second);
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "osd crush class ls-osd") {
+    string name;
+    cmd_getval(cmdmap, "class", name);
+    set<int> osds;
+    osdmap.crush->get_devices_by_class(name, &osds);
+    if (f) {
+      f->open_array_section("osds");
+      for (auto &osd: osds)
+        f->dump_int("osd", osd);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      bool first = true;
+      for (auto &osd : osds) {
+        if (!first)
+          ds << "\n";
+        first = false;
+        ds << osd;
+      }
+      rdata.append(ds);
+    }
+  } else if (prefix == "osd crush get-device-class") {
+    vector<string> idvec;
+    cmd_getval(cmdmap, "ids", idvec);
+    map<int, string> class_by_osd;
+    for (auto& id : idvec) {
+      ostringstream ts;
+      long osd = parse_osd_id(id.c_str(), &ts);
+      if (osd < 0) {
+        ss << "unable to parse osd id:'" << id << "'";
+        r = -EINVAL;
+        goto reply;
+      }
+      auto device_class = osdmap.crush->get_item_class(osd);
+      if (device_class)
+        class_by_osd[osd] = device_class;
+      else
+        class_by_osd[osd] = ""; // no class
+    }
+    if (f) {
+      f->open_array_section("osd_device_classes");
+      for (auto& i : class_by_osd) {
+        f->open_object_section("osd_device_class");
+        f->dump_int("osd", i.first);
+        f->dump_string("device_class", i.second);
+        f->close_section();
+      }
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      if (class_by_osd.size() == 1) {
+        // for single input, make a clean output
+        ds << class_by_osd.begin()->second;
+      } else {
+        // note that we do not group osds by class here
+        for (auto it = class_by_osd.begin();
+             it != class_by_osd.end();
+             it++) {
+          ds << "osd." << it->first << ' ' << it->second;
+          if (next(it) != class_by_osd.end())
+            ds << '\n';
+        }
+      }
+      rdata.append(ds);
+    }
+  } else if (prefix == "osd erasure-code-profile ls") {
+    const auto &profiles = osdmap.get_erasure_code_profiles();
+    if (f)
+      f->open_array_section("erasure-code-profiles");
+    for (auto i = profiles.begin(); i != profiles.end(); ++i) {
+      if (f)
+        f->dump_string("profile", i->first.c_str());
+      else
+	rdata.append(i->first + "\n");
+    }
+    if (f) {
+      f->close_section();
+      ostringstream rs;
+      f->flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+    }
+  } else if (prefix == "osd crush weight-set ls") {
+    boost::scoped_ptr<Formatter> f(Formatter::create(format));
+    if (f) {
+      f->open_array_section("weight_sets");
+      if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+	f->dump_string("pool", "(compat)");
+      }
+      for (auto& i : osdmap.crush->choose_args) {
+	if (i.first >= 0) {
+	  f->dump_string("pool", osdmap.get_pool_name(i.first));
+	}
+      }
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ostringstream rs;
+      if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
+	rs << "(compat)\n";
+      }
+      for (auto& i : osdmap.crush->choose_args) {
+	if (i.first >= 0) {
+	  rs << osdmap.get_pool_name(i.first) << "\n";
+	}
+      }
+      rdata.append(rs.str());
+    }
+  } else if (prefix == "osd crush weight-set dump") {
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+						     "json-pretty"));
+    osdmap.crush->dump_choose_args(f.get());
+    f->flush(rdata);
+  } else if (prefix == "osd erasure-code-profile get") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    if (!osdmap.has_erasure_code_profile(name)) {
+      ss << "unknown erasure code profile '" << name << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+    const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
+    if (f)
+      f->open_object_section("profile");
+    for (map<string,string>::const_iterator i = profile.begin();
+	 i != profile.end();
+	 ++i) {
+      if (f)
+        f->dump_string(i->first.c_str(), i->second.c_str());
+      else
+	rdata.append(i->first + "=" + i->second + "\n");
+    }
+    if (f) {
+      f->close_section();
+      ostringstream rs;
+      f->flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+    }
+  } else if (prefix == "osd pool application get") {
+    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+                                                     "json-pretty"));
+    string pool_name;
+    cmd_getval(cmdmap, "pool", pool_name);
+    string app;
+    cmd_getval(cmdmap, "app", app);
+    string key;
+    cmd_getval(cmdmap, "key", key);
+
+    if (pool_name.empty()) {
+      // all
+      f->open_object_section("pools");
+      for (const auto &pool : osdmap.pools) {
+        std::string name("<unknown>");
+        const auto &pni = osdmap.pool_name.find(pool.first);
+        if (pni != osdmap.pool_name.end())
+          name = pni->second;
+        f->open_object_section(name.c_str());
+        for (auto &app_pair : pool.second.application_metadata) {
+          f->open_object_section(app_pair.first.c_str());
+          for (auto &kv_pair : app_pair.second) {
+            f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+          }
+          f->close_section();
+        }
+        f->close_section(); // name
+      }
+      f->close_section(); // pools
+      f->flush(rdata);
+    } else {
+      int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+      if (pool < 0) {
+        ss << "unrecognized pool '" << pool_name << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      auto p = osdmap.get_pg_pool(pool);
+      // filter by pool
+      if (app.empty()) {
+        f->open_object_section(pool_name.c_str());
+        for (auto &app_pair : p->application_metadata) {
+          f->open_object_section(app_pair.first.c_str());
+          for (auto &kv_pair : app_pair.second) {
+            f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+          }
+          f->close_section(); // application
+        }
+        f->close_section(); // pool_name
+        f->flush(rdata);
+        goto reply;
+      }
+
+      auto app_it = p->application_metadata.find(app);
+      if (app_it == p->application_metadata.end()) {
+        ss << "pool '" << pool_name << "' has no application '" << app << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      // filter by pool + app
+      if (key.empty()) {
+        f->open_object_section(app_it->first.c_str());
+        for (auto &kv_pair : app_it->second) {
+          f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+        }
+        f->close_section(); // application
+        f->flush(rdata);
+        goto reply;
+      }
+      // filter by pool + app + key
+      auto key_it = app_it->second.find(key);
+      if (key_it == app_it->second.end()) {
+        ss << "application '" << app << "' on pool '" << pool_name
+           << "' does not have key '" << key << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      ss << key_it->second << "\n";
+      rdata.append(ss.str());
+      ss.str("");
+    }
+  } else if (prefix == "osd get-require-min-compat-client") {
+    ss << osdmap.require_min_compat_client << std::endl;
+    rdata.append(ss.str());
+    ss.str("");
+    goto reply;
+  } else if (prefix == "osd pool application enable" ||
+             prefix == "osd pool application disable" ||
+             prefix == "osd pool application set" ||
+             prefix == "osd pool application rm") {
+    bool changed = false;
+    r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
+    if (r != 0) {
+      // Error, reply.
+      goto reply;
+    } else if (changed) {
+      // Valid mutation, proceed to prepare phase
+      return false;
+    } else {
+      // Idempotent case, reply
+      goto reply;
+    }
+  } else {
+    // try prepare update
+    return false;
+  }
+
+ reply:
+  string rs;
+  getline(ss, rs);
+  mon.reply_command(op, r, rs, rdata, get_last_committed());
+  return true;
+}
+
+/**
+ * Set flag bits on a pool in the pending incremental.
+ *
+ * The pool entry is obtained from pending_inc.get_new_pool(), which is given
+ * the pool's current entry from osdmap; the result is asserted non-null.
+ *
+ * @param pool_id id of the pool to modify
+ * @param flags   flag bits passed to pg_pool_t::set_flag()
+ */
+void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
+{
+  const pg_pool_t *current = osdmap.get_pg_pool(pool_id);
+  pg_pool_t *pending = pending_inc.get_new_pool(pool_id, current);
+  ceph_assert(pending);
+  pending->set_flag(flags);
+}
+
+/**
+ * Clear flag bits on a pool in the pending incremental.
+ *
+ * The pool entry is obtained from pending_inc.get_new_pool(), which is given
+ * the pool's current entry from osdmap; the result is asserted non-null.
+ *
+ * @param pool_id id of the pool to modify
+ * @param flags   flag bits passed to pg_pool_t::unset_flag()
+ */
+void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
+{
+  const pg_pool_t *current = osdmap.get_pg_pool(pool_id);
+  pg_pool_t *pending = pending_inc.get_new_pool(pool_id, current);
+  ceph_assert(pending);
+  pending->unset_flag(flags);
+}
+
+/**
+ * Build the store key for a purged-snaps epoch record.
+ *
+ * @param epoch epoch to encode
+ * @return "purged_epoch_" followed by the epoch as zero-padded (at least
+ *         8 digits) lowercase hex
+ */
+string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
+{
+  char buf[80];
+  const unsigned long e = (unsigned long)epoch;
+  snprintf(buf, sizeof(buf), "purged_epoch_%08lx", e);
+  return string(buf);
+}
+
+/**
+ * Build the store key for a purged-snap record.
+ *
+ * @param pool pool id, rendered as an unsigned decimal
+ * @param snap snap id, rendered as 16-digit zero-padded lowercase hex
+ * @return "purged_snap_<pool>_<snap>"
+ */
+string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
+{
+  char buf[80];
+  const unsigned long long pool_u = (unsigned long long)pool;
+  const unsigned long long snap_u = (unsigned long long)snap;
+  snprintf(buf, sizeof(buf), "purged_snap_%llu_%016llx", pool_u, snap_u);
+  return string(buf);
+}
+
+/**
+ * Build both the key and the encoded value for a purged-snap interval.
+ *
+ * The value appended to *v is, in order: the first snap, the end of the
+ * interval (snap + num), and the epoch.  The key is built from the *last*
+ * snap in the interval (snap + num - 1) so that a forward-only iteration
+ * can be used to find the interval containing a given snap.
+ *
+ * @param pool  pool id
+ * @param snap  first snap of the interval
+ * @param num   number of snaps in the interval
+ * @param epoch epoch to record in the value
+ * @param v     bufferlist the value is appended to
+ * @return the store key for the interval
+ */
+string OSDMonitor::make_purged_snap_key_value(
+  int64_t pool, snapid_t snap, snapid_t num,
+  epoch_t epoch, bufferlist *v)
+{
+  const auto interval_end = snap + num;
+  encode(snap, *v);
+  encode(interval_end, *v);
+  encode(epoch, *v);
+  return make_purged_snap_key(pool, interval_end - 1);
+}
+
+
+/**
+ * Find the purged-snap interval record that contains a given snap.
+ *
+ * Purged intervals are keyed by their last snap (see
+ * make_purged_snap_key_value()), so the iterator is positioned with
+ * lower_bound() on the key for (pool, snap) and the record found there is
+ * checked for the right key prefix, the right pool, and an interval that
+ * actually covers snap.
+ *
+ * @param pool  pool id to search
+ * @param snap  snap id to look for
+ * @param begin out: start of the interval stored in the record found
+ * @param end   out: end (exclusive) of the interval stored in the record
+ * @return 0 if snap lies in [*begin, *end); -ENOENT otherwise.  *begin and
+ *         *end are only written once a record for the right pool has been
+ *         found and decoded, so they may be left untouched on -ENOENT.
+ */
+int OSDMonitor::lookup_purged_snap(
+  int64_t pool, snapid_t snap,
+  snapid_t *begin, snapid_t *end)
+{
+  string k = make_purged_snap_key(pool, snap);
+  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
+  it->lower_bound(k);
+  if (!it->valid()) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' not found" << dendl;
+    return -ENOENT;
+  }
+  // fetch the key once and reuse it for every check below
+  const string gotk = it->key();
+  if (gotk.find("purged_snap_") != 0) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' got '" << gotk
+             << "', wrong prefix" << dendl;
+    return -ENOENT;
+  }
+  // %llu requires an unsigned long long destination; the key was written
+  // by make_purged_snap_key() from the pool id cast to unsigned long long,
+  // so compare in that same domain.
+  unsigned long long keypool = 0;
+  int n = sscanf(gotk.c_str(), "purged_snap_%llu_", &keypool);
+  if (n != 1) {
+    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
+    return -ENOENT;
+  }
+  if ((unsigned long long)pool != keypool) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - key '" << k << "' got '" << gotk
+             << "', wrong pool " << keypool
+             << dendl;
+    return -ENOENT;
+  }
+  bufferlist v = it->value();
+  auto p = v.cbegin();
+  decode(*begin, p);
+  decode(*end, p);
+  if (snap < *begin || snap >= *end) {
+    dout(20) << __func__
+             << " pool " << pool << " snap " << snap
+             << " - found [" << *begin << "," << *end << "), no overlap"
+             << dendl;
+    return -ENOENT;
+  }
+  return 0;
+}
+
+/**
+ * Queue the store updates that record [start, end) as purged for a pool,
+ * merging with any existing purged interval that is directly adjacent.
+ *
+ * Two lookups are done: one for the snap just before the new range
+ * (start - 1) and one for the snap just after it (end).  Depending on
+ * which lookups succeed the new range is joined with the earlier record,
+ * the later record, both, or written as a brand new record.
+ *
+ * Records are keyed by the last snap of their interval, so growing an
+ * interval at its tail changes its key (the old record must be erased)
+ * while growing it at its head keeps the key (a put overwrites it).
+ *
+ * NOTE(review): the 'epoch' parameter is not used in this body; the value
+ * stored in every record is pending_inc.epoch.  Confirm whether callers
+ * rely on passing a different epoch.
+ *
+ * @param pool  pool id
+ * @param start first snap of the newly purged range
+ * @param end   end (exclusive) of the newly purged range
+ * @param epoch unused here (see note above)
+ * @param t     transaction the erase/put operations are added to
+ */
+void OSDMonitor::insert_purged_snap_update(
+  int64_t pool,
+  snapid_t start, snapid_t end,
+  epoch_t epoch,
+  MonitorDBStore::TransactionRef t)
+{
+  snapid_t before_begin, before_end;
+  snapid_t after_begin, after_end;
+  // b == 0: an existing interval ends exactly at 'start'
+  int b = lookup_purged_snap(pool, start - 1,
+			     &before_begin, &before_end);
+  // a == 0: an existing interval begins exactly at 'end'
+  int a = lookup_purged_snap(pool, end,
+			     &after_begin, &after_end);
+  if (!b && !a) {
+    dout(10) << __func__
+	     << " [" << start << "," << end << ") - joins ["
+	     << before_begin << "," << before_end << ") and ["
+	     << after_begin << "," << after_end << ")" << dendl;
+    // erase only the begin record; we'll overwrite the end one.
+    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
+    bufferlist v;
+    // merged interval is [before_begin, after_end); its key (after_end - 1)
+    // is the same as the existing 'after' record's key.
+    string k = make_purged_snap_key_value(pool,
+					  before_begin, after_end - before_begin,
+					  pending_inc.epoch, &v);
+    t->put(OSD_SNAP_PREFIX, k, v);
+  } else if (!b) {
+    dout(10) << __func__
+	     << " [" << start << "," << end << ") - join with earlier ["
+	     << before_begin << "," << before_end << ")" << dendl;
+    // the earlier record's last snap changes, so its old key must go
+    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
+    bufferlist v;
+    string k = make_purged_snap_key_value(pool,
+					  before_begin, end - before_begin,
+					  pending_inc.epoch, &v);
+    t->put(OSD_SNAP_PREFIX, k, v);
+  } else if (!a) {
+    dout(10) << __func__
+	     << " [" << start << "," << end << ") - join with later ["
+	     << after_begin << "," << after_end << ")" << dendl;
+    // overwrite after record
+    bufferlist v;
+    string k = make_purged_snap_key_value(pool,
+					  start, after_end - start,
+					  pending_inc.epoch, &v);
+    t->put(OSD_SNAP_PREFIX, k, v);
+  } else {
+    // no adjacent interval on either side: write a standalone record
+    dout(10) << __func__
+	     << " [" << start << "," << end << ") - new"
+	     << dendl;
+    bufferlist v;
+    string k = make_purged_snap_key_value(pool,
+					  start, end - start,
+					  pending_inc.epoch, &v);
+    t->put(OSD_SNAP_PREFIX, k, v);
+  }
+}
+
+/**
+ * Move snaps that the mgr digest reports as purged into
+ * pending_inc.new_purged_snaps, up to a per-epoch budget.
+ *
+ * For every pool in the osdmap that has an entry in the digest's
+ * purged_snaps, each reported interval is checked against the purged-snap
+ * records in the store (lookup_purged_snap()); intervals not already
+ * recorded are collected and then intersected with the pool's
+ * removed_snaps_queue before being staged in pending_inc.
+ *
+ * The budget comes from the mon_max_snap_prune_per_epoch option; a value of
+ * 0 is replaced by 100000.
+ *
+ * @return true if at least one snap was staged; false if nothing was
+ *         staged, mgrstatmon is not readable, or pending_inc already
+ *         carries purged snaps.
+ */
+bool OSDMonitor::try_prune_purged_snaps()
+{
+  if (!mon.mgrstatmon()->is_readable()) {
+    return false;
+  }
+  if (!pending_inc.new_purged_snaps.empty()) {
+    return false;  // we already pruned for this epoch
+  }
+
+  unsigned max_prune = cct->_conf.get_val<uint64_t>(
+    "mon_max_snap_prune_per_epoch");
+  if (!max_prune) {
+    max_prune = 100000;
+  }
+  dout(10) << __func__ << " max_prune " << max_prune << dendl;
+
+  // running total of snaps staged across all pools so far
+  unsigned actually_pruned = 0;
+  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
+  for (auto& p : osdmap.get_pools()) {
+    auto q = purged_snaps.find(p.first);
+    if (q == purged_snaps.end()) {
+      continue;
+    }
+    auto& purged = q->second;
+    if (purged.empty()) {
+      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
+      continue;
+    }
+    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
+    snap_interval_set_t to_prune;
+    // optimistic count: what has been staged so far plus what this pool's
+    // candidates would add, used to stop scanning once the budget is hit
+    unsigned maybe_pruned = actually_pruned;
+    for (auto i = purged.begin(); i != purged.end(); ++i) {
+      snapid_t begin = i.get_start();
+      auto end = i.get_start() + i.get_len();
+      // stay 0 unless lookup_purged_snap() gets far enough to decode a record
+      snapid_t pbegin = 0, pend = 0;
+      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
+      if (r == 0) {
+	// already purged.
+	// be a bit aggressive about backing off here, because the mon may
+	// do a lot of work going through this set, and if we know the
+	// purged set from the OSDs is at least *partly* stale we may as
+	// well wait for it to be fresh.
+	dout(20) << __func__ << "  we've already purged " << pbegin
+		 << "~" << (pend - pbegin) << dendl;
+	// stops scanning this pool's intervals; anything already collected in
+	// to_prune is still processed below
+	break;  // next pool
+      }
+      if (pbegin && pbegin > begin && pbegin < end) {
+	// the tail of [begin,end) is purged; shorten the range
+	end = pbegin;
+      }
+      to_prune.insert(begin, end - begin);
+      maybe_pruned += end - begin;
+      if (maybe_pruned >= max_prune) {
+	break;
+      }
+    }
+    if (!to_prune.empty()) {
+      // PGs may still be reporting things as purged that we have already
+      // pruned from removed_snaps_queue.
+      snap_interval_set_t actual;
+      auto r = osdmap.removed_snaps_queue.find(p.first);
+      if (r != osdmap.removed_snaps_queue.end()) {
+	actual.intersection_of(to_prune, r->second);
+      }
+      actually_pruned += actual.size();
+      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
+	       << ", actual pruned " << actual << dendl;
+      if (!actual.empty()) {
+	pending_inc.new_purged_snaps[p.first].swap(actual);
+      }
+    }
+    // budget exhausted: leave the remaining pools for a later call
+    if (actually_pruned >= max_prune) {
+      break;
+    }
+  }
+  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
+  return !!actually_pruned;
+}
+
+/**
+ * Bring each pool's quota-full flags in line with its current usage.
+ *
+ * For every pool that has stats available from mgrstatmon, usage is compared
+ * against quota_max_bytes / quota_max_objects (a quota of 0 is not
+ * enforced).  A pool that carries FLAG_FULL_QUOTA but is no longer over
+ * quota has FLAG_FULL_QUOTA and FLAG_FULL cleared; a pool that is over
+ * quota but not yet flagged gets both flags set and has FLAG_NEARFULL and
+ * FLAG_BACKFILLFULL cleared.  Each transition is reported to the cluster
+ * log.
+ *
+ * @return true if any pool's flags were changed; false otherwise, including
+ *         when mgrstatmon is not readable.
+ */
+bool OSDMonitor::update_pools_status()
+{
+  if (!mon.mgrstatmon()->is_readable()) {
+    return false;
+  }
+
+  bool changed = false;
+
+  for (const auto& entry : osdmap.get_pools()) {
+    const auto pool_id = entry.first;
+    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(pool_id);
+    if (pstat == nullptr) {
+      continue;
+    }
+    const object_stat_sum_t& sum = pstat->stats.sum;
+    const pg_pool_t& pool = entry.second;
+    const string& pool_name = osdmap.get_pool_name(pool_id);
+
+    const bool over_bytes =
+      pool.quota_max_bytes > 0 &&
+      (uint64_t)sum.num_bytes >= pool.quota_max_bytes;
+    const bool over_objects =
+      pool.quota_max_objects > 0 &&
+      (uint64_t)sum.num_objects >= pool.quota_max_objects;
+    const bool pool_is_full = over_bytes || over_objects;
+    const bool flagged = pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA);
+
+    // nothing to do when the flag already reflects reality
+    if (flagged && pool_is_full) {
+      continue;
+    }
+    if (!flagged && !pool_is_full) {
+      continue;
+    }
+
+    if (flagged) {
+      mon.clog->info() << "pool '" << pool_name
+                       << "' no longer out of quota; removing NO_QUOTA flag";
+      // FLAG_FULL is cancelled here as well; it is set again in
+      // OSDMonitor::encode_pending if the pool still fails the osd-full
+      // checking.
+      clear_pool_flags(pool_id,
+                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+    } else {
+      if (over_bytes) {
+        mon.clog->warn() << "pool '" << pool_name << "' is full"
+                         << " (reached quota's max_bytes: "
+                         << byte_u_t(pool.quota_max_bytes) << ")";
+      }
+      if (over_objects) {
+        mon.clog->warn() << "pool '" << pool_name << "' is full"
+                         << " (reached quota's max_objects: "
+                         << pool.quota_max_objects << ")";
+      }
+      // set both FLAG_FULL_QUOTA and FLAG_FULL, and try to cancel
+      // FLAG_BACKFILLFULL/NEARFULL too since FLAG_FULL should always take
+      // precedence
+      set_pool_flags(pool_id,
+                     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
+      clear_pool_flags(pool_id,
+                       pg_pool_t::FLAG_NEARFULL |
+                       pg_pool_t::FLAG_BACKFILLFULL);
+    }
+    changed = true;
+  }
+  return changed;
+}
+
+/**
+ * Handle a pool-creation request carried by an MPoolOp message.
+ *
+ * Forwards to the full prepare_new_pool() overload using the name and crush
+ * rule from the message, an empty rule name and erasure code profile,
+ * TYPE_REPLICATED, FAST_READ_OFF, bulk disabled and zero for all remaining
+ * numeric arguments.
+ *
+ * @param op the request; its MPoolOp supplies the pool name and crush rule
+ * @return -EPERM if the op has no session, otherwise the result of the
+ *         full overload
+ */
+int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+  dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
+
+  MonSession *session = op->get_session();
+  if (session == nullptr) {
+    return -EPERM;
+  }
+
+  string rule_name;
+  string erasure_code_profile;
+  bool bulk = false;
+  stringstream ss;
+  const int ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
+                                   0, 0, 0, 0, 0, 0, 0.0,
+                                   erasure_code_profile,
+                                   pg_pool_t::TYPE_REPLICATED, 0,
+                                   FAST_READ_OFF, {}, bulk,
+                                   &ss);
+  if (ret < 0) {
+    dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
+  }
+  return ret;
+}
+
+/**
+ * Rename a CRUSH bucket in the pending crush map.
+ *
+ * @param srcname current bucket name
+ * @param dstname new bucket name
+ * @param ss      stream that receives either the error text produced by the
+ *                CrushWrapper calls or the success message
+ * @return 0 on success, otherwise the non-zero result of
+ *         can_rename_bucket() / rename_bucket()
+ */
+int OSDMonitor::crush_rename_bucket(const string& srcname,
+                                    const string& dstname,
+                                    ostream *ss)
+{
+  // If no pending crush exists yet, check against the stable crush first
+  // so that a rename that is going to fail does not create one.
+  if (!_have_pending_crush()) {
+    const int precheck =
+      _get_stable_crush().can_rename_bucket(srcname, dstname, ss);
+    if (precheck) {
+      return precheck;
+    }
+  }
+
+  CrushWrapper newcrush;
+  _get_pending_crush(newcrush);
+
+  const int err = newcrush.rename_bucket(srcname, dstname, ss);
+  if (err) {
+    return err;
+  }
+
+  // replace whatever crush was staged with the updated one
+  pending_inc.crush.clear();
+  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  *ss << "renamed bucket " << srcname << " into " << dstname;
+  return 0;
+}
+
+/**
+ * Log a warning when an erasure code profile names a deprecated plugin.
+ *
+ * The architecture-specific jerasure_* and shec_* plugin names listed below
+ * are reported as deprecated, with "jerasure" or "shec" respectively
+ * suggested as the replacement.  Any other plugin name is accepted silently.
+ *
+ * @param plugin  plugin name taken from the profile
+ * @param profile profile name, used only in the log message
+ */
+void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
+{
+  const char *replacement = nullptr;
+
+  if (plugin == "jerasure_generic" ||
+      plugin == "jerasure_sse3" ||
+      plugin == "jerasure_sse4" ||
+      plugin == "jerasure_neon") {
+    replacement = "jerasure";
+  } else if (plugin == "shec_generic" ||
+             plugin == "shec_sse3" ||
+             plugin == "shec_sse4" ||
+             plugin == "shec_neon") {
+    replacement = "shec";
+  }
+
+  if (replacement == nullptr) {
+    return;
+  }
+
+  dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
+          << plugin << " that has been deprecated. Please use "
+          << replacement << " instead." << dendl;
+}
+
+/**
+ * Validate an erasure code profile by instantiating and initializing its
+ * plugin, and sanity-check the optional stripe_unit entry.
+ *
+ * The profile is passed by non-const reference to the plugin factory and
+ * to init(), so either of them may modify it.
+ *
+ * @param profilename name of the profile, used for the deprecation warning
+ * @param profile     the profile key/value map; must contain "plugin"
+ * @param force       when true, a stripe_unit that is not a multiple of
+ *                    4096 is accepted
+ * @param ss          stream that receives error text
+ * @return 0 on success; -EINVAL if the profile has no "plugin" entry, if
+ *         stripe_unit cannot be parsed, does not match the plugin's chunk
+ *         alignment, or is not a multiple of 4096 without force; otherwise
+ *         the error returned by the plugin factory or by init().
+ */
+int OSDMonitor::normalize_profile(const string& profilename,
+				  ErasureCodeProfile &profile,
+				  bool force,
+				  ostream *ss)
+{
+  ErasureCodeInterfaceRef erasure_code;
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
+  // Same guard as get_erasure_code(): never dereference the end iterator
+  // when the profile carries no plugin entry.
+  if (plugin == profile.end()) {
+    *ss << "cannot determine the erasure code plugin"
+        << " because there is no 'plugin' entry in the erasure_code_profile "
+        << profile << std::endl;
+    return -EINVAL;
+  }
+  check_legacy_ec_plugin(plugin->second, profilename);
+  int err = instance.factory(plugin->second,
+                             g_conf().get_val<std::string>("erasure_code_dir"),
+                             profile, &erasure_code, ss);
+  if (err) {
+    return err;
+  }
+
+  err = erasure_code->init(profile, ss);
+  if (err) {
+    return err;
+  }
+
+  auto it = profile.find("stripe_unit");
+  if (it != profile.end()) {
+    string err_str;
+    uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
+    if (!err_str.empty()) {
+      *ss << "could not parse stripe_unit '" << it->second
+          << "': " << err_str << std::endl;
+      return -EINVAL;
+    }
+    // the plugin must not need to pad a chunk of exactly stripe_unit bytes
+    uint32_t data_chunks = erasure_code->get_data_chunk_count();
+    uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
+    if (chunk_size != stripe_unit) {
+      *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
+          << "alignment. Would be padded to " << chunk_size
+          << std::endl;
+      return -EINVAL;
+    }
+    if ((stripe_unit % 4096) != 0 && !force) {
+      *ss << "stripe_unit should be a multiple of 4096 bytes for best performance. "
+          << "use --force to override this check" << std::endl;
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Create a CRUSH rule for an erasure coded pool unless one with the same
+ * name already exists.
+ *
+ * @param name    name of the rule
+ * @param profile erasure code profile whose plugin creates the rule
+ * @param rule    out: the rule's ruleset/id
+ * @param ss      stream that receives error text
+ * @return -EEXIST if the rule exists in the committed crush map,
+ *         -EALREADY if it exists only in the pending crush map,
+ *         0 if it was created and staged in pending_inc,
+ *         or a negative error from loading the plugin or creating the rule.
+ */
+int OSDMonitor::crush_rule_create_erasure(const string &name,
+                                          const string &profile,
+                                          int *rule,
+                                          ostream *ss)
+{
+  const int committed_id = osdmap.crush->get_rule_id(name);
+  if (committed_id != -ENOENT) {
+    *rule = osdmap.crush->get_rule_mask_ruleset(committed_id);
+    return -EEXIST;
+  }
+
+  CrushWrapper newcrush;
+  _get_pending_crush(newcrush);
+
+  const int pending_id = newcrush.get_rule_id(name);
+  if (pending_id != -ENOENT) {
+    *rule = newcrush.get_rule_mask_ruleset(pending_id);
+    return -EALREADY;
+  }
+
+  ErasureCodeInterfaceRef erasure_code;
+  int err = get_erasure_code(profile, &erasure_code, ss);
+  if (err) {
+    *ss << "failed to load plugin using profile " << profile << std::endl;
+    return err;
+  }
+
+  // create_rule() returns the new rule id, or a negative error
+  err = erasure_code->create_rule(name, newcrush, ss);
+  erasure_code.reset();
+  if (err < 0) {
+    return err;
+  }
+  *rule = err;
+
+  pending_inc.crush.clear();
+  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  return 0;
+}
+
+/**
+ * Instantiate the erasure code plugin described by a committed profile.
+ *
+ * @param erasure_code_profile name of the profile to load from osdmap
+ * @param erasure_code         out: the plugin instance
+ * @param ss                   stream that receives error text
+ * @return -EAGAIN if pending_inc has an erasure code profile of that name,
+ *         -EINVAL if the profile has no "plugin" entry, otherwise the
+ *         result of the plugin registry's factory().
+ */
+int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
+                                 ErasureCodeInterfaceRef *erasure_code,
+                                 ostream *ss) const
+{
+  if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+    return -EAGAIN;
+  }
+
+  // work on a private copy: factory() takes the profile by reference
+  ErasureCodeProfile profile =
+    osdmap.get_erasure_code_profile(erasure_code_profile);
+
+  ErasureCodeProfile::const_iterator plugin_it = profile.find("plugin");
+  if (plugin_it == profile.end()) {
+    *ss << "cannot determine the erasure code plugin"
+        << " because there is no 'plugin' entry in the erasure_code_profile "
+        << profile << std::endl;
+    return -EINVAL;
+  }
+
+  check_legacy_ec_plugin(plugin_it->second, erasure_code_profile);
+
+  auto& registry = ErasureCodePluginRegistry::instance();
+  return registry.factory(plugin_it->second,
+                          g_conf().get_val<std::string>("erasure_code_dir"),
+                          profile, erasure_code, ss);
+}
+
+/**
+ * Check that the monitor quorum and every up OSD support a feature set.
+ *
+ * @param features required feature bits
+ * @param ss       receives the list of entities lacking the features
+ * @return 0 if everything supports the features;
+ *         -ENOTSUP if the quorum or any up OSD in osdmap does not;
+ *         -EAGAIN if an OSD in pending_inc.new_xinfo does not.
+ */
+int OSDMonitor::check_cluster_features(uint64_t features,
+                                       stringstream &ss)
+{
+  const auto supports = [features](uint64_t have) {
+    return (have & features) == features;
+  };
+
+  stringstream lacking;
+  int lacking_count = 0;
+
+  if (!supports(mon.get_quorum_con_features())) {
+    lacking << "the monitor cluster";
+    ++lacking_count;
+  }
+
+  set<int32_t> up_osds;
+  osdmap.get_up_osds(up_osds);
+  for (const int32_t osd : up_osds) {
+    const osd_xinfo_t &xi = osdmap.get_xinfo(osd);
+    if (supports(xi.features)) {
+      continue;
+    }
+    if (lacking_count > 0) {
+      lacking << ", ";
+    }
+    lacking << "osd." << osd;
+    ++lacking_count;
+  }
+
+  if (lacking_count > 0) {
+    ss << "features " << features << " unsupported by: "
+       << lacking.str();
+    return -ENOTSUP;
+  }
+
+  // check pending osd state, too!
+  for (const auto& pending : pending_inc.new_xinfo) {
+    const osd_xinfo_t &xi = pending.second;
+    if (!supports(xi.features)) {
+      dout(10) << __func__ << " pending osd." << pending.first
+               << " features are insufficient; retry" << dendl;
+      return -EAGAIN;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Check whether a candidate crush map is compatible with the cluster.
+ *
+ * A scratch OSDMap is built from the committed osdmap plus a copy of
+ * pending_inc carrying the candidate crush.  The candidate is rejected if
+ * the resulting map needs a newer client than require_min_compat_client
+ * allows (when that is set), or if check_cluster_features() fails for the
+ * MON and OSD features of the resulting map.
+ *
+ * @param newcrush the candidate crush map
+ * @param ss       receives the reason on rejection
+ * @return true if the candidate is acceptable, false otherwise
+ */
+bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
+                                                 stringstream& ss)
+{
+  OSDMap::Incremental trial_inc = pending_inc;
+  encode(*newcrush, trial_inc.crush, mon.get_quorum_con_features());
+
+  OSDMap trial_map;
+  trial_map.deepish_copy_from(osdmap);
+  trial_map.apply_incremental(trial_inc);
+
+  // client compat
+  const auto& required_client = trial_map.require_min_compat_client;
+  if (required_client != ceph_release_t::unknown) {
+    auto needed_client = trial_map.get_min_compat_client();
+    if (needed_client > required_client) {
+      ss << "new crush map requires client version " << needed_client
+         << " but require_min_compat_client is "
+         << required_client;
+      return false;
+    }
+  }
+
+  // osd compat
+  uint64_t features = trial_map.get_features(CEPH_ENTITY_TYPE_MON, nullptr);
+  features |= trial_map.get_features(CEPH_ENTITY_TYPE_OSD, nullptr);
+
+  stringstream features_ss;
+  if (check_cluster_features(features, features_ss) != 0) {
+    ss << "Could not change CRUSH: " << features_ss.str();
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Report whether any erasure coded pool in a pool set uses a profile.
+ *
+ * For each matching pool, the pool's name as known to osdmap followed by a
+ * space is written to *ss; if the pool id has no name in osdmap only the
+ * space is written.  When at least one pool matched, a trailing explanation
+ * naming the profile is appended.
+ *
+ * @param pools   pools to inspect
+ * @param profile erasure code profile name to look for
+ * @param ss      stream that receives the list of pools using the profile
+ * @return true if at least one erasure coded pool uses the profile
+ */
+bool OSDMonitor::erasure_code_profile_in_use(
+  const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+  const string &profile,
+  ostream *ss)
+{
+  bool found = false;
+  for (const auto& p : pools) {
+    const pg_pool_t& pool = p.second;
+    if (pool.erasure_code_profile == profile && pool.is_erasure()) {
+      // Look the name up with find() rather than operator[]: the latter
+      // would insert an empty entry into osdmap.pool_name for a pool id
+      // that osdmap does not know about.
+      auto name_it = osdmap.pool_name.find(p.first);
+      if (name_it != osdmap.pool_name.end()) {
+        *ss << name_it->second;
+      }
+      *ss << " ";
+      found = true;
+    }
+  }
+  if (found) {
+    *ss << "pool(s) are using the erasure code profile '" << profile << "'";
+  }
+  return found;
+}
+
+/**
+ * Build an erasure code profile map from "key=value" strings, layered on
+ * top of the osd_pool_default_erasure_code_profile defaults.
+ *
+ * An entry without '=' is stored as a key with an empty value.  Keys
+ * starting with "ruleset-" are rejected.  If the user supplied a plugin
+ * that differs from the default one, the defaults are discarded and the
+ * result holds only the user-supplied entries.
+ *
+ * @param erasure_code_profile     user-supplied "key=value" entries
+ * @param erasure_code_profile_map out: the resulting profile
+ * @param ss                       stream that receives error text
+ * @return 0 on success, -EINVAL for a "ruleset-" key, or the non-zero
+ *         result of parsing the default profile
+ */
+int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
+                                           map<string,string> *erasure_code_profile_map,
+                                           ostream *ss)
+{
+  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
+                                    get_json_str_map,
+                                    *ss,
+                                    erasure_code_profile_map,
+                                    true);
+  if (r) {
+    return r;
+  }
+
+  map<string,string>& merged = *erasure_code_profile_map;
+  ceph_assert(merged.count("plugin"));
+  const string default_plugin = merged["plugin"];
+
+  map<string,string> user_map;
+  for (const string& entry : erasure_code_profile) {
+    const size_t eq = entry.find('=');
+    if (eq == string::npos) {
+      // bare key: record it with an empty value
+      user_map[entry] = string();
+      merged[entry] = string();
+      continue;
+    }
+
+    const string key = entry.substr(0, eq);
+    const string value = entry.substr(eq + 1);
+    if (key.find("ruleset-") == 0) {
+      *ss << "property '" << key << "' is no longer supported; try "
+          << "'crush-" << key.substr(8) << "' instead";
+      return -EINVAL;
+    }
+    user_map[key] = value;
+    merged[key] = value;
+  }
+
+  // a different plugin means the default profile's settings do not apply
+  auto user_plugin = user_map.find("plugin");
+  if (user_plugin != user_map.end() && user_plugin->second != default_plugin) {
+    merged = user_map;
+  }
+
+  return 0;
+}
+
+/**
+ * Work out the size and min_size for a pool that is about to be created.
+ *
+ * Replicated pools: a repl_size of 0 means "use the configured default"
+ * (mon_stretch_pool_size in stretch mode, osd_pool_default_size
+ * otherwise).  In stretch mode the size must equal mon_stretch_pool_size
+ * and min_size comes from mon_stretch_pool_min_size; otherwise min_size
+ * comes from get_osd_pool_default_min_size().
+ *
+ * Erasure pools: not allowed in stretch mode.  size is the plugin's chunk
+ * count and min_size is the data chunk count plus
+ * min(1, coding chunk count - 1).
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param repl_size            requested replica count, 0 for the default
+ * @param size                 out: pool size
+ * @param min_size             out: pool min_size
+ * @param ss                   stream that receives error text
+ * @return 0 on success, -EINVAL for a stretch-mode violation or unknown
+ *         pool type, or the error from get_erasure_code()
+ */
+int OSDMonitor::prepare_pool_size(const unsigned pool_type,
+				  const string &erasure_code_profile,
+                                  uint8_t repl_size,
+				  unsigned *size, unsigned *min_size,
+				  ostream *ss)
+{
+  int err = 0;
+  bool set_min_size = false;
+  switch (pool_type) {
+  case pg_pool_t::TYPE_REPLICATED:
+    if (osdmap.stretch_mode_enabled) {
+      if (repl_size == 0)
+        repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+      if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
+        // repl_size is a uint8_t: streaming it directly would emit a raw
+        // character, so widen it to print the number.
+        *ss << "prepare_pool_size: we are in stretch mode but size "
+            << (unsigned)repl_size << " does not match!";
+        return -EINVAL;
+      }
+      *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      set_min_size = true;
+    }
+    if (repl_size == 0) {
+      repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
+    }
+    *size = repl_size;
+    if (!set_min_size)
+      *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
+    break;
+  case pg_pool_t::TYPE_ERASURE:
+    {
+      if (osdmap.stretch_mode_enabled) {
+        *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
+        return -EINVAL;
+      }
+      ErasureCodeInterfaceRef erasure_code;
+      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+      if (err == 0) {
+        *size = erasure_code->get_chunk_count();
+        *min_size =
+          erasure_code->get_data_chunk_count() +
+          std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
+        assert(*min_size <= *size);
+        assert(*min_size >= erasure_code->get_data_chunk_count());
+      }
+    }
+    break;
+  default:
+    *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+    err = -EINVAL;
+    break;
+  }
+  return err;
+}
+
+/**
+ * Compute the stripe width for a pool that is about to be created.
+ *
+ * Replicated pools leave *stripe_width untouched.  For erasure pools the
+ * stripe unit is taken from the profile's "stripe_unit" entry if present
+ * (it is asserted to parse cleanly), otherwise from the
+ * osd_pool_erasure_code_stripe_unit option; the stripe width is then the
+ * data chunk count times the plugin's chunk size for one full stripe.
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param stripe_width         out: stripe width (erasure pools only)
+ * @param ss                   stream that receives error text
+ * @return 0 on success, -EINVAL for an unknown pool type, or the error
+ *         from get_erasure_code()
+ */
+int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
+                                          const string &erasure_code_profile,
+                                          uint32_t *stripe_width,
+                                          ostream *ss)
+{
+  if (pool_type == pg_pool_t::TYPE_REPLICATED) {
+    // ignored
+    return 0;
+  }
+
+  if (pool_type != pg_pool_t::TYPE_ERASURE) {
+    *ss << "prepare_pool_stripe_width: "
+        << pool_type << " is not a known pool type";
+    return -EINVAL;
+  }
+
+  ErasureCodeProfile profile =
+    osdmap.get_erasure_code_profile(erasure_code_profile);
+  ErasureCodeInterfaceRef erasure_code;
+  const int err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+  if (err) {
+    return err;
+  }
+
+  const uint32_t data_chunks = erasure_code->get_data_chunk_count();
+  uint32_t stripe_unit =
+    g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
+  auto unit_it = profile.find("stripe_unit");
+  if (unit_it != profile.end()) {
+    string parse_err;
+    stripe_unit = strict_iecstrtoll(unit_it->second.c_str(), &parse_err);
+    ceph_assert(parse_err.empty());
+  }
+
+  const auto chunk_size =
+    erasure_code->get_chunk_size(stripe_unit * data_chunks);
+  *stripe_width = data_chunks * chunk_size;
+  return 0;
+}
+
+/**
+ * Guess which CRUSH rule is "the" stretch rule.
+ *
+ * The stretch rule is not written down anywhere, so count how often each
+ * crush rule is used by replicated stretch pools and pick the most used
+ * one.  On a tie the lowest rule id wins.
+ *
+ * @return the most used rule id, or -ENOENT if there is no replicated
+ *         stretch pool
+ */
+int OSDMonitor::get_replicated_stretch_crush_rule()
+{
+  map<int,int> usage;  // crush rule id -> number of pools using it
+  for (const auto& entry : osdmap.pools) {
+    const pg_pool_t& pool = entry.second;
+    if (!pool.is_replicated() || !pool.is_stretch_pool()) {
+      continue;
+    }
+    // operator[] starts a new rule at 0, so this yields 1 on first use
+    ++usage[pool.crush_rule];
+  }
+
+  if (usage.empty()) {
+    return -ENOENT;
+  }
+
+  int best_rule = -1;
+  int best_count = 0;
+  for (const auto& candidate : usage) {
+    if (candidate.second > best_count) {
+      best_rule = candidate.first;
+      best_count = candidate.second;
+    }
+  }
+  ceph_assert(best_count > 0);
+  ceph_assert(best_rule >= 0);
+  return best_rule;
+}
+
+/**
+ * Resolve the CRUSH rule a new pool will use.
+ *
+ * If *crush_rule is already non-negative it only has to exist in the
+ * committed crush map.  Otherwise:
+ *  - replicated pools use the named rule if rule_name is non-empty, else
+ *    the guessed stretch rule (stretch mode) or the configured default
+ *    replicated rule;
+ *  - erasure pools create the rule from the profile if necessary.
+ *
+ * @param pool_type            pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
+ * @param erasure_code_profile profile name (erasure pools only)
+ * @param rule_name            rule name, may be empty for replicated pools
+ * @param crush_rule           in/out: rule id; negative means "pick one"
+ * @param ss                   stream that receives error text
+ * @return 0 on success; -ENOENT if no suitable rule exists; -EAGAIN if the
+ *         rule was just created or is still pending and has to be proposed
+ *         first; -EINVAL for an unknown pool type; or another error from
+ *         get_crush_rule() / crush_rule_create_erasure().
+ */
+int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
+                                        const string &erasure_code_profile,
+                                        const string &rule_name,
+                                        int *crush_rule,
+                                        ostream *ss)
+{
+  // caller picked a rule explicitly: just make sure it exists
+  if (*crush_rule >= 0) {
+    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
+      *ss << "CRUSH rule " << *crush_rule << " not found";
+      return -ENOENT;
+    }
+    return 0;
+  }
+
+  if (pool_type == pg_pool_t::TYPE_REPLICATED) {
+    if (rule_name != "") {
+      return get_crush_rule(rule_name, crush_rule, ss);
+    }
+    if (osdmap.stretch_mode_enabled) {
+      *crush_rule = get_replicated_stretch_crush_rule();
+    } else {
+      // Use default rule
+      *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
+    }
+    if (*crush_rule < 0) {
+      // Errors may happen e.g. if no valid rule is available
+      *ss << "No suitable CRUSH rule exists, check "
+          << "'osd pool default crush *' config options";
+      return -ENOENT;
+    }
+    return 0;
+  }
+
+  if (pool_type == pg_pool_t::TYPE_ERASURE) {
+    const int err = crush_rule_create_erasure(rule_name,
+                                              erasure_code_profile,
+                                              crush_rule, ss);
+    if (err == -EALREADY) {
+      dout(20) << "prepare_pool_crush_rule: rule "
+               << rule_name << " try again" << dendl;
+      return -EAGAIN;
+    }
+    if (err == 0) {
+      // need to wait for the crush rule to be proposed before proceeding
+      return -EAGAIN;
+    }
+    if (err == -EEXIST) {
+      // rule is already committed: nothing to wait for
+      return 0;
+    }
+    return err;
+  }
+
+  *ss << "prepare_pool_crush_rule: " << pool_type
+      << " is not a known pool type";
+  return -EINVAL;
+}
+
+/**
+ * Look up a CRUSH rule id by name.
+ *
+ * @param rule_name name of the rule to find
+ * @param crush_rule out: set to the rule id when the committed crush map
+ *        knows the rule; left untouched otherwise
+ * @param ss human readable error message, if any
+ *
+ * @return 0 if the rule is in the committed crush map, -EAGAIN if it is
+ *         only in the pending crush map (retry once it has been proposed),
+ *         -ENOENT if it is in neither.
+ */
+int OSDMonitor::get_crush_rule(const string &rule_name,
+			       int *crush_rule,
+			       ostream *ss)
+{
+  const int committed_id = osdmap.crush->get_rule_id(rule_name);
+  if (committed_id != -ENOENT) {
+    // found it, use it
+    *crush_rule = committed_id;
+    return 0;
+  }
+
+  // Not committed yet; check whether it is waiting in the pending map.
+  CrushWrapper pending_crush;
+  _get_pending_crush(pending_crush);
+
+  const int pending_id = pending_crush.get_rule_id(rule_name);
+  if (pending_id == -ENOENT) {
+    // Cannot find it , return error
+    *ss << "specified rule " << rule_name << " doesn't exist";
+    return pending_id;
+  }
+
+  // found it, wait for it to be proposed
+  dout(20) << __func__ << ": rule " << rule_name
+           << " try again" << dendl;
+  return -EAGAIN;
+}
+
+/**
+ * Check that a pool's pg_num * size keeps the cluster-wide total within
+ * mon_max_pg_per_osd * (number of "in" OSDs, assumed to be at least 3).
+ *
+ * @param pool id of the pool being changed; a negative id means the values
+ *        are counted in addition to every existing pool
+ * @param pg_num pg_num to account for that pool
+ * @param size size to account for that pool
+ * @param ss human readable error message, if any
+ *
+ * @return 0 if the projected total fits, -ERANGE otherwise.
+ */
+int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
+{
+  const auto per_osd_limit = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
+  // assume min cluster size 3
+  const auto osd_count = std::max(osdmap.get_num_in_osds(), 3u);
+  const auto total_limit = per_osd_limit * osd_count;
+
+  uint64_t projected = 0;
+  if (pool < 0) {
+    // no existing pool to substitute for, so count the request on its own
+    projected += pg_num * size;
+  }
+  for (const auto& [pool_id, info] : osdmap.get_pools()) {
+    if (pool_id != pool) {
+      projected += info.get_pg_num_target() * info.get_size();
+    } else {
+      // use the proposed values in place of this pool's current ones
+      projected += pg_num * size;
+    }
+  }
+
+  if (projected <= total_limit) {
+    return 0;
+  }
+
+  if (pool >= 0) {
+    *ss << "pool id " << pool;
+  }
+  *ss << " pg_num " << pg_num << " size " << size
+      << " would mean " << projected
+      << " total pgs, which exceeds max " << total_limit
+      << " (mon_max_pg_per_osd " << per_osd_limit
+      << " * num_in_osds " << osd_count << ")";
+  return -ERANGE;
+}
+
+/**
+ * Stage the creation of a new pool in the pending OSDMap incremental.
+ *
+ * Validates the request, resolves the crush rule, size/min_size and stripe
+ * width, then fills in a new pg_pool_t in pending_inc. If a pool with the
+ * same name is already queued in pending_inc.new_pool_names, returns 0
+ * without queueing another one.
+ *
+ * @param name The name of the new pool; must not be empty
+ * @param crush_rule The crush rule to use. If <0, a rule is selected by
+ *        prepare_pool_crush_rule() from crush_rule_name and the pool type
+ * @param crush_rule_name The crush rule to use, if crush_rule <0
+ * @param pg_num The pg_num to use. If set to 0, will use the system default
+ * @param pgp_num The pgp_num to use. If set to 0, will use the system
+ *        default, falling back to pg_num if that default is also 0
+ * @param pg_num_min min pg_num; stored only if non-zero and
+ *        require_osd_release is nautilus or later
+ * @param pg_num_max max pg_num; stored only if non-zero and
+ *        require_osd_release is pacific or later
+ * @param repl_size Replication factor, or 0 for default
+ * @param target_size_bytes stored as the TARGET_SIZE_BYTES pool option if
+ *        non-zero and require_osd_release is nautilus or later
+ * @param target_size_ratio stored as the TARGET_SIZE_RATIO pool option if
+ *        > 0 and require_osd_release is nautilus or later
+ * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
+ * @param pool_type pg_pool_t::TYPE_ERASURE, or pg_pool_t::TYPE_REPLICATED
+ * @param expected_num_objects expected number of objects on the pool
+ * @param fast_read fast read type (FAST_READ_ON / FAST_READ_OFF /
+ *        FAST_READ_DEFAULT); FAST_READ_ON is rejected for replicated pools
+ * @param pg_autoscale_mode autoscale mode name; if it is not a recognized
+ *        mode, the osd_pool_default_pg_autoscale_mode setting applies
+ * @param bulk set FLAG_BULK on the pool (the flag is also set when
+ *        osd_pool_default_flag_bulk is enabled)
+ * @param ss human readable error message, if any.
+ *
+ * @return 0 on success, negative errno on failure.
+ */
+int OSDMonitor::prepare_new_pool(string& name,
+				 int crush_rule,
+				 const string &crush_rule_name,
+                                 unsigned pg_num, unsigned pgp_num,
+				 unsigned pg_num_min,
+				 unsigned pg_num_max,
+                                 const uint64_t repl_size,
+				 const uint64_t target_size_bytes,
+				 const float target_size_ratio,
+				 const string &erasure_code_profile,
+                                 const unsigned pool_type,
+                                 const uint64_t expected_num_objects,
+                                 FastReadType fast_read,
+				 const string& pg_autoscale_mode,
+				 bool bulk,
+				 ostream *ss)
+{
+  if (name.length() == 0)
+    return -EINVAL;
+  // substitute configured defaults for unspecified (zero) pg counts
+  if (pg_num == 0)
+    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
+  if (pgp_num == 0)
+    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
+  // the pgp_num default may itself be 0; track pg_num in that case
+  if (!pgp_num)
+    pgp_num = pg_num;
+  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+    *ss << "'pg_num' must be greater than 0 and less than or equal to "
+        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+        << " (you may adjust 'mon max pool pg num' for higher values)";
+    return -ERANGE;
+  }
+  if (pgp_num > pg_num) {
+    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
+        << ", which in this case is " << pg_num;
+    return -ERANGE;
+  }
+  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
+    *ss << "'fast_read' can only apply to erasure coding pool";
+    return -EINVAL;
+  }
+  // resolve (or create) the crush rule; crush_rule is updated in place
+  int r;
+  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
+				 crush_rule_name, &crush_rule, ss);
+  if (r) {
+    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
+    return r;
+  }
+  // optionally run the crush tester against the pending crush map with the
+  // selected rule; mon_lease is passed as the argument to test_with_fork()
+  if (g_conf()->mon_osd_crush_smoke_test) {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    ostringstream err;
+    CrushTester tester(newcrush, err);
+    tester.set_min_x(0);
+    tester.set_max_x(50);
+    tester.set_rule(crush_rule);
+    auto start = ceph::coarse_mono_clock::now();
+    r = tester.test_with_fork(g_conf()->mon_lease);
+    auto duration = ceph::coarse_mono_clock::now() - start;
+    if (r < 0) {
+      dout(10) << "tester.test_with_fork returns " << r
+	       << ": " << err.str() << dendl;
+      *ss << "crush test failed with " << r << ": " << err.str();
+      return r;
+    }
+    dout(10) << __func__ << " crush smoke test duration: "
+             << duration << dendl;
+  }
+  unsigned size, min_size;
+  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
+                        &size, &min_size, ss);
+  if (r) {
+    dout(10) << "prepare_pool_size returns " << r << dendl;
+    return r;
+  }
+  // pool id -1: the new pool does not exist yet, so its PGs are counted in
+  // addition to all existing pools
+  r = check_pg_num(-1, pg_num, size, ss);
+  if (r) {
+    dout(10) << "check_pg_num returns " << r << dendl;
+    return r;
+  }
+
+  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
+    return -EINVAL;
+  }
+
+  uint32_t stripe_width = 0;
+  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
+  if (r) {
+    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
+    return r;
+  }
+  
+  // fast_read only applies to erasure pools; replicated pools keep false
+  bool fread = false;
+  if (pool_type == pg_pool_t::TYPE_ERASURE) {
+    switch (fast_read) {
+      case FAST_READ_OFF:
+        fread = false;
+        break;
+      case FAST_READ_ON:
+        fread = true;
+        break;
+      case FAST_READ_DEFAULT:
+        fread = g_conf()->osd_pool_default_ec_fast_read;
+        break;
+      default:
+        *ss << "invalid fast_read setting: " << fast_read;
+        return -EINVAL;
+    }
+  }
+
+  // a pool with this name is already queued in the pending incremental;
+  // report success without queueing it a second time
+  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
+       p != pending_inc.new_pool_names.end();
+       ++p) {
+    if (p->second == name)
+      return 0;
+  }
+
+  // allocate the next pool id; new_pool_max == -1 means it has not been
+  // seeded from the committed map's pool_max yet
+  if (-1 == pending_inc.new_pool_max)
+    pending_inc.new_pool_max = osdmap.pool_max;
+  int64_t pool = ++pending_inc.new_pool_max;
+  pg_pool_t empty;
+  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
+  pi->create_time = ceph_clock_now();
+  pi->type = pool_type;
+  pi->fast_read = fread; 
+  pi->flags = g_conf()->osd_pool_default_flags;
+  // FLAG_BULK is set if requested explicitly or enabled by config default
+  if (bulk) {
+    pi->set_flag(pg_pool_t::FLAG_BULK);
+  } else if (g_conf()->osd_pool_default_flag_bulk) {
+      pi->set_flag(pg_pool_t::FLAG_BULK);
+  }
+  if (g_conf()->osd_pool_default_flag_hashpspool)
+    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+  if (g_conf()->osd_pool_default_flag_nodelete)
+    pi->set_flag(pg_pool_t::FLAG_NODELETE);
+  if (g_conf()->osd_pool_default_flag_nopgchange)
+    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+  if (g_conf()->osd_pool_default_flag_nosizechange)
+    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+  // every new pool starts out flagged as CREATING
+  pi->set_flag(pg_pool_t::FLAG_CREATING);
+  if (g_conf()->osd_pool_use_gmt_hitset)
+    pi->use_gmt_hitset = true;
+  else
+    pi->use_gmt_hitset = false;
+
+  pi->size = size;
+  pi->min_size = min_size;
+  pi->crush_rule = crush_rule;
+  pi->expected_num_objects = expected_num_objects;
+  pi->object_hash = CEPH_STR_HASH_RJENKINS;
+  // in stretch mode, copy the cluster's stretch peering settings onto the
+  // new pool
+  if (osdmap.stretch_mode_enabled) {
+    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
+    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
+    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
+    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+    if (osdmap.degraded_stretch_mode) {
+      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
+      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
+      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
+      // TODO: drat, we don't record this ^ anywhere, though given that it
+      // necessarily won't exist elsewhere it likely doesn't matter
+      pi->min_size = pi->min_size / 2;
+      pi->size = pi->size / 2; // only support 2 zones now
+    }
+  }
+
+  // start from the configured default autoscale mode (OFF if the config
+  // value is not a recognized mode); an explicit, recognized
+  // pg_autoscale_mode argument overrides it further down
+  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
+      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+    pi->pg_autoscale_mode = m;
+  } else {
+    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
+  }
+  // the initial pg_num is capped at mon_osd_max_initial_pgs when that
+  // setting is positive; the full requested values are recorded as the
+  // pg_num/pgp_num targets
+  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
+  pi->set_pg_num(
+    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
+    : pg_num);
+  pi->set_pg_num_pending(pi->get_pg_num());
+  pi->set_pg_num_target(pg_num);
+  pi->set_pgp_num(pi->get_pg_num());
+  pi->set_pgp_num_target(pgp_num);
+  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+      pg_num_min) {
+    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
+  }
+  if (osdmap.require_osd_release >= ceph_release_t::pacific &&
+      pg_num_max) {
+    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
+  }
+  // explicit autoscale mode from the caller wins over the config default
+  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+    pi->pg_autoscale_mode = m;
+  }
+
+  pi->last_change = pending_inc.epoch;
+  pi->auid = 0;
+
+  if (pool_type == pg_pool_t::TYPE_ERASURE) {
+      pi->erasure_code_profile = erasure_code_profile;
+  } else {
+      pi->erasure_code_profile = "";
+  }
+  pi->stripe_width = stripe_width;
+
+  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
+      target_size_bytes) {
+    // only store for nautilus+ because TARGET_SIZE_BYTES may be
+    // larger than int32_t max.
+    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
+  }
+  if (target_size_ratio > 0.0 &&
+      osdmap.require_osd_release >= ceph_release_t::nautilus) {
+    // only store for nautilus+, just to be consistent and tidy.
+    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
+  }
+
+  // cache ratios are stored as integers in millionths ("micro") of the
+  // configured fractional defaults
+  pi->cache_target_dirty_ratio_micro =
+    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
+  pi->cache_target_dirty_high_ratio_micro =
+    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
+  pi->cache_target_full_ratio_micro =
+    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
+  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
+  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
+
+  // record the name last; this is also what the duplicate check above scans
+  pending_inc.new_pool_names[pool] = name;
+  return 0;
+}
+
+/**
+ * Stage setting an OSDMap flag in the pending incremental and queue the
+ * command reply for when the proposal finishes.
+ *
+ * @param op the request being handled
+ * @param flag OSDMap flag bit(s) to set
+ * @return always true
+ */
+bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
+{
+  op->mark_osdmon_event(__func__);
+
+  // A negative new_flags is replaced by the committed map's flags before
+  // the requested bit(s) are OR-ed in.
+  if (pending_inc.new_flags < 0) {
+    pending_inc.new_flags = osdmap.get_flags();
+  }
+  pending_inc.new_flags |= flag;
+
+  ostringstream reply;
+  reply << OSDMap::get_flag_string(flag) << " is set";
+
+  auto *on_finish = new Monitor::C_Command(mon, op, 0, reply.str(),
+                                           get_last_committed() + 1);
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+/**
+ * Stage clearing an OSDMap flag in the pending incremental and queue the
+ * command reply for when the proposal finishes.
+ *
+ * @param op the request being handled
+ * @param flag OSDMap flag bit(s) to clear
+ * @return always true
+ */
+bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
+{
+  op->mark_osdmon_event(__func__);
+
+  // A negative new_flags is replaced by the committed map's flags before
+  // the requested bit(s) are masked out.
+  if (pending_inc.new_flags < 0) {
+    pending_inc.new_flags = osdmap.get_flags();
+  }
+  pending_inc.new_flags &= ~flag;
+
+  ostringstream reply;
+  reply << OSDMap::get_flag_string(flag) << " is unset";
+
+  auto *on_finish = new Monitor::C_Command(mon, op, 0, reply.str(),
+                                           get_last_committed() + 1);
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
+                                         stringstream& ss)
+{
+  string poolstr;
+  cmd_getval(cmdmap, "pool", poolstr);
+  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+  if (pool < 0) {
+    ss << "unrecognized pool '" << poolstr << "'";
+    return -ENOENT;
+  }
+  string var;
+  cmd_getval(cmdmap, "var", var);
+
+  pg_pool_t p = *osdmap.get_pg_pool(pool);
+  if (pending_inc.new_pools.count(pool))
+    p = pending_inc.new_pools[pool];
+
+  // accept val as a json string in the normal case (current
+  // generation monitor).  parse out int or float values from the
+  // string as needed.  however, if it is not a string, try to pull
+  // out an int, in case an older monitor with an older json schema is
+  // forwarding a request.
+  string val;
+  string interr, floaterr;
+  int64_t n = 0;
+  double f = 0;
+  int64_t uf = 0;  // micro-f
+  cmd_getval(cmdmap, "val", val);
+
+  auto si_options = {
+    "target_max_objects"
+  };
+  auto iec_options = {
+    "target_max_bytes",
+    "target_size_bytes",
+    "compression_max_blob_size",
+    "compression_min_blob_size",
+    "csum_max_block",
+    "csum_min_block",
+  };
+  if (count(begin(si_options), end(si_options), var)) {
+    n = strict_si_cast<int64_t>(val.c_str(), &interr);
+  } else if (count(begin(iec_options), end(iec_options), var)) {
+    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
+  } else {
+    // parse string as both int and float; different fields use different types.
+    n = strict_strtoll(val.c_str(), 10, &interr);
+    f = strict_strtod(val.c_str(), &floaterr);
+    uf = llrintl(f * (double)1000000.0);
+  }
+
+  if (!p.is_tier() &&
+      (var == "hit_set_type" || var == "hit_set_period" ||
+       var == "hit_set_count" || var == "hit_set_fpp" ||
+       var == "target_max_objects" || var == "target_max_bytes" ||
+       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
+       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
+       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
+       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
+    return -EACCES;
+  }
+
+  if (var == "size") {
+    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
+      return -EPERM;
+    }
+    if (p.type == pg_pool_t::TYPE_ERASURE) {
+      ss << "can not change the size of an erasure-coded pool";
+      return -ENOTSUP;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n <= 0 || n > 10) {
+      ss << "pool size must be between 1 and 10";
+      return -EINVAL;
+    }
+    if (n == 1) {
+      if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
+	ss << "configuring pool size as 1 is disabled by default.";
+	return -EPERM;
+      }
+      bool sure = false;
+      cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+      if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
+	"without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
+	  "pass the flag --yes-i-really-mean-it.";
+	return -EPERM;
+      }
+    }
+    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
+      return -EINVAL;
+    }
+    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
+    if (r < 0) {
+      return r;
+    }
+    p.size = n;
+    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
+  } else if (var == "min_size") {
+    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
+      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
+      return -EPERM;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+
+    if (p.type != pg_pool_t::TYPE_ERASURE) {
+      if (n < 1 || n > p.size) {
+	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
+	return -EINVAL;
+      }
+    } else {
+       ErasureCodeInterfaceRef erasure_code;
+       int k;
+       stringstream tmp;
+       int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
+       if (err == 0) {
+	 k = erasure_code->get_data_chunk_count();
+       } else {
+	 ss << __func__ << " get_erasure_code failed: " << tmp.str();
+	 return err;
+       }
+
+       if (n < k || n > p.size) {
+	 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
+	 return -EINVAL;
+       }
+    }
+    p.min_size = n;
+  } else if (var == "pg_num_actual") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n == (int)p.get_pg_num()) {
+      return 0;
+    }
+    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+      ss << "'pg_num' must be greater than 0 and less than or equal to "
+         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+         << " (you may adjust 'mon max pool pg num' for higher values)";
+      return -ERANGE;
+    }
+    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
+      ss << "cannot adjust pg_num while initial PGs are being created";
+      return -EBUSY;
+    }
+    if (n > (int)p.get_pg_num()) {
+      if (p.get_pg_num() != p.get_pg_num_pending()) {
+	// force pre-nautilus clients to resend their ops, since they
+	// don't understand pg_num_pending changes form a new interval
+	p.last_force_op_resend_prenautilus = pending_inc.epoch;
+      }
+      p.set_pg_num(n);
+    } else {
+      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+	ss << "nautilus OSDs are required to adjust pg_num_pending";
+	return -EPERM;
+      }
+      if (n < (int)p.get_pgp_num()) {
+	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
+	return -EINVAL;
+      }
+      if (n < (int)p.get_pg_num() - 1) {
+	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
+	   << ") - 1; only single pg decrease is currently supported";
+	return -EINVAL;
+      }
+      p.set_pg_num_pending(n);
+      // force pre-nautilus clients to resend their ops, since they
+      // don't understand pg_num_pending changes form a new interval
+      p.last_force_op_resend_prenautilus = pending_inc.epoch;
+    }
+    // force pre-luminous clients to resend their ops, since they
+    // don't understand that split PGs now form a new interval.
+    p.last_force_op_resend_preluminous = pending_inc.epoch;
+  } else if (var == "pg_num") {
+    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
+      return -EPERM;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n == (int)p.get_pg_num_target()) {
+      return 0;
+    }
+    if (n <= 0 || static_cast<uint64_t>(n) >
+                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+      ss << "'pg_num' must be greater than 0 and less than or equal to "
+         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+         << " (you may adjust 'mon max pool pg num' for higher values)";
+      return -ERANGE;
+    }
+    if (n > (int)p.get_pg_num_target()) {
+      int r = check_pg_num(pool, n, p.get_size(), &ss);
+      if (r) {
+	return r;
+      }
+      bool force = false;
+      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
+	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
+	return -EPERM;
+      }
+    } else {
+      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+	ss << "nautilus OSDs are required to decrease pg_num";
+	return -EPERM;
+      }
+    }
+    int64_t pg_min = 0, pg_max = 0;
+    p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
+    p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
+    if (pg_min && n < pg_min) {
+      ss << "specified pg_num " << n
+	 << " < pg_num_min " << pg_min;
+      return -EINVAL;
+    }
+    if (pg_max && n > pg_max) {
+      ss << "specified pg_num " << n
+	 << " < pg_num_max " << pg_max;
+      return -EINVAL;
+    }
+    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+      // pre-nautilus osdmap format; increase pg_num directly
+      assert(n > (int)p.get_pg_num());
+      // force pre-nautilus clients to resend their ops, since they
+      // don't understand pg_num_target changes form a new interval
+      p.last_force_op_resend_prenautilus = pending_inc.epoch;
+      // force pre-luminous clients to resend their ops, since they
+      // don't understand that split PGs now form a new interval.
+      p.last_force_op_resend_preluminous = pending_inc.epoch;
+      p.set_pg_num(n);
+    } else {
+      // set targets; mgr will adjust pg_num_actual and pgp_num later.
+      // make pgp_num track pg_num if it already matches.  if it is set
+      // differently, leave it different and let the user control it
+      // manually.
+      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
+	p.set_pgp_num_target(n);
+      }
+      p.set_pg_num_target(n);
+    }
+  } else if (var == "pgp_num_actual") {
+    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+      return -EPERM;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n <= 0) {
+      ss << "specified pgp_num must > 0, but you set to " << n;
+      return -EINVAL;
+    }
+    if (n > (int)p.get_pg_num()) {
+      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
+      return -EINVAL;
+    }
+    if (n > (int)p.get_pg_num_pending()) {
+      ss << "specified pgp_num " << n
+	 << " > pg_num_pending " << p.get_pg_num_pending();
+      return -EINVAL;
+    }
+    p.set_pgp_num(n);
+  } else if (var == "pgp_num") {
+    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
+      return -EPERM;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n <= 0) {
+      ss << "specified pgp_num must > 0, but you set to " << n;
+      return -EINVAL;
+    }
+    if (n > (int)p.get_pg_num_target()) {
+      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
+      return -EINVAL;
+    }
+    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+      // pre-nautilus osdmap format; increase pgp_num directly
+      p.set_pgp_num(n);
+    } else {
+      p.set_pgp_num_target(n);
+    }
+  } else if (var == "pg_autoscale_mode") {
+    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
+    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+      ss << "specified invalid mode " << val;
+      return -EINVAL;
+    }
+    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
+      return -EINVAL;
+    }
+    p.pg_autoscale_mode = m;
+  } else if (var == "crush_rule") {
+    int id = osdmap.crush->get_rule_id(val);
+    if (id == -ENOENT) {
+      ss << "crush rule " << val << " does not exist";
+      return -ENOENT;
+    }
+    if (id < 0) {
+      ss << cpp_strerror(id);
+      return -ENOENT;
+    }
+    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
+      return -EINVAL;
+    }
+    p.crush_rule = id;
+  } else if (var == "nodelete" || var == "nopgchange" ||
+	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
+	     var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
+    uint64_t flag = pg_pool_t::get_flag_by_name(var);
+    // make sure we only compare against 'n' if we didn't receive a string
+    if (val == "true" || (interr.empty() && n == 1)) {
+      p.set_flag(flag);
+    } else if (val == "false" || (interr.empty() && n == 0)) {
+      p.unset_flag(flag);
+    } else {
+      ss << "expecting value 'true', 'false', '0', or '1'";
+      return -EINVAL;
+    }
+  } else if (var == "hashpspool") {
+    uint64_t flag = pg_pool_t::get_flag_by_name(var);
+    bool force = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+    if (!force) {
+      ss << "are you SURE?  this will remap all placement groups in this pool,"
+	    " this triggers large data movement,"
+	    " pass --yes-i-really-mean-it if you really do.";
+      return -EPERM;
+    }
+    // make sure we only compare against 'n' if we didn't receive a string
+    if (val == "true" || (interr.empty() && n == 1)) {
+      p.set_flag(flag);
+    } else if (val == "false" || (interr.empty() && n == 0)) {
+      p.unset_flag(flag);
+    } else {
+      ss << "expecting value 'true', 'false', '0', or '1'";
+      return -EINVAL;
+    }
+  } else if (var == "hit_set_type") {
+    if (val == "none")
+      p.hit_set_params = HitSet::Params();
+    else {
+      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+      if (err)
+	return err;
+      if (val == "bloom") {
+	BloomHitSet::Params *bsp = new BloomHitSet::Params;
+	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+	p.hit_set_params = HitSet::Params(bsp);
+      } else if (val == "explicit_hash")
+	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
+      else if (val == "explicit_object")
+	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
+      else {
+	ss << "unrecognized hit_set type '" << val << "'";
+	return -EINVAL;
+      }
+    }
+  } else if (var == "hit_set_period") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    } else if (n < 0) {
+      ss << "hit_set_period should be non-negative";
+      return -EINVAL;
+    }
+    p.hit_set_period = n;
+  } else if (var == "hit_set_count") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    } else if (n < 0) {
+      ss << "hit_set_count should be non-negative";
+      return -EINVAL;
+    }
+    p.hit_set_count = n;
+  } else if (var == "hit_set_fpp") {
+    if (floaterr.length()) {
+      ss << "error parsing floating point value '" << val << "': " << floaterr;
+      return -EINVAL;
+    } else if (f < 0 || f > 1.0) {
+      ss << "hit_set_fpp should be in the range 0..1";
+      return -EINVAL;
+    }
+    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
+      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
+      return -EINVAL;
+    }
+    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
+    bloomp->set_fpp(f);
+  } else if (var == "use_gmt_hitset") {
+    if (val == "true" || (interr.empty() && n == 1)) {
+      p.use_gmt_hitset = true;
+    } else {
+      ss << "expecting value 'true' or '1'";
+      return -EINVAL;
+    }
+  } else if (var == "allow_ec_overwrites") {
+    if (!p.is_erasure()) {
+      ss << "ec overwrites can only be enabled for an erasure coded pool";
+      return -EINVAL;
+    }
+    stringstream err;
+    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
+	!is_pool_currently_all_bluestore(pool, p, &err)) {
+      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
+      return -EINVAL;
+    }
+    if (val == "true" || (interr.empty() && n == 1)) {
+	p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
+    } else if (val == "false" || (interr.empty() && n == 0)) {
+      ss << "ec overwrites cannot be disabled once enabled";
+      return -EINVAL;
+    } else {
+      ss << "expecting value 'true', 'false', '0', or '1'";
+      return -EINVAL;
+    }
+  } else if (var == "target_max_objects") {
+    if (interr.length()) {
+      ss << "error parsing int '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.target_max_objects = n;
+  } else if (var == "target_max_bytes") {
+    if (interr.length()) {
+      ss << "error parsing int '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.target_max_bytes = n;
+  } else if (var == "cache_target_dirty_ratio") {
+    if (floaterr.length()) {
+      ss << "error parsing float '" << val << "': " << floaterr;
+      return -EINVAL;
+    }
+    if (f < 0 || f > 1.0) {
+      ss << "value must be in the range 0..1";
+      return -ERANGE;
+    }
+    p.cache_target_dirty_ratio_micro = uf;
+  } else if (var == "cache_target_dirty_high_ratio") {
+    if (floaterr.length()) {
+      ss << "error parsing float '" << val << "': " << floaterr;
+      return -EINVAL;
+    }
+    if (f < 0 || f > 1.0) {
+      ss << "value must be in the range 0..1";
+      return -ERANGE;
+    }
+    p.cache_target_dirty_high_ratio_micro = uf;
+  } else if (var == "cache_target_full_ratio") {
+    if (floaterr.length()) {
+      ss << "error parsing float '" << val << "': " << floaterr;
+      return -EINVAL;
+    }
+    if (f < 0 || f > 1.0) {
+      ss << "value must be in the range 0..1";
+      return -ERANGE;
+    }
+    p.cache_target_full_ratio_micro = uf;
+  } else if (var == "cache_min_flush_age") {
+    if (interr.length()) {
+      ss << "error parsing int '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.cache_min_flush_age = n;
+  } else if (var == "cache_min_evict_age") {
+    if (interr.length()) {
+      ss << "error parsing int '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.cache_min_evict_age = n;
+  } else if (var == "min_read_recency_for_promote") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.min_read_recency_for_promote = n;
+  } else if (var == "hit_set_grade_decay_rate") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n > 100 || n < 0) {
+      ss << "value out of range,valid range is 0 - 100";
+      return -EINVAL;
+    }
+    p.hit_set_grade_decay_rate = n;
+  } else if (var == "hit_set_search_last_n") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n > p.hit_set_count || n < 0) {
+      ss << "value out of range,valid range is 0 - hit_set_count";
+      return -EINVAL;
+    }
+    p.hit_set_search_last_n = n;
+  } else if (var == "min_write_recency_for_promote") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    p.min_write_recency_for_promote = n;
+  } else if (var == "fast_read") {
+    if (p.is_replicated()) {
+        ss << "fast read is not supported in replication pool";
+        return -EINVAL;
+    }
+    if (val == "true" || (interr.empty() && n == 1)) {
+      p.fast_read = true;
+    } else if (val == "false" || (interr.empty() && n == 0)) {
+      p.fast_read = false;
+    } else {
+      ss << "expecting value 'true', 'false', '0', or '1'";
+      return -EINVAL;
+    }
+  } else if (pool_opts_t::is_opt_name(var)) {
+    bool unset = val == "unset";
+    if (var == "compression_mode") {
+      if (!unset) {
+        auto cmode = Compressor::get_comp_mode_type(val);
+        if (!cmode) {
+	  ss << "unrecognized compression mode '" << val << "'";
+	  return -EINVAL;
+        }
+      }
+    } else if (var == "compression_algorithm") {
+      if (!unset) {
+        auto alg = Compressor::get_comp_alg_type(val);
+        if (!alg) {
+          ss << "unrecognized compression_algorithm '" << val << "'";
+	  return -EINVAL;
+        }
+      }
+    } else if (var == "compression_required_ratio") {
+      if (floaterr.length()) {
+        ss << "error parsing float value '" << val << "': " << floaterr;
+        return -EINVAL;
+      }
+      if (f < 0 || f > 1) {
+        ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
+	return -EINVAL;
+      }
+    } else if (var == "csum_type") {
+      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
+      if (t < 0 ) {
+        ss << "unrecognized csum_type '" << val << "'";
+	return -EINVAL;
+      }
+      //preserve csum_type numeric value
+      n = t;
+      interr.clear(); 
+    } else if (var == "compression_max_blob_size" ||
+               var == "compression_min_blob_size" ||
+               var == "csum_max_block" ||
+               var == "csum_min_block") {
+      if (interr.length()) {
+        ss << "error parsing int value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+    } else if (var == "fingerprint_algorithm") {
+      if (!unset) {
+        auto alg = pg_pool_t::get_fingerprint_from_str(val);
+        if (!alg) {
+          ss << "unrecognized fingerprint_algorithm '" << val << "'";
+	  return -EINVAL;
+        }
+      }
+    } else if (var == "target_size_bytes") {
+      if (interr.length()) {
+        ss << "error parsing unit value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+        ss << "must set require_osd_release to nautilus or "
+           << "later before setting target_size_bytes";
+        return -EINVAL;
+      }
+    } else if (var == "target_size_ratio") {
+      if (f < 0.0) {
+	ss << "target_size_ratio cannot be negative";
+	return -EINVAL;
+      }
+    } else if (var == "pg_num_min") {
+      if (interr.length()) {
+        ss << "error parsing int value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+      if (n > (int)p.get_pg_num_target()) {
+	ss << "specified pg_num_min " << n
+	   << " > pg_num " << p.get_pg_num_target();
+	return -EINVAL;
+      }
+    } else if (var == "pg_num_max") {
+      if (interr.length()) {
+        ss << "error parsing int value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+      if (n && n < (int)p.get_pg_num_target()) {
+	ss << "specified pg_num_max " << n
+	   << " < pg_num " << p.get_pg_num_target();
+	return -EINVAL;
+      }
+    } else if (var == "recovery_priority") {
+      if (interr.length()) {
+        ss << "error parsing int value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+      if (!g_conf()->debug_allow_any_pool_priority) {
+        if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
+          ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
+	     << " and " << OSD_POOL_PRIORITY_MAX;
+          return -EINVAL;
+        }
+      }
+    } else if (var == "pg_autoscale_bias") {
+      if (f < 0.0 || f > 1000.0) {
+	ss << "pg_autoscale_bias must be between 0 and 1000";
+	return -EINVAL;
+      }
+    } else if (var == "dedup_tier") {
+      if (interr.empty()) {
+	ss << "expecting value 'pool name'";
+	return -EINVAL;
+      }
+      // Current base tier in dedup does not support ec pool 
+      if (p.is_erasure()) {
+	ss << "pool '" << poolstr
+	   << "' is an ec pool, which cannot be a base tier";
+	return -ENOTSUP;
+      }
+      int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
+      if (lowtierpool_id < 0) {
+	ss << "unrecognized pool '" << val << "'";
+	return -ENOENT;
+      }
+      const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
+      ceph_assert(tp);
+      n = lowtierpool_id;
+      // The original input is string (pool name), but we convert it to int64_t.
+      // So, clear interr
+      interr.clear();
+    } else if (var == "dedup_chunk_algorithm") {
+      if (!unset) {
+        auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
+        if (!alg) {
+          ss << "unrecognized fingerprint_algorithm '" << val << "'";
+	  return -EINVAL;
+        }
+      }
+    } else if (var == "dedup_cdc_chunk_size") {
+      if (interr.length()) {
+        ss << "error parsing int value '" << val << "': " << interr;
+        return -EINVAL;
+      }
+    }
+
+    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
+    switch (desc.type) {
+    case pool_opts_t::STR:
+      if (unset) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<std::string>(val));
+      }
+      break;
+    case pool_opts_t::INT:
+      if (interr.length()) {
+	ss << "error parsing integer value '" << val << "': " << interr;
+	return -EINVAL;
+      }
+      if (n == 0) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<int64_t>(n));
+      }
+      break;
+    case pool_opts_t::DOUBLE:
+      if (floaterr.length()) {
+	ss << "error parsing floating point value '" << val << "': " << floaterr;
+	return -EINVAL;
+      }
+      if (f == 0) {
+	p.opts.unset(desc.key);
+      } else {
+	p.opts.set(desc.key, static_cast<double>(f));
+      }
+      break;
+    default:
+      ceph_assert(!"unknown type");
+    }
+  } else {
+    ss << "unrecognized variable '" << var << "'";
+    return -EINVAL;
+  }
+  if (val != "unset") {
+    ss << "set pool " << pool << " " << var << " to " << val;
+  } else {
+    ss << "unset pool " << pool << " " << var;
+  }
+  p.last_change = pending_inc.epoch;
+  pending_inc.new_pools[pool] = p;
+  return 0;
+}
+
+/**
+ * Prepare-phase entry point for the pool application tag commands.
+ *
+ * Forwards to _command_pool_application() in preparing mode; no
+ * `modified` out-parameter is supplied.
+ */
+int OSDMonitor::prepare_command_pool_application(const string &prefix,
+                                                 const cmdmap_t& cmdmap,
+                                                 stringstream& ss)
+{
+  bool *const no_modified_flag = nullptr;
+  const bool preparing = true;
+  return _command_pool_application(prefix, cmdmap, ss, no_modified_flag,
+                                   preparing);
+}
+
+/**
+ * Preprocess-phase entry point for the pool application tag commands.
+ *
+ * Forwards to _command_pool_application() in non-preparing mode, passing
+ * `modified` through so the callee can report whether the command would
+ * change anything.
+ */
+int OSDMonitor::preprocess_command_pool_application(const string &prefix,
+                                                    const cmdmap_t& cmdmap,
+                                                    stringstream& ss,
+                                                    bool *modified)
+{
+  const bool preparing = false;
+  return _command_pool_application(prefix, cmdmap, ss, modified, preparing);
+}
+
+
+/**
+ * Common logic for preprocess and prepare phases of pool application
+ * tag commands.  In preprocess mode we're only detecting invalid
+ * commands, and determining whether it was a modification or a no-op.
+ * In prepare mode we're actually updating the pending state.
+ *
+ * @param prefix    command prefix; its suffix ("enable", "disable", "set"
+ *                  or "rm") selects the operation, any other suffix aborts
+ * @param cmdmap    parsed command arguments ("pool", "app", "key", "value",
+ *                  "yes_i_really_mean_it")
+ * @param ss        receives the status or error message
+ * @param modified  if non-null, set to true when the command was valid and
+ *                  not a no-op
+ * @param preparing true to stage the modified pool in pending_inc
+ * @return 0 on success or on an idempotent no-op, negative errno otherwise
+ */
+int OSDMonitor::_command_pool_application(const string &prefix,
+                                          const cmdmap_t& cmdmap,
+                                          stringstream& ss,
+                                          bool *modified,
+                                          bool preparing)
+{
+  string pool_name;
+  cmd_getval(cmdmap, "pool", pool_name);
+  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
+  if (pool < 0) {
+    ss << "unrecognized pool '" << pool_name << "'";
+    return -ENOENT;
+  }
+
+  // Work on a copy of the pool; when preparing, start from the copy that
+  // already has a pending update (if any) so that update is not discarded.
+  pg_pool_t p = *osdmap.get_pg_pool(pool);
+  if (preparing) {
+    if (pending_inc.new_pools.count(pool)) {
+      p = pending_inc.new_pools[pool];
+    }
+  }
+
+  string app;
+  cmd_getval(cmdmap, "app", app);
+  bool app_exists = (p.application_metadata.count(app) > 0);
+
+  // "key" and "value" are parsed once here and reused by the "set" and
+  // "rm" branches below.
+  string key;
+  cmd_getval(cmdmap, "key", key);
+  if (key == "all") {
+    ss << "key cannot be 'all'";
+    return -EINVAL;
+  }
+
+  string value;
+  cmd_getval(cmdmap, "value", value);
+  if (value == "all") {
+    ss << "value cannot be 'all'";
+    return -EINVAL;
+  }
+
+  if (boost::algorithm::ends_with(prefix, "enable")) {
+    if (app.empty()) {
+      ss << "application name must be provided";
+      return -EINVAL;
+    }
+
+    if (p.is_tier()) {
+      ss << "application must be enabled on base tier";
+      return -EINVAL;
+    }
+
+    bool force = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+    // enabling an additional application requires explicit confirmation
+    if (!app_exists && !p.application_metadata.empty() && !force) {
+      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
+         << "application; pass --yes-i-really-mean-it to proceed anyway";
+      return -EPERM;
+    }
+
+    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
+      ss << "too many enabled applications on pool '" << pool_name << "'; "
+         << "max " << MAX_POOL_APPLICATIONS;
+      return -EINVAL;
+    }
+
+    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
+      ss << "application name '" << app << "' too long; max length "
+         << MAX_POOL_APPLICATION_LENGTH;
+      return -EINVAL;
+    }
+
+    if (!app_exists) {
+      p.application_metadata[app] = {};
+    }
+    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
+
+  } else if (boost::algorithm::ends_with(prefix, "disable")) {
+    bool force = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+    if (!force) {
+      ss << "Are you SURE? Disabling an application within a pool might result "
+         << "in loss of application functionality; pass "
+         << "--yes-i-really-mean-it to proceed anyway";
+      return -EPERM;
+    }
+
+    if (!app_exists) {
+      ss << "application '" << app << "' is not enabled on pool '" << pool_name
+         << "'";
+      return 0; // idempotent
+    }
+
+    p.application_metadata.erase(app);
+    ss << "disable application '" << app << "' on pool '" << pool_name << "'";
+
+  } else if (boost::algorithm::ends_with(prefix, "set")) {
+    if (p.is_tier()) {
+      ss << "application metadata must be set on base tier";
+      return -EINVAL;
+    }
+
+    if (!app_exists) {
+      ss << "application '" << app << "' is not enabled on pool '" << pool_name
+         << "'";
+      return -ENOENT;
+    }
+
+    if (key.empty()) {
+      ss << "key must be provided";
+      return -EINVAL;
+    }
+
+    // app_exists holds, so operator[] does not insert a new application
+    auto &app_keys = p.application_metadata[app];
+    if (app_keys.count(key) == 0 &&
+        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
+      ss << "too many keys set for application '" << app << "' on pool '"
+         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
+      return -EINVAL;
+    }
+
+    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
+      // report the offending key, not the application name
+      ss << "key '" << key << "' too long; max length "
+         << MAX_POOL_APPLICATION_LENGTH;
+      return -EINVAL;
+    }
+
+    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
+      ss << "value '" << value << "' too long; max length "
+         << MAX_POOL_APPLICATION_LENGTH;
+      return -EINVAL;
+    }
+
+    app_keys[key] = value;
+    ss << "set application '" << app << "' key '" << key << "' to '"
+       << value << "' on pool '" << pool_name << "'";
+  } else if (boost::algorithm::ends_with(prefix, "rm")) {
+    if (!app_exists) {
+      ss << "application '" << app << "' is not enabled on pool '" << pool_name
+         << "'";
+      return -ENOENT;
+    }
+
+    // app_exists holds, so operator[] does not insert a new application
+    auto &app_keys = p.application_metadata[app];
+    auto it = app_keys.find(key);
+    if (it == app_keys.end()) {
+      ss << "application '" << app << "' on pool '" << pool_name
+         << "' does not have key '" << key << "'";
+      return 0; // idempotent
+    }
+
+    app_keys.erase(it);
+    ss << "removed application '" << app << "' key '" << key << "' on pool '"
+       << pool_name << "'";
+  } else {
+    ceph_abort();
+  }
+
+  if (preparing) {
+    p.last_change = pending_inc.epoch;
+    pending_inc.new_pools[pool] = p;
+  }
+
+  // Because we fell through this far, we didn't hit no-op cases,
+  // so pool was definitely modified
+  if (modified != nullptr) {
+    *modified = true;
+  }
+
+  return 0;
+}
+
+/**
+ * Remove (or only unlink) item `id` from `newcrush`.
+ *
+ * When `has_ancestor` is set the ancestor-scoped remove_item_under() is
+ * used with `ancestor`; otherwise remove_item() is called.  The result of
+ * whichever call is made is returned unchanged.
+ */
+int OSDMonitor::_prepare_command_osd_crush_remove(
+    CrushWrapper &newcrush,
+    int32_t id,
+    int32_t ancestor,
+    bool has_ancestor,
+    bool unlink_only)
+{
+  if (!has_ancestor) {
+    return newcrush.remove_item(cct, id, unlink_only);
+  }
+  return newcrush.remove_item_under(cct, id, ancestor, unlink_only);
+}
+
+/**
+ * Replace the pending crush map with an encoding of `newcrush`, using the
+ * feature set reported by mon.get_quorum_con_features().
+ */
+void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
+{
+  auto& pending_crush = pending_inc.crush;
+  // drop whatever was staged before so the buffer holds only this encoding
+  pending_crush.clear();
+  newcrush.encode(pending_crush, mon.get_quorum_con_features());
+}
+
+/**
+ * Remove item `id` from `newcrush` and, on success, stage the resulting
+ * crush map in the pending incremental.
+ *
+ * @return 0 on success, or the negative error returned by
+ *         _prepare_command_osd_crush_remove() (nothing is staged then)
+ */
+int OSDMonitor::prepare_command_osd_crush_remove(
+    CrushWrapper &newcrush,
+    int32_t id,
+    int32_t ancestor,
+    bool has_ancestor,
+    bool unlink_only)
+{
+  const int r = _prepare_command_osd_crush_remove(newcrush, id, ancestor,
+                                                  has_ancestor, unlink_only);
+  if (r < 0) {
+    return r;
+  }
+
+  // anything other than a negative error must be exactly zero
+  ceph_assert(r == 0);
+  do_osd_crush_remove(newcrush);
+  return 0;
+}
+
+/**
+ * Stage the removal of osd `id`.
+ *
+ * @return -EBUSY if the osd is currently up (nothing is staged),
+ *         0 otherwise
+ */
+int OSDMonitor::prepare_command_osd_remove(int32_t id)
+{
+  const bool still_up = osdmap.is_up(id);
+  if (still_up) {
+    return -EBUSY;
+  }
+
+  // forget any metadata we hold (or were about to store) for this osd
+  pending_metadata_rm.insert(id);
+  pending_metadata.erase(id);
+
+  // NOTE(review): new_state is presumably applied as a toggle mask, so
+  // staging the osd's current state would clear every flag -- confirm
+  // against OSDMap::Incremental.
+  pending_inc.new_state[id] = osdmap.get_state(id);
+  pending_inc.new_uuid[id] = uuid_d();
+
+  return 0;
+}
+
+/**
+ * Find an id for a new osd.
+ *
+ * Ids below the current max_osd are scanned first: the lowest one that
+ * does not exist in the osdmap, has no pending new_up_client entry and is
+ * not flagged CEPH_OSD_EXISTS in the pending state is reported through
+ * `*existing_id`, and -1 is returned.
+ *
+ * If no such id is found, `*existing_id` is left at -1 and the returned
+ * value is the pending new_max_osd if one is staged, else the osdmap's
+ * current max_osd.
+ */
+int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
+{
+  ceph_assert(existing_id);
+  *existing_id = -1;
+
+  for (int32_t candidate = 0; candidate < osdmap.get_max_osd(); ++candidate) {
+    if (osdmap.exists(candidate)) {
+      continue;
+    }
+    if (pending_inc.new_up_client.count(candidate) != 0) {
+      continue;
+    }
+    auto staged = pending_inc.new_state.find(candidate);
+    if (staged != pending_inc.new_state.end() &&
+        (staged->second & CEPH_OSD_EXISTS) != 0) {
+      continue;
+    }
+    // free slot below max_osd
+    *existing_id = candidate;
+    return -1;
+  }
+
+  // no reusable slot; hand out the next id past the (pending) max
+  return (pending_inc.new_max_osd < 0) ? osdmap.get_max_osd()
+                                       : pending_inc.new_max_osd;
+}
+
+/**
+ * Stage the creation of an osd in the pending incremental.
+ *
+ * The id that ends up being used is written to `*new_id`:
+ *  - if `uuid` is non-zero and already identifies an osd in the osdmap,
+ *    that osd's id is reused (asserting it matches `id` when `id >= 0`);
+ *  - else if `uuid` is non-zero and `id >= 0`, `id` is used as is;
+ *  - otherwise (including a zero `uuid` with `id >= 0`, in which case `id`
+ *    is ignored) an id is obtained from _allocate_osd_id().
+ *
+ * If `device_class` is non-empty, the pending crush map is updated to set
+ * that class on the osd; a failure there is logged and ignored.
+ *
+ * @param id            requested osd id, or negative for "none"
+ * @param uuid          osd uuid; may be zero
+ * @param device_class  crush device class to assign; may be empty
+ * @param new_id        out: the id that was chosen; must not be null
+ */
+void OSDMonitor::do_osd_create(
+    const int32_t id,
+    const uuid_d& uuid,
+    const string& device_class,
+    int32_t* new_id)
+{
+  dout(10) << __func__ << " uuid " << uuid << dendl;
+  ceph_assert(new_id);
+
+  // We presume validation has been performed prior to calling this
+  // function. We assert with prejudice.
+
+  int32_t allocated_id = -1; // declare here so we can jump
+  int32_t existing_id = -1;
+  if (!uuid.is_zero()) {
+    existing_id = osdmap.identify_osd(uuid);
+    if (existing_id >= 0) {
+      // the uuid is already known; the caller's id (if any) must agree
+      ceph_assert(id < 0 || id == existing_id);
+      *new_id = existing_id;
+      goto out;
+    } else if (id >= 0) {
+      // uuid does not exist, and id has been provided, so just create
+      // the new osd.id
+      *new_id = id;
+      goto out;
+    }
+  }
+
+  // allocate a new id
+  allocated_id = _allocate_osd_id(&existing_id);
+  dout(10) << __func__ << " allocated id " << allocated_id
+           << " existing id " << existing_id << dendl;
+  // exactly one of existing_id / allocated_id is expected to be >= 0
+  if (existing_id >= 0) {
+    ceph_assert(existing_id < osdmap.get_max_osd());
+    ceph_assert(allocated_id < 0);
+    *new_id = existing_id;
+  } else if (allocated_id >= 0) {
+    ceph_assert(existing_id < 0);
+    // raise max_osd
+    if (pending_inc.new_max_osd < 0) {
+      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
+    } else {
+      ++pending_inc.new_max_osd;
+    }
+    *new_id = pending_inc.new_max_osd - 1;
+    ceph_assert(*new_id == allocated_id);
+  } else {
+    ceph_abort_msg("unexpected condition");
+  }
+
+out:
+  if (device_class.size()) {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    // make sure the crush map is large enough to hold the new device
+    if (newcrush.get_max_devices() < *new_id + 1) {
+      newcrush.set_max_devices(*new_id + 1);
+    }
+    string name = string("osd.") + stringify(*new_id);
+    if (!newcrush.item_exists(*new_id)) {
+      newcrush.set_item_name(*new_id, name);
+    }
+    ostringstream ss;
+    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
+    if (r < 0) {
+      derr << __func__ << " failed to set " << name << " device_class "
+	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
+	   << dendl;
+      // non-fatal... this might be a replay and we want to be idempotent.
+    } else {
+      dout(20) << __func__ << " set " << name << " device_class " << device_class
+	       << dendl;
+      // only a successful class update replaces the pending crush map
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    }
+  } else {
+    dout(20) << __func__ << " no device_class" << dendl;
+  }
+
+  dout(10) << __func__ << " using id " << *new_id << dendl;
+  // an explicitly requested id may lie beyond both the current and the
+  // pending max_osd; grow the pending max to cover it
+  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
+    pending_inc.new_max_osd = *new_id + 1;
+  }
+
+  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
+  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
+  // set it for us.  (ugh.)
+  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
+  if (!uuid.is_zero())
+    pending_inc.new_uuid[*new_id] = uuid;
+}
+
+/**
+ * Check whether an osd may be created for the given `id` / `uuid` pair.
+ *
+ * Shared by `osd create` and `osd new`: the id/uuid checks are basically
+ * the same for both.  `osd create` treats the uuid as optional for legacy
+ * reasons (and is therefore not idempotent without one), whereas
+ * `osd new` always supplies it, which lets us guarantee idempotency there.
+ *
+ * @param id                requested osd id, negative if none
+ * @param uuid              osd uuid, possibly zero
+ * @param check_osd_exists  whether an already existing `id` is an error
+ * @param existing_id       out: id already bound to `uuid`; set to -1 when
+ *                          neither id nor uuid was given; must not be null
+ * @param ss                receives an error description
+ * @return 0 if creation may go ahead; positive EEXIST if `uuid` already
+ *         belongs to an osd compatible with `id` (idempotent case);
+ *         -EAGAIN if the uuid or id is part of the pending incremental;
+ *         -EEXIST if `uuid` belongs to a different id; -EINVAL if `id` is
+ *         taken and `check_osd_exists` is set
+ */
+int OSDMonitor::validate_osd_create(
+    const int32_t id,
+    const uuid_d& uuid,
+    const bool check_osd_exists,
+    int32_t* existing_id,
+    stringstream& ss)
+{
+
+  dout(10) << __func__ << " id " << id << " uuid " << uuid
+           << " check_osd_exists " << check_osd_exists << dendl;
+
+  ceph_assert(existing_id);
+
+  if (uuid.is_zero()) {
+    if (id < 0) {
+      // neither id nor uuid: we have nothing to validate
+      *existing_id = -1;
+    }
+    // with only an id we ignore it - because that's what `osd create`
+    // does.
+    return 0;
+  }
+
+  ceph_assert(!uuid.is_zero());
+
+  // a pending incremental already carries this uuid: osd is about to exist
+  if (pending_inc.identify_osd(uuid) >= 0) {
+    return -EAGAIN;
+  }
+
+  const int32_t known_id = osdmap.identify_osd(uuid);
+  if (known_id >= 0) {
+    // osd already exists
+    const bool bound_elsewhere = (id >= 0 && known_id != id);
+    if (bound_elsewhere) {
+      ss << "uuid " << uuid << " already in use for different id " << known_id;
+      return -EEXIST;
+    }
+    // a positive errno tells the caller this is not a blocking error but
+    // an operation that would be idempotent.
+    *existing_id = known_id;
+    return EEXIST;
+  }
+
+  // the uuid is unknown from here on
+  if (id < 0) {
+    return 0;
+  }
+
+  if (pending_inc.new_state.count(id)) {
+    // osd is about to exist
+    return -EAGAIN;
+  }
+
+  // we may not care if an osd exists if we are recreating a previously
+  // destroyed osd.
+  if (check_osd_exists && osdmap.exists(id)) {
+    ss << "id " << id << " already in use and does not match uuid "
+       << uuid;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+/**
+ * Validate an `osd create` request.
+ *
+ * Refuses (with -EINVAL and a deprecation notice) to operate on an id
+ * that is marked destroyed; everything else is delegated to
+ * validate_osd_create() with the existence check enabled, and its result
+ * is returned as is.
+ */
+int OSDMonitor::prepare_command_osd_create(
+    const int32_t id,
+    const uuid_d& uuid,
+    int32_t* existing_id,
+    stringstream& ss)
+{
+  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+  ceph_assert(existing_id);
+
+  const bool targets_destroyed_osd = osdmap.is_destroyed(id);
+  if (targets_destroyed_osd) {
+    ss << "ceph osd create has been deprecated. Please use ceph osd new "
+          "instead.";
+    return -EINVAL;
+  }
+
+  const bool legacy_request = uuid.is_zero();
+  if (legacy_request) {
+    dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
+  }
+
+  const bool check_osd_exists = true;
+  return validate_osd_create(id, uuid, check_osd_exists, existing_id, ss);
+}
+
+/**
+ * Handle `osd new`: validate the request and, unless it turns out to be an
+ * idempotent repeat, stage the creation of a new osd or the re-creation of
+ * a previously destroyed one.
+ *
+ * @param op      the originating request (only logged here)
+ * @param cmdmap  parsed command arguments; "uuid" is required, "id" is
+ *                optional
+ * @param params  optional key/value parameters; the keys read here are
+ *                "crush_device_class", "cephx_secret",
+ *                "cephx_lockbox_secret" and "dmcrypt_key"
+ * @param ss      receives an error message, or the osd id when `f` is null
+ * @param f       optional formatter; when set, the osd id is dumped as
+ *                "osdid" inside a "created_osd" section instead of being
+ *                written to `ss`
+ * @return 0 when updates were staged, positive EEXIST when the request was
+ *         an idempotent no-op, negative errno on validation failure
+ */
+int OSDMonitor::prepare_command_osd_new(
+    MonOpRequestRef op,
+    const cmdmap_t& cmdmap,
+    const map<string,string>& params,
+    stringstream &ss,
+    Formatter *f)
+{
+  uuid_d uuid;
+  string uuidstr;
+  int64_t id = -1;
+
+  ceph_assert(paxos.is_plugged());
+
+  dout(10) << __func__ << " " << op << dendl;
+
+  /* validate command. abort now if something's wrong. */
+
+  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
+   *
+   * If `id` is not specified, we will identify any existing osd based
+   * on `uuid`. Operation will be idempotent iff secrets match.
+   *
+   * If `id` is specified, we will identify any existing osd based on
+   * `uuid` and match against `id`. If they match, operation will be
+   * idempotent iff secrets match.
+   *
+   * `-i secrets.json` will be optional. If supplied, will be used
+   * to check for idempotency when `id` and `uuid` match.
+   *
+   * If `id` is not specified, and `uuid` does not exist, an id will
+   * be found or allocated for the osd.
+   *
+   * If `id` is specified, and the osd has been previously marked
+   * as destroyed, then the `id` will be reused.
+   */
+  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
+    ss << "requires the OSD's UUID to be specified.";
+    return -EINVAL;
+  } else if (!uuid.parse(uuidstr.c_str())) {
+    ss << "invalid UUID value '" << uuidstr << "'.";
+    return -EINVAL;
+  }
+
+  if (cmd_getval(cmdmap, "id", id) &&
+      (id < 0)) {
+    ss << "invalid OSD id; must be greater or equal than zero.";
+    return -EINVAL;
+  }
+
+  // are we running an `osd create`-like command, or recreating
+  // a previously destroyed osd?
+
+  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
+
+  // we will care about `id` to assess whether osd is `destroyed`, or
+  // to create a new osd.
+  // we will need an `id` by the time we reach auth.
+
+  // NOTE(review): `id` is an int64_t here while validate_osd_create() takes
+  // an int32_t; only negative values are rejected above, so an id beyond
+  // the int32_t range would be narrowed -- confirm whether the command
+  // layer bounds it.
+  int32_t existing_id = -1;
+  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
+                                &existing_id, ss);
+
+  bool may_be_idempotent = false;
+  if (err == EEXIST) {
+    // this is idempotent from the osdmon's point-of-view
+    may_be_idempotent = true;
+    ceph_assert(existing_id >= 0);
+    id = existing_id;
+  } else if (err < 0) {
+    return err;
+  }
+
+  if (!may_be_idempotent) {
+    // idempotency is out of the window. We are either creating a new
+    // osd or recreating a destroyed osd.
+    //
+    // We now need to figure out if we have an `id` (and if it's valid),
+    // or find an `id` if we don't have one.
+
+    // NOTE: we need to consider the case where the `id` is specified for
+    // `osd create`, and we must honor it. So this means checking if
+    // the `id` is destroyed, and if so assume the destroy; otherwise,
+    // check if it `exists` - in which case we complain about not being
+    // `destroyed`. In the end, if nothing fails, we must allow the
+    // creation, so that we are compatible with `create`.
+    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
+      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
+      ss << "OSD " << id << " has not yet been destroyed";
+      return -EINVAL;
+    } else if (id < 0) {
+      // find an `id`
+      id = _allocate_osd_id(&existing_id);
+      if (id < 0) {
+        // no fresh id was handed out, so a reusable one must have been found
+        ceph_assert(existing_id >= 0);
+        id = existing_id;
+      }
+      dout(10) << __func__ << " found id " << id << " to use" << dendl;
+    } else if (id >= 0 && osdmap.is_destroyed(id)) {
+      dout(10) << __func__ << " recreating osd." << id << dendl;
+    } else {
+      dout(10) << __func__ << " creating new osd." << id << dendl;
+    }
+  } else {
+    ceph_assert(id >= 0);
+    ceph_assert(osdmap.exists(id));
+  }
+
+  // we are now able to either create a brand new osd or reuse an existing
+  // osd that has been previously destroyed.
+
+  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
+
+  if (may_be_idempotent && params.empty()) {
+    // nothing to do, really.
+    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
+    ceph_assert(id >= 0);
+    if (f) {
+      f->open_object_section("created_osd");
+      f->dump_int("osdid", id);
+      f->close_section();
+    } else {
+      ss << id;
+    }
+    return EEXIST;
+  }
+
+  string device_class;
+  auto p = params.find("crush_device_class");
+  if (p != params.end()) {
+    device_class = p->second;
+    dout(20) << __func__ << " device_class will be " << device_class << dendl;
+  }
+  string cephx_secret, lockbox_secret, dmcrypt_key;
+  bool has_lockbox = false;
+  bool has_secrets = params.count("cephx_secret")
+    || params.count("cephx_lockbox_secret")
+    || params.count("dmcrypt_key");
+
+  KVMonitor *svc = nullptr;
+  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
+
+  if (has_secrets) {
+    // any secret at all makes the cephx secret mandatory
+    if (params.count("cephx_secret") == 0) {
+      ss << "requires a cephx secret.";
+      return -EINVAL;
+    }
+    cephx_secret = params.at("cephx_secret");
+
+    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
+    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
+
+    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
+             << " dmcrypt " << has_dmcrypt_key << dendl;
+
+    if (has_lockbox_secret && has_dmcrypt_key) {
+      has_lockbox = true;
+      lockbox_secret = params.at("cephx_lockbox_secret");
+      dmcrypt_key = params.at("dmcrypt_key");
+    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
+      // exactly one of the two was supplied
+      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
+      return -EINVAL;
+    }
+
+    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
+
+    err = mon.authmon()->validate_osd_new(id, uuid,
+        cephx_secret,
+        lockbox_secret,
+        cephx_entity,
+        lockbox_entity,
+        ss);
+    if (err < 0) {
+      return err;
+    } else if (may_be_idempotent && err != EEXIST) {
+      // for this to be idempotent, `id` should already be >= 0; no need
+      // to use validate_id.
+      ceph_assert(id >= 0);
+      ss << "osd." << id << " exists but secrets do not match";
+      return -EEXIST;
+    }
+
+    if (has_lockbox) {
+      svc = mon.kvmon();
+      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
+      if (err < 0) {
+        return err;
+      } else if (may_be_idempotent && err != EEXIST) {
+        ceph_assert(id >= 0);
+        ss << "osd." << id << " exists but dm-crypt key does not match.";
+        return -EEXIST;
+      }
+    }
+  }
+  ceph_assert(!has_secrets || !cephx_secret.empty());
+  ceph_assert(!has_lockbox || !lockbox_secret.empty());
+
+  if (may_be_idempotent) {
+    // we have nothing to do for either the osdmon or the authmon,
+    // and we have no lockbox - so the config key service will not be
+    // touched. This is therefore an idempotent operation, and we can
+    // just return right away.
+    dout(10) << __func__ << " idempotent -- no op." << dendl;
+    ceph_assert(id >= 0);
+    if (f) {
+      f->open_object_section("created_osd");
+      f->dump_int("osdid", id);
+      f->close_section();
+    } else {
+      ss << id;
+    }
+    return EEXIST;
+  }
+  ceph_assert(!may_be_idempotent);
+
+  // perform updates.
+  if (has_secrets) {
+    ceph_assert(!cephx_secret.empty());
+    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
+           (!lockbox_secret.empty() && !dmcrypt_key.empty()));
+
+    err = mon.authmon()->do_osd_new(cephx_entity,
+        lockbox_entity,
+        has_lockbox);
+    ceph_assert(0 == err);
+
+    if (has_lockbox) {
+      ceph_assert(nullptr != svc);
+      svc->do_osd_new(uuid, dmcrypt_key);
+    }
+  }
+
+  if (is_recreate_destroyed) {
+    ceph_assert(id >= 0);
+    ceph_assert(osdmap.is_destroyed(id));
+    // NOTE(review): new_state looks like a toggle mask (the UP case below
+    // sets the bit in order to clear it), so these |= presumably flip the
+    // DESTROYED / NEW / UP flags rather than set them -- confirm against
+    // OSDMap::Incremental.
+    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
+    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+      pending_inc.new_state[id] |= CEPH_OSD_NEW;
+    }
+    if (osdmap.get_state(id) & CEPH_OSD_UP) {
+      // due to http://tracker.ceph.com/issues/20751 some clusters may
+      // have UP set for non-existent OSDs; make sure it is cleared
+      // for a newly created osd.
+      pending_inc.new_state[id] |= CEPH_OSD_UP;
+    }
+    pending_inc.new_uuid[id] = uuid;
+  } else {
+    ceph_assert(id >= 0);
+    int32_t new_id = -1;
+    do_osd_create(id, uuid, device_class, &new_id);
+    ceph_assert(new_id >= 0);
+    ceph_assert(id == new_id);
+  }
+
+  if (f) {
+    f->open_object_section("created_osd");
+    f->dump_int("osdid", id);
+    f->close_section();
+  } else {
+    ss << id;
+  }
+
+  return 0;
+}
+
+/**
+ * Entry point for the prepare phase of a monitor command.
+ *
+ * Parses the command JSON and checks that the request has a session; on
+ * either failure an error reply is sent and true is returned.  Otherwise
+ * the result of prepare_command_impl() is returned.
+ */
+bool OSDMonitor::prepare_command(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto request = op->get_req<MMonCommand>();
+
+  cmdmap_t cmdmap;
+  stringstream parse_errors;
+  const bool parsed = cmdmap_from_json(request->cmd, &cmdmap, parse_errors);
+  if (!parsed) {
+    // malformed command: report the parser's message back to the client
+    string reason = parse_errors.str();
+    mon.reply_command(op, -EINVAL, reason, get_last_committed());
+    return true;
+  }
+
+  if (op->get_session() == nullptr) {
+    derr << __func__ << " no session" << dendl;
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  return prepare_command_impl(op, cmdmap);
+}
+
+/**
+ * Parse the "weights" command argument into an osd id -> weight map.
+ *
+ * The argument must be a JSON object whose keys are osd ids and whose
+ * values are weights given as strings; single quotes are accepted in
+ * place of double quotes.
+ *
+ * @param cct      unused
+ * @param cmdmap   parsed command arguments
+ * @param osdmap   used to check that every referenced osd exists
+ * @param weights  out: receives one entry per osd in the argument
+ * @return 0 on success; -ENOENT if an osd does not exist; -EINVAL if the
+ *         argument is missing, is not a JSON object, or contains an id or
+ *         weight that is not a number or does not fit its type
+ */
+static int parse_reweights(CephContext *cct,
+			   const cmdmap_t& cmdmap,
+			   const OSDMap& osdmap,
+			   map<int32_t, uint32_t>* weights)
+{
+  string weights_str;
+  if (!cmd_getval(cmdmap, "weights", weights_str)) {
+    return -EINVAL;
+  }
+  // tolerate single-quoted pseudo-JSON by rewriting every ' into "
+  std::replace(begin(weights_str), end(weights_str), '\'', '"');
+  json_spirit::mValue json_value;
+  if (!json_spirit::read(weights_str, json_value)) {
+    return -EINVAL;
+  }
+  if (json_value.type() != json_spirit::obj_type) {
+    return -EINVAL;
+  }
+  // bind by reference: there is no need to deep-copy the parsed object
+  const auto& obj = json_value.get_obj();
+  try {
+    for (const auto& osd_weight : obj) {
+      const auto osd_id = std::stoi(osd_weight.first);
+      if (!osdmap.exists(osd_id)) {
+	return -ENOENT;
+      }
+      if (osd_weight.second.type() != json_spirit::str_type) {
+	return -EINVAL;
+      }
+      const auto weight = std::stoul(osd_weight.second.get_str());
+      // std::stoul yields an unsigned long (and wraps negative input);
+      // reject anything that would be silently truncated in the map
+      const auto weight32 = static_cast<uint32_t>(weight);
+      if (weight32 != weight) {
+	return -EINVAL;
+      }
+      weights->insert({osd_id, weight32});
+    }
+  } catch (const std::logic_error&) {
+    // std::invalid_argument / std::out_of_range from stoi / stoul
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Stage the destruction of osd `id`: drop its auth entities and config
+ * keys (where they exist) and update its pending state and uuid.
+ *
+ * The caller is responsible for proposing the pending changes.
+ *
+ * @param id  osd to destroy
+ * @param ss  receives status or error text
+ * @return 0 on success or if the osd is already destroyed; -ENOENT if the
+ *         osd does not exist; any other negative error reported by the
+ *         auth monitor's validation
+ */
+int OSDMonitor::prepare_command_osd_destroy(
+    int32_t id,
+    stringstream& ss)
+{
+  ceph_assert(paxos.is_plugged());
+
+  // `osd purge` may have removed the osd already, so a missing osd is
+  // reported as -ENOENT and left for the caller to handle.
+  //
+  // all auth secrets and config keys are presumed to have been removed
+  // before this command was called; if any exist by now we assume some
+  // other command created them and that they do not pertain to this
+  // non-existent osd.
+  if (!osdmap.exists(id)) {
+    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
+    return -ENOENT;
+  }
+
+  uuid_d uuid = osdmap.get_uuid(id);
+  dout(10) << __func__ << " destroying osd." << id
+           << " uuid " << uuid << dendl;
+
+  // an already destroyed osd means our work here is done.
+  if (osdmap.is_destroyed(id)) {
+    ss << "destroyed osd." << id;
+    return 0;
+  }
+
+  EntityName cephx_entity, lockbox_entity;
+
+  // auth: -ENOENT means there is nothing to remove; any other negative
+  // result is a real failure.
+  int err = mon.authmon()->validate_osd_destroy(id, uuid,
+                                                 cephx_entity,
+                                                 lockbox_entity,
+                                                 ss);
+  if (err < 0 && err != -ENOENT) {
+    return err;
+  }
+  const bool idempotent_auth = (err == -ENOENT);
+
+  // config keys: the only failure we expect is -ENOENT.
+  auto svc = mon.kvmon();
+  err = svc->validate_osd_destroy(id, uuid);
+  const bool idempotent_cks = (err < 0);
+  if (idempotent_cks) {
+    ceph_assert(err == -ENOENT);
+    err = 0;
+  }
+
+  if (!idempotent_auth) {
+    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
+    ceph_assert(0 == err);
+  }
+
+  if (!idempotent_cks) {
+    svc->do_osd_destroy(id, uuid);
+  }
+
+  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
+  pending_inc.new_uuid[id] = uuid_d();
+
+  // we can only propose_pending() once per service, otherwise we'll be
+  // defying PaxosService and all laws of nature. Therefore, as we may
+  // be used during 'osd purge', let's keep the caller responsible for
+  // proposing.
+  ceph_assert(err == 0);
+  return 0;
+}
+
+/*
+ * Stage the complete removal of osd.<id>: out of crush, destroyed, and
+ * removed from the osdmap.
+ *
+ * Paxos must be plugged and the osd must not be up.
+ *
+ * id: the osd to purge.
+ * ss: receives an error message when the crush removal fails; it is also
+ *     passed on to prepare_command_osd_destroy().
+ *
+ * Returns 0 once everything is staged, -ENOENT when the osd was neither
+ * in crush nor in the osdmap (so there was nothing to do), or the
+ * negative error from the crush-removal check or from
+ * prepare_command_osd_destroy().
+ */
+int OSDMonitor::prepare_command_osd_purge(
+    int32_t id,
+    stringstream& ss)
+{
+  ceph_assert(paxos.is_plugged());
+  dout(10) << __func__ << " purging osd." << id << dendl;
+
+  ceph_assert(!osdmap.is_up(id));
+
+  /*
+   * The ordering below is deliberate:
+   *
+   *  1. first make sure the removal from crush works, on a private copy
+   *     of the pending crush map.
+   *  2. then call `prepare_command_osd_destroy()`. If it fails we abort
+   *     the whole operation, since nothing has been updated yet. Once
+   *     it succeeds, however, it has had side-effects, so every step
+   *     after it must *always* succeed.
+   *  3. then call `prepare_command_osd_remove()`. It can return an
+   *     error, but currently only when the osd is up - which we have
+   *     already ruled out - so in practice it is a plain update.
+   *  4. finally call `do_osd_crush_remove()` to apply the crush update
+   *     that was held back in step 1.
+   */
+
+  CrushWrapper pending_crush;
+  _get_pending_crush(pending_crush);
+
+  int r = _prepare_command_osd_crush_remove(pending_crush, id, 0, false, false);
+  if (r < 0 && r != -ENOENT) {
+    ss << "error removing osd." << id << " from crush";
+    return r;
+  }
+
+  // -ENOENT from the crush step is the first hint that this purge may be
+  // a repeat of one that already went through.
+  bool maybe_repeat = (r == -ENOENT);
+  if (maybe_repeat) {
+    r = 0;
+  }
+
+  // an osd that is already marked destroyed does not need destroying again
+  if (!osdmap.is_destroyed(id)) {
+    r = prepare_command_osd_destroy(id, ss);
+    if (r >= 0) {
+      // the destroy step accepted the osd, so this is not a pure repeat
+      maybe_repeat = false;
+    } else if (r == -ENOENT) {
+      r = 0;
+    } else {
+      return r;
+    }
+  }
+  ceph_assert(0 == r);
+
+  if (maybe_repeat && !osdmap.exists(id)) {
+    dout(10) << __func__ << " osd." << id << " does not exist and "
+             << "we are idempotent." << dendl;
+    return -ENOENT;
+  }
+
+  r = prepare_command_osd_remove(id);
+  // this cannot report busy: we asserted above that the osd is not up.
+  ceph_assert(0 == r);
+
+  do_osd_crush_remove(pending_crush);
+  return 0;
+}
+
+bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
+				      const cmdmap_t& cmdmap)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MMonCommand>();
+  bool ret = false;
+  stringstream ss;
+  string rs;
+  bufferlist rdata;
+  int err = 0;
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  int64_t osdid;
+  string osd_name;
+  bool osdid_present = false;
+  if (prefix != "osd pg-temp" &&
+      prefix != "osd pg-upmap" &&
+      prefix != "osd pg-upmap-items") {  // avoid commands with non-int id arg
+    osdid_present = cmd_getval(cmdmap, "id", osdid);
+  }
+  if (osdid_present) {
+    ostringstream oss;
+    oss << "osd." << osdid;
+    osd_name = oss.str();
+  }
+
+  // Even if there's a pending state with changes that could affect
+  // a command, considering that said state isn't yet committed, we
+  // just don't care about those changes if the command currently being
+  // handled acts as a no-op against the current committed state.
+  // In a nutshell, we assume this command  happens *before*.
+  //
+  // Let me make this clearer:
+  //
+  //   - If we have only one client, and that client issues some
+  //     operation that would conflict with this operation  but is
+  //     still on the pending state, then we would be sure that said
+  //     operation wouldn't have returned yet, so the client wouldn't
+  //     issue this operation (unless the client didn't wait for the
+  //     operation to finish, and that would be the client's own fault).
+  //
+  //   - If we have more than one client, each client will observe
+  //     whatever is the state at the moment of the commit.  So, if we
+  //     have two clients, one issuing an unlink and another issuing a
+  //     link, and if the link happens while the unlink is still on the
+  //     pending state, from the link's point-of-view this is a no-op.
+  //     If different clients are issuing conflicting operations and
+  //     they care about that, then the clients should make sure they
+  //     enforce some kind of concurrency mechanism -- from our
+  //     perspective that's what Douglas Adams would call an SEP.
+  //
+  // This should be used as a general guideline for most commands handled
+  // in this function.  Adapt as you see fit, but please bear in mind that
+  // this is the expected behavior.
+   
+ 
+  if (prefix == "osd setcrushmap" ||
+      (prefix == "osd crush set" && !osdid_present)) {
+    if (pending_inc.crush.length()) {
+      dout(10) << __func__ << " waiting for pending crush update " << dendl;
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+    dout(10) << "prepare_command setting new crush map" << dendl;
+    bufferlist data(m->get_data());
+    CrushWrapper crush;
+    try {
+      auto bl = data.cbegin();
+      crush.decode(bl);
+    }
+    catch (const std::exception &e) {
+      err = -EINVAL;
+      ss << "Failed to parse crushmap: " << e.what();
+      goto reply;
+    }
+  
+    int64_t prior_version = 0;
+    if (cmd_getval(cmdmap, "prior_version", prior_version)) {
+      if (prior_version == osdmap.get_crush_version() - 1) {
+	// see if we are a resend of the last update.  this is imperfect
+	// (multiple racing updaters may not both get reliable success)
+	// but we expect crush updaters (via this interface) to be rare-ish.
+	bufferlist current, proposed;
+	osdmap.crush->encode(current, mon.get_quorum_con_features());
+	crush.encode(proposed, mon.get_quorum_con_features());
+	if (current.contents_equal(proposed)) {
+	  dout(10) << __func__
+		   << " proposed matches current and version equals previous"
+		   << dendl;
+	  err = 0;
+	  ss << osdmap.get_crush_version();
+	  goto reply;
+	}
+      }
+      if (prior_version != osdmap.get_crush_version()) {
+	err = -EPERM;
+	ss << "prior_version " << prior_version << " != crush version "
+	   << osdmap.get_crush_version();
+	goto reply;
+      }
+    }
+
+    if (crush.has_legacy_rule_ids()) {
+      err = -EINVAL;
+      ss << "crush maps with ruleset != ruleid are no longer allowed";
+      goto reply;
+    }
+    if (!validate_crush_against_features(&crush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    err = osdmap.validate_crush_rules(&crush, &ss);
+    if (err < 0) {
+      goto reply;
+    }
+
+    if (g_conf()->mon_osd_crush_smoke_test) {
+      // sanity check: test some inputs to make sure this map isn't
+      // totally broken
+      dout(10) << " testing map" << dendl;
+      stringstream ess;
+      CrushTester tester(crush, ess);
+      tester.set_min_x(0);
+      tester.set_max_x(50);
+      auto start = ceph::coarse_mono_clock::now();
+      int r = tester.test_with_fork(g_conf()->mon_lease);
+      auto duration = ceph::coarse_mono_clock::now() - start;
+      if (r < 0) {
+	dout(10) << " tester.test_with_fork returns " << r
+		 << ": " << ess.str() << dendl;
+	ss << "crush smoke test failed with " << r << ": " << ess.str();
+	err = r;
+	goto reply;
+      }
+      dout(10) << __func__ << " crush somke test duration: "
+               << duration << ", result: " << ess.str() << dendl;
+    }
+
+    pending_inc.crush = data;
+    ss << osdmap.get_crush_version() + 1;
+    goto update;
+
+  } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
+      int bid = -1 - b;
+      if (newcrush.bucket_exists(bid) &&
+	  newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
+	dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
+	newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
+      }
+    }
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush set-device-class") {
+    string device_class;
+    if (!cmd_getval(cmdmap, "class", device_class)) {
+      err = -EINVAL; // no value!
+      goto reply;
+    }
+
+    bool stop = false;
+    vector<string> idvec;
+    cmd_getval(cmdmap, "ids", idvec);
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    set<int> updated;
+    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+      set<int> osds;
+      // wildcard?
+      if (j == 0 &&
+          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+        osdmap.get_all_osds(osds);
+        stop = true;
+      } else {
+        // try traditional single osd way
+        long osd = parse_osd_id(idvec[j].c_str(), &ss);
+        if (osd < 0) {
+          // ss has reason for failure
+          ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+          err = -EINVAL;
+          continue;
+        }
+        osds.insert(osd);
+      }
+
+      for (auto &osd : osds) {
+        if (!osdmap.exists(osd)) {
+          ss << "osd." << osd << " does not exist. ";
+          continue;
+        }
+
+        ostringstream oss;
+        oss << "osd." << osd;
+        string name = oss.str();
+
+	if (newcrush.get_max_devices() < osd + 1) {
+	  newcrush.set_max_devices(osd + 1);
+	}
+        string action;
+        if (newcrush.item_exists(osd)) {
+          action = "updating";
+        } else {
+          action = "creating";
+          newcrush.set_item_name(osd, name);
+        }
+
+        dout(5) << action << " crush item id " << osd << " name '" << name
+                << "' device_class '" << device_class << "'"
+                << dendl;
+        err = newcrush.update_device_class(osd, device_class, name, &ss);
+        if (err < 0) {
+          goto reply;
+        }
+        if (err == 0 && !_have_pending_crush()) {
+          if (!stop) {
+            // for single osd only, wildcard makes too much noise
+            ss << "set-device-class item id " << osd << " name '" << name
+               << "' device_class '" << device_class << "': no change. ";
+          }
+        } else {
+          updated.insert(osd);
+        }
+      }
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "set osd(s) " << updated << " to class '" << device_class << "'";
+    getline(ss, rs);
+    wait_for_finished_proposal(
+      op,
+      new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+    return true;
+ } else if (prefix == "osd crush rm-device-class") {
+    bool stop = false;
+    vector<string> idvec;
+    cmd_getval(cmdmap, "ids", idvec);
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    set<int> updated;
+
+    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+      set<int> osds;
+
+      // wildcard?
+      if (j == 0 &&
+          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+        osdmap.get_all_osds(osds);
+        stop = true;
+      } else {
+        // try traditional single osd way
+        long osd = parse_osd_id(idvec[j].c_str(), &ss);
+        if (osd < 0) {
+          // ss has reason for failure
+          ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
+          err = -EINVAL;
+          goto reply;
+        }
+        osds.insert(osd);
+      }
+
+      for (auto &osd : osds) {
+        if (!osdmap.exists(osd)) {
+          ss << "osd." << osd << " does not exist. ";
+          continue;
+        }
+
+        auto class_name = newcrush.get_item_class(osd);
+        if (!class_name) {
+          ss << "osd." << osd << " belongs to no class, ";
+          continue;
+        }
+        // note that we deliberately do not check class_is_in_use here:
+        // the device may be misclassified, and the user may want to
+        // forcibly reset its class.
+
+        err = newcrush.remove_device_class(cct, osd, &ss);
+        if (err < 0) {
+          // ss has reason for failure
+          goto reply;
+        }
+        updated.insert(osd);
+      }
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "done removing class of osd(s): " << updated;
+    getline(ss, rs);
+    wait_for_finished_proposal(
+      op,
+      new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush class create") {
+    string device_class;
+    if (!cmd_getval(cmdmap, "class", device_class)) {
+      err = -EINVAL; // no value!
+      goto reply;
+    }
+    if (osdmap.require_osd_release < ceph_release_t::luminous) {
+      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+         << "luminous' before using crush device classes";
+      err = -EPERM;
+      goto reply;
+    }
+    if (!_have_pending_crush() &&
+        _get_stable_crush().class_exists(device_class)) {
+      ss << "class '" << device_class << "' already exists";
+      goto reply;
+    }
+     CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+     if (newcrush.class_exists(device_class)) {
+      ss << "class '" << device_class << "' already exists";
+      goto update;
+    }
+    int class_id = newcrush.get_or_create_class_id(device_class);
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "created class " << device_class << " with id " << class_id
+       << " to crush map";
+    goto update;
+  } else if (prefix == "osd crush class rm") {
+    string device_class;
+    if (!cmd_getval(cmdmap, "class", device_class)) {
+       err = -EINVAL; // no value!
+       goto reply;
+     }
+    if (osdmap.require_osd_release < ceph_release_t::luminous) {
+       ss << "you must complete the upgrade and 'ceph osd require-osd-release "
+         << "luminous' before using crush device classes";
+       err = -EPERM;
+       goto reply;
+     }
+
+     if (!osdmap.crush->class_exists(device_class)) {
+       err = 0;
+       goto reply;
+     }
+
+     CrushWrapper newcrush;
+     _get_pending_crush(newcrush);
+     if (!newcrush.class_exists(device_class)) {
+       err = 0; // make command idempotent
+       goto wait;
+     }
+     int class_id = newcrush.get_class_id(device_class);
+     stringstream ts;
+     if (newcrush.class_is_in_use(class_id, &ts)) {
+       err = -EBUSY;
+       ss << "class '" << device_class << "' " << ts.str();
+       goto reply;
+     }
+
+     // check if class is used by any erasure-code-profiles
+     mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
+       osdmap.get_erasure_code_profiles();
+     auto ec_profiles = pending_inc.get_erasure_code_profiles();
+#ifdef HAVE_STDLIB_MAP_SPLICING
+     ec_profiles.merge(old_ec_profiles);
+#else
+     ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
+                        make_move_iterator(end(old_ec_profiles)));
+#endif
+     list<string> referenced_by;
+     for (auto &i: ec_profiles) {
+       for (auto &j: i.second) {
+         if ("crush-device-class" == j.first && device_class == j.second) {
+           referenced_by.push_back(i.first);
+         }
+       }
+     }
+     if (!referenced_by.empty()) {
+       err = -EBUSY;
+       ss << "class '" << device_class
+          << "' is still referenced by erasure-code-profile(s): " << referenced_by;
+       goto reply;
+     }
+
+     set<int> osds;
+     newcrush.get_devices_by_class(device_class, &osds);
+     for (auto& p: osds) {
+       err = newcrush.remove_device_class(g_ceph_context, p, &ss);
+       if (err < 0) {
+         // ss has reason for failure
+         goto reply;
+       }
+     }
+
+     if (osds.empty()) {
+       // empty class, remove directly
+       err = newcrush.remove_class_name(device_class);
+       if (err < 0) {
+         ss << "class '" << device_class << "' cannot be removed '"
+            << cpp_strerror(err) << "'";
+         goto reply;
+       }
+     }
+
+     pending_inc.crush.clear();
+     newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+     ss << "removed class " << device_class << " with id " << class_id
+        << " from crush map";
+     goto update;
+  } else if (prefix == "osd crush class rename") {
+    string srcname, dstname;
+    if (!cmd_getval(cmdmap, "srcname", srcname)) {
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!cmd_getval(cmdmap, "dstname", dstname)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
+      // suppose this is a replay and return success
+      // so command is idempotent
+      ss << "already renamed to '" << dstname << "'";
+      err = 0;
+      goto reply;
+    }
+
+    err = newcrush.rename_class(srcname, dstname);
+    if (err < 0) {
+      ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
+         << cpp_strerror(err);
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "rename class '" << srcname << "' to '" << dstname << "'";
+    goto update;
+  } else if (prefix == "osd crush add-bucket") {
+    // osd crush add-bucket <name> <type>
+    string name, typestr;
+    vector<string> argvec;
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "type", typestr);
+    cmd_getval(cmdmap, "args", argvec);
+    map<string,string> loc;
+    if (!argvec.empty()) {
+      CrushWrapper::parse_loc_map(argvec, &loc);
+      dout(0) << "will create and move bucket '" << name
+              << "' to location " << loc << dendl;
+    }
+
+    if (!_have_pending_crush() &&
+	_get_stable_crush().name_exists(name)) {
+      ss << "bucket '" << name << "' already exists";
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (newcrush.name_exists(name)) {
+      ss << "bucket '" << name << "' already exists";
+      goto update;
+    }
+    int type = newcrush.get_type_id(typestr);
+    if (type < 0) {
+      ss << "type '" << typestr << "' does not exist";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (type == 0) {
+      ss << "type '" << typestr << "' is for devices, not buckets";
+      err = -EINVAL;
+      goto reply;
+    }
+    int bucketno;
+    err = newcrush.add_bucket(0, 0,
+			      CRUSH_HASH_DEFAULT, type, 0, NULL,
+			      NULL, &bucketno);
+    if (err < 0) {
+      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
+      goto reply;
+    }
+    err = newcrush.set_item_name(bucketno, name);
+    if (err < 0) {
+      ss << "error setting bucket name to '" << name << "'";
+      goto reply;
+    }
+
+    if (!loc.empty()) {
+      if (!newcrush.check_item_loc(cct, bucketno, loc,
+          (int *)NULL)) {
+        err = newcrush.move_bucket(cct, bucketno, loc);
+        if (err < 0) {
+          ss << "error moving bucket '" << name << "' to location " << loc;
+          goto reply;
+        }
+      } else {
+        ss << "no need to move item id " << bucketno << " name '" << name
+           << "' to location " << loc << " in crush map";
+      }
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    if (loc.empty()) {
+      ss << "added bucket " << name << " type " << typestr
+         << " to crush map";
+    } else {
+      ss << "added bucket " << name << " type " << typestr
+         << " to location " << loc;
+    }
+    goto update;
+  } else if (prefix == "osd crush rename-bucket") {
+    string srcname, dstname;
+    cmd_getval(cmdmap, "srcname", srcname);
+    cmd_getval(cmdmap, "dstname", dstname);
+
+    err = crush_rename_bucket(srcname, dstname, &ss);
+    if (err == -EALREADY) // equivalent to success for idempotency
+      err = 0;
+    if (err)
+      goto reply;
+    else
+      goto update;
+  } else if (prefix == "osd crush weight-set create" ||
+	     prefix == "osd crush weight-set create-compat") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    int64_t pool;
+    int positions;
+    if (newcrush.has_non_straw2_buckets()) {
+      ss << "crush map contains one or more bucket(s) that are not straw2";
+      err = -EPERM;
+      goto reply;
+    }
+    if (prefix == "osd crush weight-set create") {
+      if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+	  osdmap.require_min_compat_client < ceph_release_t::luminous) {
+	ss << "require_min_compat_client "
+	   << osdmap.require_min_compat_client
+	   << " < luminous, which is required for per-pool weight-sets. "
+           << "Try 'ceph osd set-require-min-compat-client luminous' "
+           << "before using the new interface";
+	err = -EPERM;
+	goto reply;
+      }
+      string poolname, mode;
+      cmd_getval(cmdmap, "pool", poolname);
+      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+      if (pool < 0) {
+	ss << "pool '" << poolname << "' not found";
+	err = -ENOENT;
+	goto reply;
+      }
+      cmd_getval(cmdmap, "mode", mode);
+      if (mode != "flat" && mode != "positional") {
+	ss << "unrecognized weight-set mode '" << mode << "'";
+	err = -EINVAL;
+	goto reply;
+      }
+      positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
+    } else {
+      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+      positions = 1;
+    }
+    if (!newcrush.create_choose_args(pool, positions)) {
+      if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+        ss << "compat weight-set already created";
+      } else {
+        ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
+           << "' already created";
+      }
+      goto reply;
+    }
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    goto update;
+
+  } else if (prefix == "osd crush weight-set rm" ||
+	     prefix == "osd crush weight-set rm-compat") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    int64_t pool;
+    if (prefix == "osd crush weight-set rm") {
+      string poolname;
+      cmd_getval(cmdmap, "pool", poolname);
+      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+      if (pool < 0) {
+	ss << "pool '" << poolname << "' not found";
+	err = -ENOENT;
+	goto reply;
+      }
+    } else {
+      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+    }
+    newcrush.rm_choose_args(pool);
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    goto update;
+
+  } else if (prefix == "osd crush weight-set reweight" ||
+	     prefix == "osd crush weight-set reweight-compat") {
+    string poolname, item;
+    vector<double> weight;
+    cmd_getval(cmdmap, "pool", poolname);
+    cmd_getval(cmdmap, "item", item);
+    cmd_getval(cmdmap, "weight", weight);
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    int64_t pool;
+    if (prefix == "osd crush weight-set reweight") {
+      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
+      if (pool < 0) {
+	ss << "pool '" << poolname << "' not found";
+	err = -ENOENT;
+	goto reply;
+      }
+      if (!newcrush.have_choose_args(pool)) {
+	ss << "no weight-set for pool '" << poolname << "'";
+	err = -ENOENT;
+	goto reply;
+      }
+      auto arg_map = newcrush.choose_args_get(pool);
+      int positions = newcrush.get_choose_args_positions(arg_map);
+      if (weight.size() != (size_t)positions) {
+         ss << "must specify exact " << positions << " weight values";
+         err = -EINVAL;
+         goto reply;
+      }
+    } else {
+      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
+      if (!newcrush.have_choose_args(pool)) {
+	ss << "no backward-compatible weight-set";
+	err = -ENOENT;
+	goto reply;
+      }
+    }
+    if (!newcrush.name_exists(item)) {
+      ss << "item '" << item << "' does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    err = newcrush.choose_args_adjust_item_weightf(
+      cct,
+      newcrush.choose_args_get(pool),
+      newcrush.get_item_id(item),
+      weight,
+      &ss);
+    if (err < 0) {
+      goto reply;
+    }
+    err = 0;
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    goto update;
+  } else if (osdid_present &&
+	     (prefix == "osd crush set" || prefix == "osd crush add")) {
+    // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
+    // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
+    // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
+
+    if (!osdmap.exists(osdid)) {
+      err = -ENOENT;
+      ss << osd_name
+	 << " does not exist. Create it before updating the crush map";
+      goto reply;
+    }
+
+    double weight;
+    if (!cmd_getval(cmdmap, "weight", weight)) {
+      ss << "unable to parse weight value '"
+         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    string args;
+    vector<string> argvec;
+    cmd_getval(cmdmap, "args", argvec);
+    map<string,string> loc;
+    CrushWrapper::parse_loc_map(argvec, &loc);
+
+    if (prefix == "osd crush set"
+        && !_get_stable_crush().item_exists(osdid)) {
+      err = -ENOENT;
+      ss << "unable to set item id " << osdid << " name '" << osd_name
+         << "' weight " << weight << " at location " << loc
+         << ": does not exist";
+      goto reply;
+    }
+
+    dout(5) << "adding/updating crush item id " << osdid << " name '"
+      << osd_name << "' weight " << weight << " at location "
+      << loc << dendl;
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    string action;
+    if (prefix == "osd crush set" ||
+        newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
+      action = "set";
+      err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
+    } else {
+      action = "add";
+      err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
+      if (err == 0)
+        err = 1;
+    }
+
+    if (err < 0)
+      goto reply;
+
+    if (err == 0 && !_have_pending_crush()) {
+      ss << action << " item id " << osdid << " name '" << osd_name
+	 << "' weight " << weight << " at location " << loc << ": no change";
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
+       << weight << " at location " << loc << " to crush map";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush create-or-move") {
+    do {
+      // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
+      if (!osdmap.exists(osdid)) {
+	err = -ENOENT;
+	ss << osd_name
+	   << " does not exist.  create it before updating the crush map";
+	goto reply;
+      }
+
+      double weight;
+      if (!cmd_getval(cmdmap, "weight", weight)) {
+        ss << "unable to parse weight value '"
+           << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+        err = -EINVAL;
+        goto reply;
+      }
+
+      string args;
+      vector<string> argvec;
+      cmd_getval(cmdmap, "args", argvec);
+      map<string,string> loc;
+      CrushWrapper::parse_loc_map(argvec, &loc);
+
+      dout(0) << "create-or-move crush item name '" << osd_name
+	      << "' initial_weight " << weight << " at location " << loc
+	      << dendl;
+
+      CrushWrapper newcrush;
+      _get_pending_crush(newcrush);
+
+      err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
+					 g_conf()->osd_crush_update_weight_set);
+      if (err == 0) {
+	ss << "create-or-move updated item name '" << osd_name
+	   << "' weight " << weight
+	   << " at location " << loc << " to crush map";
+	break;
+      }
+      if (err > 0) {
+	pending_inc.crush.clear();
+	newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+	ss << "create-or-move updating item name '" << osd_name
+	   << "' weight " << weight
+	   << " at location " << loc << " to crush map";
+	getline(ss, rs);
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+	return true;
+      }
+    } while (false);
+
+  } else if (prefix == "osd crush move") {
+    do {
+      // osd crush move <name> <loc1> [<loc2> ...]
+      string name;
+      vector<string> argvec;
+      cmd_getval(cmdmap, "name", name);
+      cmd_getval(cmdmap, "args", argvec);
+      map<string,string> loc;
+      CrushWrapper::parse_loc_map(argvec, &loc);
+
+      dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
+      CrushWrapper newcrush;
+      _get_pending_crush(newcrush);
+
+      if (!newcrush.name_exists(name)) {
+	err = -ENOENT;
+	ss << "item " << name << " does not exist";
+	break;
+      }
+      int id = newcrush.get_item_id(name);
+
+      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+	if (id >= 0) {
+	  err = newcrush.create_or_move_item(
+	    cct, id, 0, name, loc,
+	    g_conf()->osd_crush_update_weight_set);
+	} else {
+	  err = newcrush.move_bucket(cct, id, loc);
+	}
+	if (err >= 0) {
+	  ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+	  pending_inc.crush.clear();
+	  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+	  getline(ss, rs);
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						   get_last_committed() + 1));
+	  return true;
+	}
+      } else {
+	ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
+	err = 0;
+      }
+    } while (false);
+  } else if (prefix == "osd crush swap-bucket") {
+    string source, dest;
+    cmd_getval(cmdmap, "source", source);
+    cmd_getval(cmdmap, "dest", dest);
+
+    bool force = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    if (!newcrush.name_exists(source)) {
+      ss << "source item " << source << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (!newcrush.name_exists(dest)) {
+      ss << "dest item " << dest << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    int sid = newcrush.get_item_id(source);
+    int did = newcrush.get_item_id(dest);
+    int sparent;
+    if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
+      ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
+      err = -EPERM;
+      goto reply;
+    }
+    if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
+	!force) {
+      ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
+	 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
+	 << "; pass --yes-i-really-mean-it to proceed anyway";
+      err = -EPERM;
+      goto reply;
+    }
+    int r = newcrush.swap_bucket(cct, sid, did);
+    if (r < 0) {
+      ss << "failed to swap bucket contents: " << cpp_strerror(r);
+      err = r;
+      goto reply;
+    }
+    ss << "swapped bucket of " << source << " to " << dest;
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    wait_for_finished_proposal(op,
+			       new Monitor::C_Command(mon, op, err, ss.str(),
+						      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush link") {
+    // osd crush link <name> <loc1> [<loc2> ...]
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    vector<string> argvec;
+    cmd_getval(cmdmap, "args", argvec);
+    map<string,string> loc;
+    CrushWrapper::parse_loc_map(argvec, &loc);
+
+    // Need an explicit check for name_exists because get_item_id returns
+    // 0 on unfound.
+    int id = osdmap.crush->get_item_id(name);
+    if (!osdmap.crush->name_exists(name)) {
+      err = -ENOENT;
+      ss << "item " << name << " does not exist";
+      goto reply;
+    } else {
+      dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
+    }
+    if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
+      ss << "no need to move item id " << id << " name '" << name
+	 << "' to location " << loc << " in crush map";
+      err = 0;
+      goto reply;
+    }
+
+    dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (!newcrush.name_exists(name)) {
+      err = -ENOENT;
+      ss << "item " << name << " does not exist";
+      goto reply;
+    } else {
+      int id = newcrush.get_item_id(name);
+      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
+	err = newcrush.link_bucket(cct, id, loc);
+	if (err >= 0) {
+	  ss << "linked item id " << id << " name '" << name
+             << "' to location " << loc << " in crush map";
+	  pending_inc.crush.clear();
+	  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+	} else {
+	  ss << "cannot link item id " << id << " name '" << name
+             << "' to location " << loc;
+          goto reply;
+	}
+      } else {
+	ss << "no need to move item id " << id << " name '" << name
+           << "' to location " << loc << " in crush map";
+	err = 0;
+      }
+    }
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush rm" ||
+	     prefix == "osd crush remove" ||
+	     prefix == "osd crush unlink") {
+    do {
+      // osd crush rm|remove|unlink <name> [<ancestor>]
+      CrushWrapper newcrush;
+      _get_pending_crush(newcrush);
+
+      string name;
+      cmd_getval(cmdmap, "name", name);
+
+      if (!osdmap.crush->name_exists(name)) {
+	err = 0;
+	ss << "device '" << name << "' does not appear in the crush map";
+	break;
+      }
+      if (!newcrush.name_exists(name)) {
+	err = 0;
+	ss << "device '" << name << "' does not appear in the crush map";
+	getline(ss, rs);
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+	return true;
+      }
+      int id = newcrush.get_item_id(name);
+      int ancestor = 0;
+
+      bool unlink_only = prefix == "osd crush unlink";
+      string ancestor_str;
+      if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
+	if (!newcrush.name_exists(ancestor_str)) {
+	  err = -ENOENT;
+	  ss << "ancestor item '" << ancestor_str
+	     << "' does not appear in the crush map";
+	  break;
+	}
+        ancestor = newcrush.get_item_id(ancestor_str);
+      }
+
+      err = prepare_command_osd_crush_remove(
+          newcrush,
+          id, ancestor,
+          (ancestor < 0), unlink_only);
+
+      if (err == -ENOENT) {
+	ss << "item " << id << " does not appear in that position";
+	err = 0;
+	break;
+      }
+      if (err == 0) {
+        if (!unlink_only)
+          pending_inc.new_crush_node_flags[id] = 0;
+	ss << "removed item id " << id << " name '" << name << "' from crush map";
+	getline(ss, rs);
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+	return true;
+      }
+    } while (false);
+
+  } else if (prefix == "osd crush reweight-all") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    newcrush.reweight(cct);
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "reweighted crush hierarchy";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush reweight") {
+    // osd crush reweight <name> <weight>
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    if (!newcrush.name_exists(name)) {
+      err = -ENOENT;
+      ss << "device '" << name << "' does not appear in the crush map";
+      goto reply;
+    }
+
+    int id = newcrush.get_item_id(name);
+    if (id < 0) {
+      ss << "device '" << name << "' is not a leaf in the crush map";
+      err = -EINVAL;
+      goto reply;
+    }
+    double w;
+    if (!cmd_getval(cmdmap, "weight", w)) {
+      ss << "unable to parse weight value '"
+	 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    err = newcrush.adjust_item_weightf(cct, id, w,
+				       g_conf()->osd_crush_update_weight_set);
+    if (err < 0)
+      goto reply;
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "reweighted item id " << id << " name '" << name << "' to " << w
+       << " in crush map";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush reweight-subtree") {
+    // osd crush reweight-subtree <name> <weight>
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    if (!newcrush.name_exists(name)) {
+      err = -ENOENT;
+      ss << "device '" << name << "' does not appear in the crush map";
+      goto reply;
+    }
+
+    int id = newcrush.get_item_id(name);
+    if (id >= 0) {
+      ss << "device '" << name << "' is not a subtree in the crush map";
+      err = -EINVAL;
+      goto reply;
+    }
+    double w;
+    if (!cmd_getval(cmdmap, "weight", w)) {
+      ss << "unable to parse weight value '"
+	 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    err = newcrush.adjust_subtree_weightf(cct, id, w,
+					  g_conf()->osd_crush_update_weight_set);
+    if (err < 0)
+      goto reply;
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
+       << " in crush map";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush tunables") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    err = 0;
+    string profile;
+    cmd_getval(cmdmap, "profile", profile);
+    if (profile == "legacy" || profile == "argonaut") {
+      newcrush.set_tunables_legacy();
+    } else if (profile == "bobtail") {
+      newcrush.set_tunables_bobtail();
+    } else if (profile == "firefly") {
+      newcrush.set_tunables_firefly();
+    } else if (profile == "hammer") {
+      newcrush.set_tunables_hammer();
+    } else if (profile == "jewel") {
+      newcrush.set_tunables_jewel();
+    } else if (profile == "optimal") {
+      newcrush.set_tunables_optimal();
+    } else if (profile == "default") {
+      newcrush.set_tunables_default();
+    } else {
+      ss << "unrecognized profile '" << profile << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "adjusted tunables profile to " << profile;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd crush set-tunable") {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    err = 0;
+    string tunable;
+    cmd_getval(cmdmap, "tunable", tunable);
+
+    int64_t value = -1;
+    if (!cmd_getval(cmdmap, "value", value)) {
+      err = -EINVAL;
+      ss << "failed to parse integer value "
+	 << cmd_vartype_stringify(cmdmap.at("value"));
+      goto reply;
+    }
+
+    if (tunable == "straw_calc_version") {
+      if (value != 0 && value != 1) {
+	ss << "value must be 0 or 1; got " << value;
+	err = -EINVAL;
+	goto reply;
+      }
+      newcrush.set_straw_calc_version(value);
+    } else {
+      ss << "unrecognized tunable '" << tunable << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "adjusted tunable " << tunable << " to " << value;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule create-simple") {
+    string name, root, type, mode;
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "root", root);
+    cmd_getval(cmdmap, "type", type);
+    cmd_getval(cmdmap, "mode", mode);
+    if (mode == "")
+      mode = "firstn";
+
+    if (osdmap.crush->rule_exists(name)) {
+      // The name is uniquely associated with a rule id and the rule it
+      // contains. From the user's point of view, the rule is more meaningful.
+      ss << "rule " << name << " already exists";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (newcrush.rule_exists(name)) {
+      // The name is uniquely associated with a rule id and the rule it
+      // contains. From the user's point of view, the rule is more meaningful.
+      ss << "rule " << name << " already exists";
+      err = 0;
+    } else {
+      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
+					       pg_pool_t::TYPE_REPLICATED, &ss);
+      if (ruleno < 0) {
+	err = ruleno;
+	goto reply;
+      }
+
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule create-replicated") {
+    string name, root, type, device_class;
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "root", root);
+    cmd_getval(cmdmap, "type", type);
+    cmd_getval(cmdmap, "class", device_class);
+
+    if (osdmap.crush->rule_exists(name)) {
+      // The name is uniquely associated with a rule id and the rule it
+      // contains. From the user's point of view, the rule is more meaningful.
+      ss << "rule " << name << " already exists";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (newcrush.rule_exists(name)) {
+      // The name is uniquely associated with a rule id and the rule it
+      // contains. From the user's point of view, the rule is more meaningful.
+      ss << "rule " << name << " already exists";
+      err = 0;
+    } else {
+      int ruleno = newcrush.add_simple_rule(
+	name, root, type, device_class,
+	"firstn", pg_pool_t::TYPE_REPLICATED, &ss);
+      if (ruleno < 0) {
+	err = ruleno;
+	goto reply;
+      }
+
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd erasure-code-profile rm") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+
+    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
+      goto wait;
+
+    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
+      err = -EBUSY;
+      goto reply;
+    }
+
+    if (osdmap.has_erasure_code_profile(name) ||
+	pending_inc.new_erasure_code_profiles.count(name)) {
+      if (osdmap.has_erasure_code_profile(name)) {
+	pending_inc.old_erasure_code_profiles.push_back(name);
+      } else {
+	dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
+	pending_inc.new_erasure_code_profiles.erase(name);
+      }
+
+      getline(ss, rs);
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+							get_last_committed() + 1));
+      return true;
+    } else {
+      ss << "erasure-code-profile " << name << " does not exist";
+      err = 0;
+      goto reply;
+    }
+
+  } else if (prefix == "osd erasure-code-profile set") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+    vector<string> profile;
+    cmd_getval(cmdmap, "profile", profile);
+
+    bool force = false;
+    cmd_getval(cmdmap, "force", force);
+
+    map<string,string> profile_map;
+    err = parse_erasure_code_profile(profile, &profile_map, &ss);
+    if (err)
+      goto reply;
+    if (auto found = profile_map.find("crush-failure-domain");
+	found != profile_map.end()) {
+      const auto& failure_domain = found->second;
+      int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
+      if (failure_domain_type < 0) {
+	ss << "erasure-code-profile " << profile_map
+	  << " contains an invalid failure-domain " << std::quoted(failure_domain);
+	err = -EINVAL;
+	goto reply;
+      }
+    }
+
+    if (profile_map.find("plugin") == profile_map.end()) {
+      ss << "erasure-code-profile " << profile_map
+	 << " must contain a plugin entry" << std::endl;
+      err = -EINVAL;
+      goto reply;
+    }
+    string plugin = profile_map["plugin"];
+
+    if (pending_inc.has_erasure_code_profile(name)) {
+      dout(20) << "erasure code profile " << name << " try again" << dendl;
+      goto wait;
+    } else {
+      err = normalize_profile(name, profile_map, force, &ss);
+      if (err)
+	goto reply;
+
+      if (osdmap.has_erasure_code_profile(name)) {
+	ErasureCodeProfile existing_profile_map =
+	  osdmap.get_erasure_code_profile(name);
+	err = normalize_profile(name, existing_profile_map, force, &ss);
+	if (err)
+	  goto reply;
+
+	if (existing_profile_map == profile_map) {
+	  err = 0;
+	  goto reply;
+	}
+	if (!force) {
+	  err = -EPERM;
+	  ss << "will not override erasure code profile " << name
+	     << " because the existing profile "
+	     << existing_profile_map
+	     << " is different from the proposed profile "
+	     << profile_map;
+	  goto reply;
+	}
+      }
+
+      dout(20) << "erasure code profile set " << name << "="
+	       << profile_map << dendl;
+      pending_inc.set_erasure_code_profile(name, profile_map);
+    }
+
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule create-erasure") {
+    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err)
+      goto reply;
+    string name, poolstr;
+    cmd_getval(cmdmap, "name", name);
+    string profile;
+    cmd_getval(cmdmap, "profile", profile);
+    if (profile == "")
+      profile = "default";
+    if (profile == "default") {
+      if (!osdmap.has_erasure_code_profile(profile)) {
+	if (pending_inc.has_erasure_code_profile(profile)) {
+	  dout(20) << "erasure code profile " << profile << " already pending" << dendl;
+	  goto wait;
+	}
+
+	map<string,string> profile_map;
+	err = osdmap.get_erasure_code_profile_default(cct,
+						      profile_map,
+						      &ss);
+	if (err)
+	  goto reply;
+	err = normalize_profile(name, profile_map, true, &ss);
+	if (err)
+	  goto reply;
+	dout(20) << "erasure code profile set " << profile << "="
+		 << profile_map << dendl;
+	pending_inc.set_erasure_code_profile(profile, profile_map);
+	goto wait;
+      }
+    }
+
+    int rule;
+    err = crush_rule_create_erasure(name, profile, &rule, &ss);
+    if (err < 0) {
+      switch(err) {
+      case -EEXIST: // return immediately
+	ss << "rule " << name << " already exists";
+	err = 0;
+	goto reply;
+	break;
+      case -EALREADY: // wait for pending to be proposed
+	ss << "rule " << name << " already exists";
+	err = 0;
+	break;
+      default: // non recoverable error
+ 	goto reply;
+	break;
+      }
+    } else {
+      ss << "created rule " << name << " at " << rule;
+    }
+
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule rm") {
+    string name;
+    cmd_getval(cmdmap, "name", name);
+
+    if (!osdmap.crush->rule_exists(name)) {
+      ss << "rule " << name << " does not exist";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (!newcrush.rule_exists(name)) {
+      ss << "rule " << name << " does not exist";
+      err = 0;
+    } else {
+      int ruleno = newcrush.get_rule_id(name);
+      ceph_assert(ruleno >= 0);
+
+      // make sure it is not in use.
+      // FIXME: this is ok in some situations, but let's not bother with that
+      // complexity now.
+      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
+      if (osdmap.crush_rule_in_use(ruleset)) {
+	ss << "crush ruleset " << name << " " << ruleset << " is in use";
+	err = -EBUSY;
+	goto reply;
+      }
+
+      err = newcrush.remove_rule(ruleno);
+      if (err < 0) {
+	goto reply;
+      }
+
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule rename") {
+    string srcname;
+    string dstname;
+    cmd_getval(cmdmap, "srcname", srcname);
+    cmd_getval(cmdmap, "dstname", dstname);
+    if (srcname.empty() || dstname.empty()) {
+      ss << "must specify both source rule name and destination rule name";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (srcname == dstname) {
+      ss << "destination rule name is equal to source rule name";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
+      // srcname does not exist and dstname already exists
+      // suppose this is a replay and return success
+      // (so this command is idempotent)
+      ss << "already renamed to '" << dstname << "'";
+      err = 0;
+      goto reply;
+    }
+
+    err = newcrush.rename_rule(srcname, dstname, &ss);
+    if (err < 0) {
+      // ss has reason for failure
+      goto reply;
+    }
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                               get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd setmaxosd") {
+    int64_t newmax;
+    if (!cmd_getval(cmdmap, "newmax", newmax)) {
+      ss << "unable to parse 'newmax' value '"
+         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (newmax > g_conf()->mon_max_osd) {
+      err = -ERANGE;
+      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
+	 << g_conf()->mon_max_osd << ")";
+      goto reply;
+    }
+
+    // Don't allow shrinking OSD number as this will cause data loss
+    // and may cause kernel crashes.
+    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
+    if (newmax < osdmap.get_max_osd()) {
+      // Check whether any OSD ids exist between the new value and the
+      // current max. If any do, don't allow shrinking the number
+      // of OSDs.
+      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
+        if (osdmap.exists(i)) {
+          err = -EBUSY;
+          ss << "cannot shrink max_osd to " << newmax
+             << " because osd." << i << " (and possibly others) still in use";
+          goto reply;
+        }
+      }
+    }
+
+    pending_inc.new_max_osd = newmax;
+    ss << "set new max_osd = " << pending_inc.new_max_osd;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd set-full-ratio" ||
+	     prefix == "osd set-backfillfull-ratio" ||
+             prefix == "osd set-nearfull-ratio") {
+    double n;
+    if (!cmd_getval(cmdmap, "ratio", n)) {
+      ss << "unable to parse 'ratio' value '"
+         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (prefix == "osd set-full-ratio")
+      pending_inc.new_full_ratio = n;
+    else if (prefix == "osd set-backfillfull-ratio")
+      pending_inc.new_backfillfull_ratio = n;
+    else if (prefix == "osd set-nearfull-ratio")
+      pending_inc.new_nearfull_ratio = n;
+    ss << prefix << " " << n;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd set-require-min-compat-client") {
+    string v;
+    cmd_getval(cmdmap, "version", v);
+    ceph_release_t vno = ceph_release_from_name(v);
+    if (!vno) {
+      ss << "version " << v << " is not recognized";
+      err = -EINVAL;
+      goto reply;
+    }
+    OSDMap newmap;
+    newmap.deepish_copy_from(osdmap);
+    newmap.apply_incremental(pending_inc);
+    newmap.require_min_compat_client = vno;
+    auto mvno = newmap.get_min_compat_client();
+    if (vno < mvno) {
+      ss << "osdmap current utilizes features that require " << mvno
+	 << "; cannot set require_min_compat_client below that to " << vno;
+      err = -EPERM;
+      goto reply;
+    }
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      FeatureMap m;
+      mon.get_combined_feature_map(&m);
+      uint64_t features = ceph_release_features(to_integer<int>(vno));
+      bool first = true;
+      bool ok = true;
+      for (int type : {
+	    CEPH_ENTITY_TYPE_CLIENT,
+	    CEPH_ENTITY_TYPE_MDS,
+	    CEPH_ENTITY_TYPE_MGR }) {
+	auto p = m.m.find(type);
+	if (p == m.m.end()) {
+	  continue;
+	}
+	for (auto& q : p->second) {
+	  uint64_t missing = ~q.first & features;
+	  if (missing) {
+	    if (first) {
+	      ss << "cannot set require_min_compat_client to " << v << ": ";
+	    } else {
+	      ss << "; ";
+	    }
+	    first = false;
+	    ss << q.second << " connected " << ceph_entity_type_name(type)
+	       << "(s) look like " << ceph_release_name(
+		 ceph_release_from_features(q.first))
+	       << " (missing 0x" << std::hex << missing << std::dec << ")";
+	    ok = false;
+	  }
+	}
+      }
+      if (!ok) {
+	ss << "; add --yes-i-really-mean-it to do it anyway";
+	err = -EPERM;
+	goto reply;
+      }
+    }
+    ss << "set require_min_compat_client to " << vno;
+    pending_inc.new_require_min_compat_client = vno;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+							  get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pause") {
+    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+  } else if (prefix == "osd unpause") {
+    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+
+  } else if (prefix == "osd set") {
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+    string key;
+    cmd_getval(cmdmap, "key", key);
+    if (key == "pause")
+      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+    else if (key == "noup")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
+    else if (key == "nodown")
+      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
+    else if (key == "noout")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
+    else if (key == "noin")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
+    else if (key == "nobackfill")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
+    else if (key == "norebalance")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
+    else if (key == "norecover")
+      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
+    else if (key == "noscrub")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
+    else if (key == "nodeep-scrub")
+      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+    else if (key == "notieragent")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+    else if (key == "nosnaptrim")
+      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+    else if (key == "pglog_hardlimit") {
+      if (!osdmap.get_num_up_osds() && !sure) {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
+      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+      // we are reusing a jewel feature bit that was retired in luminous.
+      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
+         (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
+          || sure)) {
+	return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
+      } else {
+	ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
+	err = -EPERM;
+	goto reply;
+      }
+    } else {
+      ss << "unrecognized flag '" << key << "'";
+      err = -EINVAL;
+    }
+
+  } else if (prefix == "osd unset") {
+    string key;
+    cmd_getval(cmdmap, "key", key);
+    if (key == "pause")
+      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
+    else if (key == "noup")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
+    else if (key == "nodown")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
+    else if (key == "noout")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
+    else if (key == "noin")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
+    else if (key == "nobackfill")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
+    else if (key == "norebalance")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
+    else if (key == "norecover")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
+    else if (key == "noscrub")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
+    else if (key == "nodeep-scrub")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
+    else if (key == "notieragent")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
+    else if (key == "nosnaptrim")
+      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
+    else {
+      ss << "unrecognized flag '" << key << "'";
+      err = -EINVAL;
+    }
+
+  } else if (prefix == "osd require-osd-release") {
+    string release;
+    cmd_getval(cmdmap, "release", release);
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    ceph_release_t rel = ceph_release_from_name(release.c_str());
+    if (!rel) {
+      ss << "unrecognized release " << release;
+      err = -EINVAL;
+      goto reply;
+    }
+    if (rel == osdmap.require_osd_release) {
+      // idempotent
+      err = 0;
+      goto reply;
+    }
+    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
+    if (!osdmap.get_num_up_osds() && !sure) {
+      ss << "Not advisable to continue since no OSDs are up. Pass "
+	 << "--yes-i-really-mean-it if you really wish to continue.";
+      err = -EPERM;
+      goto reply;
+    }
+    if (rel == ceph_release_t::mimic) {
+      if (!mon.monmap->get_required_features().contains_all(
+	    ceph::features::mon::FEATURE_MIMIC)) {
+	ss << "not all mons are mimic";
+	err = -EPERM;
+	goto reply;
+      }
+      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
+           && !sure) {
+	ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
+	err = -EPERM;
+	goto reply;
+      }
+    } else if (rel == ceph_release_t::nautilus) {
+      if (!mon.monmap->get_required_features().contains_all(
+	    ceph::features::mon::FEATURE_NAUTILUS)) {
+	ss << "not all mons are nautilus";
+	err = -EPERM;
+	goto reply;
+      }
+      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
+           && !sure) {
+	ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
+	err = -EPERM;
+	goto reply;
+      }
+    } else if (rel == ceph_release_t::octopus) {
+      if (!mon.monmap->get_required_features().contains_all(
+	    ceph::features::mon::FEATURE_OCTOPUS)) {
+	ss << "not all mons are octopus";
+	err = -EPERM;
+	goto reply;
+      }
+      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
+           && !sure) {
+	ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
+	err = -EPERM;
+	goto reply;
+      }
+    } else if (rel == ceph_release_t::pacific) {
+      if (!mon.monmap->get_required_features().contains_all(
+	    ceph::features::mon::FEATURE_PACIFIC)) {
+	ss << "not all mons are pacific";
+	err = -EPERM;
+	goto reply;
+      }
+      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
+           && !sure) {
+	ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
+	err = -EPERM;
+	goto reply;
+      }
+    } else {
+      ss << "not supported for this release yet";
+      err = -EPERM;
+      goto reply;
+    }
+    if (rel < osdmap.require_osd_release) {
+      ss << "require_osd_release cannot be lowered once it has been set";
+      err = -EPERM;
+      goto reply;
+    }
+    pending_inc.new_require_osd_release = rel;
+    goto update;
+  } else if (prefix == "osd down" ||
+             prefix == "osd out" ||
+             prefix == "osd in" ||
+             prefix == "osd rm" ||
+             prefix == "osd stop") {
+
+    bool any = false;
+    bool stop = false;
+    bool verbose = true;
+    bool definitely_dead = false;
+
+    vector<string> idvec;
+    cmd_getval(cmdmap, "ids", idvec);
+    cmd_getval(cmdmap, "definitely_dead", definitely_dead);
+    derr << "definitely_dead " << (int)definitely_dead << dendl;
+    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
+      set<int> osds;
+
+      // wildcard?
+      if (j == 0 &&
+          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
+        if (prefix == "osd in") {
+          // touch out osds only
+          osdmap.get_out_existing_osds(osds);
+        } else {
+          osdmap.get_all_osds(osds);
+        }
+        stop = true;
+        verbose = false; // so the output is less noisy.
+      } else {
+        long osd = parse_osd_id(idvec[j].c_str(), &ss);  // a negative result is treated as "not a usable osd id"
+        if (osd < 0) {
+          ss << "invalid osd id '" << idvec[j] << "'. ";  // echo the offending token rather than the error code
+          err = -EINVAL;
+          continue;  // skip this token; the remaining ids are still processed
+        } else if (!osdmap.exists(osd)) {
+          ss << "osd." << osd << " does not exist. ";
+          continue;
+        }
+
+        osds.insert(osd);
+      }
+
+      for (auto &osd : osds) {
+        if (prefix == "osd down") {
+	  if (osdmap.is_down(osd)) {
+            if (verbose)
+	      ss << "osd." << osd << " is already down. ";
+	  } else {
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
+	    ss << "marked down osd." << osd << ". ";
+	    any = true;
+	  }
+	  if (definitely_dead) {
+	    if (!pending_inc.new_xinfo.count(osd)) {
+	      pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+	    }
+	    if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
+	      any = true;
+	    }
+	    pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
+	  }
+        } else if (prefix == "osd out") {
+	  if (osdmap.is_out(osd)) {
+            if (verbose)
+	      ss << "osd." << osd << " is already out. ";
+	  } else {
+	    pending_inc.new_weight[osd] = CEPH_OSD_OUT;
+	    if (osdmap.osd_weight[osd]) {
+	      if (pending_inc.new_xinfo.count(osd) == 0) {
+	        pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+	      }
+	      pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
+	    }
+	    ss << "marked out osd." << osd << ". ";
+            std::ostringstream msg;
+            msg << "Client " << op->get_session()->entity_name
+                << " marked osd." << osd << " out";
+            if (osdmap.is_up(osd)) {
+              msg << ", while it was still marked up";
+            } else {
+              auto period = ceph_clock_now() - down_pending_out[osd];
+              msg << ", after it was down for " << int(period.sec())
+                  << " seconds";
+            }
+
+            mon.clog->info() << msg.str();
+	    any = true;
+	  }
+        } else if (prefix == "osd in") {
+	  if (osdmap.is_in(osd)) {
+            if (verbose)
+	      ss << "osd." << osd << " is already in. ";
+	  } else {
+	    if (osdmap.osd_xinfo[osd].old_weight > 0) {
+	      pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
+	      if (pending_inc.new_xinfo.count(osd) == 0) {
+	        pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
+	      }
+	      pending_inc.new_xinfo[osd].old_weight = 0;
+	    } else {
+	      pending_inc.new_weight[osd] = CEPH_OSD_IN;
+	    }
+	    ss << "marked in osd." << osd << ". ";
+	    any = true;
+	  }
+        } else if (prefix == "osd rm") {
+          err = prepare_command_osd_remove(osd);
+
+          if (err == -EBUSY) {
+	    if (any)
+	      ss << ", ";
+            ss << "osd." << osd << " is still up; must be down before removal. ";
+	  } else {
+            ceph_assert(err == 0);
+	    if (any) {
+	      ss << ", osd." << osd;
+            } else {
+	      ss << "removed osd." << osd;
+            }
+	    any = true;
+	  }
+        } else if (prefix == "osd stop") {
+          if (osdmap.is_stop(osd)) {
+            if (verbose)
+              ss << "osd." << osd << " is already stopped. ";
+          } else if (osdmap.is_down(osd)) {
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
+            ss << "stop down osd." << osd << ". ";
+            any = true;
+          } else {
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
+            ss << "stop osd." << osd << ". ";
+            any = true;
+          }
+        }
+      }
+    }
+    if (any) {
+      getline(ss, rs);
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
+						get_last_committed() + 1));
+      return true;
+    }
+  } else if (prefix == "osd set-group" ||
+             prefix == "osd unset-group" ||
+             prefix == "osd add-noup" ||
+             prefix == "osd add-nodown" ||
+             prefix == "osd add-noin" ||
+             prefix == "osd add-noout" ||
+             prefix == "osd rm-noup" ||
+             prefix == "osd rm-nodown" ||
+             prefix == "osd rm-noin" ||
+             prefix == "osd rm-noout") {
+    bool do_set = prefix == "osd set-group" ||
+                  prefix.find("add") != string::npos;
+    string flag_str;
+    unsigned flags = 0;
+    vector<string> who;
+    if (prefix == "osd set-group" || prefix == "osd unset-group") {
+      cmd_getval(cmdmap, "flags", flag_str);
+      cmd_getval(cmdmap, "who", who);
+      vector<string> raw_flags;
+      boost::split(raw_flags, flag_str, boost::is_any_of(","));
+      for (auto& f : raw_flags) {
+        if (f == "noup")
+          flags |= CEPH_OSD_NOUP;
+        else if (f == "nodown")
+          flags |= CEPH_OSD_NODOWN;
+        else if (f == "noin")
+          flags |= CEPH_OSD_NOIN;
+        else if (f == "noout")
+          flags |= CEPH_OSD_NOOUT;
+        else {
+          ss << "unrecognized flag '" << f << "', must be one of "
+             << "{noup,nodown,noin,noout}";
+          err = -EINVAL;
+          goto reply;
+        }
+      }
+    } else {
+      cmd_getval(cmdmap, "ids", who);
+      if (prefix.find("noup") != string::npos)
+        flags = CEPH_OSD_NOUP;
+      else if (prefix.find("nodown") != string::npos)
+        flags = CEPH_OSD_NODOWN;
+      else if (prefix.find("noin") != string::npos)
+        flags = CEPH_OSD_NOIN;
+      else if (prefix.find("noout") != string::npos)
+        flags = CEPH_OSD_NOOUT;
+      else
+        ceph_assert(0 == "Unreachable!");
+    }
+    if (flags == 0) {  // no flag bit was selected by the parsing above
+      ss << "must specify flag(s) {noup,nodown,noin,noout} to set/unset";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (who.empty()) {
+      ss << "must specify at least one or more targets to set/unset";
+      err = -EINVAL;
+      goto reply;
+    }
+    set<int> osds;
+    set<int> crush_nodes;
+    set<int> device_classes;
+    for (auto& w : who) {
+      if (w == "any" || w == "all" || w == "*") {
+        osdmap.get_all_osds(osds);
+        break;
+      }
+      std::stringstream ts;
+      if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
+        osds.insert(osd);
+      } else if (osdmap.crush->name_exists(w)) {
+        crush_nodes.insert(osdmap.crush->get_item_id(w));
+      } else if (osdmap.crush->class_exists(w)) {
+        device_classes.insert(osdmap.crush->get_class_id(w));
+      } else {
+        ss << "unable to parse osd id or crush node or device class: "
+           << "\"" << w << "\". ";
+      }
+    }
+    if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
+      // ss has reason for failure
+      err = -EINVAL;
+      goto reply;
+    }
+    bool any = false;
+    for (auto osd : osds) {
+      if (!osdmap.exists(osd)) {
+        ss << "osd." << osd << " does not exist. ";
+        continue;
+      }
+      if (do_set) {
+        if (flags & CEPH_OSD_NOUP) {
+          any |= osdmap.is_noup_by_osd(osd) ?
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
+        }
+        if (flags & CEPH_OSD_NODOWN) {
+          any |= osdmap.is_nodown_by_osd(osd) ?
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
+        }
+        if (flags & CEPH_OSD_NOIN) {
+          any |= osdmap.is_noin_by_osd(osd) ?
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
+        }
+        if (flags & CEPH_OSD_NOOUT) {
+          any |= osdmap.is_noout_by_osd(osd) ?
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
+        }
+      } else {
+        if (flags & CEPH_OSD_NOUP) {
+          any |= osdmap.is_noup_by_osd(osd) ?
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
+        }
+        if (flags & CEPH_OSD_NODOWN) {
+          any |= osdmap.is_nodown_by_osd(osd) ?
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
+        }
+        if (flags & CEPH_OSD_NOIN) {
+          any |= osdmap.is_noin_by_osd(osd) ?
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
+        }
+        if (flags & CEPH_OSD_NOOUT) {
+          any |= osdmap.is_noout_by_osd(osd) ?
+            pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
+            pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
+        }
+      }
+    }
+    for (auto& id : crush_nodes) {
+      auto old_flags = osdmap.get_crush_node_flags(id);
+      auto& pending_flags = pending_inc.new_crush_node_flags[id];
+      pending_flags |= old_flags; // adopt existing flags first!
+      if (do_set) {
+        pending_flags |= flags;
+      } else {
+        pending_flags &= ~flags;
+      }
+      any = true;
+    }
+    for (auto& id : device_classes) {
+      auto old_flags = osdmap.get_device_class_flags(id);
+      auto& pending_flags = pending_inc.new_device_class_flags[id];
+      pending_flags |= old_flags;
+      if (do_set) {
+        pending_flags |= flags;
+      } else {
+        pending_flags &= ~flags;
+      }
+      any = true;
+    }
+    if (any) {
+      getline(ss, rs);
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
+                                 get_last_committed() + 1));
+      return true;
+    }
+  } else if (prefix == "osd pg-temp") {
+    string pgidstr;
+    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+      ss << "unable to parse 'pgid' value '"
+         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    pg_t pgid;
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.pg_exists(pgid)) {
+      ss << "pg " << pgid << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (pending_inc.new_pg_temp.count(pgid)) {
+      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+
+    vector<int64_t> id_vec;
+    vector<int32_t> new_pg_temp;
+    cmd_getval(cmdmap, "id", id_vec);
+    if (id_vec.empty())  {
+      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
+      ss << "done cleaning up pg_temp of " << pgid;
+      goto update;
+    }
+    for (auto osd : id_vec) {
+      if (!osdmap.exists(osd)) {
+        ss << "osd." << osd << " does not exist";
+        err = -ENOENT;
+        goto reply;
+      }
+      new_pg_temp.push_back(osd);
+    }
+
+    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+    if ((int)new_pg_temp.size() < pool_min_size) {
+      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
+         << pool_min_size << ")";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    int pool_size = osdmap.get_pg_pool_size(pgid);
+    if ((int)new_pg_temp.size() > pool_size) {
+      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
+         << pool_size << ")";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+      new_pg_temp.begin(), new_pg_temp.end());
+    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
+    goto update;
+  } else if (prefix == "osd primary-temp") {
+    string pgidstr;
+    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+      ss << "unable to parse 'pgid' value '"
+         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    pg_t pgid;
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.pg_exists(pgid)) {
+      ss << "pg " << pgid << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+
+    int64_t osd;
+    if (!cmd_getval(cmdmap, "id", osd)) {
+      ss << "unable to parse 'id' value '"
+         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (osd != -1 && !osdmap.exists(osd)) {
+      ss << "osd." << osd << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+
+    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+	osdmap.require_min_compat_client < ceph_release_t::firefly) {
+      ss << "require_min_compat_client "
+	 << osdmap.require_min_compat_client
+	 << " < firefly, which is required for primary-temp";
+      err = -EPERM;
+      goto reply;
+    }
+
+    pending_inc.new_primary_temp[pgid] = osd;
+    ss << "set " << pgid << " primary_temp mapping to " << osd;
+    goto update;
+  } else if (prefix == "pg repeer") {
+    pg_t pgid;
+    string pgidstr;
+    cmd_getval(cmdmap, "pgid", pgidstr);
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.pg_exists(pgid)) {
+      ss << "pg '" << pgidstr << "' does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    vector<int> acting;
+    int primary;
+    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
+    if (primary < 0) {
+      err = -EAGAIN;
+      ss << "pg currently has no primary";
+      goto reply;
+    }
+    if (acting.size() > 1) {
+      // map to just primary; it will map back to what it wants
+      pending_inc.new_pg_temp[pgid] = { primary };
+    } else {
+      // hmm, pick another arbitrary osd to induce a change.  Note
+      // that this won't work if there is only one suitable OSD in the cluster.
+      int i;
+      bool done = false;
+      for (i = 0; i < osdmap.get_max_osd(); ++i) {
+	if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
+	  continue;
+	}
+	pending_inc.new_pg_temp[pgid] = { primary, i };
+	done = true;
+	break;
+      }
+      if (!done) {
+	err = -EAGAIN;
+	ss << "not enough up OSDs in the cluster to force repeer";
+	goto reply;
+      }
+    }
+    goto update;
+  } else if (prefix == "osd pg-upmap" ||
+             prefix == "osd rm-pg-upmap" ||
+             prefix == "osd pg-upmap-items" ||
+             prefix == "osd rm-pg-upmap-items") {
+    if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
+      ss << "min_compat_client "
+	 << osdmap.require_min_compat_client
+	 << " < luminous, which is required for pg-upmap. "
+         << "Try 'ceph osd set-require-min-compat-client luminous' "
+         << "before using the new interface";
+      err = -EPERM;
+      goto reply;
+    }
+    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err < 0)
+      goto reply;
+    string pgidstr;
+    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+      ss << "unable to parse 'pgid' value '"
+         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    pg_t pgid;
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.pg_exists(pgid)) {
+      ss << "pg " << pgid << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    if (pending_inc.old_pools.count(pgid.pool())) {
+      ss << "pool of " << pgid << " is pending removal";
+      err = -ENOENT;
+      getline(ss, rs);
+      wait_for_finished_proposal(op,
+        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
+      return true;
+    }
+
+    enum {
+      OP_PG_UPMAP,
+      OP_RM_PG_UPMAP,
+      OP_PG_UPMAP_ITEMS,
+      OP_RM_PG_UPMAP_ITEMS,
+    } option;
+
+    if (prefix == "osd pg-upmap") {
+      option = OP_PG_UPMAP;
+    } else if (prefix == "osd rm-pg-upmap") {
+      option = OP_RM_PG_UPMAP;
+    } else if (prefix == "osd pg-upmap-items") {
+      option = OP_PG_UPMAP_ITEMS;
+    } else {
+      option = OP_RM_PG_UPMAP_ITEMS;
+    }
+
+    // check pending upmap changes
+    switch (option) {
+    case OP_PG_UPMAP: // fall through
+    case OP_RM_PG_UPMAP:
+      if (pending_inc.new_pg_upmap.count(pgid) ||
+          pending_inc.old_pg_upmap.count(pgid)) {
+        dout(10) << __func__ << " waiting for pending update on "
+                 << pgid << dendl;
+        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+        return true;
+      }
+      break;
+
+    case OP_PG_UPMAP_ITEMS: // fall through
+    case OP_RM_PG_UPMAP_ITEMS:
+      if (pending_inc.new_pg_upmap_items.count(pgid) ||
+          pending_inc.old_pg_upmap_items.count(pgid)) {
+        dout(10) << __func__ << " waiting for pending update on "
+                 << pgid << dendl;
+        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+        return true;
+      }
+      break;
+
+    default:
+      ceph_abort_msg("invalid option");
+    }
+
+    switch (option) {
+    case OP_PG_UPMAP:
+      {
+        vector<int64_t> id_vec;
+        if (!cmd_getval(cmdmap, "id", id_vec)) {
+          ss << "unable to parse 'id' value(s) '"
+             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
+        if ((int)id_vec.size() < pool_min_size) {
+          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
+             << pool_min_size << ")";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        int pool_size = osdmap.get_pg_pool_size(pgid);
+        if ((int)id_vec.size() > pool_size) {
+          ss << "num of osds (" << id_vec.size() <<") > pool size ("
+             << pool_size << ")";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        vector<int32_t> new_pg_upmap;
+        for (auto osd : id_vec) {
+          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
+            ss << "osd." << osd << " does not exist";
+            err = -ENOENT;
+            goto reply;
+          }
+          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
+          if (it != new_pg_upmap.end()) {
+            ss << "osd." << osd << " already exists, ";
+            continue;
+          }
+          new_pg_upmap.push_back(osd);
+        }
+
+        if (new_pg_upmap.empty()) {
+          ss << "no valid upmap items(pairs) is specified";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+          new_pg_upmap.begin(), new_pg_upmap.end());
+        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
+      }
+      break;
+
+    case OP_RM_PG_UPMAP:
+      {
+        pending_inc.old_pg_upmap.insert(pgid);
+        ss << "clear " << pgid << " pg_upmap mapping";
+      }
+      break;
+
+    case OP_PG_UPMAP_ITEMS:
+      {
+        vector<int64_t> id_vec;
+        if (!cmd_getval(cmdmap, "id", id_vec)) {
+          ss << "unable to parse 'id' value(s) '"
+             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        if (id_vec.size() % 2) {
+          ss << "you must specify pairs of osd ids to be remapped";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        int pool_size = osdmap.get_pg_pool_size(pgid);
+        if ((int)(id_vec.size() / 2) > pool_size) {
+          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
+             << pool_size << ")";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+        ostringstream items;
+        items << "[";
+        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
+          int from = *p++;
+          int to = *p;
+          if (from == to) {
+            ss << "from osd." << from << " == to osd." << to << ", ";
+            continue;
+          }
+          if (!osdmap.exists(from)) {
+            ss << "osd." << from << " does not exist";
+            err = -ENOENT;
+            goto reply;
+          }
+          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
+            ss << "osd." << to << " does not exist";
+            err = -ENOENT;
+            goto reply;
+          }
+          pair<int32_t,int32_t> entry = make_pair(from, to);
+          auto it = std::find(new_pg_upmap_items.begin(),
+            new_pg_upmap_items.end(), entry);
+          if (it != new_pg_upmap_items.end()) {
+            ss << "osd." << from << " -> osd." << to << " already exists, ";
+            continue;
+          }
+          new_pg_upmap_items.push_back(entry);
+          items << from << "->" << to << ",";
+        }
+        string out(items.str());
+        out.resize(out.size() - 1); // drop last ','
+        out += "]";
+
+        if (new_pg_upmap_items.empty()) {
+          ss << "no valid upmap items(pairs) is specified";
+          err = -EINVAL;
+          goto reply;
+        }
+
+        pending_inc.new_pg_upmap_items[pgid] =
+          mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
+      }
+      break;
+
+    case OP_RM_PG_UPMAP_ITEMS:
+      {
+        pending_inc.old_pg_upmap_items.insert(pgid);
+        ss << "clear " << pgid << " pg_upmap_items mapping";
+      }
+      break;
+
+    default:
+      ceph_abort_msg("invalid option");
+    }
+
+    goto update;
+  } else if (prefix == "osd primary-affinity") {
+    int64_t id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      ss << "invalid osd id value '"
+         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    double w;
+    if (!cmd_getval(cmdmap, "weight", w)) {
+      ss << "unable to parse 'weight' value '"
+	 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
+    if (ww < 0L) {
+      ss << "weight must be >= 0";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
+	osdmap.require_min_compat_client < ceph_release_t::firefly) {
+      ss << "require_min_compat_client "
+	 << osdmap.require_min_compat_client
+	 << " < firefly, which is required for primary-affinity";
+      err = -EPERM;
+      goto reply;
+    }
+    if (osdmap.exists(id)) {
+      pending_inc.new_primary_affinity[id] = ww;  // stage the scaled value in the pending incremental
+      ss << "set osd." << id << " primary-affinity to " << w << " (" << std::hex << ww << std::dec << ")";  // hex via stream manipulators, matching "osd reweight"
+      getline(ss, rs);  // rs carries the first line of ss into the deferred reply
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                                                get_last_committed() + 1));
+      return true;
+    } else {
+      ss << "osd." << id << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+  } else if (prefix == "osd reweight") {
+    int64_t id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      ss << "unable to parse osd id value '"
+         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    double w;
+    if (!cmd_getval(cmdmap, "weight", w)) {
+      ss << "unable to parse weight value '"
+         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    long ww = (int)((double)CEPH_OSD_IN*w);
+    if (ww < 0L) {
+      ss << "weight must be >= 0";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (osdmap.exists(id)) {
+      pending_inc.new_weight[id] = ww;
+      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
+      getline(ss, rs);
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						get_last_committed() + 1));
+      return true;
+    } else {
+      ss << "osd." << id << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+  } else if (prefix == "osd reweightn") {
+    map<int32_t, uint32_t> weights;
+    err = parse_reweights(cct, cmdmap, osdmap, &weights);
+    if (err) {
+      ss << "unable to parse 'weights' value '"
+         << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
+      goto reply;
+    }
+    pending_inc.new_weight.insert(weights.begin(), weights.end());
+    wait_for_finished_proposal(
+	op,
+	new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd lost") {
+    int64_t id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      ss << "unable to parse osd id value '"
+         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "are you SURE?  this might mean real, permanent data loss.  pass "
+	    "--yes-i-really-mean-it if you really do.";
+      err = -EPERM;
+      goto reply;
+    } else if (!osdmap.exists(id)) {
+      ss << "osd." << id << " does not exist";
+      err = -ENOENT;
+      goto reply;
+    } else if (!osdmap.is_down(id)) {
+      ss << "osd." << id << " is not down";
+      err = -EBUSY;
+      goto reply;
+    } else {
+      epoch_t e = osdmap.get_info(id).down_at;
+      pending_inc.new_lost[id] = e;
+      ss << "marked osd lost in epoch " << e;
+      getline(ss, rs);
+      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						get_last_committed() + 1));
+      return true;
+    }
+
+  } else if (prefix == "osd destroy-actual" ||
+	     prefix == "osd purge-actual" ||
+	     prefix == "osd purge-new") {
+    /* Destroying an OSD means that we don't expect to further make use of
+     * the OSD's data (which may even become unreadable after this operation),
+     * and that we are okay with scrubbing all its cephx keys and config-key
+     * data (which may include lockbox keys, thus rendering the osd's data
+     * unreadable).
+     *
+     * The OSD will not be removed. Instead, we will mark it as destroyed,
+     * such that a subsequent call to `create` will not reuse the osd id.
+     * This will play into being able to recreate the OSD, at the same
+     * crush location, with minimal data movement.
+     */
+
+    // make sure authmon is writeable.
+    if (!mon.authmon()->is_writeable()) {
+      dout(10) << __func__ << " waiting for auth mon to be writeable for "
+               << "osd destroy" << dendl;
+      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return false;
+    }
+
+    int64_t id;
+    if (!cmd_getval(cmdmap, "id", id)) {
+      auto p = cmdmap.find("id");  // distinguish a missing argument from an unparsable one
+      if (p == cmdmap.end()) {
+	ss << "no osd id specified";
+      } else {
+	ss << "unable to parse osd id value '"
+	   << cmd_vartype_stringify(p->second) << "'";  // close the quote opened above
+      }
+      err = -EINVAL;
+      goto reply;
+    }
+
+    bool is_destroy = (prefix == "osd destroy-actual");
+    if (!is_destroy) {
+      ceph_assert("osd purge-actual" == prefix ||
+	     "osd purge-new" == prefix);
+    }
+
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "Are you SURE?  Did you verify with 'ceph osd safe-to-destroy'?  "
+	 << "This will mean real, permanent data loss, as well "
+         << "as deletion of cephx and lockbox keys. "
+	 << "Pass --yes-i-really-mean-it if you really do.";
+      err = -EPERM;
+      goto reply;
+    } else if (!osdmap.exists(id)) {
+      ss << "osd." << id << " does not exist";
+      err = 0; // idempotent
+      goto reply;
+    } else if (osdmap.is_up(id)) {
+      ss << "osd." << id << " is not `down`.";
+      err = -EBUSY;
+      goto reply;
+    } else if (is_destroy && osdmap.is_destroyed(id)) {
+      ss << "destroyed osd." << id;
+      err = 0;
+      goto reply;
+    }
+
+    if (prefix == "osd purge-new" &&
+	(osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
+      ss << "osd." << id << " is not new";
+      err = -EPERM;
+      goto reply;
+    }
+
+    bool goto_reply = false;
+
+    paxos.plug();
+    if (is_destroy) {
+      err = prepare_command_osd_destroy(id, ss);
+      // we checked above that it should exist.
+      ceph_assert(err != -ENOENT);
+    } else {
+      err = prepare_command_osd_purge(id, ss);
+      if (err == -ENOENT) {
+        err = 0;
+        ss << "osd." << id << " does not exist.";
+        goto_reply = true;
+      }
+    }
+    paxos.unplug();
+
+    if (err < 0 || goto_reply) {
+      goto reply;
+    }
+
+    if (is_destroy) {
+      ss << "destroyed osd." << id;
+    } else {
+      ss << "purged osd." << id;
+    }
+
+    getline(ss, rs);
+    wait_for_finished_proposal(op,
+        new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
+    force_immediate_propose();
+    return true;
+
+  } else if (prefix == "osd new") {
+
+    // make sure authmon is writeable.
+    if (!mon.authmon()->is_writeable()) {
+      dout(10) << __func__ << " waiting for auth mon to be writeable for "
+               << "osd new" << dendl;
+      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return false;
+    }
+
+    // make sure kvmon is writeable.
+    if (!mon.kvmon()->is_writeable()) {
+      dout(10) << __func__ << " waiting for kv mon to be writeable for "
+               << "osd new" << dendl;
+      mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return false;
+    }
+
+    map<string,string> param_map;
+
+    bufferlist bl = m->get_data();
+    string param_json = bl.to_str();
+    dout(20) << __func__ << " osd new json = " << param_json << dendl;
+
+    err = get_json_str_map(param_json, ss, &param_map);
+    if (err < 0)
+      goto reply;
+
+    dout(20) << __func__ << " osd new params " << param_map << dendl;
+
+    paxos.plug();
+    err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
+    paxos.unplug();
+
+    if (err < 0) {
+      goto reply;
+    }
+
+    if (f) {
+      f->flush(rdata);
+    } else {
+      rdata.append(ss);
+    }
+
+    if (err == EEXIST) {
+      // idempotent operation
+      err = 0;
+      goto reply;
+    }
+
+    wait_for_finished_proposal(op,
+        new Monitor::C_Command(mon, op, 0, rs, rdata,
+                               get_last_committed() + 1));
+    force_immediate_propose();
+    return true;
+
+  } else if (prefix == "osd create") {
+
+    // optional id provided?
+    int64_t id = -1, cmd_id = -1;
+    if (cmd_getval(cmdmap, "id", cmd_id)) {
+      if (cmd_id < 0) {
+	ss << "invalid osd id value '" << cmd_id << "'";
+	err = -EINVAL;
+	goto reply;
+      }
+      dout(10) << " osd create got id " << cmd_id << dendl;
+    }
+
+    uuid_d uuid;
+    string uuidstr;
+    if (cmd_getval(cmdmap, "uuid", uuidstr)) {
+      if (!uuid.parse(uuidstr.c_str())) {
+        ss << "invalid uuid value '" << uuidstr << "'";
+        err = -EINVAL;
+        goto reply;
+      }
+      // we only care about the id if we also have the uuid, to
+      // ensure the operation's idempotency.
+      id = cmd_id;
+    }
+
+    int32_t new_id = -1;
+    err = prepare_command_osd_create(id, uuid, &new_id, ss);
+    if (err < 0) {
+      if (err == -EAGAIN) {
+        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+        return true;
+      }
+      // a check has failed; reply to the user.
+      goto reply;
+
+    } else if (err == EEXIST) {
+      // this is an idempotent operation; we can go ahead and reply.
+      if (f) {
+        f->open_object_section("created_osd");
+        f->dump_int("osdid", new_id);
+        f->close_section();
+        f->flush(rdata);
+      } else {
+        ss << new_id;
+        rdata.append(ss);
+      }
+      err = 0;
+      goto reply;
+    }
+
+    string empty_device_class;
+    do_osd_create(id, uuid, empty_device_class, &new_id);
+
+    if (f) {
+      f->open_object_section("created_osd");
+      f->dump_int("osdid", new_id);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      ss << new_id;
+      rdata.append(ss);
+    }
+    wait_for_finished_proposal(op,
+        new Monitor::C_Command(mon, op, 0, rs, rdata,
+                               get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd blocklist clear" ||
+	     prefix == "osd blacklist clear") {
+    pending_inc.new_blocklist.clear();
+    std::list<std::pair<entity_addr_t,utime_t > > blocklist;
+    std::list<std::pair<entity_addr_t,utime_t > > range_b;
+    osdmap.get_blocklist(&blocklist, &range_b);
+    for (const auto &entry : blocklist) {
+      pending_inc.old_blocklist.push_back(entry.first);
+    }
+    for (const auto &entry : range_b) {
+      pending_inc.old_range_blocklist.push_back(entry.first);
+    }
+    ss << " removed all blocklist entries";
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+                                              get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd blocklist" ||
+	     prefix == "osd blacklist") {
+    string addrstr, rangestr;
+    bool range = false;
+    cmd_getval(cmdmap, "addr", addrstr);
+    if (cmd_getval(cmdmap, "range", rangestr)) {
+      if (rangestr == "range") {
+	range = true;
+      } else {
+	ss << "Did you mean to specify \"osd blocklist range\"?";
+	err = -EINVAL;
+	goto reply;
+      }
+    }
+    entity_addr_t addr;
+    if (!addr.parse(addrstr.c_str(), 0)) {
+      ss << "unable to parse address " << addrstr;
+      err = -EINVAL;
+      goto reply;
+    }
+    else {
+      if (range) {
+	if (!addr.maybe_cidr()) {
+	  ss << "You specified a range command, but " << addr
+	     << " does not parse as a CIDR range";
+	  err = -EINVAL;
+	  goto reply;
+	}
+	addr.type = entity_addr_t::TYPE_CIDR;
+	err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
+	if (err) {
+	  goto reply;
+	}
+	if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
+	    (addr.is_ipv6() && addr.get_nonce() > 128)) {
+	  ss << "Too many bits in range for that protocol!";
+	  err = -EINVAL;
+	  goto reply;
+	}
+      } else {
+	if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
+	  // always blocklist type ANY
+	  addr.set_type(entity_addr_t::TYPE_ANY);
+	} else {
+	  addr.set_type(entity_addr_t::TYPE_LEGACY);
+	}
+      }
+
+      string blocklistop;
+      if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
+	cmd_getval(cmdmap, "blacklistop", blocklistop);
+      }
+      if (blocklistop == "add") {
+	utime_t expires = ceph_clock_now();
+	double d;
+	// default one hour
+	cmd_getval(cmdmap, "expire", d,
+          g_conf()->mon_osd_blocklist_default_expire);
+	expires += d;
+
+	auto add_to_pending_blocklists = [](auto& nb, auto& ob,
+					    const auto& addr,
+					    const auto& expires) {
+	  nb[addr] = expires;
+	  // cancel any pending un-blocklisting request too
+	  auto it = std::find(ob.begin(),
+			      ob.end(), addr);
+	  if (it != ob.end()) {
+	    ob.erase(it);
+	  }
+	};
+	if (range) {
+	  add_to_pending_blocklists(pending_inc.new_range_blocklist,
+				    pending_inc.old_range_blocklist,
+				    addr, expires);
+
+	} else {
+	  add_to_pending_blocklists(pending_inc.new_blocklist,
+				    pending_inc.old_blocklist,
+				    addr, expires);
+	}
+
+	ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
+	getline(ss, rs);
+	wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						  get_last_committed() + 1));
+	return true;
+      } else if (blocklistop == "rm") {
+	auto rm_from_pending_blocklists = [](const auto& addr,
+					     auto& blocklist,
+					     auto& ob, auto& pb) {
+	  if (blocklist.count(addr)) {
+	    ob.push_back(addr);
+	    return true;
+	  } else if (pb.count(addr)) {
+	    pb.erase(addr);
+	    return true;
+	  }
+	  return false;
+	};
+	if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
+						  pending_inc.old_blocklist,
+						  pending_inc.new_blocklist)) ||
+	    (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
+						 pending_inc.old_range_blocklist,
+						 pending_inc.new_range_blocklist))) {
+	  ss << "un-blocklisting " << addr;
+	  getline(ss, rs);
+	  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						    get_last_committed() + 1));
+	  return true;
+	}
+	ss << addr << " isn't blocklisted";
+	err = 0;
+	goto reply;
+      }
+    }
+  } else if (prefix == "osd pool mksnap") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string snapname;
+    cmd_getval(cmdmap, "snap", snapname);
+    const pg_pool_t *p = osdmap.get_pg_pool(pool);
+    if (p->is_unmanaged_snaps_mode()) {
+      ss << "pool " << poolstr << " is in unmanaged snaps mode";
+      err = -EINVAL;
+      goto reply;
+    } else if (p->snap_exists(snapname.c_str())) {
+      ss << "pool " << poolstr << " snap " << snapname << " already exists";
+      err = 0;
+      goto reply;
+    } else if (p->is_tier()) {
+      ss << "pool " << poolstr << " is a cache tier";
+      err = -EINVAL;
+      goto reply;
+    }
+    pg_pool_t *pp = 0;
+    if (pending_inc.new_pools.count(pool))
+      pp = &pending_inc.new_pools[pool];
+    if (!pp) {
+      pp = &pending_inc.new_pools[pool];
+      *pp = *p;
+    }
+    if (pp->snap_exists(snapname.c_str())) {
+      ss << "pool " << poolstr << " snap " << snapname << " already exists";
+    } else {
+      pp->add_snap(snapname.c_str(), ceph_clock_now());
+      pp->set_snap_epoch(pending_inc.epoch);
+      ss << "created pool " << poolstr << " snap " << snapname;
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pool rmsnap") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string snapname;
+    cmd_getval(cmdmap, "snap", snapname);
+    const pg_pool_t *p = osdmap.get_pg_pool(pool);
+    if (p->is_unmanaged_snaps_mode()) {
+      ss << "pool " << poolstr << " is in unmanaged snaps mode";
+      err = -EINVAL;
+      goto reply;
+    } else if (!p->snap_exists(snapname.c_str())) {
+      ss << "pool " << poolstr << " snap " << snapname << " does not exist";
+      err = 0;
+      goto reply;
+    }
+    pg_pool_t *pp = 0;
+    if (pending_inc.new_pools.count(pool))
+      pp = &pending_inc.new_pools[pool];
+    if (!pp) {
+      pp = &pending_inc.new_pools[pool];
+      *pp = *p;
+    }
+    snapid_t sn = pp->snap_exists(snapname.c_str());
+    if (sn) {
+      pp->remove_snap(sn);
+      pp->set_snap_epoch(pending_inc.epoch);
+      ss << "removed pool " << poolstr << " snap " << snapname;
+    } else {
+      ss << "already removed pool " << poolstr << " snap " << snapname;
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pool create") {
+    int64_t pg_num, pgp_num, pg_num_min, pg_num_max;
+    cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
+    cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
+    cmd_getval(cmdmap, "pg_num_max", pg_num_max, int64_t(0));
+    cmd_getval(cmdmap, "pgp_num", pgp_num, int64_t(pg_num));
+    string pool_type_str;
+    cmd_getval(cmdmap, "pool_type", pool_type_str);
+    if (pool_type_str.empty())
+      pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
+
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id >= 0) {
+      const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+      if (pool_type_str != p->get_type_name()) {
+	ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
+ 	err = -EINVAL;
+      } else {
+	ss << "pool '" << poolstr << "' already exists";
+	err = 0;
+      }
+      goto reply;
+    }
+
+    int pool_type;
+    if (pool_type_str == "replicated") {
+      pool_type = pg_pool_t::TYPE_REPLICATED;
+    } else if (pool_type_str == "erasure") {
+      pool_type = pg_pool_t::TYPE_ERASURE;
+    } else {
+      ss << "unknown pool type '" << pool_type_str << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    bool implicit_rule_creation = false;
+    int64_t expected_num_objects = 0;
+    string rule_name;
+    cmd_getval(cmdmap, "rule", rule_name);
+    string erasure_code_profile;
+    cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
+
+    if (pool_type == pg_pool_t::TYPE_ERASURE) {
+      if (erasure_code_profile == "")
+	erasure_code_profile = "default";
+      //handle the erasure code profile
+      if (erasure_code_profile == "default") {
+	if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+	  if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+	    dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+	    goto wait;
+	  }
+
+	  map<string,string> profile_map;
+	  err = osdmap.get_erasure_code_profile_default(cct,
+						      profile_map,
+						      &ss);
+	  if (err)
+	    goto reply;
+	  dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+	  pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+	  goto wait;
+	}
+      }
+      if (rule_name == "") {
+	implicit_rule_creation = true;
+	if (erasure_code_profile == "default") {
+	  rule_name = "erasure-code";
+	} else {
+	  dout(1) << "implicitly use rule named after the pool: "
+		<< poolstr << dendl;
+	  rule_name = poolstr;
+	}
+      }
+      cmd_getval(cmdmap, "expected_num_objects",
+                 expected_num_objects, int64_t(0));
+    } else {
+      // NOTE: for a replicated pool, cmd_map puts rule_name in the
+      //       erasure_code_profile field and expected_num_objects in the rule field
+      if (erasure_code_profile != "") { // cmd is from CLI
+        if (rule_name != "") {
+          string interr;
+          expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
+          if (interr.length()) {
+            ss << "error parsing integer value '" << rule_name << "': " << interr;
+            err = -EINVAL;
+            goto reply;
+          }
+        }
+        rule_name = erasure_code_profile;
+      } else { // cmd is well-formed
+        cmd_getval(cmdmap, "expected_num_objects",
+                   expected_num_objects, int64_t(0));
+      }
+    }
+
+    if (!implicit_rule_creation && rule_name != "") {
+      int rule;
+      err = get_crush_rule(rule_name, &rule, &ss);
+      if (err == -EAGAIN) {
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+	return true;
+      }
+      if (err)
+	goto reply;
+    }
+
+    if (expected_num_objects < 0) {
+      ss << "'expected_num_objects' must be non-negative";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    set<int32_t> osds;
+    osdmap.get_all_osds(osds);
+    bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
+      string type;
+      if (!get_osd_objectstore_type(osd, &type)) {
+        return type == "filestore";
+      } else {
+        return false;
+      }
+    });
+
+    if (has_filestore_osd &&
+        expected_num_objects > 0 &&
+        cct->_conf->filestore_merge_threshold > 0) {
+      ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (has_filestore_osd &&
+        expected_num_objects == 0 &&
+        cct->_conf->filestore_merge_threshold < 0) {
+      int osds = osdmap.get_num_osds();
+      bool sure = false;
+      cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+      if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
+        ss << "For better initial performance on pools expected to store a "
+           << "large number of objects, consider supplying the "
+           << "expected_num_objects parameter when creating the pool."
+           << " Pass --yes-i-really-mean-it to ignore it";
+        err = -EPERM;
+        goto reply;
+      }
+    }
+
+    int64_t fast_read_param;
+    cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
+    FastReadType fast_read = FAST_READ_DEFAULT;
+    if (fast_read_param == 0)
+      fast_read = FAST_READ_OFF;
+    else if (fast_read_param > 0)
+      fast_read = FAST_READ_ON;
+
+    int64_t repl_size = 0;
+    cmd_getval(cmdmap, "size", repl_size);
+    int64_t target_size_bytes = 0;
+    double target_size_ratio = 0.0;
+    cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
+    cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
+
+    string pg_autoscale_mode;
+    cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
+
+    bool bulk = 0;
+    cmd_getval(cmdmap, "bulk", bulk);
+    err = prepare_new_pool(poolstr,
+			   -1, // default crush rule
+			   rule_name,
+			   pg_num, pgp_num, pg_num_min, pg_num_max,
+                           repl_size, target_size_bytes, target_size_ratio,
+			   erasure_code_profile, pool_type,
+                           (uint64_t)expected_num_objects,
+                           fast_read,
+			   pg_autoscale_mode,
+			   bulk,
+			   &ss);
+    if (err < 0) {
+      switch(err) {
+      case -EEXIST:
+	ss << "pool '" << poolstr << "' already exists";
+	break;
+      case -EAGAIN:
+	wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+	return true;
+      case -ERANGE:
+        goto reply;
+      default:
+	goto reply;
+	break;
+      }
+    } else {
+      ss << "pool '" << poolstr << "' created";
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd pool delete" ||
+             prefix == "osd pool rm") {
+    // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
+    string poolstr, poolstr2, sure;
+    cmd_getval(cmdmap, "pool", poolstr);
+    cmd_getval(cmdmap, "pool2", poolstr2);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {
+      ss << "pool '" << poolstr << "' does not exist";
+      err = 0;
+      goto reply;
+    }
+
+    bool force_no_fake = false;
+    cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
+    bool force = false;
+    cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
+    if (poolstr2 != poolstr ||
+	(!force && !force_no_fake)) {
+      ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
+	 << ".  If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
+	 << "followed by --yes-i-really-really-mean-it.";
+      err = -EPERM;
+      goto reply;
+    }
+    err = _prepare_remove_pool(pool, &ss, force_no_fake);
+    if (err == -EAGAIN) {
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+    if (err < 0)
+      goto reply;
+    goto update;
+  } else if (prefix == "osd pool rename") {
+    string srcpoolstr, destpoolstr;
+    cmd_getval(cmdmap, "srcpool", srcpoolstr);
+    cmd_getval(cmdmap, "destpool", destpoolstr);
+    int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
+    int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
+
+    if (pool_src < 0) {
+      if (pool_dst >= 0) {
+        // src pool doesn't exist, dst pool does exist: to ensure idempotency
+        // of operations, assume this rename succeeded, as it is not changing
+        // the current state.  Make sure we output something understandable
+        // for whoever is issuing the command, if they are paying attention,
+        // in case it was not intentional; or to avoid a "wtf?" and a bug
+        // report in case it was intentional, while expecting a failure.
+        ss << "pool '" << srcpoolstr << "' does not exist; pool '"
+          << destpoolstr << "' does -- assuming successful rename";
+        err = 0;
+      } else {
+        ss << "unrecognized pool '" << srcpoolstr << "'";
+        err = -ENOENT;
+      }
+      goto reply;
+    } else if (pool_dst >= 0) {
+      // source pool exists and so does the destination pool
+      ss << "pool '" << destpoolstr << "' already exists";
+      err = -EEXIST;
+      goto reply;
+    }
+
+    int ret = _prepare_rename_pool(pool_src, destpoolstr);
+    if (ret == 0) {
+      ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
+    } else {
+      ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
+        << cpp_strerror(ret);
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd pool set") {
+    err = prepare_command_pool_set(cmdmap, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err < 0)
+      goto reply;
+
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+						   get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier add") {
+    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err)
+      goto reply;
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string tierpoolstr;
+    cmd_getval(cmdmap, "tierpool", tierpoolstr);
+    int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+    if (tierpool_id < 0) {
+      ss << "unrecognized pool '" << tierpoolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+    ceph_assert(tp);
+
+    if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+      goto reply;
+    }
+
+    // make sure new tier is empty
+    string force_nonempty;
+    cmd_getval(cmdmap, "force_nonempty", force_nonempty);
+    const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
+    if (pstats && pstats->stats.sum.num_objects != 0 &&
+	force_nonempty != "--force-nonempty") {
+      ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
+      err = -ENOTEMPTY;
+      goto reply;
+    }
+    if (tp->is_erasure()) {
+      ss << "tier pool '" << tierpoolstr
+	 << "' is an ec pool, which cannot be a tier";
+      err = -ENOTSUP;
+      goto reply;
+    }
+    if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
+	((force_nonempty != "--force-nonempty") ||
+	 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
+      ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
+      err = -ENOTEMPTY;
+      goto reply;
+    }
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+    if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+    np->tiers.insert(tierpool_id);
+    np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+    ntp->tier_of = pool_id;
+    ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier remove" ||
+             prefix == "osd tier rm") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string tierpoolstr;
+    cmd_getval(cmdmap, "tierpool", tierpoolstr);
+    int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+    if (tierpool_id < 0) {
+      ss << "unrecognized pool '" << tierpoolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+    ceph_assert(tp);
+
+    if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
+      goto reply;
+    }
+
+    if (p->tiers.count(tierpool_id) == 0) {
+      ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+      err = 0;
+      goto reply;
+    }
+    if (tp->tier_of != pool_id) {
+      ss << "tier pool '" << tierpoolstr << "' is a tier of '"
+         << osdmap.get_pool_name(tp->tier_of) << "': "
+         // be scary about it; this is an inconsistency and bells must go off
+         << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (p->read_tier == tierpool_id) {
+      ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
+      err = -EBUSY;
+      goto reply;
+    }
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+    if (np->tiers.count(tierpool_id) == 0 ||
+	ntp->tier_of != pool_id ||
+	np->read_tier == tierpool_id) {
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+    np->tiers.erase(tierpool_id);
+    ntp->clear_tier();
+    ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier set-overlay") {
+    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err)
+      goto reply;
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string overlaypoolstr;
+    cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
+    int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
+    if (overlaypool_id < 0) {
+      ss << "unrecognized pool '" << overlaypoolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
+    ceph_assert(overlay_p);
+    if (p->tiers.count(overlaypool_id) == 0) {
+      ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (p->read_tier == overlaypool_id) {
+      err = 0;
+      ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+      goto reply;
+    }
+    if (p->has_read_tier()) {
+      ss << "pool '" << poolstr << "' has overlay '"
+	 << osdmap.get_pool_name(p->read_tier)
+	 << "'; please remove-overlay first";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    np->read_tier = overlaypool_id;
+    np->write_tier = overlaypool_id;
+    np->set_last_force_op_resend(pending_inc.epoch);
+    pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
+    noverlay_p->set_last_force_op_resend(pending_inc.epoch);
+    ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+    if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
+      ss <<" (WARNING: overlay pool cache_mode is still NONE)";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier remove-overlay" ||
+             prefix == "osd tier rm-overlay") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    if (!p->has_read_tier()) {
+      err = 0;
+      ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+      goto reply;
+    }
+
+    if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
+      goto reply;
+    }
+
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    if (np->has_read_tier()) {
+      const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
+      pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
+      nop->set_last_force_op_resend(pending_inc.epoch);
+    }
+    if (np->has_write_tier()) {
+      const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
+      pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
+      nop->set_last_force_op_resend(pending_inc.epoch);
+    }
+    np->clear_read_tier();
+    np->clear_write_tier();
+    np->set_last_force_op_resend(pending_inc.epoch);
+    ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier cache-mode") {
+    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err)
+      goto reply;
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    if (!p->is_tier()) {
+      ss << "pool '" << poolstr << "' is not a tier";
+      err = -EINVAL;
+      goto reply;
+    }
+    string modestr;
+    cmd_getval(cmdmap, "mode", modestr);
+    pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+    if (int(mode) < 0) {
+      ss << "'" << modestr << "' is not a valid cache mode";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+
+    if (mode == pg_pool_t::CACHEMODE_FORWARD ||
+	mode == pg_pool_t::CACHEMODE_READFORWARD) {
+      ss << "'" << modestr << "' is no longer a supported cache mode";
+      err = -EPERM;
+      goto reply;
+    }
+    if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	 mode != pg_pool_t::CACHEMODE_NONE &&
+	 mode != pg_pool_t::CACHEMODE_PROXY &&
+	 mode != pg_pool_t::CACHEMODE_READPROXY) &&
+	 !sure) {
+      ss << "'" << modestr << "' is not a well-supported cache mode and may "
+	 << "corrupt your data.  pass --yes-i-really-mean-it to force.";
+      err = -EPERM;
+      goto reply;
+    }
+
+    // pool already has this cache-mode set and there are no pending changes
+    if (p->cache_mode == mode &&
+	(pending_inc.new_pools.count(pool_id) == 0 ||
+	 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
+      ss << "set cache-mode for pool '" << poolstr << "'"
+         << " to " << pg_pool_t::get_cache_mode_name(mode);
+      err = 0;
+      goto reply;
+    }
+
+    /* Mode description:
+     *
+     *  none:       No cache-mode defined
+     *  forward:    Forward all reads and writes to base pool [removed]
+     *  writeback:  Cache writes, promote reads from base pool
+     *  readonly:   Forward writes to base pool
+     *  readforward: Writes are in writeback mode, Reads are in forward mode [removed]
+     *  proxy:       Proxy all reads and writes to base pool
+     *  readproxy:   Writes are in writeback mode, Reads are in proxy mode
+     *
+     * Hence, these are the allowed transitions:
+     *
+     *  none -> any
+     *  forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
+     *  proxy -> readproxy || writeback || any IF num_objects_dirty == 0
+     *  readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
+     *  readproxy -> proxy || writeback || any IF num_objects_dirty == 0
+     *  writeback -> readproxy || proxy
+     *  readonly -> any
+     */
+
+    // We check if the transition is valid against the current pool mode, as
+    // it is the only committed state thus far.  We will blatantly squash
+    // whatever mode is on the pending state.
+
+    if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+        (mode != pg_pool_t::CACHEMODE_PROXY &&
+	  mode != pg_pool_t::CACHEMODE_READPROXY)) {
+      ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
+         << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
+         << "' pool; only '"
+         << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
+        << "' allowed.";
+      err = -EINVAL;
+      goto reply;
+    }
+    if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
+        (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	  mode != pg_pool_t::CACHEMODE_PROXY &&
+	  mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+        (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
+        (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	  mode != pg_pool_t::CACHEMODE_PROXY)) ||
+
+        (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
+        (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	  mode != pg_pool_t::CACHEMODE_READPROXY)) ||
+
+        (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
+        (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+	  mode != pg_pool_t::CACHEMODE_PROXY &&
+	  mode != pg_pool_t::CACHEMODE_READPROXY))) {
+
+      const pool_stat_t* pstats =
+        mon.mgrstatmon()->get_pool_stat(pool_id);
+
+      if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
+        ss << "unable to set cache-mode '"
+           << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
+           << "': dirty objects found";
+        err = -EBUSY;
+        goto reply;
+      }
+    }
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    np->cache_mode = mode;
+    // set this both when moving to and from cache_mode NONE.  this is to
+    // capture legacy pools that were set up before this flag existed.
+    np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
+    ss << "set cache-mode for pool '" << poolstr
+	<< "' to " << pg_pool_t::get_cache_mode_name(mode);
+    if (mode == pg_pool_t::CACHEMODE_NONE) {
+      const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
+      ceph_assert(base_pool);
+      if (base_pool->read_tier == pool_id ||
+	  base_pool->write_tier == pool_id)
+	ss <<" (WARNING: pool is still configured as read or write tier)";
+    }
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd tier add-cache") {
+    err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
+    if (err == -EAGAIN)
+      goto wait;
+    if (err)
+      goto reply;
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    string tierpoolstr;
+    cmd_getval(cmdmap, "tierpool", tierpoolstr);
+    int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+    if (tierpool_id < 0) {
+      ss << "unrecognized pool '" << tierpoolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+    ceph_assert(p);
+    const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+    ceph_assert(tp);
+
+    if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
+      goto reply;
+    }
+
+    int64_t size = 0;
+    if (!cmd_getval(cmdmap, "size", size)) {
+      ss << "unable to parse 'size' value '"
+         << cmd_vartype_stringify(cmdmap.at("size")) << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    // make sure new tier is empty
+    const pool_stat_t *pstats =
+      mon.mgrstatmon()->get_pool_stat(tierpool_id);
+    if (pstats && pstats->stats.sum.num_objects != 0) {
+      ss << "tier pool '" << tierpoolstr << "' is not empty";
+      err = -ENOTEMPTY;
+      goto reply;
+    }
+    auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
+    pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+    if (int(mode) < 0) {
+      ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
+      err = -EINVAL;
+      goto reply;
+    }
+    HitSet::Params hsp;
+    auto& cache_hit_set_type =
+      g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
+    if (cache_hit_set_type == "bloom") {
+      BloomHitSet::Params *bsp = new BloomHitSet::Params;
+      bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
+      hsp = HitSet::Params(bsp);
+    } else if (cache_hit_set_type == "explicit_hash") {
+      hsp = HitSet::Params(new ExplicitHashHitSet::Params);
+    } else if (cache_hit_set_type == "explicit_object") {
+      hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
+    } else {
+      ss << "osd tier cache default hit set type '"
+	 << cache_hit_set_type << "' is not a known type";
+      err = -EINVAL;
+      goto reply;
+    }
+    // go
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
+    if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
+    np->tiers.insert(tierpool_id);
+    np->read_tier = np->write_tier = tierpool_id;
+    np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
+    np->set_last_force_op_resend(pending_inc.epoch);
+    ntp->set_last_force_op_resend(pending_inc.epoch);
+    ntp->tier_of = pool_id;
+    ntp->cache_mode = mode;
+    ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
+    ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
+    ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
+    ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
+    ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
+    ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
+    ntp->hit_set_params = hsp;
+    ntp->target_max_bytes = size;
+    ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pool set-quota") {
+    string poolstr;
+    cmd_getval(cmdmap, "pool", poolstr);
+    int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+    if (pool_id < 0) {
+      ss << "unrecognized pool '" << poolstr << "'";
+      err = -ENOENT;
+      goto reply;
+    }
+
+    string field;
+    cmd_getval(cmdmap, "field", field);
+    if (field != "max_objects" && field != "max_bytes") {
+      ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    // val could contain unit designations, so we treat as a string
+    string val;
+    cmd_getval(cmdmap, "val", val);
+    string tss;
+    int64_t value;
+    if (field == "max_objects") {
+      value = strict_sistrtoll(val.c_str(), &tss);
+    } else if (field == "max_bytes") {
+      value = strict_iecstrtoll(val.c_str(), &tss);
+    } else {
+      ceph_abort_msg("unrecognized option");
+    }
+    if (!tss.empty()) {
+      ss << "error parsing value '" << val << "': " << tss;
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
+    if (field == "max_objects") {
+      pi->quota_max_objects = value;
+    } else if (field == "max_bytes") {
+      pi->quota_max_bytes = value;
+    } else {
+      ceph_abort_msg("unrecognized option");
+    }
+    ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
+    rs = ss.str();
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+  } else if (prefix == "osd pool application enable" ||
+             prefix == "osd pool application disable" ||
+             prefix == "osd pool application set" ||
+             prefix == "osd pool application rm") {
+    err = prepare_command_pool_application(prefix, cmdmap, ss);
+    if (err == -EAGAIN) {
+      goto wait;
+    } else if (err < 0) {
+      goto reply;
+    } else {
+      goto update;
+    }
+  } else if (prefix == "osd force-create-pg") {
+    pg_t pgid;
+    string pgidstr;
+    cmd_getval(cmdmap, "pgid", pgidstr);
+    if (!pgid.parse(pgidstr.c_str())) {
+      ss << "invalid pgid '" << pgidstr << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (!osdmap.pg_exists(pgid)) {
+      ss << "pg " << pgid << " should not exist";
+      err = -ENOENT;
+      goto reply;
+    }
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
+	 << "that the cluster will give up ever trying to recover the lost data.  Do this "
+	 << "only if you are certain that all copies of the PG are in fact lost and you are "
+	 << "willing to accept that the data is permanently destroyed.  Pass "
+	 << "--yes-i-really-mean-it to proceed.";
+      err = -EPERM;
+      goto reply;
+    }
+    bool creating_now;
+    {
+      std::lock_guard<std::mutex> l(creating_pgs_lock);
+      auto emplaced = creating_pgs.pgs.emplace(
+	pgid,
+	creating_pgs_t::pg_create_info(osdmap.get_epoch(),
+				       ceph_clock_now()));
+      creating_now = emplaced.second;
+    }
+    if (creating_now) {
+      ss << "pg " << pgidstr << " now creating, ok";
+      // set the pool's CREATING flag so that (1) the osd won't ignore our
+      // create message and (2) we won't propose any future pg_num changes
+      // until after the PG has been instantiated.
+      if (pending_inc.new_pools.count(pgid.pool()) == 0) {
+	pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
+      }
+      pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
+      err = 0;
+      goto update;
+    } else {
+      ss << "pg " << pgid << " already creating";
+      err = 0;
+      goto reply;
+    }
+  } else if (prefix == "osd force_healthy_stretch_mode") {
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "This command will require peering across multiple CRUSH buckets "
+	"(probably two data centers or availability zones?) and may result in PGs "
+	"going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
+      err = -EPERM;
+      goto reply;
+    }
+    try_end_recovery_stretch_mode(true);
+    ss << "Triggering healthy stretch mode";
+    err = 0;
+    goto reply;
+  } else if (prefix == "osd force_recovery_stretch_mode") {
+    bool sure = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
+    if (!sure) {
+      ss << "This command will increase pool sizes to try and spread them "
+	"across multiple CRUSH buckets (probably two data centers or "
+	"availability zones?) and should have happened automatically"
+	"Pass --yes-i-really-mean-it to proceed.";
+      err = -EPERM;
+      goto reply;
+    }
+    mon.go_recovery_stretch_mode();
+    ss << "Triggering recovery stretch mode";
+    err = 0;
+    goto reply;
+  } else {
+    err = -EINVAL;
+  }
+
+ reply:
+  getline(ss, rs);
+  if (err < 0 && rs.length() == 0)
+    rs = cpp_strerror(err);
+  mon.reply_command(op, err, rs, rdata, get_last_committed());
+  return ret;
+
+ update:
+  getline(ss, rs);
+  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					    get_last_committed() + 1));
+  return true;
+
+ wait:
+  wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+  return true;
+}
+
+/**
+ * Verify that the sender of a pool op is allowed to issue it.
+ *
+ * Unmanaged-snapshot create/delete ops are vetted through
+ * is_unmanaged_snap_op_permitted(); every other op requires the "osd"
+ * MON_CAP_W capability.  A request without a session, or one that fails
+ * the check, is answered with -EPERM.
+ *
+ * @return true if the op was rejected (a reply has been sent), false if
+ *         the caller may keep processing it
+ */
+bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+
+  auto m = op->get_req<MPoolOp>();
+  MonSession *session = op->get_session();
+  if (!session) {
+    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+    return true;
+  }
+
+  const bool snap_op =
+    m->op == POOL_OP_CREATE_UNMANAGED_SNAP ||
+    m->op == POOL_OP_DELETE_UNMANAGED_SNAP;
+
+  if (!snap_op) {
+    // generic pool ops: plain write capability on the osd service
+    if (session->is_capable("osd", MON_CAP_W)) {
+      return false;
+    }
+    dout(0) << "got pool op from entity with insufficient privileges. "
+            << "message: " << *m  << std::endl
+            << "caps: " << session->caps << dendl;
+    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+    return true;
+  }
+
+  // the pool name is only handed to the permission check when the pool
+  // is present in the current osdmap
+  const std::string *name_ptr = nullptr;
+  if (osdmap.get_pg_pool(m->pool) != nullptr) {
+    name_ptr = &osdmap.get_pool_name(m->pool);
+  }
+
+  if (is_unmanaged_snap_op_permitted(cct, mon.key_server,
+                                     session->entity_name, session->caps,
+                                     session->get_peer_socket_addr(),
+                                     name_ptr)) {
+    return false;
+  }
+
+  dout(0) << "got unmanaged-snap pool op from entity with insufficient "
+          << "privileges. message: " << *m  << std::endl
+          << "caps: " << session->caps << dendl;
+  _pool_op_reply(op, -EPERM, osdmap.get_epoch());
+  return true;
+}
+
+/**
+ * Read-only first pass over an MPoolOp.
+ *
+ * Requests that fail the caps or fsid checks, target a missing pool, or
+ * are already satisfied by the committed osdmap are answered here.
+ *
+ * @return true if a reply was sent and the op is finished, false if the
+ *         op still has to go through prepare_pool_op()
+ */
+bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+
+  if (enforce_pool_op_caps(op)) {
+    return true;
+  }
+
+  if (m->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " drop message on fsid " << m->fsid
+            << " != " << mon.monmap->fsid << " for " << *m << dendl;
+    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+    return true;
+  }
+
+  if (m->op == POOL_OP_CREATE)
+    return preprocess_pool_op_create(op);
+
+  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
+  if (p == nullptr) {
+    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
+    // deleting a pool that is not there counts as success
+    const int missing_result = (m->op == POOL_OP_DELETE) ? 0 : -ENOENT;
+    _pool_op_reply(op, missing_result, osdmap.get_epoch());
+    return true;
+  }
+
+  // check if the snap and snapname exist
+  const bool snap_exists = p->snap_exists(m->name.c_str());
+
+  // 'answer' stays false when the op must be forwarded to the prepare
+  // stage; otherwise 'result' is the code sent back to the client.
+  bool answer = false;
+  int result = 0;
+
+  switch (m->op) {
+  case POOL_OP_CREATE_SNAP:
+    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
+      answer = true;
+      result = -EINVAL;
+    } else if (snap_exists) {
+      answer = true;
+    }
+    break;
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+    if (p->is_pool_snaps_mode()) {
+      answer = true;
+      result = -EINVAL;
+    }
+    break;
+  case POOL_OP_DELETE_SNAP:
+    if (p->is_unmanaged_snaps_mode()) {
+      answer = true;
+      result = -EINVAL;
+    } else if (!snap_exists) {
+      answer = true;
+    }
+    break;
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    if (p->is_pool_snaps_mode()) {
+      answer = true;
+      result = -EINVAL;
+    } else if (_is_removed_snap(m->pool, m->snapid)) {
+      answer = true;
+    }
+    break;
+  case POOL_OP_DELETE:
+    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
+      answer = true;
+    }
+    break;
+  case POOL_OP_AUID_CHANGE:
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+
+  if (!answer) {
+    return false;
+  }
+  _pool_op_reply(op, result, osdmap.get_epoch());
+  return true;
+}
+
+/**
+ * Tell whether a snapshot counts as removed in the committed state.
+ *
+ * @return true if the pool is not in the osdmap, the snap is in the
+ *         osdmap's removed_snaps_queue, or lookup_purged_snap() returns
+ *         0 for it; false otherwise
+ */
+bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
+{
+  if (!osdmap.have_pg_pool(pool)) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - pool dne" << dendl;
+    return true;
+  }
+
+  if (osdmap.in_removed_snaps_queue(pool, snap)) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - in osdmap removed_snaps_queue" << dendl;
+    return true;
+  }
+
+  snapid_t begin, end;
+  if (lookup_purged_snap(pool, snap, &begin, &end) != 0) {
+    return false;
+  }
+
+  dout(10) << __func__ << " pool " << pool << " snap " << snap
+           << " - purged, [" << begin << "," << end << ")" << dendl;
+  return true;
+}
+
+/**
+ * Tell whether a snapshot is already being removed by the pending
+ * incremental.
+ *
+ * @return true if the pool is listed in pending_inc.old_pools or the
+ *         snap is in the pending new_removed_snaps, false otherwise
+ */
+bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
+{
+  const bool pool_going_away = pending_inc.old_pools.count(pool) != 0;
+  if (pool_going_away) {
+    dout(10) << __func__ << " pool " << pool << " snap " << snap
+             << " - pool pending deletion" << dendl;
+    return true;
+  }
+
+  if (!pending_inc.in_new_removed_snaps(pool, snap)) {
+    return false;
+  }
+
+  dout(10) << __func__ << " pool " << pool << " snap " << snap
+           << " - in pending new_removed_snaps" << dendl;
+  return true;
+}
+
+/**
+ * Preprocess a pool-create request.
+ *
+ * @return true (after replying with 0) if a pool with the requested
+ *         name is already in the osdmap, false if the request must be
+ *         handled by the prepare stage
+ */
+bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+
+  const int64_t existing_id = osdmap.lookup_pg_pool_name(m->name.c_str());
+  if (existing_id < 0) {
+    return false;
+  }
+
+  _pool_op_reply(op, 0, osdmap.get_epoch());
+  return true;
+}
+
+/**
+ * Apply a pool op (snapshot create/delete and friends) to the pending
+ * incremental.
+ *
+ * POOL_OP_CREATE and POOL_OP_DELETE are delegated to their own prepare
+ * helpers.  For the remaining ops the request is first validated against
+ * the committed pool, then against the projected (pending) pool, and
+ * finally applied to a copy of the projected pool that is stored back
+ * into pending_inc.new_pools if anything changed.
+ *
+ * @return false when an immediate reply was sent through
+ *         _pool_op_reply(); true when a C_PoolOp completion was queued
+ *         with wait_for_finished_proposal() (or the value returned by
+ *         the create/delete helper)
+ */
+bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+  dout(10) << "prepare_pool_op " << *m << dendl;
+  if (m->op == POOL_OP_CREATE) {
+    return prepare_pool_op_create(op);
+  } else if (m->op == POOL_OP_DELETE) {
+    return prepare_pool_op_delete(op);
+  }
+
+  int ret = 0;
+  bool changed = false;
+
+  if (!osdmap.have_pg_pool(m->pool)) {
+    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+    return false;
+  }
+
+  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
+
+  // first pass: checks against the committed pool.  Note the two
+  // deliberate fall-throughs between the cases below.
+  switch (m->op) {
+    case POOL_OP_CREATE_SNAP:
+      if (pool->is_tier()) {
+        ret = -EINVAL;
+        _pool_op_reply(op, ret, osdmap.get_epoch());
+        return false;
+      }  // else, fall through
+    case POOL_OP_DELETE_SNAP:
+      if (!pool->is_unmanaged_snaps_mode()) {
+        bool snap_exists = pool->snap_exists(m->name.c_str());
+        if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
+          || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
+          // the requested state already holds: reply success below
+          ret = 0;
+        } else {
+          // real work to do: leave the switch and continue
+          break;
+        }
+      } else {
+        ret = -EINVAL;
+      }
+      _pool_op_reply(op, ret, osdmap.get_epoch());
+      return false;
+
+    case POOL_OP_DELETE_UNMANAGED_SNAP:
+      // we won't allow removal of an unmanaged snapshot from a pool
+      // not in unmanaged snaps mode.
+      if (!pool->is_unmanaged_snaps_mode()) {
+        _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
+        return false;
+      }
+      /* fall-thru */
+    case POOL_OP_CREATE_UNMANAGED_SNAP:
+      // but we will allow creating an unmanaged snapshot on any pool
+      // as long as it is not in 'pool' snaps mode.
+      if (pool->is_pool_snaps_mode()) {
+        _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
+        return false;
+      }
+  }
+
+  // projected pool info: prefer the pending version of the pool if one
+  // exists, otherwise start from the committed one
+  pg_pool_t pp;
+  if (pending_inc.new_pools.count(m->pool))
+    pp = pending_inc.new_pools[m->pool];
+  else
+    pp = *osdmap.get_pg_pool(m->pool);
+
+  // filled in only by POOL_OP_CREATE_UNMANAGED_SNAP (the encoded snapid)
+  bufferlist reply_data;
+
+  // pool snaps vs unmanaged snaps are mutually exclusive
+  // (second pass: same check as above, but against the projected pool)
+  switch (m->op) {
+  case POOL_OP_CREATE_SNAP:
+  case POOL_OP_DELETE_SNAP:
+    if (pp.is_unmanaged_snaps_mode()) {
+      ret = -EINVAL;
+      goto out;
+    }
+    break;
+
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    if (pp.is_pool_snaps_mode()) {
+      ret = -EINVAL;
+      goto out;
+    }
+  }
+
+  // third pass: apply the op to the projected pool copy
+  switch (m->op) {
+  case POOL_OP_CREATE_SNAP:
+    if (!pp.snap_exists(m->name.c_str())) {
+      pp.add_snap(m->name.c_str(), ceph_clock_now());
+      dout(10) << "create snap in pool " << m->pool << " " << m->name
+	       << " seq " << pp.get_snap_epoch() << dendl;
+      changed = true;
+    }
+    break;
+
+  case POOL_OP_DELETE_SNAP:
+    {
+      // snap_exists() yields the snapid for the name; a zero value is
+      // treated as "no such snap" here
+      snapid_t s = pp.snap_exists(m->name.c_str());
+      if (s) {
+	pp.remove_snap(s);
+	pending_inc.new_removed_snaps[m->pool].insert(s);
+	changed = true;
+      }
+    }
+    break;
+
+  case POOL_OP_CREATE_UNMANAGED_SNAP:
+    {
+      // NOTE(review): the boolean presumably selects pre-octopus
+      // compatibility behaviour in add_unmanaged_snap() -- confirm
+      uint64_t snapid = pp.add_unmanaged_snap(
+	osdmap.require_osd_release < ceph_release_t::octopus);
+      encode(snapid, reply_data);
+      changed = true;
+    }
+    break;
+
+  case POOL_OP_DELETE_UNMANAGED_SNAP:
+    // skip snaps that are already removed, either in the committed
+    // state or in the pending incremental
+    if (!_is_removed_snap(m->pool, m->snapid) &&
+	!_is_pending_removed_snap(m->pool, m->snapid)) {
+      if (m->snapid > pp.get_snap_seq()) {
+        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
+        return false;
+      }
+      pp.remove_unmanaged_snap(
+	m->snapid,
+	osdmap.require_osd_release < ceph_release_t::octopus);
+      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
+      // also record the new seq as purged: this avoids a discontinuity
+      // after all of the snaps have been purged, since the seq assigned
+      // during removal lives in the same namespace as the actual snaps.
+      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
+      changed = true;
+    }
+    break;
+
+  case POOL_OP_AUID_CHANGE:
+    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
+    return false;
+
+  default:
+    ceph_abort();
+    break;
+  }
+
+  // publish the modified projection into the pending incremental
+  if (changed) {
+    pp.set_snap_epoch(pending_inc.epoch);
+    pending_inc.new_pools[m->pool] = pp;
+  }
+
+ out:
+  // queue a C_PoolOp carrying ret and reply_data for when the proposal
+  // finishes
+  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
+  return true;
+}
+
+/**
+ * Prepare a pool-create request.
+ *
+ * Runs prepare_new_pool() and queues a C_PoolOp carrying its result for
+ * when the proposal finishes.
+ *
+ * @return always true
+ */
+bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+
+  const int result = prepare_new_pool(op);
+  auto *on_finish =
+    new OSDMonitor::C_PoolOp(this, op, result, pending_inc.epoch);
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+/**
+ * Check whether a pool may be removed.
+ *
+ * @param pool_id  id of the pool (used to look up its name)
+ * @param pool     pool definition to test
+ * @param ss       receives either the refusal reason or the
+ *                 "removed" confirmation
+ * @return 0 if removal is allowed; -EBUSY if the pool is used by CephFS
+ *         or participates in tiering; -EPERM if deletion is disabled by
+ *         mon_allow_pool_delete or the pool's nodelete flag
+ */
+int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
+                                   ostream *ss)
+{
+  const string& name = osdmap.get_pool_name(pool_id);
+
+  // If the Pool is in use by CephFS, refuse to delete it
+  if (mon.mdsmon()->get_pending_fsmap().pool_in_use(pool_id)) {
+    *ss << "pool '" << name << "' is in use by CephFS";
+    return -EBUSY;
+  }
+
+  // tiering relationships have to be dismantled first
+  if (pool.tier_of >= 0) {
+    *ss << "pool '" << name << "' is a tier of '"
+        << osdmap.get_pool_name(pool.tier_of) << "'";
+    return -EBUSY;
+  }
+  if (!pool.tiers.empty()) {
+    *ss << "pool '" << name << "' has tiers";
+    for (const auto tier_id : pool.tiers) {
+      *ss << " " << osdmap.get_pool_name(tier_id);
+    }
+    return -EBUSY;
+  }
+
+  // global switch, then the per-pool flag
+  if (!g_conf()->mon_allow_pool_delete) {
+    *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
+    return -EPERM;
+  }
+  if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
+    *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
+    return -EPERM;
+  }
+
+  *ss << "pool '" << name << "' removed";
+  return 0;
+}
+
+/**
+ * Decide whether tier_pool may be attached as a tier of base_pool.
+ *
+ * @param err  set to 0, or to a negative errno when the request is
+ *             refused
+ * @param ss   receives a human readable explanation whenever false is
+ *             returned
+ * @return true if the caller should go ahead; false if it should stop
+ *         here.  Stopping is not always an error: when the tier is
+ *         already attached to this base pool *err is 0.
+ */
+bool OSDMonitor::_check_become_tier(
+    const int64_t tier_pool_id, const pg_pool_t *tier_pool,
+    const int64_t base_pool_id, const pg_pool_t *base_pool,
+    int *err,
+    ostream *ss) const
+{
+  const std::string &tier_name = osdmap.get_pool_name(tier_pool_id);
+  const std::string &base_name = osdmap.get_pool_name(base_pool_id);
+
+  // a pool that CephFS uses cannot become a tier
+  if (mon.mdsmon()->get_pending_fsmap().pool_in_use(tier_pool_id)) {
+    *ss << "pool '" << tier_name << "' is in use by CephFS";
+    *err = -EBUSY;
+    return false;
+  }
+
+  // idempotency: the relationship already exists
+  if (base_pool->tiers.count(tier_pool_id)) {
+    ceph_assert(tier_pool->tier_of == base_pool_id);
+    *ss << "pool '" << tier_name << "' is now (or already was) a tier of '"
+        << base_name << "'";
+    *err = 0;
+    return false;
+  }
+
+  // no stacking: the base must not itself be a tier ...
+  if (base_pool->is_tier()) {
+    *ss << "pool '" << base_name << "' is already a tier of '"
+        << osdmap.get_pool_name(base_pool->tier_of) << "', "
+        << "multiple tiers are not yet supported.";
+    *err = -EINVAL;
+    return false;
+  }
+
+  // ... and the prospective tier must not have tiers of its own
+  if (tier_pool->has_tiers()) {
+    *ss << "pool '" << tier_name << "' has following tier(s) already:";
+    for (const auto sub_tier : tier_pool->tiers) {
+      *ss << "'" << osdmap.get_pool_name(sub_tier) << "',";
+    }
+    *ss << " multiple tiers are not yet supported.";
+    *err = -EINVAL;
+    return false;
+  }
+
+  // the prospective tier must not already belong to another base pool
+  if (tier_pool->is_tier()) {
+    *ss << "tier pool '" << tier_name << "' is already a tier of '"
+        << osdmap.get_pool_name(tier_pool->tier_of) << "'";
+    *err = -EINVAL;
+    return false;
+  }
+
+  *err = 0;
+  return true;
+}
+
+
+/**
+ * Decide whether a tier may be detached from base_pool.
+ *
+ * Only CephFS-related restrictions are applied; a base pool that the
+ * pending FSMap does not use always passes.
+ *
+ * @param tier_pool  the tier being removed; may be null, in which case
+ *                   the writeback check is skipped
+ * @param err        set to 0 on success, -EBUSY on refusal
+ * @param ss         receives the reason when false is returned
+ * @return true if the caller should go ahead, false if it should stop
+ */
+bool OSDMonitor::_check_remove_tier(
+    const int64_t base_pool_id, const pg_pool_t *base_pool,
+    const pg_pool_t *tier_pool,
+    int *err, ostream *ss) const
+{
+  const std::string &base_name = osdmap.get_pool_name(base_pool_id);
+
+  // Apply CephFS-specific checks
+  const bool used_by_cephfs =
+    mon.mdsmon()->get_pending_fsmap().pool_in_use(base_pool_id);
+  if (!used_by_cephfs) {
+    *err = 0;
+    return true;
+  }
+
+  // If the underlying pool is erasure coded and does not allow EC
+  // overwrites, we can't permit the removal of the replicated tier that
+  // CephFS relies on to access it
+  if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
+    *ss << "pool '" << base_name <<
+        "' does not allow EC overwrites and is in use by CephFS"
+        " via its tier";
+    *err = -EBUSY;
+    return false;
+  }
+
+  const bool writeback_tier =
+    tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK;
+  if (writeback_tier) {
+    *ss << "pool '" << base_name << "' is in use by CephFS, and this "
+           "tier is still in use as a writeback cache.  Change the cache "
+           "mode and flush the cache before removing it";
+    *err = -EBUSY;
+    return false;
+  }
+
+  *err = 0;
+  return true;
+}
+
+/**
+ * Stage the removal of a pool in the pending incremental.
+ *
+ * @param pool     id of the pool to remove
+ * @param ss       receives the message written by _check_remove_pool()
+ * @param no_fake  when true, mon_fake_pool_delete is ignored and the
+ *                 pool is really removed
+ * @return the negative error from _check_remove_pool() on the committed
+ *         pool; -EAGAIN if the pending version of the pool fails the
+ *         same check; 0 otherwise (including when the removal was
+ *         already pending or only a fake rename was staged)
+ */
+int OSDMonitor::_prepare_remove_pool(
+  int64_t pool, ostream *ss, bool no_fake)
+{
+  dout(10) << __func__ << " " << pool << dendl;
+  const pg_pool_t *committed = osdmap.get_pg_pool(pool);
+  const int committed_r = _check_remove_pool(pool, *committed, ss);
+  if (committed_r < 0)
+    return committed_r;
+
+  // if there is a problem with the pending info, wait and retry
+  // this op.
+  const auto pending = pending_inc.new_pools.find(pool);
+  if (pending != pending_inc.new_pools.end() &&
+      _check_remove_pool(pool, pending->second, ss) < 0) {
+    return -EAGAIN;
+  }
+
+  if (pending_inc.old_pools.count(pool)) {
+    dout(10) << __func__ << " " << pool << " already pending removal"
+             << dendl;
+    return 0;
+  }
+
+  // fake deletion: only rename the pool so that it drops out of sight
+  if (g_conf()->mon_fake_pool_delete && !no_fake) {
+    const string old_name = osdmap.get_pool_name(pool);
+    const string new_name = old_name + "." + stringify(pool) + ".DELETED";
+    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
+            << old_name << " -> " << new_name << dendl;
+    pending_inc.new_pool_names[pool] = new_name;
+    return 0;
+  }
+
+  // remove
+  pending_inc.old_pools.insert(pool);
+
+  // remove any pg_temp mappings for this pool
+  for (auto it = osdmap.pg_temp->begin();
+       it != osdmap.pg_temp->end();
+       ++it) {
+    if (it->first.pool() != pool)
+      continue;
+    dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
+             << it->first << dendl;
+    pending_inc.new_pg_temp[it->first].clear();
+  }
+
+  // remove any primary_temp mappings for this pool
+  for (auto it = osdmap.primary_temp->begin();
+       it != osdmap.primary_temp->end();
+       ++it) {
+    if (it->first.pool() != pool)
+      continue;
+    dout(10) << __func__ << " " << pool
+             << " removing obsolete primary_temp" << it->first << dendl;
+    pending_inc.new_primary_temp[it->first] = -1;
+  }
+
+  // remove any pg_upmap mappings for this pool
+  for (auto& mapping : osdmap.pg_upmap) {
+    if (mapping.first.pool() != pool)
+      continue;
+    dout(10) << __func__ << " " << pool
+             << " removing obsolete pg_upmap "
+             << mapping.first << dendl;
+    pending_inc.old_pg_upmap.insert(mapping.first);
+  }
+
+  // remove any pending pg_upmap mappings for this pool
+  for (auto it = pending_inc.new_pg_upmap.begin();
+       it != pending_inc.new_pg_upmap.end(); ) {
+    if (it->first.pool() == pool) {
+      dout(10) << __func__ << " " << pool
+               << " removing pending pg_upmap "
+               << it->first << dendl;
+      it = pending_inc.new_pg_upmap.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  // remove any pg_upmap_items mappings for this pool
+  for (auto& mapping : osdmap.pg_upmap_items) {
+    if (mapping.first.pool() != pool)
+      continue;
+    dout(10) << __func__ << " " << pool
+             << " removing obsolete pg_upmap_items " << mapping.first
+             << dendl;
+    pending_inc.old_pg_upmap_items.insert(mapping.first);
+  }
+
+  // remove any pending pg_upmap_items mappings for this pool
+  for (auto it = pending_inc.new_pg_upmap_items.begin();
+       it != pending_inc.new_pg_upmap_items.end(); ) {
+    if (it->first.pool() == pool) {
+      dout(10) << __func__ << " " << pool
+               << " removing pending pg_upmap_items "
+               << it->first << dendl;
+      it = pending_inc.new_pg_upmap_items.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  // remove any choose_args for this pool; the pending crush map is
+  // re-encoded only when something was actually dropped
+  CrushWrapper pending_crush;
+  _get_pending_crush(pending_crush);
+  if (pending_crush.have_choose_args(pool)) {
+    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
+    pending_crush.rm_choose_args(pool);
+    pending_inc.crush.clear();
+    pending_crush.encode(pending_inc.crush, mon.get_quorum_con_features());
+  }
+  return 0;
+}
+
+/**
+ * Stage a pool rename in the pending incremental.
+ *
+ * @return -ENOENT if the pool is pending removal, -EEXIST if another
+ *         pool already has a pending rename to the same name, 0 once
+ *         the new name has been recorded in pending_inc.new_pool_names
+ */
+int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
+{
+  dout(10) << "_prepare_rename_pool " << pool << dendl;
+  if (pending_inc.old_pools.count(pool)) {
+    dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
+    return -ENOENT;
+  }
+
+  // refuse a name that a different pool is about to take
+  for (const auto& pending_name : pending_inc.new_pool_names) {
+    const bool other_pool = pending_name.first != pool;
+    if (other_pool && pending_name.second == newname) {
+      return -EEXIST;
+    }
+  }
+
+  pending_inc.new_pool_names[pool] = newname;
+  return 0;
+}
+
+/**
+ * Prepare a pool-delete request.
+ *
+ * If _prepare_remove_pool() returns -EAGAIN the message is retried after
+ * the current proposal; any other result is delivered to the client
+ * through a C_PoolOp once the proposal finishes.
+ *
+ * @return always true
+ */
+bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
+{
+  op->mark_osdmon_event(__func__);
+  auto m = op->get_req<MPoolOp>();
+
+  ostringstream reason;
+  const int result = _prepare_remove_pool(m->pool, &reason, false);
+
+  if (result == -EAGAIN) {
+    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+    return true;
+  }
+
+  if (result < 0) {
+    dout(10) << __func__ << " got " << result << " " << reason.str() << dendl;
+  }
+
+  auto *on_finish =
+    new OSDMonitor::C_PoolOp(this, op, result, pending_inc.epoch);
+  wait_for_finished_proposal(op, on_finish);
+  return true;
+}
+
+/**
+ * Send an MPoolOpReply for a pool op.
+ *
+ * @param ret    result code placed in the reply
+ * @param epoch  epoch placed in the reply
+ * @param blp    optional payload handed to the MPoolOpReply constructor
+ */
+void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
+                                int ret, epoch_t epoch, bufferlist *blp)
+{
+  op->mark_osdmon_event(__func__);
+  auto request = op->get_req<MPoolOp>();
+  dout(20) << "_pool_op_reply " << ret << dendl;
+
+  auto *answer = new MPoolOpReply(request->fsid, request->get_tid(),
+                                  ret, epoch, get_last_committed(), blp);
+  mon.send_reply(op, answer);
+}
+
+/**
+ * Rescale pool recovery_priority options that fall outside
+ * [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX].
+ *
+ * The largest and smallest priorities set on any pool are determined
+ * first.  If both are within range nothing is done.  Otherwise positive
+ * priorities are scaled against the largest one and negative priorities
+ * against the smallest one; a priority that scales to 0 is unset.  Every
+ * adjusted pool is written to pending_inc.new_pools.
+ */
+void OSDMonitor::convert_pool_priorities(void)
+{
+  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
+
+  // pass 1: find the extremes currently in use
+  int64_t highest = 0;
+  int64_t lowest = 0;
+  for (const auto &entry : osdmap.get_pools()) {
+    const auto &opts = entry.second.opts;
+    if (!opts.is_set(key)) {
+      continue;
+    }
+    int64_t current = 0;
+    opts.get(key, &current);
+    if (current > highest)
+      highest = current;
+    if (current < lowest)
+      lowest = current;
+  }
+
+  const bool too_high = highest > OSD_POOL_PRIORITY_MAX;
+  const bool too_low = lowest < OSD_POOL_PRIORITY_MIN;
+  if (!too_high && !too_low) {
+    dout(20) << __func__ << " nothing to fix" << dendl;
+    return;
+  }
+
+  // pass 2: Current pool priorities exceeds new maximum
+  for (const auto &entry : osdmap.get_pools()) {
+    const auto pool_id = entry.first;
+    pg_pool_t updated = entry.second;
+
+    int64_t prio = 0;
+    updated.opts.get(key, &prio);
+
+    int64_t scaled;
+    if (prio > 0 && too_high) { // Likely scenario
+      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
+      scaled = static_cast<float>(prio) / highest * OSD_POOL_PRIORITY_MAX;
+    } else if (prio < 0 && too_low) {
+      // Scaled  priority range OSD_POOL_PRIORITY_MIN to 0
+      scaled = static_cast<float>(prio) / lowest * OSD_POOL_PRIORITY_MIN;
+    } else {
+      // this pool's priority needs no adjustment
+      continue;
+    }
+
+    if (scaled == 0) {
+      updated.opts.unset(key);
+    } else {
+      updated.opts.set(key, static_cast<int64_t>(scaled));
+    }
+    dout(10) << __func__ << " pool " << pool_id
+             << " recovery_priority adjusted "
+             << prio << " to " << scaled << dendl;
+    updated.last_change = pending_inc.epoch;
+    pending_inc.new_pools[pool_id] = updated;
+  }
+}
+
+/**
+ * Check that every pool is eligible for stretch mode and stage a pending
+ * copy of each one.
+ *
+ * Each pool must be replicated, and must either still have the default
+ * size/min_size or already use the requested crush rule.
+ *
+ * @param ss             receives a human-readable reason on failure
+ * @param okay           set to true on success, false otherwise
+ * @param errcode        set to a negative error code on failure
+ * @param pools          on success, receives the pending_inc copy of each pool
+ * @param new_crush_rule name of the crush rule stretch pools will use
+ */
+void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
+					       int *errcode,
+					       set<pg_pool_t*>* pools,
+					       const string& new_crush_rule)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
+  if (new_crush_rule_result < 0) {
+    // report the rule *name* the caller asked for, not the error code
+    ss << "unrecognized crush rule " << new_crush_rule;
+    *errcode = new_crush_rule_result;
+    return;
+  }
+  const __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+  // These do not depend on the pool, so look them up once.
+  const uint8_t default_size =
+    g_conf().get_val<uint64_t>("osd_pool_default_size");
+  const auto default_min_size =
+    g_conf().get_osd_pool_default_min_size(default_size);
+
+  // Validate every pool first so that a failure leaves pending_inc and
+  // *pools untouched.
+  for (const auto& pooli : osdmap.pools) {
+    int64_t poolid = pooli.first;
+    const pg_pool_t *p = &pooli.second;
+    if (!p->is_replicated()) {
+      ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
+      *errcode = -EINVAL;
+      return;
+    }
+    if ((p->get_size() != default_size ||
+	 (p->get_min_size() != default_min_size)) &&
+	(p->get_crush_rule() != new_rule)) {
+      ss << "we currently require stretch mode pools start out with the"
+	" default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
+      *errcode = -EINVAL;
+      return;
+    }
+  }
+
+  // All pools passed: now stage the pending copies for the caller to edit.
+  for (const auto& pooli : osdmap.pools) {
+    pools->insert(pending_inc.get_new_pool(pooli.first, &pooli.second));
+  }
+  *okay = true;
+  return;
+}
+
+/**
+ * Validate a request to enable stretch mode and, if asked, stage it.
+ *
+ * Checks, in order: the dividing bucket type exists in the pending crush
+ * map; exactly two buckets of that type exist; the crush rule exists; the
+ * two buckets have equal weight; bucket_count is 2.  When @p commit is true
+ * every check is additionally asserted, and on success the given pools and
+ * pending_inc are updated.
+ *
+ * @param ss              receives a human-readable reason on failure
+ * @param okay            set to true on success, false otherwise
+ * @param errcode         set to a negative error code on failure
+ * @param commit          if true, apply the change to @p pools and pending_inc
+ * @param dividing_bucket crush bucket type that separates the sites
+ * @param bucket_count    number of sites; must be 2
+ * @param pools           pending pool copies to modify when committing
+ * @param new_crush_rule  name of the crush rule stretch pools will use
+ */
+void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+					 int *errcode, bool commit,
+					 const string& dividing_bucket,
+					 uint32_t bucket_count,
+					 const set<pg_pool_t*>& pools,
+					 const string& new_crush_rule)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  CrushWrapper crush;
+  _get_pending_crush(crush);
+  int dividing_id;
+  int retval = crush.get_validated_type_id(dividing_bucket, &dividing_id);
+  if (retval == -1) {
+    ss << dividing_bucket << " is not a valid crush bucket type";
+    *errcode = -ENOENT;
+    ceph_assert(!commit || retval != -1);
+    return;
+  }
+  vector<int> subtrees;
+  crush.get_subtree_of_type(dividing_id, &subtrees);
+  if (subtrees.size() != 2) {
+    ss << "there are " << subtrees.size() << " " << dividing_bucket
+       << "'s in the cluster but stretch mode currently only works with 2!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || subtrees.size() == 2);
+    return;
+  }
+
+  int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
+  if (new_crush_rule_result < 0) {
+    ss << "unrecognized crush rule " << new_crush_rule;
+    *errcode = new_crush_rule_result;
+    // mirror the branch condition: any non-negative id is a found rule
+    ceph_assert(!commit || (new_crush_rule_result >= 0));
+    return;
+  }
+  __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+  int weight1 = crush.get_item_weight(subtrees[0]);
+  int weight2 = crush.get_item_weight(subtrees[1]);
+  if (weight1 != weight2) {
+    // TODO: I'm really not sure this is a good idea?
+    ss << "the 2 " << dividing_bucket
+       << " instances in the cluster have differing weights "
+       << weight1 << " and " << weight2
+       <<" but stretch mode currently requires they be the same!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || (weight1 == weight2));
+    return;
+  }
+  if (bucket_count != 2) {
+    ss << "currently we only support 2-site stretch clusters!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || bucket_count == 2);
+    return;
+  }
+  // TODO: check CRUSH rules for pools so that we are appropriately divided
+  if (commit) {
+    // the configured stretch sizes are the same for every pool
+    const auto stretch_size =
+      g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+    const auto stretch_min_size =
+      g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+    for (auto pool : pools) {
+      pool->crush_rule = new_rule;
+      pool->peering_crush_bucket_count = bucket_count;
+      pool->peering_crush_bucket_target = bucket_count;
+      pool->peering_crush_bucket_barrier = dividing_id;
+      pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      pool->size = stretch_size;
+      pool->min_size = stretch_min_size;
+    }
+    pending_inc.change_stretch_mode = true;
+    pending_inc.stretch_mode_enabled = true;
+    pending_inc.new_stretch_bucket_count = bucket_count;
+    pending_inc.new_degraded_stretch_mode = 0;
+    pending_inc.new_stretch_mode_bucket = dividing_id;
+  }
+  *okay = true;
+  return;
+}
+
+/**
+ * For each bucket whose monitors are reported dead, check whether the OSD
+ * subtree under that bucket is down as well.
+ *
+ * @param dead_buckets        crush bucket name -> names of the dead mons in it
+ * @param really_down_buckets receives the ids of buckets whose subtree is down
+ * @param really_down_mons    receives the mons belonging to those buckets
+ * @return true if at least one bucket's subtree is down
+ */
+bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
+					    set<int> *really_down_buckets,
+					    set<string> *really_down_mons)
+{
+  dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
+  ceph_assert(is_readable());
+  if (dead_buckets.empty()) return false;
+  set<int> down_cache;  // shared across the subtree_is_down() calls below
+  bool really_down = false;
+  // iterate by reference: each entry holds a string and a set<string>
+  for (const auto& dbi : dead_buckets) {
+    const string& bucket_name = dbi.first;
+    ceph_assert(osdmap.crush->name_exists(bucket_name));
+    int bucket_id = osdmap.crush->get_item_id(bucket_name);
+    dout(20) << "Checking " << bucket_name << " id " << bucket_id
+	     << " to see if OSDs are also down" << dendl;
+    bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
+    if (subtree_down) {
+      dout(20) << "subtree is down!" << dendl;
+      really_down = true;
+      really_down_buckets->insert(bucket_id);
+      really_down_mons->insert(dbi.second.begin(), dbi.second.end());
+    }
+  }
+  dout(10) << "We determined CRUSH buckets " << *really_down_buckets
+	   << " and mons " << *really_down_mons << " are really down" << dendl;
+  return really_down;
+}
+
+/**
+ * Stage and propose the OSDMap changes that put the cluster into degraded
+ * stretch mode.
+ *
+ * Asserts that exactly one site remains (stretch_bucket_count minus the
+ * number of dead buckets is 1, and live_zones has one entry).  Every pool
+ * with a non-zero peering_crush_bucket_count gets its bucket count reduced,
+ * the surviving site set as mandatory member, and its min_size halved.
+ *
+ * @param dead_buckets ids of the crush buckets that are down
+ * @param live_zones   names of the crush buckets that are still up
+ */
+void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
+					       const set<string>& live_zones)
+{
+  dout(20) << __func__ << dendl;
+  stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
+  // update the general OSDMap changes
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
+  ceph_assert(new_site_count == 1); // stretch count 2!
+  pending_inc.new_degraded_stretch_mode = new_site_count;
+  pending_inc.new_recovering_stretch_mode = 0;
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+  // and then apply them to all the pg_pool_ts
+  ceph_assert(live_zones.size() == 1); // only support 2 zones now
+  const string& remaining_site_name = *(live_zones.begin());
+  ceph_assert(osdmap.crush->name_exists(remaining_site_name));
+  int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.peering_crush_bucket_count = new_site_count;
+      newp.peering_crush_mandatory_member = remaining_site;
+      newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
+
+/**
+ * Stage and propose the OSDMap changes that mark the cluster as recovering
+ * from degraded stretch mode.
+ *
+ * The other stretch-mode fields are carried over from the current osdmap;
+ * only new_recovering_stretch_mode is set to 1.  Every pool with a non-zero
+ * peering_crush_bucket_count gets last_force_op_resend set to the pending
+ * epoch.
+ */
+void OSDMonitor::trigger_recovery_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
+  pending_inc.new_recovering_stretch_mode = 1;
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
+
+/// Zero the stretch_recovery_triggered timestamp.
+void OSDMonitor::set_degraded_stretch_mode()
+{
+  stretch_recovery_triggered.set_from_double(0.0);
+}
+
+/// Stamp stretch_recovery_triggered with the current time, unless a
+/// non-zero timestamp is already recorded.
+void OSDMonitor::set_recovery_stretch_mode()
+{
+  if (!stretch_recovery_triggered.is_zero()) {
+    return;  // already recorded; keep the earlier timestamp
+  }
+  stretch_recovery_triggered = ceph_clock_now();
+}
+
+/// Zero the stretch_recovery_triggered timestamp.
+void OSDMonitor::set_healthy_stretch_mode()
+{
+  stretch_recovery_triggered.set_from_double(0.0);
+}
+
+/// On a new PG digest, retry leaving recovery stretch mode (unforced) if a
+/// recovery timestamp is currently recorded.
+void OSDMonitor::notify_new_pg_digest()
+{
+  dout(20) << __func__ << dendl;
+  if (stretch_recovery_triggered.is_zero()) {
+    return;
+  }
+  try_end_recovery_stretch_mode(false);
+}
+
+/// Context that, when finished, calls
+/// OSDMonitor::try_end_recovery_stretch_mode() again with the saved
+/// @c force flag.  The completion code passed to finish() is ignored.
+struct CMonExitRecovery : public Context {
+  OSDMonitor *m;   ///< monitor service to call back into
+  bool force;      ///< forwarded to try_end_recovery_stretch_mode()
+  CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
+  void finish(int r) override {
+    m->try_end_recovery_stretch_mode(force);
+  }
+};
+
+/**
+ * Try to move from recovering stretch mode back to healthy stretch mode.
+ *
+ * Does nothing unless this monitor is the leader and the monitor reports
+ * both degraded and recovering stretch mode.  If this service or the
+ * mgrstat service is not readable yet, a CMonExitRecovery is queued to
+ * retry later.  Otherwise, provided the osdmap is in recovering stretch
+ * mode and either @p force is set or more than
+ * mon_stretch_recovery_min_wait has passed since stretch_recovery_triggered,
+ * healthy stretch mode is triggered when @p force is set or the PG digest
+ * reports no degraded, inactive or unknown PGs.
+ *
+ * @param force skip the wait and the PG-state check
+ */
+void OSDMonitor::try_end_recovery_stretch_mode(bool force)
+{
+  dout(20) << __func__ << dendl;
+  if (!(mon.is_leader() &&
+        mon.is_degraded_stretch_mode() &&
+        mon.is_recovering_stretch_mode())) {
+    return;
+  }
+  if (!is_readable()) {
+    wait_for_readable_ctx(new CMonExitRecovery(this, force));
+    return;
+  }
+  if (!osdmap.recovering_stretch_mode) {
+    return;
+  }
+
+  // Has the minimum wait elapsed since recovery was triggered?
+  bool waited_long_enough = false;
+  if (!stretch_recovery_triggered.is_zero()) {
+    waited_long_enough =
+      ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
+      stretch_recovery_triggered;
+  }
+  if (!waited_long_enough && !force) {
+    return;
+  }
+
+  auto *mgrstat = mon.mgrstatmon();
+  if (!mgrstat->is_readable()) {
+    mgrstat->wait_for_readable_ctx(new CMonExitRecovery(this, force));
+    return;
+  }
+  const PGMapDigest& digest = mgrstat->get_digest();
+  double misplaced, degraded, inactive, unknown;
+  digest.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
+  const bool nothing_unhealthy =
+    degraded == 0.0 && inactive == 0.0 && unknown == 0.0;
+  if (force || nothing_unhealthy) {
+    // we can exit degraded stretch mode!
+    mon.trigger_healthy_stretch_mode();
+  }
+}
+
+/**
+ * Stage and propose the OSDMap changes that return the cluster to healthy
+ * stretch mode.
+ *
+ * Requires the service to be writeable.  Clears the degraded and recovering
+ * flags and, for every pool with a non-zero peering_crush_bucket_count,
+ * restores the bucket count to osdmap.stretch_bucket_count, clears the
+ * mandatory member, and sets min_size from mon_stretch_pool_min_size.
+ */
+void OSDMonitor::trigger_healthy_stretch_mode()
+{
+  ceph_assert(is_writeable());
+  stretch_recovery_triggered.set_from_double(0);
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
+  pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+
+  // the configured min_size is the same for every pool: read it once
+  const auto stretch_min_size =
+    g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+  // iterate by reference so each pg_pool_t is not copied just to be inspected
+  for (const auto& pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
+      newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      newp.min_size = stretch_min_size;
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
new file mode 100644
index 000000000..e7701a639
--- /dev/null
+++ b/src/mon/OSDMonitor.h
@@ -0,0 +1,874 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+/* Object Store Device (OSD) Monitor
+ */
+
+#ifndef CEPH_OSDMONITOR_H
+#define CEPH_OSDMONITOR_H
+
+#include <map>
+#include <set>
+#include <utility>
+
+#include "include/types.h"
+#include "include/encoding.h"
+#include "common/simple_cache.hpp"
+#include "common/PriorityCache.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+#include "osd/OSDMapMapping.h"
+
+#include "CreatingPGs.h"
+#include "PaxosService.h"
+
+#include "erasure-code/ErasureCodeInterface.h"
+#include "mon/MonOpRequest.h"
+#include <boost/functional/hash.hpp>
+
+class Monitor;
+class PGMap;
+struct MonSession;
+class MOSDMap;
+
+
+/// information about a particular peer's failure reports for one osd
+struct failure_reporter_t {
+  utime_t failed_since;     ///< when they think it failed
+  MonOpRequestRef op;       ///< failure op request
+
+  // No user-provided destructor: that keeps the implicit move operations
+  // available, so moving a reporter does not have to copy the op reference.
+  failure_reporter_t() = default;
+  /// @param s  time since which the reporter considers the osd failed
+  /// @param op the failure op request; taken by value and moved into place
+  failure_reporter_t(utime_t s, MonOpRequestRef op)
+    : failed_since(s), op(std::move(op)) {}
+};
+
+/// information about all failure reports for one osd
+struct failure_info_t {
+  std::map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
+  /// most recent failed_since; a default-constructed utime_t means "not
+  /// cached" and makes get_failed_since() recompute it from reporters
+  utime_t max_failed_since;
+
+  failure_info_t() {}
+
+  /// Return the largest failed_since among the reporters, recomputing the
+  /// cached value first if it has been invalidated.
+  utime_t get_failed_since() {
+    if (max_failed_since == utime_t() && !reporters.empty()) {
+      // the old max must have canceled; recalculate.
+      for (const auto& entry : reporters) {
+	if (entry.second.failed_since > max_failed_since)
+	  max_failed_since = entry.second.failed_since;
+      }
+    }
+    return max_failed_since;
+  }
+
+  /// Record (or replace) the report from reporter @p who.
+  // set the message for the latest report.
+  void add_report(int who, utime_t failed_since, MonOpRequestRef op) {
+    [[maybe_unused]] auto [it, new_reporter] =
+      reporters.insert_or_assign(
+	who, failure_reporter_t{failed_since, std::move(op)});
+    if (!new_reporter) {
+      // An existing report was overwritten and its failed_since may have
+      // changed, so the cached max can no longer be trusted: invalidate it
+      // and let get_failed_since() recompute.
+      max_failed_since = utime_t();
+    } else if (max_failed_since != utime_t() &&
+	       max_failed_since < failed_since) {
+      max_failed_since = failed_since;
+    }
+  }
+
+  /// Append every reporter's pending op to @p ls and drop our reference.
+  void take_report_messages(std::list<MonOpRequestRef>& ls) {
+    for (auto& entry : reporters) {
+      auto& rep = entry.second;
+      if (rep.op) {
+	ls.push_back(rep.op);
+	rep.op.reset();
+      }
+    }
+  }
+
+  /// Forget the report from @p who and invalidate the cached max.
+  void cancel_report(int who) {
+    reporters.erase(who);
+    max_failed_since = utime_t();
+  }
+};
+
+
+class LastEpochClean {  // collects reported last_epoch_clean values, grouped per pool
+  struct Lec {
+    std::vector<epoch_t> epoch_by_pg;  ///< presumably indexed by the pg's ps_t -- TODO confirm against report()
+    ps_t next_missing = 0;
+    epoch_t floor = std::numeric_limits<epoch_t>::max();  ///< starts at the largest representable epoch
+    void report(unsigned pg_num, ps_t pg, epoch_t last_epoch_clean);
+  };
+  std::map<uint64_t, Lec> report_by_pool;  ///< key is presumably the pool id (cf. remove_pool) -- TODO confirm
+public:
+  void report(unsigned pg_num, const pg_t& pg, epoch_t last_epoch_clean);
+  void remove_pool(uint64_t pool);
+  epoch_t get_lower_bound(const OSDMap& latest) const;
+
+  void dump(Formatter *f) const;
+};
+
+
+/// Set of pinned osdmap versions, with helpers to query, encode and dump it.
+struct osdmap_manifest_t {
+  // all the maps we have pinned -- i.e., won't be removed unless
+  // they are inside a trim interval.
+  std::set<version_t> pinned;
+
+  osdmap_manifest_t() {}
+
+  /// @return the highest pinned version, or 0 if nothing is pinned
+  version_t get_last_pinned() const
+  {
+    auto it = pinned.crbegin();
+    if (it == pinned.crend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  /// @return the lowest pinned version, or 0 if nothing is pinned
+  version_t get_first_pinned() const
+  {
+    auto it = pinned.cbegin();
+    if (it == pinned.cend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  /// @return true if @p v is in the pinned set
+  bool is_pinned(version_t v) const
+  {
+    return pinned.find(v) != pinned.end();
+  }
+
+  /// Add @p v to the pinned set (no effect if already present).
+  void pin(version_t v)
+  {
+    pinned.insert(v);
+  }
+
+  /**
+   * @return @p v itself if it is pinned; otherwise the largest pinned
+   *         version below @p v, provided some pinned version is >= @p v.
+   *         Returns 0 when every pinned version is below @p v (including
+   *         the empty set) or when every pinned version is above @p v.
+   */
+  version_t get_lower_closest_pinned(version_t v) const {
+    auto p = pinned.lower_bound(v);
+    if (p == pinned.cend()) {
+      return 0;
+    } else if (*p > v) {
+      if (p == pinned.cbegin()) {
+        return 0;
+      }
+      --p;
+    }
+    return *p;
+  }
+
+  void encode(ceph::buffer::list& bl) const
+  {
+    ENCODE_START(1, 1, bl);
+    encode(pinned, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(pinned, bl);
+    DECODE_FINISH(bl);
+  }
+
+  /// Convenience overload: decode starting at the beginning of @p bl.
+  void decode(ceph::buffer::list& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+
+  /// Dump first/last pinned versions and the full pinned list to @p f.
+  /// Const-qualified because it only reads the manifest.
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("first_pinned", get_first_pinned());
+    f->dump_unsigned("last_pinned", get_last_pinned());
+    f->open_array_section("pinned_maps");
+    for (const auto& i : pinned) {
+      f->dump_unsigned("epoch", i);
+    }
+    f->close_section();
+  }
+};
+WRITE_CLASS_ENCODER(osdmap_manifest_t);
+
+class OSDMonitor : public PaxosService,
+                   public md_config_obs_t {
+  CephContext *cct;
+
+public:
+  OSDMap osdmap;
+
+  // config observer
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+    const std::set<std::string> &changed) override;
+  // [leader]
+  OSDMap::Incremental pending_inc;
+  std::map<int, ceph::buffer::list> pending_metadata;
+  std::set<int>             pending_metadata_rm;
+  std::map<int, failure_info_t> failure_info;
+  std::map<int,utime_t>    down_pending_out;  // osd down -> out
+  bool priority_convert = false;
+  std::map<int64_t,std::set<snapid_t>> pending_pseudo_purged_snaps;
+  std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
+  std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
+  ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
+
+  std::map<int,double> osd_weight;
+
+  using osdmap_key_t = std::pair<version_t, uint64_t>;
+  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
+                                   ceph::buffer::list,
+                                   std::less<osdmap_key_t>,
+                                   boost::hash<osdmap_key_t>>;
+  osdmap_cache_t inc_osd_cache;
+  osdmap_cache_t full_osd_cache;
+
+  bool has_osdmap_manifest;
+  osdmap_manifest_t osdmap_manifest;
+
+  bool check_failures(utime_t now);
+  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
+  utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
+  bool is_failure_stale(utime_t now, failure_info_t& fi) const;
+  void force_failure(int target_osd, int by);
+
+  bool _have_pending_crush();
+  CrushWrapper &_get_stable_crush();
+  void _get_pending_crush(CrushWrapper& newcrush);
+
+  enum FastReadType {
+    FAST_READ_OFF,
+    FAST_READ_ON,
+    FAST_READ_DEFAULT
+  };
+
+  struct CleanUpmapJob : public ParallelPGMapper::Job {  // mapper job that checks pg upmaps and stages clean-ups in pending_inc
+    CephContext *cct;
+    const OSDMap& osdmap;  ///< map the upmaps are checked against (held by reference)
+    OSDMap::Incremental& pending_inc;  ///< incremental that receives the clean-ups (held by reference)
+    // lock to protect pending_inc from changing
+    // when checking is done
+    ceph::mutex pending_inc_lock =
+      ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
+
+    CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
+      : ParallelPGMapper::Job(&om),
+        cct(cct),
+        osdmap(om),
+        pending_inc(pi) {}
+
+    void process(const std::vector<pg_t>& to_check) override {  // check one batch of pgs
+      std::vector<pg_t> to_cancel;
+      std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap;
+      osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
+      // don't bother taking lock if nothing changes
+      if (!to_cancel.empty() || !to_remap.empty()) {
+        std::lock_guard l(pending_inc_lock);  // held while pending_inc is modified
+        osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
+      }
+    }
+
+    void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {}  // range form is a no-op for this job
+    void complete() override {}  // nothing to do on completion
+  }; // public as this will need to be accessible from TestTestOSDMap.cc
+
+  // svc
+public:
+  void create_initial() override;
+  void get_store_prefixes(std::set<std::string>& s) const override;
+
+private:
+  void update_from_paxos(bool *need_bootstrap) override;
+  void create_pending() override;  // prepare a new pending
+  void encode_pending(MonitorDBStore::TransactionRef t) override;
+  void on_active() override;
+  void on_restart() override;
+  void on_shutdown() override;
+
+  /* osdmap full map prune */
+  void load_osdmap_manifest();
+  bool should_prune() const;
+  void _prune_update_trimmed(
+      MonitorDBStore::TransactionRef tx,
+      version_t first);
+  void prune_init(osdmap_manifest_t& manifest);
+  bool _prune_sanitize_options() const;
+  bool is_prune_enabled() const;
+  bool is_prune_supported() const;
+  bool do_prune(MonitorDBStore::TransactionRef tx);
+
+  // Priority cache control
+  uint32_t mon_osd_cache_size = 0;  ///< Number of cached OSDMaps
+  uint64_t rocksdb_cache_size = 0;  ///< Cache for kv Db
+  double cache_kv_ratio = 0;        ///< Cache ratio dedicated to kv
+  double cache_inc_ratio = 0;       ///< Cache ratio dedicated to inc
+  double cache_full_ratio = 0;      ///< Cache ratio dedicated to full
+  uint64_t mon_memory_base = 0;     ///< Mon base memory for cache autotuning
+  double mon_memory_fragmentation = 0; ///< Expected memory fragmentation
+  uint64_t mon_memory_target = 0;   ///< Mon target memory for cache autotuning
+  uint64_t mon_memory_min = 0;      ///< Min memory to cache osdmaps
+  bool mon_memory_autotune = false; ///< Cache auto tune setting
+  int register_cache_with_pcm();
+  int _set_cache_sizes();
+  int _set_cache_ratios();
+  void _set_new_cache_sizes();
+  void _set_cache_autotuning();
+  int _update_mon_cache_settings();
+
+  friend struct OSDMemCache;
+  friend struct IncCache;
+  friend struct FullCache;
+
+  /**
+   * we haven't delegated full version stashing to paxosservice for some time
+   * now, making this function useless in current context.
+   */
+  void encode_full(MonitorDBStore::TransactionRef t) override { }
+  /**
+   * do not let paxosservice periodically stash full osdmaps, or we will break our
+   * locally-managed full maps.  (update_from_paxos loads the latest and writes them
+   * out going forward from there, but if we just synced that may mean we skip some.)
+   */
+  bool should_stash_full() override {
+    return false;
+  }
+
+  /**
+   * hook into trim to include the oldest full map in the trim transaction
+   *
+   * This ensures that anyone post-sync will have enough to rebuild their
+   * full osdmaps.
+   */
+  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
+
+  void update_msgr_features();
+  /**
+   * check if the cluster supports the features required by the
+   * given crush map. Outputs the daemons which don't support it
+   * to the stringstream.
+   *
+   * @returns true if the map is passable, false otherwise
+   */
+  bool validate_crush_against_features(const CrushWrapper *newcrush,
+				       std::stringstream &ss);
+  void check_osdmap_subs();
+  void share_map_with_random_osd();
+
+  ceph::mutex prime_pg_temp_lock =
+    ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
+  struct PrimeTempJob : public ParallelPGMapper::Job {  // mapper job that calls prime_pg_temp() for each pg in a range
+    OSDMonitor *osdmon;  ///< monitor service whose prime_pg_temp() is invoked
+    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
+      : ParallelPGMapper::Job(&om), osdmon(m) {}
+    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {  // handles ps in [ps_begin, ps_end)
+      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
+	pg_t pgid(ps, pool);
+	osdmon->prime_pg_temp(*osdmap, pgid);  // osdmap here is the Job's map pointer, presumably the one passed to the ctor -- TODO confirm
+      }
+    }
+    void process(const std::vector<pg_t>& pgs) override {}  // list form is a no-op for this job
+    void complete() override {}  // nothing to do on completion
+  };
+  void maybe_prime_pg_temp();
+  void prime_pg_temp(const OSDMap& next, pg_t pgid);
+
+  ParallelPGMapper mapper;                        ///< for background pg work
+  OSDMapMapping mapping;                          ///< pg <-> osd mappings
+  std::unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
+  void start_mapping();
+
+  void update_logger();
+
+  void handle_query(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+  bool should_propose(double &delay) override;
+
+  version_t get_trim_to() const override;
+
+  bool can_mark_down(int o);
+  bool can_mark_up(int o);
+  bool can_mark_out(int o);
+  bool can_mark_in(int o);
+
+  // ...
+  MOSDMap *build_latest_full(uint64_t features);
+  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
+  void send_full(MonOpRequestRef op);
+  void send_incremental(MonOpRequestRef op, epoch_t first);
+public:
+  /**
+   * Make sure the existing (up) OSDs support the given features
+   * @return 0 on success, or an error code if any OSDs are missing features.
+   * @param ss Filled in with an explanation of failure, if any
+   */
+  int check_cluster_features(uint64_t features, std::stringstream &ss);
+  // @param req an optional op request, if the osdmaps are replies to it. so
+  //            @c Monitor::send_reply() can mark_event with it.
+  void send_incremental(epoch_t first, MonSession *session, bool onetime,
+			MonOpRequestRef req = MonOpRequestRef());
+
+private:
+  void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const;
+
+  bool check_source(MonOpRequestRef op, uuid_d fsid);
+ 
+  bool preprocess_get_osdmap(MonOpRequestRef op);
+
+  bool preprocess_mark_me_down(MonOpRequestRef op);
+
+  friend class C_AckMarkedDown;
+  bool preprocess_failure(MonOpRequestRef op);
+  bool prepare_failure(MonOpRequestRef op);
+  bool prepare_mark_me_down(MonOpRequestRef op);
+  void process_failures();
+  void take_all_failures(std::list<MonOpRequestRef>& ls);
+
+  bool preprocess_mark_me_dead(MonOpRequestRef op);
+  bool prepare_mark_me_dead(MonOpRequestRef op);
+
+  bool preprocess_full(MonOpRequestRef op);
+  bool prepare_full(MonOpRequestRef op);
+
+  bool preprocess_boot(MonOpRequestRef op);
+  bool prepare_boot(MonOpRequestRef op);
+  void _booted(MonOpRequestRef op, bool logit);
+
+  void update_up_thru(int from, epoch_t up_thru);
+  bool preprocess_alive(MonOpRequestRef op);
+  bool prepare_alive(MonOpRequestRef op);
+  void _reply_map(MonOpRequestRef op, epoch_t e);
+
+  bool preprocess_pgtemp(MonOpRequestRef op);
+  bool prepare_pgtemp(MonOpRequestRef op);
+
+  bool preprocess_pg_created(MonOpRequestRef op);
+  bool prepare_pg_created(MonOpRequestRef op);
+
+  bool preprocess_pg_ready_to_merge(MonOpRequestRef op);
+  bool prepare_pg_ready_to_merge(MonOpRequestRef op);
+
+  int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss);
+  bool _check_become_tier(
+      int64_t tier_pool_id, const pg_pool_t *tier_pool,
+      int64_t base_pool_id, const pg_pool_t *base_pool,
+      int *err, std::ostream *ss) const;
+  bool _check_remove_tier(
+      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
+      int *err, std::ostream *ss) const;
+
+  int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake);
+  int _prepare_rename_pool(int64_t pool, std::string newname);
+
+  bool enforce_pool_op_caps(MonOpRequestRef op);
+  bool preprocess_pool_op (MonOpRequestRef op);
+  bool preprocess_pool_op_create (MonOpRequestRef op);
+  bool prepare_pool_op (MonOpRequestRef op);
+  bool prepare_pool_op_create (MonOpRequestRef op);
+  bool prepare_pool_op_delete(MonOpRequestRef op);
+  int crush_rename_bucket(const std::string& srcname,
+			  const std::string& dstname,
+			  std::ostream *ss);
+  void check_legacy_ec_plugin(const std::string& plugin, 
+			      const std::string& profile) const;
+  int normalize_profile(const std::string& profilename, 
+			ceph::ErasureCodeProfile &profile,
+			bool force,
+			std::ostream *ss);
+  int crush_rule_create_erasure(const std::string &name,
+				const std::string &profile,
+				int *rule,
+				std::ostream *ss);
+  int get_crush_rule(const std::string &rule_name,
+		     int *crush_rule,
+		     std::ostream *ss);
+  int get_erasure_code(const std::string &erasure_code_profile,
+		       ceph::ErasureCodeInterfaceRef *erasure_code,
+		       std::ostream *ss) const;
+  int prepare_pool_crush_rule(const unsigned pool_type,
+			      const std::string &erasure_code_profile,
+			      const std::string &rule_name,
+			      int *crush_rule,
+			      std::ostream *ss);
+  bool erasure_code_profile_in_use(
+    const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+    const std::string &profile,
+    std::ostream *ss);
+  int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile,
+				 std::map<std::string,std::string> *erasure_code_profile_map,
+				 std::ostream *ss);
+  int prepare_pool_size(const unsigned pool_type,
+			const std::string &erasure_code_profile,
+                        uint8_t repl_size,
+			unsigned *size, unsigned *min_size,
+			std::ostream *ss);
+  int prepare_pool_stripe_width(const unsigned pool_type,
+				const std::string &erasure_code_profile,
+				unsigned *stripe_width,
+				std::ostream *ss);
+  int check_pg_num(int64_t pool, int pg_num, int size, std::ostream* ss);
+  int prepare_new_pool(std::string& name,
+		       int crush_rule,
+		       const std::string &crush_rule_name,
+                       unsigned pg_num, unsigned pgp_num,
+		       unsigned pg_num_min,
+		       unsigned pg_num_max,
+                       uint64_t repl_size,
+		       const uint64_t target_size_bytes,
+		       const float target_size_ratio,
+		       const std::string &erasure_code_profile,
+                       const unsigned pool_type,
+                       const uint64_t expected_num_objects,
+                       FastReadType fast_read,
+		       const std::string& pg_autoscale_mode,
+		       bool bulk,
+		       std::ostream *ss);
+  int prepare_new_pool(MonOpRequestRef op);
+
+  void set_pool_flags(int64_t pool_id, uint64_t flags);
+  void clear_pool_flags(int64_t pool_id, uint64_t flags);
+  bool update_pools_status();
+
+  bool _is_removed_snap(int64_t pool_id, snapid_t snapid);
+  bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid);
+
+  std::string make_purged_snap_epoch_key(epoch_t epoch);
+  std::string make_purged_snap_key(int64_t pool, snapid_t snap);
+  std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
+				    epoch_t epoch, ceph::buffer::list *v);
+
+  bool try_prune_purged_snaps();
+  int lookup_purged_snap(int64_t pool, snapid_t snap,
+			 snapid_t *begin, snapid_t *end);
+
+  void insert_purged_snap_update(
+    int64_t pool,
+    snapid_t start, snapid_t end,
+    epoch_t epoch,
+    MonitorDBStore::TransactionRef t);
+
+  bool prepare_set_flag(MonOpRequestRef op, int flag);
+  bool prepare_unset_flag(MonOpRequestRef op, int flag);
+
+  void _pool_op_reply(MonOpRequestRef op,
+                      int ret, epoch_t epoch, ceph::buffer::list *blp=NULL);
+
+  /// Completion for a boot op: on success calls _booted(), on -EAGAIN
+  /// re-dispatches the op, on -ECANCELED does nothing, and aborts on any
+  /// other negative code.
+  struct C_Booted : public C_MonOp {
+    OSDMonitor *cmon;
+    bool logit;
+    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
+      C_MonOp(op_), cmon(cm), logit(l) {}
+    void _finish(int r) override {
+      if (r >= 0) {
+        cmon->_booted(op, logit);
+        return;
+      }
+      switch (r) {
+      case -ECANCELED:
+        return;
+      case -EAGAIN:
+        cmon->dispatch(op);
+        return;
+      default:
+        ceph_abort_msg("bad C_Booted return value");
+      }
+    }
+  };
+
+  /// Completion that replies with the map for epoch @c e: on success calls
+  /// _reply_map(), on -EAGAIN re-dispatches the op, on -ECANCELED does
+  /// nothing, and aborts on any other negative code.
+  struct C_ReplyMap : public C_MonOp {
+    OSDMonitor *osdmon;
+    epoch_t e;
+    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
+      : C_MonOp(op_), osdmon(o), e(ee) {}
+    void _finish(int r) override {
+      if (r >= 0) {
+        osdmon->_reply_map(op, e);
+        return;
+      }
+      switch (r) {
+      case -ECANCELED:
+        return;
+      case -EAGAIN:
+        osdmon->dispatch(op);
+        return;
+      default:
+        ceph_abort_msg("bad C_ReplyMap return value");
+      }
+    }
+  };
+  /// Completion for a pool op: on success sends the stored reply code,
+  /// epoch and payload via _pool_op_reply(); on -EAGAIN re-dispatches the
+  /// op; on -ECANCELED does nothing; aborts on any other negative code.
+  struct C_PoolOp : public C_MonOp {
+    OSDMonitor *osdmon;
+    int replyCode;
+    int epoch;
+    ceph::buffer::list reply_data;  ///< copy of the caller's payload, if one was given
+    C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=nullptr) :
+      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
+      if (rd != nullptr) {
+        reply_data = *rd;
+      }
+    }
+    void _finish(int r) override {
+      if (r >= 0) {
+        osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
+        return;
+      }
+      switch (r) {
+      case -ECANCELED:
+        return;
+      case -EAGAIN:
+        osdmon->dispatch(op);
+        return;
+      default:
+        ceph_abort_msg("bad C_PoolOp return value");
+      }
+    }
+  };
+
+  bool preprocess_remove_snaps(MonOpRequestRef op);
+  bool prepare_remove_snaps(MonOpRequestRef op);
+
+  bool preprocess_get_purged_snaps(MonOpRequestRef op);
+
+  int load_metadata(int osd, std::map<std::string, std::string>& m,
+		    std::ostream *err);
+  void count_metadata(const std::string& field, ceph::Formatter *f);
+
+  void reencode_incremental_map(ceph::buffer::list& bl, uint64_t features);
+  void reencode_full_map(ceph::buffer::list& bl, uint64_t features);
+public:
+  void count_metadata(const std::string& field, std::map<std::string,int> *out);
+  void get_versions(std::map<std::string, std::list<std::string>> &versions);
+protected:
+  int get_osd_objectstore_type(int osd, std::string *type);
+  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
+				       std::ostream *err);
+
+  // when we last received PG stats from each osd and the osd's osd_beacon_report_interval
+  std::map<int, std::pair<utime_t, int>> last_osd_report;
+  // TODO: use last_osd_report to store the osd report epochs, once we don't
+  //       need to upgrade from pre-luminous releases.
+  std::map<int,epoch_t> osd_epochs;
+  LastEpochClean last_epoch_clean;
+  bool preprocess_beacon(MonOpRequestRef op);
+  bool prepare_beacon(MonOpRequestRef op);
+  epoch_t get_min_last_epoch_clean() const;
+
+  friend class C_UpdateCreatingPGs;
+  std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch;
+  std::vector<pg_t> pending_created_pgs;
+  // the epoch when the pg mapping was calculated
+  epoch_t creating_pgs_epoch = 0;
+  creating_pgs_t creating_pgs;
+  mutable std::mutex creating_pgs_lock;
+
+  creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
+				    const OSDMap& nextmap);
+  unsigned scan_for_creating_pgs(
+    const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+    const mempool::osdmap::set<int64_t>& removed_pools,
+    utime_t modified,
+    creating_pgs_t* creating_pgs) const;
+  std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
+  void update_creating_pgs();
+  void check_pg_creates_subs();
+  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
+
+  int32_t _allocate_osd_id(int32_t* existing_id);
+
+  int get_grace_interval_threshold();
+  bool grace_interval_threshold_exceeded(int last_failed);
+  void set_default_laggy_params(int target_osd);
+
+public:
+  OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name);
+
+  void tick() override;  // check state, take actions
+
+  bool preprocess_command(MonOpRequestRef op);
+  bool prepare_command(MonOpRequestRef op);
+  bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
+
+  int validate_osd_create(
+      const int32_t id,
+      const uuid_d& uuid,
+      const bool check_osd_exists,
+      int32_t* existing_id,
+      std::stringstream& ss);
+  int prepare_command_osd_create(
+      const int32_t id,
+      const uuid_d& uuid,
+      int32_t* existing_id,
+      std::stringstream& ss);
+  void do_osd_create(const int32_t id, const uuid_d& uuid,
+		     const std::string& device_class,
+		     int32_t* new_id);
+  int prepare_command_osd_purge(int32_t id, std::stringstream& ss);
+  int prepare_command_osd_destroy(int32_t id, std::stringstream& ss);
+  int _prepare_command_osd_crush_remove(
+      CrushWrapper &newcrush,
+      int32_t id,
+      int32_t ancestor,
+      bool has_ancestor,
+      bool unlink_only);
+  void do_osd_crush_remove(CrushWrapper& newcrush);
+  int prepare_command_osd_crush_remove(
+      CrushWrapper &newcrush,
+      int32_t id,
+      int32_t ancestor,
+      bool has_ancestor,
+      bool unlink_only);
+  int prepare_command_osd_remove(int32_t id);
+  int prepare_command_osd_new(
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      const std::map<std::string,std::string>& secrets,
+      std::stringstream &ss,
+      ceph::Formatter *f);
+
+  int prepare_command_pool_set(const cmdmap_t& cmdmap,
+                               std::stringstream& ss);
+
+  int prepare_command_pool_application(const std::string &prefix,
+                                       const cmdmap_t& cmdmap,
+                                       std::stringstream& ss);
+  int preprocess_command_pool_application(const std::string &prefix,
+                                          const cmdmap_t& cmdmap,
+                                          std::stringstream& ss,
+                                          bool *modified);
+  int _command_pool_application(const std::string &prefix,
+				const cmdmap_t& cmdmap,
+				std::stringstream& ss,
+				bool *modified,
+				bool preparing);
+
+  bool handle_osd_timeouts(const utime_t &now,
+			   std::map<int, std::pair<utime_t, int>> &last_osd_report);
+
+  void send_latest(MonOpRequestRef op, epoch_t start=0);
+  // Record this function's name as an osdmon event on the op, then call
+  // send_incremental(op, start) directly. start defaults to 0.
+  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
+    op->mark_osdmon_event(__func__);
+    send_incremental(op, start);
+  }
+
+  int get_version(version_t ver, ceph::buffer::list& bl) override;
+  int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl);
+
+  int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl);
+  int get_version_full(version_t ver, ceph::buffer::list& bl) override;
+  int get_inc(version_t ver, OSDMap::Incremental& inc);
+  int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl);
+
+  epoch_t blocklist(const entity_addrvec_t& av, utime_t until);
+  epoch_t blocklist(entity_addr_t a, utime_t until);
+
+  void dump_info(ceph::Formatter *f);
+  int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err);
+  void print_nodes(ceph::Formatter *f);
+
+  void check_osdmap_sub(Subscription *sub);
+  void check_pg_creates_sub(Subscription *sub);
+
+  void do_application_enable(int64_t pool_id, const std::string &app_name,
+			     const std::string &app_key="",
+			     const std::string &app_value="",
+			     bool force=false);
+  void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt,
+		       pool_opts_t::value_t);
+
+  // Stage `flag` as set in pending_inc.new_flags, unless the current osdmap
+  // already has it set (in which case nothing is changed).
+  void add_flag(int flag) {
+    if (osdmap.flags & flag)
+      return;
+    // A negative new_flags is first seeded from the current map's flags
+    // (presumably negative means "no flag change staged yet" -- confirm).
+    if (pending_inc.new_flags < 0) {
+      pending_inc.new_flags = osdmap.flags;
+    }
+    pending_inc.new_flags |= flag;
+  }
+
+  // Stage `flag` as cleared in pending_inc.new_flags, unless the current
+  // osdmap does not have it set (in which case nothing is changed).
+  void remove_flag(int flag) {
+    if (!(osdmap.flags & flag))
+      return;
+    // A negative new_flags is first seeded from the current map's flags
+    // (presumably negative means "no flag change staged yet" -- confirm).
+    if (pending_inc.new_flags < 0) {
+      pending_inc.new_flags = osdmap.flags;
+    }
+    pending_inc.new_flags &= ~flag;
+  }
+  void convert_pool_priorities(void);
+  /**
+   * Find the pools which are requested to be put into stretch mode,
+   * validate that they are allowed to be in stretch mode (eg, are replicated)
+   * and place copies of them in the pools set.
+   * This does not make any changes to the pools or state; it's just
+   * a safety-check-and-collect function.
+   *
+   * The ss/okay/errcode parameters presumably follow the same convention as
+   * try_enable_stretch_mode() (error text, success flag, -errno) -- confirm
+   * against the definition.
+   * @param pools: output set that receives the collected pools
+   * @param new_crush_rule: the crush rule the pools are to be set to
+   */
+  void try_enable_stretch_mode_pools(std::stringstream& ss, bool *okay,
+				     int *errcode,
+				     std::set<pg_pool_t*>* pools,
+				     const std::string& new_crush_rule);
+  /**
+   * Check validity of inputs and OSD/CRUSH state to
+   * engage stretch mode. Designed to be used with
+   * MonmapMonitor::try_enable_stretch_mode() where we call both twice,
+   * first with commit=false to validate.
+   * @param ss: a stringstream to write errors into
+   * @param okay: Filled to true if okay, false if validation fails
+   * @param errcode: filled with -errno if there's a problem
+   * @param commit: true if we should commit the change, false if just testing
+   * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster
+   * @param bucket_count: The number of buckets required in peering.
+   *  Currently must be 2.
+   * @param pools: The pg_pool_ts which are being set to stretch mode (obtained
+   *   from try_enable_stretch_mode_pools()).
+   * @param new_crush_rule: The crush rule to set the pools to.
+   */
+  void try_enable_stretch_mode(std::stringstream& ss, bool *okay,
+			       int *errcode, bool commit,
+			       const std::string& dividing_bucket,
+			       uint32_t bucket_count,
+			       const std::set<pg_pool_t*>& pools,
+			       const std::string& new_crush_rule);
+  /**
+   * Check the input dead_buckets mapping (buckets->dead monitors) to see
+   * if the OSDs are also down. If so, fill in really_down_buckets and
+   * really_down_mons and return true; else return false.
+   * @param dead_buckets: bucket name -> names of the dead monitors in it
+   * @param really_down_buckets: output, filled only when returning true
+   * @param really_down_mons: output, filled only when returning true
+   */
+  bool check_for_dead_crush_zones(
+    const std::map<std::string,std::set<std::string>>& dead_buckets,
+    std::set<int> *really_down_buckets,
+    std::set<std::string> *really_down_mons);
+  /**
+   * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set
+   * and using the live_zones (should presently be size 1)
+   * @param dead_buckets: the buckets to add to the dead set
+   * @param live_zones: the zones that remain live
+   */
+  void trigger_degraded_stretch_mode(const std::set<int>& dead_buckets,
+				     const std::set<std::string>& live_zones);
+  /**
+   * This just maintains stretch_recovery_triggered (declared below).
+   */
+  void set_degraded_stretch_mode();
+  /**
+   * Set recovery stretch mode in the OSDMap, resetting pool size back to normal
+   */
+  void trigger_recovery_stretch_mode();
+  /**
+   * This just maintains stretch_recovery_triggered (declared below).
+   */
+  void set_recovery_stretch_mode();
+  /**
+   * This just maintains stretch_recovery_triggered (declared below).
+   */
+  void set_healthy_stretch_mode();
+  /**
+   * Tells the OSD there's a new pg digest, in case it's interested.
+   * (It's interested when in recovering stretch mode.)
+   */
+  void notify_new_pg_digest();
+  /**
+   * Check if we can exit recovery stretch mode and go back to normal.
+   * @param force If true, we will force the exit through once it is legal,
+   * without regard to the reported PG status.
+   */
+  void try_end_recovery_stretch_mode(bool force);
+  /**
+   * Sets the osdmap and pg_pool_t values back to healthy stretch mode status.
+   */
+  void trigger_healthy_stretch_mode();
+  /**
+   * Obtain the crush rule being used for stretch pools.
+   * Note that right now this is heuristic and simply selects the
+   * most-used rule on replicated stretch pools.
+   * @return the crush rule ID, or a negative errno
+   */
+  int get_replicated_stretch_crush_rule();
+private:
+  utime_t stretch_recovery_triggered; // what time we committed a switch to recovery mode
+};
+
+#endif
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
new file mode 100644
index 000000000..220317603
--- /dev/null
+++ b/src/mon/PGMap.cc
@@ -0,0 +1,4171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string.hpp>
+
+#include "PGMap.h"
+
+#define dout_subsys ceph_subsys_mon
+#include "common/debug.h"
+#include "common/Clock.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include "include/ceph_features.h"
+#include "include/stringify.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+#include <boost/range/adaptor/reversed.hpp>
+
+#define dout_context g_ceph_context
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::fixed_u_to_string;
+
+using TOPNSPC::common::cmd_getval;
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
+
+
+// ---------------------
+// PGMapDigest
+
+// Serialize the digest into bl. The struct version is chosen from the
+// peer's feature bits, and the fields are written in the same order in
+// which PGMapDigest::decode() reads them back.
+void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
+{
+  // NOTE: see PGMap::encode_digest
+  // Default to v4; a peer without SERVER_MIMIC gets v1, and a peer with
+  // MIMIC but without SERVER_NAUTILUS gets v3.
+  uint8_t v = 4;
+  if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+    v = 1;
+  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+    v = 3;
+  }
+  ENCODE_START(v, 1, bl);
+  encode(num_pg, bl);
+  encode(num_pg_active, bl);
+  encode(num_pg_unknown, bl);
+  encode(num_osd, bl);
+  encode(pg_pool_sum, bl, features);
+  encode(pg_sum, bl, features);
+  encode(osd_sum, bl, features);
+  if (v >= 2) {
+    encode(num_pg_by_state, bl);
+  } else {
+    // v1 layout: a 32-bit element count followed by one
+    // (state cast to int32_t, count) pair per entry.
+    uint32_t n = num_pg_by_state.size();
+    encode(n, bl);
+    for (auto p : num_pg_by_state) {
+      encode((int32_t)p.first, bl);
+      encode(p.second, bl);
+    }
+  }
+  encode(num_pg_by_osd, bl);
+  encode(num_pg_by_pool, bl);
+  encode(osd_last_seq, bl);
+  encode(per_pool_sum_delta, bl, features);
+  encode(per_pool_sum_deltas_stamps, bl);
+  encode(pg_sum_delta, bl, features);
+  encode(stamp_delta, bl);
+  encode(avail_space_by_rule, bl);
+  // NOTE(review): struct_v is not declared in this function; presumably it
+  // is introduced by ENCODE_START and equals v -- confirm.
+  if (struct_v >= 3) {
+    encode(purged_snaps, bl);
+  }
+  if (struct_v >= 4) {
+    encode(osd_sum_by_class, bl, features);
+  }
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize a digest from p. Fields are read in the same order in which
+// PGMapDigest::encode() writes them; fields added in later struct versions
+// are only read when struct_v says they are present.
+void PGMapDigest::decode(bufferlist::const_iterator& p)
+{
+  // NOTE(review): struct_v is presumably defined by DECODE_START from the
+  // encoded header -- confirm.
+  DECODE_START(4, p);
+  decode(num_pg, p);
+  decode(num_pg_active, p);
+  decode(num_pg_unknown, p);
+  decode(num_osd, p);
+  decode(pg_pool_sum, p);
+  decode(pg_sum, p);
+  decode(osd_sum, p);
+  if (struct_v >= 2) {
+    decode(num_pg_by_state, p);
+  } else {
+    // Before v2 the state keys were encoded as int32_t: read them into a
+    // temporary map and copy the entries over.
+    map<int32_t, int32_t> nps;
+    decode(nps, p);
+    num_pg_by_state.clear();
+    for (auto i : nps) {
+      num_pg_by_state[i.first] = i.second;
+    }
+  }
+  decode(num_pg_by_osd, p);
+  decode(num_pg_by_pool, p);
+  decode(osd_last_seq, p);
+  decode(per_pool_sum_delta, p);
+  decode(per_pool_sum_deltas_stamps, p);
+  decode(pg_sum_delta, p);
+  decode(stamp_delta, p);
+  decode(avail_space_by_rule, p);
+  if (struct_v >= 3) {
+    decode(purged_snaps, p);
+  }
+  if (struct_v >= 4) {
+    decode(osd_sum_by_class, p);
+  }
+  DECODE_FINISH(p);
+}
+
+// Dump the digest to f: the scalar totals first, then one section each for
+// the per-class OSD sums, per-pool stats, per-OSD sequence numbers, PG
+// counts by state and by OSD, and the purged snap intervals per pool.
+void PGMapDigest::dump(ceph::Formatter *f) const
+{
+  // scalar totals and overall sums
+  f->dump_unsigned("num_pg", num_pg);
+  f->dump_unsigned("num_pg_active", num_pg_active);
+  f->dump_unsigned("num_pg_unknown", num_pg_unknown);
+  f->dump_unsigned("num_osd", num_osd);
+  f->dump_object("pool_sum", pg_sum);
+  f->dump_object("osd_sum", osd_sum);
+
+  // OSD sums keyed by class name
+  f->open_object_section("osd_sum_by_class");
+  for (auto& by_class : osd_sum_by_class)
+    f->dump_object(by_class.first.c_str(), by_class.second);
+  f->close_section();
+
+  // per-pool stats; num_pg is only emitted for pools present in
+  // num_pg_by_pool
+  f->open_array_section("pool_stats");
+  for (auto& pool : pg_pool_sum) {
+    f->open_object_section("pool_stat");
+    f->dump_int("poolid", pool.first);
+    if (auto npg = num_pg_by_pool.find(pool.first);
+        npg != num_pg_by_pool.end()) {
+      f->dump_unsigned("num_pg", npg->second);
+    }
+    pool.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  // per-OSD sequence numbers; the OSD id is the position in osd_last_seq
+  f->open_array_section("osd_stats");
+  // TODO: this isn't really correct since we can dump non-existent OSDs
+  // I dunno what osd_last_seq is set to in that case...
+  int osd_id = 0;
+  for (auto& seq : osd_last_seq) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", osd_id++);
+    f->dump_unsigned("seq", seq);
+    f->close_section();
+  }
+  f->close_section();
+
+  // PG counts by state
+  f->open_array_section("num_pg_by_state");
+  for (auto& by_state : num_pg_by_state) {
+    f->open_object_section("count");
+    f->dump_string("state", pg_state_string(by_state.first));
+    f->dump_unsigned("num", by_state.second);
+    f->close_section();
+  }
+  f->close_section();
+
+  // PG counts by OSD
+  f->open_array_section("num_pg_by_osd");
+  for (auto& by_osd : num_pg_by_osd) {
+    const auto& counts = by_osd.second;
+    f->open_object_section("count");
+    f->dump_unsigned("osd", by_osd.first);
+    f->dump_unsigned("num_primary_pg", counts.primary);
+    f->dump_unsigned("num_acting_pg", counts.acting);
+    f->dump_unsigned("num_up_not_acting_pg", counts.up_not_acting);
+    f->close_section();
+  }
+  f->close_section();
+
+  // purged snap intervals, grouped by pool
+  f->open_array_section("purged_snaps");
+  for (auto& pool_snaps : purged_snaps) {
+    auto& intervals = pool_snaps.second;
+    f->open_object_section("pool");
+    f->dump_int("pool", pool_snaps.first);
+    f->open_object_section("purged_snaps");
+    for (auto iv = intervals.begin(); iv != intervals.end(); ++iv) {
+      f->open_object_section("interval");
+      f->dump_stream("start") << iv.get_start();
+      f->dump_stream("length") << iv.get_len();
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append a single default-constructed digest to ls.
+void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
+{
+  PGMapDigest *empty_digest = new PGMapDigest;
+  ls.push_back(empty_digest);
+}
+
+// Format a with exactly two decimals; any value below 0.01 (including
+// negative values) is rendered as "0".
+inline std::string percentify(const float& a) {
+  if (a < 0.01)
+    return "0";
+  std::stringstream formatted;
+  formatted << std::fixed << std::setprecision(2) << a;
+  return formatted.str();
+}
+
+// Emit the multi-line status summary. With a Formatter the data is dumped
+// to f and out is not touched; without one, the text form is written to
+// *out.
+void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
+{
+  // Index the states by PG count; walking this multimap backwards yields
+  // the states in descending numeric order (by count).
+  std::multimap<int,uint64_t> state_by_count;  // count -> state
+  for (const auto& st : num_pg_by_state) {
+    state_by_count.insert(make_pair(st.second, st.first));
+  }
+
+  if (f) {
+    f->open_array_section("pgs_by_state");
+    for (auto it = state_by_count.rbegin(); it != state_by_count.rend(); ++it) {
+      f->open_object_section("pgs_by_state_element");
+      f->dump_string("state_name", pg_state_string(it->second));
+      f->dump_unsigned("count", it->first);
+      f->close_section();
+    }
+    f->close_section();
+
+    f->dump_unsigned("num_pgs", num_pg);
+    f->dump_unsigned("num_pools", pg_pool_sum.size());
+    f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
+    f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
+    f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
+    f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
+    f->dump_unsigned("bytes_total", osd_sum.statfs.total);
+  } else {
+    *out << "    pools:   " << pg_pool_sum.size() << " pools, "
+         << num_pg << " pgs\n";
+    *out << "    objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
+         << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
+    *out << "    usage:   "
+         << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
+         << byte_u_t(osd_sum.statfs.available) << " / "
+         << byte_u_t(osd_sum.statfs.total) << " avail\n";
+    *out << "    pgs:     ";
+  }
+
+  // Text mode only: the first line after "pgs:" continues that line; every
+  // later line is indented to line up underneath it.
+  bool pad = false;
+  auto indent = [&pad, out]() {
+    if (pad) {
+      *out << "             ";
+    }
+    pad = true;
+  };
+
+  if (num_pg_unknown > 0) {
+    float ratio = (float)num_pg_unknown / (float)num_pg;
+    if (f) {
+      f->dump_float("unknown_pgs_ratio", ratio);
+    } else {
+      char pct[20];
+      snprintf(pct, sizeof(pct), "%.3lf", ratio * 100.0);
+      *out << pct << "% pgs unknown\n";
+      pad = true;
+    }
+  }
+
+  int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
+  if (num_pg_inactive > 0) {
+    float ratio = (float)num_pg_inactive / (float)num_pg;
+    if (f) {
+      f->dump_float("inactive_pgs_ratio", ratio);
+    } else {
+      indent();
+      char pct[20];
+      snprintf(pct, sizeof(pct), "%.3f", ratio * 100.0);
+      *out << pct << "% pgs not active\n";
+    }
+  }
+
+  // Recovery summary: dumped to f when present, otherwise collected as
+  // text lines.
+  list<string> recovery_lines;
+  overall_recovery_summary(f, &recovery_lines);
+  if (!f) {
+    for (const auto& line : recovery_lines) {
+      indent();
+      *out << line << "\n";
+    }
+  }
+  recovery_lines.clear();
+
+  if (!f) {
+    // Width of the widest count, so the state names line up.
+    unsigned max_width = 1;
+    for (auto it = state_by_count.rbegin(); it != state_by_count.rend(); ++it) {
+      std::stringstream count_text;
+      count_text << it->first;
+      max_width = std::max<size_t>(count_text.str().size(), max_width);
+    }
+
+    for (auto it = state_by_count.rbegin(); it != state_by_count.rend(); ++it) {
+      indent();
+      out->setf(std::ios::left);
+      *out << std::setw(max_width) << it->first
+           << " " << pg_state_string(it->second) << "\n";
+      out->unsetf(std::ios::left);
+    }
+  }
+
+  // The three rate summaries are always invoked, in this order, so that a
+  // Formatter receives their fields; the text is only used when !f.
+  ostringstream ss_rec_io;
+  overall_recovery_rate_summary(f, &ss_rec_io);
+  ostringstream ss_client_io;
+  overall_client_io_rate_summary(f, &ss_client_io);
+  ostringstream ss_cache_io;
+  overall_cache_io_rate_summary(f, &ss_cache_io);
+
+  if (f)
+    return;
+
+  const string client_io = ss_client_io.str();
+  const string rec_io = ss_rec_io.str();
+  const string cache_io = ss_cache_io.str();
+
+  if (!client_io.empty() || !rec_io.empty() || !cache_io.empty()) {
+    *out << "\n \n";
+    *out << "  io:\n";
+  }
+  if (!client_io.empty())
+    *out << "    client:   " << client_io << "\n";
+  if (!rec_io.empty())
+    *out << "    recovery: " << rec_io << "\n";
+  if (!cache_io.empty())
+    *out << "    cache:    " << cache_io << "\n";
+}
+
+// Emit a one-line summary. f and out are handled independently: whichever
+// of them is non-null receives the data (structured for f, text for out).
+void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
+{
+  // Build the "<count> <state>, ..." text while optionally dumping the
+  // same entries to the Formatter.
+  std::stringstream states_text;
+  if (f)
+    f->open_array_section("num_pg_by_state");
+  bool first_state = true;
+  for (const auto& st : num_pg_by_state) {
+    if (f) {
+      f->open_object_section("state");
+      f->dump_string("name", pg_state_string(st.first));
+      f->dump_unsigned("num", st.second);
+      f->close_section();
+    }
+    if (!first_state)
+      states_text << ", ";
+    first_state = false;
+    states_text << st.second << " " << pg_state_string(st.first);
+  }
+  if (f)
+    f->close_section();
+
+  string states = states_text.str();
+  if (out) {
+    *out << num_pg << " pgs: "
+         << states << "; "
+         << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
+         << byte_u_t(osd_sum.statfs.get_used()) << " used, "
+         << byte_u_t(osd_sum.statfs.available) << " / "
+         << byte_u_t(osd_sum.statfs.total) << " avail";
+  }
+  if (f) {
+    f->dump_unsigned("num_pgs", num_pg);
+    f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
+    f->dump_int("total_bytes", osd_sum.statfs.total);
+    f->dump_int("total_avail_bytes", osd_sum.statfs.available);
+    f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
+    f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
+  }
+
+  // make non-negative; we can get negative values if osds send
+  // uncommitted stats and then "go backward" or if they are just
+  // buggy/wrong.
+  pool_stat_t pos_delta = pg_sum_delta;
+  pos_delta.floor(0);
+  const auto& io = pos_delta.stats.sum;
+  if (io.num_rd || io.num_wr) {
+    if (out)
+      *out << "; ";
+    if (io.num_rd) {
+      // KB -> bytes, divided by the length of the sampling interval
+      int64_t rd = (io.num_rd_kb << 10) / (double)stamp_delta;
+      if (out)
+        *out << byte_u_t(rd) << "/s rd, ";
+      if (f)
+        f->dump_unsigned("read_bytes_sec", rd);
+    }
+    if (io.num_wr) {
+      int64_t wr = (io.num_wr_kb << 10) / (double)stamp_delta;
+      if (out)
+        *out << byte_u_t(wr) << "/s wr, ";
+      if (f)
+        f->dump_unsigned("write_bytes_sec", wr);
+    }
+    int64_t iops = (io.num_rd + io.num_wr) / (double)stamp_delta;
+    if (out)
+      *out << si_u_t(iops) << " op/s";
+    if (f)
+      f->dump_unsigned("io_sec", iops);
+  }
+
+  // Recovery state and rate are appended as further "; "-separated items.
+  list<string> recovery_lines;
+  overall_recovery_summary(f, &recovery_lines);
+  if (out) {
+    for (const auto& line : recovery_lines)
+      *out << "; " << line;
+  }
+  std::stringstream recovery_rate;
+  overall_recovery_rate_summary(f, &recovery_rate);
+  if (out && recovery_rate.str().length())
+    *out << "; " << recovery_rate.str() << " recovering";
+}
+
+// Fill the four output ratios from the digest totals. The object ratios
+// are relative to num_object_copies and are 0 unless both the numerator is
+// non-zero and the copy count is positive; the PG ratios are relative to
+// num_pg and are 0 when num_pg is not positive.
+void PGMapDigest::get_recovery_stats(
+    double *misplaced_ratio,
+    double *degraded_ratio,
+    double *inactive_pgs_ratio,
+    double *unknown_pgs_ratio) const
+{
+  const auto& sum = pg_sum.stats.sum;
+  const bool have_copies = sum.num_object_copies > 0;
+
+  *degraded_ratio = (sum.num_objects_degraded && have_copies)
+    ? (double)sum.num_objects_degraded / (double)sum.num_object_copies
+    : 0;
+  *misplaced_ratio = (sum.num_objects_misplaced && have_copies)
+    ? (double)sum.num_objects_misplaced / (double)sum.num_object_copies
+    : 0;
+
+  if (num_pg <= 0) {
+    *inactive_pgs_ratio = 0;
+    *unknown_pgs_ratio = 0;
+    return;
+  }
+  // inactive = everything that is neither active nor unknown
+  int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
+  *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
+  *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
+}
+
+// Report degraded, misplaced and unfound object counts from pool_sum.
+// With a Formatter each non-empty category is dumped as count/total/ratio
+// fields; otherwise one text line per category is appended to *psl.
+void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
+                             const pool_stat_t& pool_sum) const
+{
+  // Emit one category. `what` is the text placed between the total and the
+  // percentage, e.g. " objects degraded (".
+  auto report = [f, psl](auto count, auto total,
+                         const char *key_count,
+                         const char *key_total,
+                         const char *key_ratio,
+                         const char *what) {
+    const double pc = (double)count / (double)total * (double)100.0;
+    if (f) {
+      f->dump_unsigned(key_count, count);
+      f->dump_unsigned(key_total, total);
+      f->dump_float(key_ratio, pc / 100.0);
+      return;
+    }
+    char pct[20];
+    snprintf(pct, sizeof(pct), "%.3lf", pc);
+    ostringstream line;
+    line << count << "/" << total << what << pct << "%)";
+    psl->push_back(line.str());
+  };
+
+  const auto& sum = pool_sum.stats.sum;
+
+  // degraded and misplaced are relative to the number of object copies
+  if (sum.num_objects_degraded && sum.num_object_copies > 0) {
+    report(sum.num_objects_degraded, sum.num_object_copies,
+           "degraded_objects", "degraded_total", "degraded_ratio",
+           " objects degraded (");
+  }
+  if (sum.num_objects_misplaced && sum.num_object_copies > 0) {
+    report(sum.num_objects_misplaced, sum.num_object_copies,
+           "misplaced_objects", "misplaced_total", "misplaced_ratio",
+           " objects misplaced (");
+  }
+  // unfound is relative to the number of objects
+  if (sum.num_objects_unfound && sum.num_objects) {
+    report(sum.num_objects_unfound, sum.num_objects,
+           "unfound_objects", "unfound_total", "unfound_ratio",
+           " objects unfound (");
+  }
+}
+
+// Report recovery throughput (objects, bytes and keys per second) computed
+// from delta_sum over delta_stamp. Nothing is emitted when no objects,
+// bytes or keys were recovered. Output goes to f when set, else to *out.
+void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
+                                  const pool_stat_t& delta_sum,
+                                  utime_t delta_stamp) const
+{
+  // make non-negative; we can get negative values if osds send
+  // uncommitted stats and then "go backward" or if they are just
+  // buggy/wrong.
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+
+  const auto& rec = pos_delta.stats.sum;
+  const bool nothing_recovered = !rec.num_objects_recovered &&
+                                 !rec.num_bytes_recovered &&
+                                 !rec.num_keys_recovered;
+  if (nothing_recovered)
+    return;
+
+  int64_t objps = rec.num_objects_recovered / (double)delta_stamp;
+  int64_t bps = rec.num_bytes_recovered / (double)delta_stamp;
+  int64_t kps = rec.num_keys_recovered / (double)delta_stamp;
+
+  if (f) {
+    f->dump_int("recovering_objects_per_sec", objps);
+    f->dump_int("recovering_bytes_per_sec", bps);
+    f->dump_int("recovering_keys_per_sec", kps);
+    f->dump_int("num_objects_recovered", rec.num_objects_recovered);
+    f->dump_int("num_bytes_recovered", rec.num_bytes_recovered);
+    f->dump_int("num_keys_recovered", rec.num_keys_recovered);
+    return;
+  }
+
+  // text form: bytes/s, then keys/s only if any keys were recovered, then
+  // objects/s
+  *out << byte_u_t(bps) << "/s";
+  if (rec.num_keys_recovered) {
+    *out << ", " << si_u_t(kps) << " keys/s";
+  }
+  *out << ", " << si_u_t(objps) << " objects/s";
+}
+
+// Forwards the digest's pg_sum_delta and stamp_delta to
+// recovery_rate_summary().
+void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
+{
+  recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+// Forwards the digest's pg_sum to recovery_summary().
+void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
+{
+  recovery_summary(f, psl, pg_sum);
+}
+
+// Recovery rate for a single pool. Does nothing when the pool has no entry
+// in per_pool_sum_delta; a pool that has one must also have an entry in
+// per_pool_sum_deltas_stamps (asserted).
+void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
+                                       uint64_t poolid) const
+{
+  const auto delta = per_pool_sum_delta.find(poolid);
+  if (delta != per_pool_sum_delta.end()) {
+    const auto stamp = per_pool_sum_deltas_stamps.find(delta->first);
+    ceph_assert(stamp != per_pool_sum_deltas_stamps.end());
+    recovery_rate_summary(f, out, delta->second.first, stamp->second);
+  }
+}
+
+// Recovery summary for a single pool. Does nothing when the pool has no
+// entry in pg_pool_sum.
+void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
+                                  uint64_t poolid) const
+{
+  const auto pool = pg_pool_sum.find(poolid);
+  if (pool != pg_pool_sum.end()) {
+    recovery_summary(f, psl, pool->second);
+  }
+}
+
+// Report client read/write throughput and op rates computed from delta_sum
+// over delta_stamp. Nothing is emitted when there were no reads and no
+// writes. Output goes to f when set, else to *out.
+void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                   const pool_stat_t& delta_sum,
+                                   utime_t delta_stamp) const
+{
+  // negative deltas are floored to 0 before use
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+
+  const auto& io = pos_delta.stats.sum;
+  if (!io.num_rd && !io.num_wr)
+    return;
+
+  if (io.num_rd) {
+    // KB -> bytes, per second
+    int64_t rd = (io.num_rd_kb << 10) / (double)delta_stamp;
+    if (f)
+      f->dump_int("read_bytes_sec", rd);
+    else
+      *out << byte_u_t(rd) << "/s rd, ";
+  }
+  if (io.num_wr) {
+    int64_t wr = (io.num_wr_kb << 10) / (double)delta_stamp;
+    if (f)
+      f->dump_int("write_bytes_sec", wr);
+    else
+      *out << byte_u_t(wr) << "/s wr, ";
+  }
+
+  // op rates are always reported once there was any I/O
+  int64_t iops_rd = io.num_rd / (double)delta_stamp;
+  int64_t iops_wr = io.num_wr / (double)delta_stamp;
+  if (f) {
+    f->dump_int("read_op_per_sec", iops_rd);
+    f->dump_int("write_op_per_sec", iops_wr);
+    return;
+  }
+  *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
+}
+
+// Forwards the digest's pg_sum_delta and stamp_delta to
+// client_io_rate_summary().
+void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
+{
+  client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+// Client I/O rates for a single pool. Does nothing when the pool has no
+// entry in per_pool_sum_delta; a pool that has one must also have an entry
+// in per_pool_sum_deltas_stamps (asserted).
+void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                        uint64_t poolid) const
+{
+  const auto delta = per_pool_sum_delta.find(poolid);
+  if (delta != per_pool_sum_delta.end()) {
+    const auto stamp = per_pool_sum_deltas_stamps.find(delta->first);
+    ceph_assert(stamp != per_pool_sum_deltas_stamps.end());
+    client_io_rate_summary(f, out, delta->second.first, stamp->second);
+  }
+}
+
+// Report cache-tier activity computed from delta_sum over delta_stamp:
+// flush/evict byte rates, promote op rate, and the flush/evict mode
+// counters. Only non-zero items are emitted. With a Formatter each item is
+// a separate field; in text form the items are joined with ", ".
+void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                  const pool_stat_t& delta_sum,
+                                  utime_t delta_stamp) const
+{
+  // negative deltas are floored to 0 before use
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  const auto& cache = pos_delta.stats.sum;
+
+  // Text mode only: returns *out, having written the ", " separator first
+  // for every item except the first one emitted.
+  bool have_output = false;
+  auto next_item = [&have_output, out]() -> ostream& {
+    if (have_output)
+      *out << ", ";
+    have_output = true;
+    return *out;
+  };
+
+  if (cache.num_flush) {
+    // KB -> bytes, per second
+    int64_t flush = (cache.num_flush_kb << 10) / (double)delta_stamp;
+    if (f)
+      f->dump_int("flush_bytes_sec", flush);
+    else
+      next_item() << byte_u_t(flush) << "/s flush";
+  }
+  if (cache.num_evict) {
+    int64_t evict = (cache.num_evict_kb << 10) / (double)delta_stamp;
+    if (f)
+      f->dump_int("evict_bytes_sec", evict);
+    else
+      next_item() << byte_u_t(evict) << "/s evict";
+  }
+  if (cache.num_promote) {
+    int64_t promote = cache.num_promote / (double)delta_stamp;
+    if (f)
+      f->dump_int("promote_op_per_sec", promote);
+    else
+      next_item() << si_u_t(promote) << " op/s promote";
+  }
+
+  // mode counters are reported as-is, not as rates
+  if (cache.num_flush_mode_low) {
+    if (f)
+      f->dump_int("num_flush_mode_low", cache.num_flush_mode_low);
+    else
+      next_item() << si_u_t(cache.num_flush_mode_low) << " PGs flushing";
+  }
+  if (cache.num_flush_mode_high) {
+    if (f)
+      f->dump_int("num_flush_mode_high", cache.num_flush_mode_high);
+    else
+      next_item() << si_u_t(cache.num_flush_mode_high) << " PGs flushing (high)";
+  }
+  if (cache.num_evict_mode_some) {
+    if (f)
+      f->dump_int("num_evict_mode_some", cache.num_evict_mode_some);
+    else
+      next_item() << si_u_t(cache.num_evict_mode_some) << " PGs evicting";
+  }
+  if (cache.num_evict_mode_full) {
+    if (f)
+      f->dump_int("num_evict_mode_full", cache.num_evict_mode_full);
+    else
+      next_item() << si_u_t(cache.num_evict_mode_full) << " PGs evicting (full)";
+  }
+}
+
+// Cluster-wide cache IO summary: hands the aggregate PG stat delta
+// (pg_sum_delta) and its time span (stamp_delta) to cache_io_rate_summary().
+void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
+{
+  const pool_stat_t& cluster_delta = pg_sum_delta;
+  cache_io_rate_summary(f, out, cluster_delta, stamp_delta);
+}
+
+// Per-pool cache IO summary.
+//
+// Emits nothing when no delta is recorded for poolid.  A pool that has a
+// delta is required (asserted) to also have a delta time stamp.
+void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
+                                       uint64_t poolid) const
+{
+  const auto delta = per_pool_sum_delta.find(poolid);
+  if (delta != per_pool_sum_delta.end()) {
+    const auto ts = per_pool_sum_deltas_stamps.find(delta->first);
+    ceph_assert(ts != per_pool_sum_deltas_stamps.end());
+    cache_io_rate_summary(f, out, delta->second.first, ts->second);
+  }
+}
+
+// Build a ceph_statfs for either a single pool or the whole cluster.
+//
+// When data_pool is set and that pool has an entry in pg_pool_sum, usage is
+// taken from the pool's stat sum and the available space comes from
+// get_pool_free_space(); in every other case the cluster-wide OSD totals are
+// used.  Sizes are reported in KB.
+ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
+                                    boost::optional<int64_t> data_pool) const
+{
+  ceph_statfs result;
+
+  // non-null only when a specific, known pool was requested
+  const pool_stat_t *pool_stat = nullptr;
+  if (data_pool) {
+    const auto found = pg_pool_sum.find(*data_pool);
+    if (found != pg_pool_sum.end())
+      pool_stat = &found->second;
+  }
+
+  if (pool_stat != nullptr) {
+    const object_stat_sum_t& sum = pool_stat->stats.sum;
+    result.kb_used = (sum.num_bytes >> 10);
+    result.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
+    result.num_objects = sum.num_objects;
+    result.kb = result.kb_used + result.kb_avail;
+  } else {
+    // these are in KB.
+    result.kb = osd_sum.statfs.kb();
+    result.kb_used = osd_sum.statfs.kb_used_raw();
+    result.kb_avail = osd_sum.statfs.kb_avail();
+    result.num_objects = pg_sum.stats.sum.num_objects;
+  }
+
+  return result;
+}
+
+// Dump usage statistics for every pool that exists in osd_map and has an
+// entry in pg_pool_sum.
+//
+// @param osd_map  source of pool names, pg counts, CRUSH rules and raw-used
+//                 rates
+// @param ss       plain-text target; must be non-null when f is null
+// @param f        structured target; when non-null a "pools" array is
+//                 emitted and ss is not used
+// @param verbose  adds the data/omap split, quota, dirty and compression
+//                 details
+void PGMapDigest::dump_pool_stats_full(
+  const OSDMap &osd_map,
+  stringstream *ss,
+  ceph::Formatter *f,
+  bool verbose) const
+{
+  TextTable tbl;
+
+  if (f) {
+    f->open_array_section("pools");
+  } else {
+    // every column except POOL is right-aligned (heading and body)
+    auto right_col = [&tbl](const char *heading) {
+      tbl.define_column(heading, TextTable::RIGHT, TextTable::RIGHT);
+    };
+
+    tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
+    right_col("ID");
+    right_col("PGS");
+    right_col("STORED");
+    if (verbose) {
+      right_col("(DATA)");
+      right_col("(OMAP)");
+    }
+    right_col("OBJECTS");
+    right_col("USED");
+    if (verbose) {
+      right_col("(DATA)");
+      right_col("(OMAP)");
+    }
+    right_col("%USED");
+    right_col("MAX AVAIL");
+
+    if (verbose) {
+      right_col("QUOTA OBJECTS");
+      right_col("QUOTA BYTES");
+      right_col("DIRTY");
+      right_col("USED COMPR");
+      right_col("UNDER COMPR");
+    }
+  }
+
+  // available space per CRUSH rule, looked up once per rule
+  map<int,uint64_t> avail_by_rule;
+  for (const auto& pool_entry : osd_map.get_pools()) {
+    const int64_t pool_id = pool_entry.first;
+    if (pool_id < 0 || pg_pool_sum.count(pool_id) == 0)
+      continue;
+
+    const string& pool_name = osd_map.get_pool_name(pool_id);
+    auto pool_pg_num = osd_map.get_pg_num(pool_id);
+    const pool_stat_t &stat = pg_pool_sum.at(pool_id);
+
+    const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
+    int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
+                                         pool->get_type(),
+                                         pool->get_size());
+
+    auto cached = avail_by_rule.find(ruleno);
+    if (cached == avail_by_rule.end()) {
+      // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
+      int64_t fresh = get_rule_avail(ruleno);
+      if (fresh < 0)
+        fresh = 0;  // negative means the lookup failed; report no space
+      cached = avail_by_rule.emplace(ruleno, fresh).first;
+    }
+    int64_t avail = cached->second;
+
+    if (f) {
+      f->open_object_section("pool");
+      f->dump_string("name", pool_name);
+      f->dump_int("id", pool_id);
+      f->open_object_section("stats");
+    } else {
+      tbl << pool_name
+          << pool_id
+          << pool_pg_num;
+    }
+
+    float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
+    bool per_pool = use_per_pool_stats();
+    bool per_pool_omap = use_per_pool_omap_stats();
+    dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
+                         per_pool_omap, pool);
+
+    if (f) {
+      f->close_section();  // stats
+      f->close_section();  // pool
+    } else {
+      tbl << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+  } else {
+    ceph_assert(ss != nullptr);
+    *ss << "--- POOLS ---\n";
+    *ss << tbl;
+  }
+}
+
+// Dump raw storage totals, overall and per device class.
+//
+// @param ss       plain-text target; must be non-null when f is null
+// @param f        structured target; when non-null a "stats" object and a
+//                 "stats_by_class" object are emitted and ss is not used
+// @param verbose  not consulted by this function
+void PGMapDigest::dump_cluster_stats(stringstream *ss,
+                                     ceph::Formatter *f,
+                                     bool verbose) const
+{
+  if (f) {
+    // the five capacity figures shared by the total and per-class sections
+    auto dump_capacity = [f](const auto& fs) {
+      f->dump_int("total_bytes", fs.total);
+      f->dump_int("total_avail_bytes", fs.available);
+      f->dump_int("total_used_bytes", fs.get_used());
+      f->dump_int("total_used_raw_bytes", fs.get_used_raw());
+      f->dump_float("total_used_raw_ratio", fs.get_used_raw_ratio());
+    };
+
+    f->open_object_section("stats");
+    dump_capacity(osd_sum.statfs);
+    f->dump_unsigned("num_osds", osd_sum.num_osds);
+    f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
+    f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
+    f->close_section();
+
+    f->open_object_section("stats_by_class");
+    for (auto& by_class : osd_sum_by_class) {
+      f->open_object_section(by_class.first.c_str());
+      dump_capacity(by_class.second.statfs);
+      f->close_section();
+    }
+    f->close_section();
+    return;
+  }
+
+  ceph_assert(ss != nullptr);
+  TextTable tbl;
+  tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
+  tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
+  tbl.define_column("AVAIL", TextTable::RIGHT, TextTable::RIGHT);
+  tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+  tbl.define_column("RAW USED", TextTable::RIGHT, TextTable::RIGHT);
+  tbl.define_column("%RAW USED", TextTable::RIGHT, TextTable::RIGHT);
+
+  // one table row: label followed by the formatted capacity figures
+  auto add_row = [&tbl](const auto& label, const auto& fs) {
+    tbl << label;
+    tbl << stringify(byte_u_t(fs.total))
+        << stringify(byte_u_t(fs.available))
+        << stringify(byte_u_t(fs.get_used()))
+        << stringify(byte_u_t(fs.get_used_raw()))
+        << percentify(fs.get_used_raw_ratio()*100.0)
+        << TextTable::endrow;
+  };
+
+  for (auto& by_class : osd_sum_by_class)
+    add_row(by_class.first, by_class.second.statfs);
+  add_row("TOTAL", osd_sum.statfs);
+
+  *ss << "--- RAW STORAGE ---\n";
+  *ss << tbl;
+}
+
+// Emit the usage figures of one pool, either as formatter fields or as
+// cells appended to the current row of tbl.
+//
+// @param tbl            text table receiving the cells when f is null
+// @param f              structured target; when non-null tbl is not touched
+// @param pool_stat      the pool's accumulated statistics
+// @param avail          raw space available to the pool
+// @param raw_used_rate  raw-to-user space ratio of the pool; scaled here by
+//                       the fraction of object copies that are not degraded
+// @param verbose        adds the data/omap split, quota, dirty, IO and
+//                       compression details
+// @param per_pool       forwarded to the pool_stat data accessors
+// @param per_pool_omap  forwarded to the pool_stat omap accessors
+// @param pool           pool description; only dereferenced when verbose
+void PGMapDigest::dump_object_stat_sum(
+  TextTable &tbl, ceph::Formatter *f,
+  const pool_stat_t &pool_stat, uint64_t avail,
+  float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
+  const pg_pool_t *pool)
+{
+  const object_stat_sum_t &sum = pool_stat.stats.sum;
+  // bind by reference: pool_stat outlives this call, no copy is needed
+  const store_statfs_t &statfs = pool_stat.store_stats;
+
+  if (sum.num_object_copies > 0) {
+    raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
+  }
+
+  uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
+  uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
+  uint64_t used_bytes = used_data_bytes + used_omap_bytes;
+
+  float used = 0.0;
+  // note avail passed in is raw_avail, calc raw_used here.
+  if (avail) {
+    used = used_bytes;
+    used /= used + avail;
+  } else if (used_bytes) {
+    // nothing available but something stored: report the pool as full
+    used = 1.0;
+  }
+  // a zero rate would divide by zero; report no available space instead
+  auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
+  // an approximation for actually stored user data
+  auto stored_data_normalized = pool_stat.get_user_data_bytes(
+    raw_used_rate, per_pool);
+  auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
+    raw_used_rate, per_pool_omap);
+  auto stored_normalized = stored_data_normalized + stored_omap_normalized;
+  // same, amplified by replication or EC
+  auto stored_raw = stored_normalized * raw_used_rate;
+  if (f) {
+    f->dump_int("stored", stored_normalized);
+    if (verbose) {
+      f->dump_int("stored_data", stored_data_normalized);
+      f->dump_int("stored_omap", stored_omap_normalized);
+    }
+    f->dump_int("objects", sum.num_objects);
+    f->dump_int("kb_used", shift_round_up(used_bytes, 10));
+    f->dump_int("bytes_used", used_bytes);
+    if (verbose) {
+      f->dump_int("data_bytes_used", used_data_bytes);
+      f->dump_int("omap_bytes_used", used_omap_bytes);
+    }
+    f->dump_float("percent_used", used);
+    f->dump_unsigned("max_avail", avail_res);
+    if (verbose) {
+      f->dump_int("quota_objects", pool->quota_max_objects);
+      f->dump_int("quota_bytes", pool->quota_max_bytes);
+      // the dirty count is only reported for tier pools
+      if (pool->is_tier()) {
+        f->dump_int("dirty", sum.num_objects_dirty);
+      } else {
+        f->dump_int("dirty", 0);
+      }
+      f->dump_int("rd", sum.num_rd);
+      f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
+      f->dump_int("wr", sum.num_wr);
+      f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
+      f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
+      f->dump_int("compress_under_bytes", statfs.data_compressed_original);
+      // Stored by user amplified by replication
+      f->dump_int("stored_raw", stored_raw);
+      f->dump_unsigned("avail_raw", avail);
+    }
+  } else {
+    tbl << stringify(byte_u_t(stored_normalized));
+    if (verbose) {
+      tbl << stringify(byte_u_t(stored_data_normalized));
+      tbl << stringify(byte_u_t(stored_omap_normalized));
+    }
+    tbl << stringify(si_u_t(sum.num_objects));
+    tbl << stringify(byte_u_t(used_bytes));
+    if (verbose) {
+      tbl << stringify(byte_u_t(used_data_bytes));
+      tbl << stringify(byte_u_t(used_omap_bytes));
+    }
+    tbl << percentify(used*100);
+    tbl << stringify(byte_u_t(avail_res));
+    if (verbose) {
+      // a quota of 0 is shown as "N/A"
+      if (pool->quota_max_objects == 0)
+        tbl << "N/A";
+      else
+        tbl << stringify(si_u_t(pool->quota_max_objects));
+      if (pool->quota_max_bytes == 0)
+        tbl << "N/A";
+      else
+        tbl << stringify(byte_u_t(pool->quota_max_bytes));
+      if (pool->is_tier()) {
+        tbl << stringify(si_u_t(sum.num_objects_dirty));
+      } else {
+        tbl << "N/A";
+      }
+      tbl << stringify(byte_u_t(statfs.data_compressed_allocated));
+      tbl << stringify(byte_u_t(statfs.data_compressed_original));
+    }
+  }
+}
+
+// Usable free space of a pool: the raw space available under the pool's
+// CRUSH rule divided by the pool's raw-used rate.
+//
+// A negative result from get_rule_avail() is treated as 0.  A zero raw-used
+// rate yields 0 rather than dividing by zero (the float quotient would be
+// inf/NaN and its conversion to int64_t undefined); dump_object_stat_sum()
+// applies the same guard to the same quantity.
+int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
+                                         int64_t poolid) const
+{
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
+                                        pool->get_type(),
+                                        pool->get_size());
+  int64_t avail;
+  avail = get_rule_avail(ruleno);
+  if (avail < 0)
+    avail = 0;
+
+  const float raw_used_rate = osd_map.pool_raw_used_rate(poolid);
+  return raw_used_rate ? avail / raw_used_rate : 0;
+}
+
+// Project how many bytes can still be written through CRUSH rule ruleno.
+//
+// For every OSD with a non-zero weight under the rule, the space left below
+// the full ratio is divided by that weight; the smallest such projection is
+// the result.
+//
+// @return the error from get_rule_weight_osd_map() if it fails, 0 when the
+//         rule maps to no OSDs, -1 when no OSD contributed a projection,
+//         otherwise the minimum projection in bytes
+int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
+{
+  map<int,float> wm;
+  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
+  if (r < 0) {
+    return r;
+  }
+  if (wm.empty()) {
+    return 0;
+  }
+
+  float fratio = osdmap.get_full_ratio();
+
+  int64_t min = -1;
+  for (const auto& weighted : wm) {
+    const int osd = weighted.first;
+    const float weight = weighted.second;
+
+    const auto osd_info = osd_stat.find(osd);
+    if (osd_info == osd_stat.end()) {
+      if (osdmap.is_up(osd)) {
+        // This is a level 4 rather than an error, because we might have
+        // only just started, and not received the first stats message yet.
+        dout(4) << "OSD " << osd << " is up, but has no stats" << dendl;
+      }
+      continue;
+    }
+
+    const auto& fs = osd_info->second.statfs;
+    if (fs.total == 0 || weight == 0) {
+      // osd must be out, hence its stats have been zeroed
+      // (unless we somehow managed to have a disk with size 0...)
+      //
+      // (weight == 0), if osd weight is 0, no need to
+      // calculate proj below.
+      continue;
+    }
+
+    // space above the full ratio can never be used
+    double unusable = (double)fs.kb() *
+      (1.0 - fratio);
+    double avail = std::max(0.0, (double)fs.kb_avail() - unusable);
+    avail *= 1024.0;
+    int64_t proj = (int64_t)(avail / (double)weight);
+    if (min < 0 || proj < min) {
+      min = proj;
+    }
+  }
+  return min;
+}
+
+// Rebuild avail_map with the projected available space of every CRUSH rule
+// used by a pool that has an entry in pg_pool_sum.
+//
+// avail_map is cleared first; each rule is evaluated at most once, via
+// get_rule_avail(osdmap, ruleno).
+void PGMap::get_rules_avail(const OSDMap& osdmap,
+                            std::map<int,int64_t> *avail_map) const
+{
+  avail_map->clear();
+  // iterate by reference: only the pool id is needed here, so copying the
+  // whole (id, pg_pool_t) pair on every iteration is wasted work
+  for (const auto& p : osdmap.get_pools()) {
+    int64_t pool_id = p.first;
+    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
+      continue;
+    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+    int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
+                                         pool->get_type(),
+                                         pool->get_size());
+    if (avail_map->count(ruleno) == 0)
+      (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
+  }
+}
+
+// ---------------------
+// PGMap
+
+// Dump this incremental: header fields, then one array each for pg stat
+// updates, osd stat updates, pool statfs updates, osd stat removals and pg
+// removals.
+void PGMap::Incremental::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("version", version);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("osdmap_epoch", osdmap_epoch);
+  f->dump_unsigned("pg_scan_epoch", pg_scan);
+
+  f->open_array_section("pg_stat_updates");
+  for (const auto& [pgid, st] : pg_stat_updates) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    st.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("osd_stat_updates");
+  for (const auto& [osd, st] : osd_stat_updates) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", osd);
+    st.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pool_statfs_updates");
+  for (const auto& [pool_osd, st] : pool_statfs_updates) {
+    f->open_object_section("pool_statfs");
+    f->dump_stream("poolid/osd") << pool_osd;
+    st.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("osd_stat_removals");
+  for (const auto& osd : osd_stat_rm)
+    f->dump_int("osd", osd);
+  f->close_section();
+
+  f->open_array_section("pg_removals");
+  for (const auto& pgid : pg_remove)
+    f->dump_stream("pgid") << pgid;
+  f->close_section();
+}
+
+// Append four sample Incrementals to o, from empty to fully populated.
+// The objects are heap-allocated and stored as raw pointers; nothing in
+// this function releases them.
+void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
+{
+  // 1: default-constructed
+  o.push_back(new Incremental);
+
+  // 2: version and stamp only
+  Incremental *stamped = new Incremental;
+  stamped->version = 1;
+  stamped->stamp = utime_t(123,345);
+  o.push_back(stamped);
+
+  // 3: one pg stat update and one osd stat update
+  Incremental *updates = new Incremental;
+  updates->version = 2;
+  updates->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
+  updates->osd_stat_updates[5] = osd_stat_t();
+  o.push_back(updates);
+
+  // 4: epochs, updates, removals and a pool statfs update
+  Incremental *full = new Incremental;
+  full->version = 3;
+  full->osdmap_epoch = 1;
+  full->pg_scan = 2;
+  full->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
+  full->osd_stat_updates[6] = osd_stat_t();
+  full->pg_remove.insert(pg_t(1,2));
+  full->osd_stat_rm.insert(5);
+  full->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
+  o.push_back(full);
+}
+
+// --
+
+// Apply one Incremental to this map and refresh all derived summaries.
+//
+// inc.version must be exactly version+1 (asserted).  In order, this applies
+// pg stat updates, pool statfs updates, osd stat updates, pg removals and
+// osd removals, then updates the smoothed cluster-wide and per-pool deltas,
+// processes pools whose last pg was removed, and records the osdmap / pg
+// scan epochs carried by inc when they are non-zero.
+//
+// @param cct  may be null; when null no debug output is produced and a
+//             single smoothing interval is used
+// @param inc  the incremental to apply
+void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
+{
+  ceph_assert(inc.version == version+1);
+  version++;
+
+  // snapshots of the sums before this incremental, used for the deltas below
+  pool_stat_t pg_sum_old = pg_sum;
+  mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
+  pg_pool_sum_old = pg_pool_sum;
+
+  for (auto p = inc.pg_stat_updates.begin();
+       p != inc.pg_stat_updates.end();
+       ++p) {
+    const pg_t &update_pg(p->first);
+    auto update_pool = update_pg.pool();
+    const pg_stat_t &update_stat(p->second);
+
+    auto pg_stat_iter = pg_stat.find(update_pg);
+    pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
+    if (pg_stat_iter == pg_stat.end()) {
+      pg_stat.insert(make_pair(update_pg, update_stat));
+    } else {
+      // replace: take the old contribution out before adding the new one
+      stat_pg_sub(update_pg, pg_stat_iter->second);
+      pool_sum_ref.sub(pg_stat_iter->second);
+      pg_stat_iter->second = update_stat;
+    }
+    stat_pg_add(update_pg, update_stat);
+    pool_sum_ref.add(update_stat);
+  }
+
+  for (auto p = inc.pool_statfs_updates.begin();
+       p != inc.pool_statfs_updates.end();
+       ++p) {
+    auto update_pool = p->first.first;
+    auto update_osd =  p->first.second;
+    auto& statfs_inc = p->second;
+
+    auto pool_statfs_iter =
+      pool_statfs.find(std::make_pair(update_pool, update_osd));
+    // updates for pools without an entry in pg_pool_sum are ignored
+    if (pg_pool_sum.count(update_pool)) {
+      pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
+      if (pool_statfs_iter == pool_statfs.end()) {
+        pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
+      } else {
+        pool_sum_ref.sub(pool_statfs_iter->second);
+        pool_statfs_iter->second = statfs_inc;
+      }
+      pool_sum_ref.add(statfs_inc);
+    }
+  }
+
+  for (auto p = inc.get_osd_stat_updates().begin();
+       p != inc.get_osd_stat_updates().end();
+       ++p) {
+    int osd = p->first;
+    const osd_stat_t &new_stats(p->second);
+
+    auto t = osd_stat.find(osd);
+    if (t == osd_stat.end()) {
+      osd_stat.insert(make_pair(osd, new_stats));
+    } else {
+      stat_osd_sub(t->first, t->second);
+      t->second = new_stats;
+    }
+    stat_osd_add(osd, new_stats);
+  }
+  set<int64_t> deleted_pools;
+  for (auto p = inc.pg_remove.begin();
+       p != inc.pg_remove.end();
+       ++p) {
+    const pg_t &removed_pg(*p);
+    auto s = pg_stat.find(removed_pg);
+    bool pool_erased = false;
+    if (s != pg_stat.end()) {
+      pool_erased = stat_pg_sub(removed_pg, s->second);
+
+      // decrease pool stats if pg was removed
+      auto pool_stats_it = pg_pool_sum.find(removed_pg.pool());
+      if (pool_stats_it != pg_pool_sum.end()) {
+        pool_stats_it->second.sub(s->second);
+      }
+
+      pg_stat.erase(s);
+      if (pool_erased) {
+        deleted_pools.insert(removed_pg.pool());
+      }
+    }
+  }
+
+  for (auto p = inc.get_osd_stat_rm().begin();
+       p != inc.get_osd_stat_rm().end();
+       ++p) {
+    auto t = osd_stat.find(*p);
+    if (t != osd_stat.end()) {
+      stat_osd_sub(t->first, t->second);
+      osd_stat.erase(t);
+    }
+    // Drop every per-pool statfs entry reported by the removed osd.
+    // erase() invalidates the erased iterator, so continue from the
+    // iterator it returns instead of incrementing the stale one.
+    for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ) {
+      if (i->first.second == *p) {
+        pg_pool_sum[i->first.first].sub(i->second);
+        i = pool_statfs.erase(i);
+      } else {
+        ++i;
+      }
+    }
+  }
+
+  // skip calculating delta while sum was not synchronized
+  if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
+    utime_t delta_t;
+    delta_t = inc.stamp;
+    delta_t -= stamp;
+    // calculate a delta, and average over the last 2 deltas.
+    pool_stat_t d = pg_sum;
+    d.stats.sub(pg_sum_old.stats);
+    pg_sum_deltas.push_back(make_pair(d, delta_t));
+    stamp_delta += delta_t;
+    pg_sum_delta.stats.add(d.stats);
+    auto smooth_intervals =
+      cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
+    // keep only the most recent smooth_intervals deltas in the running sum
+    while (pg_sum_deltas.size() > smooth_intervals) {
+      pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
+      stamp_delta -= pg_sum_deltas.front().second;
+      pg_sum_deltas.pop_front();
+    }
+  }
+  stamp = inc.stamp;
+
+  update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
+
+  for (auto p : deleted_pools) {
+    if (cct)
+      dout(20) << " deleted pool " << p << dendl;
+    deleted_pool(p);
+  }
+
+  if (inc.osdmap_epoch)
+    last_osdmap_epoch = inc.osdmap_epoch;
+  if (inc.pg_scan)
+    last_pg_scan = inc.pg_scan;
+}
+
+// Recompute the derived summaries from pg_stat, pool_statfs and osd_stat.
+// The counters and maps listed below are reset first, then every stored
+// entry is added back.
+void PGMap::calc_stats()
+{
+  // scalar counters
+  num_pg = 0;
+  num_pg_active = 0;
+  num_pg_unknown = 0;
+  num_osd = 0;
+
+  // aggregated sums
+  pg_sum = pool_stat_t();
+  osd_sum = osd_stat_t();
+
+  // per-key breakdowns
+  pg_pool_sum.clear();
+  num_pg_by_pool.clear();
+  pg_by_osd.clear();
+  osd_sum_by_class.clear();
+  num_pg_by_state.clear();
+  num_pg_by_pool_state.clear();
+  num_pg_by_osd.clear();
+
+  for (const auto& [pgid, st] : pg_stat) {
+    stat_pg_add(pgid, st);
+    pg_pool_sum[pgid.pool()].add(st);
+  }
+  for (const auto& [pool_osd, fs] : pool_statfs) {
+    auto pool = pool_osd.first;
+    pg_pool_sum[pool].add(fs);
+  }
+  for (const auto& [osd, st] : osd_stat)
+    stat_osd_add(osd, st);
+}
+
+// Account for pg pgid with stats s in the running summaries.
+//
+// @param pgid      the placement group being added
+// @param s         its stats
+// @param sameosds  when true, the per-OSD bookkeeping (blocked_by_sum,
+//                  pg_by_osd, num_pg_by_osd) is left untouched
+void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
+                        bool sameosds)
+{
+  const auto pool = pgid.pool();
+  pg_sum.add(s);
+
+  ++num_pg;
+  ++num_pg_by_state[s.state];
+  ++num_pg_by_pool_state[pool][s.state];
+  ++num_pg_by_pool[pool];
+
+  // creating pgs are tracked unless parent_split_bits is set
+  const bool creating =
+    (s.state & PG_STATE_CREATING) && s.parent_split_bits == 0;
+  if (creating) {
+    creating_pgs.insert(pgid);
+    if (s.acting_primary >= 0)
+      creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
+  }
+
+  if (s.state & PG_STATE_ACTIVE)
+    ++num_pg_active;
+  if (s.state == 0)
+    ++num_pg_unknown;
+
+  if (sameosds)
+    return;
+
+  for (const auto& blocker : s.blocked_by)
+    ++blocked_by_sum[blocker];
+
+  for (const auto& osd : s.acting) {
+    pg_by_osd[osd].insert(pgid);
+    ++num_pg_by_osd[osd].acting;
+  }
+  // an osd that is up but was not already recorded for this pg
+  for (const auto& osd : s.up) {
+    auto& pgs = pg_by_osd[osd];
+    if (pgs.count(pgid) == 0) {
+      pgs.insert(pgid);
+      ++num_pg_by_osd[osd].up_not_acting;
+    }
+  }
+
+  if (s.up_primary >= 0)
+    ++num_pg_by_osd[s.up_primary].primary;
+}
+
+// Remove the contribution of pg pgid with stats s from the running
+// summaries.
+//
+// @param pgid      the placement group being subtracted
+// @param s         the stats that were previously added for it
+// @param sameosds  when true, the per-OSD bookkeeping (blocked_by_sum,
+//                  pg_by_osd, num_pg_by_osd) is left untouched
+// @return true when the pg count of the pg's pool dropped to zero
+bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
+                        bool sameosds)
+{
+  const auto pool = pgid.pool();
+  pg_sum.sub(s);
+
+  num_pg--;
+  int end = --num_pg_by_state[s.state];
+  ceph_assert(end >= 0);
+  if (end == 0)
+    num_pg_by_state.erase(s.state);
+
+  auto& pool_states = num_pg_by_pool_state[pool];
+  if (--pool_states[s.state] == 0)
+    pool_states.erase(s.state);
+
+  end = --num_pg_by_pool[pool];
+  const bool pool_erased = (end == 0);
+
+  if ((s.state & PG_STATE_CREATING) &&
+      s.parent_split_bits == 0) {
+    creating_pgs.erase(pgid);
+    if (s.acting_primary >= 0) {
+      // prune the epoch bucket, and then the osd entry, once they are empty
+      map<epoch_t,set<pg_t> >& by_epoch = creating_pgs_by_osd_epoch[s.acting_primary];
+      set<pg_t>& pgs = by_epoch[s.mapping_epoch];
+      pgs.erase(pgid);
+      if (pgs.empty())
+        by_epoch.erase(s.mapping_epoch);
+      if (by_epoch.empty())
+        creating_pgs_by_osd_epoch.erase(s.acting_primary);
+    }
+  }
+
+  if (s.state & PG_STATE_ACTIVE)
+    --num_pg_active;
+  if (s.state == 0)
+    --num_pg_unknown;
+
+  if (sameosds)
+    return pool_erased;
+
+  for (const auto& blocker : s.blocked_by) {
+    auto q = blocked_by_sum.find(blocker);
+    ceph_assert(q != blocked_by_sum.end());
+    if (--q->second == 0)
+      blocked_by_sum.erase(q);
+  }
+
+  // forget that osd holds pgid, dropping the osd's set once it is empty
+  auto forget_pg_on = [this, &pgid](int32_t osd) {
+    auto& oset = pg_by_osd[osd];
+    oset.erase(pgid);
+    if (oset.empty())
+      pg_by_osd.erase(osd);
+  };
+
+  set<int32_t> actingset;
+  for (const auto& osd : s.acting) {
+    actingset.insert(osd);
+    forget_pg_on(osd);
+    auto it = num_pg_by_osd.find(osd);
+    if (it != num_pg_by_osd.end() && it->second.acting > 0)
+      it->second.acting--;
+  }
+  for (const auto& osd : s.up) {
+    forget_pg_on(osd);
+    // osds that were also acting have already been counted above
+    if (actingset.count(osd))
+      continue;
+    auto it = num_pg_by_osd.find(osd);
+    if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
+      it->second.up_not_acting--;
+  }
+
+  if (s.up_primary >= 0) {
+    auto it = num_pg_by_osd.find(s.up_primary);
+    if (it != num_pg_by_osd.end() && it->second.primary > 0)
+      it->second.primary--;
+  }
+  return pool_erased;
+}
+
+// Rebuild purged_snaps: per pool, the intersection of the purged_snaps of
+// all of its pgs.  A pool is left out from the point where a pg with state 0
+// is encountered for it.
+void PGMap::calc_purged_snaps()
+{
+  purged_snaps.clear();
+  set<int64_t> unknown;
+  for (auto& [pgid, st] : pg_stat) {
+    const auto pool = pgid.pool();
+    if (st.state == 0) {
+      // pool has a pg in state 0: discard what was gathered and skip the rest
+      unknown.insert(pool);
+      purged_snaps.erase(pool);
+      continue;
+    }
+    if (unknown.count(pool))
+      continue;
+
+    auto j = purged_snaps.find(pool);
+    if (j != purged_snaps.end()) {
+      j->second.intersection_of(st.purged_snaps);
+    } else {
+      // base case
+      purged_snaps[pool] = st.purged_snaps;
+    }
+  }
+}
+
+// Rebuild osd_sum_by_class: the sum of osd stats per CRUSH item class.
+// OSDs for which get_item_class() returns null are not counted.
+void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
+{
+  osd_sum_by_class.clear();
+  for (auto& [osd, st] : osd_stat) {
+    if (const char *class_name = osdmap.crush->get_item_class(osd))
+      osd_sum_by_class[class_name].add(st);
+  }
+}
+
+// Account for osd with stats s: bump the osd count, add s to osd_sum and
+// record s.seq as the osd's last sequence number.
+void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
+{
+  ++num_osd;
+  osd_sum.add(s);
+
+  // grow the table so that slot `osd` exists
+  const int known = (int)osd_last_seq.size();
+  if (known <= osd)
+    osd_last_seq.resize(osd + 1);
+  osd_last_seq[osd] = s.seq;
+}
+
+// Remove the contribution of osd with stats s: drop the osd count, subtract
+// s from osd_sum and reset the osd's last sequence number to 0.  The osd is
+// required (asserted) to already have a slot in osd_last_seq.
+void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
+{
+  --num_osd;
+  osd_sum.sub(s);
+
+  ceph_assert(osd < (int)osd_last_seq.size());
+  osd_last_seq[osd] = 0;
+}
+
+// Refresh the derived digest fields from the current state and osdmap, then
+// encode the PGMapDigest part of this object into bl.
+void PGMap::encode_digest(const OSDMap& osdmap,
+			  bufferlist& bl, uint64_t features)
+{
+  get_rules_avail(osdmap, &avail_space_by_rule);  // fills avail_space_by_rule
+  calc_osd_sum_by_class(osdmap);                  // fills osd_sum_by_class
+  calc_purged_snaps();                            // fills purged_snaps
+  PGMapDigest::encode(bl, features);
+}
+
+// Serialize the primary state of the map into bl.  The field order here
+// must stay in step with PGMap::decode().
+// NOTE(review): ENCODE_START(8, 8, bl) presumably declares struct version 8
+// and compat version 8 - confirm against the macro definition.
+void PGMap::encode(bufferlist &bl, uint64_t features) const
+{
+  ENCODE_START(8, 8, bl);
+  encode(version, bl);
+  encode(pg_stat, bl);
+  encode(osd_stat, bl, features);  // feature-dependent encoding
+  encode(last_osdmap_epoch, bl);
+  encode(last_pg_scan, bl);
+  encode(stamp, bl);
+  encode(pool_statfs, bl, features);  // feature-dependent encoding
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize the primary state written by PGMap::encode() (same field
+// order), then rebuild the derived summaries with calc_stats().
+void PGMap::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(8, bl);
+  decode(version, bl);
+  decode(pg_stat, bl);
+  decode(osd_stat, bl);
+  decode(last_osdmap_epoch, bl);
+  decode(last_pg_scan, bl);
+  decode(stamp, bl);
+  decode(pool_statfs, bl);
+  DECODE_FINISH(bl);
+
+  // the sums and per-key counters are not part of the encoding
+  calc_stats();
+}
+
+// Full dump: basic fields, non-brief pg stats, pool stats and osd stats.
+// with_net is forwarded to dump_osd_stats().
+void PGMap::dump(ceph::Formatter *f, bool with_net) const
+{
+  constexpr bool brief_pg_stats = false;
+
+  dump_basic(f);
+  dump_pg_stats(f, brief_pg_stats);
+  dump_pool_stats(f);
+  dump_osd_stats(f, with_net);
+}
+
+// Dump the header fields, the pg and osd stat sums, and the current delta.
+void PGMap::dump_basic(ceph::Formatter *f) const
+{
+  f->dump_unsigned("version", version);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
+  f->dump_unsigned("last_pg_scan", last_pg_scan);
+
+  // wrap a summary's own dump() in a named object section
+  auto dump_section = [f](const char *name, const auto& summary) {
+    f->open_object_section(name);
+    summary.dump(f);
+    f->close_section();
+  };
+  dump_section("pg_stats_sum", pg_sum);
+  dump_section("osd_stats_sum", osd_sum);
+
+  dump_delta(f);
+}
+
+// Dump the smoothed pg stat delta together with the time span it covers.
+void PGMap::dump_delta(ceph::Formatter *f) const
+{
+  f->open_object_section("pg_stats_delta");
+  {
+    pg_sum_delta.dump(f);
+    f->dump_stream("stamp_delta") << stamp_delta;
+  }
+  f->close_section();
+}
+
+// Dump every pg's stats as a "pg_stats" array; brief selects dump_brief()
+// over the full dump() for each entry.
+void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
+{
+  f->open_array_section("pg_stats");
+  for (const auto& [pgid, st] : pg_stat) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    if (brief) {
+      st.dump_brief(f);
+    } else {
+      st.dump(f);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump, per pg, the fields used to follow recovery progress: recovered
+// bytes, total bytes, reported epoch and state.  Each pg becomes an object
+// named after its stringified pgid inside a "pgs" object.
+void PGMap::dump_pg_progress(ceph::Formatter *f) const
+{
+  f->open_object_section("pgs");
+  for (const auto& [pgid, st] : pg_stat) {
+    const std::string section_name = stringify(pgid);
+    const auto& sum = st.stats.sum;
+
+    f->open_object_section(section_name.c_str());
+    f->dump_int("num_bytes_recovered", sum.num_bytes_recovered);
+    f->dump_int("num_bytes", sum.num_bytes);
+    f->dump_unsigned("reported_epoch", st.reported_epoch);
+    f->dump_string("state", pg_state_string(st.state));
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump the per-pool stat sums as a "pool_stats" array.  "num_pg" is only
+// included for pools that have an entry in num_pg_by_pool.
+void PGMap::dump_pool_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("pool_stats");
+  for (const auto& [poolid, pool_sum] : pg_pool_sum) {
+    f->open_object_section("pool_stat");
+    f->dump_int("poolid", poolid);
+    const auto pg_count = num_pg_by_pool.find(poolid);
+    if (pg_count != num_pg_by_pool.end())
+      f->dump_unsigned("num_pg", pg_count->second);
+    pool_sum.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump the per-osd stats as an "osd_stats" array, followed by the per
+// (pool, osd) statfs entries as a "pool_statfs" array.  with_net is
+// forwarded to each osd stat's dump().
+void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const
+{
+  f->open_array_section("osd_stats");
+  for (const auto& [osd, st] : osd_stat) {
+    f->open_object_section("osd_stat");
+    f->dump_int("osd", osd);
+    st.dump(f, with_net);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("pool_statfs");
+  for (const auto& [pool_osd, fs] : pool_statfs) {
+    f->open_object_section("item");
+    f->dump_int("poolid", pool_osd.first);
+    f->dump_int("osd", pool_osd.second);
+    fs.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump each osd's ping times as an "osd_ping_times" array, one
+// "osd_ping_time" object per osd.
+void PGMap::dump_osd_ping_times(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_ping_times");
+  for (const auto& entry : osd_stat) {
+    f->open_object_section("osd_ping_time");
+    f->dump_int("osd", entry.first);
+    entry.second.dump_ping_time(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+/**
+ * Print a collection of PG stats as a plain-text table.
+ *
+ * @param ss        stream that receives the rendered table
+ * @param pg_stats  pgid -> stats entries; one row is emitted per entry
+ * @param brief     if true only PG_STAT, STATE and the up/acting columns
+ *                  are shown, otherwise the full column set
+ */
+void PGMap::dump_pg_stats_plain(
+  ostream& ss,
+  const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
+  bool brief) const
+{
+  // headings that follow the leading PG_STAT column in each mode
+  const char *const brief_headings[] = {
+    "STATE", "UP", "UP_PRIMARY", "ACTING", "ACTING_PRIMARY"
+  };
+  const char *const full_headings[] = {
+    "OBJECTS", "MISSING_ON_PRIMARY", "DEGRADED", "MISPLACED", "UNFOUND",
+    "BYTES", "OMAP_BYTES*", "OMAP_KEYS*", "LOG", "DISK_LOG",
+    "STATE", "STATE_STAMP", "VERSION", "REPORTED",
+    "UP", "UP_PRIMARY", "ACTING", "ACTING_PRIMARY",
+    "LAST_SCRUB", "SCRUB_STAMP", "LAST_DEEP_SCRUB", "DEEP_SCRUB_STAMP",
+    "SNAPTRIMQ_LEN"
+  };
+
+  TextTable tab;
+  tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
+  if (brief) {
+    for (const char *heading : brief_headings)
+      tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  } else {
+    for (const char *heading : full_headings)
+      tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [pgid, st] : pg_stats) {
+    if (brief) {
+      // note: the brief form streams the up/acting vectors directly
+      tab << pgid
+          << pg_state_string(st.state)
+          << st.up
+          << st.up_primary
+          << st.acting
+          << st.acting_primary
+          << TextTable::endrow;
+      continue;
+    }
+
+    // REPORTED column is "<epoch>:<seq>"
+    ostringstream reported;
+    reported << st.reported_epoch << ":" << st.reported_seq;
+
+    const auto& sum = st.stats.sum;
+    tab << pgid
+        << sum.num_objects
+        << sum.num_objects_missing_on_primary
+        << sum.num_objects_degraded
+        << sum.num_objects_misplaced
+        << sum.num_objects_unfound
+        << sum.num_bytes
+        << sum.num_omap_bytes
+        << sum.num_omap_keys;
+    tab << st.log_size
+        << st.ondisk_log_size
+        << pg_state_string(st.state)
+        << st.last_change
+        << st.version
+        << reported.str();
+    tab << pg_vector_string(st.up)
+        << st.up_primary
+        << pg_vector_string(st.acting)
+        << st.acting_primary;
+    tab << st.last_scrub
+        << st.last_scrub_stamp
+        << st.last_deep_scrub
+        << st.last_deep_scrub_stamp
+        << st.snaptrimq_len
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+void PGMap::dump(ostream& ss) const  // write the whole map to ss as plain text, section by section
+{
+  dump_basic(ss);                // "version", "stamp", "last_osdmap_epoch", "last_pg_scan" lines
+  dump_pg_stats(ss, false);      // per-PG table with the full (non-brief) column set
+  dump_pool_stats(ss, false);    // one row per pool, column headings left blank
+  dump_pg_sum_stats(ss, false);  // the overall "sum" row, column headings left blank
+  dump_osd_stats(ss);            // per-OSD usage table followed by its own "sum" row
+}
+
+/**
+ * Write the map's scalar header fields, one "<name> <value>" line each:
+ * version, stamp, last_osdmap_epoch and last_pg_scan.
+ *
+ * @param ss stream that receives the lines
+ */
+void PGMap::dump_basic(ostream& ss) const
+{
+  ss << "version " << version << std::endl
+     << "stamp " << stamp << std::endl
+     << "last_osdmap_epoch " << last_osdmap_epoch << std::endl
+     << "last_pg_scan " << last_pg_scan << std::endl;
+}
+
+void PGMap::dump_pg_stats(ostream& ss, bool brief) const  // plain-text table of every PG in pg_stat
+{
+  dump_pg_stats_plain(ss, pg_stat, brief);  // brief selects the reduced column set
+}
+
+/**
+ * Print one plain-text row of summed stats for every pool in pg_pool_sum.
+ *
+ * @param ss      stream that receives the rendered table
+ * @param header  if true the eleven columns carry their names, otherwise
+ *                the same columns are defined with empty headings
+ */
+void PGMap::dump_pool_stats(ostream& ss, bool header) const
+{
+  // headings of the columns that follow the leading POOLID column
+  const char *const stat_headings[] = {
+    "OBJECTS", "MISSING_ON_PRIMARY", "DEGRADED", "MISPLACED", "UNFOUND",
+    "BYTES", "OMAP_BYTES*", "OMAP_KEYS*", "LOG", "DISK_LOG"
+  };
+
+  TextTable tab;
+  tab.define_column(header ? "POOLID" : "", TextTable::LEFT, TextTable::LEFT);
+  for (const char *heading : stat_headings) {
+    tab.define_column(header ? heading : "", TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [poolid, pool_sum] : pg_pool_sum) {
+    const auto& sum = pool_sum.stats.sum;
+    tab << poolid
+        << sum.num_objects
+        << sum.num_objects_missing_on_primary
+        << sum.num_objects_degraded
+        << sum.num_objects_misplaced
+        << sum.num_objects_unfound
+        << sum.num_bytes
+        << sum.num_omap_bytes
+        << sum.num_omap_keys
+        << pool_sum.log_size
+        << pool_sum.ondisk_log_size
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+/**
+ * Print the single "sum" row holding the stats accumulated in pg_sum.
+ *
+ * @param ss      stream that receives the rendered table
+ * @param header  if true the eleven columns carry their names, otherwise
+ *                the same columns are defined with empty headings
+ */
+void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
+{
+  // headings of the columns that follow the leading PG_STAT column
+  const char *const stat_headings[] = {
+    "OBJECTS", "MISSING_ON_PRIMARY", "DEGRADED", "MISPLACED", "UNFOUND",
+    "BYTES", "OMAP_BYTES*", "OMAP_KEYS*", "LOG", "DISK_LOG"
+  };
+
+  TextTable tab;
+  tab.define_column(header ? "PG_STAT" : "", TextTable::LEFT, TextTable::LEFT);
+  for (const char *heading : stat_headings) {
+    tab.define_column(header ? heading : "", TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  const auto& sum = pg_sum.stats.sum;
+  tab << "sum"
+      << sum.num_objects
+      << sum.num_objects_missing_on_primary
+      << sum.num_objects_degraded
+      << sum.num_objects_misplaced
+      << sum.num_objects_unfound
+      << sum.num_bytes
+      << sum.num_omap_bytes
+      << sum.num_omap_keys
+      << pg_sum.log_size
+      << pg_sum.ondisk_log_size
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+/**
+ * Print a plain-text table with one usage row per OSD in osd_stat,
+ * followed by a "sum" row taken from osd_sum.
+ *
+ * The sum row only fills the first five columns (label plus the four
+ * byte counts); the heartbeat-peer and PG-count columns are not written
+ * for it.
+ *
+ * @param ss stream that receives the rendered table
+ */
+void PGMap::dump_osd_stats(ostream& ss) const
+{
+  // headings of the columns that follow the leading OSD_STAT column
+  const char *const stat_headings[] = {
+    "USED", "AVAIL", "USED_RAW", "TOTAL",
+    "HB_PEERS", "PG_SUM", "PRIMARY_PG_SUM"
+  };
+
+  TextTable tab;
+  tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
+  for (const char *heading : stat_headings) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [osd, stat] : osd_stat) {
+    const auto& fs = stat.statfs;
+    tab << osd
+        << byte_u_t(fs.get_used())
+        << byte_u_t(fs.available)
+        << byte_u_t(fs.get_used_raw())
+        << byte_u_t(fs.total)
+        << stat.hb_peers
+        << get_num_pg_by_osd(osd)
+        << get_num_primary_pg_by_osd(osd)
+        << TextTable::endrow;
+  }
+
+  const auto& total = osd_sum.statfs;
+  tab << "sum"
+      << byte_u_t(total.get_used())
+      << byte_u_t(total.available)
+      << byte_u_t(total.get_used_raw())
+      << byte_u_t(total.total)
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+/**
+ * Print a plain-text table holding only the "sum" row of OSD usage
+ * (used, available, raw used and total bytes from osd_sum).
+ *
+ * @param ss stream that receives the rendered table
+ */
+void PGMap::dump_osd_sum_stats(ostream& ss) const
+{
+  // headings of the columns that follow the leading OSD_STAT column
+  const char *const byte_headings[] = {"USED", "AVAIL", "USED_RAW", "TOTAL"};
+
+  TextTable tab;
+  tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
+  for (const char *heading : byte_headings) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  const auto& total = osd_sum.statfs;
+  tab << "sum"
+      << byte_u_t(total.get_used())
+      << byte_u_t(total.available)
+      << byte_u_t(total.get_used_raw())
+      << byte_u_t(total.total)
+      << TextTable::endrow;
+
+  ss << tab;
+}
+
+/**
+ * Collect the PGs that have been stuck in any of the requested states
+ * since before a cutoff time.
+ *
+ * @param types      bitmask of STUCK_* flags selecting the states to test;
+ *                   must not be 0 (asserted)
+ * @param cutoff     a PG counts as stuck when the relevant last_* stamp of
+ *                   a matching state is earlier than this
+ * @param stuck_pgs  output: pgid -> stats of every stuck PG (entries are
+ *                   added or overwritten, existing ones are not removed)
+ */
+void PGMap::get_stuck_stats(
+  int types, const utime_t cutoff,
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
+{
+  ceph_assert(types != 0);
+  for (const auto& [pgid, st] : pg_stat) {
+    // earliest stamp seen so far; starting at the cutoff means stamps at
+    // or after the cutoff never lower it
+    utime_t earliest = cutoff;
+    auto consider = [&earliest](const auto& since) {
+      if (since < earliest)
+        earliest = since;
+    };
+
+    if ((types & STUCK_INACTIVE) && !(st.state & PG_STATE_ACTIVE))
+      consider(st.last_active);
+
+    if ((types & STUCK_UNCLEAN) && !(st.state & PG_STATE_CLEAN))
+      consider(st.last_clean);
+
+    if ((types & STUCK_DEGRADED) && (st.state & PG_STATE_DEGRADED))
+      consider(st.last_undegraded);
+
+    if ((types & STUCK_UNDERSIZED) && (st.state & PG_STATE_UNDERSIZED))
+      consider(st.last_fullsized);
+
+    if ((types & STUCK_STALE) && (st.state & PG_STATE_STALE))
+      consider(st.last_unstale);
+
+    // anything that moved below the cutoff has been stuck long enough
+    if (earliest < cutoff)
+      stuck_pgs[pgid] = st;
+  }
+}
+
+/**
+ * Count, per stuck category, the PGs whose relevant last_* stamp is
+ * earlier than a cutoff.
+ *
+ * @param cutoff  stamps earlier than this count as stuck
+ * @param note    output: "stuck inactive" / "stuck unclean" /
+ *                "stuck undersized" / "stuck degraded" / "stuck stale"
+ *                keys are set for every category with a non-zero count
+ * @return true if at least one category has a non-zero count
+ */
+bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
+{
+  int inactive = 0;
+  int unclean = 0;
+  int degraded = 0;
+  int undersized = 0;
+  int stale = 0;
+
+  for (const auto& [pgid, st] : pg_stat) {
+    // "inactive" and "unclean" test for a missing state bit, the other
+    // three for a bit that is set
+    if (!(st.state & PG_STATE_ACTIVE) && st.last_active < cutoff)
+      ++inactive;
+    if (!(st.state & PG_STATE_CLEAN) && st.last_clean < cutoff)
+      ++unclean;
+    if ((st.state & PG_STATE_DEGRADED) && st.last_undegraded < cutoff)
+      ++degraded;
+    if ((st.state & PG_STATE_UNDERSIZED) && st.last_fullsized < cutoff)
+      ++undersized;
+    if ((st.state & PG_STATE_STALE) && st.last_unstale < cutoff)
+      ++stale;
+  }
+
+  bool any_stuck = false;
+  if (inactive) {
+    note["stuck inactive"] = inactive;
+    any_stuck = true;
+  }
+  if (unclean) {
+    note["stuck unclean"] = unclean;
+    any_stuck = true;
+  }
+  if (undersized) {
+    note["stuck undersized"] = undersized;
+    any_stuck = true;
+  }
+  if (degraded) {
+    note["stuck degraded"] = degraded;
+    any_stuck = true;
+  }
+  if (stale) {
+    note["stuck stale"] = stale;
+    any_stuck = true;
+  }
+
+  return any_stuck;
+}
+
+/**
+ * Dump the stats of all stuck PGs through a formatter.
+ *
+ * Emits a "stuck_pg_stats" array with one "pg_stat" object (pgid plus
+ * the PG's dumped stats) per PG selected by get_stuck_stats().
+ *
+ * @param f       formatter that receives the output
+ * @param types   bitmask of STUCK_* flags, passed to get_stuck_stats()
+ * @param cutoff  stuck cutoff time, passed to get_stuck_stats()
+ */
+void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
+{
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
+  get_stuck_stats(types, cutoff, stuck_pg_stats);
+
+  f->open_array_section("stuck_pg_stats");
+  for (const auto& [pgid, st] : stuck_pg_stats) {
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    st.dump(f);
+    f->close_section();  // pg_stat
+  }
+  f->close_section();  // stuck_pg_stats
+}
+
+/**
+ * Print the stuck PGs as a brief plain-text table; nothing at all is
+ * written when no PG is stuck.
+ *
+ * @param ss      stream that receives the table
+ * @param types   bitmask of STUCK_* flags, passed to get_stuck_stats()
+ * @param cutoff  stuck cutoff time, passed to get_stuck_stats()
+ */
+void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
+{
+  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
+  get_stuck_stats(types, cutoff, stuck_pg_stats);
+  if (stuck_pg_stats.empty()) {
+    return;
+  }
+  dump_pg_stats_plain(ss, stuck_pg_stats, true);
+}
+
+/**
+ * Dump the PGs that have been stuck in the requested states for longer
+ * than a threshold.
+ *
+ * @param ds         receives the output (the plain table, or the flushed
+ *                   formatter contents); on failure receives an error line
+ * @param f          formatter to use; when null a plain-text table is
+ *                   written instead
+ * @param threshold  subtracted (as the first utime_t component) from the
+ *                   current time to obtain the stuck cutoff
+ * @param args       stuck type names: "inactive", "unclean", "undersized",
+ *                   "degraded" or "stale"
+ * @return 0 on success, -EINVAL if @p args is empty or holds an unknown name
+ */
+int PGMap::dump_stuck_pg_stats(
+  stringstream &ds,
+  ceph::Formatter *f,
+  int threshold,
+  vector<string>& args) const
+{
+  int stuck_types = 0;
+
+  for (const auto& type : args) {
+    if (type == "inactive") {
+      stuck_types |= PGMap::STUCK_INACTIVE;
+    } else if (type == "unclean") {
+      stuck_types |= PGMap::STUCK_UNCLEAN;
+    } else if (type == "undersized") {
+      stuck_types |= PGMap::STUCK_UNDERSIZED;
+    } else if (type == "degraded") {
+      stuck_types |= PGMap::STUCK_DEGRADED;
+    } else if (type == "stale") {
+      stuck_types |= PGMap::STUCK_STALE;
+    } else {
+      ds << "Unknown type: " << type << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  // get_stuck_stats() asserts that the type mask is non-zero, so a request
+  // naming no type has to be rejected here instead of tripping that
+  // assertion further down.
+  if (stuck_types == 0) {
+    ds << "No stuck type specified" << std::endl;
+    return -EINVAL;
+  }
+
+  utime_t now(ceph_clock_now());
+  utime_t cutoff = now - utime_t(threshold, 0);
+
+  if (!f) {
+    dump_stuck_plain(ds, stuck_types, cutoff);
+  } else {
+    dump_stuck(f, stuck_types, cutoff);
+    f->flush(ds);
+  }
+
+  return 0;
+}
+
+/**
+ * Dump the object-store perf stats of every OSD in osd_stat.
+ *
+ * Emits an "osd_perf_infos" array of "osd" objects, each holding the
+ * OSD "id" and a nested "perf_stats" object filled by os_perf_stat.dump().
+ *
+ * @param f formatter that receives the output
+ */
+void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_perf_infos");
+  for (const auto& [osd, stat] : osd_stat) {
+    f->open_object_section("osd");
+    f->dump_int("id", osd);
+
+    f->open_object_section("perf_stats");
+    stat.os_perf_stat.dump(f);
+    f->close_section();  // perf_stats
+
+    f->close_section();  // osd
+  }
+  f->close_section();  // osd_perf_infos
+}
+/**
+ * Print commit/apply latency per OSD as a plain-text table.
+ *
+ * The *_latency_ns fields are divided by 1000000 (integer division)
+ * before being printed under the "(ms)" headings.
+ *
+ * @param ss stream that receives the table; must not be null
+ */
+void PGMap::print_osd_perf_stats(std::ostream *ss) const
+{
+  const auto ns_per_ms = 1000000ull;
+
+  TextTable tab;
+  for (const char *heading : {"osd", "commit_latency(ms)", "apply_latency(ms)"}) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& [osd, stat] : osd_stat) {
+    const auto& perf = stat.os_perf_stat;
+    tab << osd
+        << perf.os_commit_latency_ns / ns_per_ms
+        << perf.os_apply_latency_ns / ns_per_ms
+        << TextTable::endrow;
+  }
+  (*ss) << tab;
+}
+
+/**
+ * Dump the contents of blocked_by_sum.
+ *
+ * Emits an "osd_blocked_by_infos" array of "osd" objects, each with the
+ * entry's key as "id" and its value as "num_blocked".
+ *
+ * @param f formatter that receives the output
+ */
+void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
+{
+  f->open_array_section("osd_blocked_by_infos");
+  for (const auto& [osd, num_blocked] : blocked_by_sum) {
+    f->open_object_section("osd");
+    f->dump_int("id", osd);
+    f->dump_int("num_blocked", num_blocked);
+    f->close_section();  // osd
+  }
+  f->close_section();  // osd_blocked_by_infos
+}
+/**
+ * Print the contents of blocked_by_sum as a two-column plain-text
+ * table ("osd", "num_blocked").
+ *
+ * @param ss stream that receives the table; must not be null
+ */
+void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
+{
+  TextTable tab;
+  tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
+  tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
+
+  for (const auto& [osd, num_blocked] : blocked_by_sum) {
+    tab << osd << num_blocked << TextTable::endrow;
+  }
+  (*ss) << tab;
+}
+
+
+/**
+ * update aggregated delta
+ *
+ * Computes the stats difference between @p current_pool_sum and
+ * @p old_pool_sum, appends it (together with the elapsed time since
+ * @p *last_ts) to @p delta_avg_list, adds it to the running totals, and
+ * then drops the oldest list entries until the list is no longer than
+ * "mon_stat_smooth_intervals" (1 when @p cct is null).  Nothing is
+ * appended while the stats sum of @p old_pool_sum is zero.
+ *
+ * @param cct               ceph context; may be null, in which case
+ *                          built-in fallbacks replace the config values
+ * @param ts                Timestamp for the stats being delta'ed
+ * @param old_pool_sum      Previous stats sum
+ * @param last_ts           In/out: last timestamp seen; set to @p ts
+ * @param current_pool_sum  Current stats sum
+ * @param result_pool_delta In/out: running sum of the stats deltas kept
+ *                          in @p delta_avg_list
+ * @param result_ts_delta   In/out: running sum of the timestamp deltas
+ *                          kept in @p delta_avg_list
+ * @param delta_avg_list    List of last N computed deltas, used to average
+ */
+void PGMap::update_delta(
+  CephContext *cct,
+  const utime_t ts,
+  const pool_stat_t& old_pool_sum,
+  utime_t *last_ts,
+  const pool_stat_t& current_pool_sum,
+  pool_stat_t *result_pool_delta,
+  utime_t *result_ts_delta,
+  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
+{
+  /* @p ts is the timestamp of the stats we are processing now.  The time
+   * elapsed between @p *last_ts and @p ts is the timestamp delta for this
+   * update, stored in 'delta_t'.
+   */
+  utime_t delta_t;
+  delta_t = ts;         // start with the provided timestamp
+  delta_t -= *last_ts;  // subtract the last timestamp we saw
+  *last_ts = ts;        // @p ts becomes the last timestamp we saw
+
+  // cap delta_t at 2 * mon_delta_reset_interval (2 * 10 without a cct): quick start if there is no update in a long period
+  delta_t = std::min(delta_t,
+                    utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
+
+  // calculate a delta, and average over the last mon_stat_smooth_intervals deltas.
+  /* start by taking a copy of @p current_pool_sum, and by taking out the
+   * stats from @p old_pool_sum (only the 'stats' member is subtracted).
+   * This generates a stats delta.  Stash this stats delta in
+   * @p delta_avg_list, along with the timestamp delta for these results.
+   */
+  pool_stat_t d = current_pool_sum;
+  d.stats.sub(old_pool_sum.stats);
+
+  /* Aggregate current delta, and take out the oldest stored deltas (if
+   * the list has grown past its limit) to keep the totals a sliding sum.
+   * Skip calculating delta while sum was not synchronized, i.e. while
+   * the previous stats sum is still zero.
+   */
+  if(!old_pool_sum.stats.sum.is_zero()) {
+    delta_avg_list->push_back(make_pair(d,delta_t));
+    *result_ts_delta += delta_t;
+    result_pool_delta->stats.add(d.stats);
+  }
+  size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;  // maximum number of deltas kept
+  while (delta_avg_list->size() > s) {
+    result_pool_delta->stats.sub(delta_avg_list->front().first.stats);  // remove the oldest delta from both running totals
+    *result_ts_delta -= delta_avg_list->front().second;
+    delta_avg_list->pop_front();
+  }
+}
+
+/**
+ * Refresh the delta bookkeeping of a single pool.
+ *
+ * Looks up (creating on first use) the pool's entries in
+ * per_pool_sum_delta, per_pool_sum_deltas_stamps, per_pool_sum_deltas and
+ * pg_pool_sum, and hands them to update_delta().
+ *
+ * @param cct           Ceph Context
+ * @param ts            Timestamp for the stats being delta'ed
+ * @param pool          Pool's id
+ * @param old_pool_sum  Previous stats sum
+ */
+void PGMap::update_one_pool_delta(
+  CephContext *cct,
+  const utime_t ts,
+  const int64_t pool,
+  const pool_stat_t& old_pool_sum)
+{
+  // a pool without a delta list must not have the other two entries either
+  if (!per_pool_sum_deltas.count(pool)) {
+    ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
+    ceph_assert(per_pool_sum_delta.count(pool) == 0);
+  }
+
+  // pair of (accumulated stats delta, last timestamp seen for the pool)
+  auto& [accumulated_delta, last_seen_ts] = per_pool_sum_delta[pool];
+
+  const auto& current_sum = pg_pool_sum[pool];
+  auto& accumulated_ts_delta = per_pool_sum_deltas_stamps[pool];
+  auto& recent_deltas = per_pool_sum_deltas[pool];
+
+  update_delta(cct, ts, old_pool_sum,
+               &last_seen_ts,
+               current_sum,
+               &accumulated_delta,
+               &accumulated_ts_delta,
+               &recent_deltas);
+}
+
+/**
+ * Refresh the delta bookkeeping of every pool listed in a map of
+ * previous pool sums.
+ *
+ * @param cct               CephContext
+ * @param ts                Timestamp for the stats being delta'ed
+ * @param pg_pool_sum_old   Map of pool id -> previous stats sum; each
+ *                          entry is passed to update_one_pool_delta()
+ */
+void PGMap::update_pool_deltas(
+  CephContext *cct, const utime_t ts,
+  const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
+{
+  for (const auto& [pool, old_sum] : pg_pool_sum_old) {
+    update_one_pool_delta(cct, ts, pool, old_sum);
+  }
+}
+
+/**
+ * Reset the map-wide delta state: the list of stored deltas, the
+ * accumulated stats delta and the accumulated timestamp delta.
+ */
+void PGMap::clear_delta()
+{
+  pg_sum_deltas.clear();
+  stamp_delta = utime_t();
+  pg_sum_delta = pool_stat_t();
+}
+
+/**
+ * Build a list of PGMap test instances.
+ *
+ * The first instance is a default-constructed map.  Every Incremental
+ * produced by Incremental::generate_test_instances() except the first
+ * (which is discarded unapplied) then yields one further instance: a copy
+ * of the previous map with that incremental applied.
+ *
+ * @param o output list; the caller owns the appended heap-allocated maps
+ */
+void PGMap::generate_test_instances(list<PGMap*>& o)
+{
+  o.push_back(new PGMap);
+
+  list<Incremental*> incs;
+  Incremental::generate_test_instances(incs);
+
+  // drop the first incremental without applying it
+  delete incs.front();
+  incs.pop_front();
+
+  for (; !incs.empty(); incs.pop_front()) {
+    Incremental *inc = incs.front();
+
+    PGMap *next = new PGMap();
+    *next = *o.back();
+    o.push_back(next);
+    next->apply_incremental(nullptr, *inc);
+
+    delete inc;
+  }
+}
+
+/**
+ * Collect the ids of the PGs matching a state / pool / OSD filter.
+ *
+ * @param state    state filter: (uint64_t)-1 matches every PG, 0 matches
+ *                 only PGs whose state is 0, any other value matches PGs
+ *                 sharing at least one state bit with it
+ * @param poolid   restrict to this pool; negative disables the filter
+ * @param osdid    restrict to PGs for which is_acting_osd(osdid, primary)
+ *                 holds; negative disables the filter
+ * @param primary  forwarded to is_acting_osd()
+ * @param pgs      output: ids of matching PGs are inserted here
+ */
+void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
+                                  bool primary, set<pg_t>& pgs) const
+{
+  for (const auto& [pgid, st] : pg_stat) {
+    const bool wrong_pool = (poolid >= 0) && (poolid != pgid.pool());
+    if (wrong_pool)
+      continue;
+
+    if ((osdid >= 0) && !st.is_acting_osd(osdid, primary))
+      continue;
+
+    bool selected = false;
+    if (state == (uint64_t)-1) {
+      selected = true;                      // "all"
+    } else if (st.state & state) {
+      selected = true;                      // matches a state bit
+    } else if (state == 0 && st.state == 0) {
+      selected = true;                      // matches "unknown" (== 0)
+    }
+
+    if (selected)
+      pgs.insert(pgid);
+  }
+}
+
+/**
+ * Dump the stats of a chosen set of PGs through a formatter.
+ *
+ * Emits a "pg_stats" array with one "pg_stat" object (pgid plus the PG's
+ * dumped stats) per id.  Every id is looked up with pg_stat.at(), so it
+ * has to be present in pg_stat.
+ *
+ * @param f    formatter that receives the output
+ * @param pgs  ids of the PGs to dump
+ */
+void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
+{
+  f->open_array_section("pg_stats");
+  for (const auto& pgid : pgs) {
+    const pg_stat_t& st = pg_stat.at(pgid);
+    f->open_object_section("pg_stat");
+    f->dump_stream("pgid") << pgid;
+    st.dump(f);
+    f->close_section();  // pg_stat
+  }
+  f->close_section();  // pg_stats
+}
+
+/**
+ * Print the stats of a chosen set of PGs as a plain-text table.
+ *
+ * Every id is looked up with pg_stat.at(), so it has to be present in
+ * pg_stat.  The SINCE column is the span between the time this function
+ * started and the PG's last_change; UP and ACTING are printed as
+ * "<osd vector>p<primary>".
+ *
+ * @param ss   stream that receives the rendered table
+ * @param pgs  ids of the PGs to print
+ */
+void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
+{
+  // headings of the columns that follow the leading PG column
+  const char *const stat_headings[] = {
+    "OBJECTS", "DEGRADED", "MISPLACED", "UNFOUND", "BYTES",
+    "OMAP_BYTES*", "OMAP_KEYS*", "LOG", "STATE", "SINCE",
+    "VERSION", "REPORTED", "UP", "ACTING",
+    "SCRUB_STAMP", "DEEP_SCRUB_STAMP"
+  };
+
+  TextTable tab;
+  utime_t now = ceph_clock_now();
+
+  tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
+  for (const char *heading : stat_headings) {
+    tab.define_column(heading, TextTable::LEFT, TextTable::RIGHT);
+  }
+
+  for (const auto& pgid : pgs) {
+    const pg_stat_t& st = pg_stat.at(pgid);
+    const auto& sum = st.stats.sum;
+
+    ostringstream reported;
+    reported << st.reported_epoch << ":" << st.reported_seq;
+
+    ostringstream upstr;
+    upstr << pg_vector_string(st.up) << 'p' << st.up_primary;
+
+    ostringstream actingstr;
+    actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary;
+
+    tab << pgid
+        << sum.num_objects
+        << sum.num_objects_degraded
+        << sum.num_objects_misplaced
+        << sum.num_objects_unfound
+        << sum.num_bytes
+        << sum.num_omap_bytes
+        << sum.num_omap_keys;
+    tab << st.log_size
+        << pg_state_string(st.state)
+        << utimespan_str(now - st.last_change)
+        << st.version
+        << reported.str()
+        << upstr.str()
+        << actingstr.str();
+    tab << st.last_scrub_stamp
+        << st.last_deep_scrub_stamp
+        << TextTable::endrow;
+  }
+
+  ss << tab;
+}
+
+void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
+                                        ceph::Formatter *f,
+                                        stringstream *rs) const { /*
+   * Report one pool's recovery summary and its recovery, client and
+   * (for tier pools) cache io rates.
+   *
+   * With a formatter @p f everything is emitted as a "pool" object that
+   * holds "pool_name", "pool_id" and the sub-objects "recovery",
+   * "recovery_rate", "client_io_rate" and, when the pool is a tier,
+   * "cache_io_rate"; @p rs is not touched.  Without a formatter the text
+   * produced by the summary helpers is collected in 'tss' and written to
+   * @p rs below a "pool <name> id <id>" line, or "nothing is going on"
+   * when no helper produced any text.
+   */
+  string pool_name = osd_map.get_pool_name(poolid);
+  if (f) {
+    f->open_object_section("pool");
+    f->dump_string("pool_name", pool_name.c_str());
+    f->dump_int("pool_id", poolid);
+    f->open_object_section("recovery");
+  }
+  list<string> sl;   // recovery summary lines (plain-text mode)
+  stringstream tss;  // accumulates all plain-text detail lines
+  pool_recovery_summary(f, &sl, poolid);
+  if (!f && !sl.empty()) {
+    for (auto &p : sl)
+      tss << "  " << p << "\n";
+  }
+  if (f) {
+    f->close_section(); // object section recovery
+    f->open_object_section("recovery_rate");
+  }
+  ostringstream rss;  // scratch stream, reset and reused for each rate summary
+  pool_recovery_rate_summary(f, &rss, poolid);
+  if (!f && !rss.str().empty())
+    tss << "  recovery io " << rss.str() << "\n";
+  if (f) {
+    f->close_section(); // object section recovery_rate
+    f->open_object_section("client_io_rate");
+  }
+  rss.clear();   // reset stream state flags ...
+  rss.str("");   // ... and drop the previous contents
+  pool_client_io_rate_summary(f, &rss, poolid);
+  if (!f && !rss.str().empty())
+    tss << "  client io " << rss.str() << "\n";
+  // dump cache tier IO rate for cache pool
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);  // NOTE(review): dereferenced below without a null check; presumably poolid always names an existing pool - confirm
+  if (pool->is_tier()) {
+    if (f) {
+      f->close_section(); // object section client_io_rate
+      f->open_object_section("cache_io_rate");
+    }
+    rss.clear();
+    rss.str("");
+    pool_cache_io_rate_summary(f, &rss, poolid);
+    if (!f && !rss.str().empty())
+      tss << "  cache tier io " << rss.str() << "\n";
+  }
+  if (f) {
+    f->close_section(); // object section cache_io_rate, or client_io_rate when the pool is not a tier
+    f->close_section(); // object section pool
+  } else {
+    *rs << "pool " << pool_name << " id " << poolid << "\n";
+    if (!tss.str().empty())
+      *rs << tss.str() << "\n";
+    else
+      *rs << "  nothing is going on\n\n";
+  }
+}
+
+/**
+ * Get crush parentage for an osd.
+ *
+ * Walks the entries returned by the crush map's get_full_location() for
+ * the OSD and keeps the value of every entry whose bucket type is neither
+ * "root" nor the configured "mon_osd_reporter_subtree_level".
+ *
+ * @param osdmap  map whose crush hierarchy is consulted
+ * @param id      OSD id
+ * @return the retained location values
+ */
+set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
+{
+  const auto skipped_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
+  const auto location = osdmap.crush->get_full_location(id);
+
+  set<std::string> parents;
+  for (const auto& [bucket_type, bucket_name] : location) {
+    // Should we show the root?  Might not be too informative like "default"
+    if (bucket_type == "root" || bucket_type == skipped_level) {
+      continue;
+    }
+    parents.insert(bucket_name);
+  }
+  return parents;
+}
+
+void PGMap::get_health_checks(
+  CephContext *cct,
+  const OSDMap& osdmap,
+  health_check_map_t *checks) const
+{
+  utime_t now = ceph_clock_now();
+  const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
+  const auto& pools = osdmap.get_pools();
+
+  typedef enum pg_consequence_t {
+    UNAVAILABLE = 1,   // Client IO to the pool may block
+    DEGRADED = 2,      // Fewer than the requested number of replicas are present
+    BACKFILL_FULL = 3, // Backfill is blocked for space considerations
+                       // This may or may not be a deadlock condition.
+    DAMAGED = 4,        // The data may be missing or inconsistent on disk and
+                       //  requires repair
+    RECOVERY_FULL = 5  // Recovery is blocked because OSDs are full
+  } pg_consequence_t;
+
+  // For a given PG state, how should it be reported at the pool level?
+  class PgStateResponse {
+    public:
+    pg_consequence_t consequence;
+    typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
+    stuck_cb stuck_since;
+    bool invert;
+
+    PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
+      : consequence(c), stuck_since(std::move(s)), invert(false)
+    {
+    }
+
+    PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
+      : consequence(c), stuck_since(std::move(s)), invert(i)
+    {
+    }
+  };
+
+  // Record the PG state counts that contributed to a reported pool state
+  class PgCauses {
+    public:
+    // Map of PG_STATE_* to number of pgs in that state.
+    std::map<unsigned, unsigned> states;
+
+    // List of all PG IDs that had a state contributing
+    // to this health condition.
+    std::set<pg_t> pgs;
+
+    std::map<pg_t, std::string> pg_messages;
+  };
+
+  // Map of PG state to how to respond to it
+  std::map<unsigned, PgStateResponse> state_to_response = {
+    // Immediate reports
+    { PG_STATE_INCONSISTENT,     {DAMAGED,     {}} },
+    { PG_STATE_INCOMPLETE,       {UNAVAILABLE, {}} },
+    { PG_STATE_SNAPTRIM_ERROR,   {DAMAGED,     {}} },
+    { PG_STATE_RECOVERY_UNFOUND, {DAMAGED,     {}} },
+    { PG_STATE_BACKFILL_UNFOUND, {DAMAGED,     {}} },
+    { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
+    { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
+    { PG_STATE_DEGRADED,         {DEGRADED,    {}} },
+    { PG_STATE_DOWN,             {UNAVAILABLE, {}} },
+    // Delayed (wait until stuck) reports
+    { PG_STATE_PEERING,          {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;}    } },
+    { PG_STATE_UNDERSIZED,       {DEGRADED,    [](const pg_stat_t &p){return p.last_fullsized;} } },
+    { PG_STATE_STALE,            {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;}   } },
+    // Delayed and inverted reports
+    { PG_STATE_ACTIVE,           {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
+  };
+
+  // Specialized state printer that takes account of inversion of
+  // ACTIVE, CLEAN checks.
+  auto state_name = [](const uint64_t &state) {
+    // Special cases for the states that are inverted checks
+    if (state == PG_STATE_CLEAN) {
+      return std::string("unclean");
+    } else if (state == PG_STATE_ACTIVE) {
+      return std::string("inactive");
+    } else {
+      return pg_state_string(state);
+    }
+  };
+
+  // Map of what is wrong to information about why, implicitly also stores
+  // the list of what is wrong.
+  std::map<pg_consequence_t, PgCauses> detected;
+
+  // Optimisation: trim down the number of checks to apply based on
+  // the summary counters
+  std::map<unsigned, PgStateResponse> possible_responses;
+  for (const auto &i : num_pg_by_state) {
+    for (const auto &j : state_to_response) {
+      if (!j.second.invert) {
+        // Check for normal tests by seeing if any pgs have the flag
+        if (i.first & j.first) {
+          possible_responses.insert(j);
+        }
+      }
+    }
+  }
+
+  for (const auto &j : state_to_response) {
+    if (j.second.invert) {
+      // Check for inverted tests by seeing if not-all pgs have the flag
+      const auto &found = num_pg_by_state.find(j.first);
+      if (found == num_pg_by_state.end() || found->second != num_pg) {
+        possible_responses.insert(j);
+      }
+    }
+  }
+
+  utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
+  // Loop over all PGs, if there are any possibly-unhealthy states in there
+  if (!possible_responses.empty()) {
+    for (const auto& i : pg_stat) {
+      const auto &pg_id = i.first;
+      const auto &pg_info = i.second;
+
+      for (const auto &j : state_to_response) {
+        const auto &pg_response_state = j.first;
+        const auto &pg_response = j.second;
+
+        // Apply the state test
+        if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
+          continue;
+        }
+
+        // Apply stuckness test if needed
+        if (pg_response.stuck_since) {
+          // Delayed response, check for stuckness
+          utime_t last_whatever = pg_response.stuck_since(pg_info);
+          if (last_whatever.is_zero() &&
+            pg_info.last_change >= cutoff) {
+            // still moving, ignore
+            continue;
+          } else if (last_whatever >= cutoff) {
+            // Not stuck enough, ignore.
+            continue;
+          } else {
+
+          }
+        }
+
+        auto &causes = detected[pg_response.consequence];
+        causes.states[pg_response_state]++;
+        causes.pgs.insert(pg_id);
+
+        // Don't bother composing detail string if we have already recorded
+        // too many
+        if (causes.pg_messages.size() > max) {
+          continue;
+        }
+
+        std::ostringstream ss;
+        if (pg_response.stuck_since) {
+          utime_t since = pg_response.stuck_since(pg_info);
+          ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
+          if (since == utime_t()) {
+            ss << " since forever";
+          } else {
+            utime_t dur = now - since;
+            ss << " for " << utimespan_str(dur);
+          }
+          ss << ", current state " << pg_state_string(pg_info.state)
+             << ", last acting " << pg_info.acting;
+        } else {
+          ss << "pg " << pg_id << " is "
+             << pg_state_string(pg_info.state);
+          ss << ", acting " << pg_info.acting;
+          if (pg_info.stats.sum.num_objects_unfound) {
+            ss << ", " << pg_info.stats.sum.num_objects_unfound
+               << " unfound";
+          }
+        }
+
+        if (pg_info.state & PG_STATE_INCOMPLETE) {
+          const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
+          if (pi && pi->min_size > 1) {
+            ss << " (reducing pool "
+               << osdmap.get_pool_name(pg_id.pool())
+               << " min_size from " << (int)pi->min_size
+               << " may help; search ceph.com/docs for 'incomplete')";
+          }
+        }
+
+        causes.pg_messages[pg_id] = ss.str();
+      }
+    }
+  } else {
+    dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
+  }
+
+  for (const auto &i : detected) {
+    std::string health_code;
+    health_status_t sev;
+    std::string summary;
+    switch(i.first) {
+      case UNAVAILABLE:
+        health_code = "PG_AVAILABILITY";
+        sev = HEALTH_WARN;
+        summary = "Reduced data availability: ";
+        break;
+      case DEGRADED:
+        health_code = "PG_DEGRADED";
+        summary = "Degraded data redundancy: ";
+        sev = HEALTH_WARN;
+        break;
+      case BACKFILL_FULL:
+        health_code = "PG_BACKFILL_FULL";
+        summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
+        sev = HEALTH_WARN;
+        break;
+      case DAMAGED:
+        health_code = "PG_DAMAGED";
+        summary = "Possible data damage: ";
+        sev = HEALTH_ERR;
+        break;
+      case RECOVERY_FULL:
+        health_code = "PG_RECOVERY_FULL";
+        summary = "Full OSDs blocking recovery: ";
+        sev = HEALTH_ERR;
+        break;
+      default:
+        ceph_abort();
+    }
+
+    if (i.first == DEGRADED) {
+      if (pg_sum.stats.sum.num_objects_degraded &&
+          pg_sum.stats.sum.num_object_copies > 0) {
+        double pc = (double)pg_sum.stats.sum.num_objects_degraded /
+          (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+        char b[20];
+        snprintf(b, sizeof(b), "%.3lf", pc);
+        ostringstream ss;
+        ss << pg_sum.stats.sum.num_objects_degraded
+           << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
+           << b << "%)";
+
+        // Throw in a comma for the benefit of the following PG counts
+        summary += ss.str() + ", ";
+      }
+    }
+
+    // Compose summary message saying how many PGs in what states led
+    // to this health check failing
+    std::vector<std::string> pg_msgs;
+    int64_t count = 0;
+    for (const auto &j : i.second.states) {
+      std::ostringstream msg;
+      msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
+      pg_msgs.push_back(msg.str());
+      count += j.second;
+    }
+    summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
+
+    health_check_t *check = &checks->add(
+        health_code,
+        sev,
+        summary,
+	count);
+
+    // Compose list of PGs contributing to this health check failing
+    for (const auto &j : i.second.pg_messages) {
+      check->detail.push_back(j.second);
+    }
+  }
+
+  // OSD_SCRUB_ERRORS
+  if (pg_sum.stats.sum.num_scrub_errors) {
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
+    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
+		pg_sum.stats.sum.num_scrub_errors);
+  }
+
+  // LARGE_OMAP_OBJECTS
+  if (pg_sum.stats.sum.num_large_omap_objects) {
+    list<string> detail;
+    for (auto &pool : pools) {
+      const string& pool_name = osdmap.get_pool_name(pool.first);
+      auto it2 = pg_pool_sum.find(pool.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      if (pstat == nullptr) {
+        continue;
+      }
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      if (sum.num_large_omap_objects) {
+        stringstream ss;
+        ss << sum.num_large_omap_objects << " large objects found in pool "
+           << "'" << pool_name << "'";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
+      auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
+			    pg_sum.stats.sum.num_large_omap_objects);
+      stringstream tip;
+      tip << "Search the cluster log for 'Large omap object found' for more "
+          << "details.";
+      detail.push_back(tip.str());
+      d.detail.swap(detail);
+    }
+  }
+
+  // CACHE_POOL_NEAR_FULL
+  {
+    list<string> detail;
+    unsigned num_pools = 0;
+    for (auto& p : pools) {
+      if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
+	  !pg_pool_sum.count(p.first)) {
+	continue;
+      }
+      bool nearfull = false;
+      const string& name = osdmap.get_pool_name(p.first);
+      const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
+      uint64_t ratio = p.second.cache_target_full_ratio_micro +
+	((1000000 - p.second.cache_target_full_ratio_micro) *
+	 cct->_conf->mon_cache_target_full_warn_ratio);
+      if (p.second.target_max_objects &&
+	  (uint64_t)(st.stats.sum.num_objects -
+		     st.stats.sum.num_objects_hit_set_archive) >
+	  p.second.target_max_objects * (ratio / 1000000.0)) {
+	ostringstream ss;
+	ss << "cache pool '" << name << "' with "
+	   << si_u_t(st.stats.sum.num_objects)
+	   << " objects at/near target max "
+	   << si_u_t(p.second.target_max_objects) << " objects";
+	detail.push_back(ss.str());
+	nearfull = true;
+      }
+      if (p.second.target_max_bytes &&
+	  (uint64_t)(st.stats.sum.num_bytes -
+		     st.stats.sum.num_bytes_hit_set_archive) >
+	  p.second.target_max_bytes * (ratio / 1000000.0)) {
+	ostringstream ss;
+	ss << "cache pool '" << name
+	   << "' with " << byte_u_t(st.stats.sum.num_bytes)
+	   << " at/near target max "
+	   << byte_u_t(p.second.target_max_bytes);
+	detail.push_back(ss.str());
+	nearfull = true;
+      }
+      if (nearfull) {
+	++num_pools;
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << num_pools << " cache pools at or near target size";
+      auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
+			    num_pools);
+      d.detail.swap(detail);
+    }
+  }
+
+  // TOO_FEW_PGS
+  unsigned num_in = osdmap.get_num_in_osds();
+  auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
+  const auto min_pg_per_osd =
+    cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
+  if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
+    auto per = sum_pg_up / num_in;
+    if (per < min_pg_per_osd && per) {
+      ostringstream ss;
+      ss << "too few PGs per OSD (" << per
+	 << " < min " << min_pg_per_osd << ")";
+      checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
+		  min_pg_per_osd - per);
+    }
+  }
+
+  // TOO_MANY_PGS
+  auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
+  if (num_in && max_pg_per_osd > 0) {
+    auto per = sum_pg_up / num_in;
+    if (per > max_pg_per_osd) {
+      ostringstream ss;
+      ss << "too many PGs per OSD (" << per
+	 << " > max " << max_pg_per_osd << ")";
+      checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
+		  per - max_pg_per_osd);
+    }
+  }
+
+  // TOO_FEW_OSDS
+  auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
+  auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
+  if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
+    ostringstream ss;
+    ss << "OSD count " << osdmap.get_num_osds()
+	 << " < osd_pool_default_size " << osd_pool_default_size;
+    checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
+		osd_pool_default_size - osdmap.get_num_osds());
+  }
+
+  // SLOW_PING_TIME
+  // Convert milliseconds to microseconds
+  auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
+  auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+  if (warn_slow_ping_time == 0) {
+    double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
+    warn_slow_ping_time = grace;
+    warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+  }
+  if (warn_slow_ping_time > 0) {
+
+    struct mon_ping_item_t {  // one slow-heartbeat record for an OSD pair; kept in the set<> containers declared below
+      uint32_t pingtime;  // filled below with the max of the three reported ping times (presumably microseconds -- TODO confirm)
+      int from;  // filled below from the osd_stat key (reporting OSD id)
+      int to;  // filled below from the hb_pingtime key (peer OSD id)
+      bool improving;  // only assigned below when pingtime exceeds the warn threshold; not part of the ordering
+
+      bool operator<(const mon_ping_item_t& rhs) const {  // lexicographic order: pingtime, then from, then to
+        if (pingtime < rhs.pingtime)
+          return true;
+        if (pingtime > rhs.pingtime)
+          return false;
+        if (from < rhs.from)  // pingtime equal: break the tie on the source OSD
+          return true;
+        if (from > rhs.from)
+          return false;
+        return to < rhs.to;  // pingtime and from equal: break the tie on the target OSD
+      }
+    };
+
+    list<string> detail_back;
+    list<string> detail_front;
+    list<string> detail;
+    set<mon_ping_item_t> back_sorted, front_sorted;
+    for (auto i : osd_stat) {
+      for (auto j : i.second.hb_pingtime) {
+
+	// Maybe source info is old
+	if (now.sec() - j.second.last_update > grace * 60)
+	  continue;
+
+	mon_ping_item_t back;
+	back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+	back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+	back.from = i.first;
+	back.to = j.first;
+	if (back.pingtime > warn_slow_ping_time) {
+	  back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+			    && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+	  back_sorted.emplace(back);
+	}
+
+	mon_ping_item_t front;
+	front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+	front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+	front.from = i.first;
+	front.to = j.first;
+	if (front.pingtime > warn_slow_ping_time) {
+	  front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+			     && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
+	  front_sorted.emplace(front);
+	}
+      }
+      if (i.second.num_shards_repaired >
+		      cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
+        ostringstream ss;
+	ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "Too many repaired reads on " << detail.size() << " OSDs";
+      auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
+		      detail.size());
+      d.detail.swap(detail);
+    }
+    int max_detail = 10;
+    for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+	ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
+        detail_back.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow OSD heartbeats on back from osd." << sback.from
+	 << " [" << osd_parentage(osdmap, sback.from) << "]"
+         << (osdmap.is_down(sback.from) ? " (down)" : "")
+	 << " to osd." << sback.to
+	 << " [" << osd_parentage(osdmap, sback.to) << "]"
+         << (osdmap.is_down(sback.to) ? " (down)" : "")
+	 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
+	 << (sback.improving ? " possibly improving" : "");
+      detail_back.push_back(ss.str());
+    }
+    max_detail = 10;
+    for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+	ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
+        detail_front.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      // Get crush parentage for each osd
+      ss << "Slow OSD heartbeats on front from osd." << sfront.from
+	 << " [" << osd_parentage(osdmap, sfront.from) << "]"
+         << (osdmap.is_down(sfront.from) ? " (down)" : "")
+         << " to osd." << sfront.to
+	 << " [" << osd_parentage(osdmap, sfront.to) << "]"
+         << (osdmap.is_down(sfront.to) ? " (down)" : "")
+	 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
+	 << (sfront.improving ? " possibly improving" : "");
+      detail_front.push_back(ss.str());
+    }
+    if (detail_back.size() != 0) {
+      ostringstream ss;
+      ss << "Slow OSD heartbeats on back (longest "
+	 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
+		      back_sorted.size());
+      d.detail.swap(detail_back);
+    }
+    if (detail_front.size() != 0) {
+      ostringstream ss;
+      ss << "Slow OSD heartbeats on front (longest "
+	 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
+		      front_sorted.size());
+      d.detail.swap(detail_front);
+    }
+  }
+
+  // SMALLER_PGP_NUM
+  // MANY_OBJECTS_PER_PG
+  if (!pg_stat.empty()) {
+    list<string> pgp_detail, many_detail;
+    const auto mon_pg_warn_min_objects =
+      cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
+    const auto mon_pg_warn_min_pool_objects =
+      cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
+    const auto mon_pg_warn_max_object_skew =
+      cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
+    for (auto p = pg_pool_sum.begin();
+         p != pg_pool_sum.end();
+         ++p) {
+      const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
+      if (!pi)
+	continue;   // in case osdmap changes haven't propagated to PGMap yet
+      const string& name = osdmap.get_pool_name(p->first);
+      // NOTE: we use pg_num_target and pgp_num_target for the purposes of
+      // the warnings.  If the cluster is failing to converge on the target
+      // values that is a separate issue!
+      if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
+	  !(name.find(".DELETED") != string::npos &&
+	    cct->_conf->mon_fake_pool_delete)) {
+	ostringstream ss;
+	ss << "pool " << name << " pg_num "
+	   << pi->get_pg_num_target()
+	   << " > pgp_num " << pi->get_pgp_num_target();
+	pgp_detail.push_back(ss.str());
+      }
+      int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
+      if (average_objects_per_pg > 0 &&
+          pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
+          p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
+	int objects_per_pg = p->second.stats.sum.num_objects /
+	  pi->get_pg_num_target();
+	float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+	if (mon_pg_warn_max_object_skew > 0 &&
+	    ratio > mon_pg_warn_max_object_skew) {
+	  ostringstream ss;
+	  if (pi->pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::ON) {
+	      ss << "pool " << name << " objects per pg ("
+		 << objects_per_pg << ") is more than " << ratio
+		 << " times cluster average ("
+		 << average_objects_per_pg << ")";
+	      many_detail.push_back(ss.str());
+	  }
+	}
+      }
+    }
+    if (!pgp_detail.empty()) {
+      ostringstream ss;
+      ss << pgp_detail.size() << " pools have pg_num > pgp_num";
+      auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
+			    pgp_detail.size());
+      d.detail.swap(pgp_detail);
+    }
+    if (!many_detail.empty()) {
+      ostringstream ss;
+      ss << many_detail.size() << " pools have many more objects per pg than"
+	 << " average";
+      auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
+			    many_detail.size());
+      d.detail.swap(many_detail);
+    }
+  }
+
+  // POOL_FULL
+  // POOL_NEAR_FULL
+  {
+    float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
+    float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
+    list<string> full_detail, nearfull_detail;
+    unsigned full_pools = 0, nearfull_pools = 0;
+    for (auto it : pools) {
+      auto it2 = pg_pool_sum.find(it.first);
+      if (it2 == pg_pool_sum.end()) {
+	continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      const string& pool_name = osdmap.get_pool_name(it.first);
+      const pg_pool_t &pool = it.second;
+      bool full = false, nearfull = false;
+      if (pool.quota_max_objects > 0) {
+	stringstream ss;
+	if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
+	} else if (crit_threshold > 0 &&
+		   sum.num_objects >= pool.quota_max_objects*crit_threshold) {
+	  ss << "pool '" << pool_name
+	     << "' has " << sum.num_objects << " objects"
+	     << " (max " << pool.quota_max_objects << ")";
+	  full_detail.push_back(ss.str());
+	  full = true;
+	} else if (warn_threshold > 0 &&
+		   sum.num_objects >= pool.quota_max_objects*warn_threshold) {
+	  ss << "pool '" << pool_name
+	     << "' has " << sum.num_objects << " objects"
+	     << " (max " << pool.quota_max_objects << ")";
+	  nearfull_detail.push_back(ss.str());
+	  nearfull = true;
+	}
+      }
+      if (pool.quota_max_bytes > 0) {
+	stringstream ss;
+	if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
+	} else if (crit_threshold > 0 &&
+		   sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
+	  ss << "pool '" << pool_name
+	     << "' has " << byte_u_t(sum.num_bytes)
+	     << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
+	  full_detail.push_back(ss.str());
+	  full = true;
+	} else if (warn_threshold > 0 &&
+		   sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
+	  ss << "pool '" << pool_name
+	     << "' has " << byte_u_t(sum.num_bytes)
+	     << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
+	  nearfull_detail.push_back(ss.str());
+	  nearfull = true;
+	}
+      }
+      if (full) {
+	++full_pools;
+      }
+      if (nearfull) {
+	++nearfull_pools;
+      }
+    }
+    if (full_pools) {
+      ostringstream ss;
+      ss << full_pools << " pools full";
+      auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
+      d.detail.swap(full_detail);
+    }
+    if (nearfull_pools) {
+      ostringstream ss;
+      ss << nearfull_pools << " pools nearfull";
+      auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
+      d.detail.swap(nearfull_detail);
+    }
+  }
+
+  // OBJECT_MISPLACED
+  if (pg_sum.stats.sum.num_objects_misplaced &&
+      pg_sum.stats.sum.num_object_copies > 0 &&
+      cct->_conf->mon_warn_on_misplaced) {
+    double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
+      (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+    char b[20];
+    snprintf(b, sizeof(b), "%.3lf", pc);
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_objects_misplaced
+       << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
+       << b << "%)";
+    checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
+		pg_sum.stats.sum.num_objects_misplaced);
+  }
+
+  // OBJECT_UNFOUND
+  if (pg_sum.stats.sum.num_objects_unfound &&
+      pg_sum.stats.sum.num_objects) {
+    double pc = (double)pg_sum.stats.sum.num_objects_unfound /
+      (double)pg_sum.stats.sum.num_objects * (double)100.0;
+    char b[20];
+    snprintf(b, sizeof(b), "%.3lf", pc);
+    ostringstream ss;
+    ss << pg_sum.stats.sum.num_objects_unfound
+       << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
+    auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
+			  pg_sum.stats.sum.num_objects_unfound);
+
+    for (auto& p : pg_stat) {
+      if (p.second.stats.sum.num_objects_unfound) {
+	ostringstream ss;
+	ss << "pg " << p.first
+	   << " has " << p.second.stats.sum.num_objects_unfound
+	   << " unfound objects";
+	d.detail.push_back(ss.str());
+	if (d.detail.size() > max) {
+	  d.detail.push_back("(additional pgs left out for brevity)");
+	  break;
+	}
+      }
+    }
+  }
+
+  // REQUEST_SLOW
+  // REQUEST_STUCK
+  // SLOW_OPS unifies them in mimic.
+  if (osdmap.require_osd_release < ceph_release_t::mimic &&
+      cct->_conf->mon_osd_warn_op_age > 0 &&
+      !osd_sum.op_queue_age_hist.h.empty() &&
+      osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
+      cct->_conf->mon_osd_warn_op_age) {
+    list<string> warn_detail, error_detail;
+    unsigned warn = 0, error = 0;
+    float err_age =
+      cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
+    const pow2_hist_t& h = osd_sum.op_queue_age_hist;
+    for (unsigned i = h.h.size() - 1; i > 0; --i) {
+      float ub = (float)(1 << i) / 1000.0;
+      if (ub < cct->_conf->mon_osd_warn_op_age)
+	break;
+      if (h.h[i]) {
+	ostringstream ss;
+	ss << h.h[i] << " ops are blocked > " << ub << " sec";
+	if (ub > err_age) {
+	  error += h.h[i];
+	  error_detail.push_back(ss.str());
+	} else {
+	  warn += h.h[i];
+	  warn_detail.push_back(ss.str());
+	}
+      }
+    }
+
+    map<float,set<int>> warn_osd_by_max; // max -> osds
+    map<float,set<int>> error_osd_by_max; // max -> osds
+    if (!warn_detail.empty() || !error_detail.empty()) {
+      for (auto& p : osd_stat) {
+	const pow2_hist_t& h = p.second.op_queue_age_hist;
+	for (unsigned i = h.h.size() - 1; i > 0; --i) {
+	  float ub = (float)(1 << i) / 1000.0;
+	  if (ub < cct->_conf->mon_osd_warn_op_age)
+	    break;
+	  if (h.h[i]) {
+	    if (ub > err_age) {
+	      error_osd_by_max[ub].insert(p.first);
+	    } else {
+	      warn_osd_by_max[ub].insert(p.first);
+	    }
+	    break;
+	  }
+	}
+      }
+    }
+
+    if (!warn_detail.empty()) {
+      ostringstream ss;
+      ss << warn << " slow requests are blocked > "
+	 << cct->_conf->mon_osd_warn_op_age << " sec";
+      auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
+      d.detail.swap(warn_detail);
+      int left = max;
+      for (auto& p : warn_osd_by_max) {
+	ostringstream ss;
+	if (p.second.size() > 1) {
+	  ss << "osds " << p.second
+             << " have blocked requests > " << p.first << " sec";
+	} else {
+	  ss << "osd." << *p.second.begin()
+             << " has blocked requests > " << p.first << " sec";
+	}
+	d.detail.push_back(ss.str());
+	if (--left == 0) {
+	  break;
+	}
+      }
+    }
+    if (!error_detail.empty()) {
+      ostringstream ss;
+      ss << error << " stuck requests are blocked > "
+	 << err_age << " sec";
+      auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
+      d.detail.swap(error_detail);
+      int left = max;
+      for (auto& p : error_osd_by_max) {
+	ostringstream ss;
+	if (p.second.size() > 1) {
+	  ss << "osds " << p.second
+             << " have stuck requests > " << p.first << " sec";
+	} else {
+	  ss << "osd." << *p.second.begin()
+             << " has stuck requests > " << p.first << " sec";
+	}
+	d.detail.push_back(ss.str());
+	if (--left == 0) {
+	  break;
+	}
+      }
+    }
+  }
+
+  // OBJECT_STORE_WARN
+  if (osd_sum.os_alerts.size()) {
+    map<string, pair<size_t, list<string>>> os_alerts_sum;
+
+    for (auto& a : osd_sum.os_alerts) {
+      int left = max;
+      string s0 = " osd.";
+      s0 += stringify(a.first);
+      for (auto& aa : a.second) {
+        string s(s0);
+        s += " ";
+        s += aa.second;
+        auto it = os_alerts_sum.find(aa.first);
+        if (it == os_alerts_sum.end()) {
+          list<string> d;
+          d.emplace_back(s);
+          os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
+        } else {
+          auto& p = it->second;
+          ++p.first;
+          p.second.emplace_back(s);
+        }
+	if (--left == 0) {
+	  break;
+	}
+      }
+    }
+
+    for (auto& asum : os_alerts_sum) {
+      string summary = stringify(asum.second.first) + " OSD(s)";
+      if (asum.first == "BLUEFS_SPILLOVER") {
+	summary += " experiencing BlueFS spillover";
+      } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
+	summary += " have broken BlueStore compression";
+      } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
+	summary += " reporting legacy (not per-pool) BlueStore stats";
+      } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
+	summary += " have dangerous mismatch between BlueStore block device and free list sizes";
+      } else if (asum.first == "BLUESTORE_NO_PER_PG_OMAP") {
+	summary += " reporting legacy (not per-pg) BlueStore omap";
+      } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
+	summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
+      } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
+        summary += " have spurious read errors";
+      }
+
+      auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
+      for (auto& s : asum.second.second) {
+        d.detail.push_back(s);
+      }
+    }
+  }
+  // PG_NOT_SCRUBBED
+  // PG_NOT_DEEP_SCRUBBED
+  if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
+        cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+    list<string> detail, deep_detail;
+    int detail_max = max, deep_detail_max = max;
+    int detail_more = 0, deep_detail_more = 0;
+    int detail_total = 0, deep_detail_total = 0;
+    for (auto& p : pg_stat) {
+      int64_t pnum =  p.first.pool();
+      auto pool = osdmap.get_pg_pool(pnum);
+      if (!pool)
+        continue;
+      if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
+        double scrub_max_interval = 0;
+        pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
+        if (scrub_max_interval <= 0) {
+          scrub_max_interval = cct->_conf->osd_scrub_max_interval;
+        }
+        const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
+          scrub_max_interval;
+        utime_t cutoff = now;
+        cutoff -= age;
+        if (p.second.last_scrub_stamp < cutoff) {
+          if (detail_max > 0) {
+            ostringstream ss;
+            ss << "pg " << p.first << " not scrubbed since "
+               << p.second.last_scrub_stamp;
+            detail.push_back(ss.str());
+            --detail_max;
+          } else {
+            ++detail_more;
+          }
+          ++detail_total;
+        }
+      }
+      if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
+        double deep_scrub_interval = 0;
+        pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
+        if (deep_scrub_interval <= 0) {
+          deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+        }
+        double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
+          deep_scrub_interval;
+        utime_t deep_cutoff = now;
+        deep_cutoff -= deep_age;
+        if (p.second.last_deep_scrub_stamp < deep_cutoff) {
+          if (deep_detail_max > 0) {
+            ostringstream ss;
+            ss << "pg " << p.first << " not deep-scrubbed since "
+               << p.second.last_deep_scrub_stamp;
+            deep_detail.push_back(ss.str());
+            --deep_detail_max;
+          } else {
+            ++deep_detail_more;
+          }
+          ++deep_detail_total;
+        }
+      }
+    }
+    if (detail_total) {
+      ostringstream ss;
+      ss << detail_total << " pgs not scrubbed in time";
+      auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
+
+      if (!detail.empty()) {
+        d.detail.swap(detail);
+
+        if (detail_more) {
+          ostringstream ss;
+          ss << detail_more << " more pgs... ";
+          d.detail.push_back(ss.str());
+        }
+      }
+    }
+    if (deep_detail_total) {
+      ostringstream ss;
+      ss << deep_detail_total << " pgs not deep-scrubbed in time";
+      auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
+			    deep_detail_total);
+
+      if (!deep_detail.empty()) {
+        d.detail.swap(deep_detail);
+
+        if (deep_detail_more) {
+          ostringstream ss;
+          ss << deep_detail_more << " more pgs... ";
+          d.detail.push_back(ss.str());
+        }
+      }
+    }
+  }
+
+  // POOL_APP
+  if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
+    list<string> detail;
+    for (auto &it : pools) {
+      const pg_pool_t &pool = it.second;
+      const string& pool_name = osdmap.get_pool_name(it.first);
+      auto it2 = pg_pool_sum.find(it.first);
+      if (it2 == pg_pool_sum.end()) {
+        continue;
+      }
+      const pool_stat_t *pstat = &it2->second;
+      if (pstat == nullptr) {
+        continue;
+      }
+      const object_stat_sum_t& sum = pstat->stats.sum;
+      // application metadata is not encoded until luminous is minimum
+      // required release
+      if (sum.num_objects > 0 && pool.application_metadata.empty() &&
+          !pool.is_tier()) {
+        stringstream ss;
+        ss << "application not enabled on pool '" << pool_name << "'";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << detail.size() << " pool(s) do not have an application enabled";
+      auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
+			    detail.size());
+      stringstream tip;
+      tip << "use 'ceph osd pool application enable <pool-name> "
+          << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
+          << "or freeform for custom applications.";
+      detail.push_back(tip.str());
+      d.detail.swap(detail);
+    }
+  }
+
+  // PG_SLOW_SNAP_TRIMMING
+  if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
+    uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
+    uint64_t snaptrimq_exceeded = 0;
+    uint32_t longest_queue = 0;
+    const pg_t* longest_q_pg = nullptr;
+    list<string> detail;
+
+    for (auto& i: pg_stat) {
+      uint32_t current_len = i.second.snaptrimq_len;
+      if (current_len >= snapthreshold) {
+        snaptrimq_exceeded++;
+        if (longest_queue <= current_len) {
+          longest_q_pg = &i.first;
+          longest_queue = current_len;
+        }
+        if (detail.size() < max - 1) {
+          stringstream ss;
+          ss << "snap trim queue for pg " << i.first << " at " << current_len;
+          detail.push_back(ss.str());
+          continue;
+        }
+        if (detail.size() < max) {
+          detail.push_back("...more pgs affected");
+          continue;
+        }
+      }
+    }
+
+    if (snaptrimq_exceeded) {
+      {
+         ostringstream ss;
+         ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
+         detail.push_back(ss.str());
+      }
+
+      stringstream ss;
+      ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
+      auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
+			    snaptrimq_exceeded);
+      detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
+      d.detail.swap(detail);
+    }
+  }
+}
+
+void PGMap::print_summary(ceph::Formatter *f, ostream *out) const
+{ // Dump per-pool PG state counts to f (when non-null), then delegate to PGMapDigest::print_summary(f, out).
+  if (f) {  // the per-pool breakdown is only emitted through a formatter; this block never writes to out
+    f->open_array_section("pgs_by_pool_state");
+    for (auto& i: num_pg_by_pool_state) {  // i.first is dumped as the pool id; i.second is iterated as (state, count) pairs
+      f->open_object_section("per_pool_pgs_by_state");
+      f->dump_int("pool_id", i.first);
+      f->open_array_section("pg_state_counts");
+      for (auto& j : i.second) {  // one "pg_state_count" object per entry
+        f->open_object_section("pg_state_count");
+        f->dump_string("state_name", pg_state_string(j.first));  // state key rendered via pg_state_string()
+        f->dump_int("count", j.second);
+        f->close_section();  // closes pg_state_count
+      }
+      f->close_section();  // closes pg_state_counts
+      f->close_section();  // closes per_pool_pgs_by_state
+    }
+    f->close_section();  // closes pgs_by_pool_state
+  }
+  PGMapDigest::print_summary(f, out);  // always runs, whether or not a formatter was supplied
+}
+
+int process_pg_map_command(  // Answer a read-only "pg ...", "osd perf" or "osd blocked-by" query from pg_map.
+  const string& orig_prefix,  // command prefix as received; alias prefixes are rewritten below
+  const cmdmap_t& orig_cmdmap,  // parsed arguments; copied so that aliases can add values
+  const PGMap& pg_map,  // source of every statistic reported here
+  const OSDMap& osdmap,  // used to resolve and validate pool / osd arguments
+  ceph::Formatter *f,  // structured output when non-null, plain text otherwise
+  stringstream *ss,  // receives status and error messages
+  bufferlist *odata)  // receives the command output
+{  // Returns 0 when handled, -ENOENT / -EAGAIN / -EINVAL for bad arguments, -EOPNOTSUPP for any other prefix.
+  string prefix = orig_prefix;
+  auto cmdmap = orig_cmdmap;
+
+  string omap_stats_note =  // footnote appended to some of the plain-text outputs below
+      "\n* NOTE: Omap statistics are gathered during deep scrub and "
+      "may be inaccurate soon afterwards depending on utilization. See "
+      "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics "
+      "for further details.\n";
+  bool omap_stats_note_required = false;  // set by the "pg dump" text branches that want the footnote
+
+  // perhaps these would be better in the parsing, but it's weird
+  bool primary = false;  // set by "pg ls-by-primary"; passed to get_filtered_pg_stats()
+  if (prefix == "pg dump_json") {  // alias: "pg dump" with dumpcontents = ["all"]
+    vector<string> v;
+    v.push_back(string("all"));
+    cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
+    prefix = "pg dump";
+  } else if (prefix == "pg dump_pools_json") {  // alias: "pg dump" with dumpcontents = ["pools"]
+    vector<string> v;
+    v.push_back(string("pools"));
+    cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
+    prefix = "pg dump";
+  } else if (prefix == "pg ls-by-primary") {  // alias: "pg ls" with the primary flag set
+    primary = true;
+    prefix = "pg ls";
+  } else if (prefix == "pg ls-by-osd") {  // alias: plain "pg ls"
+    prefix = "pg ls";
+  } else if (prefix == "pg ls-by-pool") {  // alias: "pg ls" with the pool name resolved to an id
+    prefix = "pg ls";
+    string poolstr;
+    cmd_getval(cmdmap, "poolstr", poolstr);
+    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
+    if (pool < 0) {  // a negative result is reported as an unknown pool
+      *ss << "pool " << poolstr << " does not exist";
+      return -ENOENT;
+    }
+    cmd_putval(g_ceph_context, cmdmap, "pool", pool);  // picked up again by the "pg ls" handler
+  }
+
+  stringstream ds;  // scratch buffer that is appended to odata
+  if (prefix == "pg stat") {  // one-line summary
+    if (f) {
+      f->open_object_section("pg_summary");
+      pg_map.print_oneline_summary(f, NULL);
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << pg_map;
+    }
+    odata->append(ds);
+    return 0;
+  }
+
+  if (prefix == "pg getmap") {  // encoded PGMap goes to odata, its version to ss
+    pg_map.encode(*odata);
+    *ss << "got pgmap version " << pg_map.version;
+    return 0;
+  }
+
+  if (prefix == "pg dump") {  // dump the sections named in "dumpcontents"
+    string val;  // NOTE(review): not used anywhere in this branch
+    vector<string> dumpcontents;
+    set<string> what;  // de-duplicated selection
+    if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) {
+      copy(dumpcontents.begin(), dumpcontents.end(),
+           inserter(what, what.end()));
+    }
+    if (what.empty())  // no selection means everything
+      what.insert("all");
+    if (f) {  // structured output: "all" wins over "summary"/"sum", which win over single sections
+      if (what.count("all")) {
+	f->open_object_section("pg_map");
+	pg_map.dump(f);
+	f->close_section();
+      } else if (what.count("summary") || what.count("sum")) {
+	f->open_object_section("pg_map");
+	pg_map.dump_basic(f);
+	f->close_section();
+      } else {
+	if (what.count("pools")) {
+	  pg_map.dump_pool_stats(f);
+	}
+	if (what.count("osds")) {
+	  pg_map.dump_osd_stats(f);
+	}
+	if (what.count("pgs")) {
+	  pg_map.dump_pg_stats(f, false);
+	}
+	if (what.count("pgs_brief")) {
+	  pg_map.dump_pg_stats(f, true);
+	}
+	if (what.count("delta")) {
+	  f->open_object_section("delta");
+	  pg_map.dump_delta(f);
+	  f->close_section();
+	}
+      }
+      f->flush(*odata);
+    } else {  // plain text: same precedence; there is no "delta" section here
+      if (what.count("all")) {
+	pg_map.dump(ds);
+        omap_stats_note_required = true;
+      } else if (what.count("summary") || what.count("sum")) {
+	pg_map.dump_basic(ds);
+	pg_map.dump_pg_sum_stats(ds, true);
+	pg_map.dump_osd_sum_stats(ds);
+        omap_stats_note_required = true;
+      } else {
+	if (what.count("pgs_brief")) {
+	  pg_map.dump_pg_stats(ds, true);
+	}
+	bool header = true;  // passed to dump_pool_stats(); cleared once full pg stats were dumped
+	if (what.count("pgs")) {
+	  pg_map.dump_pg_stats(ds, false);
+	  header = false;
+          omap_stats_note_required = true;
+	}
+	if (what.count("pools")) {
+	  pg_map.dump_pool_stats(ds, header);
+          omap_stats_note_required = true;
+	}
+	if (what.count("osds")) {
+	  pg_map.dump_osd_stats(ds);
+	}
+      }
+      odata->append(ds);
+      if (omap_stats_note_required) {
+        odata->append(omap_stats_note);
+      }
+    }
+    *ss << "dumped " << what;
+    return 0;
+  }
+
+  if (prefix == "pg ls") {  // list pgs filtered by pool, osd and state
+    int64_t osd = -1;  // the validity checks below only apply to values >= 0
+    int64_t pool = -1;
+    vector<string>states;
+    set<pg_t> pgs;  // filled by get_filtered_pg_stats()
+    cmd_getval(cmdmap, "pool", pool);
+    cmd_getval(cmdmap, "osd", osd);
+    cmd_getval(cmdmap, "states", states);
+    if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
+      *ss << "pool " << pool << " does not exist";
+      return -ENOENT;
+    }
+    if (osd >= 0 && !osdmap.is_up(osd)) {
+      *ss << "osd " << osd << " is not up";
+      return -EAGAIN;
+    }
+    if (states.empty())  // no state filter means "all"
+      states.push_back("all");
+
+    uint64_t state = 0;  // OR of the requested state filters
+
+    while (!states.empty()) {  // consume the requested names from the back
+      string state_str = states.back();
+
+      if (state_str == "all") {
+        state = -1;  // all bits set; remaining names are ignored
+        break;
+      } else {
+        auto filter = pg_string_state(state_str);
+        if (!filter) {
+          *ss << "'" << state_str << "' is not a valid pg state,"
+              << " available choices: " << pg_state_string(0xFFFFFFFF);  // NOTE(review): only the low 32 bits are listed - confirm no state uses a higher bit
+          return -EINVAL;
+        }
+        state |= *filter;
+      }
+
+      states.pop_back();
+    }
+
+    pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
+
+    if (f && !pgs.empty()) {  // nothing at all is written when no pg matched
+      pg_map.dump_filtered_pg_stats(f, pgs);
+      f->flush(*odata);
+    } else if (!pgs.empty()) {
+      pg_map.dump_filtered_pg_stats(ds, pgs);
+      odata->append(ds);
+      odata->append(omap_stats_note);  // text output always carries the footnote
+    }
+    return 0;
+  }
+
+  if (prefix == "pg dump_stuck") {  // report pgs stuck in the requested conditions
+    vector<string> stuckop_vec;
+    cmd_getval(cmdmap, "stuckops", stuckop_vec);
+    if (stuckop_vec.empty())  // default condition
+      stuckop_vec.push_back("unclean");
+    int64_t threshold;  // assigned by the cmd_getval() call below, which is given a default
+    cmd_getval(cmdmap, "threshold", threshold,
+               g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
+
+    if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
+      *ss << "failed";  // NOTE(review): failure is only reported through ss; the return value stays 0
+    } else {
+      *ss << "ok";
+    }
+    odata->append(ds);
+    return 0;
+  }
+
+  if (prefix == "pg debug") {  // yes/no probes over all pg stats
+    string debugop;
+    cmd_getval(cmdmap, "debugop", debugop,
+	       string("unfound_objects_exist"));
+    if (debugop == "unfound_objects_exist") {  // "TRUE" if any pg reports unfound objects
+      bool unfound_objects_exist = false;
+      for (const auto& p : pg_map.pg_stat) {
+	if (p.second.stats.sum.num_objects_unfound > 0) {
+	  unfound_objects_exist = true;
+	  break;
+	}
+      }
+      if (unfound_objects_exist)
+	ds << "TRUE";
+      else
+	ds << "FALSE";
+      odata->append(ds);
+      return 0;
+    }
+    if (debugop == "degraded_pgs_exist") {  // "TRUE" if any pg reports degraded objects
+      bool degraded_pgs_exist = false;
+      for (const auto& p : pg_map.pg_stat) {
+	if (p.second.stats.sum.num_objects_degraded > 0) {
+	  degraded_pgs_exist = true;
+	  break;
+	}
+      }
+      if (degraded_pgs_exist)
+	ds << "TRUE";
+      else
+	ds << "FALSE";
+      odata->append(ds);
+      return 0;
+    }
+  }  // any other debugop matches nothing below and ends in -EOPNOTSUPP
+
+  if (prefix == "osd perf") {  // per-osd performance stats
+    if (f) {
+      f->open_object_section("osdstats");
+      pg_map.dump_osd_perf_stats(f);
+      f->close_section();
+      f->flush(ds);
+    } else {
+      pg_map.print_osd_perf_stats(&ds);
+    }
+    odata->append(ds);
+    return 0;
+  }
+
+  if (prefix == "osd blocked-by") {  // blocked-by stats
+    if (f) {
+      f->open_object_section("osd_blocked_by");
+      pg_map.dump_osd_blocked_by_stats(f);
+      f->close_section();
+      f->flush(ds);
+    } else {
+      pg_map.print_osd_blocked_by_stats(&ds);
+    }
+    odata->append(ds);
+    return 0;
+  }
+
+  return -EOPNOTSUPP;  // prefix not handled here
+}
+
+// Reconcile the pending PGMap incremental with the current OSDMap: fix up
+// stats of removed/out/down OSDs, drop pgs of deleted pools, and add or remove
+// pg stat entries so they follow each pool's pg_num.
+void PGMapUpdater::check_osd_map(
+  CephContext *cct,
+  const OSDMap& osdmap,
+  const PGMap& pgmap,
+  PGMap::Incremental *pending_inc)
+{
+  // per-OSD stats: removed, out and down OSDs each need a different fix-up
+  for (const auto& osd_entry : pgmap.osd_stat) {
+    const auto osd = osd_entry.first;
+    if (!osdmap.exists(osd)) {
+      pending_inc->rm_stat(osd);  // remove osd_stat
+    } else if (osdmap.is_out(osd)) {
+      if (osd_entry.second.statfs.total != 0)
+        pending_inc->stat_osd_out(osd);  // zero osd_stat
+    } else if (!osdmap.is_up(osd)) {
+      if (!osd_entry.second.op_queue_age_hist.empty())
+        pending_inc->stat_osd_down_up(osd, pgmap);  // zero the op_queue_age_hist
+    }
+  }
+
+  // deleted pools: queue their pgs for removal and forget pending updates
+  // that still refer to them
+  for (const auto& pool_sum : pgmap.pg_pool_sum) {
+    const auto gone_pool = pool_sum.first;
+    if (osdmap.have_pg_pool(gone_pool))
+      continue;
+    ldout(cct, 10) << __func__ << " pool " << gone_pool << " gone, removing pgs"
+                   << dendl;
+    for (const auto& pg_entry : pgmap.pg_stat) {
+      if (pg_entry.first.pool() == gone_pool)
+        pending_inc->pg_remove.insert(pg_entry.first);
+    }
+    for (auto it = pending_inc->pg_stat_updates.begin();
+         it != pending_inc->pg_stat_updates.end(); ) {
+      if (it->first.pool() == gone_pool)
+        it = pending_inc->pg_stat_updates.erase(it);
+      else
+        ++it;
+    }
+  }
+
+  // new (split or new pool) or merged pgs?  new pgs get stats stamped with
+  // the osdmap's modification time, merged-away pgs are queued for removal
+  map<int64_t,unsigned> new_pg_num;
+  const auto& modified = osdmap.get_modified();
+  for (const auto& pool_entry : osdmap.get_pools()) {
+    const int64_t poolid = pool_entry.first;
+    const unsigned pg_num = pool_entry.second.get_pg_num();
+    const auto known = pgmap.num_pg_by_pool.find(poolid);
+    const unsigned my_pg_num =
+      (known != pgmap.num_pg_by_pool.end()) ? known->second : 0;
+    new_pg_num[poolid] = pg_num;
+    if (my_pg_num < pg_num) {
+      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
+                    << " > my pg_num " << my_pg_num << dendl;
+      for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
+        const pg_t pgid(ps, poolid);
+        if (pending_inc->pg_stat_updates.count(pgid) != 0)
+          continue;  // an update for this pg is already pending
+        ldout(cct,20) << __func__ << " adding " << pgid << dendl;
+        pg_stat_t &fresh = pending_inc->pg_stat_updates[pgid];
+        fresh.last_fresh = modified;
+        fresh.last_active = modified;
+        fresh.last_change = modified;
+        fresh.last_peered = modified;
+        fresh.last_clean = modified;
+        fresh.last_unstale = modified;
+        fresh.last_undegraded = modified;
+        fresh.last_fullsized = modified;
+        fresh.last_scrub_stamp = modified;
+        fresh.last_deep_scrub_stamp = modified;
+        fresh.last_clean_scrub_stamp = modified;
+      }
+    } else if (my_pg_num > pg_num) {
+      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
+                    << " < my pg_num " << my_pg_num << dendl;
+      for (unsigned ps = pg_num; ps < my_pg_num; ++ps) {
+        const pg_t pgid(ps, poolid);
+        ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
+        if (pgmap.pg_stat.count(pgid) != 0)
+          pending_inc->pg_remove.insert(pgid);
+        pending_inc->pg_stat_updates.erase(pgid);
+      }
+    }
+  }
+  // finally discard pending updates for pgs outside every pool's current range
+  for (auto it = pending_inc->pg_stat_updates.begin();
+       it != pending_inc->pg_stat_updates.end(); ) {
+    const auto limit = new_pg_num.find(it->first.pool());
+    const bool keep = limit != new_pg_num.end() && it->first.ps() < limit->second;
+    if (keep) {
+      ++it;
+      continue;
+    }
+    ldout(cct,20) << __func__ << " removing pending update to old "
+                  << it->first << dendl;
+    it = pending_inc->pg_stat_updates.erase(it);
+  }
+}
+
+static void _try_mark_pg_stale(
+  const OSDMap& osdmap,
+  pg_t pgid,
+  const pg_stat_t& cur,
+  PGMap::Incremental *pending_inc)
+{
+  // Only pgs not yet stale whose acting primary is down are candidates.
+  if ((cur.state & PG_STATE_STALE) != 0 || cur.acting_primary == -1 ||
+      !osdmap.is_down(cur.acting_primary))
+    return;
+
+  pg_stat_t *target = nullptr;
+  const auto pending = pending_inc->pg_stat_updates.find(pgid);
+  if (pending == pending_inc->pg_stat_updates.end()) {
+    target = &pending_inc->pg_stat_updates[pgid];  // nothing queued: start from cur
+    *target = cur;
+  } else {
+    pg_stat_t& queued = pending->second;
+    if (queued.acting_primary != cur.acting_primary &&
+        ((queued.state & PG_STATE_STALE) != 0 ||
+         queued.acting_primary == -1 ||
+         !osdmap.is_down(queued.acting_primary)))
+      return;  // pending update is no longer down or already stale
+    target = &queued;
+  }
+
+  dout(10) << __func__ << " marking pg " << pgid
+           << " stale (acting_primary " << target->acting_primary
+           << ")" << dendl;
+  target->state |= PG_STATE_STALE;
+  target->last_unstale = ceph_clock_now();
+}
+
+void PGMapUpdater::check_down_pgs(
+    const OSDMap &osdmap,
+    const PGMap &pg_map,
+    bool check_all,
+    const set<int>& need_check_down_pg_osds,
+    PGMap::Incremental *pending_inc)
+{
+  // Once more OSDs changed state than this limit, scanning the whole pg map
+  // replaces the per-OSD lookups.
+  const double scan_all_limit = (unsigned)osdmap.get_num_osds() *
+    g_conf().get_val<double>("mon_pg_check_down_all_threshold");
+  if (need_check_down_pg_osds.size() > scan_all_limit)
+    check_all = true;
+
+  if (check_all) {
+    for (const auto& pg_entry : pg_map.pg_stat)
+      _try_mark_pg_stale(osdmap, pg_entry.first, pg_entry.second, pending_inc);
+    return;
+  }
+
+  for (const auto osd : need_check_down_pg_osds) {
+    if (!osdmap.is_down(osd))
+      continue;
+    const auto owned = pg_map.pg_by_osd.find(osd);
+    if (owned == pg_map.pg_by_osd.end())
+      continue;
+    // the assert below requires every pg listed for this osd to have it as acting primary
+    for (const auto pgid : owned->second) {
+      const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
+      ceph_assert(stat.acting_primary == osd);
+      _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
+    }
+  }
+}
+
+// Propose OSD reweights that pull utilization towards the average.  Utilization
+// is PG copies per unit of crush weight (by_pg, optionally limited to *pools) or
+// used/total raw bytes.  OSDs at or above average * oload / 100 are lowered and,
+// unless no_increasing, OSDs at or below the average are raised.  Proposals are
+// stored in *new_weights and reported through f if set, else in *out_str.
+// Returns the number of changed OSDs, -EINVAL if oload <= 100, or -EDOM when
+// the cluster holds too little data for a meaningful average.
+int reweight::by_utilization(
+    const OSDMap &osdmap,
+    const PGMap &pgm,
+    int oload,
+    double max_changef,
+    int max_osds,
+    bool by_pg, const set<int64_t> *pools,
+    bool no_increasing,
+    mempool::osdmap::map<int32_t, uint32_t>* new_weights,
+    std::stringstream *ss,
+    std::string *out_str,
+    ceph::Formatter *f)
+{
+  if (oload <= 100) {
+    *ss << "You must give a percentage higher than 100. "
+      "The reweighting threshold will be calculated as <average-utilization> "
+      "times <input-percentage>. For example, an argument of 200 would "
+      "reweight OSDs which are twice as utilized as the average OSD.\n";
+    return -EINVAL;
+  }
+
+  vector<int> pgs_by_osd(osdmap.get_max_osd());
+
+  // Avoid putting a small number (or 0) in the denominator when calculating
+  // average_util
+  double average_util;
+  if (by_pg) {
+    // by pg mapping
+    double weight_sum = 0.0;      // sum up the crush weights
+    unsigned num_pg_copies = 0;
+    int num_osds = 0;
+    for (const auto& pg : pgm.pg_stat) {
+      if (pools && pools->count(pg.first.pool()) == 0)
+        continue;
+      for (const auto acting : pg.second.acting) {
+        if (!osdmap.exists(acting))
+          continue;
+        // grow to acting + 1 entries so that pgs_by_osd[acting] is in range
+        if (acting >= (int)pgs_by_osd.size())
+          pgs_by_osd.resize(acting + 1);
+        if (pgs_by_osd[acting] == 0) {
+          if (osdmap.crush->get_item_weightf(acting) <= 0)
+            continue;  // skip if we currently can not identify item
+          weight_sum += osdmap.crush->get_item_weightf(acting);
+          ++num_osds;
+        }
+        ++pgs_by_osd[acting];
+        ++num_pg_copies;
+      }
+    }
+
+    if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
+      *ss << "Refusing to reweight: we only have " << num_pg_copies
+          << " PGs across " << num_osds << " osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)num_pg_copies / weight_sum;
+  } else {
+    // by osd utilization
+    int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
+    if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
+          << " kb across all osds!\n";
+      return -EDOM;
+    }
+    if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have "
+          << pgm.osd_sum.statfs.kb_used_raw()
+          << " kb used across all osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
+      (double)pgm.osd_sum.statfs.total;
+  }
+
+  // adjust down only if we are above the threshold
+  const double overload_util = average_util * (double)oload / 100.0;
+
+  // but aggressively adjust weights up whenever possible.
+  const double underload_util = average_util;
+
+  const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
+
+  ostringstream oss;
+  if (f) {
+    f->open_object_section("reweight_by_utilization");
+    f->dump_int("overload_min", oload);
+    f->dump_float("max_change", max_changef);
+    f->dump_int("max_change_osds", max_osds);
+    f->dump_float("average_utilization", average_util);
+    f->dump_float("overload_utilization", overload_util);
+  } else {
+    oss << "oload " << oload << "\n";
+    oss << "max_change " << max_changef << "\n";
+    oss << "max_change_osds " << max_osds << "\n";
+    oss.precision(4);
+    oss << "average_utilization " << std::fixed << average_util << "\n";
+    oss << "overload_utilization " << overload_util << "\n";
+  }
+  int num_changed = 0;
+
+  // precompute util for each OSD
+  std::vector<std::pair<int, float> > util_by_osd;
+  for (const auto& p : pgm.osd_stat) {
+    std::pair<int, float> osd_util;
+    osd_util.first = p.first;
+    if (by_pg) {
+      // skip OSDs holding no pg of the specified pool(s)
+      if (p.first >= (int)pgs_by_osd.size() || pgs_by_osd[p.first] == 0)
+        continue;
+      if (osdmap.crush->get_item_weightf(p.first) <= 0)
+        continue;  // skip if we are unable to locate item.
+      osd_util.second =
+        pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
+    } else {
+      osd_util.second =
+        (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
+    }
+    util_by_osd.push_back(osd_util);
+  }
+
+  // sort by absolute deviation from the mean utilization, in descending
+  // order; std::abs guarantees that the floating-point overload is used.
+  std::sort(util_by_osd.begin(), util_by_osd.end(),
+    [average_util](std::pair<int, float> l, std::pair<int, float> r) {
+      return std::abs(l.second - average_util) > std::abs(r.second - average_util);
+    }
+  );
+
+  if (f)
+    f->open_array_section("reweights");
+
+  // Record one proposed change and report it via the formatter or as text.
+  auto record_change = [&](int osd, unsigned weight, unsigned new_weight) {
+    new_weights->insert({osd, new_weight});
+    if (f) {
+      f->open_object_section("osd");
+      f->dump_int("osd", osd);
+      f->dump_float("weight", (float)weight / (float)0x10000);
+      f->dump_float("new_weight", (float)new_weight / (float)0x10000);
+      f->close_section();
+    } else {
+      oss << "osd." << osd << " weight "
+          << (float)weight / (float)0x10000 << " -> "
+          << (float)new_weight / (float)0x10000 << "\n";
+    }
+  };
+
+  for (const auto& p : util_by_osd) {
+    unsigned weight = osdmap.get_weight(p.first);
+    if (weight == 0)
+      continue;  // skip if OSD is currently out
+    float util = p.second;
+
+    if (util >= overload_util) {
+      // Assign a lower weight to overloaded OSDs. The current weight
+      // is a factor to take into account the original weights,
+      // to represent e.g. differing storage capacities
+      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
+      if (weight > max_change)
+        new_weight = std::max(new_weight, weight - max_change);
+      record_change(p.first, weight, new_weight);
+      if (++num_changed >= max_osds)
+        break;
+    }
+    if (!no_increasing && util <= underload_util) {
+      // assign a higher weight.. if we can.  Clamp before converting: a
+      // (nearly) idle OSD makes the ratio huge or infinite, and converting an
+      // out-of-range double to unsigned is undefined behaviour.
+      const double ceiling =
+        std::min((double)weight + (double)max_change, (double)0x10000);
+      const double scaled =
+        util > 0 ? (average_util / util) * (float)weight : ceiling;
+      const unsigned new_weight = (unsigned)std::min(scaled, ceiling);
+      if (new_weight > weight) {
+        record_change(p.first, weight, new_weight);
+        if (++num_changed >= max_osds)
+          break;
+      }
+    }
+  }
+  if (f)
+    f->close_section();
+
+  OSDMap newmap;
+  newmap.deepish_copy_from(osdmap);
+  OSDMap::Incremental newinc;
+  newinc.fsid = newmap.get_fsid();
+  newinc.epoch = newmap.get_epoch() + 1;
+  newinc.new_weight = *new_weights;
+  newmap.apply_incremental(newinc);
+
+  osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
+
+  if (f) {
+    f->close_section();
+  } else {
+    *out_str += "\n";
+    *out_str += oss.str();
+  }
+  return num_changed;
+}
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
new file mode 100644
index 000000000..9bdabb046
--- /dev/null
+++ b/src/mon/PGMap.h
@@ -0,0 +1,558 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+ 
+/*
+ * Placement Group Map. Placement Groups are logical sets of objects
+ * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
+ * where & is a bit-wise AND and m=2^k-1
+ */
+
+#ifndef CEPH_PGMAP_H
+#define CEPH_PGMAP_H
+
+#include "include/health.h"
+#include "common/debug.h"
+#include "common/TextTable.h"
+#include "osd/osd_types.h"
+#include "include/mempool.h"
+#include "mon/health_check.h"
+#include <sstream>
+
+namespace ceph { class Formatter; }
+
+class PGMapDigest {  // aggregated PG/OSD statistics; PGMap derives from it and fills in the aggregates
+public:
+  MEMPOOL_CLASS_HELPERS();
+  virtual ~PGMapDigest() {}
+
+  mempool::pgmap::vector<uint64_t> osd_last_seq;  // indexed by osd id, see get_last_osd_stat_seq()
+
+  mutable std::map<int, int64_t> avail_space_by_rule;  // keyed by rule number, see get_rule_avail()
+
+  // aggregate state, populated by PGMap child
+  int64_t num_pg = 0, num_osd = 0;
+  int64_t num_pg_active = 0;
+  int64_t num_pg_unknown = 0;
+  mempool::pgmap::unordered_map<int32_t,pool_stat_t> pg_pool_sum;
+  mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool;
+  pool_stat_t pg_sum;
+  osd_stat_t osd_sum;
+  mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class;
+  mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state;
+  struct pg_count {  // per-osd pg counters; encoded field by field in declaration order
+    int32_t acting = 0;
+    int32_t up_not_acting = 0;
+    int32_t primary = 0;
+    void encode(ceph::buffer::list& bl) const {
+      using ceph::encode;
+      encode(acting, bl);
+      encode(up_not_acting, bl);
+      encode(primary, bl);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(acting, p);
+      decode(up_not_acting, p);
+      decode(primary, p);
+    }
+  };
+  mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd;
+
+  mempool::pgmap::map<int64_t,interval_set<snapid_t>> purged_snaps;
+
+  bool use_per_pool_stats() const {  // true when every counted osd is also counted as per-pool
+    return osd_sum.num_osds == osd_sum.num_per_pool_osds;
+  }
+  bool use_per_pool_omap_stats() const {  // same test for the per-pool omap counter
+    return osd_sum.num_osds == osd_sum.num_per_pool_omap_osds;
+  }
+
+  // recent deltas, and summation
+  /**
+   * keep track of last deltas for each pool, calculated using
+   * @p pg_pool_sum as baseline.
+   */
+  mempool::pgmap::unordered_map<int64_t, mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
+  /**
+   * keep track of per-pool timestamp deltas, according to last update on
+   * each pool.
+   */
+  mempool::pgmap::unordered_map<int64_t, utime_t> per_pool_sum_deltas_stamps;
+  /**
+   * keep track of sum deltas, per-pool, taking into account any previous
+   * deltas existing in @p per_pool_sum_deltas.  The utime_t as second member
+   * of the pair is the timestamp referring to the last update (i.e., the first
+   * member of the pair) for a given pool.
+   */
+  mempool::pgmap::unordered_map<int64_t, std::pair<pool_stat_t,utime_t> > per_pool_sum_delta;
+
+  pool_stat_t pg_sum_delta;
+  utime_t stamp_delta;
+
+  void get_recovery_stats(
+    double *misplaced_ratio,
+    double *degraded_ratio,
+    double *inactive_ratio,
+    double *unknown_pgs_ratio) const;
+
+  void print_summary(ceph::Formatter *f, std::ostream *out) const;
+  void print_oneline_summary(ceph::Formatter *f, std::ostream *out) const;
+
+  void recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
+                        const pool_stat_t& pool_sum) const;
+  void overall_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl) const;
+  void pool_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl,
+                             uint64_t poolid) const;
+  void recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
+                             const pool_stat_t& delta_sum,
+                             utime_t delta_stamp) const;
+  void overall_recovery_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+  void pool_recovery_rate_summary(ceph::Formatter *f, std::ostream *out,
+                                  uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for client I/O, source from stats for a
+   * given @p delta_sum pool over a given @p delta_stamp period of time.
+   */
+  void client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+                              const pool_stat_t& delta_sum,
+                              utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall client I/O, which is
+   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_client_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for client I/O over a given pool
+   * with id @p poolid.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_client_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+                                   uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO, source from stats for a
+   * given @p delta_sum pool over a given @p delta_stamp period of time.
+   */
+  void cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+                             const pool_stat_t& delta_sum,
+                             utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall cache tier IO, which is
+   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO over a given pool
+   * with id @p poolid.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out,
+                                  uint64_t poolid) const;
+
+  /**
+   * Return the number of additional bytes that can be stored in this
+   * pool before the first OSD fills up, accounting for PG overhead.
+   */
+  int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const;
+
+
+  /**
+   * Dump pool usage and io ops/bytes, used by "ceph df" command
+   */
+  virtual void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss,
+                                    ceph::Formatter *f, bool verbose) const;
+  void dump_cluster_stats(std::stringstream *ss, ceph::Formatter *f, bool verbose) const;
+  static void dump_object_stat_sum(TextTable &tbl, ceph::Formatter *f,
+                                   const pool_stat_t &pool_stat,
+                                   uint64_t avail,
+                                   float raw_used_rate,
+                                   bool verbose,
+                                   bool per_pool,
+                                   bool per_pool_omap,
+                                   const pg_pool_t *pool);
+
+  size_t get_num_pg_by_osd(int osd) const {  // acting pg count for osd, 0 when the osd is unknown
+    auto p = num_pg_by_osd.find(osd);
+    if (p == num_pg_by_osd.end())
+      return 0;
+    else
+      return p->second.acting;
+  }
+  int get_num_primary_pg_by_osd(int osd) const {  // primary pg count for osd, 0 when the osd is unknown
+    auto p = num_pg_by_osd.find(osd);
+    if (p == num_pg_by_osd.end())
+      return 0;
+    else
+      return p->second.primary;
+  }
+
+  ceph_statfs get_statfs(OSDMap &osdmap,
+                         boost::optional<int64_t> data_pool) const;
+
+  int64_t get_rule_avail(int ruleno) const {  // cached value for ruleno, 0 when nothing is cached
+    auto i = avail_space_by_rule.find(ruleno);
+    if (i != avail_space_by_rule.end())
+      return i->second;  // reuse the iterator instead of a second lookup
+    else
+      return 0;
+  }
+
+  // kill me post-mimic or -nautilus
+  bool definitely_converted_snapsets() const {
+    // false negative is okay; false positive is not!
+    return
+      num_pg &&
+      num_pg_unknown == 0 &&
+      pg_sum.stats.sum.num_legacy_snapsets == 0;
+  }
+
+  uint64_t get_last_osd_stat_seq(int osd) const {  // 0 for ids outside osd_last_seq
+    if (osd >= 0 && osd < (int)osd_last_seq.size())  // a negative id would index out of bounds
+      return osd_last_seq[osd];
+    return 0;
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<PGMapDigest*>& ls);
+};
+WRITE_CLASS_ENCODER(PGMapDigest::pg_count);
+WRITE_CLASS_ENCODER_FEATURES(PGMapDigest);
+
+class PGMap : public PGMapDigest {
+public:
+  MEMPOOL_CLASS_HELPERS();
+
+  // the map
+  version_t version;
+  epoch_t last_osdmap_epoch;   // last osdmap epoch i applied to the pgmap
+  epoch_t last_pg_scan;  // osdmap epoch
+  mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat;
+  mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat;
+
+  typedef mempool::pgmap::map<
+    std::pair<int64_t, int>,  // <pool, osd>
+    store_statfs_t>
+      per_osd_pool_statfs_t;
+
+  per_osd_pool_statfs_t pool_statfs;
+
+  class Incremental {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+    version_t version;
+    mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates;
+    epoch_t osdmap_epoch;
+    epoch_t pg_scan;  // osdmap epoch
+    mempool::pgmap::set<pg_t> pg_remove;
+    utime_t stamp;
+    per_osd_pool_statfs_t pool_statfs_updates;
+
+  private:
+    mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates;
+    mempool::pgmap::set<int32_t> osd_stat_rm;
+  public:
+
+    const mempool::pgmap::map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+      return osd_stat_updates;
+    }
+    const mempool::pgmap::set<int32_t> &get_osd_stat_rm() const {
+      return osd_stat_rm;
+    }
+    template<typename OsdStat>
+    void update_stat(int32_t osd, OsdStat&& stat) {
+      osd_stat_updates[osd] = std::forward<OsdStat>(stat);
+    }
+    void stat_osd_out(int32_t osd) {
+      osd_stat_updates[osd] = osd_stat_t();
+    }
+    void stat_osd_down_up(int32_t osd, const PGMap& pg_map) {
+      // 0 the op_queue_age_hist for this osd
+      auto p = osd_stat_updates.find(osd);
+      if (p != osd_stat_updates.end()) {
+	p->second.op_queue_age_hist.clear();
+	return;
+      }
+      auto q = pg_map.osd_stat.find(osd);
+      if (q != pg_map.osd_stat.end()) {
+	osd_stat_t& t = osd_stat_updates[osd] = q->second;
+	t.op_queue_age_hist.clear();
+      }
+    }
+    void rm_stat(int32_t osd) {
+      osd_stat_rm.insert(osd);
+      osd_stat_updates.erase(osd);
+    }
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<Incremental*>& o);
+
+    Incremental() : version(0), osdmap_epoch(0), pg_scan(0) {}
+  };
+
+
+  // aggregate stats (soft state), generated by calc_stats()
+  mempool::pgmap::unordered_map<int,std::set<pg_t> > pg_by_osd;
+  mempool::pgmap::unordered_map<int,int> blocked_by_sum;
+  mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > pg_sum_deltas;
+  mempool::pgmap::unordered_map<int64_t,mempool::pgmap::unordered_map<uint64_t,int32_t>> num_pg_by_pool_state;
+
+  utime_t stamp;
+
+  void update_pool_deltas(
+    CephContext *cct,
+    const utime_t ts,
+    const mempool::pgmap::unordered_map<int32_t, pool_stat_t>& pg_pool_sum_old);
+  void clear_delta();
+
+  void deleted_pool(int64_t pool) {
+    for (auto i = pool_statfs.begin();  i != pool_statfs.end();) {
+      if (i->first.first == pool) {
+	i = pool_statfs.erase(i);
+      } else {
+        ++i;
+      }
+    }
+
+    pg_pool_sum.erase(pool);
+    num_pg_by_pool_state.erase(pool);
+    num_pg_by_pool.erase(pool);
+    per_pool_sum_deltas.erase(pool);
+    per_pool_sum_deltas_stamps.erase(pool);
+    per_pool_sum_delta.erase(pool);
+  }
+
+ private:
+  void update_delta(
+    CephContext *cct,
+    const utime_t ts,
+    const pool_stat_t& old_pool_sum,
+    utime_t *last_ts,
+    const pool_stat_t& current_pool_sum,
+    pool_stat_t *result_pool_delta,
+    utime_t *result_ts_delta,
+    mempool::pgmap::list<std::pair<pool_stat_t,utime_t> > *delta_avg_list);
+
+  void update_one_pool_delta(CephContext *cct,
+                             const utime_t ts,
+                             const int64_t pool,
+                             const pool_stat_t& old_pool_sum);
+
+ public:
+
+  mempool::pgmap::set<pg_t> creating_pgs;
+  mempool::pgmap::map<int,std::map<epoch_t,std::set<pg_t> > > creating_pgs_by_osd_epoch;
+
+  // Bits that used to be enum StuckPG
+  static const int STUCK_INACTIVE = (1<<0);
+  static const int STUCK_UNCLEAN = (1<<1);
+  static const int STUCK_UNDERSIZED = (1<<2);
+  static const int STUCK_DEGRADED = (1<<3);
+  static const int STUCK_STALE = (1<<4);
+  
+  PGMap()
+    : version(0),
+      last_osdmap_epoch(0), last_pg_scan(0)  // version and both epoch markers start at 0
+  {}
+
+  version_t get_version() const {  // current value of the version member
+    return version;
+  }
+  void set_version(version_t v) {  // overwrite the version member
+    version = v;
+  }
+  epoch_t get_last_osdmap_epoch() const {  // current value of last_osdmap_epoch
+    return last_osdmap_epoch;
+  }
+  void set_last_osdmap_epoch(epoch_t e) {  // overwrite last_osdmap_epoch
+    last_osdmap_epoch = e;
+  }
+  epoch_t get_last_pg_scan() const {  // current value of last_pg_scan
+    return last_pg_scan;
+  }
+  void set_last_pg_scan(epoch_t e) {  // overwrite last_pg_scan
+    last_pg_scan = e;
+  }
+  utime_t get_stamp() const {  // current value of stamp (returned by value)
+    return stamp;
+  }
+  void set_stamp(utime_t s) {  // overwrite stamp
+    stamp = s;
+  }
+
+  pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
+    // Copy of the pg_pool_sum entry for this pool; a pool without an entry
+    // yields a default-constructed pool_stat_t.
+    const auto match = pg_pool_sum.find(pool);
+    return (match == pg_pool_sum.end()) ? pool_stat_t() : match->second;
+  }
+
+  osd_stat_t get_osd_sum(const std::set<int>& osds) const {
+    // an empty set stands for "all osds": answer with the osd_sum member
+    if (osds.empty())
+      return osd_sum;
+    osd_stat_t total;
+    for (const int osd_id : osds) {
+      if (const osd_stat_t *one = get_osd_stat(osd_id))  // osds without stats add nothing
+        total.add(*one);
+    }
+    return total;
+  }
+
+  const osd_stat_t *get_osd_stat(int osd) const {
+    // pointer into osd_stat, or nullptr when this osd has no entry
+    const auto found = osd_stat.find(osd);
+    if (found == osd_stat.end())
+      return nullptr;
+    return &found->second;
+  }
+
+
+  void apply_incremental(CephContext *cct, const Incremental& inc);
+  void calc_stats();
+  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
+		   bool sameosds=false);
+  bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
+		   bool sameosds=false);
+  void calc_purged_snaps();
+  void calc_osd_sum_by_class(const OSDMap& osdmap);
+  void stat_osd_add(int osd, const osd_stat_t &s);
+  void stat_osd_sub(int osd, const osd_stat_t &s);
+  
+  void encode(ceph::buffer::list &bl, uint64_t features=-1) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+
+  /// encode subset of our data to a PGMapDigest
+  void encode_digest(const OSDMap& osdmap,
+		     ceph::buffer::list& bl, uint64_t features);
+
+  int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
+  void get_rules_avail(const OSDMap& osdmap,
+		       std::map<int,int64_t> *avail_map) const;
+  void dump(ceph::Formatter *f, bool with_net = true) const;
+  void dump_basic(ceph::Formatter *f) const;
+  void dump_pg_stats(ceph::Formatter *f, bool brief) const;
+  void dump_pg_progress(ceph::Formatter *f) const;
+  void dump_pool_stats(ceph::Formatter *f) const;
+  void dump_osd_stats(ceph::Formatter *f, bool with_net = true) const;
+  void dump_osd_ping_times(ceph::Formatter *f) const;
+  void dump_delta(ceph::Formatter *f) const;
+  void dump_filtered_pg_stats(ceph::Formatter *f, std::set<pg_t>& pgs) const;
+  void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss,
+			    ceph::Formatter *f, bool verbose) const override {
+    get_rules_avail(osd_map, &avail_space_by_rule);  // fill avail_space_by_rule before delegating
+    PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose);  // then run the PGMapDigest implementation
+  }
+
+  /*
+  * Dump client io rate, recovery io rate, cache io rate and recovery information.
+  * this function is used by "ceph osd pool stats" command
+  */
+  void dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, ceph::Formatter *f,
+				   std::stringstream *ss) const;
+
+  void dump_pg_stats_plain(
+    std::ostream& ss,
+    const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
+    bool brief) const;
+  void get_stuck_stats(
+    int types, const utime_t cutoff,
+    mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
+  bool get_stuck_counts(const utime_t cutoff, std::map<std::string, int>& note) const;
+  void dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const;
+  void dump_stuck_plain(std::ostream& ss, int types, utime_t cutoff) const;
+  int dump_stuck_pg_stats(std::stringstream &ds,
+			  ceph::Formatter *f,
+			  int threshold,
+			  std::vector<std::string>& args) const;
+  void dump(std::ostream& ss) const;
+  void dump_basic(std::ostream& ss) const;
+  void dump_pg_stats(std::ostream& ss, bool brief) const;
+  void dump_pg_sum_stats(std::ostream& ss, bool header) const;
+  void dump_pool_stats(std::ostream& ss, bool header) const;
+  void dump_osd_stats(std::ostream& ss) const;
+  void dump_osd_sum_stats(std::ostream& ss) const;
+  void dump_filtered_pg_stats(std::ostream& ss, std::set<pg_t>& pgs) const;
+
+  void dump_osd_perf_stats(ceph::Formatter *f) const;
+  void print_osd_perf_stats(std::ostream *ss) const;
+
+  void dump_osd_blocked_by_stats(ceph::Formatter *f) const;
+  void print_osd_blocked_by_stats(std::ostream *ss) const;
+
+  void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
+                             bool primary, std::set<pg_t>& pgs) const;
+
+  std::set<std::string> osd_parentage(const OSDMap& osdmap, int id) const;
+  void get_health_checks(
+    CephContext *cct,
+    const OSDMap& osdmap,
+    health_check_map_t *checks) const;
+  void print_summary(ceph::Formatter *f, std::ostream *out) const;
+
+  static void generate_test_instances(std::list<PGMap*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(PGMap)
+
+inline std::ostream& operator<<(std::ostream& out, const PGMapDigest& m) {
+  m.print_oneline_summary(NULL, &out);  // stream a PGMapDigest as its one-line summary; first argument is left NULL
+  return out;
+}
+
+int process_pg_map_command(
+  const std::string& prefix,
+  const cmdmap_t& cmdmap,
+  const PGMap& pg_map,
+  const OSDMap& osdmap,
+  ceph::Formatter *f,
+  std::stringstream *ss,
+  ceph::buffer::list *odata);
+
+class PGMapUpdater  // static helpers taking an OSDMap, a PGMap and a PGMap::Incremental to fill
+{
+public:
+  static void check_osd_map(
+    CephContext *cct,
+    const OSDMap &osdmap,
+    const PGMap& pg_map,
+    PGMap::Incremental *pending_inc);  // presumably receives the resulting changes — confirm in PGMap.cc
+
+  // mark a pg's state stale if its acting primary osd is down
+  static void check_down_pgs(
+      const OSDMap &osd_map,
+      const PGMap &pg_map,
+      bool check_all,  // NOTE(review): looks like "ignore the osd set below and check everything" — confirm
+      const std::set<int>& need_check_down_pg_osds,
+      PGMap::Incremental *pending_inc);
+};
+
+namespace reweight {
+/* Assign a lower weight to overloaded OSDs.
+ *
+ * The osds that will get a lower weight are those with a utilization
+ * percentage 'oload' percent greater than the average utilization.
+ */
+  int by_utilization(const OSDMap &osd_map,
+		     const PGMap &pg_map,
+		     int oload,  // overload threshold, in percent (see above)
+		     double max_changef,
+		     int max_osds,
+		     bool by_pg, const std::set<int64_t> *pools,
+		     bool no_increasing,
+		     mempool::osdmap::map<int32_t, uint32_t>* new_weights,  // presumably output: osd -> new weight — confirm
+		     std::stringstream *ss,
+		     std::string *out_str,
+		     ceph::Formatter *f);
+}
+
+#endif
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
new file mode 100644
index 000000000..21f244239
--- /dev/null
+++ b/src/mon/Paxos.cc
@@ -0,0 +1,1591 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <sstream>
+#include "Paxos.h"
+#include "Monitor.h"
+#include "messages/MMonPaxos.h"
+
+#include "mon/mon_types.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "messages/PaxosServiceMessage.h"
+
+using std::string;
+using std::unique_lock;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::to_timespan;
+
+#define dout_subsys ceph_subsys_paxos
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, mon.name, mon.rank, paxos_name, state, first_committed, last_committed)
+static std::ostream& _prefix(std::ostream *_dout, Monitor &mon, const string& name,
+			     int rank, const string& paxos_name, int state,
+			     version_t first_committed, version_t last_committed)
+{
+  std::ostream& os = *_dout;  // layout: mon.N@R(monstate).paxos(name state c first..last)
+  os << "mon." << name << "@" << rank << "(" << mon.get_state_name() << ")";
+  os << ".paxos(" << paxos_name << " " << Paxos::get_statename(state);
+  os << " c " << first_committed << ".." << last_committed << ") ";
+  return os;
+}
+
+class Paxos::C_Trimmed : public Context {
+  // Context that clears Paxos::trimming when it completes.
+  Paxos *paxos;
+public:
+  explicit C_Trimmed(Paxos *p) : paxos(p) { }
+  // the result code r is not inspected
+  void finish(int r) override { paxos->trimming = false; }
+};
+
+MonitorDBStore *Paxos::get_store()  // the store reached through our Monitor (mon.store)
+{
+  return mon.store;
+}
+
+void Paxos::read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
+					  version_t first, version_t last)
+{ // read every stored version in [first, last] (inclusive) and append its decoded transaction to tx
+  dout(10) << __func__ << " first " << first << " last " << last << dendl;
+  for (version_t v = first; v <= last; ++v) {
+    dout(30) << __func__ << " apply version " << v << dendl;
+    bufferlist bl;
+    int err = get_store()->get(get_name(), v, bl);
+    ceph_assert(err == 0);     // every version in the range must be readable
+    ceph_assert(bl.length());  // ... and must not be empty
+    decode_append_transaction(tx, bl);
+  }
+  dout(15) << __func__ << " total versions " << (last - first + 1) << dendl;  // inclusive range: last-first+1 versions
+}
+
+void Paxos::init()
+{
+  // restore the persisted paxos counters from the monitor store
+  MonitorDBStore *store = get_store();
+  last_pn = store->get(get_name(), "last_pn");
+  accepted_pn = store->get(get_name(), "accepted_pn");
+  last_committed = store->get(get_name(), "last_committed");
+  first_committed = store->get(get_name(), "first_committed");
+  dout(10) << __func__ << " last_pn: " << last_pn << " accepted_pn: " << accepted_pn
+	   << " last_committed: " << last_committed
+	   << " first_committed: " << first_committed << dendl;
+
+  dout(10) << "init" << dendl;
+  ceph_assert(is_consistent());  // is_consistent() must hold for what we just loaded
+}
+
+void Paxos::init_logger()  // build the "paxos" perf counter set and register it with g_ceph_context
+{
+  PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last);
+
+  // Because monitors are so few in number, the resource cost of capturing
+  // almost all their perf counters at USEFUL is trivial.
+  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  pcb.add_u64_counter(l_paxos_start_leader, "start_leader", "Starts in leader role");
+  pcb.add_u64_counter(l_paxos_start_peon, "start_peon", "Starts in peon role");
+  pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts");
+  pcb.add_u64_counter(l_paxos_refresh, "refresh", "Refreshes");
+  pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency", "Refresh latency");
+  pcb.add_u64_counter(l_paxos_begin, "begin", "Started and handled begins");
+  pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys", "Keys in transaction on begin");
+  pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes", "Data in transaction on begin", NULL, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_begin_latency, "begin_latency", "Latency of begin operation");
+  pcb.add_u64_counter(l_paxos_commit, "commit",
+      "Commits", "cmt");
+  pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys", "Keys in transaction on commit");
+  pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes", "Data in transaction on commit", NULL, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_commit_latency, "commit_latency",
+      "Commit latency", "clat");
+  pcb.add_u64_counter(l_paxos_collect, "collect", "Peon collects");
+  pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys", "Keys in transaction on peon collect");
+  pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes", "Data in transaction on peon collect", NULL, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_collect_latency, "collect_latency", "Peon collect latency");
+  pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted", "Uncommitted values in started and handled collects");
+  pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout", "Collect timeouts");
+  pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout", "Accept timeouts");
+  pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout", "Lease acknowledgement timeouts");
+  pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout", "Lease timeouts");
+  pcb.add_u64_counter(l_paxos_store_state, "store_state", "Store a shared state on disk");
+  pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys", "Keys in transaction in stored state");
+  pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes", "Data in transaction in stored state", NULL, 0, unit_t(UNIT_BYTES));
+  pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency", "Storing state latency");
+  pcb.add_u64_counter(l_paxos_share_state, "share_state", "Sharings of state");
+  pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys", "Keys in shared state");
+  pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes", "Data in shared state", NULL, 0, unit_t(UNIT_BYTES));
+  pcb.add_u64_counter(l_paxos_new_pn, "new_pn", "New proposal number queries");
+  pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency", "New proposal number getting latency");
+  logger = pcb.create_perf_counters();  // keep the handle: the rest of this file updates counters through logger
+  g_ceph_context->get_perfcounters_collection()->add(logger);
+}
+
+void Paxos::dump_info(Formatter *f)  // dump our commit range and proposal numbers to f
+{
+  f->open_object_section("paxos");  // the four values below go inside a "paxos" section
+  f->dump_unsigned("first_committed", first_committed);
+  f->dump_unsigned("last_committed", last_committed);
+  f->dump_unsigned("last_pn", last_pn);
+  f->dump_unsigned("accepted_pn", accepted_pn);
+  f->close_section();
+}
+
+// ---------------------------------
+
+// PHASE 1
+
+// leader: start phase 1 by sending OP_COLLECT with a fresh pn to the rest of the quorum
+void Paxos::collect(version_t oldpn)
+{
+  // we're recovering, it seems!
+  state = STATE_RECOVERING;
+  ceph_assert(mon.is_leader());
+
+  // reset what a previous collect round may have learned
+  uncommitted_v = 0;
+  uncommitted_pn = 0;
+  uncommitted_value.clear();
+  peer_first_committed.clear();
+  peer_last_committed.clear();
+
+  // look for an uncommitted value of our own (stored at last_committed+1)
+  if (get_store()->exists(get_name(), last_committed+1)) {
+    version_t v = get_store()->get(get_name(), "pending_v");
+    version_t pn = get_store()->get(get_name(), "pending_pn");
+    if (v && pn && v == last_committed + 1) {
+      uncommitted_pn = pn;
+    } else {
+      dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn
+	       << " and crossing our fingers" << dendl;
+      uncommitted_pn = accepted_pn;
+    }
+    uncommitted_v = last_committed+1;
+
+    get_store()->get(get_name(), last_committed+1, uncommitted_value);
+    ceph_assert(uncommitted_value.length());
+    dout(10) << "learned uncommitted " << (last_committed+1)
+	     << " pn " << uncommitted_pn
+	     << " (" << uncommitted_value.length() << " bytes) from myself" 
+	     << dendl;
+
+    logger->inc(l_paxos_collect_uncommitted);
+  }
+
+  // pick new pn, starting from the larger of our accepted_pn and the caller's oldpn
+  accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
+  accepted_pn_from = last_committed;
+  num_last = 1;  // handle_last() compares this against the quorum size; we skip ourselves below
+  dout(10) << "collect with pn " << accepted_pn << dendl;
+
+  // send collect to every quorum member except ourselves
+  for (auto p = mon.get_quorum().begin();
+       p != mon.get_quorum().end();
+       ++p) {
+    if (*p == mon.rank) continue;
+
+    MMonPaxos *collect = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COLLECT,
+				       ceph_clock_now());
+    collect->last_committed = last_committed;
+    collect->first_committed = first_committed;
+    collect->pn = accepted_pn;
+    mon.send_mon_message(collect, *p);
+  }
+
+  // set timeout event: mon_accept_timeout_factor * mon_lease from now
+  collect_timeout_event = mon.timer.add_event_after(
+    g_conf()->mon_accept_timeout_factor *
+    g_conf()->mon_lease,
+    new C_MonContext{&mon, [this](int r) {
+	if (r == -ECANCELED)  // a cancelled event must not count as a timeout
+	  return;
+	collect_timeout();
+    }});
+}
+
+
+// peon: answer an OP_COLLECT with OP_LAST, accepting its pn if it is higher than ours
+void Paxos::handle_collect(MonOpRequestRef op)
+{
+  
+  op->mark_paxos_event("handle_collect");
+
+  auto collect = op->get_req<MMonPaxos>();
+  dout(10) << "handle_collect " << *collect << dendl;
+
+  ceph_assert(mon.is_peon()); // mon epoch filter should catch strays
+
+  // we're recovering, it seems!
+  state = STATE_RECOVERING;
+
+  // update the peon recovery timeout
+  reset_lease_timeout();
+
+  if (collect->first_committed > last_committed+1) {
+    dout(2) << __func__
+            << " leader's lowest version is too high for our last committed"
+            << " (theirs: " << collect->first_committed
+            << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
+    op->mark_paxos_event("need to bootstrap");
+    mon.bootstrap();
+    return;
+  }
+
+  // reply
+  MMonPaxos *last = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LAST,
+				  ceph_clock_now());
+  last->last_committed = last_committed;
+  last->first_committed = first_committed;
+  
+  version_t previous_pn = accepted_pn;  // kept for the "no pending_pn on disk" fallback below
+
+  // can we accept this pn?
+  if (collect->pn > accepted_pn) {
+    // ok, accept it
+    accepted_pn = collect->pn;
+    accepted_pn_from = collect->pn_from;
+    dout(10) << "accepting pn " << accepted_pn << " from " 
+	     << accepted_pn_from << dendl;
+  
+    auto t(std::make_shared<MonitorDBStore::Transaction>());
+    t->put(get_name(), "accepted_pn", accepted_pn);  // persist the newly accepted pn
+
+    dout(30) << __func__ << " transaction dump:\n";
+    JSONFormatter f(true);
+    t->dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    logger->inc(l_paxos_collect);
+    logger->inc(l_paxos_collect_keys, t->get_keys());
+    logger->inc(l_paxos_collect_bytes, t->get_bytes());
+
+    auto start = ceph::coarse_mono_clock::now();
+    get_store()->apply_transaction(t);
+    auto end = ceph::coarse_mono_clock::now();
+
+    logger->tinc(l_paxos_collect_latency, to_timespan(end - start));
+  } else {
+    // don't accept!
+    dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from
+	     << ", we already accepted " << accepted_pn
+	     << " from " << accepted_pn_from << dendl;
+  }
+  last->pn = accepted_pn;  // either way, report the pn we now hold
+  last->pn_from = accepted_pn_from;
+
+  // share whatever committed values we have
+  if (collect->last_committed < last_committed)
+    share_state(last, collect->first_committed, collect->last_committed);
+
+  // do we have an accepted but uncommitted value?
+  //  (it'll be at last_committed+1)
+  bufferlist bl;
+  if (collect->last_committed <= last_committed &&
+      get_store()->exists(get_name(), last_committed+1)) {
+    get_store()->get(get_name(), last_committed+1, bl);
+    ceph_assert(bl.length() > 0);
+    dout(10) << " sharing our accepted but uncommitted value for " 
+	     << last_committed+1 << " (" << bl.length() << " bytes)" << dendl;
+    last->values[last_committed+1] = bl;
+
+    version_t v = get_store()->get(get_name(), "pending_v");
+    version_t pn = get_store()->get(get_name(), "pending_pn");
+    if (v && pn && v == last_committed + 1) {
+      last->uncommitted_pn = pn;
+    } else {
+      // previously we didn't record which pn a value was accepted
+      // under!  use the pn value we just had...  :(
+      dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn
+	       << " and crossing our fingers" << dendl;
+      last->uncommitted_pn = previous_pn;
+    }
+
+    logger->inc(l_paxos_collect_uncommitted);
+  }
+
+  // send reply over the connection the collect arrived on
+  collect->get_connection()->send_message(last);
+}
+
+/**
+ * @note This is Okay. We share our versions between peer_last_committed and
+ *	 our last_committed (inclusive), and add their bufferlists to the
+ *	 message. It will be the peer's job to apply them to its store, as
+ *	 these bufferlists will contain raw transactions.
+ *	 This function is called by both the Peon and the Leader. The Peon will
+ *	 share the state with the Leader during handle_collect(), sharing any
+ *	 values the leader may be missing (i.e., the leader's last_committed is
+ *	 lower than the peon's last_committed). The Leader will share the state
+ *	 with the Peon during handle_last(), if the peon's last_committed is
+ *	 lower than the leader's last_committed.
+ */
+void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed,
+			version_t peer_last_committed)
+{
+  ceph_assert(peer_last_committed < last_committed);  // NOTE: both parameters shadow the members of the same name
+
+  dout(10) << "share_state peer has fc " << peer_first_committed 
+	   << " lc " << peer_last_committed << dendl;
+  version_t v = peer_last_committed + 1;  // first version the peer lacks
+
+  // include incrementals; versions missing from our store are skipped
+  uint64_t bytes = 0;
+  for ( ; v <= last_committed; v++) {
+    if (get_store()->exists(get_name(), v)) {
+      get_store()->get(get_name(), v, m->values[v]);
+      ceph_assert(m->values[v].length());
+      dout(10) << " sharing " << v << " ("
+	       << m->values[v].length() << " bytes)" << dendl;
+      bytes += m->values[v].length() + 16;  // paxos_ + 10 digits = 16
+    }
+  }
+  logger->inc(l_paxos_share_state);
+  logger->inc(l_paxos_share_state_keys, m->values.size());
+  logger->inc(l_paxos_share_state_bytes, bytes);
+
+  m->last_committed = last_committed;  // tell the peer how far the shared values reach
+}
+
+/**
+ * Store on disk a state that was shared with us
+ *
+ * Basically, we received a set of versions. Or just one. It doesn't matter.
+ * What matters is that we have to stash it in the store. So, we will simply
+ * write every single bufferlist into their own versions on our side (i.e.,
+ * onto paxos-related keys), and then we will decode those same bufferlists
+ * we just wrote and apply the transactions they hold. We will also update
+ * our first and last committed values to point to the new values, if need
+ * be. And all this is done tightly wrapped in a transaction to ensure we
+ * enjoy the atomicity guarantees given by our awesome k/v store.
+ */
+bool Paxos::store_state(MMonPaxos *m)
+{
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  auto start = m->values.begin();
+  bool changed = false;  // becomes true only if a transaction was applied
+
+  // build map of values to store
+  // we want to write the range (last_committed, m->last_committed] only.
+  if (start != m->values.end() &&
+      start->first > last_committed + 1) {
+    // ignore everything if values start in the future.
+    dout(10) << "store_state ignoring all values, they start at " << start->first
+	     << " > last_committed+1" << dendl;
+    return false;
+  }
+
+  // push forward the start position on the message's values iterator, until
+  // we run out of positions or reach the first version past 'last_committed'.
+  while (start != m->values.end() && start->first <= last_committed) {
+    ++start;
+  }
+
+  // make sure we get the right interval of values to apply by pushing forward
+  // the 'end' iterator past the message's 'last_committed'; this also advances
+  // our in-memory last_committed to the newest version we are about to store.
+  auto end = start;
+  while (end != m->values.end() && end->first <= m->last_committed) {
+    last_committed = end->first;
+    ++end;
+  }
+
+  if (start == end) {
+    dout(10) << "store_state nothing to commit" << dendl;
+  } else {
+    dout(10) << "store_state [" << start->first << ".." 
+	     << last_committed << "]" << dendl;
+    t->put(get_name(), "last_committed", last_committed);
+
+    // we should apply the state here -- decode every single bufferlist in the
+    // map and append the transactions to 't'.
+    for (auto it = start; it != end; ++it) {
+      // write the bufferlist as the version's value
+      t->put(get_name(), it->first, it->second);
+      // decode the bufferlist and append it to the transaction we will shortly
+      // apply.
+      decode_append_transaction(t, it->second);
+    }
+
+    // discard obsolete uncommitted value?
+    if (uncommitted_v && uncommitted_v <= last_committed) {
+      dout(10) << " forgetting obsolete uncommitted value " << uncommitted_v
+	       << " pn " << uncommitted_pn << dendl;
+      uncommitted_v = 0;
+      uncommitted_pn = 0;
+      uncommitted_value.clear();
+    }
+  }
+  if (!t->empty()) {
+    dout(30) << __func__ << " transaction dump:\n";
+    JSONFormatter f(true);
+    t->dump(&f);
+    f.flush(*_dout);
+    *_dout << dendl;
+
+    logger->inc(l_paxos_store_state);
+    logger->inc(l_paxos_store_state_bytes, t->get_bytes());
+    logger->inc(l_paxos_store_state_keys, t->get_keys());
+
+    auto start = ceph::coarse_mono_clock::now();  // NOTE: shadows the iterator 'start' declared above
+    get_store()->apply_transaction(t);
+    auto end = ceph::coarse_mono_clock::now();  // NOTE: likewise shadows the iterator 'end'
+
+    logger->tinc(l_paxos_store_state_latency, to_timespan(end-start));
+
+    // refresh first_committed; this txn may have trimmed.
+    first_committed = get_store()->get(get_name(), "first_committed");
+
+    _sanity_check_store();
+    changed = true;
+  }
+
+  return changed;
+}
+
+void Paxos::_sanity_check_store()  // assert that the stored "last_committed" equals our in-memory value
+{
+  version_t lc = get_store()->get(get_name(), "last_committed");
+  ceph_assert(lc == last_committed);
+}
+
+
+// leader: process an OP_LAST message (the reply handle_collect() sends to a collect)
+void Paxos::handle_last(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_last");
+  auto last = op->get_req<MMonPaxos>();
+  bool need_refresh = false;  // set when store_state() below applied something
+  int from = last->get_source().num();
+
+  dout(10) << "handle_last " << *last << dendl;
+
+  if (!mon.is_leader()) {
+    dout(10) << "not leader, dropping" << dendl;
+    return;
+  }
+
+  // note peer's first_ and last_committed, in case we learn a new
+  // commit and need to push it to them.
+  peer_first_committed[from] = last->first_committed;
+  peer_last_committed[from] = last->last_committed;
+
+  if (last->first_committed > last_committed + 1) {
+    dout(5) << __func__
+            << " mon." << from
+	    << " lowest version is too high for our last committed"
+            << " (theirs: " << last->first_committed
+            << "; ours: " << last_committed << ") -- bootstrap!" << dendl;
+    op->mark_paxos_event("need to bootstrap");
+    mon.bootstrap();
+    return;
+  }
+
+  ceph_assert(g_conf()->paxos_kill_at != 1);
+
+  // store any committed values if any are specified in the message
+  need_refresh = store_state(last);
+
+  ceph_assert(g_conf()->paxos_kill_at != 2);
+
+  // is everyone contiguous and up to date?
+  for (auto p = peer_last_committed.begin();
+       p != peer_last_committed.end();
+       ++p) {
+    if (p->second + 1 < first_committed && first_committed > 1) {
+      dout(5) << __func__
+	      << " peon " << p->first
+	      << " last_committed (" << p->second
+	      << ") is too low for our first_committed (" << first_committed
+	      << ") -- bootstrap!" << dendl;
+      op->mark_paxos_event("need to bootstrap");
+      mon.bootstrap();
+      return;
+    }
+    if (p->second < last_committed) {
+      // share committed values
+      dout(10) << " sending commit to mon." << p->first << dendl;
+      MMonPaxos *commit = new MMonPaxos(mon.get_epoch(),
+					MMonPaxos::OP_COMMIT,
+					ceph_clock_now());
+      share_state(commit, peer_first_committed[p->first], p->second);
+      mon.send_mon_message(commit, p->first);
+    }
+  }
+
+  // do they accept your pn?
+  if (last->pn > accepted_pn) {
+    // no, try again.
+    dout(10) << " they had a higher pn than us, picking a new one." << dendl;
+
+    // cancel timeout event
+    mon.timer.cancel_event(collect_timeout_event);
+    collect_timeout_event = 0;
+
+    collect(last->pn);  // restart phase 1 with a pn above theirs
+  } else if (last->pn == accepted_pn) {
+    // yes, they accepted our pn.  great.
+    num_last++;
+    dout(10) << " they accepted our pn, we now have " 
+	     << num_last << " peons" << dendl;
+
+    // did this person send back an accepted but uncommitted value?
+    if (last->uncommitted_pn) {
+      if (last->uncommitted_pn >= uncommitted_pn &&
+	  last->last_committed >= last_committed &&
+	  last->last_committed + 1 >= uncommitted_v) {
+	uncommitted_v = last->last_committed+1;
+	uncommitted_pn = last->uncommitted_pn;
+	uncommitted_value = last->values[uncommitted_v];
+	dout(10) << "we learned an uncommitted value for " << uncommitted_v
+		 << " pn " << uncommitted_pn
+		 << " " << uncommitted_value.length() << " bytes"
+		 << dendl;
+      } else {
+	dout(10) << "ignoring uncommitted value for " << (last->last_committed+1)
+		 << " pn " << last->uncommitted_pn
+		 << " " << last->values[last->last_committed+1].length() << " bytes"
+		 << dendl;
+      }
+    }
+    
+    // is that everyone?
+    if (num_last == mon.get_quorum().size()) {
+      // cancel timeout event
+      mon.timer.cancel_event(collect_timeout_event);
+      collect_timeout_event = 0;
+      peer_first_committed.clear();
+      peer_last_committed.clear();
+
+      // almost...
+
+      // did we learn an old value?
+      if (uncommitted_v == last_committed+1 &&
+	  uncommitted_value.length()) {
+	dout(10) << "that's everyone.  begin on old learned value" << dendl;
+	state = STATE_UPDATING_PREVIOUS;
+	begin(uncommitted_value);
+      } else {
+	// active!
+	dout(10) << "that's everyone.  active!" << dendl;
+	extend_lease();
+
+	need_refresh = false;  // the do_refresh() right below already covers it
+	if (do_refresh()) {
+	  finish_round();
+	}
+      }
+    }
+  } else {
+    // no, this is an old message, discard
+    dout(10) << "old pn, ignoring" << dendl;
+  }
+
+  if (need_refresh)
+    (void)do_refresh();
+}
+
+void Paxos::collect_timeout()  // runs when the timer event armed in collect() fires
+{
+  dout(1) << "collect timeout, calling fresh election" << dendl;
+  collect_timeout_event = 0;  // forget the event handle now that it has fired
+  logger->inc(l_paxos_collect_timeout);
+  ceph_assert(mon.is_leader());
+  mon.bootstrap();
+}
+
+
+// leader: persist v as the pending value for last_committed+1 and send OP_BEGIN to the quorum
+void Paxos::begin(bufferlist& v)
+{
+  dout(10) << "begin for " << last_committed+1 << " " 
+	   << v.length() << " bytes"
+	   << dendl;
+
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_updating() || is_updating_previous());
+
+  // we must already have a majority for this to work.
+  ceph_assert(mon.get_quorum().size() == 1 ||
+	 num_last > (unsigned)mon.monmap->size()/2);
+  
+  // and no value, yet.
+  ceph_assert(new_value.length() == 0);
+
+  // accept it ourselves
+  accepted.clear();
+  accepted.insert(mon.rank);
+  new_value = v;
+
+  if (last_committed == 0) {
+    auto t(std::make_shared<MonitorDBStore::Transaction>());
+    // initial base case; set first_committed too
+    t->put(get_name(), "first_committed", 1);
+    decode_append_transaction(t, new_value);
+
+    bufferlist tx_bl;
+    t->encode(tx_bl);
+
+    new_value = tx_bl;  // propose the re-encoded transaction that also sets first_committed
+  }
+
+  // store the proposed value in the store. IF it is accepted, we will then
+  // have to decode it into a transaction and apply it.
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  t->put(get_name(), last_committed+1, new_value);
+
+  // note which pn this pending value is for.
+  t->put(get_name(), "pending_v", last_committed + 1);
+  t->put(get_name(), "pending_pn", accepted_pn);
+
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  t->dump(&f);
+  f.flush(*_dout);
+  auto debug_tx(std::make_shared<MonitorDBStore::Transaction>());
+  auto new_value_it = new_value.cbegin();
+  debug_tx->decode(new_value_it);  // decode the proposed value just to dump it
+  debug_tx->dump(&f);
+  *_dout << "\nbl dump:\n";
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  logger->inc(l_paxos_begin);
+  logger->inc(l_paxos_begin_keys, t->get_keys());
+  logger->inc(l_paxos_begin_bytes, t->get_bytes());
+
+  auto start = ceph::coarse_mono_clock::now();
+  get_store()->apply_transaction(t);
+  auto end = ceph::coarse_mono_clock::now();
+
+  logger->tinc(l_paxos_begin_latency, to_timespan(end - start));
+
+  ceph_assert(g_conf()->paxos_kill_at != 3);
+
+  if (mon.get_quorum().size() == 1) {
+    // we're alone, take it easy
+    commit_start();
+    return;
+  }
+
+  // ask others to accept it too!
+  for (auto p = mon.get_quorum().begin();
+       p != mon.get_quorum().end();
+       ++p) {
+    if (*p == mon.rank) continue;
+    
+    dout(10) << " sending begin to mon." << *p << dendl;
+    MMonPaxos *begin = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_BEGIN,
+				     ceph_clock_now());
+    begin->values[last_committed+1] = new_value;
+    begin->last_committed = last_committed;
+    begin->pn = accepted_pn;
+    
+    mon.send_mon_message(begin, *p);
+  }
+
+  // set timeout event: mon_accept_timeout_factor * mon_lease from now
+  accept_timeout_event = mon.timer.add_event_after(
+    g_conf()->mon_accept_timeout_factor * g_conf()->mon_lease,
+    new C_MonContext{&mon, [this](int r) {
+	if (r == -ECANCELED)  // a cancelled event must not count as a timeout
+	  return;
+	accept_timeout();
+      }});
+}
+
+// peon
+/*
+ * Handle an OP_BEGIN message.
+ *
+ * A begin whose pn is lower than our accepted_pn is ignored.  Otherwise the
+ * value carried for version last_committed+1 is written to the store along
+ * with "pending_v"/"pending_pn" markers, and an OP_ACCEPT is sent back over
+ * the connection the request arrived on.
+ */
+void Paxos::handle_begin(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_begin");
+  auto begin = op->get_req<MMonPaxos>();
+  dout(10) << "handle_begin " << *begin << dendl;
+
+  // can we accept this?
+  if (begin->pn < accepted_pn) {
+    dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
+    op->mark_paxos_event("have higher pn, ignore");
+    return;
+  }
+  // past the check above pn >= accepted_pn; a pn that is not an exact match,
+  // or a begin built on a different last_committed, trips an assert.
+  ceph_assert(begin->pn == accepted_pn);
+  ceph_assert(begin->last_committed == last_committed);
+  
+  // NOTE(review): paxos_kill_at looks like a fault-injection option that
+  // aborts at a numbered step -- confirm against the config reference.
+  ceph_assert(g_conf()->paxos_kill_at != 4);
+
+  logger->inc(l_paxos_begin);
+
+  // set state.
+  state = STATE_UPDATING;
+  lease_expire = {};  // cancel lease
+
+  // yes.
+  version_t v = last_committed+1;
+  dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl;
+  // store the accepted value onto our store. We will have to decode it and
+  // apply its transaction once we receive permission to commit.
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+  t->put(get_name(), v, begin->values[v]);
+
+  // note which pn this pending value is for.
+  t->put(get_name(), "pending_v", v);
+  t->put(get_name(), "pending_pn", accepted_pn);
+
+  // the dout(30) statement stays open across the formatter calls below and
+  // is only terminated by the final dendl.
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  t->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  logger->inc(l_paxos_begin_bytes, t->get_bytes());
+
+  // time the store write; the elapsed time feeds l_paxos_begin_latency.
+  auto start = ceph::coarse_mono_clock::now();
+  get_store()->apply_transaction(t);
+  auto end = ceph::coarse_mono_clock::now();
+
+  logger->tinc(l_paxos_begin_latency, to_timespan(end - start));
+
+  ceph_assert(g_conf()->paxos_kill_at != 5);
+
+  // reply
+  MMonPaxos *accept = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_ACCEPT,
+				    ceph_clock_now());
+  accept->pn = accepted_pn;
+  accept->last_committed = last_committed;
+  begin->get_connection()->send_message(accept);
+}
+
+// leader
+/*
+ * Handle an OP_ACCEPT reply.
+ *
+ * Replies for a pn other than accepted_pn, or from more than one version
+ * behind last_committed, are ignored.  Otherwise the sender's rank is added
+ * to `accepted`; once that set equals the monitor quorum, commit_start() is
+ * called.
+ */
+void Paxos::handle_accept(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_accept");
+  auto accept = op->get_req<MMonPaxos>();
+  dout(10) << "handle_accept " << *accept << dendl;
+  int from = accept->get_source().num();
+
+  if (accept->pn != accepted_pn) {
+    // we accepted a higher pn, from some other leader
+    dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
+    op->mark_paxos_event("have higher pn, ignore");
+    return;
+  }
+  // the last_committed > 0 guard keeps last_committed-1 from wrapping
+  if (last_committed > 0 &&
+      accept->last_committed < last_committed-1) {
+    dout(10) << " this is from an old round, ignoring" << dendl;
+    op->mark_paxos_event("old round, ignore");
+    return;
+  }
+  ceph_assert(accept->last_committed == last_committed ||   // not committed
+	 accept->last_committed == last_committed-1);  // committed
+
+  ceph_assert(is_updating() || is_updating_previous());
+  // a second accept from the same rank in one round is treated as fatal
+  ceph_assert(accepted.count(from) == 0);
+  accepted.insert(from);
+  dout(10) << " now " << accepted << " have accepted" << dendl;
+
+  // NOTE(review): paxos_kill_at looks like a fault-injection option that
+  // aborts at a numbered step -- confirm against the config reference.
+  ceph_assert(g_conf()->paxos_kill_at != 6);
+
+  // only commit (and expose committed state) when we get *all* quorum
+  // members to accept.  otherwise, they may still be sharing the now
+  // stale state.
+  // FIXME: we can improve this with an additional lease revocation message
+  // that doesn't block for the persist.
+  // NOTE(review): the log line below says "majority", but the condition
+  // requires the accepted set to equal the entire quorum.
+  if (accepted == mon.get_quorum()) {
+    // yay, commit!
+    dout(10) << " got majority, committing, done with update" << dendl;
+    op->mark_paxos_event("commit_start");
+    commit_start();
+  }
+}
+
+/*
+ * Fired when the accept timer expires.  Clears the timer handle, checks that
+ * we are the leader and in one of the updating/writing states, bumps the
+ * l_paxos_accept_timeout counter and calls mon.bootstrap().
+ */
+void Paxos::accept_timeout()
+{
+  dout(1) << "accept timeout, calling fresh election" << dendl;
+  // the event has already fired; forget the handle so nobody cancels it
+  accept_timeout_event = 0;
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_updating() || is_updating_previous() || is_writing() ||
+	 is_writing_previous());
+  logger->inc(l_paxos_accept_timeout);
+  mon.bootstrap();
+}
+
+/*
+ * Completion context handed to the store together with the commit
+ * transaction (see Paxos::commit_start()).  Once the write completes it takes
+ * the monitor lock and either finishes the commit or, if Paxos is shutting
+ * down, only accounts for the aborted commit.
+ */
+struct C_Committed : public Context {
+  Paxos *paxos;
+  explicit C_Committed(Paxos *p) : paxos(p) {}
+  void finish(int r) override {
+    ceph_assert(r >= 0);
+    std::lock_guard l(paxos->mon.lock);
+    if (!paxos->is_shutdown()) {
+      // normal path: complete the round
+      paxos->commit_finish();
+      return;
+    }
+    // shutting down: just drop the in-flight commit count
+    paxos->abort_commit();
+  }
+};
+
+/*
+ * Account for a commit that will not be finished: decrement commits_started
+ * and, when it reaches zero, wake anything waiting on shutdown_cond.
+ */
+void Paxos::abort_commit()
+{
+  ceph_assert(commits_started > 0);
+  if (--commits_started == 0) {
+    shutdown_cond.notify_all();
+  }
+}
+
+/*
+ * Start committing the value for version last_committed+1.
+ *
+ * Builds a transaction that sets "last_committed" to last_committed+1 and
+ * appends the operations decoded from new_value, then queues it on the store
+ * with a C_Committed completion.  The in-memory last_committed is not changed
+ * here; commit_finish() increments it.  The state moves from UPDATING to
+ * WRITING (or UPDATING_PREVIOUS to WRITING_PREVIOUS); any other state aborts.
+ */
+void Paxos::commit_start()
+{
+  dout(10) << __func__ << " " << (last_committed+1) << dendl;
+
+  // NOTE(review): paxos_kill_at looks like a fault-injection option that
+  // aborts at a numbered step -- confirm against the config reference.
+  ceph_assert(g_conf()->paxos_kill_at != 7);
+
+  auto t(std::make_shared<MonitorDBStore::Transaction>());
+
+  // commit locally
+  t->put(get_name(), "last_committed", last_committed + 1);
+
+  // decode the value and apply its transaction to the store.
+  // this value can now be read from last_committed.
+  decode_append_transaction(t, new_value);
+
+  // the dout(30) statement stays open across the formatter calls below and
+  // is only terminated by the final dendl.
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  t->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  logger->inc(l_paxos_commit);
+  logger->inc(l_paxos_commit_keys, t->get_keys());
+  logger->inc(l_paxos_commit_bytes, t->get_bytes());
+  // commit_finish() subtracts this stamp to report the commit latency
+  commit_start_stamp = ceph_clock_now();
+
+  get_store()->queue_transaction(t, new C_Committed(this));
+
+  if (is_updating_previous())
+    state = STATE_WRITING_PREVIOUS;
+  else if (is_updating())
+    state = STATE_WRITING;
+  else
+    ceph_abort();
+  // balanced by commit_finish() or abort_commit()
+  ++commits_started;
+
+  if (mon.get_quorum().size() > 1) {
+    // cancel timeout event
+    mon.timer.cancel_event(accept_timeout_event);
+    accept_timeout_event = 0;
+  }
+}
+
+/*
+ * Complete a commit started by commit_start() (invoked from
+ * C_Committed::finish()).
+ *
+ * Records the commit latency, advances last_committed, re-reads
+ * first_committed from the store, sends OP_COMMIT with the committed value
+ * to every other quorum member, clears new_value and moves to STATE_REFRESH.
+ * If do_refresh() succeeds, the committing finishers are completed, the
+ * lease is extended (quorum larger than one) and the round is finished.
+ */
+void Paxos::commit_finish()
+{
+  dout(20) << __func__ << " " << (last_committed+1) << dendl;
+  utime_t end = ceph_clock_now();
+  logger->tinc(l_paxos_commit_latency, end - commit_start_stamp);
+
+  // NOTE(review): paxos_kill_at looks like a fault-injection option that
+  // aborts at a numbered step -- confirm against the config reference.
+  ceph_assert(g_conf()->paxos_kill_at != 8);
+
+  // cancel lease - it was for the old value.
+  //  (this would only happen if message layer lost the 'begin', but
+  //   leader still got a majority and committed without us.)
+  lease_expire = {};  // cancel lease
+
+  last_committed++;
+  last_commit_time = ceph_clock_now();
+
+  // refresh first_committed; this txn may have trimmed.
+  first_committed = get_store()->get(get_name(), "first_committed");
+
+  _sanity_check_store();
+
+  // tell everyone
+  for (auto p = mon.get_quorum().begin();
+       p != mon.get_quorum().end();
+       ++p) {
+    if (*p == mon.rank) continue;
+
+    dout(10) << " sending commit to mon." << *p << dendl;
+    MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT,
+				      ceph_clock_now());
+    // last_committed was already incremented above, so this is the new version
+    commit->values[last_committed] = new_value;
+    commit->pn = accepted_pn;
+    commit->last_committed = last_committed;
+
+    mon.send_mon_message(commit, *p);
+  }
+
+  ceph_assert(g_conf()->paxos_kill_at != 9);
+
+  // get ready for a new round.
+  new_value.clear();
+
+  // WRITING -> REFRESH
+  // among other things, this lets do_refresh() -> mon.bootstrap() ->
+  // wait_for_paxos_write() know that it doesn't need to flush the store
+  // queue. and it should not, as we are in the async completion thread now!
+  ceph_assert(is_writing() || is_writing_previous());
+  state = STATE_REFRESH;
+  ceph_assert(commits_started > 0);
+  --commits_started;
+
+  // do_refresh() returns false when it triggered a bootstrap; in that case
+  // nothing more is done here.
+  if (do_refresh()) {
+    commit_proposal();
+    if (mon.get_quorum().size() > 1) {
+      extend_lease();
+    }
+
+    ceph_assert(g_conf()->paxos_kill_at != 10);
+
+    finish_round();
+  }
+}
+
+
+/*
+ * Handle an OP_COMMIT message: store the committed state it carries and
+ * refresh.  Receiving one while not a peon aborts.
+ */
+void Paxos::handle_commit(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_commit");
+  auto msg = op->get_req<MMonPaxos>();
+  dout(10) << "handle_commit on " << msg->last_committed << dendl;
+
+  logger->inc(l_paxos_commit);
+
+  const bool peon = mon.is_peon();
+  if (!peon) {
+    dout(10) << "not a peon, dropping" << dendl;
+    ceph_abort();
+    return;
+  }
+
+  op->mark_paxos_event("store_state");
+  store_state(msg);
+
+  // the result (whether a bootstrap was triggered) is deliberately ignored
+  (void)do_refresh();
+}
+
+/*
+ * Leader only: start a new lease of mon_lease seconds from now.
+ *
+ * Resets acked_lease to contain just our own rank, sends OP_LEASE to every
+ * other quorum member, arms the lease-ack timeout unless one is already
+ * pending, and schedules the next renewal.
+ */
+void Paxos::extend_lease()
+{
+  ceph_assert(mon.is_leader());
+  //assert(is_active());
+
+  lease_expire = ceph::real_clock::now();
+  lease_expire += ceph::make_timespan(g_conf()->mon_lease);
+  acked_lease.clear();
+  acked_lease.insert(mon.rank);
+
+  dout(7) << "extend_lease now+" << g_conf()->mon_lease
+	  << " (" << lease_expire << ")" << dendl;
+
+  // bcast
+  for (auto p = mon.get_quorum().begin();
+      p != mon.get_quorum().end(); ++p) {
+
+    if (*p == mon.rank) continue;
+    MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE,
+				     ceph_clock_now());
+    lease->last_committed = last_committed;
+    lease->lease_timestamp = utime_t{lease_expire};
+    lease->first_committed = first_committed;
+    mon.send_mon_message(lease, *p);
+  }
+
+  // set timeout event.
+  //  if old timeout is still in place, leave it.
+  if (!lease_ack_timeout_event) {
+    lease_ack_timeout_event = mon.timer.add_event_after(
+      g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease,
+      new C_MonContext{&mon, [this](int r) {
+	  if (r == -ECANCELED)
+	    return;
+	  lease_ack_timeout();
+	}});
+  }
+
+  // set renew event
+  // lease_expire is now + mon_lease, so subtracting mon_lease and adding
+  // factor * mon_lease schedules the renewal at now + factor * mon_lease.
+  auto at = lease_expire;
+  at -= ceph::make_timespan(g_conf()->mon_lease);
+  at += ceph::make_timespan(g_conf()->mon_lease_renew_interval_factor *
+			    g_conf()->mon_lease);
+  lease_renew_event = mon.timer.add_event_at(
+    at, new C_MonContext{&mon, [this](int r) {
+	if (r == -ECANCELED)
+	  return;
+	lease_renew_timeout();
+    }});
+}
+
+/*
+ * Emit a cluster-log warning when a message timestamp `t` from `from` lies
+ * further in the future than mon_clock_drift_allowed.  Warnings are rate
+ * limited: the time since the previous warning must exceed
+ * mon_clock_drift_warn_backoff raised to the number of warnings so far.
+ */
+void Paxos::warn_on_future_time(utime_t t, entity_name_t from)
+{
+  utime_t current = ceph_clock_now();
+  if (!(t > current))
+    return;  // not stamped in the future
+
+  utime_t ahead = t - current;
+  if (!(ahead > g_conf()->mon_clock_drift_allowed))
+    return;  // within the allowed drift
+
+  utime_t since_last_warn = current - last_clock_drift_warn;
+  if (!(since_last_warn >
+	pow(g_conf()->mon_clock_drift_warn_backoff, clock_drift_warned)))
+    return;  // still backing off
+
+  mon.clog->warn() << "message from " << from << " was stamped " << ahead
+		   << "s in the future, clocks not synchronized";
+  last_clock_drift_warn = ceph_clock_now();
+  ++clock_drift_warned;
+}
+
+/*
+ * Ask the monitor to reload state from paxos and record how long that took.
+ *
+ * @return true if the refresh completed normally; false if the refresh
+ *         requested a bootstrap, in which case mon.bootstrap() has been
+ *         called.
+ */
+bool Paxos::do_refresh()
+{
+  bool need_bootstrap = false;
+
+  // make sure we have the latest state loaded up
+  const auto t0 = ceph::coarse_mono_clock::now();
+  mon.refresh_from_paxos(&need_bootstrap);
+  const auto t1 = ceph::coarse_mono_clock::now();
+
+  logger->inc(l_paxos_refresh);
+  logger->tinc(l_paxos_refresh_latency, to_timespan(t1 - t0));
+
+  if (!need_bootstrap)
+    return true;
+
+  dout(10) << " doing requested bootstrap" << dendl;
+  mon.bootstrap();
+  return false;
+}
+
+/*
+ * Leader only, in STATE_REFRESH: complete every context queued in
+ * committing_finishers.
+ */
+void Paxos::commit_proposal()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_refresh());
+
+  finish_contexts(g_ceph_context, committing_finishers);
+}
+
+/*
+ * Leader only: end the current round.  Moves to STATE_ACTIVE, completes the
+ * waiting_for_active, waiting_for_readable and waiting_for_writeable
+ * contexts (in that order), trims if should_trim() says so, and proposes
+ * again if we are still active and a pending proposal exists.
+ */
+void Paxos::finish_round()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(mon.is_leader());
+
+  // ok, now go active!
+  state = STATE_ACTIVE;
+
+  // NOTE(review): the label logged below reads "waiting_for_acting" while the
+  // list being completed is waiting_for_active.
+  dout(20) << __func__ << " waiting_for_acting" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_active);
+  dout(20) << __func__ << " waiting_for_readable" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_readable);
+  dout(20) << __func__ << " waiting_for_writeable" << dendl;
+  finish_contexts(g_ceph_context, waiting_for_writeable);
+  
+  dout(10) << __func__ << " done w/ waiters, state " << get_statename(state) << dendl;
+
+  if (should_trim()) {
+    trim();
+  }
+
+  // is_active() is re-checked here, after the waiter callbacks have run.
+  if (is_active() && pending_proposal) {
+    propose_pending();
+  }
+}
+
+
+// peon
+/*
+ * Handle an OP_LEASE message.
+ *
+ * Dropped unless we are a peon and the message's last_committed matches
+ * ours.  Otherwise lease_expire is moved forward if the new expiry is later,
+ * the state becomes STATE_ACTIVE, an OP_LEASE_ACK carrying our feature map
+ * is sent back, the lease timeout is re-armed and waiters are completed.
+ */
+void Paxos::handle_lease(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_lease");
+  auto lease = op->get_req<MMonPaxos>();
+  // sanity
+  // NOTE(review): only the peon role and last_committed are tested here; the
+  // "they're not the leader" part of the log text has no matching check in
+  // this function.
+  if (!mon.is_peon() ||
+      last_committed != lease->last_committed) {
+    dout(10) << "handle_lease i'm not a peon, or they're not the leader,"
+	     << " or the last_committed doesn't match, dropping" << dendl;
+    op->mark_paxos_event("invalid lease, ignore");
+    return;
+  }
+
+  warn_on_future_time(lease->sent_timestamp, lease->get_source());
+
+  // extend lease
+  // the lease is only ever moved forward; an older expiry is ignored.
+  if (auto new_expire = lease->lease_timestamp.to_real_time();
+      lease_expire < new_expire) {
+    lease_expire = new_expire;
+
+    auto now = ceph::real_clock::now();
+    if (lease_expire < now) {
+      auto diff = now - lease_expire;
+      derr << "lease_expire from " << lease->get_source_inst() << " is " << diff << " seconds in the past; mons are probably laggy (or possibly clocks are too skewed)" << dendl;
+    }
+  }
+
+  state = STATE_ACTIVE;
+
+  dout(10) << "handle_lease on " << lease->last_committed
+	   << " now " << lease_expire << dendl;
+
+  // ack
+  MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK,
+				 ceph_clock_now());
+  ack->last_committed = last_committed;
+  ack->first_committed = first_committed;
+  ack->lease_timestamp = ceph_clock_now();
+  encode(mon.session_map.feature_map, ack->feature_map);
+  lease->get_connection()->send_message(ack);
+
+  // (re)set timeout event.
+  reset_lease_timeout();
+
+  // kick waiters
+  finish_contexts(g_ceph_context, waiting_for_active);
+  if (is_readable())
+    finish_contexts(g_ceph_context, waiting_for_readable);
+}
+
+/*
+ * Handle an OP_LEASE_ACK message.
+ *
+ * Acks arriving while no lease-ack timeout is armed are logged as stray, and
+ * repeated acks from the same rank are logged as duplicates.  A first ack is
+ * recorded in acked_lease, its feature map (if any) is decoded into
+ * mon.quorum_feature_map, and once every quorum member has acked the
+ * lease-ack timeout is cancelled.  The sender's timestamp is checked for
+ * clock drift in all cases.
+ */
+void Paxos::handle_lease_ack(MonOpRequestRef op)
+{
+  op->mark_paxos_event("handle_lease_ack");
+  auto ack = op->get_req<MMonPaxos>();
+  const int peer = ack->get_source().num();
+
+  if (!lease_ack_timeout_event) {
+    dout(10) << "handle_lease_ack from " << ack->get_source()
+	     << " -- stray (probably since revoked)" << dendl;
+  } else if (acked_lease.count(peer) != 0) {
+    dout(10) << "handle_lease_ack from " << ack->get_source()
+	     << " dup (lagging!), ignoring" << dendl;
+  } else {
+    acked_lease.insert(peer);
+
+    // pick up the peer's feature map when it sent one
+    if (ack->feature_map.length() != 0) {
+      auto it = ack->feature_map.cbegin();
+      decode(mon.quorum_feature_map[peer], it);
+    }
+
+    if (acked_lease != mon.get_quorum()) {
+      dout(10) << "handle_lease_ack from " << ack->get_source()
+	       << " -- still need "
+	       << mon.get_quorum().size() - acked_lease.size()
+	       << " more" << dendl;
+    } else {
+      // yay!
+      dout(10) << "handle_lease_ack from " << ack->get_source()
+	       << " -- got everyone" << dendl;
+      mon.timer.cancel_event(lease_ack_timeout_event);
+      lease_ack_timeout_event = 0;
+    }
+  }
+
+  warn_on_future_time(ack->sent_timestamp, ack->get_source());
+}
+
+/*
+ * Fired when the lease-ack timer armed in extend_lease() expires.  Asserts
+ * that we are an active leader, bumps l_paxos_lease_ack_timeout, clears the
+ * timer handle and calls mon.bootstrap().
+ */
+void Paxos::lease_ack_timeout()
+{
+  dout(1) << "lease_ack_timeout -- calling new election" << dendl;
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_active());
+  logger->inc(l_paxos_lease_ack_timeout);
+  lease_ack_timeout_event = 0;
+  mon.bootstrap();
+}
+
+/*
+ * (Re)arm the lease timeout: cancel any pending one and schedule
+ * lease_timeout() to run after mon_lease_ack_timeout_factor * mon_lease.
+ */
+void Paxos::reset_lease_timeout()
+{
+  dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
+  if (lease_timeout_event) {
+    mon.timer.cancel_event(lease_timeout_event);
+  }
+  lease_timeout_event = mon.timer.add_event_after(
+    g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease,
+    new C_MonContext{&mon, [this](int r) {
+	// a cancelled event must not trigger the timeout handler
+	if (r != -ECANCELED)
+	  lease_timeout();
+      }});
+}
+
+/*
+ * Fired when the timer armed by reset_lease_timeout() expires.  Asserts that
+ * we are a peon, bumps l_paxos_lease_timeout, clears the timer handle and
+ * calls mon.bootstrap().
+ */
+void Paxos::lease_timeout()
+{
+  dout(1) << "lease_timeout -- calling new election" << dendl;
+  ceph_assert(mon.is_peon());
+  logger->inc(l_paxos_lease_timeout);
+  lease_timeout_event = 0;
+  mon.bootstrap();
+}
+
+/*
+ * Fired by the renew event scheduled in extend_lease(): clears the handle
+ * and extends the lease again (which schedules the next renewal).
+ */
+void Paxos::lease_renew_timeout()
+{
+  lease_renew_event = 0;
+  extend_lease();
+}
+
+
+/*
+ * trim old states
+ *
+ * Adds to the pending transaction the erasure of versions
+ * [first_committed, end) and the new "first_committed" value, where end is
+ * the smaller of get_version() - paxos_min and
+ * get_first_committed() + paxos_trim_max.  Nothing is done when
+ * first_committed is already >= end.  Requires should_trim().
+ */
+void Paxos::trim()
+{
+  ceph_assert(should_trim());
+  version_t end = std::min(get_version() - g_conf()->paxos_min,
+		      get_first_committed() + g_conf()->paxos_trim_max);
+
+  if (first_committed >= end)
+    return;
+
+  dout(10) << "trim to " << end << " (was " << first_committed << ")" << dendl;
+
+  // the erasures ride on the pending proposal rather than being applied here
+  MonitorDBStore::TransactionRef t = get_pending_transaction();
+
+  for (version_t v = first_committed; v < end; ++v) {
+    dout(10) << "trim " << v << dendl;
+    t->erase(get_name(), v);
+  }
+  t->put(get_name(), "first_committed", end);
+  if (g_conf()->mon_compact_on_trim) {
+    dout(10) << " compacting trimmed range" << dendl;
+    // NOTE(review): the range bounds are passed as stringified version
+    // numbers starting at first_committed - 1 -- confirm the intended key
+    // ordering with the store's compact_range().
+    t->compact_range(get_name(), stringify(first_committed - 1), stringify(end));
+  }
+
+  trimming = true;
+  queue_pending_finisher(new C_Trimmed(this));
+}
+
+/*
+ * return a globally unique, monotonically increasing proposal number
+ *
+ * The new number is at least `gt`, rounded up to the next multiple of 100
+ * with this monitor's rank added, and is persisted as "last_pn" before being
+ * returned.
+ */
+version_t Paxos::get_new_proposal_number(version_t gt)
+{
+  // never go below the floor supplied by the caller
+  last_pn = std::max(last_pn, gt);
+
+  // update. make it unique among all monitors.
+  // (next multiple of 100, plus our rank in the low two digits)
+  last_pn = (last_pn / 100 + 1) * 100 + (version_t)mon.rank;
+
+  // write
+  auto txn(std::make_shared<MonitorDBStore::Transaction>());
+  txn->put(get_name(), "last_pn", last_pn);
+
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  txn->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  logger->inc(l_paxos_new_pn);
+
+  const auto t0 = ceph::coarse_mono_clock::now();
+  get_store()->apply_transaction(txn);
+  const auto t1 = ceph::coarse_mono_clock::now();
+
+  logger->tinc(l_paxos_new_pn_latency, to_timespan(t1 - t0));
+
+  dout(10) << "get_new_proposal_number = " << last_pn << dendl;
+  return last_pn;
+}
+
+
+/*
+ * Cancel every timer event Paxos may have pending (collect, accept, lease
+ * renew, lease ack and lease timeouts) and clear the corresponding handles.
+ */
+void Paxos::cancel_events()
+{
+  // cancel one event if it is set, then forget its handle
+  auto drop = [this](auto &event) {
+    if (event) {
+      mon.timer.cancel_event(event);
+      event = 0;
+    }
+  };
+
+  drop(collect_timeout_event);
+  drop(accept_timeout_event);
+  drop(lease_renew_event);
+  drop(lease_ack_timeout_event);
+  drop(lease_timeout_event);
+}
+
+/*
+ * Shut Paxos down: enter STATE_SHUTDOWN, drop the pending proposal, wait
+ * until no commits are in flight, complete all waiter and finisher contexts
+ * with -ECANCELED and unregister the perf counters.
+ *
+ * The wait adopts mon.lock (i.e. expects it to be held on entry) and leaves
+ * it held on return.
+ */
+void Paxos::shutdown()
+{
+  dout(10) << __func__ << " cancel all contexts" << dendl;
+
+  state = STATE_SHUTDOWN;
+
+  // discard pending transaction
+  pending_proposal.reset();
+
+  // Let store finish commits in progress
+  // XXX: I assume I can't use finish_contexts() because the store
+  // is going to trigger
+  // adopt_lock: wrap the already-held mon.lock so the condition variable can
+  // release and re-acquire it while waiting.
+  unique_lock l{mon.lock, std::adopt_lock};
+  shutdown_cond.wait(l, [this] { return commits_started <= 0; });
+  // Monitor::shutdown() will unlock it
+  // release() detaches the wrapper so its destructor does not unlock.
+  l.release();
+
+  finish_contexts(g_ceph_context, waiting_for_writeable, -ECANCELED);
+  finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
+  finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED);
+  finish_contexts(g_ceph_context, pending_finishers, -ECANCELED);
+  finish_contexts(g_ceph_context, committing_finishers, -ECANCELED);
+  if (logger)
+    g_ceph_context->get_perfcounters_collection()->remove(logger);
+}
+
+/*
+ * Leader-side (re)initialisation.  Cancels timers, clears new_value and the
+ * pending proposal, and completes outstanding finishers with -EAGAIN.  With
+ * a quorum of one we become STATE_ACTIVE immediately; otherwise we enter
+ * STATE_RECOVERING, clear the lease and start a collect round.
+ */
+void Paxos::leader_init()
+{
+  cancel_events();
+  new_value.clear();
+
+  // discard pending transaction
+  pending_proposal.reset();
+
+  reset_pending_committing_finishers();
+
+  logger->inc(l_paxos_start_leader);
+
+  // alone in the quorum: nothing to recover from anybody
+  if (mon.get_quorum().size() == 1) {
+    state = STATE_ACTIVE;
+    return;
+  }
+
+  state = STATE_RECOVERING;
+  lease_expire = {};
+  dout(10) << "leader_init -- starting paxos recovery" << dendl;
+  collect(0);
+}
+
+/*
+ * Peon-side (re)initialisation.  Cancels timers, clears new_value, enters
+ * STATE_RECOVERING with no lease, arms the lease timeout, drops the pending
+ * proposal and completes outstanding finishers and writeable-waiters with
+ * -EAGAIN.
+ */
+void Paxos::peon_init()
+{
+  cancel_events();
+  new_value.clear();
+
+  state = STATE_RECOVERING;
+  lease_expire = {};
+  dout(10) << "peon_init -- i am a peon" << dendl;
+
+  // start a timer, in case the leader never manages to issue a lease
+  reset_lease_timeout();
+
+  // discard pending transaction
+  pending_proposal.reset();
+
+  // no chance to write now!
+  reset_pending_committing_finishers();
+  finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN);
+
+  logger->inc(l_paxos_start_peon);
+}
+
+/*
+ * Reset Paxos to STATE_RECOVERING.  Cancels timers and clears new_value; if
+ * a commit write is in progress the store is flushed first, with mon.lock
+ * released for the duration of the flush.  The pending proposal is dropped
+ * and outstanding finishers and active-waiters are completed with -EAGAIN.
+ */
+void Paxos::restart()
+{
+  dout(10) << "restart -- canceling timeouts" << dendl;
+  cancel_events();
+  new_value.clear();
+
+  if (is_writing() || is_writing_previous()) {
+    dout(10) << __func__ << " flushing" << dendl;
+    // mon.lock is dropped around the flush and re-taken afterwards
+    // NOTE(review): presumably so the commit completion, which takes
+    // mon.lock, can run while we wait -- confirm.
+    mon.lock.unlock();
+    mon.store->flush();
+    mon.lock.lock();
+    dout(10) << __func__ << " flushed" << dendl;
+  }
+  state = STATE_RECOVERING;
+
+  // discard pending transaction
+  pending_proposal.reset();
+
+  reset_pending_committing_finishers();
+  finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN);
+
+  logger->inc(l_paxos_restart);
+}
+
+/*
+ * Move all pending finishers onto the end of committing_finishers and
+ * complete the combined list with -EAGAIN.
+ */
+void Paxos::reset_pending_committing_finishers()
+{
+  // splice keeps committing finishers ahead of the pending ones
+  committing_finishers.splice(committing_finishers.end(), pending_finishers);
+  finish_contexts(g_ceph_context, committing_finishers, -EAGAIN);
+}
+
+/*
+ * Entry point for incoming Paxos messages.
+ *
+ * Aborts on anything that is not MSG_MON_PAXOS, drops the message when we
+ * are neither leader nor peon, asserts that a peon only hears from the
+ * leader, and then routes the message to the handler matching its op.
+ */
+void Paxos::dispatch(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_paxos());
+  op->mark_paxos_event("dispatch");
+
+  if (op->get_req()->get_type() != MSG_MON_PAXOS) {
+    dout(0) << "Got unexpected message type " << op->get_req()->get_type()
+	    << " in Paxos::dispatch, aborting!" << dendl;
+    ceph_abort();
+  }
+  
+  auto *req = op->get_req<MMonPaxos>();
+
+  // election in progress?
+  if (!mon.is_leader() && !mon.is_peon()) {
+    dout(5) << "election in progress, dropping " << *req << dendl;
+    return;    
+  }
+
+  // check sanity
+  ceph_assert(mon.is_leader() || 
+	      (mon.is_peon() && req->get_source().num() == mon.get_leader()));  
+
+  // NOTE: these ops are defined in messages/MMonPaxos.h
+  switch (req->op) {
+    // learner
+  case MMonPaxos::OP_COLLECT:
+    handle_collect(op);
+    break;
+  case MMonPaxos::OP_LAST:
+    handle_last(op);
+    break;
+  case MMonPaxos::OP_BEGIN:
+    handle_begin(op);
+    break;
+  case MMonPaxos::OP_ACCEPT:
+    handle_accept(op);
+    break;		
+  case MMonPaxos::OP_COMMIT:
+    handle_commit(op);
+    break;
+  case MMonPaxos::OP_LEASE:
+    handle_lease(op);
+    break;
+  case MMonPaxos::OP_LEASE_ACK:
+    handle_lease_ack(op);
+    break;
+  default:
+    // an op outside the list above is treated as fatal
+    ceph_abort();
+  }
+}
+
+
+// -----------------
+// service interface
+
+// -- READ --
+
+/*
+ * Can version `v` be read right now?
+ *
+ * True only when v is not beyond last_committed, we are leader or peon, the
+ * state is active/updating/writing, at least one version has been committed
+ * and the lease is valid.  The outcome is logged at level 5.
+ */
+bool Paxos::is_readable(version_t v)
+{
+  bool ret = false;
+  if (!(v > last_committed)) {
+    ret =
+      (mon.is_peon() || mon.is_leader()) &&
+      (is_active() || is_updating() || is_writing()) &&
+      last_committed > 0 && is_lease_valid(); // must have a value alone, or have lease
+  }
+  dout(5) << __func__ << " = " << (int)ret
+	  << " - now=" << ceph_clock_now()
+	  << " lease_expire=" << lease_expire
+	  << " has v" << v << " lc " << last_committed
+	  << dendl;
+  return ret;
+}
+
+/*
+ * Fetch the stored value for version `v` into `bl`.
+ *
+ * @return the truthiness of the store's get() result.
+ *
+ * NOTE(review): if get() follows the 0-on-success convention, this reports
+ * success as false -- confirm against MonitorDBStore::get().
+ */
+bool Paxos::read(version_t v, bufferlist &bl)
+{
+  const bool falsy = !get_store()->get(get_name(), v, bl);
+  return !falsy;
+}
+
+/*
+ * Read the value stored for last_committed into `bl`.
+ *
+ * @return last_committed when read() reports true, 0 otherwise.
+ */
+version_t Paxos::read_current(bufferlist &bl)
+{
+  return read(last_committed, bl) ? last_committed : 0;
+}
+
+
+/*
+ * A lease is considered valid when we are the only quorum member, or when
+ * the current real time is still before lease_expire.
+ */
+bool Paxos::is_lease_valid()
+{
+  if (mon.get_quorum().size() == 1) {
+    return true;
+  }
+  return ceph::real_clock::now() < lease_expire;
+}
+
+// -- WRITE --
+
+/*
+ * Writes are allowed only on an active leader holding a valid lease.
+ */
+bool Paxos::is_writeable()
+{
+  if (!mon.is_leader())
+    return false;
+  if (!is_active())
+    return false;
+  return is_lease_valid();
+}
+
+/*
+ * Propose the pending transaction as the next value.
+ *
+ * Requires STATE_ACTIVE and a non-empty pending_proposal.  Cancels timers,
+ * encodes the pending transaction, drops it, swaps the pending finishers
+ * into committing_finishers, moves to STATE_UPDATING and calls begin() with
+ * the encoded bytes.
+ */
+void Paxos::propose_pending()
+{
+  ceph_assert(is_active());
+  ceph_assert(pending_proposal);
+
+  cancel_events();
+
+  bufferlist bl;
+  pending_proposal->encode(bl);
+
+  dout(10) << __func__ << " " << (last_committed + 1)
+	   << " " << bl.length() << " bytes" << dendl;
+  // the dout(30) statement stays open across the formatter calls below and
+  // is only terminated by the final dendl.
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  pending_proposal->dump(&f);
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  // the encoded copy in bl is what gets proposed from here on
+  pending_proposal.reset();
+
+  committing_finishers.swap(pending_finishers);
+  state = STATE_UPDATING;
+  begin(bl);
+}
+
+/*
+ * Append a (non-null) completion context to pending_finishers.
+ */
+void Paxos::queue_pending_finisher(Context *onfinished)
+{
+  dout(5) << __func__ << " " << onfinished << dendl;
+  ceph_assert(onfinished);
+  pending_finishers.emplace_back(onfinished);
+}
+
+/*
+ * Leader only: return the transaction that will be proposed next, creating
+ * an empty one on first use.  When a new transaction is created there must
+ * be no pending finishers yet.
+ */
+MonitorDBStore::TransactionRef Paxos::get_pending_transaction()
+{
+  ceph_assert(mon.is_leader());
+  if (pending_proposal) {
+    return pending_proposal;
+  }
+  pending_proposal.reset(new MonitorDBStore::Transaction);
+  ceph_assert(pending_finishers.empty());
+  return pending_proposal;
+}
+
+/*
+ * Propose the pending transaction now if possible.
+ *
+ * @return true if propose_pending() was called; false when proposals are
+ *         plugged or Paxos is not active.
+ */
+bool Paxos::trigger_propose()
+{
+  if (plugged) {
+    dout(10) << __func__ << " plugged, not proposing now" << dendl;
+    return false;
+  }
+  if (!is_active()) {
+    dout(10) << __func__ << " not active, will propose later" << dendl;
+    return false;
+  }
+  dout(10) << __func__ << " active, proposing now" << dendl;
+  propose_pending();
+  return true;
+}
+
+/*
+ * The version range is consistent when first_committed does not exceed
+ * last_committed.
+ */
+bool Paxos::is_consistent()
+{
+  return !(first_committed > last_committed);
+}
+
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
new file mode 100644
index 000000000..c197f26f7
--- /dev/null
+++ b/src/mon/Paxos.h
@@ -0,0 +1,1384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+/*
+time---->
+
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca????????????????????????????????????????
+cccccccccccccccccca???????????????????????????????????????? leader
+cccccccccccccccccc????????????????????????????????????????? 
+ccccc?????????????????????????????????????????????????????? 
+
+last_committed
+
+pn_from
+pn
+
+a 12v 
+b 12v
+c 14v
+d
+e 12v
+*/
+
+/**
+ * Paxos storage layout and behavior
+ *
+ * Currently, we use a key/value store to hold all the Paxos-related data, but
+ * it can logically be depicted as this:
+ *
+ *  paxos:
+ *    first_committed -> 1
+ *     last_committed -> 4
+ *		    1 -> value_1
+ *		    2 -> value_2
+ *		    3 -> value_3
+ *		    4 -> value_4
+ *
+ * Since we are relying on a k/v store supporting atomic transactions, we can
+ * guarantee that if 'last_committed' has a value of '4', then we have up to
+ * version 4 on the store, and no more than that; the same applies to
+ * 'first_committed': a value of '1' strictly means that our lowest
+ * version is 1.
+ *
+ * Each version's value (value_1, value_2, ..., value_n) is a blob of data,
+ * incomprehensible to the Paxos. These values are proposed to the Paxos on
+ * propose_new_value() and each one is a transaction encoded in a ceph::buffer::list.
+ *
+ * The Paxos will write the value to disk, associating it with its version,
+ * but will take a step further: the value shall be decoded, and the operations
+ * on that transaction shall be applied during the same transaction that will
+ * write the value's encoded ceph::buffer::list to disk. This behavior ensures that
+ * whatever is being proposed will only be available on the store when it is
+ * applied by Paxos, which will then be aware of such new values, guaranteeing
+ * the store state is always consistent without requiring shady workarounds.
+ *
+ * So, let's say that FooMonitor proposes the following transaction, neatly
+ * encoded on a ceph::buffer::list of course:
+ *
+ *  Tx_Foo
+ *    put(foo, last_committed, 3)
+ *    put(foo, 3, foo_value_3)
+ *    erase(foo, 2)
+ *    erase(foo, 1)
+ *    put(foo, first_committed, 3)
+ *
+ * Knowing that Tx_Foo is proposed to the Paxos as a ceph::buffer::list, once
+ * it is ready to commit, and assuming we are now committing version 5 of the
+ * Paxos, we will do something along the lines of:
+ *
+ *  Tx proposed_tx;
+ *  proposed_tx.decode(Tx_foo_ceph::buffer::list);
+ *
+ *  Tx our_tx;
+ *  our_tx.put(paxos, last_committed, 5);
+ *  our_tx.put(paxos, 5, Tx_foo_ceph::buffer::list);
+ *  our_tx.append(proposed_tx);
+ *
+ *  store_apply(our_tx);
+ *
+ * And the store should look like this after we apply 'our_tx':
+ *
+ *  paxos:
+ *    first_committed -> 1
+ *     last_committed -> 5
+ *		    1 -> value_1
+ *		    2 -> value_2
+ *		    3 -> value_3
+ *		    4 -> value_4
+ *		    5 -> Tx_foo_ceph::buffer::list
+ *  foo:
+ *    first_committed -> 3
+ *     last_committed -> 3
+ *		    3 -> foo_value_3
+ *
+ */
+
+#ifndef CEPH_MON_PAXOS_H
+#define CEPH_MON_PAXOS_H
+
+#include "include/types.h"
+#include "mon_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "include/Context.h"
+#include "common/perf_counters.h"
+#include <errno.h>
+
+#include "MonitorDBStore.h"
+#include "mon/MonOpRequest.h"
+
+class Monitor;
+class MMonPaxos;
+
+// Perf counter indices for the Paxos logger; Paxos.cc passes these to
+// logger->inc() and logger->tinc().  The enumerators are numbered
+// consecutively from l_paxos_first, so their order must not change.
+// NOTE(review): l_paxos_first / l_paxos_last presumably bracket the range
+// registered in init_logger() -- confirm.
+enum {
+  l_paxos_first = 45800,
+  l_paxos_start_leader,
+  l_paxos_start_peon,
+  l_paxos_restart,
+  l_paxos_refresh,
+  l_paxos_refresh_latency,
+  l_paxos_begin,
+  l_paxos_begin_keys,
+  l_paxos_begin_bytes,
+  l_paxos_begin_latency,
+  l_paxos_commit,
+  l_paxos_commit_keys,
+  l_paxos_commit_bytes,
+  l_paxos_commit_latency,
+  l_paxos_collect,
+  l_paxos_collect_keys,
+  l_paxos_collect_bytes,
+  l_paxos_collect_latency,
+  l_paxos_collect_uncommitted,
+  l_paxos_collect_timeout,
+  l_paxos_accept_timeout,
+  l_paxos_lease_ack_timeout,
+  l_paxos_lease_timeout,
+  l_paxos_store_state,
+  l_paxos_store_state_keys,
+  l_paxos_store_state_bytes,
+  l_paxos_store_state_latency,
+  l_paxos_share_state,
+  l_paxos_share_state_keys,
+  l_paxos_share_state_bytes,
+  l_paxos_new_pn,
+  l_paxos_new_pn_latency,
+  l_paxos_last,
+};
+
+
+// i am one state machine.
+/**
+ * This library is based on the Paxos algorithm, but varies in a few key ways:
+ *  1- Only a single new value is generated at a time, simplifying the recovery logic.
+ *  2- Nodes track "committed" values, and share them generously (and trustingly)
+ *  3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is 
+ *     safe to "read" their copy of the last committed value.
+ *
+ * This provides a simple replication substrate that services can be built on top of.
+ * See PaxosService.h
+ */
+class Paxos {
+  /**
+   * @defgroup Paxos_h_class Paxos
+   * @{
+   */
+  /**
+   * The Monitor to which this Paxos class is associated with.
+   */
+  Monitor &mon;
+
+  /// perf counter for internal instrumentations
+  PerfCounters *logger;
+
+  void init_logger();
+
+  // my state machine info
+  const std::string paxos_name;
+
+  friend class Monitor;
+  friend class PaxosService;
+
+  std::list<std::string> extra_state_dirs;
+
+  // LEADER+PEON
+
+  // -- generic state --
+public:
+  /**
+   * @defgroup Paxos_h_states States on which the leader/peon may be.
+   * @{
+   */
+  enum {
+    /**
+     * Leader/Peon is in Paxos' Recovery state
+     */
+    STATE_RECOVERING,
+    /**
+     * Leader/Peon is idle, and the Peon may or may not have a valid lease.
+     */
+    STATE_ACTIVE,
+    /**
+     * Leader/Peon is updating to a new value.
+     */
+    STATE_UPDATING,
+    /*
+     * Leader proposing an old value
+     */
+    STATE_UPDATING_PREVIOUS,
+    /*
+     * Leader/Peon is writing a new commit.  readable, but not
+     * writeable.
+     */
+    STATE_WRITING,
+    /*
+     * Leader/Peon is writing a new commit from a previous round.
+     */
+    STATE_WRITING_PREVIOUS,
+    // leader: refresh following a commit
+    STATE_REFRESH,
+    // Shutdown after WRITING or WRITING_PREVIOUS
+    STATE_SHUTDOWN
+  };
+
+  /**
+   * Obtain state name from constant value.
+   *
+   * @note This function will raise a fatal error if @p s is not
+   *	   a valid state value.
+   *
+   * @param s State value.
+   * @return The state's name.
+   */
+  static const std::string get_statename(int s) {
+    switch (s) {
+    case STATE_RECOVERING:
+      return "recovering";
+    case STATE_ACTIVE:
+      return "active";
+    case STATE_UPDATING:
+      return "updating";
+    case STATE_UPDATING_PREVIOUS:
+      return "updating-previous";
+    case STATE_WRITING:
+      return "writing";
+    case STATE_WRITING_PREVIOUS:
+      return "writing-previous";
+    case STATE_REFRESH:
+      return "refresh";
+    case STATE_SHUTDOWN:
+      return "shutdown";
+    default:
+      return "UNKNOWN";
+    }
+  }
+
+private:
+  /**
+   * The state we are in.
+   */
+  int state;
+  /**
+   * @}
+   */
+  int commits_started = 0;
+
+  ceph::condition_variable shutdown_cond;
+
+public:
+  /// @return 'true' if we are on the Recovering state; 'false' otherwise.
+  bool is_recovering() const {
+    return state == STATE_RECOVERING;
+  }
+  /// @return 'true' if we are on the Active state; 'false' otherwise.
+  bool is_active() const {
+    return state == STATE_ACTIVE;
+  }
+  /// @return 'true' if we are on the Updating state; 'false' otherwise.
+  bool is_updating() const {
+    return state == STATE_UPDATING;
+  }
+
+  /// @return 'true' if we are proposing a value from a previous quorum
+  bool is_updating_previous() const {
+    return state == STATE_UPDATING_PREVIOUS;
+  }
+
+  /// @return 'true' if we are writing an update to disk
+  bool is_writing() const {
+    return state == STATE_WRITING;
+  }
+
+  /// @return 'true' if we are writing an update-previous to disk
+  bool is_writing_previous() const {
+    return state == STATE_WRITING_PREVIOUS;
+  }
+
+  /// @return 'true' if we are refreshing an update just committed
+  bool is_refresh() const {
+    return state == STATE_REFRESH;
+  }
+  /// @return 'true' if we are in the process of shutting down
+  bool is_shutdown() const {
+    return state == STATE_SHUTDOWN;
+  }
+
+private:
+  /**
+   * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
+   * @note These variables are common to both the Leader and the Peons.
+   * @{
+   */
+  /**
+   * First committed value's version.
+   */
+  version_t first_committed;
+  /**
+   * Last Proposal Number
+   *
+   * @todo Expand description
+   */
+  version_t last_pn;
+  /**
+   * Last committed value's version.
+   *
+   * On both the Leader and the Peons, this is the last value's version that 
+   * was accepted by a given quorum and thus committed, that this instance 
+   * knows about.
+   *
+   * @note It may not be the last committed value's version throughout the
+   *	   system. If we are a Peon, we may have not been part of the quorum
+   *	   that accepted the value, and for this very same reason we may still
+   *	   be a (couple of) version(s) behind, until we learn about the most
+   *	   recent version. This should only happen if we are not active (i.e.,
+   *	   part of the quorum), which should not happen if we are up, running
+   *	   and able to communicate with others -- thus able to be part of the
+   *	   monmap and trigger new elections.
+   */
+  version_t last_committed;
+  /**
+   * Last committed value's time.
+   *
+   * When the commit finished.
+   */
+  utime_t last_commit_time;
+  /**
+   * The last Proposal Number we have accepted.
+   *
+   * On the Leader, it will be the Proposal Number picked by the Leader 
+   * itself. On the Peon, however, it will be the proposal sent by the Leader
+   * and it will only be updated if its value is higher than the one
+   * already known by the Peon.
+   */
+  version_t accepted_pn;
+  /**
+   * The last_committed epoch of the leader at the time we accepted the last pn.
+   *
+   * This has NO SEMANTIC MEANING, and is there only for the debug output.
+   */
+  version_t accepted_pn_from;
+  /**
+   * Map holding the first committed version by each quorum member.
+   *
+   * The versions kept in this map are updated during the collect phase.
+   * When the Leader starts the collect phase, each Peon will reply with its
+   * first committed version, which will then be kept in this map.
+   */
+  std::map<int,version_t> peer_first_committed;
+  /**
+   * Map holding the last committed version by each quorum member.
+   *
+   * The versions kept in this map are updated during the collect phase.
+   * When the Leader starts the collect phase, each Peon will reply with its
+   * last committed version, which will then be kept in this map.
+   */
+  std::map<int,version_t> peer_last_committed;
+  /**
+   * @}
+   */
+
+  // active (phase 2)
+  /**
+   * @defgroup Paxos_h_active_vars Common active-related member variables
+   * @{
+   */
+  /**
+   * When our read lease expires.
+   *
+   * Instead of performing a full commit each time a read is requested, we
+   * keep leases. Each lease will have an expiration date, which may or may
+   * not be extended. 
+   */
+  ceph::real_clock::time_point lease_expire;
+  /**
+   * List of callbacks waiting for our state to change into STATE_ACTIVE.
+   */
+  std::list<Context*> waiting_for_active;
+  /**
+   * List of callbacks waiting for the chance to read a version from us.
+   *
+   * Each entry on the list may result from an attempt to read a version that
+   * wasn't available at the time, or an attempt made during a period during
+   * which we could not satisfy the read request. The first case happens if
+   * the requested version is greater than our last committed version. The
+   * second scenario may happen if we are recovering, or if we don't have a
+   * valid lease.
+   *
+   * The list will be woken up once we change to STATE_ACTIVE with an extended
+   * lease -- which can be achieved if we have everyone on the quorum on board
+   * with the latest proposal, or if we don't really care about the remaining
+   * uncommitted values --, or if we're on a quorum of one.
+   */
+  std::list<Context*> waiting_for_readable;
+  /**
+   * @}
+   */
+
+  // -- leader --
+  // recovery (paxos phase 1)
+  /**
+   * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
+   * @{
+   */
+  /**
+   * Number of replies to the collect phase we've received so far.
+   *
+   * This variable is reset to 1 each time we start a collect phase; it is
+   * incremented each time we receive a reply to the collect message, and
+   * is used to determine whether or not we have received replies from the
+   * whole quorum.
+   */
+  unsigned   num_last;
+  /**
+   * Uncommitted value's version.
+   *
+   * If we have, or end up knowing about, an uncommitted value, then its
+   * version will be kept in this variable.
+   *
+   * @note If this version equals @p last_committed+1 when we reach the final
+   *	   steps of recovery, then the algorithm will assume this is a value
+   *	   the Leader does not know about, and trustingly the Leader will 
+   *	   propose this version's value.
+   */
+  version_t  uncommitted_v;
+  /**
+   * Uncommitted value's Proposal Number.
+   *
+   * We use this variable to assess if the Leader should take into consideration
+   * an uncommitted value sent by a Peon. Given that the Peon will send back to
+   * the Leader the last Proposal Number it accepted, the Leader will be able
+   * to infer if this value is more recent than the one the Leader has, thus
+   * more relevant.
+   */
+  version_t  uncommitted_pn;
+  /**
+   * Uncommitted Value.
+   *
+   * If the system fails in-between the accept replies from the Peons and the
+   * instruction to commit from the Leader, then we may end up with accepted
+   * but yet-uncommitted values. During the Leader's recovery, it will attempt
+   * to bring the whole system to the latest state, and that means committing
+   * past accepted but uncommitted values.
+   *
+   * This variable will hold an uncommitted value, which may originate either
+   * on the Leader, or learnt by the Leader from a Peon during the collect
+   * phase.
+   */
+  ceph::buffer::list uncommitted_value;
+  /**
+   * Used to specify when an on-going collect phase times out.
+   */
+  Context    *collect_timeout_event;
+  /**
+   * @}
+   */
+
+  // active
+  /**
+   * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
+   * @{
+   */
+  /**
+   * Set of participants (Leader & Peons) that have acked a lease extension.
+   *
+   * Each Peon that acknowledges a lease extension will have its place in this
+   * set, which will be used to account for all the acks from all the quorum
+   * members, guaranteeing that we trigger new elections if some don't ack in
+   * the expected timeframe.
+   */
+  std::set<int>   acked_lease;
+  /**
+   * Callback responsible for extending the lease periodically.
+   */
+  Context    *lease_renew_event;
+  /**
+   * Callback to trigger new elections once the time for acks is out.
+   */
+  Context    *lease_ack_timeout_event;
+  /**
+   * @}
+   */
+  /**
+   * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
+   * @{
+   */
+  /**
+   * Callback to trigger new elections when the Peon's lease times out.
+   *
+   * If the Peon's lease is extended, this callback will be reset (i.e.,
+   * we cancel the event and schedule a new one, starting over from the
+   * beginning).
+   */
+  Context    *lease_timeout_event;
+  /**
+   * @}
+   */
+
+  // updating (paxos phase 2)
+  /**
+   * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
+   * @{
+   */
+  /**
+   * New Value being proposed to the Peons.
+   *
+   * This ceph::buffer::list holds the value the Leader is proposing to the Peons, and
+   * that will be committed if the Peons do accept the proposal.
+   */
+  ceph::buffer::list new_value;
+  /**
+   * Set of participants (Leader & Peons) that accepted the new proposed value.
+   *
+   * This set is used to keep track of those who have accepted the proposed
+   * value, so the leader may know when to issue a commit (when a majority of
+   * participants has accepted the proposal), and when to extend the lease
+   * (when all the quorum members have accepted the proposal).
+   */
+  std::set<int>   accepted;
+  /**
+   * Callback to trigger a new election if the proposal is not accepted by the
+   * full quorum within a given timeframe.
+   *
+   * If the full quorum does not accept the proposal, then it means that the
+   * Leader may no longer be recognized as the leader, or that the quorum has
+   * changed, and the value may have not reached all the participants. Thus,
+   * the leader must call new elections, and go through a recovery phase in
+   * order to propagate the new value throughout the system.
+   *
+   * This does not mean that we won't commit. We will commit as soon as we
+   * have a majority of acceptances. But if we do not have full acceptance
+   * from the quorum, then we cannot extend the lease, as some participants
+   * may not have the latest committed value.
+   */
+  Context    *accept_timeout_event;
+
+  /**
+   * List of callbacks waiting for it to be possible to write again.
+   *
+   * @remarks It is not possible to write if we are not the Leader, or we are
+   *	      not on the active state, or if the lease has expired.
+   */
+  std::list<Context*> waiting_for_writeable;
+
+  /**
+   * Pending proposal transaction
+   *
+   * This is the transaction that is under construction and pending
+   * proposal.  We will add operations to it until we decide it is
+   * time to start a paxos round.
+   */
+  MonitorDBStore::TransactionRef pending_proposal;
+
+  /**
+   * Finishers for pending transaction
+   *
+   * These are waiting for updates in the pending proposal/transaction
+   * to be committed.
+   */
+  std::list<Context*> pending_finishers;
+
+  /**
+   * Finishers for committing transaction
+   *
+   * When the pending_proposal is submitted, pending_finishers move to
+   * this list.  When it commits, these finishers are notified.
+   */
+  std::list<Context*> committing_finishers;
+  /**
+   * This function re-triggers pending_ and committing_finishers
+   * safely, so as to maintain existing system invariants. In particular
+   * we maintain ordering by triggering committing before pending, and
+   * we clear out pending_finishers prior to any triggers so that
+   * we don't trigger asserts on them being empty. You should
+   * use it instead of sending -EAGAIN to them with finish_contexts.
+   */
+  void reset_pending_committing_finishers();
+
+  /**
+   * @defgroup Paxos_h_sync_warns Synchronization warnings
+   * @todo Describe these variables
+   * @{
+   */
+  utime_t last_clock_drift_warn;
+  int clock_drift_warned;
+  /**
+   * @}
+   */
+
+  /**
+   * Should be true if we have proposed to trim, or are in the middle of
+   * trimming; false otherwise.
+   */
+  bool trimming;
+
+  /**
+   * true if we want trigger_propose to *not* propose (yet)
+   */
+  bool plugged = false;
+
+  /**
+   * @defgroup Paxos_h_callbacks Callback classes.
+   * @{
+   */
+  /**
+   * Callback class responsible for handling a Collect Timeout.
+   */
+  class C_CollectTimeout;
+  /**
+   * Callback class responsible for handling an Accept Timeout.
+   */
+  class C_AcceptTimeout;
+  /**
+   * Callback class responsible for handling a Lease Ack Timeout.
+   */
+  class C_LeaseAckTimeout;
+
+  /**
+   * Callback class responsible for handling a Lease Timeout.
+   */
+  class C_LeaseTimeout;
+
+  /**
+   * Callback class responsible for handling a Lease Renew Timeout.
+   */
+  class C_LeaseRenew;
+
+  class C_Trimmed;
+  /**
+   *
+   */
+public:
+  class C_Proposal : public Context {
+    Context *proposer_context;  ///< completed (at most once) by finish()
+  public:
+    ceph::buffer::list bl;      ///< copy of the list handed to the constructor
+    // for debug purposes. Will go away. Soon.
+    bool proposed;
+    utime_t proposal_time;      ///< ceph_clock_now() at construction
+
+    /// Wrap @p c and keep a copy of @p proposal_bl.
+    C_Proposal(Context *c, ceph::buffer::list& proposal_bl)
+      : proposer_context(c), bl(proposal_bl),
+	proposed(false), proposal_time(ceph_clock_now()) {}
+
+    /// Forward @p r to the wrapped context (if any), then drop the pointer.
+    void finish(int r) override {
+      if (!proposer_context) {
+	return;
+      }
+      proposer_context->complete(r);
+      proposer_context = nullptr;
+    }
+  };
+  /**
+   * @}
+   */
+private:
+  /**
+   * @defgroup Paxos_h_election_triggered Steps triggered by an election.
+   *
+   * @note All these functions play a significant role in the Recovery Phase,
+   *	   which is triggered right after an election once someone becomes
+   *	   the Leader.
+   * @{
+   */
+  /**
+   * Create a new Proposal Number and propose it to the Peons.
+   *
+   * This function starts the Recovery Phase, which can be directly mapped
+   * onto the original Paxos' Prepare phase. Basically, we'll generate a
+   * Proposal Number, taking @p oldpn into consideration, and we will send
+   * it to a quorum, along with our first and last committed versions. By
+   * sending this information in a message to the quorum, we expect to
+   * obtain acceptances from a majority, allowing us to commit, or be
+   * informed of a higher Proposal Number known by one or more of the Peons
+   * in the quorum.
+   *
+   * @pre We are the Leader.
+   * @post Recovery Phase initiated by sending messages to the quorum.
+   *
+   * @param oldpn A proposal number taken as the highest known so far, that
+   *		  should be taken into consideration when generating a new 
+   *		  Proposal Number for the Recovery Phase.
+   */
+  void collect(version_t oldpn);
+  /**
+   * Handle the reception of a collect message from the Leader and reply
+   * accordingly.
+   *
+   * Once a Peon receives a collect message from the Leader it will reply
+   * with its first and last committed versions, as well as information so
+   * the Leader may know if its Proposal Number was, or was not, accepted by
+   * the Peon. The Peon will accept the Leader's Proposal Number if it is
+   * higher than the Peon's currently accepted Proposal Number. The Peon may
+   * also inform the Leader of accepted but uncommitted values.
+   *
+   * @invariant The message is an operation of type OP_COLLECT.
+   * @pre We are a Peon.
+   * @post Replied to the Leader, accepting or not accepting its PN.
+   *
+   * @param collect The collect message sent by the Leader to the Peon.
+   */
+  void handle_collect(MonOpRequestRef op);
+  /**
+   * Handle a response from a Peon to the Leader's collect phase.
+   *
+   * The received message will state the Peon's last committed version, as 
+   * well as its last proposal number. This will lead to one of the following
+   * scenarios: if the replied Proposal Number is equal to the one we proposed,
+   * then the Peon has accepted our proposal, and if all the Peons do accept
+   * our Proposal Number, then we are allowed to proceed with the commit;
+   * however, if a Peon replies with a higher Proposal Number, we assume he
+   * knows something we don't and the Leader will have to abort the current
+   * proposal in order to retry with the Proposal Number specified by the Peon.
+   * It may also occur that the Peon replied with a lower Proposal Number, in
+   * which case we assume it is a reply to an older value and we'll simply
+   * drop it.
+   * This function will also check if the Peon replied with an accepted but
+   * yet uncommitted value. In this case, if its version is higher than our
+   * last committed value by one, we assume that the Peon knows a value from a
+   * previous proposal that has never been committed, and we should try to
+   * commit that value by proposing it next. On the other hand, if that is
+   * not the case, we'll assume it is an old, uncommitted value, we do not
+   * care about and we'll consider the system active by extending the leases.
+   *
+   * @invariant The message is an operation of type OP_LAST.
+   * @pre We are the Leader.
+   * @post We initiate a commit, or we retry with a higher Proposal Number, 
+   *	   or we drop the message.
+   * @post We move from STATE_RECOVERING to STATE_ACTIVE.
+   *
+   * @param last The message sent by the Peon to the Leader.
+   */
+  void handle_last(MonOpRequestRef op);
+  /**
+   * The Recovery Phase timed out, meaning that a significant part of the
+   * quorum does not believe we are the Leader, and we thus should trigger new
+   * elections.
+   *
+   * @pre We believe to be the Leader.
+   * @post Trigger new elections.
+   */
+  void collect_timeout();
+  /**
+   * @}
+   */
+
+  /**
+   * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
+   * 
+   * These functions may easily be mapped to the original Paxos Algorithm's 
+   * phases. 
+   *
+   * Taking into account the algorithm can be divided in 4 phases (Prepare,
+   * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
+   * both the Prepare and Accept Request phases; the Paxos::handle_begin to
+   * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
+   * @{
+   */
+  /**
+   * Start a new proposal with the intent of committing @p value.
+   *
+   * If we are alone on the system (i.e., a quorum of one), then we will
+   * simply commit the value, but if we are not alone, then we need to propose
+   * the value to the quorum.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_ACTIVE
+   * @post We commit, if we are alone, or we send a message to each quorum 
+   *	   member
+   * @post We are on STATE_ACTIVE, if we are alone, or on 
+   *	   STATE_UPDATING otherwise
+   *
+   * @param value The value being proposed to the quorum
+   */
+  void begin(ceph::buffer::list& value);
+  /**
+   * Accept or decline (by ignoring) a proposal from the Leader.
+   *
+   * We will decline the proposal (by ignoring it) if we have promised to
+   * accept a higher numbered proposal. If that is not the case, we will
+   * accept it and accordingly reply to the Leader.
+   *
+   * @pre We are a Peon
+   * @pre We are on STATE_ACTIVE
+   * @post We are on STATE_UPDATING if we accept the Leader's proposal
+   * @post We send a reply message to the Leader if we accept its proposal
+   *
+   * @invariant The received message is an operation of type OP_BEGIN
+   *
+   * @param begin The message sent by the Leader to the Peon during the
+   *		  Paxos::begin function
+   *
+   */
+  void handle_begin(MonOpRequestRef op);
+  /**
+   * Handle an Accept message sent by a Peon.
+   *
+   * In order to commit, the Leader has to receive accepts from a majority of
+   * the quorum. If that does happen, then the Leader may proceed with the
+   * commit. However, the Leader needs the accepts from all the quorum members
+   * in order to extend the lease and move on to STATE_ACTIVE.
+   *
+   * This function handles these two situations, accounting for the amount of
+   * received accepts.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_UPDATING
+   * @post We are on STATE_ACTIVE if we received accepts from the full quorum
+   * @post We extended the lease if we moved on to STATE_ACTIVE
+   * @post We are on STATE_UPDATING if we didn't receive accepts from the
+   *	   full quorum
+   * @post We have committed if we received accepts from a majority
+   *
+   * @invariant The received message is an operation of type OP_ACCEPT
+   *
+   * @param accept The message sent by the Peons to the Leader during the
+   *		   Paxos::handle_begin function
+   */
+  void handle_accept(MonOpRequestRef op);
+  /**
+   * Trigger a fresh election.
+   *
+   * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
+   * order to limit the amount of time we spend waiting for Accept replies.
+   * This callback will call Paxos::accept_timeout when it is fired.
+   *
+   * This is essential to the algorithm because there may be the chance that
+   * we are no longer the Leader (i.e., others don't believe in us) and we
+   * are getting ignored, or we dropped out of the quorum and haven't realised
+   * it. So, our only option is to trigger fresh elections.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_UPDATING
+   * @post Triggered fresh elections
+   */
+  void accept_timeout();
+  /**
+   * @}
+   */
+
+
+  utime_t commit_start_stamp;
+  friend struct C_Committed;
+
+  /**
+   * Commit a value throughout the system.
+   *
+   * The Leader will cancel the current lease (as it was for the old value),
+   * and will store the committed value locally. It will then instruct every
+   * quorum member to do so as well.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_UPDATING
+   * @pre A majority of quorum members accepted our proposal
+   * @post Value locally stored
+   * @post Quorum members instructed to commit the new value.
+   */
+  void commit_start();
+  void commit_finish();   ///< finish a commit after txn becomes durable
+  void abort_commit();    ///< Handle commit finish after shutdown started
+  /**
+   * Commit the new value to stable storage as being the latest available
+   * version.
+   *
+   * @pre We are a Peon
+   * @post The new value is locally stored
+   * @post Fire up the callbacks waiting on waiting_for_commit
+   *
+   * @invariant The received message is an operation of type OP_COMMIT
+   *
+   * @param commit The message sent by the Leader to the Peon during
+   *		   Paxos::commit
+   */
+  void handle_commit(MonOpRequestRef op);
+  /**
+   * Extend the system's lease.
+   *
+   * This means that the Leader considers that it should now be safe to read from
+   * any node on the system, since every quorum member is now in possession of
+   * the latest version. Therefore, the Leader will send a message stating just
+   * this to each quorum member, and will impose a limited timeframe during
+   * which acks will be accepted. If there aren't as many acks as expected
+   * (i.e, if at least one quorum member does not ack the lease) during this
+   * timeframe, then we will force fresh elections.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_ACTIVE
+   * @post A message extending the lease is sent to each quorum member
+   * @post A timeout callback is set to limit the amount of time we will wait
+   *	   for lease acks.
+   * @post A timer is set in order to renew the lease after a certain amount
+   *	   of time.
+   */
+  void extend_lease();
+  /**
+   * Update the lease on the Peon's side of things.
+   *
+   * Once a Peon receives a Lease message, it will update its lease_expire
+   * variable, reply to the Leader acknowledging the lease update and set a
+   * timeout callback to be fired upon the lease's expiration. Finally, the
+   * Peon will fire up all the callbacks waiting for it to become active,
+   * which it just did, and all those waiting for it to become readable,
+   * which should be true if the Peon's lease didn't expire in the mean time.
+   *
+   * @pre We are a Peon
+   * @post We update the lease accordingly
+   * @post A lease timeout callback is set
+   * @post Move to STATE_ACTIVE
+   * @post Fire up all the callbacks waiting for STATE_ACTIVE
+   * @post Fire up all the callbacks waiting for readable if we are readable
+   * @post Ack the lease to the Leader
+   *
+   * @invariant The received message is an operation of type OP_LEASE
+   *
+   * @param lease The message sent by the Leader to the Peon during the
+   *	    Paxos::extend_lease function
+   */
+  void handle_lease(MonOpRequestRef op);
+  /**
+   * Account for all the Lease Acks the Leader receives from the Peons.
+   *
+   * Once the Leader receives all the Lease Acks from the Peons, it will be
+   * able to cancel the Lease Ack timeout callback, thus avoiding calling
+   * fresh elections.
+   *
+   * @pre We are the Leader
+   * @post Cancel the Lease Ack timeout callback if we receive acks from all
+   *	   the quorum members
+   *
+   * @invariant The received message is an operation of type OP_LEASE_ACK
+   *
+   * @param ack The message sent by a Peon to the Leader during the
+   *		Paxos::handle_lease function
+   */
+  void handle_lease_ack(MonOpRequestRef op);
+  /**
+   * Call fresh elections because at least one Peon didn't ack our lease.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_ACTIVE
+   * @post Trigger fresh elections
+   */
+  void lease_ack_timeout();
+  /**
+   * Extend lease since we haven't had new committed values meanwhile.
+   *
+   * @pre We are the Leader
+   * @pre We are on STATE_ACTIVE
+   * @post Go through with Paxos::extend_lease
+   */
+  void lease_renew_timeout();
+  /**
+   * Call fresh elections because the Peon's lease expired without being
+   * renewed or receiving a fresh lease.
+   *
+   * This means that the Peon is no longer assumed as being in the quorum
+   * (or there is no Leader to speak of), so just trigger fresh elections
+   * to circumvent this issue.
+   *
+   * @pre We are a Peon
+   * @post Trigger fresh elections
+   */
+  void lease_timeout();        // on peon, if lease isn't extended
+
+  /// restart the lease timeout timer
+  void reset_lease_timeout();
+
+  /**
+   * Cancel all of Paxos' timeout/renew events. 
+   */
+  void cancel_events();
+  /**
+   * Shutdown this Paxos machine
+   */
+  void shutdown();
+
+  /**
+   * Generate a new Proposal Number based on @p gt
+   *
+   * @todo Check what @p gt actually means and what its usage entails
+   * @param gt A hint for the generation of the Proposal Number
+   * @return A globally unique, monotonically increasing Proposal Number
+   */
+  version_t get_new_proposal_number(version_t gt=0);
+ 
+  /**
+   * @todo document sync function
+   */
+  void warn_on_future_time(utime_t t, entity_name_t from);
+
+  /**
+   * Begin proposing the pending_proposal.
+   */
+  void propose_pending();
+
+  /**
+   * refresh state from store
+   *
+   * Called when we have new state for the mon to consume.  If we return false,
+   * abort (we triggered a bootstrap).
+   *
+   * @returns true on success, false if we are now bootstrapping
+   */
+  bool do_refresh();
+
+  void commit_proposal();
+  void finish_round();
+
+public:
+  /**
+   * @param m A monitor
+   * @param name A name for the paxos service; it also serves as the naming
+   *             space of the underlying persistent storage for this service.
+   */
+  Paxos(Monitor &m, const std::string &name) :
+    mon(m),
+    logger(nullptr),
+    paxos_name(name),
+    state(STATE_RECOVERING),  // every instance starts out recovering
+    first_committed(0),
+    last_pn(0),
+    last_committed(0),
+    accepted_pn(0),
+    accepted_pn_from(0),
+    num_last(0),
+    uncommitted_v(0), uncommitted_pn(0),
+    collect_timeout_event(nullptr),
+    lease_renew_event(nullptr),
+    lease_ack_timeout_event(nullptr),
+    lease_timeout_event(nullptr),
+    accept_timeout_event(nullptr),
+    clock_drift_warned(0),
+    trimming(false) {}
+
+  // Destroying the instance also destroys its logger; the pointer starts
+  // out null in the constructor, in which case the delete is a no-op.
+  ~Paxos() { delete logger; }
+
+  /// @return a copy of the name given at construction time
+  ///         (returned by value, not by reference).
+  const std::string get_name() const { return paxos_name; }
+
+  void dispatch(MonOpRequestRef op);
+
+  void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
+				     version_t from, version_t last);
+
+  void init();
+
+  /**
+   * dump state info to a formatter
+   */
+  void dump_info(ceph::Formatter *f);
+
+  /**
+   * This function runs basic consistency checks. Importantly, if
+   * it is inconsistent and shouldn't be, it asserts out.
+   *
+   * @return True if consistent, false if not.
+   */
+  bool is_consistent();
+
+  void restart();
+  /**
+   * Initiate the Leader after it wins an election.
+   *
+   * Once an election is won, the Leader will be initiated and there are two
+   * possible outcomes of this method: the Leader directly jumps to the active
+   * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
+   * will start recovering (STATE_RECOVERING) by initiating the collect phase. 
+   *
+   * @pre Our monitor is the Leader.
+   * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
+   *	   or on STATE_RECOVERING otherwise.
+   */
+  void leader_init();
+  /**
+   * Initiate a Peon after it loses an election.
+   *
+   * If we are a Peon, then there must be a Leader and we are not alone in the
+   * quorum, thus automatically assume we are on STATE_RECOVERING, which means
+   * we will soon be enrolled into the Leader's collect phase.
+   *
+   * @pre There is a Leader, and it's about to start the collect phase.
+   * @post We are on STATE_RECOVERING and will soon receive collect phase's 
+   *	   messages.
+   */
+  void peon_init();
+
+  /**
+   * Include an incremental state of values, ranging from peer_first_committed
+   * to the last committed value, on the message m
+   *
+   * @param m A message
+   * @param peer_first_committed Lowest version to take into account
+   * @param peer_last_committed Highest version to take into account
+   */
+  void share_state(MMonPaxos *m, version_t peer_first_committed,
+		   version_t peer_last_committed);
+  /**
+   * Store on disk a state that was shared with us
+   *
+   * Basically, we received a set of versions. Or just one. It doesn't matter.
+   * What matters is that we have to stash it in the store. So, we will simply
+   * write every single ceph::buffer::list into their own versions on our side (i.e.,
+   * onto paxos-related keys), and then we will decode those same ceph::buffer::lists
+   * we just wrote and apply the transactions they hold. We will also update
+   * our first and last committed values to point to the new values, if need
+   * be. All this is done tightly wrapped in a transaction to ensure we
+   * enjoy the atomicity guarantees given by our awesome k/v store.
+   *
+   * @param m A message
+   * @returns true if we stored something new; false otherwise
+   */
+  bool store_state(MMonPaxos *m);
+  void _sanity_check_store();
+
+  /**
+   * Decode the transaction encoded in a ceph::buffer::list and append it to
+   * another transaction.
+   *
+   * Used during the Leader's commit and by Paxos::store_state in order to
+   * apply the ceph::buffer::list's transaction onto the store.
+   *
+   * @param t The transaction that receives the decoded operations
+   * @param bl A ceph::buffer::list containing an encoded transaction;
+   *	       it is decoded starting from its beginning
+   */
+  static void decode_append_transaction(MonitorDBStore::TransactionRef t,
+					ceph::buffer::list& bl) {
+    auto decoded = std::make_shared<MonitorDBStore::Transaction>();
+    auto cursor = bl.cbegin();
+    decoded->decode(cursor);
+    t->append(decoded);
+  }
+
+  /**
+   * Append @p s to the list of extra state dirs.
+   *
+   * @todo This appears to be used only by the OSDMonitor, and I would say
+   *	   its objective is to allow a third-party to have a "private"
+   *	   state dir. -JL
+   */
+  void add_extra_state_dir(std::string s) { extra_state_dirs.push_back(s); }
+
+  // -- service interface --
+  /**
+   * Add c to the list of callbacks waiting for us to become active.
+   *
+   * @param c A callback
+   */
+  void wait_for_active(MonOpRequestRef op, Context *c) {
+    if (op)  // op may be empty (the Context-only overload passes none)
+      op->mark_event("paxos:wait_for_active");
+    waiting_for_active.push_back(c);  // the raw pointer is queued as-is
+  }
+  void wait_for_active(Context *c) {
+    // no op to annotate: hand over a default-constructed MonOpRequestRef
+    wait_for_active(MonOpRequestRef(), c);
+  }
+
+  /**
+   * Trim the Paxos state as much as we can.
+   */
+  void trim();
+
+  /**
+   * Check if we should trim.
+   *
+   * If trimming is disabled, we must take that into consideration and only
+   * return true if we are positively sure that we should trim soon.
+   *
+   * @returns true if we should trim; false otherwise.
+   */
+  bool should_trim() {
+    const int held = get_version() - get_first_committed();
+    const int ceiling = g_conf()->paxos_min + g_conf()->paxos_trim_min;
+
+    // never while the trimming flag is set; otherwise only once
+    // we hold more versions than paxos_min + paxos_trim_min
+    const bool over_ceiling = held > ceiling;
+    return !trimming && over_ceiling;
+  }
+
+  bool is_plugged() const {
+    return plugged;
+  }
+  void plug() {
+    ceph_assert(plugged == false);  // plugging while already plugged trips the assert
+    plugged = true;
+  }
+  void unplug() {
+    ceph_assert(plugged == true);  // only valid after a matching plug()
+    plugged = false;
+  }
+
+  // read
+  /**
+   * @defgroup Paxos_h_read_funcs Read-related functions
+   * @{
+   */
+  /**
+   * Get latest committed version
+   *
+   * @return latest committed version
+   */
+  version_t get_version() { return last_committed; }
+  /**
+   * Get first committed version
+   *
+   * @return the first committed version
+   */
+  version_t get_first_committed() { return first_committed; }
+  /**
+   * Check if a given version is readable.
+   *
+   * A version may not be readable for a myriad of reasons:
+   *  @li the version @e v is higher than the last committed version
+   *  @li we are not the Leader nor a Peon (election may be on-going)
+   *  @li we do not have a committed value yet
+   *  @li we do not have a valid lease
+   *
+   * @param seen The version we want to check if it is readable.
+   * @return 'true' if the version is readable; 'false' otherwise.
+   */
+  bool is_readable(version_t seen=0);
+  /**
+   * Read version @e v and store its value in @e bl
+   *
+   * @param[in] v The version we want to read
+   * @param[out] bl The version's value
+   * @return 'true' if we successfully read the value; 'false' otherwise
+   */
+  bool read(version_t v, ceph::buffer::list &bl);
+  /**
+   * Read the latest committed version
+   *
+   * @param[out] bl The version's value
+   * @return the latest committed version if we successfully read the value;
+   *	     or 0 (zero) otherwise.
+   */
+  version_t read_current(ceph::buffer::list &bl);
+  /**
+   * Add onreadable to the list of callbacks waiting for us to become readable.
+   *
+   * @param onreadable A callback
+   */
+  void wait_for_readable(MonOpRequestRef op, Context *onreadable) {
+    ceph_assert(!is_readable());  // waiting only makes sense while not readable
+    if (op)  // op may be empty (the Context-only overload passes none)
+      op->mark_event("paxos:wait_for_readable");
+    waiting_for_readable.push_back(onreadable);
+  }
+  void wait_for_readable(Context *onreadable) {
+    // no op to annotate: hand over a default-constructed MonOpRequestRef
+    wait_for_readable(MonOpRequestRef(), onreadable);
+  }
+  /**
+   * @}
+   */
+
+  /**
+   * Check if we have a valid lease.
+   *
+   * @returns true if the lease is still valid; false otherwise.
+   */
+  bool is_lease_valid();
+  // write
+  /**
+   * @defgroup Paxos_h_write_funcs Write-related functions
+   * @{
+   */
+  /**
+   * Check if we are writeable.
+   *
+   * We are writeable if we are alone (i.e., a quorum of one), or if we match
+   * all the following conditions:
+   *  @li We are the Leader
+   *  @li We are on STATE_ACTIVE
+   *  @li We have a valid lease
+   *
+   * @return 'true' if we are writeable; 'false' otherwise.
+   */
+  bool is_writeable();
+  /**
+   * Add c to the list of callbacks waiting for us to become writeable.
+   *
+   * @param c A callback
+   */
+  void wait_for_writeable(MonOpRequestRef op, Context *c) {
+    ceph_assert(!is_writeable());  // waiting only makes sense while not writeable
+    if (op)  // op may be empty (the Context-only overload passes none)
+      op->mark_event("paxos:wait_for_writeable");
+    waiting_for_writeable.push_back(c);
+  }
+  void wait_for_writeable(Context *c) {
+    // no op to annotate: hand over a default-constructed MonOpRequestRef
+    wait_for_writeable(MonOpRequestRef(), c);
+  }
+
+  /**
+   * Get a transaction to submit operations to propose against
+   *
+   * Apply operations to this transaction.  It will eventually be proposed
+   * to paxos.
+   */
+  MonitorDBStore::TransactionRef get_pending_transaction();
+
+  /**
+   * Queue a completion for the pending proposal
+   *
+   * This completion will get triggered when the pending proposal
+   * transaction commits.
+   */
+  void queue_pending_finisher(Context *onfinished);
+
+  /**
+   * (try to) trigger a proposal
+   *
+   * Tell paxos that it should submit the pending proposal.  Note that if it
+   * is not active (e.g., because it is already in the midst of committing
+   * something) that will be deferred (e.g., until the current round finishes).
+   */
+  bool trigger_propose();
+  /**
+   * @}
+   */
+
+  /**
+   * @}
+   */
+ protected:
+  MonitorDBStore *get_store();
+};
+
+inline std::ostream& operator<<(std::ostream& out, Paxos::C_Proposal& p)
+{
+  // status and queue time first, then a JSON dump of the decoded transaction
+  out << " " << (p.proposed ? "proposed" : "unproposed")
+      << " queued " << (ceph_clock_now() - p.proposal_time)
+      << " tx dump:\n";
+  auto tx = std::make_shared<MonitorDBStore::Transaction>();
+  auto cursor = p.bl.cbegin();
+  tx->decode(cursor);
+  ceph::JSONFormatter fmt(true);
+  tx->dump(&fmt);
+  fmt.flush(out);
+  return out;
+}
+
+#endif
diff --git a/src/mon/PaxosFSMap.h b/src/mon/PaxosFSMap.h
new file mode 100644
index 000000000..e32c44e0b
--- /dev/null
+++ b/src/mon/PaxosFSMap.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAXOS_FSMAP_H
+#define CEPH_PAXOS_FSMAP_H
+
+#include "mds/FSMap.h"
+#include "mds/MDSMap.h"
+
+#include "include/ceph_assert.h"
+
+class PaxosFSMap {
+public:
+  virtual ~PaxosFSMap() = default;
+  // Read-only view of the pending (next-epoch) map; asserts is_leader().
+  const FSMap &get_pending_fsmap() const { ceph_assert(is_leader()); return pending_fsmap; }
+  const FSMap &get_fsmap() const { return fsmap; }
+
+  virtual bool is_leader() const = 0;
+
+protected:
+  FSMap &get_pending_fsmap_writeable() { ceph_assert(is_leader()); return pending_fsmap; }
+  // Restart the pending map from the current one, with its epoch bumped by one.
+  FSMap &create_pending() {
+    ceph_assert(is_leader());
+    pending_fsmap = fsmap;
+    ++pending_fsmap.epoch;
+    return pending_fsmap;
+  }
+  // Decode the current map from bl and reset the pending one.
+  void decode(ceph::buffer::list &bl) {
+    fsmap.decode(bl);
+    pending_fsmap = FSMap(); /* nuke it to catch invalid access */
+  }
+
+private:
+  /* Keep these PRIVATE to prevent unprotected manipulation. */
+  FSMap fsmap; /* the current epoch */
+  FSMap pending_fsmap; /* the next epoch */
+};
+
+
+#endif
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
new file mode 100644
index 000000000..0a6a9a9ea
--- /dev/null
+++ b/src/mon/PaxosService.cc
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "PaxosService.h"
+#include "common/Clock.h"
+#include "common/config.h"
+#include "include/stringify.h"
+#include "include/ceph_assert.h"
+#include "mon/MonOpRequest.h"
+
+using std::ostream;
+using std::string;
+
+using ceph::bufferlist;
+
+#define dout_subsys ceph_subsys_paxos
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, paxos, service_name, get_first_committed(), get_last_committed())
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, Paxos &paxos, string service_name,
+			version_t fc, version_t lc) {
+  std::ostream &os = *_dout;
+  os << "mon." << mon.name << "@" << mon.rank << "(" << mon.get_state_name();
+  return os << ").paxosservice(" << service_name << " " << fc << ".." << lc << ") ";
+}
+
+bool PaxosService::dispatch(MonOpRequestRef op)
+{
+  ceph_assert(op->is_type_service() || op->is_type_command());
+  auto m = op->get_req<PaxosServiceMessage>();
+  op->mark_event("psvc:dispatch");
+
+  dout(10) << __func__ << " " << m << " " << *m
+	   << " from " << m->get_orig_source_inst()
+	   << " con " << m->get_connection() << dendl;
+
+  if (mon.is_shutdown())
+    return true;
+
+  // drop messages that were forwarded during an earlier election epoch
+  const bool stale_epoch =
+    m->rx_election_epoch && m->rx_election_epoch < mon.get_epoch();
+  if (stale_epoch) {
+    dout(10) << " discarding forwarded message from previous election epoch "
+	     << m->rx_election_epoch << " < " << mon.get_epoch() << dendl;
+    return true;
+  }
+
+  // make sure the client is still connected.  connections without a
+  // messenger (the proxied case) are not dropped, and neither are
+  // loopback (e.g., log) messages arriving on mon.con_self.
+  if (m->get_connection() &&
+      !m->get_connection()->is_connected() &&
+      m->get_connection() != mon.con_self &&
+      m->get_connection()->get_messenger() != nullptr) {
+    dout(10) << " discarding message from disconnected client "
+	     << m->get_source_inst() << " " << *m << dendl;
+    return true;
+  }
+
+  // make sure our map is readable and up to date
+  if (!is_readable(m->version)) {
+    dout(10) << " waiting for paxos -> readable (v" << m->version << ")" << dendl;
+    wait_for_readable(op, new C_RetryMessage(this, op), m->version);
+    return true;
+  }
+
+  // preprocess: true means the query was already handled
+  if (preprocess_query(op))
+    return true;
+
+  // only the leader goes on to apply updates; otherwise forward
+  if (!mon.is_leader()) {
+    mon.forward_request_leader(op);
+    return true;
+  }
+
+  // not writeable yet: retry the op later through C_RetryMessage
+  if (!is_writeable()) {
+    dout(10) << " waiting for paxos -> writeable" << dendl;
+    wait_for_writeable(op, new C_RetryMessage(this, op));
+    return true;
+  }
+
+  // apply to the pending state; false means no changes were made
+  if (!prepare_update(op))
+    return true;
+
+  // honor a force_immediate_propose() request, clearing the flag
+  if (need_immediate_propose) {
+    dout(10) << __func__ << " forced immediate propose" << dendl;
+    need_immediate_propose = false;
+    propose_pending();
+    return true;
+  }
+
+  double delay = 0.0;
+  const bool want_propose = should_propose(delay);
+  if (!want_propose) {
+    dout(10) << " not proposing" << dendl;
+    return true;
+  }
+
+  if (delay == 0.0) {
+    propose_pending();
+    return true;
+  }
+
+  // delay a bit, unless a timer is already set
+  if (proposal_timer) {
+    dout(10) << " proposal_timer already set" << dendl;
+    return true;
+  }
+
+  // Timer callback: proposes the pending value once proposal_timer
+  // fires; -ECANCELED and -EAGAIN are ignored, anything else aborts.
+  auto do_propose = new C_MonContext{&mon, [this](int r) {
+      proposal_timer = nullptr;
+      if (r >= 0) {
+        propose_pending();
+        return;
+      }
+      if (r != -ECANCELED && r != -EAGAIN) {
+        ceph_abort_msg("bad return value for proposal_timer");
+      }
+  }};
+  dout(10) << " setting proposal_timer " << do_propose
+           << " with delay of " << delay << dendl;
+  proposal_timer = mon.timer.add_event_after(delay, do_propose);
+
+  return true;
+}
+
+void PaxosService::refresh(bool *need_bootstrap)
+{
+  // re-read our first/last committed versions from the store
+  const string &svc = get_service_name();
+  cached_first_committed = mon.store->get(svc, first_committed_name);
+  cached_last_committed = mon.store->get(svc, last_committed_name);
+  // a changed format_version means the store was upgraded underneath us
+  const version_t stored_format = get_value("format_version");
+  if (stored_format != format_version) {
+    dout(1) << __func__ << " upgraded, format " << format_version << " -> " << stored_format << dendl;
+    on_upgrade();
+  }
+  format_version = stored_format;
+
+  dout(10) << __func__ << dendl;
+  update_from_paxos(need_bootstrap);
+}
+
+void PaxosService::post_refresh()
+{
+  dout(10) << __func__ << dendl;
+
+  post_paxos_update();
+  // on a peon, anything still waiting on a finished proposal gets -EAGAIN
+  if (!mon.is_peon() || waiting_for_finished_proposal.empty())
+    return;
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+}
+
+bool PaxosService::should_propose(double& delay)
+{
+  // simple default policy: quick startup, then some damping.
+  if (get_last_committed() <= 1) {
+    delay = 0.0;  // at most one version committed so far: no delay
+    return true;
+  }
+  utime_t now = ceph_clock_now();
+  bool interval_elapsed = (now - paxos.last_commit_time) > g_conf()->paxos_propose_interval;
+  // past the interval: wait only paxos_min_wait; else wait out the remainder
+  delay = interval_elapsed
+    ? (double)g_conf()->paxos_min_wait
+    : (double)(g_conf()->paxos_propose_interval + paxos.last_commit_time - now);
+  return true;
+}
+
+
+void PaxosService::propose_pending()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(have_pending);
+  ceph_assert(!proposing);
+  ceph_assert(mon.is_leader());
+  ceph_assert(is_active());
+
+  if (proposal_timer != nullptr) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+
+  /**
+   * @note Our contribution to the pending Paxos transaction comes from
+   *	   encode_pending, which the implementing class must provide:
+   *	   it encodes whatever that class has pending into the
+   *	   transaction so it can be proposed as a value through Paxos.
+   *	   When should_stash_full says so, encode_full is called on
+   *	   the transaction first; a non-zero format_version is
+   *	   recorded in the same transaction.
+   */
+  MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
+
+  if (should_stash_full())
+    encode_full(t);
+
+  encode_pending(t);
+  have_pending = false;
+
+  // format_version 0 is the default and is never written out
+  if (format_version > 0)
+    t->put(get_service_name(), "format_version", format_version);
+
+  // apply to paxos
+  proposing = true;
+  /**
+   * Completion queued on the pending proposal: it clears the proposing
+   * flag and, on success, calls _active().
+   *
+   * Waiters must be woken *only* *after* the service has been told it
+   * went active, and only once that is finished. Going active first
+   * avoids waking the wrong people at the wrong time -- e.g., a
+   * C_RetryMessage running before a C_Active and finding no pending
+   * value.
+   */
+  class C_Committed : public Context {
+    PaxosService *owner;
+  public:
+    explicit C_Committed(PaxosService *p) : owner(p) { }
+    void finish(int r) override {
+      owner->proposing = false;
+      if (r >= 0) {
+	owner->_active();
+	return;
+      }
+      if (r != -ECANCELED && r != -EAGAIN)
+	ceph_abort_msg("bad return value for C_Committed");
+    }
+  };
+  paxos.queue_pending_finisher(new C_Committed(this));
+  paxos.trigger_propose();
+}
+
+bool PaxosService::should_stash_full()
+{
+  const version_t latest_full = get_version_latest_full();
+  /* @note The first check is redundant and is kept only for clarity:
+   *	   when latest_full is 0, the second check
+   *	   (latest_full <= get_trim_to()) would return true anyway.
+   */
+  if (!latest_full || latest_full <= get_trim_to())
+    return true;
+  const version_t since_full = get_last_committed() - latest_full;
+  return since_full > (version_t)g_conf()->paxos_stash_full_interval;
+}
+
+void PaxosService::restart()
+{
+  dout(10) << __func__ << dendl;
+  if (proposal_timer != nullptr) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+  // everything waiting on a finished proposal is completed with -EAGAIN
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+
+  // drop whatever was pending and clear the proposing flag
+  if (have_pending)
+    discard_pending();
+  have_pending = false;
+  proposing = false;
+
+  on_restart();
+}
+
+void PaxosService::election_finished()
+{
+  dout(10) << __func__ << dendl;
+  // complete everything parked on waiting_for_finished_proposal with -EAGAIN
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+
+  // make sure we update our state
+  _active();
+}
+
+void PaxosService::_active()
+{
+  if (is_proposing()) {
+    dout(10) << __func__ << " - proposing" << dendl;
+    return;
+  }
+  if (!is_active()) {
+    dout(10) << __func__ << " - not active" << dendl;
+    /**
+     * Callback that re-enters PaxosService::_active once the condition
+     * we are waiting on has been fulfilled (a negative result is ignored).
+     *
+     * Here it is queued through wait_for_active_ctx, so that we get
+     * another go at activating later, e.g. once Paxos becomes active
+     * after an election has finished.
+     */
+    class C_Active : public Context {
+      PaxosService *owner;
+    public:
+      explicit C_Active(PaxosService *s) : owner(s) {}
+      void finish(int r) override {
+	if (r < 0) return;
+	owner->_active();
+      }
+    };
+    wait_for_active_ctx(new C_Active(this));
+    return;
+  }
+  dout(10) << __func__ << dendl;
+
+  // only the leader creates a pending state
+  if (!mon.is_leader()) {
+    dout(7) << __func__ << " we are not the leader, hence we propose nothing!" << dendl;
+  } else {
+    dout(7) << __func__ << " creating new pending" << dendl;
+    if (!have_pending) {
+      create_pending();
+      have_pending = true;
+    }
+
+    if (get_last_committed() == 0) {
+      // nothing committed yet: create the initial state and propose it
+      create_initial();
+      propose_pending();
+      return;
+    }
+  }
+
+  // wake up anyone who came in while we were proposing.  note that
+  // anyone waiting for the previous proposal to commit is no longer
+  // on this list; it is on Paxos's.
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, 0);
+
+  if (mon.is_leader())
+    upgrade_format();
+
+  // NOTE: on_active() may get called twice if we commit an old
+  // paxos value.  Implementations should be mindful of that.
+  on_active();
+}
+
+
+void PaxosService::shutdown()
+{
+  cancel_events();
+
+  if (proposal_timer != nullptr) {
+    dout(10) << " canceling proposal_timer " << proposal_timer << dendl;
+    mon.timer.cancel_event(proposal_timer);
+    proposal_timer = nullptr;
+  }
+  // whoever is still waiting on a finished proposal gets -EAGAIN
+  finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+  // last, give the implementation its own shutdown hook
+  on_shutdown();
+}
+
+void PaxosService::maybe_trim()
+{
+  if (!is_writeable())
+    return;
+  // Trim versions [first_committed, trim_to), subject to the paxos_service_trim_* options.
+  const version_t first_committed = get_first_committed();
+  version_t trim_to = get_trim_to();
+  dout(20) << __func__ << " " << first_committed << "~" << trim_to << dendl;
+  // Nothing to trim.  Equality must bail out too: with trim_min == 0 an
+  // empty range would otherwise reach trim(), which asserts from != to.
+  if (trim_to <= first_committed) {
+    dout(10) << __func__ << " trim_to " << trim_to << " <= first_committed "
+	     << first_committed << dendl;
+    return;
+  }
+
+  version_t to_remove = trim_to - first_committed;
+  const version_t trim_min = g_conf().get_val<version_t>("paxos_service_trim_min");
+  if (trim_min > 0 && to_remove < trim_min) {
+    dout(10) << __func__ << " trim_to " << trim_to << " would only trim " << to_remove
+	     << " < paxos_service_trim_min " << trim_min << dendl;
+    return;
+  }
+
+  // Cap how much is removed in a single round (trim_max == 0 means no cap).
+  to_remove = [to_remove, trim_to, this] {
+    const version_t trim_max = g_conf().get_val<version_t>("paxos_service_trim_max");
+    if (trim_max == 0 || to_remove < trim_max) {
+      return to_remove;
+    }
+    if (to_remove < trim_max * 1.5) {
+      dout(10) << __func__ << " trim to " << trim_to << " would only trim " << to_remove
+             << " > paxos_service_trim_max, limiting to " << trim_max
+             << dendl;
+      return trim_max;
+    }
+    // Far above the cap: go halfway between the cap and the backlog,
+    // limited to trim_max * multiplier when a multiplier is configured.
+    const version_t new_trim_max = (trim_max + to_remove) / 2;
+    const uint64_t trim_max_multiplier = g_conf().get_val<uint64_t>("paxos_service_trim_max_multiplier");
+    if (trim_max_multiplier)
+      return std::min(new_trim_max, trim_max * trim_max_multiplier);
+    return new_trim_max;
+  }();
+  trim_to = first_committed + to_remove;
+
+  dout(10) << __func__ << " trimming to " << trim_to << ", " << to_remove << " states" << dendl;
+  MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
+  trim(t, first_committed, trim_to);
+  put_first_committed(t, trim_to);
+  cached_first_committed = trim_to;
+
+  // let the service add any extra stuff
+  encode_trim_extra(t, trim_to);
+  paxos.trigger_propose();
+}
+
+void PaxosService::trim(MonitorDBStore::TransactionRef t,
+			version_t from, version_t to)
+{
+  dout(10) << __func__ << " from " << from << " to " << to << dendl;
+  ceph_assert(from != to);
+  const string &svc = get_service_name();
+  for (version_t ver = from; ver < to; ++ver) {
+    dout(20) << __func__ << " " << ver << dendl;
+    t->erase(svc, ver);
+    // the "full" key is erased only if the store actually has one for ver
+    const string stashed = mon.store->combine_strings("full", ver);
+    if (mon.store->exists(svc, stashed)) {
+      dout(20) << __func__ << " " << stashed << dendl;
+      t->erase(svc, stashed);
+    }
+  }
+  if (!g_conf()->mon_compact_on_trim)
+    return;
+  // add compaction of both key ranges (plain and full) to the transaction
+  dout(20) << " compacting prefix " << svc << dendl;
+  t->compact_range(svc, stringify(from - 1), stringify(to));
+  t->compact_range(svc, mon.store->combine_strings(full_prefix_name, from - 1),
+		   mon.store->combine_strings(full_prefix_name, to));
+}
+
+void PaxosService::load_health()
+{
+  bufferlist bl;
+  mon.store->get("health", service_name, bl);
+  if (bl.length() == 0)
+    return;  // nothing was read for this service: health_checks stays as it is
+  using ceph::decode;
+  auto p = bl.cbegin();
+  decode(health_checks, p);
+}
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
new file mode 100644
index 000000000..93c5e7c81
--- /dev/null
+++ b/src/mon/PaxosService.h
@@ -0,0 +1,901 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_PAXOSSERVICE_H
+#define CEPH_PAXOSSERVICE_H
+
+#include "include/Context.h"
+#include "Paxos.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+
+/**
+ * A Paxos Service is an abstraction that easily allows one to obtain an
+ * association between a Monitor and a Paxos class, in order to implement any
+ * service.
+ */
+class PaxosService {
+  /**
+   * @defgroup PaxosService_h_class Paxos Service
+   * @{
+   */
+ public:
+  /**
+   * The Monitor with which this class is associated
+   */
+  Monitor &mon;
+  /**
+   * The Paxos instance with which this class is associated
+   */
+  Paxos &paxos;
+  /**
+   * Our name. This will be associated with the class implementing us, and will
+   * be used mainly for store-related operations.
+   */
+  std::string service_name;
+  /**
+   * If we are or have queued anything for proposal, this variable will be true
+   * until our proposal has been finished.
+   */
+  bool proposing;
+
+  bool need_immediate_propose = false;
+
+protected:
+  /**
+   * Services implementing us used to depend on the Paxos version, back when
+   * each service would have a Paxos instance for itself. However, now we only
+   * have a single Paxos instance, shared by all the services. Each service now
+   * must keep its own version, if so they wish. This variable should be used
+   * for that purpose.
+   */
+  version_t service_version;
+
+ private:
+  /**
+   * Event callback responsible for proposing our pending value once a timer 
+   * runs out and fires.
+   */
+  Context *proposal_timer;
+  /**
+   * If the implementation class has anything pending to be proposed to Paxos,
+   * then have_pending should be true; otherwise, false.
+   */
+  bool have_pending; 
+
+  /**
+   * health checks for this service
+   *
+   * Child must populate this during encode_pending() by calling encode_health().
+   */
+  health_check_map_t health_checks;
+protected:
+  /**
+   * format of our state in leveldb, 0 for default
+   */
+  version_t format_version;
+
+public:
+  const health_check_map_t& get_health_checks() const {
+    return health_checks;
+  }
+
+  /**
+   * @defgroup PaxosService_h_callbacks Callback classes
+   * @{
+   */
+  /**
+   * Retry dispatching a given service message
+   *
+   * This callback class is used when we had to wait for some condition to
+   * become true while we were dispatching it.
+   *
+   * For instance, if the message's version isn't readable, according to Paxos,
+   * then we must wait for it to become readable. So, we just queue an
+   * instance of this class onto the Paxos::wait_for_readable function, and
+   * we will retry the whole dispatch again once the callback is fired.
+   */
+  class C_RetryMessage : public C_MonOp {
+    PaxosService *svc;
+  public:
+    C_RetryMessage(PaxosService *s, MonOpRequestRef op_) :
+      C_MonOp(op_), svc(s) { }
+    void _finish(int r) override {
+      // success and -EAGAIN both re-run the dispatch; -ECANCELED does nothing
+      if (r == -ECANCELED)
+        return;
+      if (r != -EAGAIN && r < 0)
+	ceph_abort_msg("bad C_RetryMessage return value");
+      svc->dispatch(op);
+    }
+  };
+
+  class C_ReplyOp : public C_MonOp {  // sends a prepared reply for op when finished with r >= 0
+    Monitor &mon;
+    MonOpRequestRef op;  // NOTE(review): C_MonOp seems to hold an op too (C_RetryMessage uses it) -- confirm this copy is needed
+    MessageRef reply;
+  public:
+    C_ReplyOp(PaxosService *s, MonOpRequestRef o, MessageRef r) :
+      C_MonOp(o), mon(s->mon), op(o), reply(r) { }
+    void _finish(int r) override {
+      if (r >= 0) {  // a negative result sends nothing
+	mon.send_reply(op, reply.detach());
+      }
+    }
+  };
+
+  /**
+   * @}
+   */
+
+  /**
+   * @param mn A Monitor instance
+   * @param p A Paxos instance
+   * @param name Our service's name.
+   */
+  PaxosService(Monitor &mn, Paxos &p, std::string name) 
+    : mon(mn), paxos(p), service_name(name),
+      proposing(false),
+      service_version(0), proposal_timer(0), have_pending(false),
+      format_version(0),
+      last_committed_name("last_committed"),
+      first_committed_name("first_committed"),
+      full_prefix_name("full"), full_latest_name("latest"),
+      cached_first_committed(0), cached_last_committed(0)
+  {
+  }
+
+  virtual ~PaxosService() {}
+
+  /**
+   * Get the service's name.
+   *
+   * @returns The service's name.
+   */
+  const std::string& get_service_name() const { return service_name; }
+
+  /**
+   * Get the store prefixes we utilize
+   */
+  virtual void get_store_prefixes(std::set<std::string>& s) const {
+    s.insert(service_name);
+  }
+  
+  // i implement and you ignore
+  /**
+   * Informs this instance that it should consider itself restarted.
+   *
+   * This means that we will cancel our proposal_timer event, if any exists.
+   */
+  void restart();
+  /**
+   * Informs this instance that an election has finished.
+   *
+   * This means that we will complete every callback waiting for a finished
+   * proposal with -EAGAIN and then make sure we obtain a new state. (The
+   * pending state itself is discarded by PaxosService::restart.)
+   *
+   * Our state shall be updated by PaxosService::_active if the Paxos is
+   * active; otherwise, we will wait for it to become active by adding a
+   * PaxosService::C_Active callback to it.
+   */
+  void election_finished();
+  /**
+   * Informs this instance that it is supposed to shutdown.
+   *
+   * Basically, it will instruct Paxos to cancel all events/callbacks and then
+   * will cancel the proposal_timer event if any exists.
+   */
+  void shutdown();
+
+private:
+  /**
+   * Update our state by updating it from Paxos, and then creating a new
+   * pending state if need be.
+   *
+   * @remarks We only create a pending state when our Monitor is the Leader.
+   *
+   * @pre Paxos is active
+   * @post have_pending is true if our Monitor is the Leader and Paxos is
+   *	   active
+   */
+  void _active();
+
+public:
+  /**
+   * Propose a new value through Paxos.
+   *
+   * This function should be called by the classes implementing 
+   * PaxosService, in order to propose a new value through Paxos.
+   *
+   * @pre The implementation class implements the encode_pending function.
+   * @pre have_pending is true
+   * @pre Our monitor is the Leader
+   * @pre Paxos is active
+   * @post Cancel the proposal timer, if any
+   * @post have_pending is false
+   * @post propose pending value through Paxos
+   *
+   * @note This function depends on the implementation of encode_pending on
+   *	   the class that is implementing PaxosService
+   */
+  void propose_pending();
+
+  /**
+   * Let others request us to propose.
+   *
+   * At the moment, this is just a wrapper to propose_pending() with an
+   * extra check for is_writeable(), but it's a good practice to dissociate
+   * requests for proposals from direct usage of propose_pending() for
+   * future use -- we might want to perform additional checks or put a
+   * request on hold, for instance.
+   */
+  void request_proposal() {
+    ceph_assert(is_writeable());
+    // writeable is checked up front; propose_pending() asserts the rest
+    propose_pending();
+  }
+  /**
+   * Request service @p other to perform a proposal.
+   *
+   * We could simply use the function above, requesting @p other directly,
+   * but we might eventually want to do something to the request -- say,
+   * set a flag stating we're waiting on a cross-proposal to be finished.
+   */
+  void request_proposal(PaxosService *other) {
+    ceph_assert(other != NULL);
+    ceph_assert(other->is_writeable());
+    // other->request_proposal() asserts is_writeable() again before proposing
+    other->request_proposal();
+  }
+
+  /**
+   * Dispatch a message by passing it to several different functions that are
+   * either implemented directly by this service, or that should be implemented
+   * by the class implementing this service.
+   *
+   * @param op The request whose message is to be dispatched
+   * @returns 'true'; every path of the current implementation returns 'true'.
+   */
+  bool dispatch(MonOpRequestRef op);
+
+  void refresh(bool *need_bootstrap);
+  void post_refresh();
+
+  /**
+   * @defgroup PaxosService_h_override_funcs Functions that should be
+   *					     overridden.
+   *
+   * These functions should be overridden at will by the class implementing
+   * this service.
+   * @{
+   */
+  /**
+   * Create the initial state for your system.
+   *
+   * In some of ours the state is actually set up elsewhere so this does
+   * nothing.
+   */
+  virtual void create_initial() = 0;
+
+  /**
+   * Query the Paxos system for the latest state and apply it if it's newer
+   * than the current Monitor state.
+   */
+  virtual void update_from_paxos(bool *need_bootstrap) = 0;
+
+  /**
+   * Hook called after all services have refreshed their state from paxos
+   *
+   * This is useful for doing any update work that depends on other
+   * service's having up-to-date state.
+   */
+  virtual void post_paxos_update() {}
+
+  /**
+   * Init on startup
+   *
+   * This is called on mon startup, after all of the PaxosService instances'
+   * update_from_paxos() methods have been called
+   */
+  virtual void init() {}
+
+  /**
+   * Create the pending state.
+   *
+   * @invariant This function is only called on a Leader.
+   * @remarks This created state is then modified by incoming messages.
+   * @remarks Called at startup and after every Paxos ratification round.
+   */
+  virtual void create_pending() = 0;
+
+  /**
+   * Encode the pending state into a ceph::buffer::list for ratification and
+   * transmission as the next state.
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @param t The transaction to hold all changes.
+   */
+  virtual void encode_pending(MonitorDBStore::TransactionRef t) = 0;
+
+  /**
+   * Discard the pending state
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @remarks This function is NOT overridden in any of our code, but it is
+   *	      called in PaxosService::restart if have_pending is
+   *	      true.
+   */
+  virtual void discard_pending() { }
+
+  /**
+   * Look at the query; if the query can be handled without changing state,
+   * do so.
+   *
+   * @param op The request carrying the query message
+   * @returns 'true' if the query was handled (e.g., was a read that got
+   *	      answered, was a state change that has no effect); 'false' 
+   *	      otherwise.
+   */
+  virtual bool preprocess_query(MonOpRequestRef op) = 0;
+
+  /**
+   * Apply the message to the pending state.
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @param op The request op for an update message
+   * @returns 'true' if the update message was handled (e.g., a command that
+   *	      went through); 'false' otherwise.
+   */
+  virtual bool prepare_update(MonOpRequestRef op) = 0;
+  /**
+   * @}
+   */
+
+  /**
+   * Determine if the Paxos system should vote on pending, and if so how long
+   * it should wait to vote.
+   *
+   * @param[out] delay The wait time, used so we can limit the update traffic
+   *		       spamming.
+   * @returns 'true' if the Paxos system should propose; 'false' otherwise.
+   */
+  virtual bool should_propose(double &delay);
+
+  /**
+   * force an immediate propose.
+   *
+   * This is meant to be called from prepare_update(op).
+   */
+  void force_immediate_propose() {
+    need_immediate_propose = true;
+  }
+
+  /**
+   * @defgroup PaxosService_h_courtesy Courtesy functions
+   *
+   * Courtesy functions, in case the class implementing this service has
+   * anything it wants/needs to do at these times.
+   * @{
+   */
+  /**
+   * This is called when the Paxos state goes to active.
+   *
+   * On the peon, this is after each election.
+   * On the leader, this is after each election, *and* after each completed
+   * proposal.
+   *
+   * @note This function may get called twice in certain recovery cases.
+   */
+  virtual void on_active() { }
+
+  /**
+   * This is called when we are shutting down
+   */
+  virtual void on_shutdown() {}
+
+  /**
+   * this is called when activating on the leader
+   *
+   * it should conditionally upgrade the on-disk format by proposing a transaction
+   */
+  virtual void upgrade_format() { }
+
+  /**
+   * this is called when we detect the store has just upgraded underneath us
+   */
+  virtual void on_upgrade() {}
+
+  /**
+   * Called when the Paxos system enters a Leader election.
+   *
+   * @remarks It's a courtesy method, in case the class implementing this
+   *	      service has anything it wants/needs to do at that time.
+   */
+  virtual void on_restart() { }
+  /**
+   * @}
+   */
+
+  /**
+   * Tick.
+   */
+  virtual void tick() {}
+
+  void encode_health(const health_check_map_t& next,
+		     MonitorDBStore::TransactionRef t) {
+    using ceph::encode;
+    ceph::buffer::list bl;
+    encode(next, bl);
+    t->put("health", service_name, bl);
+    mon.log_health(next, health_checks, t);
+  }
+  void load_health();
+
+  /**
+   * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
+   *					 all the services implementing this
+   *					 class, and, being almost the only keys
+   *					 used, should be standardized to avoid
+   *					 mistakes.
+   * @{
+   */
+  const std::string last_committed_name;
+  const std::string first_committed_name;
+  const std::string full_prefix_name;
+  const std::string full_latest_name;
+  /**
+   * @}
+   */
+
+ private:
+  /**
+   * @defgroup PaxosService_h_version_cache Variables holding cached values
+   *                                        for the most used versions (first
+   *                                        and last committed); we only have
+   *                                        to read them when the store is
+   *                                        updated, so in-between updates we
+   *                                        may very well use cached versions
+   *                                        and avoid the overhead.
+   * @{
+   */
+  version_t cached_first_committed;
+  version_t cached_last_committed;
+  /**
+   * @}
+   */
+
+  /**
+   * Callback list to be used whenever we are running a proposal through
+   * Paxos. These callbacks will be awaken whenever the said proposal
+   * finishes.
+   */
+  std::list<Context*> waiting_for_finished_proposal;
+
+ public:
+
+  /**
+   * Check if we are proposing a value through Paxos
+   *
+   * @returns true if we are proposing; false otherwise.
+   */
+  bool is_proposing() const {
+    return proposing;
+  }
+
+  /**
+   * Check if we are in the Paxos ACTIVE state.
+   *
+   * @note This function is a wrapper for Paxos::is_active
+   *
+   * @returns true if in state ACTIVE; false otherwise.
+   */
+  bool is_active() const {
+    return
+      !is_proposing() &&
+      (paxos.is_active() || paxos.is_updating() || paxos.is_writing());
+  }
+
+  /**
+   * Check if we are readable.
+   *
+   * This mirrors on the paxos check, except that we also verify that
+   *
+   *  - the client hasn't seen the future relative to this PaxosService
+   *  - this service isn't proposing.
+   *  - we have committed our initial state (last_committed > 0)
+   *
+   * @param ver The version we want to check if is readable
+   * @returns true if it is readable; false otherwise
+   */
+  bool is_readable(version_t ver = 0) const {
+    // All three must hold: the requested version is already committed,
+    // paxos is readable, and our initial state has been committed.
+    return ver <= get_last_committed() &&
+      paxos.is_readable(0) &&
+      get_last_committed() != 0;
+  }
+
+  /**
+   * Check if we are writeable.
+   *
+   * We consider ourselves writeable iff:
+   *
+   *  - we are not proposing a new version;
+   *  - we are ready to be written to -- i.e., we have a pending value;
+   *  - paxos is active, updating or writing, as tested by is_active().
+   *
+   * @returns true if writeable; false otherwise
+   */
+  bool is_writeable() const {
+    return is_active() && have_pending;
+  }
+
+  /**
+   * Wait for a proposal to finish.
+   *
+   * Add a callback to be awaken whenever our current proposal finishes being
+   * proposed through Paxos.
+   *
+   * @param c The callback to be awaken once the proposal is finished.
+   */
+  void wait_for_finished_proposal(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_finished_proposal");
+    waiting_for_finished_proposal.push_back(c);
+  }
+  void wait_for_finished_proposal_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_finished_proposal(o, c);
+  }
+
+  /**
+   * Wait for us to become active
+   *
+   * @param c The callback to be awaken once we become active.
+   */
+  void wait_for_active(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_active");
+
+    if (!is_proposing()) {
+      paxos.wait_for_active(op, c);
+      return;
+    }
+    wait_for_finished_proposal(op, c);
+  }
+  void wait_for_active_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_active(o, c);
+  }
+
+  /**
+   * Wait for us to become readable
+   *
+   * @param c The callback to be woken once we become readable.
+   * @param ver The version we want to wait on.
+   */
+  void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0) {
+    /* This is somewhat of a hack. We only do check if a version is readable on
+     * PaxosService::dispatch(), but, nonetheless, we must make sure that if that
+     * is why we are not readable, then we must wait on PaxosService and not on
+     * Paxos; otherwise, we may assert on Paxos::wait_for_readable() if it
+     * happens to be readable at that specific point in time.
+     */
+    if (op)
+      op->mark_event(service_name + ":wait_for_readable");
+
+    if (is_proposing() ||
+	ver > get_last_committed() ||
+	get_last_committed() == 0)
+      wait_for_finished_proposal(op, c);
+    else {
+      if (op)
+        op->mark_event(service_name + ":wait_for_readable/paxos");
+
+      paxos.wait_for_readable(op, c);
+    }
+  }
+
+  void wait_for_readable_ctx(Context *c, version_t ver = 0) {
+    MonOpRequestRef o; // will initialize the shared_ptr to NULL
+    wait_for_readable(o, c, ver);
+  }
+
+  /**
+   * Wait for us to become writeable
+   *
+   * @param c The callback to be awaken once we become writeable.
+   */
+  void wait_for_writeable(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_writeable");
+
+    if (is_proposing())
+      wait_for_finished_proposal(op, c);
+    else if (!is_writeable())
+      wait_for_active(op, c);
+    else
+      paxos.wait_for_writeable(op, c);
+  }
+  void wait_for_writeable_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_writeable(o, c);
+  }
+
+  
+  /**
+   * @defgroup PaxosService_h_Trim Functions for trimming states
+   * @{
+   */
+  /**
+   * trim service states if appropriate
+   *
+   * Called at same interval as tick()
+   */
+  void maybe_trim();
+
+  /**
+   * Auxiliary function to trim our state from version @p from to version
+   * @p to, not including; i.e., the interval [from, to[
+   *
+   * @param t The transaction to which we will add the trim operations.
+   * @param from the lower limit of the interval to be trimmed
+   * @param to the upper limit of the interval to be trimmed (not including)
+   */
+  void trim(MonitorDBStore::TransactionRef t, version_t from, version_t to);
+
+  /**
+   * encode service-specific extra bits into trim transaction
+   *
+   * @param tx transaction
+   * @param first new first_committed value
+   */
+  virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx,
+				 version_t first) {}
+
+  /**
+   * Get the version we should trim to.
+   *
+   * Should be overridden by a service if it wants to trim states.
+   *
+   * @returns the version we should trim to; if we return zero, it should be
+   *	      assumed that there's no version to trim to.
+   */
+  virtual version_t get_trim_to() const {
+    return 0;
+  }
+
+  /**
+   * @}
+   */
+  /**
+   * @defgroup PaxosService_h_Stash_Full
+   * @{
+   */
+  virtual bool should_stash_full();
+  /**
+   * Encode a full version on @p t
+   *
+   * @note We force every service to implement this function, since we strongly
+   *	   desire the encoding of full versions.
+   * @note Services that do not trim their state, will be bound to only create
+   *	   one full version. Full version stashing is determined/controlled by
+   *	   trimming: we stash a version each time a trim is bound to erase the
+   *	   latest full version.
+   *
+   * @param t Transaction on which the full version shall be encoded.
+   */
+  virtual void encode_full(MonitorDBStore::TransactionRef t) = 0;
+
+  /**
+   * @}
+   */
+
+  /**
+   * Cancel events.
+   *
+   * @note This function is a wrapper for Paxos::cancel_events
+   */
+  void cancel_events() {
+    paxos.cancel_events();
+  }
+
+  /**
+   * @defgroup PaxosService_h_store_funcs Back storage interface functions
+   * @{
+   */
+  /**
+   * @defgroup PaxosService_h_store_modify Wrapper function interface to access
+   *					   the back store for modification
+   *					   purposes
+   * @{
+   */
+  void put_first_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+    t->put(get_service_name(), first_committed_name, ver);
+  }
+  /**
+   * Set the last committed version to @p ver
+   *
+   * @param t A transaction to which we add this put operation
+   * @param ver The last committed version number being put
+   */
+  void put_last_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+    t->put(get_service_name(), last_committed_name, ver);
+
+    /* We only need to do this once, and that is when we are about to make our
+     * first proposal. There are some services that rely on first_committed
+     * being set -- and it should! -- so we need to guarantee that it is,
+     * specially because the services itself do not do it themselves. They do
+     * rely on it, but they expect us to deal with it, and so we shall.
+     */
+    if (!get_first_committed())
+      put_first_committed(t, ver);
+  }
+  /**
+   * Put the contents of @p bl into version @p ver
+   *
+   * @param t A transaction to which we will add this put operation
+   * @param ver The version to which we will add the value
+   * @param bl A ceph::buffer::list containing the version's value
+   */
+  void put_version(MonitorDBStore::TransactionRef t, version_t ver,
+		   ceph::buffer::list& bl) {
+    t->put(get_service_name(), ver, bl);
+  }
+  /**
+   * Put the contents of @p bl into a full version key for this service, that
+   * will be created with @p ver in mind.
+   *
+   * @param t The transaction to which we will add this put operation
+   * @param ver A version number
+   * @param bl A ceph::buffer::list containing the version's value
+   */
+  void put_version_full(MonitorDBStore::TransactionRef t,
+			version_t ver, ceph::buffer::list& bl) {
+    std::string key = mon.store->combine_strings(full_prefix_name, ver);
+    t->put(get_service_name(), key, bl);
+  }
+  /**
+   * Put the version number in @p ver into the key pointing to the latest full
+   * version of this service.
+   *
+   * @param t The transaction to which we will add this put operation
+   * @param ver A version number
+   */
+  void put_version_latest_full(MonitorDBStore::TransactionRef t, version_t ver) {
+    std::string key = mon.store->combine_strings(full_prefix_name, full_latest_name);
+    t->put(get_service_name(), key, ver);
+  }
+  /**
+   * Put the contents of @p bl into the key @p key.
+   *
+   * @param t A transaction to which we will add this put operation
+   * @param key The key to which we will add the value
+   * @param bl A ceph::buffer::list containing the value
+   */
+  void put_value(MonitorDBStore::TransactionRef t,
+		 const std::string& key, ceph::buffer::list& bl) {
+    t->put(get_service_name(), key, bl);
+  }
+
+  /**
+   * Put integer value @p v into the key @p key.
+   *
+   * @param t A transaction to which we will add this put operation
+   * @param key The key to which we will add the value
+   * @param v An integer
+   */
+  void put_value(MonitorDBStore::TransactionRef t,
+		 const std::string& key, version_t v) {
+    t->put(get_service_name(), key, v);
+  }
+
+  /**
+   * @}
+   */
+
+  /**
+   * @defgroup PaxosService_h_store_get Wrapper function interface to access
+   *					the back store for reading purposes
+   * @{
+   */
+
+  /**
+   * @defgroup PaxosService_h_version_cache Obtain cached versions for this
+   *                                        service.
+   * @{
+   */
+  /**
+   * Get the first committed version
+   *
+   * @returns Our first committed version (that is available)
+   */
+  version_t get_first_committed() const{
+    return cached_first_committed;
+  }
+  /**
+   * Get the last committed version
+   *
+   * @returns Our last committed version
+   */
+  version_t get_last_committed() const{
+    return cached_last_committed;
+  }
+
+  /**
+   * @}
+   */
+
+  /**
+   * Get the contents of a given version @p ver
+   *
+   * @param ver The version being obtained
+   * @param bl The ceph::buffer::list to be populated
+   * @return 0 on success; <0 otherwise
+   */
+  virtual int get_version(version_t ver, ceph::buffer::list& bl) {
+    return mon.store->get(get_service_name(), ver, bl);
+  }
+  /**
+   * Get the contents of a given full version of this service.
+   *
+   * @param ver A version number
+   * @param bl The ceph::buffer::list to be populated
+   * @returns 0 on success; <0 otherwise
+   */
+  virtual int get_version_full(version_t ver, ceph::buffer::list& bl) {
+    std::string key = mon.store->combine_strings(full_prefix_name, ver);
+    return mon.store->get(get_service_name(), key, bl);
+  }
+  /**
+   * Get the latest full version number
+   *
+   * @returns A version number
+   */
+  version_t get_version_latest_full() {
+    std::string key = mon.store->combine_strings(full_prefix_name, full_latest_name);
+    return mon.store->get(get_service_name(), key);
+  }
+
+  /**
+   * Get a value from a given key.
+   *
+   * @param[in] key The key
+   * @param[out] bl The ceph::buffer::list to be populated with the value
+   */
+  int get_value(const std::string& key, ceph::buffer::list& bl) {
+    return mon.store->get(get_service_name(), key, bl);
+  }
+  /**
+   * Get an integer value from a given key.
+   *
+   * @param[in] key The key
+   */
+  version_t get_value(const std::string& key) {
+    return mon.store->get(get_service_name(), key);
+  }
+
+  /**
+   * @}
+   */
+  /**
+   * @}
+   */
+};
+
+#endif
diff --git a/src/mon/Session.h b/src/mon/Session.h
new file mode 100644
index 000000000..3009d0239
--- /dev/null
+++ b/src/mon/Session.h
@@ -0,0 +1,295 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MON_SESSION_H
+#define CEPH_MON_SESSION_H
+
+#include <string>
+#include <string_view>
+
+#include "include/utime.h"
+#include "include/xlist.h"
+
+#include "global/global_context.h"
+#include "msg/msg_types.h"
+#include "mon/mon_types.h"
+
+#include "auth/AuthServiceHandler.h"
+#include "osd/OSDMap.h"
+
+#include "MonCap.h"
+
+struct MonSession;
+
+struct Subscription {
+  MonSession *session;
+  std::string type;
+  xlist<Subscription*>::item type_item;
+  version_t next = 0;
+  bool onetime = false;
+  bool incremental_onetime = false;  // has CEPH_FEATURE_INCSUBOSDMAP
+  // Only the session/type bindings vary; other fields use the defaults above.
+  Subscription(MonSession *s, const std::string& t)
+    : session(s), type(t), type_item(this) {}
+};
+
+struct MonSession : public RefCountedObject {
+  ConnectionRef con;
+  int con_type = 0;
+  uint64_t con_features = 0;  // zero if AnonConnection
+  entity_name_t name;
+  entity_addrvec_t addrs;
+  entity_addr_t socket_addr;
+  utime_t session_timeout;
+  bool closed = false;
+  xlist<MonSession*>::item item;
+  std::set<uint64_t> routed_request_tids;
+  MonCap caps;
+  bool validated_stretch_connection = false;
+
+  bool authenticated = false;  ///< true if auth handshake is complete
+
+  std::map<std::string, Subscription*> sub_map;
+  epoch_t osd_epoch = 0;       ///< the osdmap epoch sent to the mon client
+
+  AuthServiceHandler *auth_handler = nullptr;
+  EntityName entity_name;
+  uint64_t global_id = 0;
+  global_id_status_t global_id_status = global_id_status_t::NONE;
+
+  ConnectionRef proxy_con;
+  uint64_t proxy_tid = 0;
+
+  std::string remote_host;                ///< remote host name
+  std::map<std::string,std::string,std::less<>> last_config;    ///< most recently shared config
+  bool any_config = false;
+
+  MonSession(Connection *c)
+    : RefCountedObject(g_ceph_context),
+      con(c),
+      item(this) { }
+
+  void _ident(const entity_name_t& n, const entity_addrvec_t& av) {
+    con_type = con->get_peer_type();
+    name = n;
+    addrs = av;
+    socket_addr = con->get_peer_socket_addr();
+    if (con->get_messenger()) {
+      // only fill in features if this is a non-anonymous connection
+      con_features = con->get_features();
+    }
+  }
+
+  ~MonSession() override {
+    //generic_dout(0) << "~MonSession " << this << dendl;
+    // we should have been removed before we get destructed; see MonSessionMap::remove_session()
+    ceph_assert(!item.is_on_list());
+    ceph_assert(sub_map.empty());
+    delete auth_handler;
+  }
+
+  bool is_capable(std::string service, int mask) {
+    std::map<std::string,std::string> args;
+    return caps.is_capable(
+      g_ceph_context,
+      entity_name,
+      service, "", args,
+      mask & MON_CAP_R, mask & MON_CAP_W, mask & MON_CAP_X,
+      get_peer_socket_addr());
+  }
+
+  std::vector<std::string> get_allowed_fs_names() const {
+    return caps.allowed_fs_names();  // forwarded from this session's MonCap
+  }
+
+  bool fs_name_capable(std::string_view fsname, __u8 mask) {
+    return caps.fs_name_capable(entity_name, fsname, mask);  // checked for our entity_name
+  }
+
+  const entity_addr_t& get_peer_socket_addr() {
+    return socket_addr;
+  }
+
+  void dump(ceph::Formatter *f) const {
+    f->dump_stream("name") << name;
+    f->dump_stream("entity_name") << entity_name;
+    f->dump_object("addrs", addrs);
+    f->dump_object("socket_addr", socket_addr);
+    f->dump_string("con_type", ceph_entity_type_name(con_type));
+    f->dump_unsigned("con_features", con_features);
+    f->dump_stream("con_features_hex") << std::hex << con_features << std::dec;
+    f->dump_string("con_features_release",
+		   ceph_release_name(ceph_release_from_features(con_features)));
+    f->dump_bool("open", !closed);
+    f->dump_object("caps", caps);
+    f->dump_bool("authenticated", authenticated);
+    f->dump_unsigned("global_id", global_id);
+    f->dump_stream("global_id_status") << global_id_status;
+    f->dump_unsigned("osd_epoch", osd_epoch);
+    f->dump_string("remote_host", remote_host);
+  }
+};
+
+
+struct MonSessionMap {
+  xlist<MonSession*> sessions;
+  std::map<std::string, xlist<Subscription*>* > subs;
+  std::multimap<int, MonSession*> by_osd;
+  FeatureMap feature_map; // type -> features -> count
+
+  MonSessionMap() {}
+  ~MonSessionMap() {
+    // Every per-type list must already be drained; free the lists themselves.
+    for (auto it = subs.begin(); it != subs.end(); it = subs.erase(it)) {
+      ceph_assert(it->second->empty());
+      delete it->second;
+    }
+  }
+
+  unsigned get_size() const {
+    return sessions.size();
+  }
+
+  void remove_session(MonSession *s) {
+    // Unlink @p s from every index, mark it closed, drop add_session()'s ref.
+    ceph_assert(!s->closed);
+    for (auto& p : s->sub_map) {
+      p.second->type_item.remove_myself();
+      delete p.second;
+    }
+    s->sub_map.clear();
+    s->item.remove_myself();
+    if (s->name.is_osd() && s->name.num() >= 0) {
+      // stop at end(): find() can miss, and the key's run can be the last one
+      for (auto p = by_osd.find(s->name.num());
+	   p != by_osd.end() && p->first == s->name.num(); ++p)
+	if (p->second == s) {
+	  by_osd.erase(p);
+	  break;
+	}
+    }
+    if (s->con_features) {
+      feature_map.rm(s->con_type, s->con_features);
+    }
+    s->closed = true;
+    s->put();
+  }
+
+  MonSession *new_session(const entity_name_t& n,
+			  const entity_addrvec_t& av,
+			  Connection *c) {
+    MonSession *s = new MonSession(c);
+    ceph_assert(s);
+    s->_ident(n, av);
+    add_session(s);
+    return s;
+  }
+
+  void add_session(MonSession *s) {
+    s->session_timeout = ceph_clock_now();
+    s->session_timeout += g_conf()->mon_session_timeout;
+
+    sessions.push_back(&s->item);
+    s->get();
+    if (s->name.is_osd() &&
+	s->name.num() >= 0) {
+      by_osd.insert(std::pair<int,MonSession*>(s->name.num(), s));
+    }
+    if (s->con_features) {
+      feature_map.add(s->con_type, s->con_features);
+    }
+  }
+
+  MonSession *get_random_osd_session(OSDMap *osdmap) {
+    // Pick an indexed OSD session (null if none); not actually random, but close enough.
+    if (by_osd.empty())
+      return 0;
+    int n = by_osd.rbegin()->first + 1;
+    int r = rand() % n;
+    // r is at most the highest indexed osd id; start at the first id >= r
+    auto p = by_osd.lower_bound(r);
+    if (p == by_osd.end())
+      --p;
+    // no osdmap to validate against: take the starting candidate as-is
+    if (!osdmap) {
+      return p->second;
+    }
+    // otherwise scan outward from p, one step down and one step up per pass
+    MonSession *s = NULL;
+    // b walks toward begin(), f walks toward end(); both start on p
+    auto b = p;
+    auto f = p;
+    bool backward = true, forward = true;
+    while (backward || forward) {
+      if (backward) {
+        if (osdmap->is_up(b->first) &&
+	    osdmap->get_addrs(b->first) == b->second->con->get_peer_addrs()) {
+          s = b->second;
+          break;
+        }
+        if (b != by_osd.begin())
+          --b;
+        else
+          backward = false;
+      }
+      // NOTE(review): this probe checks is_up() only, not the peer addrs, and
+      forward = (f != by_osd.end());  // its first hit is p itself -- confirm intent
+      if (forward) {
+        if (osdmap->is_up(f->first)) {
+          s = f->second;
+          break;
+        }
+        ++f;
+      }
+    }
+    // still NULL if no candidate passed the checks above
+    return s;
+  }
+
+  void add_update_sub(MonSession *s, const std::string& what, version_t start, bool onetime, bool incremental_onetime) {
+    // Reuse the session's subscription for @p what, creating and indexing a
+    // new one on first use; either way (re)arm it with the given settings.
+    auto mine = s->sub_map.find(what);
+    if (mine == s->sub_map.end()) {
+      mine = s->sub_map.emplace(what, new Subscription(s, what)).first;
+      auto per_type = subs.find(what);
+      if (per_type == subs.end())
+	per_type = subs.emplace(what, new xlist<Subscription*>).first;
+      per_type->second->push_back(&mine->second->type_item);
+    }
+    Subscription *sub = mine->second;
+    sub->next = start;
+    sub->onetime = onetime;
+    sub->incremental_onetime = onetime && incremental_onetime;
+  }
+
+  void remove_sub(Subscription *sub) {
+    sub->session->sub_map.erase(sub->type);
+    sub->type_item.remove_myself();
+    delete sub;
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const MonSession& s)
+{
+  out << "MonSession(" << s.name << " " << s.addrs
+      << " is " << (s.closed ? "closed" : "open")
+      << " " << s.caps
+      << ", features 0x" << std::hex << s.con_features << std::dec
+      <<  " (" << ceph_release_name(ceph_release_from_features(s.con_features))
+      << "))";
+  return out;
+}
+
+#endif
diff --git a/src/mon/error_code.cc b/src/mon/error_code.cc
new file mode 100644
index 000000000..a2cd39299
--- /dev/null
+++ b/src/mon/error_code.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include "common/error_code.h"
+#include "common/errno.h"
+#include "error_code.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+
+namespace bs = boost::system;
+
+class mon_error_category : public ceph::converting_category {
+public:
+  mon_error_category(){}
+  const char* name() const noexcept override;
+  const char* message(int ev, char*, std::size_t) const noexcept override;
+  std::string message(int ev) const override;
+  bs::error_condition default_error_condition(int ev) const noexcept
+    override;
+  bool equivalent(int ev, const bs::error_condition& c) const
+    noexcept override;
+  using ceph::converting_category::equivalent;
+  int from_code(int ev) const noexcept override;
+};
+
+const char* mon_error_category::name() const noexcept {
+  return "mon";
+}
+
+const char* mon_error_category::message(int ev, char* buf,
+					std::size_t len) const noexcept {
+  // ev == 0 maps to a static string and leaves buf untouched.
+  if (ev == 0)
+    return "No error";
+  if (len == 0)
+    return buf;  // no room to write anything
+  // Copy at most len - 1 characters, then NUL-terminate.
+  const auto text = cpp_strerror(ev);
+  buf[text.copy(buf, len - 1)] = '\0';
+  return buf;
+}
+
+std::string mon_error_category::message(int ev) const {
+  if (ev == 0)
+    return "No error";
+
+  return cpp_strerror(ev);
+}
+
+bs::error_condition
+mon_error_category::default_error_condition(int ev) const noexcept {
+  return { ev, bs::generic_category() };
+}
+
+bool mon_error_category::equivalent(int ev,const bs::error_condition& c) const noexcept {
+  return default_error_condition(ev) == c;
+}
+
+int mon_error_category::from_code(int ev) const noexcept {
+  return -ev;
+}
+
+const bs::error_category& mon_category() noexcept {
+  static const mon_error_category c;
+  return c;
+}
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
diff --git a/src/mon/error_code.h b/src/mon/error_code.h
new file mode 100644
index 000000000..2a6e88061
--- /dev/null
+++ b/src/mon/error_code.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/system/error_code.hpp>
+
+#include "include/rados.h"
+
+const boost::system::error_category& mon_category() noexcept;
+
+// The Monitor, like the OSD, mostly replies with POSIX error codes.
+
+enum class mon_errc {
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::mon_errc> {
+  static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::mon_errc> {
+  static const bool value = false;
+};
+}
+
+// implicit conversion: is_error_code_enum<mon_errc> is true
+inline boost::system::error_code make_error_code(mon_errc e) noexcept {
+  return { static_cast<int>(e), mon_category() };
+}
+
+// explicit conversion only: is_error_condition_enum<mon_errc> is false
+inline boost::system::error_condition make_error_condition(mon_errc e) noexcept {
+  return { static_cast<int>(e), mon_category() };
+}
diff --git a/src/mon/health_check.h b/src/mon/health_check.h
new file mode 100644
index 000000000..4e74637f9
--- /dev/null
+++ b/src/mon/health_check.h
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+
+#include "include/health.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+
+struct health_check_t {
+  health_status_t severity;
+  std::string summary;
+  std::list<std::string> detail;
+  int64_t count = 0;
+
+  DENC(health_check_t, v, p) {
+    DENC_START(2, 1, p);
+    denc(v.severity, p);
+    denc(v.summary, p);
+    denc(v.detail, p);
+    if (struct_v >= 2) {
+      denc(v.count, p);
+    }
+    DENC_FINISH(p);
+  }
+
+  friend bool operator==(const health_check_t& l,
+			 const health_check_t& r) {
+    return l.severity == r.severity &&
+      l.summary == r.summary &&
+      l.detail == r.detail &&
+      l.count == r.count;
+  }
+  friend bool operator!=(const health_check_t& l,
+			 const health_check_t& r) {
+    return !(l == r);
+  }
+
+  void dump(ceph::Formatter *f, bool want_detail=true) const {
+    f->dump_stream("severity") << severity;
+
+    f->open_object_section("summary");
+    f->dump_string("message", summary);
+    f->dump_int("count", count);
+    f->close_section();
+
+    if (want_detail) {
+      f->open_array_section("detail");
+      for (auto& p : detail) {
+	f->open_object_section("detail_item");
+	f->dump_string("message", p);
+	f->close_section();
+      }
+      f->close_section();
+    }
+  }
+
+  static void generate_test_instances(std::list<health_check_t*>& ls) {
+    ls.push_back(new health_check_t);
+    ls.push_back(new health_check_t);
+    ls.back()->severity = HEALTH_ERR;
+    ls.back()->summary = "summarization";
+    ls.back()->detail = {"one", "two", "three"};
+    ls.back()->count = 42;
+  }
+};
+WRITE_CLASS_DENC(health_check_t)
+
+
+struct health_mute_t {
+  std::string code;     ///< presumably the muted check's code (test data: "OSD_DOWN")
+  utime_t ttl;          ///< dump() skips it while it equals utime_t()
+  bool sticky = false;
+  std::string summary;
+  int64_t count = 0;    ///< defaulted like `sticky` so a default instance never carries garbage
+
+  DENC(health_mute_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.code, p);
+    denc(v.ttl, p);
+    denc(v.sticky, p);
+    denc(v.summary, p);
+    denc(v.count, p);
+    DENC_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const {
+    f->dump_string("code", code);
+    if (ttl != utime_t()) {
+      f->dump_stream("ttl") << ttl;
+    }
+    f->dump_bool("sticky", sticky);
+    f->dump_string("summary", summary);
+    f->dump_int("count", count);
+  }
+
+  static void generate_test_instances(std::list<health_mute_t*>& ls) {
+    ls.push_back(new health_mute_t);  // default instance: relies on the member initializers
+    ls.push_back(new health_mute_t);
+    ls.back()->code = "OSD_DOWN";
+    ls.back()->ttl = utime_t(1, 2);
+    ls.back()->sticky = true;
+    ls.back()->summary = "foo bar";
+    ls.back()->count = 2;
+  }
+};
+WRITE_CLASS_DENC(health_mute_t)
+
+struct health_check_map_t {
+  std::map<std::string,health_check_t> checks;
+
+  DENC(health_check_map_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.checks, p);
+    DENC_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const {
+    for (auto& [code, check] : checks) {
+      f->dump_object(code, check);
+    }
+  }
+
+  static void generate_test_instances(std::list<health_check_map_t*>& ls) {
+    ls.push_back(new health_check_map_t);
+    ls.push_back(new health_check_map_t);
+    {
+      auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo", 2);
+      d.detail.push_back("a");
+      d.detail.push_back("b");
+    }
+    {
+      auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!", 3);
+      d.detail.push_back("c");
+      d.detail.push_back("d");
+      d.detail.push_back("e");
+    }
+  }
+
+  void clear() {
+    checks.clear();
+  }
+  bool empty() const {
+    return checks.empty();
+  }
+  void swap(health_check_map_t& other) {
+    checks.swap(other.checks);
+  }
+
+  health_check_t& add(const std::string& code,
+		      health_status_t severity,
+		      const std::string& summary,
+		      int64_t count) {
+    ceph_assert(checks.count(code) == 0);
+    health_check_t& r = checks[code];
+    r.severity = severity;
+    r.summary = summary;
+    r.count = count;
+    return r;
+  }
+  health_check_t& get_or_add(const std::string& code,
+			     health_status_t severity,
+			     const std::string& summary,
+			     int64_t count) {
+    health_check_t& r = checks[code];
+    r.severity = severity;
+    r.summary = summary;
+    r.count += count;
+    return r;
+  }
+
+  void merge(const health_check_map_t& o) {
+    for (auto& [code, check] : o.checks) {
+      auto [it, new_check] = checks.try_emplace(code, check);
+      if (!new_check) {
+        // merge details, and hope the summary matches!
+        it->second.detail.insert(
+          it->second.detail.end(),
+          check.detail.begin(),
+          check.detail.end());
+        it->second.count += check.count;
+      }
+    }
+  }
+
+  friend bool operator==(const health_check_map_t& l,
+			 const health_check_map_t& r) {
+    return l.checks == r.checks;
+  }
+  friend bool operator!=(const health_check_map_t& l,
+			 const health_check_map_t& r) {
+    return !(l == r);
+  }
+};
+WRITE_CLASS_DENC(health_check_map_t)
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
new file mode 100644
index 000000000..ce7184f37
--- /dev/null
+++ b/src/mon/mon_types.h
@@ -0,0 +1,660 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MON_TYPES_H
+#define CEPH_MON_TYPES_H
+
+#include <map>
+
+#include "include/Context.h"
+#include "include/util.h"
+#include "include/utime.h"
+#include "common/Formatter.h"
+#include "common/bit_str.h"
+#include "common/ceph_releases.h"
+
+// use as paxos_service index
+//
+// Values are consecutive and start at zero; PAXOS_NUM is not a service
+// itself but the number of entries before it, so it must stay last.
+enum {
+  PAXOS_MDSMAP = 0,
+  PAXOS_OSDMAP,
+  PAXOS_LOG,
+  PAXOS_MONMAP,
+  PAXOS_AUTH,
+  PAXOS_MGR,
+  PAXOS_MGRSTAT,
+  PAXOS_HEALTH,
+  PAXOS_CONFIG,
+  PAXOS_KV,
+  PAXOS_NUM   // count of the enumerators above
+};
+
+#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012"
+
+// map of entity_type -> features -> count
+struct FeatureMap {
+  std::map<uint32_t,std::map<uint64_t,uint64_t>> m;
+
+  /// Count one more entity of @p type advertising @p features.
+  /// Monitor entries are ignored here; they are counted via add_mon().
+  void add(uint32_t type, uint64_t features) {
+    if (type != CEPH_ENTITY_TYPE_MON) {
+      ++m[type][features];
+    }
+  }
+
+  /// Count one more monitor advertising @p features.
+  void add_mon(uint64_t features) {
+    ++m[CEPH_ENTITY_TYPE_MON][features];
+  }
+
+  /// Undo one add() for (@p type, @p features).  The pair must currently be
+  /// counted (asserted).  Empty inner maps are pruned so that no zero
+  /// counters linger.  Monitor entries are ignored, mirroring add().
+  void rm(uint32_t type, uint64_t features) {
+    if (type == CEPH_ENTITY_TYPE_MON) {
+      return;
+    }
+    auto by_type = m.find(type);
+    ceph_assert(by_type != m.end());
+    auto& counters = by_type->second;
+    auto counter = counters.find(features);
+    ceph_assert(counter != counters.end());
+    counter->second -= 1;
+    if (counter->second != 0) {
+      return;
+    }
+    counters.erase(counter);
+    if (counters.empty()) {
+      m.erase(by_type);
+    }
+  }
+
+  /// Add every counter of @p o onto ours.
+  FeatureMap& operator+=(const FeatureMap& o) {
+    for (const auto& [type, counters] : o.m) {
+      auto& mine = m[type];
+      for (const auto& [features, num] : counters) {
+        mine[features] += num;
+      }
+    }
+    return *this;
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(m, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(m, p);
+    DECODE_FINISH(p);
+  }
+
+  /// Emit one array section per entity type, holding one "group" object per
+  /// distinct feature mask (hex mask, matching release name, entity count).
+  void dump(ceph::Formatter *f) const {
+    for (const auto& [type, counters] : m) {
+      f->open_array_section(ceph_entity_type_name(type));
+      for (const auto& [features, num] : counters) {
+        std::stringstream hex_mask;
+        hex_mask << "0x" << std::hex << features << std::dec;
+        f->open_object_section("group");
+        f->dump_string("features", hex_mask.str());
+        f->dump_string("release",
+                       ceph_release_name(ceph_release_from_features(features)));
+        f->dump_unsigned("num", num);
+        f->close_section();
+      }
+      f->close_section();
+    }
+  }
+};
+WRITE_CLASS_ENCODER(FeatureMap)
+
+/**
+ * leveldb store stats
+ *
+ * If we ever decide to support multiple backends for the monitor store,
+ * we should then create an abstract class 'MonitorStoreStats' of sorts
+ * and inherit it on LevelDBStoreStats.  I'm sure you'll figure something
+ * out.
+ */
+struct LevelDBStoreStats {
+  // byte counters, all starting out at zero
+  uint64_t bytes_total = 0;
+  uint64_t bytes_sst = 0;
+  uint64_t bytes_log = 0;
+  uint64_t bytes_misc = 0;
+  utime_t last_update;
+
+  LevelDBStoreStats() = default;
+
+  /// Write the four byte counters and the update stamp to @p f (must not
+  /// be null).
+  void dump(ceph::Formatter *f) const {
+    ceph_assert(f != nullptr);
+    f->dump_int("bytes_total", bytes_total);
+    f->dump_int("bytes_sst", bytes_sst);
+    f->dump_int("bytes_log", bytes_log);
+    f->dump_int("bytes_misc", bytes_misc);
+    f->dump_stream("last_updated") << last_update;
+  }
+
+  // NOTE: field order below is the wire format; encode and decode must
+  // stay in step.
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(bytes_total, bl);
+    encode(bytes_sst, bl);
+    encode(bytes_log, bl);
+    encode(bytes_misc, bl);
+    encode(last_update, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator &p) {
+    DECODE_START(1, p);
+    decode(bytes_total, p);
+    decode(bytes_sst, p);
+    decode(bytes_log, p);
+    decode(bytes_misc, p);
+    decode(last_update, p);
+    DECODE_FINISH(p);
+  }
+
+  /// Append two heap-allocated samples to @p ls: a default-constructed one
+  /// followed by one with every counter populated.
+  static void generate_test_instances(std::list<LevelDBStoreStats*>& ls) {
+    ls.push_back(new LevelDBStoreStats);
+
+    auto *populated = new LevelDBStoreStats;
+    populated->bytes_total = 1024*1024;
+    populated->bytes_sst = 512*1024;
+    populated->bytes_log = 256*1024;
+    populated->bytes_misc = 256*1024;
+    populated->last_update = utime_t();
+    ls.push_back(populated);
+  }
+};
+WRITE_CLASS_ENCODER(LevelDBStoreStats)
+
+// data stats
+
+struct DataStats {
+  ceph_data_stats_t fs_stats;
+  // data dir
+  utime_t last_update;
+  LevelDBStoreStats store_stats;
+
+  /// Dump filesystem usage (converted from bytes to KiB), the update stamp
+  /// and the nested store statistics to @p f (must not be null).
+  void dump(ceph::Formatter *f) const {
+    ceph_assert(f != nullptr);
+    f->dump_int("kb_total", (fs_stats.byte_total/1024));
+    f->dump_int("kb_used", (fs_stats.byte_used/1024));
+    f->dump_int("kb_avail", (fs_stats.byte_avail/1024));
+    f->dump_int("avail_percent", fs_stats.avail_percent);
+    f->dump_stream("last_updated") << last_update;
+    f->open_object_section("store_stats");
+    store_stats.dump(f);
+    f->close_section();
+  }
+
+  /// Encode as struct version 3 (byte-granular counters), compatible back
+  /// to version 1.
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(fs_stats.byte_total, bl);
+    encode(fs_stats.byte_used, bl);
+    encode(fs_stats.byte_avail, bl);
+    encode(fs_stats.avail_percent, bl);
+    encode(last_update, bl);
+    encode(store_stats, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  /// Decode any struct version from 1 through 3.
+  void decode(ceph::buffer::list::const_iterator &p) {
+    // Declare the newest version this decoder understands.  It matches the
+    // head version written by encode() and the struct_v > 2 branch below.
+    DECODE_START(3, p);
+    // we moved from having fields in kb to fields in byte
+    if (struct_v > 2) {
+      decode(fs_stats.byte_total, p);
+      decode(fs_stats.byte_used, p);
+      decode(fs_stats.byte_avail, p);
+    } else {
+      // v1/v2 payloads carry KiB; scale up to bytes
+      uint64_t t;
+      decode(t, p);
+      fs_stats.byte_total = t*1024;
+      decode(t, p);
+      fs_stats.byte_used = t*1024;
+      decode(t, p);
+      fs_stats.byte_avail = t*1024;
+    }
+    decode(fs_stats.avail_percent, p);
+    decode(last_update, p);
+    // store_stats was only added in v2
+    if (struct_v > 1)
+      decode(store_stats, p);
+
+    DECODE_FINISH(p);
+  }
+};
+WRITE_CLASS_ENCODER(DataStats)
+
+struct ScrubResult {
+  std::map<std::string,uint32_t> prefix_crc;  ///< prefix -> crc
+  std::map<std::string,uint64_t> prefix_keys; ///< prefix -> key count
+
+  /// @returns true when either the per-prefix crcs or the per-prefix key
+  /// counts differ.  Const-qualified so that const results can be compared.
+  bool operator!=(const ScrubResult& other) const {
+    return prefix_crc != other.prefix_crc || prefix_keys != other.prefix_keys;
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(prefix_crc, bl);
+    encode(prefix_keys, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(prefix_crc, p);
+    decode(prefix_keys, p);
+    DECODE_FINISH(p);
+  }
+  /// Emit a "crc" object and a "keys" object, each keyed by prefix.
+  void dump(ceph::Formatter *f) const {
+    f->open_object_section("crc");
+    for (const auto& [prefix, crc] : prefix_crc)
+      f->dump_unsigned(prefix.c_str(), crc);
+    f->close_section();
+    f->open_object_section("keys");
+    for (const auto& [prefix, num_keys] : prefix_keys)
+      f->dump_unsigned(prefix.c_str(), num_keys);
+    f->close_section();
+  }
+  /// Append two heap-allocated samples to @p ls: an empty result followed
+  /// by one carrying a single crc and a single key count.
+  static void generate_test_instances(std::list<ScrubResult*>& ls) {
+    ls.push_back(new ScrubResult);
+    ls.push_back(new ScrubResult);
+    ls.back()->prefix_crc["foo"] = 123;
+    ls.back()->prefix_keys["bar"] = 456;
+  }
+};
+WRITE_CLASS_ENCODER(ScrubResult)
+
+/// Print @p r as "ScrubResult(keys <prefix_keys> crc <prefix_crc>)".
+inline std::ostream& operator<<(std::ostream& out, const ScrubResult& r) {
+  out << "ScrubResult(keys " << r.prefix_keys;
+  out << " crc " << r.prefix_crc;
+  out << ")";
+  return out;
+}
+
+/// free-form key/value description of a daemon's host: os, kernel,
+/// hostname, memory info, cpu model and the like.
+using Metadata = std::map<std::string, std::string>;
+
+namespace ceph::features::mon {
+  /**
+   * Get a feature's name based on its value.
+   *
+   * @param b raw feature value
+   *
+   * @remarks
+   *    Consumers should not assume this interface will never change.
+   * @remarks
+   *    As the number of features increases, so may the internal
+   *    representation of the raw features. When this happens, this
+   *    interface will change accordingly. So should consumers of this
+   *    interface.
+   */
+  static inline const char *get_feature_name(uint64_t b);
+} // namespace ceph::features::mon
+
+
+/// Global-namespace shorthand for ceph::features::mon::get_feature_name().
+///
+/// @param b raw feature value
+/// @returns whatever get_feature_name() returns for @p b
+inline const char *ceph_mon_feature_name(uint64_t b)
+{
+  return ceph::features::mon::get_feature_name(b);
+}
+
+/**
+ * A set of monitor features, stored as a 64-bit mask.
+ *
+ * Offers the usual bitwise operators plus named set operations (diff,
+ * intersection, contains_*), printing/dumping helpers and versioned
+ * encode/decode.
+ */
+class mon_feature_t {
+
+  static constexpr int HEAD_VERSION = 1;
+  static constexpr int COMPAT_VERSION = 1;
+
+  // mon-specific features
+  uint64_t features;
+
+public:
+
+  explicit constexpr
+  mon_feature_t(const uint64_t f) : features(f) { }
+
+  constexpr
+  mon_feature_t() :
+    features(0) { }
+
+  constexpr
+  mon_feature_t(const mon_feature_t &o) :
+    features(o.features) { }
+
+  // The copy constructor above is user-provided, so declare copy assignment
+  // explicitly rather than relying on the implicitly generated one, whose
+  // generation is deprecated in that situation.
+  mon_feature_t& operator=(const mon_feature_t &o) = default;
+
+  mon_feature_t& operator&=(const mon_feature_t other) {
+    features &= other.features;
+    return (*this);
+  }
+
+  /**
+   * Obtain raw features
+   *
+   * @remarks
+   *    Consumers should not assume this interface will never change.
+   * @remarks
+   *    As the number of features increases, so may the internal
+   *    representation of the raw features. When this happens, this
+   *    interface will change accordingly. So should consumers of this
+   *    interface.
+   */
+  constexpr uint64_t get_raw() const {
+    return features;
+  }
+
+  constexpr
+  friend mon_feature_t operator&(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features & b.features);
+  }
+
+  mon_feature_t& operator|=(const mon_feature_t other) {
+    features |= other.features;
+    return (*this);
+  }
+
+  constexpr
+  friend mon_feature_t operator|(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features | b.features);
+  }
+
+  constexpr
+  friend mon_feature_t operator^(const mon_feature_t a,
+                                 const mon_feature_t b) {
+    return mon_feature_t(a.features ^ b.features);
+  }
+
+  mon_feature_t& operator^=(const mon_feature_t other) {
+    features ^= other.features;
+    return (*this);
+  }
+
+  constexpr bool operator==(const mon_feature_t other) const {
+    return (features == other.features);
+  }
+
+  constexpr bool operator!=(const mon_feature_t other) const {
+    return (features != other.features);
+  }
+
+  /// @returns true if no feature bit is set
+  constexpr bool empty() const {
+    return features == 0;
+  }
+
+  /**
+   * Set difference of our features in respect to @p other
+   *
+   * Returns all the elements in our features that are not in @p other
+   *
+   * @returns all the features not in @p other
+   */
+  constexpr mon_feature_t diff(const mon_feature_t other) const {
+    // same result as (features ^ other.features) & features
+    return mon_feature_t(features & ~other.features);
+  }
+
+  /**
+   * Set intersection of our features and @p other
+   *
+   * Returns all the elements common to both our features and the
+   * features of @p other
+   *
+   * @returns the features common to @p other and us
+   */
+  constexpr mon_feature_t intersection(const mon_feature_t other) const {
+    return mon_feature_t((features & other.features));
+  }
+
+  /**
+   * Checks whether we have all the features in @p other
+   *
+   * Returns true if we have all the features in @p other
+   *
+   * @returns true if we contain all the features in @p other
+   * @returns false if we do not contain some of the features in @p other
+   */
+  constexpr bool contains_all(const mon_feature_t other) const {
+    return intersection(other) == other;
+  }
+
+  /**
+   * Checks whether we contain any of the features in @p other.
+   *
+   * @returns true if we contain any of the features in @p other
+   * @returns false if we don't contain any of the features in @p other
+   */
+  constexpr bool contains_any(const mon_feature_t other) const {
+    return !intersection(other).empty();
+  }
+
+  /// Turn on every bit that is set in @p f.
+  void set_feature(const mon_feature_t f) {
+    features |= f.features;
+  }
+
+  /// Turn off every bit that is set in @p f.
+  void unset_feature(const mon_feature_t f) {
+    features &= ~(f.features);
+  }
+
+  /// Print the set features, by name, enclosed in square brackets.
+  void print(std::ostream& out) const {
+    out << "[";
+    print_bit_str(features, out, ceph::features::mon::get_feature_name);
+    out << "]";
+  }
+
+  /// Like print(), passing `true` as print_bit_str()'s extra argument.
+  void print_with_value(std::ostream& out) const {
+    out << "[";
+    print_bit_str(features, out, ceph::features::mon::get_feature_name, true);
+    out << "]";
+  }
+
+  /// Dump the set features as an array section named @p sec_name
+  /// ("features" when @p sec_name is null).
+  void dump(ceph::Formatter *f, const char *sec_name = nullptr) const {
+    f->open_array_section((sec_name ? sec_name : "features"));
+    dump_bit_str(features, f, ceph::features::mon::get_feature_name);
+    f->close_section();
+  }
+
+  /// Like dump(), passing `true` as dump_bit_str()'s extra argument.
+  void dump_with_value(ceph::Formatter *f, const char *sec_name = nullptr) const {
+    f->open_array_section((sec_name ? sec_name : "features"));
+    dump_bit_str(features, f, ceph::features::mon::get_feature_name, true);
+    f->close_section();
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(HEAD_VERSION, COMPAT_VERSION, bl);
+    encode(features, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    // Declare the newest version we understand (HEAD), not the oldest one
+    // we remain compatible with.  Both are 1 today, so this only matters
+    // once HEAD_VERSION is bumped.
+    DECODE_START(HEAD_VERSION, p);
+    decode(features, p);
+    DECODE_FINISH(p);
+  }
+};
+WRITE_CLASS_ENCODER(mon_feature_t)
+
+namespace ceph::features::mon {
+  // one bit per feature
+  constexpr mon_feature_t FEATURE_KRAKEN(1ULL << 0);
+  constexpr mon_feature_t FEATURE_LUMINOUS(1ULL << 1);
+  constexpr mon_feature_t FEATURE_MIMIC(1ULL << 2);
+  constexpr mon_feature_t FEATURE_OSDMAP_PRUNE(1ULL << 3);
+  constexpr mon_feature_t FEATURE_NAUTILUS(1ULL << 4);
+  constexpr mon_feature_t FEATURE_OCTOPUS(1ULL << 5);
+  constexpr mon_feature_t FEATURE_PACIFIC(1ULL << 6);
+  // elector pinging and CONNECTIVITY mode:
+  constexpr mon_feature_t FEATURE_PINGING(1ULL << 7);
+
+  constexpr mon_feature_t FEATURE_RESERVED(1ULL << 63);
+  constexpr mon_feature_t FEATURE_NONE(0ULL);
+
+  /**
+   * All the features this monitor supports
+   *
+   * If there's a feature above, it should be OR'ed to this list.
+   */
+  constexpr mon_feature_t get_supported() {
+    return FEATURE_NONE
+      | FEATURE_KRAKEN
+      | FEATURE_LUMINOUS
+      | FEATURE_MIMIC
+      | FEATURE_OSDMAP_PRUNE
+      | FEATURE_NAUTILUS
+      | FEATURE_OCTOPUS
+      | FEATURE_PACIFIC
+      | FEATURE_PINGING;
+  }
+
+  /**
+   * All the features that, once set, cannot be removed.
+   *
+   * Features should only be added to this list if you want to make
+   * sure downgrades are not possible after a quorum supporting all
+   * these features has been formed.
+   *
+   * Any feature in this list will be automatically set on the monmap's
+   * features once all the monitors in the quorum support it.
+   */
+  constexpr mon_feature_t get_persistent() {
+    return FEATURE_NONE
+      | FEATURE_KRAKEN
+      | FEATURE_LUMINOUS
+      | FEATURE_MIMIC
+      | FEATURE_OSDMAP_PRUNE
+      | FEATURE_NAUTILUS
+      | FEATURE_OCTOPUS
+      | FEATURE_PACIFIC
+      | FEATURE_PINGING;
+  }
+
+  /// The features returned as optional; currently just osdmap pruning.
+  constexpr mon_feature_t get_optional() {
+    return FEATURE_NONE
+      | FEATURE_OSDMAP_PRUNE;
+  }
+
+  /// Inverse lookup of get_feature_name(); defined further down.
+  static inline mon_feature_t get_feature_by_name(const std::string &n);
+} // namespace ceph::features::mon
+
+/// Map a monitor feature set to the newest release whose feature bit it
+/// contains, or ceph_release_t::unknown when none of the release bits
+/// (kraken through pacific) is present.
+static inline ceph_release_t infer_ceph_release_from_mon_features(mon_feature_t f)
+{
+  namespace mf = ceph::features::mon;
+
+  struct candidate {
+    mon_feature_t feature;
+    ceph_release_t release;
+  };
+  // newest first: the first feature found in f decides the answer
+  const candidate newest_first[] = {
+    { mf::FEATURE_PACIFIC,  ceph_release_t::pacific },
+    { mf::FEATURE_OCTOPUS,  ceph_release_t::octopus },
+    { mf::FEATURE_NAUTILUS, ceph_release_t::nautilus },
+    { mf::FEATURE_MIMIC,    ceph_release_t::mimic },
+    { mf::FEATURE_LUMINOUS, ceph_release_t::luminous },
+    { mf::FEATURE_KRAKEN,   ceph_release_t::kraken },
+  };
+
+  for (const auto& c : newest_first) {
+    if (f.contains_all(c.feature)) {
+      return c.release;
+    }
+  }
+  return ceph_release_t::unknown;
+}
+
+/// Name of the single feature whose raw value is exactly @p b; "unknown"
+/// for anything else (including combinations of several bits).
+static inline const char *ceph::features::mon::get_feature_name(uint64_t b) {
+  const mon_feature_t wanted(b);
+
+  // listed in bit order; the constants are distinct bits, so at most one
+  // comparison can match
+  if (wanted == FEATURE_KRAKEN)
+    return "kraken";
+  if (wanted == FEATURE_LUMINOUS)
+    return "luminous";
+  if (wanted == FEATURE_MIMIC)
+    return "mimic";
+  if (wanted == FEATURE_OSDMAP_PRUNE)
+    return "osdmap-prune";
+  if (wanted == FEATURE_NAUTILUS)
+    return "nautilus";
+  if (wanted == FEATURE_OCTOPUS)
+    return "octopus";
+  if (wanted == FEATURE_PACIFIC)
+    return "pacific";
+  if (wanted == FEATURE_PINGING)
+    return "elector-pinging";
+  if (wanted == FEATURE_RESERVED)
+    return "reserved";
+
+  return "unknown";
+}
+
+/**
+ * Look a feature up by its name.
+ *
+ * Accepts every name get_feature_name() can produce, so that a printed
+ * feature name round-trips.  The pinging feature is printed as
+ * "elector-pinging"; the older spelling "feature-pinging" that this lookup
+ * used to require is still accepted for compatibility.
+ *
+ * @param n feature name
+ * @returns the matching feature, or FEATURE_NONE if @p n is not recognised
+ */
+inline mon_feature_t ceph::features::mon::get_feature_by_name(const std::string &n) {
+
+  if (n == "kraken") {
+    return FEATURE_KRAKEN;
+  } else if (n == "luminous") {
+    return FEATURE_LUMINOUS;
+  } else if (n == "mimic") {
+    return FEATURE_MIMIC;
+  } else if (n == "osdmap-prune") {
+    return FEATURE_OSDMAP_PRUNE;
+  } else if (n == "nautilus") {
+    return FEATURE_NAUTILUS;
+  } else if (n == "elector-pinging" || n == "feature-pinging") {
+    return FEATURE_PINGING;
+  } else if (n == "octopus") {
+    return FEATURE_OCTOPUS;
+  } else if (n == "pacific") {
+    return FEATURE_PACIFIC;
+  } else if (n == "reserved") {
+    return FEATURE_RESERVED;
+  }
+  return FEATURE_NONE;
+}
+
+/// Print @p f as "mon_feature_t(" + f.print() output + ")".
+inline std::ostream& operator<<(std::ostream& out, const mon_feature_t& f) {
+  out << "mon_feature_t(";
+  f.print(out);
+  return out << ")";
+}
+
+
+/// A progress report: a message, a completion fraction and a flag.
+struct ProgressEvent {
+  std::string message;                  ///< event description
+  float progress = 0.0f;                ///< [0..1]
+  // Initialised so that a default-constructed event never encodes or dumps
+  // an indeterminate value.
+  bool add_to_ceph_s = false;
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(message, bl);
+    encode(progress, bl);
+    encode(add_to_ceph_s, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(2, p);
+    decode(message, p);
+    decode(progress, p);
+    if (struct_v >= 2) {
+      decode(add_to_ceph_s, p);
+    } else {
+      // v1 payloads carry no flag: derive it from the message, and always
+      // assign it so the result does not depend on the object's prior state.
+      add_to_ceph_s = !message.empty();
+    }
+    DECODE_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_string("message", message);
+    f->dump_float("progress", progress);
+    f->dump_bool("add_to_ceph_s", add_to_ceph_s);
+  }
+};
+WRITE_CLASS_ENCODER(ProgressEvent)
+
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/mon
parent	Initial commit. (diff)
download	ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip