Diffstat
67 files changed, 18455 insertions, 0 deletions
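The patch below adds the ceph-mgr ActivePyModule/ActivePyModules plumbing, which repeatedly follows the same embedded-CPython pattern: take the subinterpreter GIL, call a bound method with PyObject_CallMethod, treat a NULL return as a raised Python exception, and drop the reference on success. For reference, here is a minimal standalone sketch of that call-and-refcount pattern using plain CPython only; it is not part of the patch, the module name my_mgr_module and class Module are hypothetical, and Ceph's Gil/derr/handle_pyerror plumbing is replaced with PyErr_Print() purely for illustration.

// Sketch (assumption: plain CPython, no Ceph plumbing) of the pattern used
// by ActivePyModule::notify() and friends: call a bound method, treat a
// NULL result as a Python exception, and release the reference on success.
#include <Python.h>

static void call_notify(PyObject *instance,
                        const char *notify_type,
                        const char *notify_id)
{
  // Counterpart of PyObject_CallMethod(pClassInstance, "notify", "(ss)", ...);
  // the patch const_cast's the literals only to satisfy older Python headers.
  PyObject *ret = PyObject_CallMethod(instance, "notify", "(ss)",
                                      notify_type, notify_id);
  if (ret != NULL) {
    Py_DECREF(ret);   // success: discard the (usually None) result
  } else {
    PyErr_Print();    // the patch formats this via handle_pyerror()/derr
  }
}

int main()
{
  Py_Initialize();
  // Hypothetical module/class names, used only to make the sketch runnable.
  PyObject *mod = PyImport_ImportModule("my_mgr_module");
  if (mod != NULL) {
    PyObject *cls = PyObject_GetAttrString(mod, "Module");
    if (cls != NULL) {
      PyObject *inst = PyObject_CallObject(cls, NULL);
      if (inst != NULL) {
        call_notify(inst, "osd_map", "42");
        Py_DECREF(inst);
      } else {
        PyErr_Print();
      }
      Py_DECREF(cls);
    } else {
      PyErr_Print();
    }
    Py_DECREF(mod);
  } else {
    PyErr_Print();
  }
  Py_Finalize();
  return 0;
}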
diff --git a/src/mgr/ActivePyModule.cc b/src/mgr/ActivePyModule.cc new file mode 100644 index 000000000..c776acfd0 --- /dev/null +++ b/src/mgr/ActivePyModule.cc @@ -0,0 +1,275 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "PyFormatter.h" + +#include "common/debug.h" +#include "mon/MonCommand.h" + +#include "ActivePyModule.h" +#include "MgrSession.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +int ActivePyModule::load(ActivePyModules *py_modules) +{ + ceph_assert(py_modules); + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. + auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr); + auto pModuleName = PyUnicode_FromString(get_name().c_str()); + auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr); + + pClassInstance = PyObject_CallObject(py_module->pClass, pArgs); + Py_DECREF(pModuleName); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + } + + return 0; +} + +void ActivePyModule::notify(const std::string ¬ify_type, const std::string ¬ify_id) +{ + if (is_dead()) { + dout(5) << "cancelling notify " << notify_type << " " << notify_id << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("notify"), const_cast<char*>("(ss)"), + notify_type.c_str(), notify_id.c_str()); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << get_name() << ".notify:" << dendl; + derr << handle_pyerror() << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +void ActivePyModule::notify_clog(const LogEntry &log_entry) +{ + if (is_dead()) { + dout(5) << "cancelling notify_clog" << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Construct python-ized LogEntry + PyFormatter f; + log_entry.dump(&f); + auto py_log_entry = f.get(); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("notify"), const_cast<char*>("(sN)"), + "clog", py_log_entry); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << get_name() << ".notify_clog:" << dendl; + derr << handle_pyerror() << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +bool ActivePyModule::method_exists(const std::string &method) const +{ + Gil 
gil(py_module->pMyThreadState, true); + + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + if (boundMethod == nullptr) { + return false; + } else { + Py_DECREF(boundMethod); + return true; + } +} + +PyObject *ActivePyModule::dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + ceph_assert(err != nullptr); + + // Rather than serializing arguments, pass the CPython objects. + // Works because we happen to know that the subinterpreter + // implementation shares a GIL, allocator, deallocator and GC state, so + // it's okay to pass the objects between subinterpreters. + // But in future this might involve serialization to support a CSP-aware + // future Python interpreter a la PEP554 + + Gil gil(py_module->pMyThreadState, true); + + // Fire the receiving method + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + + // Caller should have done method_exists check first! + ceph_assert(boundMethod != nullptr); + + dout(20) << "Calling " << py_module->get_name() + << "." << method << "..." << dendl; + + auto remoteResult = PyObject_Call(boundMethod, + args, kwargs); + Py_DECREF(boundMethod); + + if (remoteResult == nullptr) { + // Because the caller is in a different context, we can't let this + // exception bubble up, need to re-raise it from the caller's + // context later. + *err = handle_pyerror(); + } else { + dout(20) << "Success calling '" << method << "'" << dendl; + } + + return remoteResult; +} + +void ActivePyModule::config_notify() +{ + if (is_dead()) { + dout(5) << "cancelling config_notify" << dendl; + return; + } + + Gil gil(py_module->pMyThreadState, true); + dout(20) << "Calling " << py_module->get_name() << "._config_notify..." + << dendl; + auto remoteResult = PyObject_CallMethod(pClassInstance, + const_cast<char*>("_config_notify"), + (char*)NULL); + if (remoteResult != nullptr) { + Py_DECREF(remoteResult); + } +} + +int ActivePyModule::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + ceph_assert(ss != nullptr); + ceph_assert(ds != nullptr); + + if (pClassInstance == nullptr) { + // Not the friendliest error string, but we could only + // hit this in quite niche cases, if at all. + *ss << "Module not instantiated"; + return -EINVAL; + } + + Gil gil(py_module->pMyThreadState, true); + + PyFormatter f; + TOPNSPC::common::cmdmap_dump(cmdmap, &f); + PyObject *py_cmd = f.get(); + string instr; + inbuf.begin().copy(inbuf.length(), instr); + + ceph_assert(m_session == nullptr); + m_command_perms = module_command.perm; + m_session = &session; + + auto pResult = PyObject_CallMethod(pClassInstance, + const_cast<char*>("_handle_command"), const_cast<char*>("s#O"), + instr.c_str(), instr.length(), py_cmd); + + m_command_perms.clear(); + m_session = nullptr; + Py_DECREF(py_cmd); + + int r = 0; + if (pResult != NULL) { + if (PyTuple_Size(pResult) != 3) { + derr << "module '" << py_module->get_name() << "' command handler " + "returned wrong type!" 
<< dendl; + r = -EINVAL; + } else { + r = PyLong_AsLong(PyTuple_GetItem(pResult, 0)); + *ds << PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 1)); + *ss << PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 2)); + } + + Py_DECREF(pResult); + } else { + derr << "module '" << py_module->get_name() << "' command handler " + "threw exception: " << peek_pyerror() << dendl; + *ds << ""; + *ss << handle_pyerror(); + r = -EINVAL; + } + + return r; +} + +void ActivePyModule::get_health_checks(health_check_map_t *checks) +{ + if (is_dead()) { + dout(5) << "cancelling get_health_checks" << dendl; + return; + } + checks->merge(health_checks); +} + +bool ActivePyModule::is_authorized( + const std::map<std::string, std::string>& arguments) const { + if (m_session == nullptr) { + return false; + } + + // No need to pass command prefix here since that would have already been + // tested before command invokation. Instead, only test for service/module + // arguments as defined by the module itself. + MonCommand mon_command {"", "", "", m_command_perms}; + return m_session->caps.is_capable(nullptr, m_session->entity_name, "py", + py_module->get_name(), "", arguments, + mon_command.requires_perm('r'), + mon_command.requires_perm('w'), + mon_command.requires_perm('x'), + m_session->get_peer_addr()); +} diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h new file mode 100644 index 000000000..1cbf6d18a --- /dev/null +++ b/src/mgr/ActivePyModule.h @@ -0,0 +1,102 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#pragma once + +// Python.h comes first because otherwise it clobbers ceph's assert +#include "Python.h" + +#include "common/cmdparse.h" +#include "common/LogEntry.h" +#include "common/Thread.h" +#include "mon/health_check.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" + +#include <vector> +#include <string> + + +class ActivePyModule; +class ActivePyModules; +class MgrSession; +class ModuleCommand; + +class ActivePyModule : public PyModuleRunner +{ +private: + health_check_map_t health_checks; + + // Optional, URI exposed by plugins that implement serve() + std::string uri; + + std::string m_command_perms; + const MgrSession* m_session = nullptr; + +public: + ActivePyModule(const PyModuleRef &py_module_, + LogChannelRef clog_) + : PyModuleRunner(py_module_, clog_) + {} + + int load(ActivePyModules *py_modules); + void notify(const std::string ¬ify_type, const std::string ¬ify_id); + void notify_clog(const LogEntry &le); + + bool method_exists(const std::string &method) const; + + PyObject *dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + + bool set_health_checks(health_check_map_t&& c) { + // when health checks change a report is immediately sent to the monitors. + // currently modules have static health check details, but this equality + // test could be made smarter if too much noise shows up in the future. 
+ bool changed = health_checks != c; + health_checks = std::move(c); + return changed; + } + void get_health_checks(health_check_map_t *checks); + void config_notify(); + + void set_uri(const std::string &str) + { + uri = str; + } + + std::string get_uri() const + { + return uri; + } + + bool is_authorized(const std::map<std::string, std::string>& arguments) const; + +}; + +std::string handle_pyerror(); + diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc new file mode 100644 index 000000000..e62e93b30 --- /dev/null +++ b/src/mgr/ActivePyModules.cc @@ -0,0 +1,1513 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +// Include this first to get python headers earlier +#include "Gil.h" + +#include "common/errno.h" +#include "include/stringify.h" + +#include "PyFormatter.h" + +#include "osd/OSDMap.h" +#include "mon/MonMap.h" +#include "osd/osd_types.h" +#include "mgr/MgrContext.h" +#include "mgr/TTLCache.h" +#include "mgr/mgr_perf_counters.h" + +// For ::mgr_store_prefix +#include "PyModule.h" +#include "PyModuleRegistry.h" +#include "PyUtil.h" + +#include "ActivePyModules.h" +#include "DaemonKey.h" +#include "DaemonServer.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +ActivePyModules::ActivePyModules( + PyModuleConfig &module_config_, + std::map<std::string, std::string> store_data, + bool mon_provides_kv_sub, + DaemonStateIndex &ds, ClusterState &cs, + MonClient &mc, LogChannelRef clog_, + LogChannelRef audit_clog_, Objecter &objecter_, + Client &client_, Finisher &f, DaemonServer &server, + PyModuleRegistry &pmr) +: module_config(module_config_), daemon_state(ds), cluster_state(cs), + monc(mc), clog(clog_), audit_clog(audit_clog_), objecter(objecter_), + client(client_), finisher(f), + cmd_finisher(g_ceph_context, "cmd_finisher", "cmdfin"), + server(server), py_module_registry(pmr) +{ + store_cache = std::move(store_data); + // we can only trust our ConfigMap if the mon cluster has provided + // kv sub since our startup. + have_local_config_map = mon_provides_kv_sub; + _refresh_config_map(); + cmd_finisher.start(); +} + +ActivePyModules::~ActivePyModules() = default; + +void ActivePyModules::dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f) +{ + f->dump_string("hostname", hostname); + f->open_array_section("services"); + std::string ceph_version; + + for (const auto &[key, state] : dmc) { + std::string id; + without_gil([&ceph_version, &id, state=state] { + std::lock_guard l(state->lock); + // TODO: pick the highest version, and make sure that + // somewhere else (during health reporting?) 
we are + // indicating to the user if we see mixed versions + auto ver_iter = state->metadata.find("ceph_version"); + if (ver_iter != state->metadata.end()) { + ceph_version = state->metadata.at("ceph_version"); + } + if (state->metadata.find("id") != state->metadata.end()) { + id = state->metadata.at("id"); + } + }); + f->open_object_section("service"); + f->dump_string("type", key.type); + f->dump_string("id", key.name); + f->dump_string("ceph_version", ceph_version); + if (!id.empty()) { + f->dump_string("name", id); + } + f->close_section(); + } + f->close_section(); + + f->dump_string("ceph_version", ceph_version); +} + +PyObject *ActivePyModules::get_server_python(const std::string &hostname) +{ + const auto dmc = without_gil([&]{ + std::lock_guard l(lock); + dout(10) << " (" << hostname << ")" << dendl; + return daemon_state.get_by_server(hostname); + }); + PyFormatter f; + dump_server(hostname, dmc, &f); + return f.get(); +} + + +PyObject *ActivePyModules::list_servers_python() +{ + dout(10) << " >" << dendl; + + without_gil_t no_gil; + return daemon_state.with_daemons_by_server([this, &no_gil] + (const std::map<std::string, DaemonStateCollection> &all) { + with_gil_t with_gil{no_gil}; + PyFormatter f(false, true); + for (const auto &[hostname, daemon_state] : all) { + f.open_object_section("server"); + dump_server(hostname, daemon_state, &f); + f.close_section(); + } + return f.get(); + }); +} + +PyObject *ActivePyModules::get_metadata_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey{svc_type, svc_id}); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + PyFormatter f; + f.dump_string("hostname", metadata->hostname); + for (const auto &[key, val] : metadata->metadata) { + f.dump_string(key, val); + } + + return f.get(); +} + +PyObject *ActivePyModules::get_daemon_status_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey{svc_type, svc_id}); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + PyFormatter f; + for (const auto &[daemon, status] : metadata->service_status) { + f.dump_string(daemon, status); + } + return f.get(); +} + +void ActivePyModules::update_cache_metrics() { + auto hit_miss_ratio = ttl_cache.get_hit_miss_ratio(); + perfcounter->set(l_mgr_cache_hit, hit_miss_ratio.first); + perfcounter->set(l_mgr_cache_miss, hit_miss_ratio.second); +} + +PyObject *ActivePyModules::cacheable_get_python(const std::string &what) +{ + uint64_t ttl_seconds = g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds"); + if(ttl_seconds > 0) { + ttl_cache.set_ttl(ttl_seconds); + try{ + PyObject* cached = ttl_cache.get(what); + update_cache_metrics(); + return cached; + } catch (std::out_of_range& e) {} + } + + PyObject *obj = get_python(what); + if(ttl_seconds && ttl_cache.is_cacheable(what)) { + ttl_cache.insert(what, obj); + Py_INCREF(obj); + } + update_cache_metrics(); + return obj; +} + +PyObject *ActivePyModules::get_python(const std::string &what) +{ + uint64_t ttl_seconds = g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds"); + + PyFormatter pf; + PyJSONFormatter jf; + // Use PyJSONFormatter if TTL cache is enabled. + Formatter &f = ttl_seconds ? 
(Formatter&)jf : (Formatter&)pf; + + if (what == "fs_map") { + without_gil_t no_gil; + cluster_state.with_fsmap([&](const FSMap &fsmap) { + no_gil.acquire_gil(); + fsmap.dump(&f); + }); + } else if (what == "osdmap_crush_map_text") { + without_gil_t no_gil; + bufferlist rdata; + cluster_state.with_osdmap([&](const OSDMap &osd_map){ + osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT); + }); + std::string crush_text = rdata.to_str(); + with_gil_t with_gil{no_gil}; + return PyUnicode_FromString(crush_text.c_str()); + } else if (what.substr(0, 7) == "osd_map") { + without_gil_t no_gil; + cluster_state.with_osdmap([&](const OSDMap &osd_map){ + no_gil.acquire_gil(); + if (what == "osd_map") { + osd_map.dump(&f); + } else if (what == "osd_map_tree") { + osd_map.print_tree(&f, nullptr); + } else if (what == "osd_map_crush") { + osd_map.crush->dump(&f); + } + }); + } else if (what == "modified_config_options") { + without_gil_t no_gil; + auto all_daemons = daemon_state.get_all(); + set<string> names; + for (auto& [key, daemon] : all_daemons) { + std::lock_guard l(daemon->lock); + for (auto& [name, valmap] : daemon->config) { + names.insert(name); + } + } + with_gil_t with_gil{no_gil}; + f.open_array_section("options"); + for (auto& name : names) { + f.dump_string("name", name); + } + f.close_section(); + } else if (what.substr(0, 6) == "config") { + if (what == "config_options") { + g_conf().config_options(&f); + } else if (what == "config") { + g_conf().show_config(&f); + } + } else if (what == "mon_map") { + without_gil_t no_gil; + cluster_state.with_monmap([&](const MonMap &monmap) { + no_gil.acquire_gil(); + monmap.dump(&f); + }); + } else if (what == "service_map") { + without_gil_t no_gil; + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + no_gil.acquire_gil(); + service_map.dump(&f); + }); + } else if (what == "osd_metadata") { + without_gil_t no_gil; + auto dmc = daemon_state.get_by_service("osd"); + for (const auto &[key, state] : dmc) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&f, &name=key.name, state=state] { + f.open_object_section(name.c_str()); + f.dump_string("hostname", state->hostname); + for (const auto &[name, val] : state->metadata) { + f.dump_string(name.c_str(), val); + } + f.close_section(); + }); + } + } else if (what == "mds_metadata") { + without_gil_t no_gil; + auto dmc = daemon_state.get_by_service("mds"); + for (const auto &[key, state] : dmc) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&f, &name=key.name, state=state] { + f.open_object_section(name.c_str()); + f.dump_string("hostname", state->hostname); + for (const auto &[name, val] : state->metadata) { + f.dump_string(name.c_str(), val); + } + f.close_section(); + }); + } + } else if (what == "pg_summary") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&f, &no_gil](const PGMap &pg_map) { + std::map<std::string, std::map<std::string, uint32_t> > osds; + std::map<std::string, std::map<std::string, uint32_t> > pools; + std::map<std::string, uint32_t> all; + for (const auto &i : pg_map.pg_stat) { + const auto pool = i.first.m_pool; + const std::string state = pg_state_string(i.second.state); + // Insert to per-pool map + pools[stringify(pool)][state]++; + for (const auto &osd_id : i.second.acting) { + osds[stringify(osd_id)][state]++; + } + all[state]++; + } + with_gil_t with_gil{no_gil}; + f.open_object_section("by_osd"); + for (const auto &i : osds) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + 
f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("by_pool"); + for (const auto &i : pools) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("all"); + for (const auto &i : all) { + f.dump_int(i.first.c_str(), i.second); + } + f.close_section(); + f.open_object_section("pg_stats_sum"); + pg_map.pg_sum.dump(&f); + f.close_section(); + } + ); + } else if (what == "pg_status") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + pg_map.print_summary(&f, nullptr); + } + ); + } else if (what == "pg_dump") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + pg_map.dump(&f, false); + } + ); + } else if (what == "devices") { + without_gil_t no_gil; + daemon_state.with_devices2( + [&] { + with_gil(no_gil, [&] { f.open_array_section("devices"); }); + }, + [&](const DeviceState &dev) { + with_gil(no_gil, [&] { f.dump_object("device", dev); }); + }); + with_gil(no_gil, [&] { + f.close_section(); + }); + } else if (what.size() > 7 && + what.substr(0, 7) == "device ") { + without_gil_t no_gil; + string devid = what.substr(7); + if (!daemon_state.with_device(devid, + [&] (const DeviceState& dev) { + with_gil_t with_gil{no_gil}; + f.dump_object("device", dev); + })) { + // device not found + } + } else if (what == "io_rate") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + pg_map.dump_delta(&f); + } + ); + } else if (what == "df") { + without_gil_t no_gil; + cluster_state.with_osdmap_and_pgmap( + [&]( + const OSDMap& osd_map, + const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + pg_map.dump_cluster_stats(nullptr, &f, true); + pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true); + }); + } else if (what == "pg_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pg_stats(&f, false); + }); + } else if (what == "pool_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pool_stats(&f); + }); + } else if (what == "pg_ready") { + server.dump_pg_ready(&f); + } else if (what == "pg_progress") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pg_progress(&f); + server.dump_pg_ready(&f); + }); + } else if (what == "osd_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_osd_stats(&f, false); + }); + } else if (what == "osd_ping_times") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_osd_ping_times(&f); + }); + } else if (what == "osd_pool_stats") { + int64_t poolid = -ENOENT; + without_gil_t no_gil; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, + const PGMap& pg_map) { + with_gil_t with_gil{no_gil}; + f.open_array_section("pool_stats"); + for (auto &p : osdmap.get_pools()) { + poolid = p.first; + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, &f, nullptr); + } + f.close_section(); + }); + } else if (what == "health") { + without_gil_t no_gil; + cluster_state.with_health([&](const ceph::bufferlist &health_json) { + no_gil.acquire_gil(); + 
f.dump_string("json", health_json.to_str()); + }); + } else if (what == "mon_status") { + without_gil_t no_gil; + cluster_state.with_mon_status( + [&](const ceph::bufferlist &mon_status_json) { + with_gil_t with_gil{no_gil}; + f.dump_string("json", mon_status_json.to_str()); + }); + } else if (what == "mgr_map") { + without_gil_t no_gil; + cluster_state.with_mgrmap([&](const MgrMap &mgr_map) { + no_gil.acquire_gil(); + mgr_map.dump(&f); + }); + } else if (what == "mgr_ips") { + entity_addrvec_t myaddrs = server.get_myaddrs(); + f.open_array_section("ips"); + std::set<std::string> did; + for (auto& i : myaddrs.v) { + std::string ip = i.ip_only_to_str(); + if (auto [where, inserted] = did.insert(ip); inserted) { + f.dump_string("ip", ip); + } + } + f.close_section(); + } else if (what == "have_local_config_map") { + f.dump_bool("have_local_config_map", have_local_config_map); + } else if (what == "active_clean_pgs"){ + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + with_gil_t with_gil{no_gil}; + f.open_array_section("pg_stats"); + for (auto &i : pg_map.pg_stat) { + const auto state = i.second.state; + const auto pgid_raw = i.first; + const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed); + const auto reported_epoch = i.second.reported_epoch; + if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) { + f.open_object_section("pg_stat"); + f.dump_string("pgid", pgid); + f.dump_string("state", pg_state_string(state)); + f.dump_unsigned("reported_epoch", reported_epoch); + f.close_section(); + } + } + f.close_section(); + const auto num_pg = pg_map.num_pg; + f.dump_unsigned("total_num_pgs", num_pg); + }); + } else { + derr << "Python module requested unknown data '" << what << "'" << dendl; + Py_RETURN_NONE; + } + if(ttl_seconds) { + return jf.get(); + } else { + return pf.get(); + } +} + +void ActivePyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + + const auto name = py_module->get_name(); + auto active_module = std::make_shared<ActivePyModule>(py_module, clog); + + pending_modules.insert(name); + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + finisher.queue(new LambdaContext([this, active_module, name](int) { + int r = active_module->load(this); + std::lock_guard l(lock); + pending_modules.erase(name); + if (r != 0) { + derr << "Failed to run module in active mode ('" << name << "')" + << dendl; + } else { + auto em = modules.emplace(name, active_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + active_module->thread.create(active_module->get_thread_name()); + } + })); +} + +void ActivePyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Signal modules to drop out of serve() and/or tear down resources + for (auto& [name, module] : modules) { + lock.unlock(); + dout(10) << "calling module " << name << " shutdown()" << dendl; + module->shutdown(); + dout(10) << "module " << name << " shutdown() returned" << dendl; + lock.lock(); + } + + // For modules implementing serve(), finish the threads where we + // were running that. 
+ for (auto& [name, module] : modules) { + lock.unlock(); + dout(10) << "joining module " << name << dendl; + module->thread.join(); + dout(10) << "joined module " << name << dendl; + lock.lock(); + } + + cmd_finisher.wait_for_empty(); + cmd_finisher.stop(); + + modules.clear(); +} + +void ActivePyModules::notify_all(const std::string ¬ify_type, + const std::string ¬ify_id) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all " << notify_type << dendl; + for (auto& [name, module] : modules) { + if (!py_module_registry.should_notify(name, notify_type)) { + continue; + } + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + dout(15) << "queuing notify (" << notify_type << ") to " << name << dendl; + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + finisher.queue(new LambdaContext([module=module, notify_type, notify_id] + (int r){ + module->notify(notify_type, notify_id); + })); + } +} + +void ActivePyModules::notify_all(const LogEntry &log_entry) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all (clog)" << dendl; + for (auto& [name, module] : modules) { + if (!py_module_registry.should_notify(name, "clog")) { + continue; + } + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + // + // Note intentional use of non-reference lambda binding on + // log_entry: we take a copy because caller's instance is + // probably ephemeral. + dout(15) << "queuing notify (clog) to " << name << dendl; + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + finisher.queue(new LambdaContext([module=module, log_entry](int r){ + module->notify_clog(log_entry); + })); + } +} + +bool ActivePyModules::get_store(const std::string &module_name, + const std::string &key, std::string *val) const +{ + without_gil_t no_gil; + std::lock_guard l(lock); + + const std::string global_key = PyModule::mgr_store_prefix + + module_name + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + auto i = store_cache.find(global_key); + if (i != store_cache.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + auto mod_iter = modules.find(other_module); + ceph_assert(mod_iter != modules.end()); + + return mod_iter->second->dispatch_remote(method, args, kwargs, err); +} + +bool ActivePyModules::get_config(const std::string &module_name, + const std::string &key, std::string *val) const +{ + const std::string global_key = "mgr/" + module_name + "/" + key; + + dout(20) << " key: " << global_key << dendl; + + std::lock_guard lock(module_config.lock); + + auto i = module_config.config.find(global_key); + if (i != module_config.config.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::get_typed_config( + const std::string &module_name, + const std::string &key, + const std::string &prefix) const +{ + without_gil_t no_gil; + std::string value; + std::string final_key; + bool found = false; + if (prefix.size()) { + final_key = prefix + "/" + key; + found = get_config(module_name, final_key, &value); + } + if (!found) { + final_key = key; + found = get_config(module_name, final_key, &value); + } + if (found) { + PyModuleRef module = py_module_registry.get_module(module_name); + with_gil_t 
with_gil{no_gil}; + if (!module) { + derr << "Module '" << module_name << "' is not available" << dendl; + Py_RETURN_NONE; + } + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << __func__ << " " << final_key << " found: " << value << dendl; + dout(10) << __func__ << " " << final_key << " found" << dendl; + return module->get_typed_option_value(key, value); + } + if (prefix.size()) { + dout(10) << " [" << prefix << "/]" << key << " not found " + << dendl; + } else { + dout(10) << " " << key << " not found " << dendl; + } + with_gil_t with_gil{no_gil}; + Py_RETURN_NONE; +} + +PyObject *ActivePyModules::get_store_prefix(const std::string &module_name, + const std::string &prefix) const +{ + without_gil_t no_gil; + std::lock_guard l(lock); + std::lock_guard lock(module_config.lock); + + const std::string base_prefix = PyModule::mgr_store_prefix + + module_name + "/"; + const std::string global_prefix = base_prefix + prefix; + dout(4) << __func__ << " prefix: " << global_prefix << dendl; + + return with_gil(no_gil, [&] { + PyFormatter f; + for (auto p = store_cache.lower_bound(global_prefix); + p != store_cache.end() && p->first.find(global_prefix) == 0; ++p) { + f.dump_string(p->first.c_str() + base_prefix.size(), p->second); + } + return f.get(); + }); +} + +void ActivePyModules::set_store(const std::string &module_name, + const std::string &key, const boost::optional<std::string>& val) +{ + const std::string global_key = PyModule::mgr_store_prefix + + module_name + "/" + key; + + Command set_cmd; + { + std::lock_guard l(lock); + + // NOTE: this isn't strictly necessary since we'll also get an MKVData + // update from the mon due to our subscription *before* our command is acked. + if (val) { + store_cache[global_key] = *val; + } else { + store_cache.erase(global_key); + } + + std::ostringstream cmd_json; + JSONFormatter jf; + jf.open_object_section("cmd"); + if (val) { + jf.dump_string("prefix", "config-key set"); + jf.dump_string("key", global_key); + jf.dump_string("val", *val); + } else { + jf.dump_string("prefix", "config-key del"); + jf.dump_string("key", global_key); + } + jf.close_section(); + jf.flush(cmd_json); + set_cmd.run(&monc, cmd_json.str()); + } + set_cmd.wait(); + + if (set_cmd.r != 0) { + // config-key set will fail if mgr's auth key has insufficient + // permission to set config keys + // FIXME: should this somehow raise an exception back into Python land? 
+ dout(0) << "`config-key set " << global_key << " " << val << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl; + } +} + +void ActivePyModules::set_config(const std::string &module_name, + const std::string &key, const boost::optional<std::string>& val) +{ + module_config.set_config(&monc, module_name, key, val); +} + +std::map<std::string, std::string> ActivePyModules::get_services() const +{ + std::map<std::string, std::string> result; + std::lock_guard l(lock); + for (const auto& [name, module] : modules) { + std::string svc_str = module->get_uri(); + if (!svc_str.empty()) { + result[name] = svc_str; + } + } + + return result; +} + +void ActivePyModules::update_kv_data( + const std::string prefix, + bool incremental, + const map<std::string, boost::optional<bufferlist>, std::less<>>& data) +{ + std::lock_guard l(lock); + bool do_config = false; + if (!incremental) { + dout(10) << "full update on " << prefix << dendl; + auto p = store_cache.lower_bound(prefix); + while (p != store_cache.end() && p->first.find(prefix) == 0) { + dout(20) << " rm prior " << p->first << dendl; + p = store_cache.erase(p); + } + } else { + dout(10) << "incremental update on " << prefix << dendl; + } + for (auto& i : data) { + if (i.second) { + dout(20) << " set " << i.first << " = " << i.second->to_str() << dendl; + store_cache[i.first] = i.second->to_str(); + } else { + dout(20) << " rm " << i.first << dendl; + store_cache.erase(i.first); + } + if (i.first.find("config/") == 0) { + do_config = true; + } + } + if (do_config) { + _refresh_config_map(); + } +} + +void ActivePyModules::_refresh_config_map() +{ + dout(10) << dendl; + config_map.clear(); + for (auto p = store_cache.lower_bound("config/"); + p != store_cache.end() && p->first.find("config/") == 0; + ++p) { + string key = p->first.substr(7); + if (key.find("mgr/") == 0) { + // NOTE: for now, we ignore module options. see also ceph_foreign_option_get(). 
+ continue; + } + string value = p->second; + string name; + string who; + config_map.parse_key(key, &name, &who); + + const Option *opt = g_conf().find_option(name); + if (!opt) { + config_map.stray_options.push_back( + std::unique_ptr<Option>( + new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN))); + opt = config_map.stray_options.back().get(); + } + + string err; + int r = opt->pre_validate(&value, &err); + if (r < 0) { + dout(10) << __func__ << " pre-validate failed on '" << name << "' = '" + << value << "' for " << name << dendl; + } + + MaskedOption mopt(opt); + mopt.raw_value = value; + string section_name; + if (who.size() && + !ConfigMap::parse_mask(who, §ion_name, &mopt.mask)) { + derr << __func__ << " invalid mask for key " << key << dendl; + } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) { + dout(10) << __func__ << " NO_MON_UPDATE option '" + << name << "' = '" << value << "' for " << name + << dendl; + } else { + Section *section = &config_map.global;; + if (section_name.size() && section_name != "global") { + if (section_name.find('.') != std::string::npos) { + section = &config_map.by_id[section_name]; + } else { + section = &config_map.by_type[section_name]; + } + } + section->options.insert(make_pair(name, std::move(mopt))); + } + } +} + +PyObject* ActivePyModules::with_perf_counters( + std::function<void(PerfCounterInstance& counter_instance, PerfCounterType& counter_type, PyFormatter& f)> fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const +{ + PyFormatter f; + f.open_array_section(path); + { + without_gil_t no_gil; + std::lock_guard l(lock); + auto metadata = daemon_state.get(DaemonKey{svc_name, svc_id}); + if (metadata) { + std::lock_guard l2(metadata->lock); + if (metadata->perf_counters.instances.count(path)) { + auto counter_instance = metadata->perf_counters.instances.at(path); + auto counter_type = metadata->perf_counters.types.at(path); + with_gil(no_gil, [&] { + fct(counter_instance, counter_type, f); + }); + } else { + dout(4) << "Missing counter: '" << path << "' (" + << svc_name << "." << svc_id << ")" << dendl; + dout(20) << "Paths are:" << dendl; + for (const auto &i : metadata->perf_counters.instances) { + dout(20) << i.first << dendl; + } + } + } else { + dout(4) << "No daemon state for " << svc_name << "." 
<< svc_id << ")" + << dendl; + } + } + f.close_section(); + return f.get(); +} + +PyObject* ActivePyModules::get_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &avg_data = counter_instance.get_data_avg(); + for (const auto &datapoint : avg_data) { + f.open_array_section("datapoint"); + f.dump_float("t", datapoint.t); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + f.close_section(); + } + } else { + const auto &data = counter_instance.get_data(); + for (const auto &datapoint : data) { + f.open_array_section("datapoint"); + f.dump_float("t", datapoint.t); + f.dump_unsigned("v", datapoint.v); + f.close_section(); + } + } + }; + return with_perf_counters(extract_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_latest_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_latest_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &datapoint = counter_instance.get_latest_data_avg(); + f.dump_float("t", datapoint.t); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + } else { + const auto &datapoint = counter_instance.get_latest_data(); + f.dump_float("t", datapoint.t); + f.dump_unsigned("v", datapoint.v); + } + }; + return with_perf_counters(extract_latest_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id) +{ + without_gil_t no_gil; + std::lock_guard l(lock); + + DaemonStateCollection daemons; + + if (svc_type == "") { + daemons = daemon_state.get_all(); + } else if (svc_id.empty()) { + daemons = daemon_state.get_by_service(svc_type); + } else { + auto key = DaemonKey{svc_type, svc_id}; + // so that the below can be a loop in all cases + auto got = daemon_state.get(key); + if (got != nullptr) { + daemons[key] = got; + } + } + + auto f = with_gil(no_gil, [&] { + return PyFormatter(); + }); + if (!daemons.empty()) { + for (auto& [key, state] : daemons) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&, key=ceph::to_string(key), state=state] { + f.open_object_section(key.c_str()); + for (auto ctr_inst_iter : state->perf_counters.instances) { + const auto &counter_name = ctr_inst_iter.first; + f.open_object_section(counter_name.c_str()); + auto type = state->perf_counters.types[counter_name]; + f.dump_string("description", type.description); + if (!type.nick.empty()) { + f.dump_string("nick", type.nick); + } + f.dump_unsigned("type", type.type); + f.dump_unsigned("priority", type.priority); + f.dump_unsigned("units", type.unit); + f.close_section(); + } + f.close_section(); + }); + } + } else { + dout(4) << __func__ << ": No daemon state found for " + << svc_type << "." << svc_id << ")" << dendl; + } + return f.get(); +} + +PyObject *ActivePyModules::get_context() +{ + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + // Construct a capsule containing ceph context. + // Not incrementing/decrementing ref count on the context because + // it's the global one and it has process lifetime. 
+ auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr); + return capsule; +} + +/** + * Helper for our wrapped types that take a capsule in their constructor. + */ +PyObject *construct_with_capsule( + const std::string &module_name, + const std::string &clsname, + void *wrapped) +{ + // Look up the OSDMap type which we will construct + PyObject *module = PyImport_ImportModule(module_name.c_str()); + if (!module) { + derr << "Failed to import python module:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(module); + + PyObject *wrapper_type = PyObject_GetAttrString( + module, (const char*)clsname.c_str()); + if (!wrapper_type) { + derr << "Failed to get python type:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(wrapper_type); + + // Construct a capsule containing an OSDMap. + auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr); + ceph_assert(wrapped_capsule); + + // Construct the python OSDMap + auto pArgs = PyTuple_Pack(1, wrapped_capsule); + auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs); + if (wrapper_instance == nullptr) { + derr << "Failed to construct python OSDMap:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(wrapper_instance != nullptr); + Py_DECREF(pArgs); + Py_DECREF(wrapped_capsule); + + Py_DECREF(wrapper_type); + Py_DECREF(module); + + return wrapper_instance; +} + +PyObject *ActivePyModules::get_osdmap() +{ + auto newmap = without_gil([&] { + OSDMap *newmap = new OSDMap; + cluster_state.with_osdmap([&](const OSDMap& o) { + newmap->deepish_copy_from(o); + }); + return newmap; + }); + return construct_with_capsule("mgr_module", "OSDMap", (void*)newmap); +} + +PyObject *ActivePyModules::get_foreign_config( + const std::string& who, + const std::string& name) +{ + dout(10) << "ceph_foreign_option_get " << who << " " << name << dendl; + + // NOTE: for now this will only work with build-in options, not module options. + const Option *opt = g_conf().find_option(name); + if (!opt) { + dout(4) << "ceph_foreign_option_get " << name << " not found " << dendl; + PyErr_Format(PyExc_KeyError, "option not found: %s", name.c_str()); + return nullptr; + } + + // If the monitors are not yet running pacific, we cannot rely on our local + // ConfigMap + if (!have_local_config_map) { + dout(20) << "mon cluster wasn't pacific when we started: falling back to 'config get'" + << dendl; + without_gil_t no_gil; + Command cmd; + { + std::lock_guard l(lock); + cmd.run( + &monc, + "{\"prefix\": \"config get\","s + + "\"who\": \""s + who + "\","s + + "\"key\": \""s + name + "\"}"); + } + cmd.wait(); + dout(10) << "ceph_foreign_option_get (mon command) " << who << " " << name << " = " + << cmd.outbl.to_str() << dendl; + with_gil_t gil(no_gil); + return get_python_typed_option_value(opt->type, cmd.outbl.to_str()); + } + + // mimic the behavor of mon/ConfigMonitor's 'config get' command + EntityName entity; + if (!entity.from_str(who) && + !entity.from_str(who + ".")) { + dout(5) << "unrecognized entity '" << who << "'" << dendl; + PyErr_Format(PyExc_KeyError, "invalid entity: %s", who.c_str()); + return nullptr; + } + + without_gil_t no_gil; + lock.lock(); + + // FIXME: this is super inefficient, since we generate the entire daemon + // config just to extract one value from it! 
+ + std::map<std::string,std::string,std::less<>> config; + cluster_state.with_osdmap([&](const OSDMap &osdmap) { + map<string,string> crush_location; + string device_class; + if (entity.is_osd()) { + osdmap.crush->get_full_location(who, &crush_location); + int id = atoi(entity.get_id().c_str()); + const char *c = osdmap.crush->get_item_class(id); + if (c) { + device_class = c; + } + dout(10) << __func__ << " crush_location " << crush_location + << " class " << device_class << dendl; + } + + std::map<std::string,pair<std::string,const MaskedOption*>> src; + config = config_map.generate_entity_map( + entity, + crush_location, + osdmap.crush.get(), + device_class, + &src); + }); + + // get a single value + string value; + auto p = config.find(name); + if (p != config.end()) { + value = p->second; + } else { + if (!entity.is_client() && + !boost::get<boost::blank>(&opt->daemon_value)) { + value = Option::to_str(opt->daemon_value); + } else { + value = Option::to_str(opt->value); + } + } + + dout(10) << "ceph_foreign_option_get (configmap) " << who << " " << name << " = " + << value << dendl; + lock.unlock(); + with_gil_t with_gil(no_gil); + return get_python_typed_option_value(opt->type, value); +} + +void ActivePyModules::set_health_checks(const std::string& module_name, + health_check_map_t&& checks) +{ + bool changed = false; + + lock.lock(); + auto p = modules.find(module_name); + if (p != modules.end()) { + changed = p->second->set_health_checks(std::move(checks)); + } + lock.unlock(); + + // immediately schedule a report to be sent to the monitors with the new + // health checks that have changed. This is done asynchronusly to avoid + // blocking python land. ActivePyModules::lock needs to be dropped to make + // lockdep happy: + // + // send_report callers: DaemonServer::lock -> PyModuleRegistery::lock + // active_start: PyModuleRegistry::lock -> ActivePyModules::lock + // + // if we don't release this->lock before calling schedule_tick a cycle is + // formed with the addition of ActivePyModules::lock -> DaemonServer::lock. + // This is still correct as send_report is run asynchronously under + // DaemonServer::lock. 
+ if (changed) + server.schedule_tick(0); +} + +int ActivePyModules::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + lock.lock(); + auto mod_iter = modules.find(module_command.module_name); + if (mod_iter == modules.end()) { + *ss << "Module '" << module_command.module_name << "' is not available"; + lock.unlock(); + return -ENOENT; + } + + lock.unlock(); + return mod_iter->second->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); +} + +void ActivePyModules::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + dout(15) << "getting health checks for " << name << dendl; + module->get_health_checks(checks); + } +} + +void ActivePyModules::update_progress_event( + const std::string& evid, + const std::string& desc, + float progress, + bool add_to_ceph_s) +{ + std::lock_guard l(lock); + auto& pe = progress_events[evid]; + pe.message = desc; + pe.progress = progress; + pe.add_to_ceph_s = add_to_ceph_s; +} + +void ActivePyModules::complete_progress_event(const std::string& evid) +{ + std::lock_guard l(lock); + progress_events.erase(evid); +} + +void ActivePyModules::clear_all_progress_events() +{ + std::lock_guard l(lock); + progress_events.clear(); +} + +void ActivePyModules::get_progress_events(std::map<std::string,ProgressEvent> *events) +{ + std::lock_guard l(lock); + *events = progress_events; +} + +void ActivePyModules::config_notify() +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + dout(15) << "notify (config) " << name << dendl; + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + finisher.queue(new LambdaContext([module=module](int r){ + module->config_notify(); + })); + } +} + +void ActivePyModules::set_uri(const std::string& module_name, + const std::string &uri) +{ + std::lock_guard l(lock); + + dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl; + + modules.at(module_name)->set_uri(uri); +} + +void ActivePyModules::set_device_wear_level(const std::string& devid, + float wear_level) +{ + // update mgr state + map<string,string> meta; + daemon_state.with_device( + devid, + [wear_level, &meta] (DeviceState& dev) { + dev.set_wear_level(wear_level); + meta = dev.metadata; + }); + + // tell mon + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + + Command set_cmd; + set_cmd.run(&monc, cmd, json); + set_cmd.wait(); +} + +MetricQueryID ActivePyModules::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit) +{ + return server.add_osd_perf_query(query, limit); +} + +void ActivePyModules::remove_osd_perf_query(MetricQueryID query_id) +{ + int r = server.remove_osd_perf_query(query_id); + if (r < 0) { + dout(0) << "remove_osd_perf_query for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + } +} + +PyObject *ActivePyModules::get_osd_perf_counters(MetricQueryID query_id) +{ + OSDPerfCollector collector(query_id); + int r = server.get_osd_perf_counters(&collector); + if (r < 0) { + dout(0) 
<< "get_osd_perf_counters for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + Py_RETURN_NONE; + } + + PyFormatter f; + const std::map<OSDPerfMetricKey, PerformanceCounters> &counters = collector.counters; + + f.open_array_section("counters"); + for (auto &[key, instance_counters] : counters) { + f.open_object_section("i"); + f.open_array_section("k"); + for (auto &sub_key : key) { + f.open_array_section("s"); + for (size_t i = 0; i < sub_key.size(); i++) { + f.dump_string(stringify(i).c_str(), sub_key[i]); + } + f.close_section(); // s + } + f.close_section(); // k + f.open_array_section("c"); + for (auto &c : instance_counters) { + f.open_array_section("p"); + f.dump_unsigned("0", c.first); + f.dump_unsigned("1", c.second); + f.close_section(); // p + } + f.close_section(); // c + f.close_section(); // i + } + f.close_section(); // counters + + return f.get(); +} + +MetricQueryID ActivePyModules::add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit) +{ + return server.add_mds_perf_query(query, limit); +} + +void ActivePyModules::remove_mds_perf_query(MetricQueryID query_id) +{ + int r = server.remove_mds_perf_query(query_id); + if (r < 0) { + dout(0) << "remove_mds_perf_query for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + } +} + +void ActivePyModules::reregister_mds_perf_queries() +{ + server.reregister_mds_perf_queries(); +} + +PyObject *ActivePyModules::get_mds_perf_counters(MetricQueryID query_id) +{ + MDSPerfCollector collector(query_id); + int r = server.get_mds_perf_counters(&collector); + if (r < 0) { + dout(0) << "get_mds_perf_counters for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + Py_RETURN_NONE; + } + + PyFormatter f; + const std::map<MDSPerfMetricKey, PerformanceCounters> &counters = collector.counters; + + f.open_array_section("metrics"); + + f.open_array_section("delayed_ranks"); + f.dump_string("ranks", stringify(collector.delayed_ranks).c_str()); + f.close_section(); // delayed_ranks + + f.open_array_section("counters"); + for (auto &[key, instance_counters] : counters) { + f.open_object_section("i"); + f.open_array_section("k"); + for (auto &sub_key : key) { + f.open_array_section("s"); + for (size_t i = 0; i < sub_key.size(); i++) { + f.dump_string(stringify(i).c_str(), sub_key[i]); + } + f.close_section(); // s + } + f.close_section(); // k + f.open_array_section("c"); + for (auto &c : instance_counters) { + f.open_array_section("p"); + f.dump_unsigned("0", c.first); + f.dump_unsigned("1", c.second); + f.close_section(); // p + } + f.close_section(); // c + f.close_section(); // i + } + f.close_section(); // counters + + f.open_array_section("last_updated"); + f.dump_float("last_updated_mono", collector.last_updated_mono); + f.close_section(); // last_updated + + f.close_section(); // metrics + + return f.get(); +} + +void ActivePyModules::cluster_log(const std::string &channel, clog_type prio, + const std::string &message) +{ + std::lock_guard l(lock); + + auto cl = monc.get_log_client()->create_channel(channel); + map<string,string> log_to_monitors; + map<string,string> log_to_syslog; + map<string,string> log_channel; + map<string,string> log_prio; + map<string,string> log_to_graylog; + map<string,string> log_to_graylog_host; + map<string,string> log_to_graylog_port; + uuid_d fsid; + string host; + if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, 
log_to_graylog_port, + fsid, host) == 0) + cl->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + cl->do_log(prio, message); +} + +void ActivePyModules::register_client(std::string_view name, std::string addrs) +{ + std::lock_guard l(lock); + + entity_addrvec_t addrv; + addrv.parse(addrs.data()); + + dout(7) << "registering msgr client handle " << addrv << dendl; + py_module_registry.register_client(name, std::move(addrv)); +} + +void ActivePyModules::unregister_client(std::string_view name, std::string addrs) +{ + std::lock_guard l(lock); + + entity_addrvec_t addrv; + addrv.parse(addrs.data()); + + dout(7) << "unregistering msgr client handle " << addrv << dendl; + py_module_registry.unregister_client(name, addrv); +} diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h new file mode 100644 index 000000000..d916bdcca --- /dev/null +++ b/src/mgr/ActivePyModules.h @@ -0,0 +1,228 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include "ActivePyModule.h" + +#include "common/Finisher.h" +#include "common/ceph_mutex.h" + +#include "PyFormatter.h" + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/LogClient.h" +#include "mon/MgrMap.h" +#include "mon/MonCommand.h" +#include "mon/mon_types.h" +#include "mon/ConfigMap.h" +#include "mgr/TTLCache.h" + +#include "DaemonState.h" +#include "ClusterState.h" +#include "OSDPerfMetricTypes.h" + +class health_check_map_t; +class DaemonServer; +class MgrSession; +class ModuleCommand; +class PyModuleRegistry; + +class ActivePyModules +{ + // module class instances not yet created + std::set<std::string, std::less<>> pending_modules; + // module class instances already created + std::map<std::string, std::shared_ptr<ActivePyModule>> modules; + PyModuleConfig &module_config; + bool have_local_config_map = false; + std::map<std::string, std::string> store_cache; + ConfigMap config_map; ///< derived from store_cache config/ keys + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + MonClient &monc; + LogChannelRef clog, audit_clog; + Objecter &objecter; + Client &client; + Finisher &finisher; + TTLCache<string, PyObject*> ttl_cache; +public: + Finisher cmd_finisher; +private: + DaemonServer &server; + PyModuleRegistry &py_module_registry; + + map<std::string,ProgressEvent> progress_events; + + mutable ceph::mutex lock = ceph::make_mutex("ActivePyModules::lock"); + +public: + ActivePyModules( + PyModuleConfig &module_config, + std::map<std::string, std::string> store_data, + bool mon_provides_kv_sub, + DaemonStateIndex &ds, ClusterState &cs, MonClient &mc, + LogChannelRef clog_, LogChannelRef audit_clog_, Objecter &objecter_, Client &client_, + Finisher &f, DaemonServer &server, PyModuleRegistry &pmr); + + ~ActivePyModules(); + + // FIXME: wrap for send_command? 
+ MonClient &get_monc() {return monc;} + Objecter &get_objecter() {return objecter;} + Client &get_client() {return client;} + PyObject *cacheable_get_python(const std::string &what); + PyObject *get_python(const std::string &what); + PyObject *get_server_python(const std::string &hostname); + PyObject *list_servers_python(); + PyObject *get_metadata_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_daemon_status_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_latest_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id); + PyObject *get_context(); + PyObject *get_osdmap(); + /// @note @c fct is not allowed to acquire locks when holding GIL + PyObject *with_perf_counters( + std::function<void( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f)> fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const; + + MetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit); + void remove_osd_perf_query(MetricQueryID query_id); + PyObject *get_osd_perf_counters(MetricQueryID query_id); + + MetricQueryID add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit); + void remove_mds_perf_query(MetricQueryID query_id); + void reregister_mds_perf_queries(); + PyObject *get_mds_perf_counters(MetricQueryID query_id); + + bool get_store(const std::string &module_name, + const std::string &key, std::string *val) const; + PyObject *get_store_prefix(const std::string &module_name, + const std::string &prefix) const; + void set_store(const std::string &module_name, + const std::string &key, const boost::optional<std::string> &val); + + bool get_config(const std::string &module_name, + const std::string &key, std::string *val) const; + void set_config(const std::string &module_name, + const std::string &key, const boost::optional<std::string> &val); + + PyObject *get_typed_config(const std::string &module_name, + const std::string &key, + const std::string &prefix = "") const; + PyObject *get_foreign_config( + const std::string& who, + const std::string& name); + + void set_health_checks(const std::string& module_name, + health_check_map_t&& checks); + void get_health_checks(health_check_map_t *checks); + + void update_progress_event(const std::string& evid, + const std::string& desc, + float progress, + bool add_to_ceph_s); + void complete_progress_event(const std::string& evid); + void clear_all_progress_events(); + void get_progress_events(std::map<std::string,ProgressEvent>* events); + + void register_client(std::string_view name, std::string addrs); + void unregister_client(std::string_view name, std::string addrs); + + void config_notify(); + + void set_uri(const std::string& module_name, const std::string &uri); + void set_device_wear_level(const std::string& devid, float wear_level); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + std::map<std::string, std::string> get_services() const; + + void update_kv_data( + const std::string prefix, + bool 
incremental, + const map<std::string, boost::optional<bufferlist>, std::less<>>& data); + void _refresh_config_map(); + + // Public so that MonCommandCompletion can use it + // FIXME: for send_command completion notifications, + // send it to only the module that sent the command, not everyone + void notify_all(const std::string ¬ify_type, + const std::string ¬ify_id); + void notify_all(const LogEntry &log_entry); + + bool is_pending(std::string_view name) const { + return pending_modules.count(name) > 0; + } + bool module_exists(const std::string &name) const + { + return modules.count(name) > 0; + } + + bool method_exists( + const std::string &module_name, + const std::string &method_name) const + { + return modules.at(module_name)->method_exists(method_name); + } + + PyObject *dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int init(); + void shutdown(); + + void start_one(PyModuleRef py_module); + + void dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f); + + void cluster_log(const std::string &channel, clog_type prio, + const std::string &message); + + bool inject_python_on() const; + void update_cache_metrics(); +}; + diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc new file mode 100644 index 000000000..3f49976d8 --- /dev/null +++ b/src/mgr/BaseMgrModule.cc @@ -0,0 +1,1623 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +/** + * The interface we present to python code that runs within + * ceph-mgr. This is implemented as a Python class from which + * all modules must inherit -- access to the Ceph state is then + * available as methods on that object. + */ + +#include "Python.h" + +#include "Mgr.h" + +#include "mon/MonClient.h" +#include "common/errno.h" +#include "common/version.h" +#include "mgr/Types.h" + +#include "PyUtil.h" +#include "BaseMgrModule.h" +#include "Gil.h" + +#include <algorithm> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#define PLACEHOLDER "" + + +typedef struct { + PyObject_HEAD + ActivePyModules *py_modules; + ActivePyModule *this_module; +} BaseMgrModule; + +class MonCommandCompletion : public Context +{ + ActivePyModules *py_modules; + PyObject *python_completion; + const std::string tag; + SafeThreadState pThreadState; + +public: + std::string outs; + bufferlist outbl; + + MonCommandCompletion( + ActivePyModules *py_modules_, PyObject* ev, + const std::string &tag_, PyThreadState *ts_) + : py_modules(py_modules_), python_completion(ev), + tag(tag_), pThreadState(ts_) + { + ceph_assert(python_completion != nullptr); + Py_INCREF(python_completion); + } + + ~MonCommandCompletion() override + { + if (python_completion) { + // Usually do this in finish(): this path is only for if we're + // being destroyed without completing. 
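+      // Taking the GIL is still required here before Py_DECREF()ing the
+      // stored completion object.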
+ Gil gil(pThreadState, true); + Py_DECREF(python_completion); + python_completion = nullptr; + } + } + + void finish(int r) override + { + ceph_assert(python_completion != nullptr); + + dout(10) << "MonCommandCompletion::finish()" << dendl; + { + // Scoped so the Gil is released before calling notify_all() + // Create new thread state because this is called via the MonClient + // Finisher, not the PyModules finisher. + Gil gil(pThreadState, true); + + auto set_fn = PyObject_GetAttrString(python_completion, "complete"); + ceph_assert(set_fn != nullptr); + + auto pyR = PyLong_FromLong(r); + auto pyOutBl = PyUnicode_FromString(outbl.to_str().c_str()); + auto pyOutS = PyUnicode_FromString(outs.c_str()); + auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS); + Py_DECREF(pyR); + Py_DECREF(pyOutBl); + Py_DECREF(pyOutS); + + auto rtn = PyObject_CallObject(set_fn, args); + if (rtn != nullptr) { + Py_DECREF(rtn); + } + Py_DECREF(args); + Py_DECREF(set_fn); + + Py_DECREF(python_completion); + python_completion = nullptr; + } + py_modules->notify_all("command", tag); + } +}; + + +static PyObject* +ceph_send_command(BaseMgrModule *self, PyObject *args) +{ + // Like mon, osd, mds + char *type = nullptr; + + // Like "23" for an OSD or "myid" for an MDS + char *name = nullptr; + + char *cmd_json = nullptr; + char *tag = nullptr; + char *inbuf_ptr = nullptr; + Py_ssize_t inbuf_len = 0; + bufferlist inbuf = {}; + + PyObject *completion = nullptr; + if (!PyArg_ParseTuple(args, "Ossssz#:ceph_send_command", + &completion, &type, &name, &cmd_json, &tag, &inbuf_ptr, &inbuf_len)) { + return nullptr; + } + + if (inbuf_ptr) { + inbuf.append(inbuf_ptr, (unsigned)inbuf_len); + } + + auto set_fn = PyObject_GetAttrString(completion, "complete"); + if (set_fn == nullptr) { + ceph_abort(); // TODO raise python exception instead + } else { + ceph_assert(PyCallable_Check(set_fn)); + } + Py_DECREF(set_fn); + + MonCommandCompletion *command_c = new MonCommandCompletion(self->py_modules, + completion, tag, PyThreadState_Get()); + + PyThreadState *tstate = PyEval_SaveThread(); + + if (std::string(type) == "mon") { + + // Wait for the latest OSDMap after each command we send to + // the mons. This is a heavy-handed hack to make life simpler + // for python module authors, so that they know whenever they + // run a command they've gt a fresh OSDMap afterwards. + // TODO: enhance MCommand interface so that it returns + // latest cluster map versions on completion, and callers + // can wait for those. 
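+    // cmd_json is a JSON-encoded command as accepted by the mons, e.g.
+    // (illustrative): {"prefix": "osd pool ls", "format": "json"}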
+ auto c = new LambdaContext([command_c, self](int command_r){ + self->py_modules->get_objecter().wait_for_latest_osdmap( + [command_c, command_r](boost::system::error_code) { + command_c->complete(command_r); + }); + }); + + self->py_modules->get_monc().start_mon_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(c, &self->py_modules->cmd_finisher)); + } else if (std::string(type) == "osd") { + std::string err; + uint64_t osd_id = strict_strtoll(name, 10, &err); + if (!err.empty()) { + delete command_c; + string msg("invalid osd_id: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().osd_command( + osd_id, + {cmd_json}, + inbuf, + &tid, + [command_c, f = &self->py_modules->cmd_finisher] + (boost::system::error_code ec, std::string s, ceph::buffer::list bl) { + command_c->outs = std::move(s); + command_c->outbl = std::move(bl); + f->queue(command_c); + }); + } else if (std::string(type) == "mds") { + int r = self->py_modules->get_client().mds_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(command_c, &self->py_modules->cmd_finisher)); + if (r != 0) { + string msg("failed to send command to mds: "); + msg.append(cpp_strerror(r)); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_RuntimeError, msg.c_str()); + return nullptr; + } + } else if (std::string(type) == "pg") { + pg_t pgid; + if (!pgid.parse(name)) { + delete command_c; + string msg("invalid pgid: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().pg_command( + pgid, + {cmd_json}, + inbuf, + &tid, + [command_c, f = &self->py_modules->cmd_finisher] + (boost::system::error_code ec, std::string s, ceph::buffer::list bl) { + command_c->outs = std::move(s); + command_c->outbl = std::move(bl); + f->queue(command_c); + }); + PyEval_RestoreThread(tstate); + return nullptr; + } else { + delete command_c; + string msg("unknown service type: "); + msg.append(type); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; +} + +static PyObject* +ceph_set_health_checks(BaseMgrModule *self, PyObject *args) +{ + PyObject *checks = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) { + return NULL; + } + if (!PyDict_Check(checks)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + PyObject *checksls = PyDict_Items(checks); + health_check_map_t out_checks; + for (int i = 0; i < PyList_Size(checksls); ++i) { + PyObject *kv = PyList_GET_ITEM(checksls, i); + char *check_name = nullptr; + PyObject *check_info = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) { + derr << __func__ << " dict item " << i + << " not a size 2 tuple" << dendl; + continue; + } + if (!PyDict_Check(check_info)) { + derr << __func__ << " item " << i << " " << check_name + << " value not a dict" << dendl; + continue; + } + health_status_t severity = HEALTH_OK; + string summary; + list<string> detail; + int64_t count = 0; + PyObject *infols = PyDict_Items(check_info); + for (int j = 0; j < PyList_Size(infols); ++j) { + PyObject *pair = PyList_GET_ITEM(infols, j); + if (!PyTuple_Check(pair)) { + 
derr << __func__ << " item " << i << " pair " << j + << " not a tuple" << dendl; + continue; + } + char *k = nullptr; + PyObject *v = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) { + derr << __func__ << " item " << i << " pair " << j + << " not a size 2 tuple" << dendl; + continue; + } + string ks(k); + if (ks == "severity") { + if (!PyUnicode_Check(v)) { + derr << __func__ << " check " << check_name + << " severity value not string" << dendl; + continue; + } + if (const string vs = PyUnicode_AsUTF8(v); vs == "warning") { + severity = HEALTH_WARN; + } else if (vs == "error") { + severity = HEALTH_ERR; + } + } else if (ks == "summary") { + if (!PyUnicode_Check(v)) { + derr << __func__ << " check " << check_name + << " summary value not [unicode] string" << dendl; + continue; + } else { + summary = PyUnicode_AsUTF8(v); + } + } else if (ks == "count") { + if (PyLong_Check(v)) { + count = PyLong_AsLong(v); + } else { + derr << __func__ << " check " << check_name + << " count value not int" << dendl; + continue; + } + } else if (ks == "detail") { + if (!PyList_Check(v)) { + derr << __func__ << " check " << check_name + << " detail value not list" << dendl; + continue; + } + for (int k = 0; k < PyList_Size(v); ++k) { + PyObject *di = PyList_GET_ITEM(v, k); + if (!PyUnicode_Check(di)) { + derr << __func__ << " check " << check_name + << " detail item " << k << " not a [unicode] string" << dendl; + continue; + } else { + detail.push_back(PyUnicode_AsUTF8(di)); + } + } + } else { + derr << __func__ << " check " << check_name + << " unexpected key " << k << dendl; + } + } + auto& d = out_checks.add(check_name, severity, summary, count); + d.detail.swap(detail); + } + + JSONFormatter jf(true); + dout(10) << "module " << self->this_module->get_name() + << " health checks:\n"; + out_checks.dump(&jf); + jf.flush(*_dout); + *_dout << dendl; + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_health_checks(self->this_module->get_name(), + std::move(out_checks)); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + + +static PyObject* +ceph_state_get(BaseMgrModule *self, PyObject *args) +{ + char *what = NULL; + if (!PyArg_ParseTuple(args, "s:ceph_state_get", &what)) { + return NULL; + } + + return self->py_modules->cacheable_get_python(what); +} + + +static PyObject* +ceph_get_server(BaseMgrModule *self, PyObject *args) +{ + char *hostname = NULL; + if (!PyArg_ParseTuple(args, "z:ceph_get_server", &hostname)) { + return NULL; + } + + if (hostname) { + return self->py_modules->get_server_python(hostname); + } else { + return self->py_modules->list_servers_python(); + } +} + +static PyObject* +ceph_get_mgr_id(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_option_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + const Option *opt = g_conf().find_option(string(what)); + if (opt) { + std::string value; + switch (int r = g_conf().get_val(string(what), &value); r) { + case -ENOMEM: + PyErr_NoMemory(); + return nullptr; + case -ENAMETOOLONG: + PyErr_SetString(PyExc_ValueError, "value too long"); + return nullptr; + default: + ceph_assert(r == 0); + break; + } + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return get_python_typed_option_value(opt->type, value); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + PyErr_Format(PyExc_KeyError, "option not found: %s", what); + return nullptr; + } +} + +static PyObject* +ceph_foreign_option_get(BaseMgrModule *self, PyObject *args) +{ + char *who = nullptr; + char *what = nullptr; + if (!PyArg_ParseTuple(args, "ss:ceph_foreign_option_get", &who, &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + return self->py_modules->get_foreign_config(who, what); +} + +static PyObject* +ceph_get_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "ss|s:ceph_get_module_option", &module, &key, + &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + std::string str_prefix; + if (prefix) { + str_prefix = prefix; + } + assert(self->this_module->py_module); + auto pResult = self->py_modules->get_typed_config(module, key, str_prefix); + return pResult; +} + +static PyObject* +ceph_store_get_prefix(BaseMgrModule *self, PyObject *args) +{ + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get_prefix", &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + return self->py_modules->get_store_prefix(self->this_module->get_name(), + prefix); +} + +static PyObject* +ceph_set_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "ssz:ceph_set_module_option", + &module, &key, &value)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + boost::optional<string> val; + if (value) { + val = value; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_config(module, key, val); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_store_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + std::string value; + bool found = self->py_modules->get_store(self->this_module->get_name(), + what, &value); + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_set(BaseMgrModule *self, PyObject *args) +{ + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "sz:ceph_store_set", &key, &value)) { + return nullptr; + } + boost::optional<string> val; + if (value) { + val = value; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_store(self->this_module->get_name(), key, val); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +get_metadata(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_metadata", &svc_name, &svc_id)) { + return nullptr; + } + return self->py_modules->get_metadata_python(svc_name, svc_id); +} + +static PyObject* +get_daemon_status(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_daemon_status", &svc_name, + &svc_id)) { + return nullptr; + } + return self->py_modules->get_daemon_status_python(svc_name, svc_id); +} + +static PyObject* +ceph_log(BaseMgrModule *self, PyObject *args) +{ + char *record = nullptr; + if (!PyArg_ParseTuple(args, "s:log", &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(record); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_cluster_log(BaseMgrModule *self, PyObject *args) +{ + int prio = 0; + char *channel = nullptr; + char *message = nullptr; + + if (!PyArg_ParseTuple(args, "sis:ceph_cluster_log", &channel, &prio, &message)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->cluster_log(channel, (clog_type)prio, message); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject * +ceph_get_version(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(pretty_version_to_str().c_str()); +} + +static PyObject * +ceph_get_ceph_conf_path(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf().get_conf_path().c_str()); +} + +static PyObject * +ceph_get_release_name(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(ceph_release_to_str()); +} + +static PyObject * +ceph_lookup_release_name(BaseMgrModule *self, PyObject *args) +{ + int major = 0; + if (!PyArg_ParseTuple(args, "i:ceph_lookup_release_name", &major)) { + return nullptr; + } + return PyUnicode_FromString(ceph_release_name(major)); +} + +static PyObject * +ceph_get_context(BaseMgrModule *self) +{ + return self->py_modules->get_context(); +} + +static PyObject* +get_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return nullptr; + } + return self->py_modules->get_counter_python( + svc_name, svc_id, counter_path); +} + +static PyObject* +get_latest_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return 
nullptr; + } + return self->py_modules->get_latest_counter_python( + svc_name, svc_id, counter_path); +} + +static PyObject* +get_perf_schema(BaseMgrModule *self, PyObject *args) +{ + char *type_str = nullptr; + char *svc_id = nullptr; + if (!PyArg_ParseTuple(args, "ss:get_perf_schema", &type_str, + &svc_id)) { + return nullptr; + } + + return self->py_modules->get_perf_schema_python(type_str, svc_id); +} + +static PyObject * +ceph_get_osdmap(BaseMgrModule *self, PyObject *args) +{ + return self->py_modules->get_osdmap(); +} + +static PyObject* +ceph_set_uri(BaseMgrModule *self, PyObject *args) +{ + char *svc_str = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_advertize_service", + &svc_str)) { + return nullptr; + } + + // We call down into PyModules even though we have a MgrPyModule + // reference here, because MgrPyModule's fields are protected + // by PyModules' lock. + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_uri(self->this_module->get_name(), svc_str); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_set_wear_level(BaseMgrModule *self, PyObject *args) +{ + char *devid = nullptr; + float wear_level; + if (!PyArg_ParseTuple(args, "sf:ceph_set_wear_level", + &devid, &wear_level)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_device_wear_level(devid, wear_level); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_have_mon_connection(BaseMgrModule *self, PyObject *args) +{ + if (self->py_modules->get_monc().is_connected()) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyObject* +ceph_update_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + char *desc = nullptr; + float progress = 0.0; + bool add_to_ceph_s = false; + if (!PyArg_ParseTuple(args, "ssfb:ceph_update_progress_event", + &evid, &desc, &progress, &add_to_ceph_s)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->update_progress_event(evid, desc, progress, add_to_ceph_s); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_complete_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_complete_progress_event", + &evid)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->complete_progress_event(evid); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_clear_all_progress_events(BaseMgrModule *self, PyObject *args) +{ + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->clear_all_progress_events(); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + + + +static PyObject * +ceph_dispatch_remote(BaseMgrModule *self, PyObject *args) +{ + char *other_module = nullptr; + char *method = nullptr; + PyObject *remote_args = nullptr; + PyObject *remote_kwargs = nullptr; + if (!PyArg_ParseTuple(args, "ssOO:ceph_dispatch_remote", + &other_module, &method, &remote_args, &remote_kwargs)) { + return nullptr; + } + + // Early error handling, because if the module doesn't exist then we + // won't be able to use its thread state to set python error state + // inside dispatch_remote(). 
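+  // Typically reached from python via the module API's remote() helper,
+  // e.g. self.remote('other_module', 'some_method', ...) (illustrative names).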
+ if (!self->py_modules->module_exists(other_module)) { + derr << "no module '" << other_module << "'" << dendl; + PyErr_SetString(PyExc_ImportError, "Module not found"); + return nullptr; + } + + // Drop GIL from calling python thread state, it will be taken + // both for checking for method existence and for executing method. + PyThreadState *tstate = PyEval_SaveThread(); + + if (!self->py_modules->method_exists(other_module, method)) { + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_NameError, "Method not found"); + return nullptr; + } + + std::string err; + auto result = self->py_modules->dispatch_remote(other_module, method, + remote_args, remote_kwargs, &err); + + PyEval_RestoreThread(tstate); + + if (result == nullptr) { + std::stringstream ss; + ss << "Remote method threw exception: " << err; + PyErr_SetString(PyExc_RuntimeError, ss.str().c_str()); + derr << ss.str() << dendl; + } + + return result; +} + +static PyObject* +ceph_add_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor"; + static const std::string NAME_COUNTERS_DESCRIPTORS = + "performance_counter_descriptors"; + static const std::string NAME_LIMIT = "limit"; + static const std::string NAME_SUB_KEY_TYPE = "type"; + static const std::string NAME_SUB_KEY_REGEX = "regex"; + static const std::string NAME_LIMIT_ORDER_BY = "order_by"; + static const std::string NAME_LIMIT_MAX_COUNT = "max_count"; + static const std::map<std::string, OSDPerfMetricSubKeyType> sub_key_types = { + {"client_id", OSDPerfMetricSubKeyType::CLIENT_ID}, + {"client_address", OSDPerfMetricSubKeyType::CLIENT_ADDRESS}, + {"pool_id", OSDPerfMetricSubKeyType::POOL_ID}, + {"namespace", OSDPerfMetricSubKeyType::NAMESPACE}, + {"osd_id", OSDPerfMetricSubKeyType::OSD_ID}, + {"pg_id", OSDPerfMetricSubKeyType::PG_ID}, + {"object_name", OSDPerfMetricSubKeyType::OBJECT_NAME}, + {"snap_id", OSDPerfMetricSubKeyType::SNAP_ID}, + }; + static const std::map<std::string, PerformanceCounterType> counter_types = { + {"ops", PerformanceCounterType::OPS}, + {"write_ops", PerformanceCounterType::WRITE_OPS}, + {"read_ops", PerformanceCounterType::READ_OPS}, + {"bytes", PerformanceCounterType::BYTES}, + {"write_bytes", PerformanceCounterType::WRITE_BYTES}, + {"read_bytes", PerformanceCounterType::READ_BYTES}, + {"latency", PerformanceCounterType::LATENCY}, + {"write_latency", PerformanceCounterType::WRITE_LATENCY}, + {"read_latency", PerformanceCounterType::READ_LATENCY}, + }; + + PyObject *py_query = nullptr; + if (!PyArg_ParseTuple(args, "O:ceph_add_osd_perf_query", &py_query)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + if (!PyDict_Check(py_query)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + + PyObject *query_params = PyDict_Items(py_query); + OSDPerfMetricQuery query; + std::optional<OSDPerfMetricLimit> limit; + + // { + // 'key_descriptor': [ + // {'type': subkey_type, 'regex': regex_pattern}, + // ... 
+ // ], + // 'performance_counter_descriptors': [ + // list, of, descriptor, types + // ], + // 'limit': {'order_by': performance_counter_type, 'max_count': n}, + // } + + for (int i = 0; i < PyList_Size(query_params); ++i) { + PyObject *kv = PyList_GET_ITEM(query_params, i); + char *query_param_name = nullptr; + PyObject *query_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (query_param_name == NAME_KEY_DESCRIPTOR) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *sub_key = PyList_GET_ITEM(query_param_val, j); + if (!PyDict_Check(sub_key)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a dict" << dendl; + Py_RETURN_NONE; + } + OSDPerfMetricSubKeyDescriptor d; + PyObject *sub_key_params = PyDict_Items(sub_key); + for (int k = 0; k < PyList_Size(sub_key_params); ++k) { + PyObject *pair = PyList_GET_ITEM(sub_key_params, k); + if (!PyTuple_Check(pair)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a tuple" << dendl; + Py_RETURN_NONE; + } + char *param_name = nullptr; + PyObject *param_value = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", ¶m_name, ¶m_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (param_name == NAME_SUB_KEY_TYPE) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(param_value); + auto it = sub_key_types.find(type); + if (it == sub_key_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid type " << dendl; + Py_RETURN_NONE; + } + d.type = it->second; + } else if (param_name == NAME_SUB_KEY_REGEX) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + d.regex_str = PyUnicode_AsUTF8(param_value); + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid regex " << d.regex_str << dendl; + Py_RETURN_NONE; + } + if (d.regex.mark_count() == 0) { + derr << __func__ << " query " << query_param_name << " item " << j + << " regex " << d.regex_str << ": no capturing groups" + << dendl; + Py_RETURN_NONE; + } + } else { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + } + if (d.type == static_cast<OSDPerfMetricSubKeyType>(-1) || + d.regex_str.empty()) { + derr << __func__ << " query " << query_param_name << " item " << i + << " invalid" << dendl; + Py_RETURN_NONE; + } + query.key_descriptor.push_back(d); + } + } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *py_type = PyList_GET_ITEM(query_param_val, j); + if 
(!PyUnicode_Check(py_type)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a string" << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(py_type); + auto it = counter_types.find(type); + if (it == counter_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << type + << " is not valid type" << dendl; + Py_RETURN_NONE; + } + query.performance_counter_descriptors.push_back(it->second); + } + } else if (query_param_name == NAME_LIMIT) { + if (!PyDict_Check(query_param_val)) { + derr << __func__ << " query " << query_param_name << " not a dict" + << dendl; + Py_RETURN_NONE; + } + + limit = OSDPerfMetricLimit(); + PyObject *limit_params = PyDict_Items(query_param_val); + + for (int j = 0; j < PyList_Size(limit_params); ++j) { + PyObject *kv = PyList_GET_ITEM(limit_params, j); + char *limit_param_name = nullptr; + PyObject *limit_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &limit_param_name, + &limit_param_val)) { + derr << __func__ << " limit item " << j << " not a size 2 tuple" + << dendl; + Py_RETURN_NONE; + } + + if (limit_param_name == NAME_LIMIT_ORDER_BY) { + if (!PyUnicode_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not a string" + << dendl; + Py_RETURN_NONE; + } + auto order_by = PyUnicode_AsUTF8(limit_param_val); + auto it = counter_types.find(order_by); + if (it == counter_types.end()) { + derr << __func__ << " limit " << limit_param_name + << " not a valid counter type" << dendl; + Py_RETURN_NONE; + } + limit->order_by = it->second; + } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) { + if (!PyLong_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not an int" + << dendl; + Py_RETURN_NONE; + } + limit->max_count = PyLong_AsLong(limit_param_val); + } else { + derr << __func__ << " unknown limit param: " << limit_param_name + << dendl; + Py_RETURN_NONE; + } + } + } else { + derr << __func__ << " unknown query param: " << query_param_name << dendl; + Py_RETURN_NONE; + } + } + + if (query.key_descriptor.empty() || + query.performance_counter_descriptors.empty()) { + derr << __func__ << " invalid query" << dendl; + Py_RETURN_NONE; + } + + if (limit) { + auto &ds = query.performance_counter_descriptors; + if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) { + derr << __func__ << " limit order_by " << limit->order_by + << " not in performance_counter_descriptors" << dendl; + Py_RETURN_NONE; + } + } + + auto query_id = self->py_modules->add_osd_perf_query(query, limit); + return PyLong_FromLong(query_id); +} + +static PyObject* +ceph_remove_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_remove_osd_perf_query", &query_id)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + self->py_modules->remove_osd_perf_query(query_id); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_osd_perf_counters(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_get_osd_perf_counters", &query_id)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_osd_perf_counters(query_id); +} + +// MDS perf query interface -- mostly follows ceph_add_osd_perf_query() +// style + +static PyObject* +ceph_add_mds_perf_query(BaseMgrModule *self, PyObject *args) +{ + static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor"; + static const std::string NAME_COUNTERS_DESCRIPTORS = + "performance_counter_descriptors"; + static const std::string NAME_LIMIT = "limit"; + static const std::string NAME_SUB_KEY_TYPE = "type"; + static const std::string NAME_SUB_KEY_REGEX = "regex"; + static const std::string NAME_LIMIT_ORDER_BY = "order_by"; + static const std::string NAME_LIMIT_MAX_COUNT = "max_count"; + static const std::map<std::string, MDSPerfMetricSubKeyType> sub_key_types = { + {"mds_rank", MDSPerfMetricSubKeyType::MDS_RANK}, + {"client_id", MDSPerfMetricSubKeyType::CLIENT_ID}, + }; + static const std::map<std::string, MDSPerformanceCounterType> counter_types = { + {"cap_hit", MDSPerformanceCounterType::CAP_HIT_METRIC}, + {"read_latency", MDSPerformanceCounterType::READ_LATENCY_METRIC}, + {"write_latency", MDSPerformanceCounterType::WRITE_LATENCY_METRIC}, + {"metadata_latency", MDSPerformanceCounterType::METADATA_LATENCY_METRIC}, + {"dentry_lease", MDSPerformanceCounterType::DENTRY_LEASE_METRIC}, + {"opened_files", MDSPerformanceCounterType::OPENED_FILES_METRIC}, + {"pinned_icaps", MDSPerformanceCounterType::PINNED_ICAPS_METRIC}, + {"opened_inodes", MDSPerformanceCounterType::OPENED_INODES_METRIC}, + {"read_io_sizes", MDSPerformanceCounterType::READ_IO_SIZES_METRIC}, + {"write_io_sizes", MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC}, + {"avg_read_latency", MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC}, + {"stdev_read_latency", MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC}, + {"avg_write_latency", MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC}, + {"stdev_write_latency", MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC}, + {"avg_metadata_latency", MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC}, + {"stdev_metadata_latency", MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC}, + }; + + PyObject *py_query = nullptr; + if (!PyArg_ParseTuple(args, "O:ceph_add_mds_perf_query", &py_query)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + if (!PyDict_Check(py_query)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + + PyObject *query_params = PyDict_Items(py_query); + MDSPerfMetricQuery query; + std::optional<MDSPerfMetricLimit> limit; + + // { + // 'key_descriptor': [ + // {'type': subkey_type, 'regex': regex_pattern}, + // ... 
+ // ], + // 'performance_counter_descriptors': [ + // list, of, descriptor, types + // ], + // 'limit': {'order_by': performance_counter_type, 'max_count': n}, + // } + + for (int i = 0; i < PyList_Size(query_params); ++i) { + PyObject *kv = PyList_GET_ITEM(query_params, i); + char *query_param_name = nullptr; + PyObject *query_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (query_param_name == NAME_KEY_DESCRIPTOR) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *sub_key = PyList_GET_ITEM(query_param_val, j); + if (!PyDict_Check(sub_key)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a dict" << dendl; + Py_RETURN_NONE; + } + MDSPerfMetricSubKeyDescriptor d; + PyObject *sub_key_params = PyDict_Items(sub_key); + for (int k = 0; k < PyList_Size(sub_key_params); ++k) { + PyObject *pair = PyList_GET_ITEM(sub_key_params, k); + if (!PyTuple_Check(pair)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a tuple" << dendl; + Py_RETURN_NONE; + } + char *param_name = nullptr; + PyObject *param_value = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", ¶m_name, ¶m_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (param_name == NAME_SUB_KEY_TYPE) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(param_value); + auto it = sub_key_types.find(type); + if (it == sub_key_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid type " << dendl; + Py_RETURN_NONE; + } + d.type = it->second; + } else if (param_name == NAME_SUB_KEY_REGEX) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + d.regex_str = PyUnicode_AsUTF8(param_value); + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid regex " << d.regex_str << dendl; + Py_RETURN_NONE; + } + if (d.regex.mark_count() == 0) { + derr << __func__ << " query " << query_param_name << " item " << j + << " regex " << d.regex_str << ": no capturing groups" + << dendl; + Py_RETURN_NONE; + } + } else { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + } + if (d.type == static_cast<MDSPerfMetricSubKeyType>(-1) || + d.regex_str.empty()) { + derr << __func__ << " query " << query_param_name << " item " << i + << " invalid" << dendl; + Py_RETURN_NONE; + } + query.key_descriptor.push_back(d); + } + } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *py_type = PyList_GET_ITEM(query_param_val, j); + if 
(!PyUnicode_Check(py_type)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a string" << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(py_type); + auto it = counter_types.find(type); + if (it == counter_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << type + << " is not valid type" << dendl; + Py_RETURN_NONE; + } + query.performance_counter_descriptors.push_back(it->second); + } + } else if (query_param_name == NAME_LIMIT) { + if (!PyDict_Check(query_param_val)) { + derr << __func__ << " query " << query_param_name << " not a dict" + << dendl; + Py_RETURN_NONE; + } + + limit = MDSPerfMetricLimit(); + PyObject *limit_params = PyDict_Items(query_param_val); + + for (int j = 0; j < PyList_Size(limit_params); ++j) { + PyObject *kv = PyList_GET_ITEM(limit_params, j); + char *limit_param_name = nullptr; + PyObject *limit_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &limit_param_name, + &limit_param_val)) { + derr << __func__ << " limit item " << j << " not a size 2 tuple" + << dendl; + Py_RETURN_NONE; + } + + if (limit_param_name == NAME_LIMIT_ORDER_BY) { + if (!PyUnicode_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not a string" + << dendl; + Py_RETURN_NONE; + } + auto order_by = PyUnicode_AsUTF8(limit_param_val); + auto it = counter_types.find(order_by); + if (it == counter_types.end()) { + derr << __func__ << " limit " << limit_param_name + << " not a valid counter type" << dendl; + Py_RETURN_NONE; + } + limit->order_by = it->second; + } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) { +#if PY_MAJOR_VERSION <= 2 + if (!PyInt_Check(limit_param_val) && !PyLong_Check(limit_param_val)) { +#else + if (!PyLong_Check(limit_param_val)) { +#endif + derr << __func__ << " " << limit_param_name << " not an int" + << dendl; + Py_RETURN_NONE; + } + limit->max_count = PyLong_AsLong(limit_param_val); + } else { + derr << __func__ << " unknown limit param: " << limit_param_name + << dendl; + Py_RETURN_NONE; + } + } + } else { + derr << __func__ << " unknown query param: " << query_param_name << dendl; + Py_RETURN_NONE; + } + } + + if (query.key_descriptor.empty()) { + derr << __func__ << " invalid query" << dendl; + Py_RETURN_NONE; + } + + if (limit) { + auto &ds = query.performance_counter_descriptors; + if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) { + derr << __func__ << " limit order_by " << limit->order_by + << " not in performance_counter_descriptors" << dendl; + Py_RETURN_NONE; + } + } + + auto query_id = self->py_modules->add_mds_perf_query(query, limit); + return PyLong_FromLong(query_id); +} + +static PyObject* +ceph_remove_mds_perf_query(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_remove_mds_perf_query", &query_id)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + self->py_modules->remove_mds_perf_query(query_id); + Py_RETURN_NONE; +} + +static PyObject* +ceph_reregister_mds_perf_queries(BaseMgrModule *self, PyObject *args) +{ + self->py_modules->reregister_mds_perf_queries(); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_mds_perf_counters(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_get_mds_perf_counters", &query_id)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_mds_perf_counters(query_id); +} + +static PyObject* +ceph_is_authorized(BaseMgrModule *self, PyObject *args) +{ + PyObject *args_dict = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_is_authorized", &args_dict)) { + return nullptr; + } + + if (!PyDict_Check(args_dict)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_FALSE; + } + + std::map<std::string, std::string> arguments; + + PyObject *args_list = PyDict_Items(args_dict); + for (int i = 0; i < PyList_Size(args_list); ++i) { + PyObject *kv = PyList_GET_ITEM(args_list, i); + + char *arg_key = nullptr; + char *arg_value = nullptr; + if (!PyArg_ParseTuple(kv, "ss:pair", &arg_key, &arg_value)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + continue; + } + + arguments[arg_key] = arg_value; + } + + PyThreadState *tstate = PyEval_SaveThread(); + bool r = self->this_module->is_authorized(arguments); + PyEval_RestoreThread(tstate); + + if (r) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static PyObject* +ceph_register_client(BaseMgrModule *self, PyObject *args) +{ + char *addrs = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_register_client", &addrs)) { + return nullptr; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->register_client(self->this_module->get_name(), addrs); + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; +} + +static PyObject* +ceph_unregister_client(BaseMgrModule *self, PyObject *args) +{ + char *addrs = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_unregister_client", &addrs)) { + return nullptr; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->unregister_client(self->this_module->get_name(), addrs); + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; +} + +PyMethodDef BaseMgrModule_methods[] = { + {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS, + "Get a cluster object"}, + + {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS, + "Get a server object"}, + + {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS, + "Get a service's metadata"}, + + {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS, + "Get a service's status"}, + + {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS, + "Send a mon command"}, + + {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS, + "Set health checks for this module"}, + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_ceph_conf_path", (PyCFunction)ceph_get_ceph_conf_path, METH_NOARGS, + "Get path to ceph.conf"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_foreign_option", (PyCFunction)ceph_foreign_option_get, METH_VARARGS, + "Get a native configuration option value for another entity"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_store_prefix", (PyCFunction)ceph_store_get_prefix, METH_VARARGS, + "Get all KV store values with a given prefix"}, + + {"_ceph_set_module_option", (PyCFunction)ceph_set_module_option, METH_VARARGS, + "Set a module configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a stored field"}, + + {"_ceph_set_store", (PyCFunction)ceph_store_set, METH_VARARGS, + "Set a stored field"}, + + 
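+  // (The hooks in this table are normally reached through the python-side
+  // wrappers in mgr_module.py rather than called directly.)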
{"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS, + "Get a performance counter"}, + + {"_ceph_get_latest_counter", (PyCFunction)get_latest_counter, METH_VARARGS, + "Get the latest performance counter"}, + + {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS, + "Get the performance counter schema"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a (local) log message"}, + + {"_ceph_cluster_log", (PyCFunction)ceph_cluster_log, METH_VARARGS, + "Emit a cluster log message"}, + + {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_NOARGS, + "Get the ceph version of this process"}, + + {"_ceph_get_release_name", (PyCFunction)ceph_get_release_name, METH_NOARGS, + "Get the ceph release name of this process"}, + + {"_ceph_lookup_release_name", (PyCFunction)ceph_lookup_release_name, METH_VARARGS, + "Get the ceph release name for a given major number"}, + + {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS, + "Get a CephContext* in a python capsule"}, + + {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS, + "Get an OSDMap* in a python capsule"}, + + {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS, + "Advertize a service URI served by this module"}, + + {"_ceph_set_device_wear_level", (PyCFunction)ceph_set_wear_level, METH_VARARGS, + "Set device wear_level value"}, + + {"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection, + METH_NOARGS, "Find out whether this mgr daemon currently has " + "a connection to a monitor"}, + + {"_ceph_update_progress_event", (PyCFunction)ceph_update_progress_event, + METH_VARARGS, "Update status of a progress event"}, + {"_ceph_complete_progress_event", (PyCFunction)ceph_complete_progress_event, + METH_VARARGS, "Complete a progress event"}, + {"_ceph_clear_all_progress_events", (PyCFunction)ceph_clear_all_progress_events, + METH_NOARGS, "Clear all progress events"}, + + {"_ceph_dispatch_remote", (PyCFunction)ceph_dispatch_remote, + METH_VARARGS, "Dispatch a call to another module"}, + + {"_ceph_add_osd_perf_query", (PyCFunction)ceph_add_osd_perf_query, + METH_VARARGS, "Add an osd perf query"}, + + {"_ceph_remove_osd_perf_query", (PyCFunction)ceph_remove_osd_perf_query, + METH_VARARGS, "Remove an osd perf query"}, + + {"_ceph_get_osd_perf_counters", (PyCFunction)ceph_get_osd_perf_counters, + METH_VARARGS, "Get osd perf counters"}, + + {"_ceph_add_mds_perf_query", (PyCFunction)ceph_add_mds_perf_query, + METH_VARARGS, "Add an osd perf query"}, + + {"_ceph_remove_mds_perf_query", (PyCFunction)ceph_remove_mds_perf_query, + METH_VARARGS, "Remove an osd perf query"}, + + {"_ceph_reregister_mds_perf_queries", (PyCFunction)ceph_reregister_mds_perf_queries, + METH_NOARGS, "Re-register mds perf queries"}, + + {"_ceph_get_mds_perf_counters", (PyCFunction)ceph_get_mds_perf_counters, + METH_VARARGS, "Get osd perf counters"}, + + {"_ceph_is_authorized", (PyCFunction)ceph_is_authorized, + METH_VARARGS, "Verify the current session caps are valid"}, + + {"_ceph_register_client", (PyCFunction)ceph_register_client, + METH_VARARGS, "Register RADOS instance for potential blocklisting"}, + + {"_ceph_unregister_client", (PyCFunction)ceph_unregister_client, + METH_VARARGS, "Unregister RADOS instance for potential blocklisting"}, + + {NULL, NULL, 0, NULL} +}; + + +static PyObject * +BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrModule *self; + + self = (BaseMgrModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int 
+BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *py_modules_capsule = nullptr; + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"py_modules", "this_module", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "OO", + const_cast<char**>(kwlist), + &py_modules_capsule, + &this_module_capsule)) { + return -1; + } + + self->py_modules = static_cast<ActivePyModules*>(PyCapsule_GetPointer( + py_modules_capsule, nullptr)); + ceph_assert(self->py_modules); + self->this_module = static_cast<ActivePyModule*>(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +PyTypeObject BaseMgrModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrModule", /* tp_name */ + sizeof(BaseMgrModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BaseMgrModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrModule_new, /* tp_new */ +}; diff --git a/src/mgr/BaseMgrModule.h b/src/mgr/BaseMgrModule.h new file mode 100644 index 000000000..2c2e5deb3 --- /dev/null +++ b/src/mgr/BaseMgrModule.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "Python.h" + +extern PyTypeObject BaseMgrModuleType; + diff --git a/src/mgr/BaseMgrStandbyModule.cc b/src/mgr/BaseMgrStandbyModule.cc new file mode 100644 index 000000000..6f35088d0 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.cc @@ -0,0 +1,269 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "BaseMgrStandbyModule.h" + +#include "StandbyPyModules.h" +#include "PyFormatter.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +typedef struct { + PyObject_HEAD + StandbyPyModule *this_module; +} BaseMgrStandbyModule; + +static PyObject * +BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrStandbyModule *self; + + self = (BaseMgrStandbyModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int +BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"this_module", NULL}; + + if (! 
PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &this_module_capsule)) { + return -1; + } + + self->this_module = static_cast<StandbyPyModule*>(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +static PyObject* +ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_get_module_option(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s|s:ceph_get_module_option", &what, &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + PyThreadState *tstate = PyEval_SaveThread(); + std::string final_key; + std::string value; + bool found = false; + if (prefix) { + final_key = std::string(prefix) + "/" + what; + found = self->this_module->get_config(final_key, &value); + } + if (!found) { + final_key = what; + found = self->this_module->get_config(final_key, &value); + } + PyEval_RestoreThread(tstate); + if (found) { + dout(10) << __func__ << " " << final_key << " found: " << value + << dendl; + return self->this_module->py_module->get_typed_option_value(what, value); + } else { + if (prefix) { + dout(4) << __func__ << " [" << prefix << "/]" << what << " not found " + << dendl; + } else { + dout(4) << __func__ << " " << what << " not found " << dendl; + } + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_option_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + std::string value; + int r = g_conf().get_val(string(what), &value); + if (r >= 0) { + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + // Drop GIL for blocking mon command execution + PyThreadState *tstate = PyEval_SaveThread(); + + std::string value; + bool found = self->this_module->get_store(what, &value); + + PyEval_RestoreThread(tstate); + + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyUnicode_FromString(self->this_module->get_active_uri().c_str()); +} + +static PyObject* +ceph_log(BaseMgrStandbyModule *self, PyObject *args) +{ + char *record = nullptr; + if (!PyArg_ParseTuple(args, "s:log", &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(record); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_standby_state_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *whatc = NULL; + if (!PyArg_ParseTuple(args, "s:ceph_state_get", &whatc)) { + return NULL; + } + std::string what(whatc); + + PyFormatter f; + + // Drop the GIL, as most of the following blocks will block on + // a mutex -- they are all responsible for re-taking the GIL before + // touching the PyFormatter instance or returning from the function. + without_gil_t no_gil; + + if (what == "mgr_ips") { + entity_addrvec_t myaddrs = self->this_module->get_myaddrs(); + with_gil_t with_gil{no_gil}; + f.open_array_section("ips"); + std::set<std::string> did; + for (auto& i : myaddrs.v) { + std::string ip = i.ip_only_to_str(); + if (auto [where, inserted] = did.insert(ip); inserted) { + f.dump_string("ip", ip); + } + } + f.close_section(); + return f.get(); + } else { + derr << "Python module requested unknown data '" << what << "'" << dendl; + with_gil_t with_gil{no_gil}; + Py_RETURN_NONE; + } +} + + +PyMethodDef BaseMgrStandbyModule_methods[] = { + {"_ceph_get", (PyCFunction)ceph_standby_state_get, METH_VARARGS, + "Get a cluster object (standby)"}, + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a KV store value"}, + + {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS, + "Get the URI of the active instance of this module, if any"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a log message"}, + + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BaseMgrStandbyModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrStandbyModule", /* tp_name */ + sizeof(BaseMgrStandbyModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Standby Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter 
*/ + 0, /* tp_iternext */ + BaseMgrStandbyModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrStandbyModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrStandbyModule_new, /* tp_new */ +}; diff --git a/src/mgr/BaseMgrStandbyModule.h b/src/mgr/BaseMgrStandbyModule.h new file mode 100644 index 000000000..82bda9105 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.h @@ -0,0 +1,6 @@ +#pragma once + +#include <Python.h> + +extern PyTypeObject BaseMgrStandbyModuleType; + diff --git a/src/mgr/CMakeLists.txt b/src/mgr/CMakeLists.txt new file mode 100644 index 000000000..8e152e060 --- /dev/null +++ b/src/mgr/CMakeLists.txt @@ -0,0 +1,46 @@ +add_library(mgr_cap_obj OBJECT + MgrCap.cc) + +if(WITH_MGR) + set(mgr_srcs + ${CMAKE_SOURCE_DIR}/src/ceph_mgr.cc + ${CMAKE_SOURCE_DIR}/src/mon/PGMap.cc + ${CMAKE_SOURCE_DIR}/src/mon/ConfigMap.cc + ActivePyModule.cc + ActivePyModules.cc + BaseMgrModule.cc + BaseMgrStandbyModule.cc + ClusterState.cc + DaemonHealthMetricCollector.cc + DaemonKey.cc + DaemonServer.cc + DaemonState.cc + Gil.cc + Mgr.cc + mgr_perf_counters.cc + MgrStandby.cc + MetricCollector.cc + OSDPerfMetricTypes.cc + OSDPerfMetricCollector.cc + MDSPerfMetricTypes.cc + MDSPerfMetricCollector.cc + PyFormatter.cc + PyUtil.cc + PyModule.cc + PyModuleRegistry.cc + PyModuleRunner.cc + PyOSDMap.cc + StandbyPyModules.cc + mgr_commands.cc + $<TARGET_OBJECTS:mgr_cap_obj>) + add_executable(ceph-mgr ${mgr_srcs}) + target_compile_definitions(ceph-mgr PRIVATE PY_SSIZE_T_CLEAN) + target_link_libraries(ceph-mgr + osdc client heap_profiler + global-static ceph-common + Boost::python${MGR_PYTHON_VERSION_MAJOR}${MGR_PYTHON_VERSION_MINOR} + Python3::Python ${CMAKE_DL_LIBS} ${GSSAPI_LIBRARIES}) + set_target_properties(ceph-mgr PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) + install(TARGETS ceph-mgr DESTINATION bin) +endif() diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc new file mode 100644 index 000000000..28340d56d --- /dev/null +++ b/src/mgr/ClusterState.cc @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "messages/MMgrDigest.h" +#include "messages/MMonMgrReport.h" +#include "messages/MPGStats.h" + +#include "mgr/ClusterState.h" +#include <time.h> +#include <boost/range/adaptor/reversed.hpp> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +ClusterState::ClusterState( + MonClient *monc_, + Objecter *objecter_, + const MgrMap& mgrmap) + : monc(monc_), + objecter(objecter_), + mgr_map(mgrmap), + asok_hook(NULL) +{} + +void ClusterState::set_objecter(Objecter *objecter_) +{ + std::lock_guard l(lock); + + objecter = objecter_; +} + +void ClusterState::set_fsmap(FSMap const &new_fsmap) +{ + std::lock_guard l(lock); + + fsmap = new_fsmap; +} + +void ClusterState::set_mgr_map(MgrMap const &new_mgrmap) +{ + std::lock_guard l(lock); + mgr_map = new_mgrmap; +} + +void ClusterState::set_service_map(ServiceMap const &new_service_map) +{ + std::lock_guard l(lock); + servicemap = new_service_map; +} + +void ClusterState::load_digest(MMgrDigest *m) +{ + std::lock_guard l(lock); + health_json = std::move(m->health_json); + mon_status_json = std::move(m->mon_status_json); +} + +void ClusterState::ingest_pgstats(ref_t<MPGStats> stats) +{ + std::lock_guard l(lock); + + const int from = stats->get_orig_source().num(); + bool is_in = with_osdmap([from](const OSDMap& osdmap) { + return osdmap.is_in(from); + }); + + if (is_in) { + pending_inc.update_stat(from, std::move(stats->osd_stat)); + } else { + osd_stat_t empty_stat; + empty_stat.seq = stats->osd_stat.seq; + pending_inc.update_stat(from, std::move(empty_stat)); + } + + for (auto p : stats->pg_stat) { + pg_t pgid = p.first; + const auto &pg_stats = p.second; + + // In case we're hearing about a PG that according to last + // OSDMap update should not exist + auto r = existing_pools.find(pgid.pool()); + if (r == existing_pools.end()) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but pool not in " << existing_pools + << dendl; + continue; + } + if (pgid.ps() >= r->second) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but > pg_num " << r->second + << dendl; + continue; + } + // In case we already heard about more recent stats from this PG + // from another OSD + const auto q = pg_map.pg_stat.find(pgid); + if (q != pg_map.pg_stat.end() && + q->second.get_version_pair() > pg_stats.get_version_pair()) { + dout(15) << " had " << pgid << " from " + << q->second.reported_epoch << ":" + << q->second.reported_seq << dendl; + continue; + } + + pending_inc.pg_stat_updates[pgid] = pg_stats; + } + for (auto p : stats->pool_stat) { + pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second; + } +} + +void ClusterState::update_delta_stats() +{ + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << pending_inc.version << dendl; + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); +} + +void 
ClusterState::notify_osdmap(const OSDMap &osd_map) +{ + assert(ceph_mutex_is_locked(lock)); + + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << pending_inc.version << dendl; + + PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc); + + // update our list of pools that exist, so that we can filter pg_map updates + // in synchrony with this OSDMap. + existing_pools.clear(); + for (auto& p : osd_map.get_pools()) { + existing_pools[p.first] = p.second.get_pg_num(); + } + + // brute force this for now (don't bother being clever by only + // checking osds that went up/down) + set<int> need_check_down_pg_osds; + PGMapUpdater::check_down_pgs(osd_map, pg_map, true, + need_check_down_pg_osds, &pending_inc); + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + + pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); + // TODO: Complete the separation of PG state handling so + // that a cut-down set of functionality remains in PGMonitor + // while the full-blown PGMap lives only here. +} + +class ClusterSocketHook : public AdminSocketHook { + ClusterState *cluster_state; +public: + explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {} + int call(std::string_view admin_command, const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + bufferlist& out) override { + stringstream outss; + int r = 0; + try { + r = cluster_state->asok_command(admin_command, cmdmap, f, outss); + out.append(outss); + } catch (const TOPNSPC::common::bad_cmd_get& e) { + errss << e.what(); + r = -EINVAL; + } + return r; + } +}; + +void ClusterState::final_init() +{ + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + asok_hook = new ClusterSocketHook(this); + int r = admin_socket->register_command( + "dump_osd_network name=value,type=CephInt,req=false", asok_hook, + "Dump osd heartbeat network ping times"); + ceph_assert(r == 0); +} + +void ClusterState::shutdown() +{ + // unregister commands + g_ceph_context->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; +} + +bool ClusterState::asok_command( + std::string_view admin_command, + const cmdmap_t& cmdmap, + Formatter *f, + ostream& ss) +{ + std::lock_guard l(lock); + if (admin_command == "dump_osd_network") { + int64_t value = 0; + // Default to health warning level if nothing specified + if (!(TOPNSPC::common::cmd_getval(cmdmap, "value", value))) { + // Convert milliseconds to microseconds + value = static_cast<int64_t>(g_ceph_context->_conf.get_val<double>("mon_warn_on_slow_ping_time")) * 1000; + if (value == 0) { + double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio"); + value = g_conf().get_val<int64_t>("osd_heartbeat_grace"); + value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + } else { + // Convert user input to microseconds + value *= 1000; + } + if (value < 0) + value = 0; + + struct mgr_ping_time_t { + uint32_t pingtime; + int from; + int to; + bool back; + std::array<uint32_t,3> times; + std::array<uint32_t,3> min; + std::array<uint32_t,3> max; + uint32_t last; + uint32_t last_update; + + bool operator<(const mgr_ping_time_t& rhs) const { + if (pingtime < rhs.pingtime) + 
return true; + if (pingtime > rhs.pingtime) + return false; + if (from < rhs.from) + return true; + if (from > rhs.from) + return false; + if (to < rhs.to) + return true; + if (to > rhs.to) + return false; + return back; + } + }; + + set<mgr_ping_time_t> sorted; + utime_t now = ceph_clock_now(); + for (auto i : pg_map.osd_stat) { + for (auto j : i.second.hb_pingtime) { + + if (j.second.last_update == 0) + continue; + auto stale_time = g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale"); + if (now.sec() - j.second.last_update > stale_time) { + dout(20) << __func__ << " time out heartbeat for osd " << i.first + << " last_update " << j.second.last_update << dendl; + continue; + } + mgr_ping_time_t item; + item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.back_pingtime[0]; + item.times[1] = j.second.back_pingtime[1]; + item.times[2] = j.second.back_pingtime[2]; + item.min[0] = j.second.back_min[0]; + item.min[1] = j.second.back_min[1]; + item.min[2] = j.second.back_min[2]; + item.max[0] = j.second.back_max[0]; + item.max[1] = j.second.back_max[1]; + item.max[2] = j.second.back_max[2]; + item.last = j.second.back_last; + item.back = true; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + + if (j.second.front_last == 0) + continue; + item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.front_pingtime[0]; + item.times[1] = j.second.front_pingtime[1]; + item.times[2] = j.second.front_pingtime[2]; + item.min[0] = j.second.front_min[0]; + item.min[1] = j.second.front_min[1]; + item.min[2] = j.second.front_min[2]; + item.max[0] = j.second.front_max[0]; + item.max[1] = j.second.front_max[1]; + item.max[2] = j.second.front_max[2]; + item.last = j.second.front_last; + item.back = false; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + } + } + + // Network ping times (1min 5min 15min) + f->open_object_section("network_ping_times"); + f->dump_int("threshold", value / 1000); + f->open_array_section("entries"); + for (auto &sitem : boost::adaptors::reverse(sorted)) { + ceph_assert(!value || sitem.pingtime >= value); + + f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val<int64_t>("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); + f->dump_int("from osd", sitem.from); + f->dump_int("to osd", sitem.to); + f->dump_string("interface", (sitem.back ? 
"back" : "front")); + f->open_object_section("average"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); + f->close_section(); // average + f->open_object_section("min"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str()); + f->close_section(); // min + f->open_object_section("max"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // max + f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); + f->close_section(); // entry + } + f->close_section(); // entries + f->close_section(); // network_ping_times + } else { + ceph_abort_msg("broken asok registration"); + } + return true; +} diff --git a/src/mgr/ClusterState.h b/src/mgr/ClusterState.h new file mode 100644 index 000000000..eeff1f76b --- /dev/null +++ b/src/mgr/ClusterState.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CLUSTER_STATE_H_ +#define CLUSTER_STATE_H_ + +#include "mds/FSMap.h" +#include "mon/MgrMap.h" +#include "common/ceph_mutex.h" + +#include "osdc/Objecter.h" +#include "mon/MonClient.h" +#include "mon/PGMap.h" +#include "mgr/ServiceMap.h" + +class MMgrDigest; +class MMonMgrReport; +class MPGStats; + + +/** + * Cluster-scope state (things like cluster maps) as opposed + * to daemon-level state (things like perf counters and smart) + */ +class ClusterState +{ +protected: + MonClient *monc; + Objecter *objecter; + FSMap fsmap; + ServiceMap servicemap; + mutable ceph::mutex lock = ceph::make_mutex("ClusterState"); + + MgrMap mgr_map; + + map<int64_t,unsigned> existing_pools; ///< pools that exist, and pg_num, as of PGMap epoch + PGMap pg_map; + PGMap::Incremental pending_inc; + + bufferlist health_json; + bufferlist mon_status_json; + + class ClusterSocketHook *asok_hook; + +public: + + void load_digest(MMgrDigest *m); + void ingest_pgstats(ceph::ref_t<MPGStats> stats); + + void update_delta_stats(); + + ClusterState(MonClient *monc_, Objecter *objecter_, const MgrMap& mgrmap); + + void set_objecter(Objecter *objecter_); + void set_fsmap(FSMap const &new_fsmap); + void set_mgr_map(MgrMap const &new_mgrmap); + void set_service_map(ServiceMap const &new_service_map); + + void notify_osdmap(const OSDMap &osd_map); + + bool have_fsmap() const { + std::lock_guard l(lock); + return fsmap.get_epoch() > 0; + } + + template<typename Callback, typename...Args> + auto with_servicemap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(servicemap, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_fsmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(fsmap, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mgrmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_pgmap(Callback&& cb, Args&&...args) const -> + decltype(cb(pg_map, std::forward<Args>(args)...)) + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mutable_pgmap(Callback&& cb, Args&&...args) -> + decltype(cb(pg_map, std::forward<Args>(args)...)) + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...); + } + + template<typename... Args> + auto with_monmap(Args &&... args) const + { + std::lock_guard l(lock); + ceph_assert(monc != nullptr); + return monc->with_monmap(std::forward<Args>(args)...); + } + + template<typename... Args> + auto with_osdmap(Args &&... 
args) const -> + decltype(objecter->with_osdmap(std::forward<Args>(args)...)) + { + ceph_assert(objecter != nullptr); + return objecter->with_osdmap(std::forward<Args>(args)...); + } + + // call cb(osdmap, pg_map, ...args) with the appropriate locks + template <typename Callback, typename ...Args> + auto with_osdmap_and_pgmap(Callback&& cb, Args&& ...args) const { + ceph_assert(objecter != nullptr); + std::lock_guard l(lock); + return objecter->with_osdmap( + std::forward<Callback>(cb), + pg_map, + std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_health(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(health_json, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mon_status(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(mon_status_json, std::forward<Args>(args)...); + } + + void final_init(); + void shutdown(); + bool asok_command(std::string_view admin_command, + const cmdmap_t& cmdmap, + Formatter *f, + ostream& ss); +}; + +#endif + diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h new file mode 100644 index 000000000..ad3ea29ef --- /dev/null +++ b/src/mgr/DaemonHealthMetric.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <ostream> +#include "include/denc.h" + +enum class daemon_metric : uint8_t { + SLOW_OPS, + PENDING_CREATING_PGS, + NONE, +}; + +static inline const char *daemon_metric_name(daemon_metric t) { + switch (t) { + case daemon_metric::SLOW_OPS: return "SLOW_OPS"; + case daemon_metric::PENDING_CREATING_PGS: return "PENDING_CREATING_PGS"; + case daemon_metric::NONE: return "NONE"; + default: return "???"; + } +} + +union daemon_metric_t { + struct { + uint32_t n1; + uint32_t n2; + }; + uint64_t n; + daemon_metric_t(uint32_t x, uint32_t y) + : n1(x), n2(y) + {} + daemon_metric_t(uint64_t x = 0) + : n(x) + {} +}; + +class DaemonHealthMetric +{ +public: + DaemonHealthMetric() = default; + DaemonHealthMetric(daemon_metric type_, uint64_t n) + : type(type_), value(n) + {} + DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2) + : type(type_), value(n1, n2) + {} + daemon_metric get_type() const { + return type; + } + uint64_t get_n() const { + return value.n; + } + uint32_t get_n1() const { + return value.n1; + } + uint32_t get_n2() const { + return value.n2; + } + DENC(DaemonHealthMetric, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.value.n, p); + DENC_FINISH(p); + } + + friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) { + return out << daemon_metric_name(m.get_type()) << "(" + << m.get_n() << "|(" << m.get_n1() << "," << m.get_n2() << "))"; + } +private: + daemon_metric type = daemon_metric::NONE; + daemon_metric_t value; +}; +WRITE_CLASS_DENC(DaemonHealthMetric) diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc new file mode 100644 index 000000000..53c0b78a6 --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.cc @@ -0,0 +1,101 @@ +#include <fmt/format.h> + +#include "include/health.h" +#include "include/types.h" +#include "DaemonHealthMetricCollector.h" + +namespace { + +class SlowOps final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::SLOW_OPS; + } + 
health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1); + } + bool _update(const DaemonKey& daemon, + const DaemonHealthMetric& metric) override { + auto num_slow = metric.get_n1(); + auto blocked_time = metric.get_n2(); + value.n1 += num_slow; + value.n2 = std::max(value.n2, blocked_time); + if (num_slow || blocked_time) { + daemons.push_back(daemon); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (daemons.empty()) { + return; + } + // Note this message format is used in mgr/prometheus, so any change in format + // requires a corresponding change in the mgr/prometheus module. + ostringstream ss; + if (daemons.size() > 1) { + if (daemons.size() > 10) { + ss << "daemons " << vector<DaemonKey>(daemons.begin(), daemons.begin()+10) + << "..." << " have slow ops."; + } else { + ss << "daemons " << daemons << " have slow ops."; + } + } else { + ss << daemons.front() << " has slow ops"; + } + check.summary = + fmt::format("{} slow ops, oldest one blocked for {} sec, {}", + value.n1, value.n2, ss.str()); + // No detail + } + vector<DaemonKey> daemons; +}; + + +class PendingPGs final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::PENDING_CREATING_PGS; + } + health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1); + } + bool _update(const DaemonKey& osd, + const DaemonHealthMetric& metric) override { + value.n += metric.get_n(); + if (metric.get_n()) { + osds.push_back(osd); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (osds.empty()) { + return; + } + check.summary = fmt::format("{} PGs pending on creation", value.n); + ostringstream ss; + if (osds.size() > 1) { + ss << "osds " << osds << " have pending PGs."; + } else { + ss << osds.front() << " has pending PGs"; + } + check.detail.push_back(ss.str()); + } + vector<DaemonKey> osds; +}; + +} // anonymous namespace + +unique_ptr<DaemonHealthMetricCollector> +DaemonHealthMetricCollector::create(daemon_metric m) +{ + switch (m) { + case daemon_metric::SLOW_OPS: + return std::make_unique<SlowOps>(); + case daemon_metric::PENDING_CREATING_PGS: + return std::make_unique<PendingPGs>(); + default: + return {}; + } +} diff --git a/src/mgr/DaemonHealthMetricCollector.h b/src/mgr/DaemonHealthMetricCollector.h new file mode 100644 index 000000000..558f4e334 --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.h @@ -0,0 +1,32 @@ +#pragma once + +#include <memory> +#include <string> + +#include "DaemonHealthMetric.h" +#include "DaemonKey.h" +#include "mon/health_check.h" + +class DaemonHealthMetricCollector { +public: + static std::unique_ptr<DaemonHealthMetricCollector> create(daemon_metric m); + void update(const DaemonKey& daemon, const DaemonHealthMetric& metric) { + if (_is_relevant(metric.get_type())) { + reported |= _update(daemon, metric); + } + } + void summarize(health_check_map_t& cm) { + if (reported) { + _summarize(_get_check(cm)); + } + } + virtual ~DaemonHealthMetricCollector() {} +private: + virtual bool _is_relevant(daemon_metric type) const = 0; + virtual health_check_t& _get_check(health_check_map_t& cm) const = 0; + virtual bool _update(const DaemonKey& daemon, const DaemonHealthMetric& metric) = 0; + virtual void _summarize(health_check_t& check) const = 0; +protected: + 
daemon_metric_t value; + bool reported = false; +}; diff --git a/src/mgr/DaemonKey.cc b/src/mgr/DaemonKey.cc new file mode 100644 index 000000000..5501ac106 --- /dev/null +++ b/src/mgr/DaemonKey.cc @@ -0,0 +1,35 @@ +#include "DaemonKey.h" + +std::pair<DaemonKey, bool> DaemonKey::parse(const std::string& s) +{ + auto p = s.find('.'); + if (p == s.npos) { + return {{}, false}; + } else { + return {DaemonKey{s.substr(0, p), s.substr(p + 1)}, true}; + } +} + +bool operator<(const DaemonKey& lhs, const DaemonKey& rhs) +{ + if (int cmp = lhs.type.compare(rhs.type); cmp < 0) { + return true; + } else if (cmp > 0) { + return false; + } else { + return lhs.name < rhs.name; + } +} + +std::ostream& operator<<(std::ostream& os, const DaemonKey& key) +{ + return os << key.type << '.' << key.name; +} + +namespace ceph { +std::string to_string(const DaemonKey& key) +{ + return key.type + '.' + key.name; +} +} + diff --git a/src/mgr/DaemonKey.h b/src/mgr/DaemonKey.h new file mode 100644 index 000000000..92bacd649 --- /dev/null +++ b/src/mgr/DaemonKey.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <string> +#include <utility> + +// Unique reference to a daemon within a cluster +struct DaemonKey +{ + std::string type; // service type, like "osd", "mon" + std::string name; // service id / name, like "1", "a" + static std::pair<DaemonKey, bool> parse(const std::string& s); +}; + +bool operator<(const DaemonKey& lhs, const DaemonKey& rhs); +std::ostream& operator<<(std::ostream& os, const DaemonKey& key); + +namespace ceph { + std::string to_string(const DaemonKey& key); +} + diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc new file mode 100644 index 000000000..430911f6f --- /dev/null +++ b/src/mgr/DaemonServer.cc @@ -0,0 +1,3146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "DaemonServer.h" +#include <boost/algorithm/string.hpp> +#include "mgr/Mgr.h" + +#include "include/stringify.h" +#include "include/str_list.h" +#include "auth/RotatingKeyRing.h" +#include "json_spirit/json_spirit_writer.h" + +#include "mgr/mgr_commands.h" +#include "mgr/DaemonHealthMetricCollector.h" +#include "mgr/OSDPerfMetricCollector.h" +#include "mgr/MDSPerfMetricCollector.h" +#include "mon/MonCommand.h" + +#include "messages/MMgrOpen.h" +#include "messages/MMgrUpdate.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMonMgrReport.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MMgrCommand.h" +#include "messages/MMgrCommandReply.h" +#include "messages/MPGStats.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDForceRecovery.h" +#include "common/errno.h" +#include "common/pick_address.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.server " << __func__ << " " +using namespace TOPNSPC::common; +namespace { + template <typename Map> + bool map_compare(Map const &lhs, Map const &rhs) { + return lhs.size() == rhs.size() + && std::equal(lhs.begin(), lhs.end(), rhs.begin(), + [] (auto a, auto b) { return a.first == b.first && a.second == b.second; }); + } +} + +DaemonServer::DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef clog_, + LogChannelRef audit_clog_) + : Dispatcher(g_ceph_context), + client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes", + g_conf().get_val<Option::size_t>("mgr_client_bytes"))), + client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages", + g_conf().get_val<uint64_t>("mgr_client_messages"))), + osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes", + g_conf().get_val<Option::size_t>("mgr_osd_bytes"))), + osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages", + g_conf().get_val<uint64_t>("mgr_osd_messages"))), + mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes", + g_conf().get_val<Option::size_t>("mgr_mds_bytes"))), + mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages", + g_conf().get_val<uint64_t>("mgr_mds_messages"))), + mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes", + g_conf().get_val<Option::size_t>("mgr_mon_bytes"))), + mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages", + g_conf().get_val<uint64_t>("mgr_mon_messages"))), + msgr(nullptr), + monc(monc_), + finisher(finisher_), + daemon_state(daemon_state_), + cluster_state(cluster_state_), + py_modules(py_modules_), + clog(clog_), + audit_clog(audit_clog_), + pgmap_ready(false), + timer(g_ceph_context, lock), + shutting_down(false), + tick_event(nullptr), + osd_perf_metric_collector_listener(this), + osd_perf_metric_collector(osd_perf_metric_collector_listener), + mds_perf_metric_collector_listener(this), + mds_perf_metric_collector(mds_perf_metric_collector_listener) +{ + g_conf().add_observer(this); +} + +DaemonServer::~DaemonServer() { + delete msgr; + g_conf().remove_observer(this); +} + +int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs) +{ + // Initialize Messenger + std::string public_msgr_type = g_conf()->ms_public_type.empty() ? 
+ g_conf().get_val<std::string>("ms_type") : g_conf()->ms_public_type; + msgr = Messenger::create(g_ceph_context, public_msgr_type, + entity_name_t::MGR(gid), + "mgr", + Messenger::get_pid_nonce()); + msgr->set_default_policy(Messenger::Policy::stateless_server(0)); + + msgr->set_auth_client(monc); + + // throttle clients + msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT, + client_byte_throttler.get(), + client_msg_throttler.get()); + + // servers + msgr->set_policy_throttlers(entity_name_t::TYPE_OSD, + osd_byte_throttler.get(), + osd_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MDS, + mds_byte_throttler.get(), + mds_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MON, + mon_byte_throttler.get(), + mon_msg_throttler.get()); + + entity_addrvec_t addrs; + int r = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &addrs); + if (r < 0) { + return r; + } + dout(20) << __func__ << " will bind to " << addrs << dendl; + r = msgr->bindv(addrs); + if (r < 0) { + derr << "unable to bind mgr to " << addrs << dendl; + return r; + } + + msgr->set_myname(entity_name_t::MGR(gid)); + msgr->set_addr_unknowns(client_addrs); + + msgr->start(); + msgr->add_dispatcher_tail(this); + + msgr->set_auth_server(monc); + monc->set_handle_authentication_dispatcher(this); + + started_at = ceph_clock_now(); + + std::lock_guard l(lock); + timer.init(); + + schedule_tick_locked( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count()); + + return 0; +} + +entity_addrvec_t DaemonServer::get_myaddrs() const +{ + return msgr->get_myaddrs(); +} + +int DaemonServer::ms_handle_authentication(Connection *con) +{ + auto s = ceph::make_ref<MgrSession>(cct); + con->set_priv(s); + s->inst.addr = con->get_peer_addr(); + s->entity_name = con->peer_name; + dout(10) << __func__ << " new session " << s << " con " << con + << " entity " << con->peer_name + << " addr " << con->get_peer_addrs() + << dendl; + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) { + dout(10) << " session " << s << " " << s->entity_name + << " allow_all" << dendl; + s->caps.set_allow_all(); + } else if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (buffer::error& e) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to decode caps" << dendl; + return -EACCES; + } + if (!s->caps.parse(str)) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + return -EACCES; + } + dout(10) << " session " << s << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + } + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + std::lock_guard l(lock); + s->osd_id = atoi(s->entity_name.get_id().c_str()); + dout(10) << "registering osd." << s->osd_id << " session " + << s << " con " << con << dendl; + osd_cons[s->osd_id].insert(con); + } + + return 1; +} + +bool DaemonServer::ms_handle_reset(Connection *con) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + auto priv = con->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return false; + } + std::lock_guard l(lock); + dout(10) << "unregistering osd." 
<< session->osd_id + << " session " << session << " con " << con << dendl; + osd_cons[session->osd_id].erase(con); + + auto iter = daemon_connections.find(con); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + } + return false; +} + +bool DaemonServer::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +bool DaemonServer::ms_dispatch2(const ref_t<Message>& m) +{ + // Note that we do *not* take ::lock here, in order to avoid + // serializing all message handling. It's up to each handler + // to take whatever locks it needs. + switch (m->get_type()) { + case MSG_PGSTATS: + cluster_state.ingest_pgstats(ref_cast<MPGStats>(m)); + maybe_ready(m->get_source().num()); + return true; + case MSG_MGR_REPORT: + return handle_report(ref_cast<MMgrReport>(m)); + case MSG_MGR_OPEN: + return handle_open(ref_cast<MMgrOpen>(m)); + case MSG_MGR_UPDATE: + return handle_update(ref_cast<MMgrUpdate>(m)); + case MSG_MGR_CLOSE: + return handle_close(ref_cast<MMgrClose>(m)); + case MSG_COMMAND: + return handle_command(ref_cast<MCommand>(m)); + case MSG_MGR_COMMAND: + return handle_command(ref_cast<MMgrCommand>(m)); + default: + dout(1) << "Unhandled message type " << m->get_type() << dendl; + return false; + }; +} + +void DaemonServer::dump_pg_ready(ceph::Formatter *f) +{ + f->dump_bool("pg_ready", pgmap_ready.load()); +} + +void DaemonServer::maybe_ready(int32_t osd_id) +{ + if (pgmap_ready.load()) { + // Fast path: we don't need to take lock because pgmap_ready + // is already set + } else { + std::lock_guard l(lock); + + if (reported_osds.find(osd_id) == reported_osds.end()) { + dout(4) << "initial report from osd " << osd_id << dendl; + reported_osds.insert(osd_id); + std::set<int32_t> up_osds; + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + osdmap.get_up_osds(up_osds); + }); + + std::set<int32_t> unreported_osds; + std::set_difference(up_osds.begin(), up_osds.end(), + reported_osds.begin(), reported_osds.end(), + std::inserter(unreported_osds, unreported_osds.begin())); + + if (unreported_osds.size() == 0) { + dout(4) << "all osds have reported, sending PG state to mon" << dendl; + pgmap_ready = true; + reported_osds.clear(); + // Avoid waiting for next tick + send_report(); + } else { + dout(4) << "still waiting for " << unreported_osds.size() << " osds" + " to report in before PGMap is ready" << dendl; + } + } + } +} + +void DaemonServer::tick() +{ + dout(10) << dendl; + send_report(); + adjust_pgs(); + + schedule_tick_locked( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count()); +} + +// Currently modules do not set health checks in response to events delivered to +// all modules (e.g. notify) so we do not risk a thundering hurd situation here. +// if this pattern emerges in the future, this scheduler could be modified to +// fire after all modules have had a chance to set their health checks. +void DaemonServer::schedule_tick_locked(double delay_sec) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = nullptr; + } + + // on shutdown start rejecting explicit requests to send reports that may + // originate from python land which may still be running. 
+ if (shutting_down) + return; + + tick_event = timer.add_event_after(delay_sec, + new LambdaContext([this](int r) { + tick(); + })); +} + +void DaemonServer::schedule_tick(double delay_sec) +{ + std::lock_guard l(lock); + schedule_tick_locked(delay_sec); +} + +void DaemonServer::handle_osd_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_osd()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::handle_mds_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_mds()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::shutdown() +{ + dout(10) << "begin" << dendl; + msgr->shutdown(); + msgr->wait(); + cluster_state.shutdown(); + dout(10) << "done" << dendl; + + std::lock_guard l(lock); + shutting_down = true; + timer.shutdown(); +} + +static DaemonKey key_from_service( + const std::string& service_name, + int peer_type, + const std::string& daemon_name) +{ + if (!service_name.empty()) { + return DaemonKey{service_name, daemon_name}; + } else { + return DaemonKey{ceph_entity_type_name(peer_type), daemon_name}; + } +} + +void DaemonServer::fetch_missing_metadata(const DaemonKey& key, + const entity_addr_t& addr) +{ + if (!daemon_state.is_updating(key) && + (key.type == "osd" || key.type == "mds" || key.type == "mon")) { + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + if (key.type == "osd") { + oss << "{\"prefix\": \"osd metadata\", \"id\": " + << key.name<< "}"; + } else if (key.type == "mds") { + c->set_default("addr", stringify(addr)); + oss << "{\"prefix\": \"mds metadata\", \"who\": \"" + << key.name << "\"}"; + } else if (key.type == "mon") { + oss << "{\"prefix\": \"mon metadata\", \"id\": \"" + << key.name << "\"}"; + } else { + ceph_abort(); + } + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + } +} + +bool DaemonServer::handle_open(const ref_t<MMgrOpen>& m) +{ + std::unique_lock l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + + auto con = m->get_connection(); + dout(10) << "from " << key << " " << con->get_peer_addr() << dendl; + + _send_configure(con); + + DaemonStatePtr daemon; + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } + if (!daemon) { + if (m->service_daemon) { + dout(4) << "constructing new DaemonState for " << key << dendl; + daemon = std::make_shared<DaemonState>(daemon_state.types); + daemon->key = key; + daemon->service_daemon = true; + daemon_state.insert(daemon); + } else { + /* A normal Ceph daemon has connected but we are or should be waiting on + * metadata for it. Close the session so that it tries to reconnect. 
+ */ + dout(2) << "ignoring open from " << key << " " << con->get_peer_addr() + << "; not ready for session (expect reconnect)" << dendl; + con->mark_down(); + l.unlock(); + fetch_missing_metadata(key, m->get_source_addr()); + return true; + } + } + if (daemon) { + if (m->service_daemon) { + // update the metadata through the daemon state index to + // ensure it's kept up-to-date + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + + std::lock_guard l(daemon->lock); + daemon->perf_counters.clear(); + + daemon->service_daemon = m->service_daemon; + if (m->service_daemon) { + daemon->service_status = m->daemon_status; + + utime_t now = ceph_clock_now(); + auto [d, added] = pending_service_map.get_daemon(m->service_name, + m->daemon_name); + if (added || d->gid != (uint64_t)m->get_source().num()) { + dout(10) << "registering " << key << " in pending_service_map" << dendl; + d->gid = m->get_source().num(); + d->addr = m->get_source_addr(); + d->start_epoch = pending_service_map.epoch; + d->start_stamp = now; + d->metadata = m->daemon_metadata; + pending_service_map_dirty = pending_service_map.epoch; + } + } + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + daemon->config_defaults_bl = m->config_defaults_bl; + daemon->config_defaults.clear(); + dout(20) << " got config_defaults_bl " << daemon->config_defaults_bl.length() + << " bytes" << dendl; + } + + if (con->get_peer_type() != entity_name_t::TYPE_CLIENT && + m->service_name.empty()) + { + // Store in set of the daemon/service connections, i.e. those + // connections that require an update in the event of stats + // configuration changes. 
+ daemon_connections.insert(con); + } + + return true; +} + +bool DaemonServer::handle_update(const ref_t<MMgrUpdate>& m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.type = m->service_name; + } else { + key.type = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.name = m->daemon_name; + + dout(10) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us update request + dout(10) << "rejecting update request from non-daemon client " << m->daemon_name + << dendl; + clog->warn() << "rejecting report from non-daemon client " << m->daemon_name + << " at " << m->get_connection()->get_peer_addrs(); + m->get_connection()->mark_down(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + + daemon = daemon_state.get(key); + if (m->need_metadata_update && + !m->daemon_metadata.empty()) { + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + } + } + + return true; +} + +bool DaemonServer::handle_close(const ref_t<MMgrClose>& m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + dout(4) << "from " << m->get_connection() << " " << key << dendl; + + if (daemon_state.exists(key)) { + DaemonStatePtr daemon = daemon_state.get(key); + daemon_state.rm(key); + { + std::lock_guard l(daemon->lock); + if (daemon->service_daemon) { + pending_service_map.rm_daemon(m->service_name, m->daemon_name); + pending_service_map_dirty = pending_service_map.epoch; + } + } + } + + // send same message back as a reply + m->get_connection()->send_message2(m); + return true; +} + +void DaemonServer::update_task_status( + DaemonKey key, + const std::map<std::string,std::string>& task_status) +{ + dout(10) << "got task status from " << key << dendl; + + [[maybe_unused]] auto [daemon, added] = + pending_service_map.get_daemon(key.type, key.name); + if (daemon->task_status != task_status) { + daemon->task_status = task_status; + pending_service_map_dirty = pending_service_map.epoch; + } +} + +bool DaemonServer::handle_report(const ref_t<MMgrReport>& m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.type = m->service_name; + } else { + key.type = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.name = m->daemon_name; + + dout(10) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us stats unless they are declaring + // themselves to be a daemon for some service. + dout(10) << "rejecting report from non-daemon client " << m->daemon_name + << dendl; + clog->warn() << "rejecting report from non-daemon client " << m->daemon_name + << " at " << m->get_connection()->get_peer_addrs(); + m->get_connection()->mark_down(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } else { + locker.unlock(); + + // we don't know the hostname at this stage, reject MMgrReport here. 
+ dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." + << dendl; + // issue metadata request in background + fetch_missing_metadata(key, m->get_source_addr()); + + locker.lock(); + + // kill session + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return false; + } + m->get_connection()->mark_down(); + + dout(10) << "unregistering osd." << session->osd_id + << " session " << session << " con " << m->get_connection() << dendl; + + if (osd_cons.find(session->osd_id) != osd_cons.end()) { + osd_cons[session->osd_id].erase(m->get_connection()); + } + + auto iter = daemon_connections.find(m->get_connection()); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + + return false; + } + + // Update the DaemonState + ceph_assert(daemon != nullptr); + { + std::lock_guard l(daemon->lock); + auto &daemon_counters = daemon->perf_counters; + daemon_counters.update(*m.get()); + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + + utime_t now = ceph_clock_now(); + if (daemon->service_daemon) { + if (m->daemon_status) { + daemon->service_status_stamp = now; + daemon->service_status = *m->daemon_status; + } + daemon->last_service_beacon = now; + } else if (m->daemon_status) { + derr << "got status from non-daemon " << key << dendl; + } + // update task status + if (m->task_status) { + update_task_status(key, *m->task_status); + daemon->last_service_beacon = now; + } + if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { + // only OSD and MON send health_checks to me now + daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); + dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics + << dendl; + } + } + } + + // if there are any schema updates, notify the python modules + /* no users currently + if (!m->declare_types.empty() || !m->undeclare_types.empty()) { + py_modules.notify_all("perf_schema_update", ceph::to_string(key)); + } + */ + + if (m->get_connection()->peer_is_osd()) { + osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports); + } + + if (m->metric_report_message) { + const MetricReportMessage &message = *m->metric_report_message; + boost::apply_visitor(HandlePayloadVisitor(this), message.payload); + } + + return true; +} + + +void DaemonServer::_generate_command_map( + cmdmap_t& cmdmap, + map<string,string> ¶m_str_map) +{ + for (auto p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector<string> cv; + if (cmd_getval(cmdmap, "caps", cv) && + cv.size() % 2 == 0) { + for (unsigned i = 0; i < cv.size(); i += 2) { + string k = string("caps_") + cv[i]; + param_str_map[k] = cv[i + 1]; + } + continue; + } + } + param_str_map[p->first] = cmd_vartype_stringify(p->second); + } +} + +const MonCommand *DaemonServer::_get_mgrcommand( + const string &cmd_prefix, + const std::vector<MonCommand> &cmds) +{ + const MonCommand *this_cmd = nullptr; + for (const auto &cmd : cmds) { + if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) { + this_cmd = &cmd; + break; + } + } + return this_cmd; +} + +bool DaemonServer::_allowed_command( + MgrSession *s, + const string &service, + const string &module, + const string &prefix, + const 
cmdmap_t& cmdmap, + const map<string,string>& param_str_map, + const MonCommand *this_cmd) { + + if (s->entity_name.is_mon()) { + // mon is all-powerful. even when it is forwarding commands on behalf of + // old clients; we expect the mon is validating commands before proxying! + return true; + } + + bool cmd_r = this_cmd->requires_perm('r'); + bool cmd_w = this_cmd->requires_perm('w'); + bool cmd_x = this_cmd->requires_perm('x'); + + bool capable = s->caps.is_capable( + g_ceph_context, + s->entity_name, + service, module, prefix, param_str_map, + cmd_r, cmd_w, cmd_x, + s->get_peer_addr()); + + dout(10) << " " << s->entity_name << " " + << (capable ? "" : "not ") << "capable" << dendl; + return capable; +} + +/** + * The working data for processing an MCommand. This lives in + * a class to enable passing it into other threads for processing + * outside of the thread/locks that called handle_command. + */ +class CommandContext { +public: + ceph::ref_t<MCommand> m_tell; + ceph::ref_t<MMgrCommand> m_mgr; + const std::vector<std::string>& cmd; ///< ref into m_tell or m_mgr + const bufferlist& data; ///< ref into m_tell or m_mgr + bufferlist odata; + cmdmap_t cmdmap; + + explicit CommandContext(ceph::ref_t<MCommand> m) + : m_tell{std::move(m)}, + cmd(m_tell->cmd), + data(m_tell->get_data()) { + } + explicit CommandContext(ceph::ref_t<MMgrCommand> m) + : m_mgr{std::move(m)}, + cmd(m_mgr->cmd), + data(m_mgr->get_data()) { + } + + void reply(int r, const std::stringstream &ss) { + reply(r, ss.str()); + } + + void reply(int r, const std::string &rs) { + // Let the connection drop as soon as we've sent our response + ConnectionRef con = m_tell ? m_tell->get_connection() + : m_mgr->get_connection(); + if (con) { + con->mark_disposable(); + } + + if (r == 0) { + dout(20) << "success" << dendl; + } else { + derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl; + } + if (con) { + if (m_tell) { + MCommandReply *reply = new MCommandReply(r, rs); + reply->set_tid(m_tell->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } else { + MMgrCommandReply *reply = new MMgrCommandReply(r, rs); + reply->set_tid(m_mgr->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } + } + } +}; + +/** + * A context for receiving a bufferlist/error string from a background + * function and then calling back to a CommandContext when it's done + */ +class ReplyOnFinish : public Context { + std::shared_ptr<CommandContext> cmdctx; + +public: + bufferlist from_mon; + string outs; + + explicit ReplyOnFinish(const std::shared_ptr<CommandContext> &cmdctx_) + : cmdctx(cmdctx_) + {} + void finish(int r) override { + cmdctx->odata.claim_append(from_mon); + cmdctx->reply(r, outs); + } +}; + +bool DaemonServer::handle_command(const ref_t<MCommand>& m) +{ + std::lock_guard l(lock); + auto cmdctx = std::make_shared<CommandContext>(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +bool DaemonServer::handle_command(const ref_t<MMgrCommand>& m) +{ + std::lock_guard l(lock); + auto cmdctx = std::make_shared<CommandContext>(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +void DaemonServer::log_access_denied( + std::shared_ptr<CommandContext>& cmdctx, + MgrSession* session, std::stringstream& ss) { + dout(1) << " access denied" << dendl; + audit_clog->info() << "from='" << session->inst << "' " + << "entity='" << 
session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": access denied"; + ss << "access denied: does your client key have mgr caps? " + "See http://docs.ceph.com/en/latest/mgr/administrator/" + "#client-authentication"; +} + +void DaemonServer::_check_offlines_pgs( + const set<int>& osds, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report) +{ + // reset output + *report = offline_pg_report(); + report->osds = osds; + + for (const auto& q : pgmap.pg_stat) { + set<int32_t> pg_acting; // net acting sets (with no missing if degraded) + bool found = false; + if (q.second.state == 0) { + report->unknown.insert(q.first); + continue; + } + if (q.second.state & PG_STATE_DEGRADED) { + for (auto& anm : q.second.avail_no_missing) { + if (osds.count(anm.osd)) { + found = true; + continue; + } + if (anm.osd != CRUSH_ITEM_NONE) { + pg_acting.insert(anm.osd); + } + } + } else { + for (auto& a : q.second.acting) { + if (osds.count(a)) { + found = true; + continue; + } + if (a != CRUSH_ITEM_NONE) { + pg_acting.insert(a); + } + } + } + if (!found) { + continue; + } + const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool()); + bool dangerous = false; + if (!pi) { + report->bad_no_pool.insert(q.first); // pool is creating or deleting + dangerous = true; + } + if (!(q.second.state & PG_STATE_ACTIVE)) { + report->bad_already_inactive.insert(q.first); + dangerous = true; + } + if (pg_acting.size() < pi->min_size) { + report->bad_become_inactive.insert(q.first); + dangerous = true; + } + if (dangerous) { + report->not_ok.insert(q.first); + } else { + report->ok.insert(q.first); + if (q.second.state & PG_STATE_DEGRADED) { + report->ok_become_more_degraded.insert(q.first); + } else { + report->ok_become_degraded.insert(q.first); + } + } + } + dout(20) << osds << " -> " << report->ok.size() << " ok, " + << report->not_ok.size() << " not ok, " + << report->unknown.size() << " unknown" + << dendl; +} + +void DaemonServer::_maximize_ok_to_stop_set( + const set<int>& orig_osds, + unsigned max, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *out_report) +{ + dout(20) << "orig_osds " << orig_osds << " max " << max << dendl; + _check_offlines_pgs(orig_osds, osdmap, pgmap, out_report); + if (!out_report->ok_to_stop()) { + return; + } + if (orig_osds.size() >= max) { + // already at max + return; + } + + // semi-arbitrarily start with the first osd in the set + offline_pg_report report; + set<int> osds = orig_osds; + int parent = *osds.begin(); + set<int> children; + + while (true) { + // identify the next parent + int r = osdmap.crush->get_immediate_parent_id(parent, &parent); + if (r < 0) { + return; // just go with what we have so far! + } + + // get candidate additions that are beneath this point in the tree + children.clear(); + r = osdmap.crush->get_all_children(parent, &children); + if (r < 0) { + return; // just go with what we have so far! 
+ } + dout(20) << " parent " << parent << " children " << children << dendl; + + // try adding in more osds + int failed = 0; // how many children we failed to add to our set + for (auto o : children) { + if (o >= 0 && osdmap.is_up(o) && osds.count(o) == 0) { + osds.insert(o); + _check_offlines_pgs(osds, osdmap, pgmap, &report); + if (!report.ok_to_stop()) { + osds.erase(o); + ++failed; + continue; + } + *out_report = report; + if (osds.size() == max) { + dout(20) << " hit max" << dendl; + return; // yay, we hit the max + } + } + } + + if (failed) { + // we hit some failures; go with what we have + dout(20) << " hit some peer failures" << dendl; + return; + } + } +} + +bool DaemonServer::_handle_command( + std::shared_ptr<CommandContext>& cmdctx) +{ + MessageRef m; + bool admin_socket_cmd = false; + if (cmdctx->m_tell) { + m = cmdctx->m_tell; + // a blank fsid in MCommand signals a legacy client sending a "mon-mgr" CLI + // command. + admin_socket_cmd = (cmdctx->m_tell->fsid != uuid_d()); + } else { + m = cmdctx->m_mgr; + } + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return true; + } + if (session->inst.name == entity_name_t()) { + session->inst.name = m->get_source(); + } + + map<string,string> param_str_map; + std::stringstream ss; + int r = 0; + + if (!cmdmap_from_json(cmdctx->cmd, &(cmdctx->cmdmap), ss)) { + cmdctx->reply(-EINVAL, ss); + return true; + } + + string prefix; + cmd_getval(cmdctx->cmdmap, "prefix", prefix); + dout(10) << "decoded-size=" << cmdctx->cmdmap.size() << " prefix=" << prefix << dendl; + + boost::scoped_ptr<Formatter> f; + { + std::string format; + if (boost::algorithm::ends_with(prefix, "_json")) { + format = "json"; + } else { + cmd_getval(cmdctx->cmdmap, "format", format, string("plain")); + } + f.reset(Formatter::create(format)); + } + + // this is just for mgr commands - admin socket commands will fall + // through and use the admin socket version of + // get_command_descriptions + if (prefix == "get_command_descriptions" && !admin_socket_cmd) { + dout(10) << "reading commands from python modules" << dendl; + const auto py_commands = py_modules.get_commands(); + + int cmdnum = 0; + JSONFormatter f; + f.open_object_section("command_descriptions"); + + auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){ + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(&f, m->get_connection()->get_features(), + secname.str(), mc.cmdstring, mc.helpstring, + mc.module, mc.req_perms, 0); + cmdnum++; + }; + + for (const auto &pyc : py_commands) { + dump_cmd(pyc); + } + + for (const auto &mgr_cmd : mgr_commands) { + dump_cmd(mgr_cmd); + } + + f.close_section(); // command_descriptions + f.flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + // lookup command + const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands); + _generate_command_map(cmdctx->cmdmap, param_str_map); + + bool is_allowed = false; + ModuleCommand py_command; + if (admin_socket_cmd) { + // admin socket commands require all capabilities + is_allowed = session->caps.is_allow_all(); + } else if (!mgr_cmd) { + // Resolve the command to the name of the module that will + // handle it (if the command exists) + auto py_commands = py_modules.get_py_commands(); + for (const auto &pyc : py_commands) { + auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring); + if (pyc_prefix == prefix) { + py_command = pyc; + break; + } + } + + MonCommand pyc = {"", "", "py", 
py_command.perm}; + is_allowed = _allowed_command(session, "py", py_command.module_name, + prefix, cmdctx->cmdmap, param_str_map, + &pyc); + } else { + // validate user's permissions for requested command + is_allowed = _allowed_command(session, mgr_cmd->module, "", + prefix, cmdctx->cmdmap, param_str_map, mgr_cmd); + } + + if (!is_allowed) { + log_access_denied(cmdctx, session, ss); + cmdctx->reply(-EACCES, ss); + return true; + } + + audit_clog->debug() + << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": dispatch"; + + if (admin_socket_cmd) { + cct->get_admin_socket()->queue_tell_command(cmdctx->m_tell); + return true; + } + + // ---------------- + // service map commands + if (prefix == "service dump") { + if (!f) + f.reset(Formatter::create("json-pretty")); + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + f->dump_object("service_map", service_map); + }); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + if (prefix == "service status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + // only include state from services that are in the persisted service map + f->open_object_section("service_status"); + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + f->open_object_section(type.c_str()); + for (auto& q : service.daemons) { + f->open_object_section(q.first.c_str()); + DaemonKey key{type, q.first}; + ceph_assert(daemon_state.exists(key)); + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + f->dump_stream("status_stamp") << daemon->service_status_stamp; + f->dump_stream("last_beacon") << daemon->last_service_beacon; + f->open_object_section("status"); + for (auto& r : daemon->service_status) { + f->dump_string(r.first.c_str(), r.second); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + if (prefix == "config set") { + std::string key; + std::string val; + cmd_getval(cmdctx->cmdmap, "key", key); + cmd_getval(cmdctx->cmdmap, "value", val); + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + cmdctx->reply(0, ss); + return true; + } + + // ----------- + // PG commands + + if (prefix == "pg scrub" || + prefix == "pg repair" || + prefix == "pg deep-scrub") { + string scrubop = prefix.substr(3, string::npos); + pg_t pgid; + spg_t spgid; + string pgidstr; + cmd_getval(cmdctx->cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool pg_exists = false; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + pg_exists = osdmap.pg_exists(pgid); + }); + if (!pg_exists) { + ss << "pg " << pgid << " does not exist"; + cmdctx->reply(-ENOENT, ss); + return true; + } + int acting_primary = -1; + epoch_t epoch; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + osdmap.get_primary_shard(pgid, &acting_primary, &spgid); + }); + if (acting_primary == -1) { + ss << "pg " << pgid << " has no primary osd"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + auto p = osd_cons.find(acting_primary); + if (p == osd_cons.end()) { + ss << "pg " << pgid << " primary osd." 
<< acting_primary + << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + vector<spg_t> pgs = { spgid }; + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } else { + vector<pg_t> pgs = { pgid }; + con->send_message(new MOSDScrub(monc->get_fsid(), + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } + } + ss << "instructing pg " << spgid << " on osd." << acting_primary + << " to " << scrubop; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd scrub" || + prefix == "osd deep-scrub" || + prefix == "osd repair") { + string whostr; + cmd_getval(cmdctx->cmdmap, "who", whostr); + vector<string> pvec; + get_str_vec(prefix, pvec); + + set<int> osds; + if (whostr == "*" || whostr == "all" || whostr == "any") { + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (int i = 0; i < osdmap.get_max_osd(); i++) + if (osdmap.is_up(i)) { + osds.insert(i); + } + }); + } else { + long osd = parse_osd_id(whostr.c_str(), &ss); + if (osd < 0) { + ss << "invalid osd '" << whostr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + if (osdmap.is_up(osd)) { + osds.insert(osd); + } + }); + if (osds.empty()) { + ss << "osd." << osd << " is not up"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + } + set<int> sent_osds, failed_osds; + for (auto osd : osds) { + vector<spg_t> spgs; + epoch_t epoch; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + epoch = osdmap.get_epoch(); + auto p = pgmap.pg_by_osd.find(osd); + if (p != pgmap.pg_by_osd.end()) { + for (auto pgid : p->second) { + int primary; + spg_t spg; + osdmap.get_primary_shard(pgid, &primary, &spg); + if (primary == osd) { + spgs.push_back(spg); + } + } + } + }); + auto p = osd_cons.find(osd); + if (p == osd_cons.end()) { + failed_osds.insert(osd); + } else { + sent_osds.insert(osd); + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + spgs, + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } else { + con->send_message(new MOSDScrub(monc->get_fsid(), + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } + } + } + } + if (failed_osds.size() == osds.size()) { + ss << "failed to instruct osd(s) " << osds << " to " << pvec.back() + << " (not connected)"; + r = -EAGAIN; + } else { + ss << "instructed osd(s) " << sent_osds << " to " << pvec.back(); + if (!failed_osds.empty()) { + ss << "; osd(s) " << failed_osds << " were not connected"; + } + r = 0; + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd pool scrub" || + prefix == "osd pool deep-scrub" || + prefix == "osd pool repair") { + vector<string> pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + epoch_t epoch; + map<int32_t, vector<pg_t>> pgs_by_primary; // legacy + map<int32_t, vector<spg_t>> spgs_by_primary; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = 
osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) { + pg_t pg(i, pool_id); + int primary; + spg_t spg; + auto got = osdmap.get_primary_shard(pg, &primary, &spg); + if (!got) + continue; + pgs_by_primary[primary].push_back(pg); + spgs_by_primary[primary].push_back(spg); + } + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + for (auto& it : spgs_by_primary) { + auto primary = it.first; + auto p = osd_cons.find(primary); + if (p == osd_cons.end()) { + ss << "osd." << primary << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + it.second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } else { + // legacy + auto q = pgs_by_primary.find(primary); + ceph_assert(q != pgs_by_primary.end()); + con->send_message(new MOSDScrub(monc->get_fsid(), + q->second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } + } + } + cmdctx->reply(0, ""); + return true; + } else if (prefix == "osd reweight-by-pg" || + prefix == "osd reweight-by-utilization" || + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization") { + bool by_pg = + prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg"; + bool dry_run = + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization"; + int64_t oload; + cmd_getval(cmdctx->cmdmap, "oload", oload, int64_t(120)); + set<int64_t> pools; + vector<string> poolnames; + cmd_getval(cmdctx->cmdmap, "pools", poolnames); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (const auto& poolname : poolnames) { + int64_t pool = osdmap.lookup_pg_pool_name(poolname); + if (pool < 0) { + ss << "pool '" << poolname << "' does not exist"; + r = -ENOENT; + } + pools.insert(pool); + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + + double max_change = g_conf().get_val<double>("mon_reweight_max_change"); + cmd_getval(cmdctx->cmdmap, "max_change", max_change); + if (max_change <= 0.0) { + ss << "max_change " << max_change << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + int64_t max_osds = g_conf().get_val<int64_t>("mon_reweight_max_osds"); + cmd_getval(cmdctx->cmdmap, "max_osds", max_osds); + if (max_osds <= 0) { + ss << "max_osds " << max_osds << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool no_increasing = false; + cmd_getval(cmdctx->cmdmap, "no_increasing", no_increasing); + string out_str; + mempool::osdmap::map<int32_t, uint32_t> new_weights; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) { + return reweight::by_utilization(osdmap, pgmap, + oload, + max_change, + max_osds, + by_pg, + pools.empty() ? 
NULL : &pools, + no_increasing, + &new_weights, + &ss, &out_str, f.get()); + }); + if (r >= 0) { + dout(10) << "reweight::by_utilization: finished with " << out_str << dendl; + } + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(out_str); + } + if (r < 0) { + ss << "FAILED reweight-by-pg"; + cmdctx->reply(r, ss); + return true; + } else if (r == 0 || dry_run) { + ss << "no change"; + cmdctx->reply(r, ss); + return true; + } else { + json_spirit::Object json_object; + for (const auto& osd_weight : new_weights) { + json_spirit::Config::add(json_object, + std::to_string(osd_weight.first), + std::to_string(osd_weight.second)); + } + string s = json_spirit::write(json_object); + std::replace(begin(s), end(s), '\"', '\''); + const string cmd = + "{" + "\"prefix\": \"osd reweightn\", " + "\"weights\": \"" + s + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, + &on_finish->from_mon, &on_finish->outs, on_finish); + return true; + } + } else if (prefix == "osd df") { + string method, filter; + cmd_getval(cmdctx->cmdmap, "output_method", method); + cmd_getval(cmdctx->cmdmap, "filter", filter); + stringstream rs; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + // sanity check filter(s) + if (!filter.empty() && + osdmap.lookup_pg_pool_name(filter) < 0 && + !osdmap.crush->class_exists(filter) && + !osdmap.crush->name_exists(filter)) { + rs << "'" << filter << "' not a pool, crush node or device class name"; + return -EINVAL; + } + print_osd_utilization(osdmap, pgmap, ss, + f.get(), method == "tree", filter); + cmdctx->odata.append(ss); + return 0; + }); + cmdctx->reply(r, rs); + return true; + } else if (prefix == "osd pool stats") { + string pool_name; + cmd_getval(cmdctx->cmdmap, "pool_name", pool_name); + int64_t poolid = -ENOENT; + bool one_pool = false; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (!pool_name.empty()) { + poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + ceph_assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + one_pool = true; + } + stringstream rs; + if (f) + f->open_array_section("pool_stats"); + else { + if (osdmap.get_pools().empty()) { + ss << "there are no pools!"; + goto stats_out; + } + } + for (auto &p : osdmap.get_pools()) { + if (!one_pool) { + poolid = p.first; + } + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs); + if (one_pool) { + break; + } + } + stats_out: + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + return 0; + }); + if (r != -EOPNOTSUPP) { + cmdctx->reply(r, ss); + return true; + } + } else if (prefix == "osd safe-to-destroy" || + prefix == "osd destroy" || + prefix == "osd purge") { + set<int> osds; + int r = 0; + if (prefix == "osd safe-to-destroy") { + vector<string> ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + } else { + int64_t id; + if (!cmd_getval(cmdctx->cmdmap, "id", id)) { + r = -EINVAL; + ss << "must specify OSD id"; + } else { + osds.insert(id); + } + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + set<int> active_osds, missing_stats, stored_pgs, safe_to_destroy; + int affected_pgs = 0; + 
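+ // Each OSD requested here lands in exactly one bucket below: safe_to_destroy + // (not in the osdmap, or provably holding no PG data), active_osds (still has + // PGs mapped to it, counted in affected_pgs), missing_stats (no usable stats + // or down while not all PGs are active+clean), or stored_pgs (last reported + // that it still stores some PG data).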
cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw" + << " any conclusions"; + r = -EAGAIN; + return; + } + int num_active_clean = 0; + for (auto& p : pg_map.num_pg_by_state) { + unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN; + if ((p.first & want) == want) { + num_active_clean += p.second; + } + } + for (auto osd : osds) { + if (!osdmap.exists(osd)) { + safe_to_destroy.insert(osd); + continue; // clearly safe to destroy + } + auto q = pg_map.num_pg_by_osd.find(osd); + if (q != pg_map.num_pg_by_osd.end()) { + if (q->second.acting > 0 || q->second.up_not_acting > 0) { + active_osds.insert(osd); + // XXX: For overlapping PGs, this counts them again + affected_pgs += q->second.acting + q->second.up_not_acting; + continue; + } + } + if (num_active_clean < pg_map.num_pg) { + // all pgs aren't active+clean; we need to be careful. + auto p = pg_map.osd_stat.find(osd); + if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) { + missing_stats.insert(osd); + continue; + } else if (p->second.num_pgs > 0) { + stored_pgs.insert(osd); + continue; + } + } + safe_to_destroy.insert(osd); + } + }); + if (r && prefix == "osd safe-to-destroy") { + cmdctx->reply(r, ss); // regardless of formatter + return true; + } + if (!r && (!active_osds.empty() || + !missing_stats.empty() || !stored_pgs.empty())) { + if (!safe_to_destroy.empty()) { + ss << "OSD(s) " << safe_to_destroy + << " are safe to destroy without reducing data durability. "; + } + if (!active_osds.empty()) { + ss << "OSD(s) " << active_osds << " have " << affected_pgs + << " pgs currently mapped to them. "; + } + if (!missing_stats.empty()) { + ss << "OSD(s) " << missing_stats << " have no reported stats, and not all" + << " PGs are active+clean; we cannot draw any conclusions. 
"; + } + if (!stored_pgs.empty()) { + ss << "OSD(s) " << stored_pgs << " last reported they still store some PG" + << " data, and not all PGs are active+clean; we cannot be sure they" + << " aren't still needed."; + } + if (!active_osds.empty() || !stored_pgs.empty()) { + r = -EBUSY; + } else { + r = -EAGAIN; + } + } + + if (prefix == "osd safe-to-destroy") { + if (!r) { + ss << "OSD(s) " << osds << " are safe to destroy without reducing data" + << " durability."; + } + if (f) { + f->open_object_section("osd_status"); + f->open_array_section("safe_to_destroy"); + for (auto i : safe_to_destroy) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("active"); + for (auto i : active_osds) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("missing_stats"); + for (auto i : missing_stats) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("stored_pgs"); + for (auto i : stored_pgs) + f->dump_int("osd", i); + f->close_section(); + f->close_section(); // osd_status + f->flush(cmdctx->odata); + r = 0; + std::stringstream().swap(ss); + } + cmdctx->reply(r, ss); + return true; + } + + if (r) { + bool force = false; + cmd_getval(cmdctx->cmdmap, "force", force); + if (!force) { + // Backward compat + cmd_getval(cmdctx->cmdmap, "yes_i_really_mean_it", force); + } + if (!force) { + ss << "\nYou can proceed by passing --force, but be warned that" + " this will likely mean real, permanent data loss."; + } else { + r = 0; + } + } + if (r) { + cmdctx->reply(r, ss); + return true; + } + const string cmd = + "{" + "\"prefix\": \"" + prefix + "-actual\", " + "\"id\": " + stringify(osds) + ", " + "\"yes_i_really_mean_it\": true" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish); + return true; + } else if (prefix == "osd ok-to-stop") { + vector<string> ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + set<int> osds; + int64_t max = 1; + cmd_getval(cmdctx->cmdmap, "max", max); + int r; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + if (max < (int)osds.size()) { + max = osds.size(); + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + offline_pg_report out_report; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + _maximize_ok_to_stop_set( + osds, max, osdmap, pg_map, + &out_report); + }); + if (!f) { + f.reset(Formatter::create("json")); + } + f->dump_object("ok_to_stop", out_report); + f->flush(cmdctx->odata); + cmdctx->odata.append("\n"); + if (!out_report.unknown.empty()) { + ss << out_report.unknown.size() << " pgs have unknown state; " + << "cannot draw any conclusions"; + cmdctx->reply(-EAGAIN, ss); + } + if (!out_report.ok_to_stop()) { + ss << "unsafe to stop osd(s) at this time (" << out_report.not_ok.size() << " PGs are or would become offline)"; + cmdctx->reply(-EBUSY, ss); + } else { + cmdctx->reply(0, ss); + } + return true; + } else if (prefix == "pg force-recovery" || + prefix == "pg force-backfill" || + prefix == "pg cancel-force-recovery" || + prefix == "pg cancel-force-backfill" || + prefix == "osd pool force-recovery" || + prefix == "osd pool force-backfill" || + prefix == "osd pool cancel-force-recovery" || + prefix == "osd pool cancel-force-backfill") { + vector<string> vs; + get_str_vec(prefix, vs); + auto& granularity = vs.front(); + auto& forceop = vs.back(); 
+ vector<pg_t> pgs; + + // figure out actual op just once + int actual_op = 0; + if (forceop == "force-recovery") { + actual_op = OFR_RECOVERY; + } else if (forceop == "force-backfill") { + actual_op = OFR_BACKFILL; + } else if (forceop == "cancel-force-backfill") { + actual_op = OFR_BACKFILL | OFR_CANCEL; + } else if (forceop == "cancel-force-recovery") { + actual_op = OFR_RECOVERY | OFR_CANCEL; + } + + set<pg_t> candidates; // deduped + if (granularity == "pg") { + // convert pg names to pgs, discard any invalid ones while at it + vector<string> pgids; + cmd_getval(cmdctx->cmdmap, "pgid", pgids); + for (auto& i : pgids) { + pg_t pgid; + if (!pgid.parse(i.c_str())) { + ss << "invalid pgid '" << i << "'; "; + r = -EINVAL; + continue; + } + candidates.insert(pgid); + } + } else { + // per pool + vector<string> pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) + candidates.insert({(unsigned int)i, (uint64_t)pool_id}); + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + } + + cluster_state.with_pgmap([&](const PGMap& pg_map) { + for (auto& i : candidates) { + auto it = pg_map.pg_stat.find(i); + if (it == pg_map.pg_stat.end()) { + ss << "pg " << i << " does not exist; "; + r = -ENOENT; + continue; + } + auto state = it->second.state; + // discard pgs for which user requests are pointless + switch (actual_op) { + case OFR_RECOVERY: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING)) == 0) { + // don't return error, user script may be racing with cluster. + // not fatal.
+ ss << "pg " << i << " doesn't require recovery; "; + continue; + } else if (state & PG_STATE_FORCED_RECOVERY) { + ss << "pg " << i << " recovery already forced; "; + // return error, as it may be a bug in user script + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING)) == 0) { + ss << "pg " << i << " doesn't require backfilling; "; + continue; + } else if (state & PG_STATE_FORCED_BACKFILL) { + ss << "pg " << i << " backfill already forced; "; + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL | OFR_CANCEL: + if ((state & PG_STATE_FORCED_BACKFILL) == 0) { + ss << "pg " << i << " backfill not forced; "; + continue; + } + break; + case OFR_RECOVERY | OFR_CANCEL: + if ((state & PG_STATE_FORCED_RECOVERY) == 0) { + ss << "pg " << i << " recovery not forced; "; + continue; + } + break; + default: + ceph_abort_msg("actual_op value is not supported"); + } + pgs.push_back(i); + } // for + }); + + // respond with error only when no pgs are correct + // yes, in case of mixed errors, only the last one will be emitted, + // but the message presented will be fine + if (pgs.size() != 0) { + // clear error to not confuse users/scripts + r = 0; + } + + // optimize the command -> messages conversion, use only one + // message per distinct OSD + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // group pgs to process by osd + map<int, vector<spg_t>> osdpgs; + for (auto& pgid : pgs) { + int primary; + spg_t spg; + if (osdmap.get_primary_shard(pgid, &primary, &spg)) { + osdpgs[primary].push_back(spg); + } + } + for (auto& i : osdpgs) { + if (osdmap.is_up(i.first)) { + auto p = osd_cons.find(i.first); + if (p == osd_cons.end()) { + ss << "osd." << i.first << " is not currently connected"; + r = -EAGAIN; + continue; + } + for (auto& con : p->second) { + con->send_message( + new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op)); + } + ss << "instructing pg(s) " << i.second << " on osd." 
<< i.first + << " to " << forceop << "; "; + } + } + }); + ss << std::endl; + cmdctx->reply(r, ss); + return true; + } else if (prefix == "config show" || + prefix == "config show-with-defaults") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + auto [key, valid] = DaemonKey::parse(who); + if (!valid) { + ss << "invalid daemon name: use <type>.<id>"; + cmdctx->reply(-EINVAL, ss); + return true; + } + DaemonStatePtr daemon = daemon_state.get(key); + if (!daemon) { + ss << "no config state for daemon " << who; + cmdctx->reply(-ENOENT, ss); + return true; + } + + std::lock_guard l(daemon->lock); + + int r = 0; + string name; + if (cmd_getval(cmdctx->cmdmap, "key", name)) { + // handle special options + if (name == "fsid") { + cmdctx->odata.append(stringify(monc->get_fsid()) + "\n"); + cmdctx->reply(r, ss); + return true; + } + auto p = daemon->config.find(name); + if (p != daemon->config.end() && + !p->second.empty()) { + cmdctx->odata.append(p->second.rbegin()->second + "\n"); + } else { + auto& defaults = daemon->_get_config_defaults(); + auto q = defaults.find(name); + if (q != defaults.end()) { + cmdctx->odata.append(q->second + "\n"); + } else { + r = -ENOENT; + } + } + } else if (daemon->config_defaults_bl.length() > 0) { + TextTable tbl; + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OVERRIDES", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT); + } + if (prefix == "config show") { + // show + for (auto& i : daemon->config) { + dout(20) << " " << i.first << " -> " << i.second << dendl; + if (i.second.empty()) { + continue; + } + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + f->dump_string("value", i.second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + i.second.rbegin()->first)); + if (i.second.size() > 1) { + f->open_array_section("overrides"); + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(j->first)); + f->dump_string("value", j->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << i.first; + tbl << i.second.rbegin()->second; + tbl << ceph_conf_level_name(i.second.rbegin()->first); + if (i.second.size() > 1) { + list<string> ov; + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + if (j->second == i.second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(j->first) + + string("[") + j->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(j->first) + + string("[") + j->second + string("]")); + + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } + } else { + // show-with-defaults + auto& defaults = daemon->_get_config_defaults(); + for (auto& i : defaults) { + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + } else { + tbl << i.first; + } + auto j = daemon->config.find(i.first); + if (j != daemon->config.end() && !j->second.empty()) { + // have config + if (f) { + f->dump_string("value", j->second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + j->second.rbegin()->first)); + if (j->second.size() > 1) { + f->open_array_section("overrides"); + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(k->first)); + f->dump_string("value", k->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << j->second.rbegin()->second; + tbl << ceph_conf_level_name(j->second.rbegin()->first); + if (j->second.size() > 1) { + list<string> ov; + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + if (k->second == j->second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(k->first) + + string("[") + k->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(k->first) + + string("[") + k->second + string("]")); + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : ""); + tbl << TextTable::endrow; + } + } else { + // only have default + if (f) { + f->dump_string("value", i.second); + f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT)); + f->close_section(); + } else { + tbl << i.second; + tbl << ceph_conf_level_name(CONF_DEFAULT); + tbl << ""; + tbl << ""; + tbl << TextTable::endrow; + } + } + } + } + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(stringify(tbl)); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device ls") { + set<string> devids; + TextTable tbl; + if (f) { + f->open_array_section("devices"); + daemon_state.with_devices([&f](const DeviceState& dev) { + f->dump_object("device", dev); + }); + f->close_section(); + f->flush(cmdctx->odata); + } else { + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("WEAR", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + daemon_state.with_devices([&tbl, now](const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + char wear_level_str[16] = {0}; + if (dev.wear_level >= 0) { + snprintf(wear_level_str, sizeof(wear_level_str)-1, "%d%%", + (int)(100.1 * dev.wear_level)); + } + tbl << dev.devid + << h + << d + << wear_level_str + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device ls-by-daemon") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + if (auto [k, valid] = 
DaemonKey::parse(who); !valid) { + ss << who << " is not a valid daemon name"; + r = -EINVAL; + } else { + auto dm = daemon_state.get(k); + if (dm) { + if (f) { + f->open_array_section("devices"); + for (auto& i : dm->devices) { + daemon_state.with_device(i.first, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, + TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& i : dm->devices) { + daemon_state.with_device( + i.first, [&tbl, now] (const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + tbl << dev.devid + << h + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + } else { + r = -ENOENT; + ss << "daemon " << who << " not found"; + } + cmdctx->reply(r, ss); + } + } else if (prefix == "device ls-by-host") { + string host; + cmd_getval(cmdctx->cmdmap, "host", host); + set<string> devids; + daemon_state.list_devids_by_server(host, &devids); + if (f) { + f->open_array_section("devices"); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&tbl, &host, now] (const DeviceState& dev) { + string n; + for (auto& j : dev.attachments) { + if (std::get<0>(j) == host) { + if (n.size()) { + n += " "; + } + n += std::get<1>(j); + } + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << n + << d + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device info") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + int r = 0; + ostringstream rs; + if (!daemon_state.with_device(devid, + [&f, &rs] (const DeviceState& dev) { + if (f) { + f->dump_object("device", dev); + } else { + dev.print(rs); + } + })) { + ss << "device " << devid << " not found"; + r = -ENOENT; + } else { + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device set-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + string from_str, to_str; + cmd_getval(cmdctx->cmdmap, "from", from_str); + cmd_getval(cmdctx->cmdmap, "to", to_str); + utime_t from, to; + if (!from.parse(from_str)) { + ss << "unable to parse datetime '" << from_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else if (to_str.size() && !to.parse(to_str)) { + ss << "unable to parse datetime '" << to_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else { + map<string,string> meta; + daemon_state.with_device_create( + devid, + [from, 
to, &meta] (DeviceState& dev) { + dev.set_life_expectancy(from, to, ceph_clock_now()); + meta = dev.metadata; + }); + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } + return true; + } else if (prefix == "device rm-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + map<string,string> meta; + if (daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) { + dev.rm_life_expectancy(); + meta = dev.metadata; + })) { + string cmd; + bufferlist json; + if (meta.empty()) { + cmd = + "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } else { + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + json.append(json_spirit::write(json_object)); + cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } else { + cmdctx->reply(0, ss); + } + return true; + } else { + if (!pgmap_ready) { + ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n"; + } + if (f) { + f->open_object_section("pg_info"); + f->dump_bool("pg_ready", pgmap_ready); + } + + // fall back to feeding command to PGMap + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap, + f.get(), &ss, &cmdctx->odata); + }); + + if (f) { + f->close_section(); + } + if (r != -EOPNOTSUPP) { + if (f) { + f->flush(cmdctx->odata); + } + cmdctx->reply(r, ss); + return true; + } + } + + // Was the command unfound? + if (py_command.cmdstring.empty()) { + ss << "No handler found for '" << prefix << "'"; + dout(4) << "No handler found for '" << prefix << "'" << dendl; + cmdctx->reply(-EINVAL, ss); + return true; + } + + dout(10) << "passing through command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl; + finisher.queue(new LambdaContext([this, cmdctx, session, py_command, prefix] + (int r_) mutable { + std::stringstream ss; + + dout(10) << "dispatching command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl; + + // Validate that the module is enabled + auto& py_handler_name = py_command.module_name; + PyModuleRef module = py_modules.get_module(py_handler_name); + ceph_assert(module); + if (!module->is_enabled()) { + ss << "Module '" << py_handler_name << "' is not enabled (required by " + "command '" << prefix << "'): use `ceph mgr module enable " + << py_handler_name << "` to enable it"; + dout(4) << ss.str() << dendl; + cmdctx->reply(-EOPNOTSUPP, ss); + return; + } + + // Hack: allow the self-test method to run on unhealthy modules. + // Fix this in future by creating a special path for self test rather + // than having the hook be a normal module command. 
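+ // e.g. for a hypothetical handler module "foo", "foo self-test" is still + // dispatched while the module is loaded but unhealthy; any other command from + // an unhealthy or unloaded module is rejected below with -EIO.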
+ std::string self_test_prefix = py_handler_name + " " + "self-test"; + + // Validate that the module is healthy + bool accept_command; + if (module->is_loaded()) { + if (module->get_can_run() && !module->is_failed()) { + // Healthy module + accept_command = true; + } else if (self_test_prefix == prefix) { + // Unhealthy, but allow because it's a self test command + accept_command = true; + } else { + accept_command = false; + ss << "Module '" << py_handler_name << "' has experienced an error and " + "cannot handle commands: " << module->get_error_string(); + } + } else { + // Module not loaded + accept_command = false; + ss << "Module '" << py_handler_name << "' failed to load and " + "cannot handle commands: " << module->get_error_string(); + } + + if (!accept_command) { + dout(4) << ss.str() << dendl; + cmdctx->reply(-EIO, ss); + return; + } + + std::stringstream ds; + bufferlist inbl = cmdctx->data; + int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap, + inbl, &ds, &ss); + if (r == -EACCES) { + log_access_denied(cmdctx, session, ss); + } + + cmdctx->odata.append(ds); + cmdctx->reply(r, ss); + dout(10) << " command returned " << r << dendl; + })); + return true; +} + +void DaemonServer::_prune_pending_service_map() +{ + utime_t cutoff = ceph_clock_now(); + cutoff -= g_conf().get_val<double>("mgr_service_beacon_grace"); + auto p = pending_service_map.services.begin(); + while (p != pending_service_map.services.end()) { + auto q = p->second.daemons.begin(); + while (q != p->second.daemons.end()) { + DaemonKey key{p->first, q->first}; + if (!daemon_state.exists(key)) { + if (ServiceMap::is_normal_ceph_entity(p->first)) { + dout(10) << "daemon " << key << " in service map but not in daemon state " + << "index -- force pruning" << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + derr << "missing key " << key << dendl; + ++q; + } + + continue; + } + + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + if (daemon->last_service_beacon == utime_t()) { + // we must have just restarted; assume they are alive now. + daemon->last_service_beacon = ceph_clock_now(); + ++q; + continue; + } + if (daemon->last_service_beacon < cutoff) { + dout(10) << "pruning stale " << p->first << "." 
<< q->first + << " last_beacon " << daemon->last_service_beacon << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++q; + } + } + if (p->second.daemons.empty()) { + p = pending_service_map.services.erase(p); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++p; + } + } +} + +void DaemonServer::send_report() +{ + if (!pgmap_ready) { + if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) { + pgmap_ready = true; + reported_osds.clear(); + dout(1) << "Giving up on OSDs that haven't reported yet, sending " + << "potentially incomplete PG state to mon" << dendl; + } else { + dout(1) << "Not sending PG status to monitor yet, waiting for OSDs" + << dendl; + return; + } + } + + auto m = ceph::make_message<MMonMgrReport>(); + m->gid = monc->get_global_id(); + py_modules.get_health_checks(&m->health_checks); + py_modules.get_progress_events(&m->progress_events); + + cluster_state.with_mutable_pgmap([&](PGMap& pg_map) { + cluster_state.update_delta_stats(); + + if (pending_service_map.epoch) { + _prune_pending_service_map(); + if (pending_service_map_dirty >= pending_service_map.epoch) { + pending_service_map.modified = ceph_clock_now(); + encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL); + dout(10) << "sending service_map e" << pending_service_map.epoch + << dendl; + pending_service_map.epoch++; + } + } + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // FIXME: no easy way to get mon features here. this will do for + // now, though, as long as we don't make a backward-incompat change. + pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL); + dout(10) << pg_map << dendl; + + pg_map.get_health_checks(g_ceph_context, osdmap, + &m->health_checks); + + dout(10) << m->health_checks.checks.size() << " health checks" + << dendl; + dout(20) << "health checks:\n"; + JSONFormatter jf(true); + jf.dump_object("health_checks", m->health_checks); + jf.flush(*_dout); + *_dout << dendl; + if (osdmap.require_osd_release >= ceph_release_t::luminous) { + clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map; + } + }); + }); + + map<daemon_metric, unique_ptr<DaemonHealthMetricCollector>> accumulated; + for (auto service : {"osd", "mon"} ) { + auto daemons = daemon_state.get_by_service(service); + for (const auto& [key,state] : daemons) { + std::lock_guard l{state->lock}; + for (const auto& metric : state->daemon_health_metrics) { + auto acc = accumulated.find(metric.get_type()); + if (acc == accumulated.end()) { + auto collector = DaemonHealthMetricCollector::create(metric.get_type()); + if (!collector) { + derr << __func__ << " " << key + << " sent me an unknown health metric: " + << std::hex << static_cast<uint8_t>(metric.get_type()) + << std::dec << dendl; + continue; + } + dout(20) << " + " << state->key << " " + << metric << dendl; + tie(acc, std::ignore) = accumulated.emplace(metric.get_type(), + std::move(collector)); + } + acc->second->update(key, metric); + } + } + } + for (const auto& acc : accumulated) { + acc.second->summarize(m->health_checks); + } + // TODO? We currently do not notify the PyModules + // TODO: respect needs_send, so we send the report only if we are asked to do + // so, or the state is updated. 
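+ // The report sent below carries the PG digest, the health checks raised by + // python modules and the PGMap, progress events, the accumulated daemon + // health metrics and, when dirty, the updated service map.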
+ monc->send_mon_message(std::move(m)); +} + +void DaemonServer::adjust_pgs() +{ + dout(20) << dendl; + unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs); + double max_misplaced = g_conf().get_val<double>("target_max_misplaced_ratio"); + bool aggro = g_conf().get_val<bool>("mgr_debug_aggressive_pg_num_changes"); + + map<string,unsigned> pg_num_to_set; + map<string,unsigned> pgp_num_to_set; + set<pg_t> upmaps_to_clear; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + unsigned creating_or_unknown = 0; + for (auto& i : pg_map.num_pg_by_state) { + if ((i.first & (PG_STATE_CREATING)) || + i.first == 0) { + creating_or_unknown += i.second; + } + } + unsigned left = max; + if (creating_or_unknown >= max) { + return; + } + left -= creating_or_unknown; + dout(10) << "creating_or_unknown " << creating_or_unknown + << " max_creating " << max + << " left " << left + << dendl; + + // FIXME: These checks are fundamentally racy given that adjust_pgs() + // can run more frequently than we get updated pg stats from OSDs. We + // may make multiple adjustments with stale information. + double misplaced_ratio, degraded_ratio; + double inactive_pgs_ratio, unknown_pgs_ratio; + pg_map.get_recovery_stats(&misplaced_ratio, &degraded_ratio, + &inactive_pgs_ratio, &unknown_pgs_ratio); + dout(20) << "misplaced_ratio " << misplaced_ratio + << " degraded_ratio " << degraded_ratio + << " inactive_pgs_ratio " << inactive_pgs_ratio + << " unknown_pgs_ratio " << unknown_pgs_ratio + << "; target_max_misplaced_ratio " << max_misplaced + << dendl; + + for (auto& i : osdmap.get_pools()) { + const pg_pool_t& p = i.second; + + // adjust pg_num? + if (p.get_pg_num_target() != p.get_pg_num()) { + dout(20) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " target " << p.get_pg_num_target() + << dendl; + if (p.has_flag(pg_pool_t::FLAG_CREATING)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - still creating initial pgs" + << dendl; + } else if (p.get_pg_num_target() < p.get_pg_num()) { + // pg_num decrease (merge) + pg_t merge_source(p.get_pg_num() - 1, i.first); + pg_t merge_target = merge_source.get_parent(); + bool ok = true; + + if (p.get_pg_num() != p.get_pg_num_pending()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease and pg_num_pending != pg_num, waiting" + << dendl; + ok = false; + } else if (p.get_pg_num() == p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease blocked by pgp_num " + << p.get_pgp_num() + << dendl; + ok = false; + } + vector<int32_t> source_acting; + for (auto &merge_participant : {merge_source, merge_target}) { + bool is_merge_source = merge_participant == merge_source; + if (osdmap.have_pg_upmaps(merge_participant)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " has upmap" << dendl; + upmaps_to_clear.insert(merge_participant); + ok = false; + } + auto q = pg_map.pg_stat.find(merge_participant); + if (q == pg_map.pg_stat.end()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - no state for " << merge_participant + << (is_merge_source ?
" (merge source)" : " (merge target)") + << dendl; + ok = false; + } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) != + (PG_STATE_ACTIVE | PG_STATE_CLEAN)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " not clean (" << pg_state_string(q->second.state) + << ")" << dendl; + ok = false; + } + if (is_merge_source) { + source_acting = q->second.acting; + } else if (ok && q->second.acting != source_acting) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " acting does not match (source " << source_acting + << " != target " << q->second.acting + << ")" << dendl; + ok = false; + } + } + + if (ok) { + unsigned target = p.get_pg_num() - 1; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target + << " (merging " << merge_source + << " and " << merge_target + << ")" << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + continue; + } + } else if (p.get_pg_num_target() > p.get_pg_num()) { + // pg_num increase (split) + bool active = true; + auto q = pg_map.num_pg_by_pool_state.find(i.first); + if (q != pg_map.num_pg_by_pool_state.end()) { + for (auto& j : q->second) { + if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) { + dout(20) << "pool " << i.first << " has " << j.second + << " pgs in " << pg_state_string(j.first) + << dendl; + active = false; + break; + } + } + } else { + active = false; + } + unsigned pg_gap = p.get_pg_num() - p.get_pgp_num(); + unsigned max_jump = cct->_conf->mgr_max_pg_num_change; + if (!active) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - not all pgs active" + << dendl; + } else if (pg_gap >= max_jump) { + dout(10) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " - pgp_num " << p.get_pgp_num() + << " gap >= max_pg_num_change " << max_jump + << " - must scale pgp_num first" + << dendl; + } else { + unsigned add = std::min( + std::min(left, max_jump - pg_gap), + p.get_pg_num_target() - p.get_pg_num()); + unsigned target = p.get_pg_num() + add; + left -= add; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + } + } + } + + // adjust pgp_num? 
+ unsigned target = std::min(p.get_pg_num_pending(), + p.get_pgp_num_target()); + if (target != p.get_pgp_num()) { + dout(20) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << target << dendl; + if (target > p.get_pgp_num() && + p.get_pgp_num() == p.get_pg_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - increase blocked by pg_num " << p.get_pg_num() + << dendl; + } else if (!aggro && (inactive_pgs_ratio > 0 || + degraded_ratio > 0 || + unknown_pgs_ratio > 0)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - inactive|degraded|unknown pgs, deferring pgp_num" + << " update" << dendl; + } else if (!aggro && (misplaced_ratio > max_misplaced)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - misplaced_ratio " << misplaced_ratio + << " > max " << max_misplaced + << ", deferring pgp_num update" << dendl; + } else { + // NOTE: this calculation assumes objects are + // basically uniformly distributed across all PGs + // (regardless of pool), which is probably not + // perfectly correct, but it's a start. make no + // single adjustment that's more than half of the + // max_misplaced, to somewhat limit the magnitude of + // our potential error here. + int next; + static constexpr unsigned MAX_NUM_OBJECTS_PER_PG_FOR_LEAP = 1; + pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first); + if (aggro || + // pool is (virtually) empty; just jump to final pgp_num? + (p.get_pgp_num_target() > p.get_pgp_num() && + s.stats.sum.num_objects <= (MAX_NUM_OBJECTS_PER_PG_FOR_LEAP * + p.get_pgp_num_target()))) { + next = target; + } else { + double room = + std::min<double>(max_misplaced - misplaced_ratio, + max_misplaced / 2.0); + unsigned estmax = std::max<unsigned>( + (double)p.get_pg_num() * room, 1u); + unsigned next_min = 0; + if (p.get_pgp_num() > estmax) { + next_min = p.get_pgp_num() - estmax; + } + next = std::clamp(target, + next_min, + p.get_pgp_num() + estmax); + dout(20) << " room " << room << " estmax " << estmax + << " delta " << (target-p.get_pgp_num()) + << " next " << next << dendl; + if (p.get_pgp_num_target() == p.get_pg_num_target() && + p.get_pgp_num_target() < p.get_pg_num()) { + // since pgp_num is tracking pg_num, ceph is handling + // pgp_num. so, be responsible: don't let pgp_num get + // too far out ahead of merges (if we are merging). + // this avoids moving lots of unmerged pgs onto a + // small number of OSDs where we might blow out the + // per-osd pg max. 
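+ // Illustrative numbers only: with pg_num 1024 and the default + // target_max_misplaced_ratio of 0.05, max_outpace_merges below is + // max(8, 51) = 51, so pgp_num is kept within 51 of pg_num until the + // merges catch up.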
+ unsigned max_outpace_merges = + std::max<unsigned>(8, p.get_pg_num() * max_misplaced); + if (next + max_outpace_merges < p.get_pg_num()) { + next = p.get_pg_num() - max_outpace_merges; + dout(10) << " using next " << next + << " to avoid outpacing merges (max_outpace_merges " + << max_outpace_merges << ")" << dendl; + } + } + if (next != p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << next << dendl; + pgp_num_to_set[osdmap.get_pool_name(i.first)] = next; + } + } + } + if (left == 0) { + return; + } + } + }); + for (auto i : pg_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pg_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto i : pgp_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pgp_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto pg : upmaps_to_clear) { + const string cmd = + "{" + "\"prefix\": \"osd rm-pg-upmap\", " + "\"pgid\": \"" + stringify(pg) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + const string cmd2 = + "{" + "\"prefix\": \"osd rm-pg-upmap-items\", " + "\"pgid\": \"" + stringify(pg) + "\"" + + "}"; + monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr); + } +} + +void DaemonServer::got_service_map() +{ + std::lock_guard l(lock); + + cluster_state.with_servicemap([&](const ServiceMap& service_map) { + if (pending_service_map.epoch == 0) { + // we just started up + dout(10) << "got initial map e" << service_map.epoch << dendl; + ceph_assert(pending_service_map_dirty == 0); + pending_service_map = service_map; + pending_service_map.epoch = service_map.epoch + 1; + } else if (pending_service_map.epoch <= service_map.epoch) { + // we just started up but received a newer map that is not our own + dout(10) << "got newer initial map e" << service_map.epoch << dendl; + ceph_assert(pending_service_map_dirty == 0); + pending_service_map = service_map; + pending_service_map.epoch = service_map.epoch + 1; + } else { + // we are already active and therefore must have persisted it, + // which means ours is the same or newer.
+ dout(10) << "got updated map e" << service_map.epoch << dendl; + } + }); + + // cull missing daemons, populate new ones + std::set<std::string> types; + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + types.insert(type); + + std::set<std::string> names; + for (auto& q : service.daemons) { + names.insert(q.first); + DaemonKey key{type, q.first}; + if (!daemon_state.exists(key)) { + auto daemon = std::make_shared<DaemonState>(daemon_state.types); + daemon->key = key; + daemon->set_metadata(q.second.metadata); + daemon->service_daemon = true; + daemon_state.insert(daemon); + dout(10) << "added missing " << key << dendl; + } + } + daemon_state.cull(type, names); + } + daemon_state.cull_services(types); +} + +void DaemonServer::got_mgr_map() +{ + std::lock_guard l(lock); + set<std::string> have; + cluster_state.with_mgrmap([&](const MgrMap& mgrmap) { + auto md_update = [&] (DaemonKey key) { + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + // FIXME remove post-nautilus: include 'id' for luminous mons + oss << "{\"prefix\": \"mgr metadata\", \"who\": \"" + << key.name << "\", \"id\": \"" << key.name << "\"}"; + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + }; + if (mgrmap.active_name.size()) { + DaemonKey key{"mgr", mgrmap.active_name}; + have.insert(mgrmap.active_name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + for (auto& i : mgrmap.standbys) { + DaemonKey key{"mgr", i.second.name}; + have.insert(i.second.name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + }); + daemon_state.cull("mgr", have); +} + +const char** DaemonServer::get_tracked_conf_keys() const +{ + static const char *KEYS[] = { + "mgr_stats_threshold", + "mgr_stats_period", + nullptr + }; + + return KEYS; +} + +void DaemonServer::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + + if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) { + dout(4) << "Updating stats threshold/period on " + << daemon_connections.size() << " clients" << dendl; + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + _send_configure(c); + } + })); + } +} + +void DaemonServer::_send_configure(ConnectionRef c) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + auto configure = make_message<MMgrConfigure>(); + configure->stats_period = g_conf().get_val<int64_t>("mgr_stats_period"); + configure->stats_threshold = g_conf().get_val<int64_t>("mgr_stats_threshold"); + + if (c->peer_is_osd()) { + configure->osd_perf_metric_queries = + osd_perf_metric_collector.get_queries(); + } else if (c->peer_is_mds()) { + configure->metric_config_message = + MetricConfigMessage(MDSConfigPayload(mds_perf_metric_collector.get_queries())); + } + + c->send_message2(configure); +} + +MetricQueryID DaemonServer::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit) +{ + return osd_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_osd_perf_query(MetricQueryID 
query_id) +{ + return osd_perf_metric_collector.remove_query(query_id); +} + +int DaemonServer::get_osd_perf_counters(OSDPerfCollector *collector) +{ + return osd_perf_metric_collector.get_counters(collector); +} + +MetricQueryID DaemonServer::add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit) +{ + return mds_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_mds_perf_query(MetricQueryID query_id) +{ + return mds_perf_metric_collector.remove_query(query_id); +} + +void DaemonServer::reregister_mds_perf_queries() +{ + mds_perf_metric_collector.reregister_queries(); +} + +int DaemonServer::get_mds_perf_counters(MDSPerfCollector *collector) +{ + return mds_perf_metric_collector.get_counters(collector); +} diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h new file mode 100644 index 000000000..a4cf990bd --- /dev/null +++ b/src/mgr/DaemonServer.h @@ -0,0 +1,316 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_SERVER_H_ +#define DAEMON_SERVER_H_ + +#include "PyModuleRegistry.h" + +#include <set> +#include <string> +#include <boost/variant.hpp> + +#include "common/ceph_mutex.h" +#include "common/LogClient.h" +#include "common/Timer.h" + +#include <msg/Messenger.h> +#include <mon/MonClient.h> + +#include "ServiceMap.h" +#include "MgrSession.h" +#include "DaemonState.h" +#include "MetricCollector.h" +#include "OSDPerfMetricCollector.h" +#include "MDSPerfMetricCollector.h" + +class MMgrReport; +class MMgrOpen; +class MMgrUpdate; +class MMgrClose; +class MMonMgrReport; +class MCommand; +class MMgrCommand; +struct MonCommand; +class CommandContext; +struct OSDPerfMetricQuery; +struct MDSPerfMetricQuery; + + +struct offline_pg_report { + set<int> osds; + set<pg_t> ok, not_ok, unknown; + set<pg_t> ok_become_degraded, ok_become_more_degraded; // ok + set<pg_t> bad_no_pool, bad_already_inactive, bad_become_inactive; // not ok + + bool ok_to_stop() const { + return not_ok.empty() && unknown.empty(); + } + + void dump(Formatter *f) const { + f->dump_bool("ok_to_stop", ok_to_stop()); + f->open_array_section("osds"); + for (auto o : osds) { + f->dump_int("osd", o); + } + f->close_section(); + f->dump_unsigned("num_ok_pgs", ok.size()); + f->dump_unsigned("num_not_ok_pgs", not_ok.size()); + + // ambiguous + if (!unknown.empty()) { + f->open_array_section("unknown_pgs"); + for (auto pg : unknown) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + + // bad news + if (!bad_no_pool.empty()) { + f->open_array_section("bad_no_pool_pgs"); + for (auto pg : bad_no_pool) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!bad_already_inactive.empty()) { + f->open_array_section("bad_already_inactive"); + for (auto pg : bad_already_inactive) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!bad_become_inactive.empty()) { + f->open_array_section("bad_become_inactive"); + for (auto pg : bad_become_inactive) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + + // informative + if (!ok_become_degraded.empty()) { + f->open_array_section("ok_become_degraded"); + for (auto pg : 
ok_become_degraded) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!ok_become_more_degraded.empty()) { + f->open_array_section("ok_become_more_degraded"); + for (auto pg : ok_become_more_degraded) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + } +}; + + +/** + * Server used in ceph-mgr to communicate with Ceph daemons like + * MDSs and OSDs. + */ +class DaemonServer : public Dispatcher, public md_config_obs_t +{ +protected: + boost::scoped_ptr<Throttle> client_byte_throttler; + boost::scoped_ptr<Throttle> client_msg_throttler; + boost::scoped_ptr<Throttle> osd_byte_throttler; + boost::scoped_ptr<Throttle> osd_msg_throttler; + boost::scoped_ptr<Throttle> mds_byte_throttler; + boost::scoped_ptr<Throttle> mds_msg_throttler; + boost::scoped_ptr<Throttle> mon_byte_throttler; + boost::scoped_ptr<Throttle> mon_msg_throttler; + + Messenger *msgr; + MonClient *monc; + Finisher &finisher; + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + PyModuleRegistry &py_modules; + LogChannelRef clog, audit_clog; + + // Connections for daemons, and clients with service names set + // (i.e. those MgrClients that are allowed to send MMgrReports) + std::set<ConnectionRef> daemon_connections; + + /// connections for osds + ceph::unordered_map<int,set<ConnectionRef>> osd_cons; + + ServiceMap pending_service_map; // uncommitted + + epoch_t pending_service_map_dirty = 0; + + ceph::mutex lock = ceph::make_mutex("DaemonServer"); + + static void _generate_command_map(cmdmap_t& cmdmap, + map<string,string> ¶m_str_map); + static const MonCommand *_get_mgrcommand(const string &cmd_prefix, + const std::vector<MonCommand> &commands); + bool _allowed_command( + MgrSession *s, const string &service, const string &module, + const string &prefix, const cmdmap_t& cmdmap, + const map<string,string>& param_str_map, + const MonCommand *this_cmd); + +private: + friend class ReplyOnFinish; + bool _reply(MCommand* m, + int ret, const std::string& s, const bufferlist& payload); + + void _prune_pending_service_map(); + + void _check_offlines_pgs( + const set<int>& osds, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report); + void _maximize_ok_to_stop_set( + const set<int>& orig_osds, + unsigned max, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report); + + utime_t started_at; + std::atomic<bool> pgmap_ready; + std::set<int32_t> reported_osds; + void maybe_ready(int32_t osd_id); + + SafeTimer timer; + bool shutting_down; + Context *tick_event; + void tick(); + void schedule_tick_locked(double delay_sec); + + class OSDPerfMetricCollectorListener : public MetricListener { + public: + OSDPerfMetricCollectorListener(DaemonServer *server) + : server(server) { + } + void handle_query_updated() override { + server->handle_osd_perf_metric_query_updated(); + } + private: + DaemonServer *server; + }; + OSDPerfMetricCollectorListener osd_perf_metric_collector_listener; + OSDPerfMetricCollector osd_perf_metric_collector; + void handle_osd_perf_metric_query_updated(); + + class MDSPerfMetricCollectorListener : public MetricListener { + public: + MDSPerfMetricCollectorListener(DaemonServer *server) + : server(server) { + } + void handle_query_updated() override { + server->handle_mds_perf_metric_query_updated(); + } + private: + DaemonServer *server; + }; + MDSPerfMetricCollectorListener mds_perf_metric_collector_listener; + MDSPerfMetricCollector mds_perf_metric_collector; + void handle_mds_perf_metric_query_updated(); + + void handle_metric_payload(const 
OSDMetricPayload &payload) { + osd_perf_metric_collector.process_reports(payload); + } + + void handle_metric_payload(const MDSMetricPayload &payload) { + mds_perf_metric_collector.process_reports(payload); + } + + void handle_metric_payload(const UnknownMetricPayload &payload) { + ceph_abort(); + } + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + DaemonServer *server; + + HandlePayloadVisitor(DaemonServer *server) + : server(server) { + } + + template <typename MetricPayload> + inline void operator()(const MetricPayload &payload) const { + server->handle_metric_payload(payload); + } + }; + + void update_task_status(DaemonKey key, + const std::map<std::string,std::string>& task_status); + +public: + int init(uint64_t gid, entity_addrvec_t client_addrs); + void shutdown(); + + entity_addrvec_t get_myaddrs() const; + + DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef cl, + LogChannelRef auditcl); + ~DaemonServer() override; + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + int ms_handle_authentication(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + void fetch_missing_metadata(const DaemonKey& key, const entity_addr_t& addr); + bool handle_open(const ceph::ref_t<MMgrOpen>& m); + bool handle_update(const ceph::ref_t<MMgrUpdate>& m); + bool handle_close(const ceph::ref_t<MMgrClose>& m); + bool handle_report(const ceph::ref_t<MMgrReport>& m); + bool handle_command(const ceph::ref_t<MCommand>& m); + bool handle_command(const ceph::ref_t<MMgrCommand>& m); + bool _handle_command(std::shared_ptr<CommandContext>& cmdctx); + void send_report(); + void got_service_map(); + void got_mgr_map(); + void adjust_pgs(); + + void _send_configure(ConnectionRef c); + + MetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit); + int remove_osd_perf_query(MetricQueryID query_id); + int get_osd_perf_counters(OSDPerfCollector *collector); + + MetricQueryID add_mds_perf_query(const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit); + int remove_mds_perf_query(MetricQueryID query_id); + void reregister_mds_perf_queries(); + int get_mds_perf_counters(MDSPerfCollector *collector); + + virtual const char** get_tracked_conf_keys() const override; + virtual void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + + void schedule_tick(double delay_sec); + + void log_access_denied(std::shared_ptr<CommandContext>& cmdctx, + MgrSession* session, std::stringstream& ss); + void dump_pg_ready(ceph::Formatter *f); +}; + +#endif + diff --git a/src/mgr/DaemonState.cc b/src/mgr/DaemonState.cc new file mode 100644 index 000000000..32cbbe3b9 --- /dev/null +++ b/src/mgr/DaemonState.cc @@ -0,0 +1,381 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "DaemonState.h" + +#include <experimental/iterator> + +#include "MgrSession.h" +#include "include/stringify.h" +#include "common/Formatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +void DeviceState::set_metadata(map<string,string>&& m) +{ + metadata = std::move(m); + auto p = metadata.find("life_expectancy_min"); + if (p != metadata.end()) { + life_expectancy.first.parse(p->second); + } + p = metadata.find("life_expectancy_max"); + if (p != metadata.end()) { + life_expectancy.second.parse(p->second); + } + p = metadata.find("life_expectancy_stamp"); + if (p != metadata.end()) { + life_expectancy_stamp.parse(p->second); + } + p = metadata.find("wear_level"); + if (p != metadata.end()) { + wear_level = atof(p->second.c_str()); + } +} + +void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now) +{ + life_expectancy = make_pair(from, to); + life_expectancy_stamp = now; + if (from != utime_t()) { + metadata["life_expectancy_min"] = stringify(from); + } else { + metadata["life_expectancy_min"] = ""; + } + if (to != utime_t()) { + metadata["life_expectancy_max"] = stringify(to); + } else { + metadata["life_expectancy_max"] = ""; + } + if (now != utime_t()) { + metadata["life_expectancy_stamp"] = stringify(now); + } else { + metadata["life_expectancy_stamp"] = ""; + } +} + +void DeviceState::rm_life_expectancy() +{ + life_expectancy = make_pair(utime_t(), utime_t()); + life_expectancy_stamp = utime_t(); + metadata.erase("life_expectancy_min"); + metadata.erase("life_expectancy_max"); + metadata.erase("life_expectancy_stamp"); +} + +void DeviceState::set_wear_level(float wear) +{ + wear_level = wear; + if (wear >= 0) { + metadata["wear_level"] = stringify(wear); + } else { + metadata.erase("wear_level"); + } +} + +string DeviceState::get_life_expectancy_str(utime_t now) const +{ + if (life_expectancy.first == utime_t()) { + return string(); + } + if (now >= life_expectancy.first) { + return "now"; + } + utime_t min = life_expectancy.first - now; + utime_t max = life_expectancy.second - now; + if (life_expectancy.second == utime_t()) { + return string(">") + timespan_str(make_timespan(min)); + } + string a = timespan_str(make_timespan(min)); + string b = timespan_str(make_timespan(max)); + if (a == b) { + return a; + } + return a + " to " + b; +} + +void DeviceState::dump(Formatter *f) const +{ + f->dump_string("devid", devid); + f->open_array_section("location"); + for (auto& i : attachments) { + f->open_object_section("attachment"); + f->dump_string("host", std::get<0>(i)); + f->dump_string("dev", std::get<1>(i)); + f->dump_string("path", std::get<2>(i)); + f->close_section(); + } + f->close_section(); + f->open_array_section("daemons"); + for (auto& i : daemons) { + f->dump_stream("daemon") << i; + } + f->close_section(); + if (life_expectancy.first != utime_t()) { + f->dump_stream("life_expectancy_min") << life_expectancy.first; + f->dump_stream("life_expectancy_max") << life_expectancy.second; + f->dump_stream("life_expectancy_stamp") + << life_expectancy_stamp; + } + if (wear_level >= 0) { + f->dump_float("wear_level", wear_level); + } +} + +void DeviceState::print(ostream& out) const +{ + out << "device " << devid << "\n"; + for (auto& i : attachments) { + out << "attachment " << std::get<0>(i) << " " << std::get<1>(i) << " " + << std::get<2>(i) << "\n"; + out << "\n"; + } + std::copy(std::begin(daemons), std::end(daemons), + 
std::experimental::make_ostream_joiner(out, ",")); + out << '\n'; + if (life_expectancy.first != utime_t()) { + out << "life_expectancy " << life_expectancy.first << " to " + << life_expectancy.second + << " (as of " << life_expectancy_stamp << ")\n"; + } + if (wear_level >= 0) { + out << "wear_level " << wear_level << "\n"; + } +} + +void DaemonStateIndex::insert(DaemonStatePtr dm) +{ + std::unique_lock l{lock}; + _insert(dm); +} + +void DaemonStateIndex::_insert(DaemonStatePtr dm) +{ + if (all.count(dm->key)) { + _erase(dm->key); + } + + by_server[dm->hostname][dm->key] = dm; + all[dm->key] = dm; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + d->daemons.insert(dm->key); + auto p = dm->devices_bypath.find(i.first); + if (p != dm->devices_bypath.end()) { + d->attachments.insert(std::make_tuple(dm->hostname, i.second, p->second)); + } else { + d->attachments.insert(std::make_tuple(dm->hostname, i.second, + std::string())); + } + } +} + +void DaemonStateIndex::_erase(const DaemonKey& dmk) +{ + ceph_assert(ceph_mutex_is_wlocked(lock)); + + const auto to_erase = all.find(dmk); + ceph_assert(to_erase != all.end()); + const auto dm = to_erase->second; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + ceph_assert(d->daemons.count(dmk)); + d->daemons.erase(dmk); + auto p = dm->devices_bypath.find(i.first); + if (p != dm->devices_bypath.end()) { + d->attachments.erase(make_tuple(dm->hostname, i.second, p->second)); + } else { + d->attachments.erase(make_tuple(dm->hostname, i.second, std::string())); + } + if (d->empty()) { + _erase_device(d); + } + } + + auto &server_collection = by_server[dm->hostname]; + server_collection.erase(dm->key); + if (server_collection.empty()) { + by_server.erase(dm->hostname); + } + + all.erase(to_erase); +} + +DaemonStateCollection DaemonStateIndex::get_by_service( + const std::string& svc) const +{ + std::shared_lock l{lock}; + + DaemonStateCollection result; + + for (const auto& [key, state] : all) { + if (key.type == svc) { + result[key] = state; + } + } + + return result; +} + +DaemonStateCollection DaemonStateIndex::get_by_server( + const std::string &hostname) const +{ + std::shared_lock l{lock}; + + if (auto found = by_server.find(hostname); found != by_server.end()) { + return found->second; + } else { + return {}; + } +} + +bool DaemonStateIndex::exists(const DaemonKey &key) const +{ + std::shared_lock l{lock}; + + return all.count(key) > 0; +} + +DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key) +{ + std::shared_lock l{lock}; + + auto iter = all.find(key); + if (iter != all.end()) { + return iter->second; + } else { + return nullptr; + } +} + +void DaemonStateIndex::rm(const DaemonKey &key) +{ + std::unique_lock l{lock}; + _rm(key); +} + +void DaemonStateIndex::_rm(const DaemonKey &key) +{ + if (all.count(key)) { + _erase(key); + } +} + +void DaemonStateIndex::cull(const std::string& svc_name, + const std::set<std::string>& names_exist) +{ + std::vector<string> victims; + + std::unique_lock l{lock}; + auto begin = all.lower_bound({svc_name, ""}); + auto end = all.end(); + for (auto &i = begin; i != end; ++i) { + const auto& daemon_key = i->first; + if (daemon_key.type != svc_name) + break; + if (names_exist.count(daemon_key.name) == 0) { + victims.push_back(daemon_key.name); + } + } + + for (auto &i : victims) { + DaemonKey daemon_key{svc_name, i}; + dout(4) << "Removing data for " << daemon_key << dendl; + _erase(daemon_key); + } +} + +void DaemonStateIndex::cull_services(const 
std::set<std::string>& types_exist) +{ + std::set<DaemonKey> victims; + + std::unique_lock l{lock}; + for (auto it = all.begin(); it != all.end(); ++it) { + const auto& daemon_key = it->first; + if (it->second->service_daemon && + types_exist.count(daemon_key.type) == 0) { + victims.insert(daemon_key); + } + } + + for (auto &i : victims) { + dout(4) << "Removing data for " << i << dendl; + _erase(i); + } +} + +void DaemonPerfCounters::update(const MMgrReport& report) +{ + dout(20) << "loading " << report.declare_types.size() << " new types, " + << report.undeclare_types.size() << " old types, had " + << types.size() << " types, got " + << report.packed.length() << " bytes of data" << dendl; + + // Retrieve session state + auto priv = report.get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + + // Load any newly declared types + for (const auto &t : report.declare_types) { + types.insert(std::make_pair(t.path, t)); + session->declared_types.insert(t.path); + } + // Remove any old types + for (const auto &t : report.undeclare_types) { + session->declared_types.erase(t); + } + + const auto now = ceph_clock_now(); + + // Parse packed data according to declared set of types + auto p = report.packed.cbegin(); + DECODE_START(1, p); + for (const auto &t_path : session->declared_types) { + const auto &t = types.at(t_path); + auto instances_it = instances.find(t_path); + // Always check the instance exists, as we don't prevent yet + // multiple sessions from daemons with the same name, and one + // session clearing stats created by another on open. + if (instances_it == instances.end()) { + instances_it = instances.insert({t_path, t.type}).first; + } + uint64_t val = 0; + uint64_t avgcount = 0; + uint64_t avgcount2 = 0; + + decode(val, p); + if (t.type & PERFCOUNTER_LONGRUNAVG) { + decode(avgcount, p); + decode(avgcount2, p); + instances_it->second.push_avg(now, val, avgcount); + } else { + instances_it->second.push(now, val); + } + } + DECODE_FINISH(p); +} + +void PerfCounterInstance::push(utime_t t, uint64_t const &v) +{ + buffer.push_back({t, v}); +} + +void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s, + uint64_t const &c) +{ + avg_buffer.push_back({t, s, c}); +} diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h new file mode 100644 index 000000000..8c21305a9 --- /dev/null +++ b/src/mgr/DaemonState.h @@ -0,0 +1,409 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_STATE_H_ +#define DAEMON_STATE_H_ + +#include <map> +#include <string> +#include <memory> +#include <set> +#include <boost/circular_buffer.hpp> + +#include "common/RWLock.h" +#include "include/str_map.h" + +#include "msg/msg_types.h" + +// For PerfCounterType +#include "messages/MMgrReport.h" +#include "DaemonKey.h" + +namespace ceph { + class Formatter; +} + +// An instance of a performance counter type, within +// a particular daemon. 
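+//
+// Only a short sliding window of samples is kept: plain counters are stored
+// in `buffer`, while PERFCOUNTER_LONGRUNAVG counters store (sum, count)
+// pairs in `avg_buffer`; both are bounded circular buffers of 20 entries
+// (see the constructor below).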
+class PerfCounterInstance +{ + class DataPoint + { + public: + utime_t t; + uint64_t v; + DataPoint(utime_t t_, uint64_t v_) + : t(t_), v(v_) + {} + }; + + class AvgDataPoint + { + public: + utime_t t; + uint64_t s; + uint64_t c; + AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_) + : t(t_), s(s_), c(c_) + {} + }; + + boost::circular_buffer<DataPoint> buffer; + boost::circular_buffer<AvgDataPoint> avg_buffer; + + uint64_t get_current() const; + + public: + const boost::circular_buffer<DataPoint> & get_data() const + { + return buffer; + } + const DataPoint& get_latest_data() const + { + return buffer.back(); + } + const boost::circular_buffer<AvgDataPoint> & get_data_avg() const + { + return avg_buffer; + } + const AvgDataPoint& get_latest_data_avg() const + { + return avg_buffer.back(); + } + void push(utime_t t, uint64_t const &v); + void push_avg(utime_t t, uint64_t const &s, uint64_t const &c); + + PerfCounterInstance(enum perfcounter_type_d type) + { + if (type & PERFCOUNTER_LONGRUNAVG) + avg_buffer = boost::circular_buffer<AvgDataPoint>(20); + else + buffer = boost::circular_buffer<DataPoint>(20); + }; +}; + + +typedef std::map<std::string, PerfCounterType> PerfCounterTypes; + +// Performance counters for one daemon +class DaemonPerfCounters +{ + public: + // The record of perf stat types, shared between daemons + PerfCounterTypes &types; + + explicit DaemonPerfCounters(PerfCounterTypes &types_) + : types(types_) + {} + + std::map<std::string, PerfCounterInstance> instances; + + void update(const MMgrReport& report); + + void clear() + { + instances.clear(); + } +}; + +// The state that we store about one daemon +class DaemonState +{ + public: + ceph::mutex lock = ceph::make_mutex("DaemonState::lock"); + + DaemonKey key; + + // The hostname where daemon was last seen running (extracted + // from the metadata) + std::string hostname; + + // The metadata (hostname, version, etc) sent from the daemon + std::map<std::string, std::string> metadata; + + /// device ids -> devname, derived from metadata[device_ids] + std::map<std::string,std::string> devices; + + /// device ids -> by-path, derived from metadata[device_ids] + std::map<std::string,std::string> devices_bypath; + + // TODO: this can be generalized to other daemons + std::vector<DaemonHealthMetric> daemon_health_metrics; + + // Ephemeral state + bool service_daemon = false; + utime_t service_status_stamp; + std::map<std::string, std::string> service_status; + utime_t last_service_beacon; + + // running config + std::map<std::string,std::map<int32_t,std::string>> config; + + // mon config values we failed to set + std::map<std::string,std::string> ignored_mon_config; + + // compiled-in config defaults (rarely used, so we leave them encoded!) 
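+  // (decoded lazily into config_defaults by _get_config_defaults() below)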
+ bufferlist config_defaults_bl; + std::map<std::string,std::string> config_defaults; + + // The perf counters received in MMgrReport messages + DaemonPerfCounters perf_counters; + + explicit DaemonState(PerfCounterTypes &types_) + : perf_counters(types_) + { + } + + void set_metadata(const std::map<std::string,std::string>& m) { + devices.clear(); + devices_bypath.clear(); + metadata = m; + auto p = m.find("device_ids"); + if (p != m.end()) { + map<std::string,std::string> devs, paths; // devname -> id or path + get_str_map(p->second, &devs, ",; "); + auto q = m.find("device_paths"); + if (q != m.end()) { + get_str_map(q->second, &paths, ",; "); + } + for (auto& i : devs) { + if (i.second.size()) { // skip blank ids + devices[i.second] = i.first; // id -> devname + auto j = paths.find(i.first); + if (j != paths.end()) { + devices_bypath[i.second] = j->second; // id -> path + } + } + } + } + p = m.find("hostname"); + if (p != m.end()) { + hostname = p->second; + } + } + + const std::map<std::string,std::string>& _get_config_defaults() { + if (config_defaults.empty() && + config_defaults_bl.length()) { + auto p = config_defaults_bl.cbegin(); + try { + decode(config_defaults, p); + } catch (buffer::error& e) { + } + } + return config_defaults; + } +}; + +typedef std::shared_ptr<DaemonState> DaemonStatePtr; +typedef std::map<DaemonKey, DaemonStatePtr> DaemonStateCollection; + + +struct DeviceState : public RefCountedObject +{ + std::string devid; + /// (server,devname,path) + std::set<std::tuple<std::string,std::string,std::string>> attachments; + std::set<DaemonKey> daemons; + + std::map<string,string> metadata; ///< persistent metadata + + pair<utime_t,utime_t> life_expectancy; ///< when device failure is expected + utime_t life_expectancy_stamp; ///< when life expectency was recorded + float wear_level = -1; ///< SSD wear level (negative if unknown) + + void set_metadata(map<string,string>&& m); + + void set_life_expectancy(utime_t from, utime_t to, utime_t now); + void rm_life_expectancy(); + + void set_wear_level(float wear); + + string get_life_expectancy_str(utime_t now) const; + + /// true of we can be safely forgotten/removed from memory + bool empty() const { + return daemons.empty() && metadata.empty(); + } + + void dump(Formatter *f) const; + void print(ostream& out) const; + +private: + FRIEND_MAKE_REF(DeviceState); + DeviceState(const std::string& n) : devid(n) {} +}; + +/** + * Fuse the collection of per-daemon metadata from Ceph into + * a view that can be queried by service type, ID or also + * by server (aka fqdn). + */ +class DaemonStateIndex +{ +private: + mutable ceph::shared_mutex lock = + ceph::make_shared_mutex("DaemonStateIndex", true, true, true); + + std::map<std::string, DaemonStateCollection> by_server; + DaemonStateCollection all; + std::set<DaemonKey> updating; + + std::map<std::string,ceph::ref_t<DeviceState>> devices; + + void _erase(const DaemonKey& dmk); + + ceph::ref_t<DeviceState> _get_or_create_device(const std::string& dev) { + auto em = devices.try_emplace(dev, nullptr); + auto& d = em.first->second; + if (em.second) { + d = ceph::make_ref<DeviceState>(dev); + } + return d; + } + void _erase_device(const ceph::ref_t<DeviceState>& d) { + devices.erase(d->devid); + } + +public: + DaemonStateIndex() {} + + // FIXME: shouldn't really be public, maybe construct DaemonState + // objects internally to avoid this. 
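+  // Shared registry of declared perf counter types; DaemonState objects are
+  // constructed with a reference to it (see DaemonState and
+  // DaemonPerfCounters above).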
+ PerfCounterTypes types; + + void insert(DaemonStatePtr dm); + void _insert(DaemonStatePtr dm); + bool exists(const DaemonKey &key) const; + DaemonStatePtr get(const DaemonKey &key); + void rm(const DaemonKey &key); + void _rm(const DaemonKey &key); + + // Note that these return by value rather than reference to avoid + // callers needing to stay in lock while using result. Callers must + // still take the individual DaemonState::lock on each entry though. + DaemonStateCollection get_by_server(const std::string &hostname) const; + DaemonStateCollection get_by_service(const std::string &svc_name) const; + DaemonStateCollection get_all() const {return all;} + + template<typename Callback, typename...Args> + auto with_daemons_by_server(Callback&& cb, Args&&... args) const -> + decltype(cb(by_server, std::forward<Args>(args)...)) { + std::shared_lock l{lock}; + + return std::forward<Callback>(cb)(by_server, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + bool with_device(const std::string& dev, + Callback&& cb, Args&&... args) const { + std::shared_lock l{lock}; + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...); + return true; + } + + template<typename Callback, typename...Args> + bool with_device_write(const std::string& dev, + Callback&& cb, Args&&... args) { + std::unique_lock l{lock}; + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...); + if (p->second->empty()) { + _erase_device(p->second); + } + return true; + } + + template<typename Callback, typename...Args> + void with_device_create(const std::string& dev, + Callback&& cb, Args&&... args) { + std::unique_lock l{lock}; + auto d = _get_or_create_device(dev); + std::forward<Callback>(cb)(*d, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + void with_devices(Callback&& cb, Args&&... args) const { + std::shared_lock l{lock}; + for (auto& i : devices) { + std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...); + } + } + + template<typename CallbackInitial, typename Callback, typename...Args> + void with_devices2(CallbackInitial&& cbi, // with lock taken + Callback&& cb, // for each device + Args&&... args) const { + std::shared_lock l{lock}; + cbi(); + for (auto& i : devices) { + std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...); + } + } + + void list_devids_by_server(const std::string& server, + std::set<std::string> *ls) { + auto m = get_by_server(server); + for (auto& i : m) { + std::lock_guard l(i.second->lock); + for (auto& j : i.second->devices) { + ls->insert(j.first); + } + } + } + + void notify_updating(const DaemonKey &k) { + std::unique_lock l{lock}; + updating.insert(k); + } + void clear_updating(const DaemonKey &k) { + std::unique_lock l{lock}; + updating.erase(k); + } + bool is_updating(const DaemonKey &k) { + std::shared_lock l{lock}; + return updating.count(k) > 0; + } + + void update_metadata(DaemonStatePtr state, + const map<string,string>& meta) { + // remove and re-insert in case the device metadata changed + std::unique_lock l{lock}; + _rm(state->key); + { + std::lock_guard l2{state->lock}; + state->set_metadata(meta); + } + _insert(state); + } + + /** + * Remove state for all daemons of this type whose names are + * not present in `names_exist`. 
Use this function when you have
+   * a cluster map and want to ensure that anything absent in the map
+   * is also absent in this class.
+   */
+  void cull(const std::string& svc_name,
+            const std::set<std::string>& names_exist);
+  void cull_services(const std::set<std::string>& types_exist);
+};
+
+#endif
+
diff --git a/src/mgr/Gil.cc b/src/mgr/Gil.cc
new file mode 100644
index 000000000..de27b9acd
--- /dev/null
+++ b/src/mgr/Gil.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "Python.h"
+
+#include "common/debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+#include "Gil.h"
+
+SafeThreadState::SafeThreadState(PyThreadState *ts_)
+  : ts(ts_)
+{
+  ceph_assert(ts != nullptr);
+  thread = pthread_self();
+}
+
+Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts)
+{
+  // Acquire the GIL, set the current thread state
+  PyEval_RestoreThread(pThreadState.ts);
+  dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl;
+
+  //
+  // If called from a separate OS thread (i.e. a thread not created
+  // by Python, that doesn't already have a python thread state that
+  // was created when that thread was active), we need to manually
+  // create and switch to a python thread state specifically for this
+  // OS thread.
+  //
+  // Note that instead of requiring the caller to set new_thread == true
+  // when calling this from a separate OS thread, we could figure out
+  // if this was necessary automatically, as follows:
+  //
+  //   if (pThreadState->thread_id != PyThread_get_thread_ident()) {
+  //
+  // However, this means we're accessing pThreadState->thread_id, but
+  // the Python C API docs say that "The only public data member is
+  // PyInterpreterState *interp", i.e. doing this would violate
+  // something that's meant to be a black box.
+ // + if (new_thread) { + pNewThreadState = PyThreadState_New(pThreadState.ts->interp); + PyThreadState_Swap(pNewThreadState); + dout(20) << "Switched to new thread state " << pNewThreadState << dendl; + } else { + ceph_assert(pthread_self() == pThreadState.thread); + } +} + +Gil::~Gil() +{ + if (pNewThreadState != nullptr) { + dout(20) << "Destroying new thread state " << pNewThreadState << dendl; + PyThreadState_Swap(pThreadState.ts); + PyThreadState_Clear(pNewThreadState); + PyThreadState_Delete(pNewThreadState); + } + // Release the GIL, reset the thread state to NULL + PyEval_SaveThread(); + dout(25) << "GIL released for thread state " << pThreadState.ts << dendl; +} + +without_gil_t::without_gil_t() +{ + assert(PyGILState_Check()); + release_gil(); +} + +without_gil_t::~without_gil_t() +{ + if (save) { + acquire_gil(); + } +} + +void without_gil_t::release_gil() +{ + save = PyEval_SaveThread(); +} + +void without_gil_t::acquire_gil() +{ + assert(save); + PyEval_RestoreThread(save); + save = nullptr; +} + +with_gil_t::with_gil_t(without_gil_t& allow_threads) + : allow_threads{allow_threads} +{ + allow_threads.acquire_gil(); +} + +with_gil_t::~with_gil_t() +{ + allow_threads.release_gil(); +} diff --git a/src/mgr/Gil.h b/src/mgr/Gil.h new file mode 100644 index 000000000..72675a503 --- /dev/null +++ b/src/mgr/Gil.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <cassert> +#include <functional> + +struct _ts; +typedef struct _ts PyThreadState; + +#include <pthread.h> + + +/** + * Wrap PyThreadState to carry a record of which POSIX thread + * the thread state relates to. This allows the Gil class to + * validate that we're being used from the right thread. + */ +class SafeThreadState +{ + public: + explicit SafeThreadState(PyThreadState *ts_); + + SafeThreadState() + : ts(nullptr), thread(0) + { + } + + PyThreadState *ts; + pthread_t thread; + + void set(PyThreadState *ts_) + { + ts = ts_; + thread = pthread_self(); + } +}; + +// +// Use one of these in any scope in which you need to hold Python's +// Global Interpreter Lock. +// +// Do *not* nest these, as a second GIL acquire will deadlock (see +// https://docs.python.org/2/c-api/init.html#c.PyEval_RestoreThread) +// +// If in doubt, explicitly put a scope around the block of code you +// know you need the GIL in. +// +// See the comment in Gil::Gil for when to set new_thread == true +// +class Gil { +public: + Gil(const Gil&) = delete; + Gil& operator=(const Gil&) = delete; + + Gil(SafeThreadState &ts, bool new_thread = false); + ~Gil(); + +private: + SafeThreadState &pThreadState; + PyThreadState *pNewThreadState = nullptr; +}; + +// because the Python runtime could relinquish the GIL when performing GC +// and re-acquire it afterwards, we should enforce following locking policy: +// 1. do not acquire locks when holding the GIL, use a without_gil or +// without_gil_t to guard the code which acquires non-gil locks. +// 2. always hold a GIL when calling python functions, for example, when +// constructing a PyFormatter instance. 
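+//
+// For example (an illustrative sketch only; `some_lock` stands in for any
+// non-GIL mutex):
+//
+//   without_gil_t no_gil;           // must be created while holding the GIL
+//   std::lock_guard l(some_lock);   // safe: the GIL was released above
+//   with_gil(no_gil, [&] {
+//     // GIL re-acquired here; safe to call into Python
+//   });                             // GIL released again when with_gil returns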
+// +// a wrapper that provides a convenient RAII-style mechinary for acquiring +// and releasing GIL, like the macros of Py_BEGIN_ALLOW_THREADS and +// Py_END_ALLOW_THREADS. +struct without_gil_t { + without_gil_t(); + ~without_gil_t(); + void release_gil(); + void acquire_gil(); +private: + PyThreadState *save = nullptr; + friend struct with_gil_t; +}; + +struct with_gil_t { + with_gil_t(without_gil_t& allow_threads); + ~with_gil_t(); +private: + without_gil_t& allow_threads; +}; + +// invoke func with GIL acquired +template<typename Func> +auto with_gil(without_gil_t& no_gil, Func&& func) { + with_gil_t gil{no_gil}; + return std::invoke(std::forward<Func>(func)); +} + +template<typename Func> +auto without_gil(Func&& func) { + without_gil_t no_gil; + return std::invoke(std::forward<Func>(func)); +} diff --git a/src/mgr/MDSPerfMetricCollector.cc b/src/mgr/MDSPerfMetricCollector.cc new file mode 100644 index 000000000..62298aba3 --- /dev/null +++ b/src/mgr/MDSPerfMetricCollector.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "messages/MMgrReport.h" +#include "mgr/MDSPerfMetricTypes.h" +#include "mgr/MDSPerfMetricCollector.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.mds_perf_metric_collector " << __func__ << " " + +MDSPerfMetricCollector::MDSPerfMetricCollector(MetricListener &listener) + : MetricCollector<MDSPerfMetricQuery, + MDSPerfMetricLimit, + MDSPerfMetricKey, + MDSPerfMetrics>(listener) { +} + +void MDSPerfMetricCollector::process_reports(const MetricPayload &payload) { + const MDSPerfMetricReport &metric_report = boost::get<MDSMetricPayload>(payload).metric_report; + + std::lock_guard locker(lock); + process_reports_generic( + metric_report.reports, [](PerformanceCounter *counter, const PerformanceCounter &update) { + counter->first = update.first; + counter->second = update.second; + }); + + // update delayed rank set + delayed_ranks = metric_report.rank_metrics_delayed; + dout(20) << ": delayed ranks=[" << delayed_ranks << "]" << dendl; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &last_updated_mono); +} + +int MDSPerfMetricCollector::get_counters(PerfCollector *collector) { + MDSPerfCollector *c = static_cast<MDSPerfCollector *>(collector); + + std::lock_guard locker(lock); + + int r = get_counters_generic(c->query_id, &c->counters); + if (r != 0) { + return r; + } + + get_delayed_ranks(&c->delayed_ranks); + + get_last_updated(&c->last_updated_mono); + return r; +} + +void MDSPerfMetricCollector::get_delayed_ranks(std::set<mds_rank_t> *ranks) { + ceph_assert(ceph_mutex_is_locked(lock)); + *ranks = delayed_ranks; +} + +void MDSPerfMetricCollector::get_last_updated(utime_t *ts) { + ceph_assert(ceph_mutex_is_locked(lock)); + *ts = utime_t(last_updated_mono); +} diff --git a/src/mgr/MDSPerfMetricCollector.h b/src/mgr/MDSPerfMetricCollector.h new file mode 100644 index 000000000..c72bce091 --- /dev/null +++ b/src/mgr/MDSPerfMetricCollector.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MDS_PERF_COLLECTOR_H +#define CEPH_MGR_MDS_PERF_COLLECTOR_H + +#include "mgr/MetricCollector.h" +#include "mgr/MDSPerfMetricTypes.h" + +// MDS performance query class +class MDSPerfMetricCollector + : public MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey, + 
MDSPerfMetrics> { +private: + std::set<mds_rank_t> delayed_ranks; + struct timespec last_updated_mono; + + void get_delayed_ranks(std::set<mds_rank_t> *ranks); + + void get_last_updated(utime_t *ts); +public: + MDSPerfMetricCollector(MetricListener &listener); + + void process_reports(const MetricPayload &payload) override; + int get_counters(PerfCollector *collector) override; +}; + +#endif // CEPH_MGR_MDS_PERF_COLLECTOR_H diff --git a/src/mgr/MDSPerfMetricTypes.cc b/src/mgr/MDSPerfMetricTypes.cc new file mode 100644 index 000000000..a16003774 --- /dev/null +++ b/src/mgr/MDSPerfMetricTypes.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <ostream> +#include "mgr/MDSPerfMetricTypes.h" + +std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d) { + switch (d.type) { + case MDSPerfMetricSubKeyType::MDS_RANK: + os << "mds_rank"; + break; + case MDSPerfMetricSubKeyType::CLIENT_ID: + os << "client_id"; + break; + default: + os << "unknown (" << static_cast<int>(d.type) << ")"; + } + + return os << "~/" << d.regex_str << "/"; +} + +void MDSPerformanceCounterDescriptor::pack_counter( + const PerformanceCounter &c, bufferlist *bl) const { + using ceph::encode; + encode(c.first, *bl); + encode(c.second, *bl); + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +void MDSPerformanceCounterDescriptor::unpack_counter( + bufferlist::const_iterator& bl, PerformanceCounter *c) const { + using ceph::decode; + decode(c->first, bl); + decode(c->second, bl); + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + 
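+// The stream operators below are used for debug logging, e.g. the
+// "query=..., limit=..." output in MetricCollector::add_query().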
+std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d) { + switch(d.type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + os << "cap_hit_metric"; + break; + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + os << "read_latency_metric"; + break; + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + os << "write_latency_metric"; + break; + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + os << "metadata_latency_metric"; + break; + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + os << "dentry_lease_metric"; + break; + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + os << "opened_files_metric"; + break; + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + os << "pinned_icaps_metric"; + break; + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + os << "opened_inodes_metric"; + break; + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + os << "read_io_sizes_metric"; + break; + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + os << "write_io_sizes_metric"; + break; + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + os << "avg_read_latency"; + break; + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + os << "stdev_read_latency"; + break; + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + os << "avg_write_latency"; + break; + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + os << "stdev_write_latency"; + break; + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + os << "avg_metadata_latency"; + break; + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + os << "stdev_metadata_latency"; + break; + } + + return os; +} + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit) { + return os << "[order_by=" << limit.order_by << ", max_count=" << limit.max_count << "]"; +} + +void MDSPerfMetricQuery::pack_counters(const PerformanceCounters &counters, + bufferlist *bl) const { + auto it = counters.begin(); + for (auto &descriptor : performance_counter_descriptors) { + if (it == counters.end()) { + descriptor.pack_counter(PerformanceCounter(), bl); + } else { + descriptor.pack_counter(*it, bl); + it++; + } + } +} + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query) { + return os << "[key=" << query.key_descriptor << ", counter=" + << query.performance_counter_descriptors << "]"; +} diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h new file mode 100644 index 000000000..aa35b8cab --- /dev/null +++ b/src/mgr/MDSPerfMetricTypes.h @@ -0,0 +1,367 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MDS_PERF_METRIC_TYPES_H +#define CEPH_MGR_MDS_PERF_METRIC_TYPES_H + +#include <regex> +#include <vector> +#include <iostream> + +#include "include/denc.h" +#include "include/stringify.h" + +#include "mds/mdstypes.h" +#include "mgr/Types.h" + +typedef std::vector<std::string> MDSPerfMetricSubKey; // array of regex match +typedef std::vector<MDSPerfMetricSubKey> MDSPerfMetricKey; + +enum class MDSPerfMetricSubKeyType : uint8_t { + MDS_RANK = 0, + CLIENT_ID = 1, +}; + +struct MDSPerfMetricSubKeyDescriptor { + MDSPerfMetricSubKeyType type = static_cast<MDSPerfMetricSubKeyType>(-1); + std::string regex_str; + std::regex regex; + + bool is_supported() const { + switch (type) { + case MDSPerfMetricSubKeyType::MDS_RANK: + case MDSPerfMetricSubKeyType::CLIENT_ID: + return true; + default: + return false; + } + } + + 
MDSPerfMetricSubKeyDescriptor() { + } + MDSPerfMetricSubKeyDescriptor(MDSPerfMetricSubKeyType type, const std::string ®ex_str) + : type(type), regex_str(regex_str) { + } + + bool operator<(const MDSPerfMetricSubKeyDescriptor &other) const { + if (type < other.type) { + return true; + } + if (type > other.type) { + return false; + } + return regex_str < other.regex_str; + } + + DENC(MDSPerfMetricSubKeyDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.regex_str, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(MDSPerfMetricSubKeyDescriptor) + +std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d); +typedef std::vector<MDSPerfMetricSubKeyDescriptor> MDSPerfMetricKeyDescriptor; + +template<> +struct denc_traits<MDSPerfMetricKeyDescriptor> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const MDSPerfMetricKeyDescriptor& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const MDSPerfMetricKeyDescriptor& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(MDSPerfMetricKeyDescriptor& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + MDSPerfMetricSubKeyDescriptor d; + denc(d, p); + if (!d.is_supported()) { + v.clear(); + return; + } + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + v.clear(); + return; + } + if (d.regex.mark_count() == 0) { + v.clear(); + return; + } + v.push_back(std::move(d)); + } + } +}; + +enum class MDSPerformanceCounterType : uint8_t { + CAP_HIT_METRIC = 0, + READ_LATENCY_METRIC = 1, + WRITE_LATENCY_METRIC = 2, + METADATA_LATENCY_METRIC = 3, + DENTRY_LEASE_METRIC = 4, + OPENED_FILES_METRIC = 5, + PINNED_ICAPS_METRIC = 6, + OPENED_INODES_METRIC = 7, + READ_IO_SIZES_METRIC = 8, + WRITE_IO_SIZES_METRIC = 9, + AVG_READ_LATENCY_METRIC = 10, + STDEV_READ_LATENCY_METRIC = 11, + AVG_WRITE_LATENCY_METRIC = 12, + STDEV_WRITE_LATENCY_METRIC = 13, + AVG_METADATA_LATENCY_METRIC = 14, + STDEV_METADATA_LATENCY_METRIC = 15, +}; + +struct MDSPerformanceCounterDescriptor { + MDSPerformanceCounterType type = static_cast<MDSPerformanceCounterType>(-1); + + bool is_supported() const { + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + return true; + default: + return false; + 
} + } + + MDSPerformanceCounterDescriptor() { + } + MDSPerformanceCounterDescriptor(MDSPerformanceCounterType type) : type(type) { + } + + bool operator<(const MDSPerformanceCounterDescriptor &other) const { + return type < other.type; + } + + bool operator==(const MDSPerformanceCounterDescriptor &other) const { + return type == other.type; + } + + bool operator!=(const MDSPerformanceCounterDescriptor &other) const { + return type != other.type; + } + + DENC(MDSPerformanceCounterDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + DENC_FINISH(p); + } + + void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const; + void unpack_counter(ceph::buffer::list::const_iterator& bl, PerformanceCounter *c) const; +}; +WRITE_CLASS_DENC(MDSPerformanceCounterDescriptor) + +std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d); +typedef std::vector<MDSPerformanceCounterDescriptor> MDSPerformanceCounterDescriptors; + +template<> +struct denc_traits<MDSPerformanceCounterDescriptors> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const MDSPerformanceCounterDescriptors& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const MDSPerformanceCounterDescriptors& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(MDSPerformanceCounterDescriptors& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + MDSPerformanceCounterDescriptor d; + denc(d, p); + if (d.is_supported()) { + v.push_back(std::move(d)); + } + } + } +}; + +struct MDSPerfMetricLimit { + MDSPerformanceCounterDescriptor order_by; + uint64_t max_count; + + MDSPerfMetricLimit() { + } + MDSPerfMetricLimit(const MDSPerformanceCounterDescriptor &order_by, uint64_t max_count) + : order_by(order_by), max_count(max_count) { + } + + bool operator<(const MDSPerfMetricLimit &other) const { + if (order_by != other.order_by) { + return order_by < other.order_by; + } + + return max_count < other.max_count; + } + + DENC(MDSPerfMetricLimit, v, p) { + DENC_START(1, 1, p); + denc(v.order_by, p); + denc(v.max_count, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(MDSPerfMetricLimit) + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit); +typedef std::set<MDSPerfMetricLimit> MDSPerfMetricLimits; + +struct MDSPerfMetricQuery { + MDSPerfMetricKeyDescriptor key_descriptor; + MDSPerformanceCounterDescriptors performance_counter_descriptors; + + MDSPerfMetricQuery() { + } + MDSPerfMetricQuery(const MDSPerfMetricKeyDescriptor &key_descriptor, + const MDSPerformanceCounterDescriptors &performance_counter_descriptors) + : key_descriptor(key_descriptor), + performance_counter_descriptors(performance_counter_descriptors) + { + } + + bool operator<(const MDSPerfMetricQuery &other) const { + if (key_descriptor < other.key_descriptor) { + return true; + } + if (key_descriptor > other.key_descriptor) { + return false; + } + return performance_counter_descriptors < other.performance_counter_descriptors; + } + + template <typename L> + bool get_key(L&& get_sub_key, MDSPerfMetricKey *key) const { + for (auto &sub_key_descriptor : 
key_descriptor) { + MDSPerfMetricSubKey sub_key; + if (!get_sub_key(sub_key_descriptor, &sub_key)) { + return false; + } + key->push_back(sub_key); + } + return true; + } + + void get_performance_counter_descriptors(MDSPerformanceCounterDescriptors *descriptors) const { + *descriptors = performance_counter_descriptors; + } + + template <typename L> + void update_counters(L &&update_counter, PerformanceCounters *counters) const { + auto it = counters->begin(); + for (auto &descriptor : performance_counter_descriptors) { + // TODO: optimize + if (it == counters->end()) { + counters->push_back(PerformanceCounter()); + it = std::prev(counters->end()); + } + update_counter(descriptor, &(*it)); + it++; + } + } + + DENC(MDSPerfMetricQuery, v, p) { + DENC_START(1, 1, p); + denc(v.key_descriptor, p); + denc(v.performance_counter_descriptors, p); + DENC_FINISH(p); + } + + void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; +}; +WRITE_CLASS_DENC(MDSPerfMetricQuery) + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query); + +struct MDSPerfCollector : PerfCollector { + std::map<MDSPerfMetricKey, PerformanceCounters> counters; + std::set<mds_rank_t> delayed_ranks; + utime_t last_updated_mono; + + MDSPerfCollector(MetricQueryID query_id) + : PerfCollector(query_id) { + } +}; + +struct MDSPerfMetrics { + MDSPerformanceCounterDescriptors performance_counter_descriptors; + std::map<MDSPerfMetricKey, ceph::buffer::list> group_packed_performance_counters; + + DENC(MDSPerfMetrics, v, p) { + DENC_START(1, 1, p); + denc(v.performance_counter_descriptors, p); + denc(v.group_packed_performance_counters, p); + DENC_FINISH(p); + } +}; + +struct MDSPerfMetricReport { + std::map<MDSPerfMetricQuery, MDSPerfMetrics> reports; + // set of active ranks that have delayed (stale) metrics + std::set<mds_rank_t> rank_metrics_delayed; + + DENC(MDSPerfMetricReport, v, p) { + DENC_START(1, 1, p); + denc(v.reports, p); + denc(v.rank_metrics_delayed, p); + DENC_FINISH(p); + } +}; + +WRITE_CLASS_DENC(MDSPerfMetrics) +WRITE_CLASS_DENC(MDSPerfMetricReport) + +#endif // CEPH_MGR_MDS_PERF_METRIC_TYPES_H diff --git a/src/mgr/MetricCollector.cc b/src/mgr/MetricCollector.cc new file mode 100644 index 000000000..c31dcf0b9 --- /dev/null +++ b/src/mgr/MetricCollector.cc @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "mgr/MetricCollector.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "mgr/MDSPerfMetricTypes.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.metric_collector " << __func__ << ": " + +template <typename Query, typename Limit, typename Key, typename Report> +MetricCollector<Query, Limit, Key, Report>::MetricCollector(MetricListener &listener) + : listener(listener) +{ +} + +template <typename Query, typename Limit, typename Key, typename Report> +MetricQueryID MetricCollector<Query, Limit, Key, Report>::add_query( + const Query &query, + const std::optional<Limit> &limit) { + dout(20) << "query=" << query << ", limit=" << limit << dendl; + uint64_t query_id; + bool notify = false; + + { + std::lock_guard locker(lock); + + query_id = next_query_id++; + auto it = queries.find(query); + if (it == queries.end()) { + it = queries.emplace(query, std::map<MetricQueryID, OptionalLimit>{}).first; + notify = true; + } else if (is_limited(it->second)) { + notify 
= true; + } + + it->second.emplace(query_id, limit); + counters.emplace(query_id, std::map<Key, PerformanceCounters>{}); + } + + dout(10) << query << " " << (limit ? stringify(*limit) : "unlimited") + << " query_id=" << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return query_id; +} + +template <typename Query, typename Limit, typename Key, typename Report> +int MetricCollector<Query, Limit, Key, Report>::remove_query(MetricQueryID query_id) { + dout(20) << "query_id=" << query_id << dendl; + bool found = false; + bool notify = false; + + { + std::lock_guard locker(lock); + + for (auto it = queries.begin() ; it != queries.end();) { + auto iter = it->second.find(query_id); + if (iter == it->second.end()) { + ++it; + continue; + } + + it->second.erase(iter); + if (it->second.empty()) { + it = queries.erase(it); + notify = true; + } else if (is_limited(it->second)) { + ++it; + notify = true; + } + found = true; + break; + } + counters.erase(query_id); + } + + if (!found) { + dout(10) << query_id << " not found" << dendl; + return -ENOENT; + } + + dout(10) << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return 0; +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::remove_all_queries() { + dout(20) << dendl; + bool notify; + + { + std::lock_guard locker(lock); + + notify = !queries.empty(); + queries.clear(); + } + + if (notify) { + listener.handle_query_updated(); + } +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::reregister_queries() { + dout(20) << dendl; + listener.handle_query_updated(); +} + +template <typename Query, typename Limit, typename Key, typename Report> +int MetricCollector<Query, Limit, Key, Report>::get_counters_generic( + MetricQueryID query_id, std::map<Key, PerformanceCounters> *c) { + dout(20) << dendl; + ceph_assert(ceph_mutex_is_locked(lock)); + + auto it = counters.find(query_id); + if (it == counters.end()) { + dout(10) << "counters for " << query_id << " not found" << dendl; + return -ENOENT; + } + + *c = std::move(it->second); + it->second.clear(); + + return 0; +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::process_reports_generic( + const std::map<Query, Report> &reports, UpdateCallback callback) { + ceph_assert(ceph_mutex_is_locked(lock)); + + if (reports.empty()) { + return; + } + + for (auto& [query, report] : reports) { + dout(10) << "report for " << query << " query: " + << report.group_packed_performance_counters.size() << " records" + << dendl; + + for (auto& [key, bl] : report.group_packed_performance_counters) { + auto bl_it = bl.cbegin(); + + for (auto& p : queries[query]) { + auto &key_counters = counters[p.first][key]; + if (key_counters.empty()) { + key_counters.resize(query.performance_counter_descriptors.size(), + {0, 0}); + } + } + + auto desc_it = report.performance_counter_descriptors.begin(); + for (size_t i = 0; i < query.performance_counter_descriptors.size(); i++) { + if (desc_it == report.performance_counter_descriptors.end()) { + break; + } + if (*desc_it != query.performance_counter_descriptors[i]) { + continue; + } + PerformanceCounter c; + desc_it->unpack_counter(bl_it, &c); + dout(20) << "counter " << key << " " << *desc_it << ": " << c << dendl; + + for (auto& p : queries[query]) { + auto &key_counters = counters[p.first][key]; + 
callback(&key_counters[i], c); + } + desc_it++; + } + } + } +} + +template class +MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey, OSDPerfMetricReport>; +template class +MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey, MDSPerfMetrics>; diff --git a/src/mgr/MetricCollector.h b/src/mgr/MetricCollector.h new file mode 100644 index 000000000..91fa78781 --- /dev/null +++ b/src/mgr/MetricCollector.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_METRIC_COLLECTOR_H +#define CEPH_MGR_METRIC_COLLECTOR_H + +#include <map> +#include <set> +#include <tuple> +#include <vector> +#include <utility> +#include <algorithm> + +#include "common/ceph_mutex.h" +#include "msg/Message.h" +#include "mgr/Types.h" +#include "mgr/MetricTypes.h" + +class MMgrReport; + +template <typename Query, typename Limit, typename Key, typename Report> +class MetricCollector { +public: + virtual ~MetricCollector() { + } + + using Limits = std::set<Limit>; + + MetricCollector(MetricListener &listener); + + MetricQueryID add_query(const Query &query, const std::optional<Limit> &limit); + + int remove_query(MetricQueryID query_id); + + void remove_all_queries(); + + void reregister_queries(); + + std::map<Query, Limits> get_queries() const { + std::lock_guard locker(lock); + + std::map<Query, Limits> result; + for (auto& [query, limits] : queries) { + auto result_it = result.insert({query, {}}).first; + if (is_limited(limits)) { + for (auto& limit : limits) { + if (limit.second) { + result_it->second.insert(*limit.second); + } + } + } + } + + return result; + } + + virtual void process_reports(const MetricPayload &payload) = 0; + virtual int get_counters(PerfCollector *collector) = 0; + +protected: + typedef std::optional<Limit> OptionalLimit; + typedef std::map<MetricQueryID, OptionalLimit> QueryIDLimit; + typedef std::map<Query, QueryIDLimit> Queries; + typedef std::map<MetricQueryID, std::map<Key, PerformanceCounters>> Counters; + typedef std::function<void(PerformanceCounter *, const PerformanceCounter &)> UpdateCallback; + + mutable ceph::mutex lock = ceph::make_mutex("mgr::metric::collector::lock"); + + Queries queries; + Counters counters; + + void process_reports_generic(const std::map<Query, Report> &reports, UpdateCallback callback); + int get_counters_generic(MetricQueryID query_id, std::map<Key, PerformanceCounters> *counters); + +private: + MetricListener &listener; + MetricQueryID next_query_id = 0; + + bool is_limited(const std::map<MetricQueryID, OptionalLimit> &limits) const { + return std::any_of(begin(limits), end(limits), + [](auto &limits) { return limits.second.has_value(); }); + } +}; + +#endif // CEPH_MGR_METRIC_COLLECTOR_H diff --git a/src/mgr/MetricTypes.h b/src/mgr/MetricTypes.h new file mode 100644 index 000000000..586c470ca --- /dev/null +++ b/src/mgr/MetricTypes.h @@ -0,0 +1,277 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_METRIC_TYPES_H +#define CEPH_MGR_METRIC_TYPES_H + +#include <boost/variant.hpp> +#include "include/denc.h" +#include "include/ceph_features.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "mgr/MDSPerfMetricTypes.h" + +enum class MetricReportType { + METRIC_REPORT_TYPE_OSD = 0, + METRIC_REPORT_TYPE_MDS = 1, +}; + +struct OSDMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_OSD; + std::map<OSDPerfMetricQuery, 
OSDPerfMetricReport> report; + + OSDMetricPayload() { + } + OSDMetricPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &report) + : report(report) { + } + + DENC(OSDMetricPayload, v, p) { + DENC_START(1, 1, p); + denc(v.report, p); + DENC_FINISH(p); + } +}; + +struct MDSMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_MDS; + MDSPerfMetricReport metric_report; + + MDSMetricPayload() { + } + MDSMetricPayload(const MDSPerfMetricReport &metric_report) + : metric_report(metric_report) { + } + + DENC(MDSMetricPayload, v, p) { + DENC_START(1, 1, p); + denc(v.metric_report, p); + DENC_FINISH(p); + } +}; + +struct UnknownMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = static_cast<MetricReportType>(-1); + + UnknownMetricPayload() { } + + DENC(UnknownMetricPayload, v, p) { + ceph_abort(); + } +}; + +WRITE_CLASS_DENC(OSDMetricPayload) +WRITE_CLASS_DENC(MDSMetricPayload) +WRITE_CLASS_DENC(UnknownMetricPayload) + +typedef boost::variant<OSDMetricPayload, + MDSMetricPayload, + UnknownMetricPayload> MetricPayload; + +class EncodeMetricPayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodeMetricPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { + } + + template <typename MetricPayload> + inline void operator()(const MetricPayload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(MetricPayload::METRIC_REPORT_TYPE), m_bl); + encode(payload, m_bl); + } + +private: + ceph::buffer::list &m_bl; +}; + +class DecodeMetricPayloadVisitor : public boost::static_visitor<void> { +public: + DecodeMetricPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) { + } + + template <typename MetricPayload> + inline void operator()(MetricPayload &payload) const { + using ceph::decode; + decode(payload, m_iter); + } + +private: + ceph::buffer::list::const_iterator &m_iter; +}; + +struct MetricReportMessage { + MetricPayload payload; + + MetricReportMessage(const MetricPayload &payload = UnknownMetricPayload()) + : payload(payload) { + } + + bool should_encode(uint64_t features) const { + if (!HAVE_FEATURE(features, SERVER_PACIFIC) && + boost::get<MDSMetricPayload>(&payload)) { + return false; + } + return true; + } + + void encode(ceph::buffer::list &bl) const { + boost::apply_visitor(EncodeMetricPayloadVisitor(bl), payload); + } + + void decode(ceph::buffer::list::const_iterator &iter) { + using ceph::decode; + + uint32_t metric_report_type; + decode(metric_report_type, iter); + + switch (static_cast<MetricReportType>(metric_report_type)) { + case MetricReportType::METRIC_REPORT_TYPE_OSD: + payload = OSDMetricPayload(); + break; + case MetricReportType::METRIC_REPORT_TYPE_MDS: + payload = MDSMetricPayload(); + break; + default: + payload = UnknownMetricPayload(); + break; + } + + boost::apply_visitor(DecodeMetricPayloadVisitor(iter), payload); + } +}; + +WRITE_CLASS_ENCODER(MetricReportMessage); + +// variant for sending configure message to mgr clients + +enum MetricConfigType { + METRIC_CONFIG_TYPE_OSD = 0, + METRIC_CONFIG_TYPE_MDS = 1, +}; + +struct OSDConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_OSD; + std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> config; + + OSDConfigPayload() { + } + OSDConfigPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &config) + : config(config) { + } + + DENC(OSDConfigPayload, v, p) { + DENC_START(1, 1, p); + denc(v.config, p); + DENC_FINISH(p); + } +}; + +struct 
MDSConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_MDS; + std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> config; + + MDSConfigPayload() { + } + MDSConfigPayload(const std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> &config) + : config(config) { + } + + DENC(MDSConfigPayload, v, p) { + DENC_START(1, 1, p); + denc(v.config, p); + DENC_FINISH(p); + } +}; + +struct UnknownConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = static_cast<MetricConfigType>(-1); + + UnknownConfigPayload() { } + + DENC(UnknownConfigPayload, v, p) { + ceph_abort(); + } +}; + +WRITE_CLASS_DENC(OSDConfigPayload) +WRITE_CLASS_DENC(MDSConfigPayload) +WRITE_CLASS_DENC(UnknownConfigPayload) + +typedef boost::variant<OSDConfigPayload, + MDSConfigPayload, + UnknownConfigPayload> ConfigPayload; + +class EncodeConfigPayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodeConfigPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { + } + + template <typename ConfigPayload> + inline void operator()(const ConfigPayload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(ConfigPayload::METRIC_CONFIG_TYPE), m_bl); + encode(payload, m_bl); + } + +private: + ceph::buffer::list &m_bl; +}; + +class DecodeConfigPayloadVisitor : public boost::static_visitor<void> { +public: + DecodeConfigPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) { + } + + template <typename ConfigPayload> + inline void operator()(ConfigPayload &payload) const { + using ceph::decode; + decode(payload, m_iter); + } + +private: + ceph::buffer::list::const_iterator &m_iter; +}; + +struct MetricConfigMessage { + ConfigPayload payload; + + MetricConfigMessage(const ConfigPayload &payload = UnknownConfigPayload()) + : payload(payload) { + } + + bool should_encode(uint64_t features) const { + if (!HAVE_FEATURE(features, SERVER_PACIFIC) && + boost::get<MDSConfigPayload>(&payload)) { + return false; + } + return true; + } + + void encode(ceph::buffer::list &bl) const { + boost::apply_visitor(EncodeConfigPayloadVisitor(bl), payload); + } + + void decode(ceph::buffer::list::const_iterator &iter) { + using ceph::decode; + + uint32_t metric_config_type; + decode(metric_config_type, iter); + + switch (metric_config_type) { + case MetricConfigType::METRIC_CONFIG_TYPE_OSD: + payload = OSDConfigPayload(); + break; + case MetricConfigType::METRIC_CONFIG_TYPE_MDS: + payload = MDSConfigPayload(); + break; + default: + payload = UnknownConfigPayload(); + break; + } + + boost::apply_visitor(DecodeConfigPayloadVisitor(iter), payload); + } +}; + +WRITE_CLASS_ENCODER(MetricConfigMessage); + +#endif // CEPH_MGR_METRIC_TYPES_H diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc new file mode 100644 index 000000000..bf9eae2e7 --- /dev/null +++ b/src/mgr/Mgr.cc @@ -0,0 +1,795 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include <Python.h> + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/errno.h" +#include "mon/MonClient.h" +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#include "mgr/MgrContext.h" + +#include "DaemonServer.h" +#include "messages/MMgrDigest.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MLog.h" +#include "messages/MServiceMap.h" +#include "messages/MKVData.h" +#include "PyModule.h" +#include "Mgr.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) : + monc(monc_), + objecter(objecter_), + client(client_), + client_messenger(clientm_), + finisher(g_ceph_context, "Mgr", "mgr-fin"), + digest_received(false), + py_module_registry(py_module_registry_), + cluster_state(monc, nullptr, mgrmap), + server(monc, finisher, daemon_state, cluster_state, *py_module_registry, + clog_, audit_clog_), + clog(clog_), + audit_clog(audit_clog_), + initialized(false), + initializing(false) +{ + cluster_state.set_objecter(objecter); +} + + +Mgr::~Mgr() +{ +} + +void MetadataUpdate::finish(int r) +{ + daemon_state.clear_updating(key); + if (r == 0) { + if (key.type == "mds" || key.type == "osd" || + key.type == "mgr" || key.type == "mon") { + json_spirit::mValue json_result; + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + dout(1) << "mon returned invalid JSON for " << key << dendl; + return; + } + if (json_result.type() != json_spirit::obj_type) { + dout(1) << "mon returned valid JSON " << key + << " but not an object: '" << outbl.to_str() << "'" << dendl; + return; + } + dout(4) << "mon returned valid metadata JSON for " << key << dendl; + + json_spirit::mObject daemon_meta = json_result.get_obj(); + + // Skip daemon who doesn't have hostname yet + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry for " << key << dendl; + return; + } + + // Apply any defaults + for (const auto &i : defaults) { + if (daemon_meta.find(i.first) == daemon_meta.end()) { + daemon_meta[i.first] = i.second; + } + } + + if (daemon_state.exists(key)) { + DaemonStatePtr state = daemon_state.get(key); + map<string,string> m; + { + std::lock_guard l(state->lock); + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.type == "mds" || key.type == "mgr" || key.type == "mon") { + daemon_meta.erase("name"); + } else if (key.type == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + for (const auto &[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + } + daemon_state.update_metadata(state, m); + } else { + auto state = std::make_shared<DaemonState>(daemon_state.types); + state->key = key; + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.type == "mds" || key.type == "mgr" || key.type == "mon") { + daemon_meta.erase("name"); + } else if (key.type == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + + map<string,string> m; + for (const auto &[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + state->set_metadata(m); + + daemon_state.insert(state); + } + } else { + ceph_abort(); + } + } else { + dout(1) << "mon failed 
to return metadata for " << key + << ": " << cpp_strerror(r) << dendl; + } +} + +void Mgr::background_init(Context *completion) +{ + std::lock_guard l(lock); + ceph_assert(!initializing); + ceph_assert(!initialized); + initializing = true; + + finisher.start(); + + finisher.queue(new LambdaContext([this, completion](int r){ + init(); + completion->complete(0); + })); +} + +std::map<std::string, std::string> Mgr::load_store() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + dout(10) << "listing keys" << dendl; + JSONCommand cmd; + cmd.run(monc, "{\"prefix\": \"config-key ls\"}"); + lock.unlock(); + cmd.wait(); + lock.lock(); + ceph_assert(cmd.r == 0); + + std::map<std::string, std::string> loaded; + + for (auto &key_str : cmd.json_result.get_array()) { + std::string const key = key_str.get_str(); + + dout(20) << "saw key '" << key << "'" << dendl; + + const std::string store_prefix = PyModule::mgr_store_prefix; + const std::string device_prefix = "device/"; + + if (key.substr(0, device_prefix.size()) == device_prefix || + key.substr(0, store_prefix.size()) == store_prefix) { + dout(20) << "fetching '" << key << "'" << dendl; + Command get_cmd; + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}"; + get_cmd.run(monc, cmd_json.str()); + lock.unlock(); + get_cmd.wait(); + lock.lock(); + if (get_cmd.r == 0) { // tolerate racing config-key change + loaded[key] = get_cmd.outbl.to_str(); + } + } + } + + return loaded; +} + +void Mgr::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + shutdown(); +} + +static void handle_mgr_signal(int signum) +{ + derr << " *** Got signal " << sig_str(signum) << " ***" << dendl; + + // The python modules don't reliably shut down, so don't even + // try. The mon will blocklist us (and all of our rados/cephfs + // clients) anyway. Just exit! + + _exit(0); // exit with 0 result code, as if we had done an orderly shutdown +} + +void Mgr::init() +{ + std::unique_lock l(lock); + ceph_assert(initializing); + ceph_assert(!initialized); + + // Enable signal handlers + register_async_signal_handler_oneshot(SIGINT, handle_mgr_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_mgr_signal); + + // Only pacific+ monitors support subscribe to kv updates + bool mon_allows_kv_sub = false; + monc->with_monmap( + [&](const MonMap &monmap) { + if (monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_PACIFIC)) { + mon_allows_kv_sub = true; + } + }); + if (!mon_allows_kv_sub) { + // mons are still pre-pacific. wait long enough to ensure our + // next beacon is processed so that our module options are + // propagated. See https://tracker.ceph.com/issues/49778 + lock.unlock(); + dout(10) << "waiting a bit for the pre-pacific mon to process our beacon" << dendl; + sleep(g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count() * 3); + lock.lock(); + } + + // subscribe to all the maps + monc->sub_want("log-info", 0, 0); + monc->sub_want("mgrdigest", 0, 0); + monc->sub_want("fsmap", 0, 0); + monc->sub_want("servicemap", 0, 0); + if (mon_allows_kv_sub) { + monc->sub_want("kv:config/", 0, 0); + monc->sub_want("kv:mgr/", 0, 0); + monc->sub_want("kv:device/", 0, 0); + } + + dout(4) << "waiting for OSDMap..." << dendl; + // Subscribe to OSDMap update to pass on to ClusterState + objecter->maybe_request_map(); + + // reset the mon session. 
we get these maps through subscriptions which + // are stateful with the connection, so even if *we* don't have them a + // previous incarnation sharing the same MonClient may have. + monc->reopen_session(); + + // Start Objecter and wait for OSD map + lock.unlock(); // Drop lock because OSDMap dispatch calls into my ms_dispatch + epoch_t e; + cluster_state.with_mgrmap([&e](const MgrMap& m) { + e = m.last_failure_osd_epoch; + }); + /* wait for any blocklists to be applied to previous mgr instance */ + dout(4) << "Waiting for new OSDMap (e=" << e + << ") that may blocklist prior active." << dendl; + objecter->wait_for_osd_map(e); + lock.lock(); + + // Start communicating with daemons to learn statistics etc + int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs()); + if (r < 0) { + derr << "Initialize server fail: " << cpp_strerror(r) << dendl; + // This is typically due to a bind() failure, so let's let + // systemd restart us. + exit(1); + } + dout(4) << "Initialized server at " << server.get_myaddrs() << dendl; + + // Preload all daemon metadata (will subsequently keep this + // up to date by watching maps, so do the initial load before + // we subscribe to any maps) + dout(4) << "Loading daemon metadata..." << dendl; + load_all_metadata(); + + // Populate PGs in ClusterState + cluster_state.with_osdmap_and_pgmap([this](const OSDMap &osd_map, + const PGMap& pg_map) { + cluster_state.notify_osdmap(osd_map); + }); + + // Wait for FSMap + dout(4) << "waiting for FSMap..." << dendl; + fs_map_cond.wait(l, [this] { return cluster_state.have_fsmap();}); + + // Wait for MgrDigest... + dout(4) << "waiting for MgrDigest..." << dendl; + digest_cond.wait(l, [this] { return digest_received; }); + + if (!mon_allows_kv_sub) { + dout(4) << "loading config-key data from pre-pacific mon cluster..." << dendl; + pre_init_store = load_store(); + } + + dout(4) << "initializing device state..." << dendl; + // Note: we only have to do this during startup because once we are + // active the only changes to this state will originate from one of our + // own modules. + for (auto p = pre_init_store.lower_bound("device/"); + p != pre_init_store.end() && p->first.find("device/") == 0; + ++p) { + string devid = p->first.substr(7); + dout(10) << " updating " << devid << dendl; + map<string,string> meta; + ostringstream ss; + int r = get_json_str_map(p->second, ss, &meta, false); + if (r < 0) { + derr << __func__ << " failed to parse " << p->second << ": " << ss.str() + << dendl; + } else { + daemon_state.with_device_create( + devid, [&meta] (DeviceState& dev) { + dev.set_metadata(std::move(meta)); + }); + } + } + + // assume finisher already initialized in background_init + dout(4) << "starting python modules..." << dendl; + py_module_registry->active_start( + daemon_state, cluster_state, + pre_init_store, mon_allows_kv_sub, + *monc, clog, audit_clog, *objecter, *client, + finisher, server); + + cluster_state.final_init(); + + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + r = admin_socket->register_command( + "mgr_status", this, + "Dump mgr status"); + ceph_assert(r == 0); + + dout(4) << "Complete." 
<< dendl; + initializing = false; + initialized = true; +} + +void Mgr::load_all_metadata() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + JSONCommand mds_cmd; + mds_cmd.run(monc, "{\"prefix\": \"mds metadata\"}"); + JSONCommand osd_cmd; + osd_cmd.run(monc, "{\"prefix\": \"osd metadata\"}"); + JSONCommand mon_cmd; + mon_cmd.run(monc, "{\"prefix\": \"mon metadata\"}"); + + lock.unlock(); + mds_cmd.wait(); + osd_cmd.wait(); + mon_cmd.wait(); + lock.lock(); + + ceph_assert(mds_cmd.r == 0); + ceph_assert(mon_cmd.r == 0); + ceph_assert(osd_cmd.r == 0); + + for (auto &metadata_val : mds_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"mds", + daemon_meta.at("name").get_str()}; + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + for (const auto &[key, val] : daemon_meta) { + dm->metadata.emplace(key, val.get_str()); + } + + daemon_state.insert(dm); + } + + for (auto &metadata_val : mon_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"mon", + daemon_meta.at("name").get_str()}; + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + map<string,string> m; + for (const auto &[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } + + for (auto &osd_metadata_val : osd_cmd.json_result.get_array()) { + json_spirit::mObject osd_metadata = osd_metadata_val.get_obj(); + if (osd_metadata.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + dout(4) << osd_metadata.at("hostname").get_str() << dendl; + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"osd", + stringify(osd_metadata.at("id").get_int())}; + dm->hostname = osd_metadata.at("hostname").get_str(); + + osd_metadata.erase("id"); + osd_metadata.erase("hostname"); + + map<string,string> m; + for (const auto &i : osd_metadata) { + m[i.first] = i.second.get_str(); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } +} + + +void Mgr::shutdown() +{ + dout(10) << "mgr shutdown init" << dendl; + finisher.queue(new LambdaContext([&](int) { + { + std::lock_guard l(lock); + // First stop the server so that we're not taking any more incoming + // requests + server.shutdown(); + } + // after the messenger is stopped, signal modules to shutdown via finisher + py_module_registry->active_shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); +} + +void Mgr::handle_osd_map() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + std::set<std::string> names_exist; + + /** + * When we see a new OSD map, inspect the entity addrs to + * see if they have changed (service restart), and if so + * reload the metadata. 
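+   * Existing OSD ids are also collected here so that daemon_state entries
+   * for OSDs that no longer exist can be culled afterwards.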
+ */ + cluster_state.with_osdmap_and_pgmap([this, &names_exist](const OSDMap &osd_map, + const PGMap &pg_map) { + for (int osd_id = 0; osd_id < osd_map.get_max_osd(); ++osd_id) { + if (!osd_map.exists(osd_id)) { + continue; + } + + // Remember which OSDs exist so that we can cull any that don't + names_exist.insert(stringify(osd_id)); + + // Consider whether to update the daemon metadata (new/restarted daemon) + const auto k = DaemonKey{"osd", std::to_string(osd_id)}; + if (daemon_state.is_updating(k)) { + continue; + } + + bool update_meta = false; + if (daemon_state.exists(k)) { + if (osd_map.get_up_from(osd_id) == osd_map.get_epoch()) { + dout(4) << "Mgr::handle_osd_map: osd." << osd_id + << " joined cluster at " << "e" << osd_map.get_epoch() + << dendl; + update_meta = true; + } + } else { + update_meta = true; + } + if (update_meta) { + auto c = new MetadataUpdate(daemon_state, k); + std::ostringstream cmd; + cmd << "{\"prefix\": \"osd metadata\", \"id\": " + << osd_id << "}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + + cluster_state.notify_osdmap(osd_map); + }); + + // TODO: same culling for MonMap + daemon_state.cull("osd", names_exist); +} + +void Mgr::handle_log(ref_t<MLog> m) +{ + for (const auto &e : m->entries) { + py_module_registry->notify_all(e); + } +} + +void Mgr::handle_service_map(ref_t<MServiceMap> m) +{ + dout(10) << "e" << m->service_map.epoch << dendl; + monc->sub_got("servicemap", m->service_map.epoch); + cluster_state.set_service_map(m->service_map); + server.got_service_map(); +} + +void Mgr::handle_mon_map() +{ + dout(20) << __func__ << dendl; + assert(ceph_mutex_is_locked_by_me(lock)); + std::set<std::string> names_exist; + cluster_state.with_monmap([&] (auto &monmap) { + for (unsigned int i = 0; i < monmap.size(); i++) { + names_exist.insert(monmap.get_name(i)); + } + }); + for (const auto& name : names_exist) { + const auto k = DaemonKey{"mon", name}; + if (daemon_state.is_updating(k)) { + continue; + } + auto c = new MetadataUpdate(daemon_state, k); + const char* cmd = R"({{"prefix": "mon metadata", "id": "{}"}})"; + monc->start_mon_command({fmt::format(cmd, name)}, {}, + &c->outbl, &c->outs, c); + } + daemon_state.cull("mon", names_exist); +} + +bool Mgr::ms_dispatch2(const ref_t<Message>& m) +{ + dout(10) << *m << dendl; + std::lock_guard l(lock); + + switch (m->get_type()) { + case MSG_MGR_DIGEST: + handle_mgr_digest(ref_cast<MMgrDigest>(m)); + break; + case CEPH_MSG_MON_MAP: + py_module_registry->notify_all("mon_map", ""); + handle_mon_map(); + break; + case CEPH_MSG_FS_MAP: + py_module_registry->notify_all("fs_map", ""); + handle_fs_map(ref_cast<MFSMap>(m)); + return false; // I shall let this pass through for Client + case CEPH_MSG_OSD_MAP: + handle_osd_map(); + + py_module_registry->notify_all("osd_map", ""); + + // Continuous subscribe, so that we can generate notifications + // for our MgrPyModules + objecter->maybe_request_map(); + break; + case MSG_SERVICE_MAP: + handle_service_map(ref_cast<MServiceMap>(m)); + //no users: py_module_registry->notify_all("service_map", ""); + break; + case MSG_LOG: + handle_log(ref_cast<MLog>(m)); + break; + case MSG_KV_DATA: + { + auto msg = ref_cast<MKVData>(m); + monc->sub_got("kv:"s + msg->prefix, msg->version); + if (!msg->data.empty()) { + if (initialized) { + py_module_registry->update_kv_data( + msg->prefix, + msg->incremental, + msg->data + ); + } else { + // before we have created the ActivePyModules, we need to + // track the store regions we're monitoring + if 
(!msg->incremental) { + dout(10) << "full update on " << msg->prefix << dendl; + auto p = pre_init_store.lower_bound(msg->prefix); + while (p != pre_init_store.end() && p->first.find(msg->prefix) == 0) { + dout(20) << " rm prior " << p->first << dendl; + p = pre_init_store.erase(p); + } + } else { + dout(10) << "incremental update on " << msg->prefix << dendl; + } + for (auto& i : msg->data) { + if (i.second) { + dout(20) << " set " << i.first << " = " << i.second->to_str() << dendl; + pre_init_store[i.first] = i.second->to_str(); + } else { + dout(20) << " rm " << i.first << dendl; + pre_init_store.erase(i.first); + } + } + } + } + } + break; + + default: + return false; + } + return true; +} + + +void Mgr::handle_fs_map(ref_t<MFSMap> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + std::set<std::string> names_exist; + const FSMap &new_fsmap = m->get_fsmap(); + + monc->sub_got("fsmap", m->epoch); + + fs_map_cond.notify_all(); + + // TODO: callers (e.g. from python land) are potentially going to see + // the new fsmap before we've bothered populating all the resulting + // daemon_state. Maybe we should block python land while we're making + // this kind of update? + + cluster_state.set_fsmap(new_fsmap); + + auto mds_info = new_fsmap.get_mds_info(); + for (const auto &i : mds_info) { + const auto &info = i.second; + + if (!new_fsmap.gid_exists(i.first)){ + continue; + } + + // Remember which MDS exists so that we can cull any that don't + names_exist.insert(info.name); + + const auto k = DaemonKey{"mds", info.name}; + if (daemon_state.is_updating(k)) { + continue; + } + + bool update = false; + if (daemon_state.exists(k)) { + auto metadata = daemon_state.get(k); + std::lock_guard l(metadata->lock); + if (metadata->metadata.empty() || + metadata->metadata.count("addr") == 0) { + update = true; + } else { + auto metadata_addrs = metadata->metadata.at("addr"); + const auto map_addrs = info.addrs; + update = metadata_addrs != stringify(map_addrs); + if (update) { + dout(4) << "MDS[" << info.name << "] addr change " << metadata_addrs + << " != " << stringify(map_addrs) << dendl; + } + } + } else { + update = true; + } + + if (update) { + auto c = new MetadataUpdate(daemon_state, k); + + // Older MDS daemons don't have addr in the metadata, so + // fake it if the returned metadata doesn't have the field. + c->set_default("addr", stringify(info.addrs)); + + std::ostringstream cmd; + cmd << "{\"prefix\": \"mds metadata\", \"who\": \"" + << info.name << "\"}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + daemon_state.cull("mds", names_exist); +} + +bool Mgr::got_mgr_map(const MgrMap& m) +{ + std::lock_guard l(lock); + dout(10) << m << dendl; + + set<string> old_modules; + cluster_state.with_mgrmap([&](const MgrMap& m) { + old_modules = m.modules; + }); + if (m.modules != old_modules) { + derr << "mgrmap module list changed to (" << m.modules << "), respawn" + << dendl; + return true; + } + + cluster_state.set_mgr_map(m); + server.got_mgr_map(); + + return false; +} + +void Mgr::handle_mgr_digest(ref_t<MMgrDigest> m) +{ + dout(10) << m->mon_status_json.length() << dendl; + dout(10) << m->health_json.length() << dendl; + cluster_state.load_digest(m.get()); + //no users: py_module_registry->notify_all("mon_status", ""); + py_module_registry->notify_all("health", ""); + + // Hack: use this as a tick/opportunity to prompt python-land that + // the pgmap might have changed since last time we were here. 
+ py_module_registry->notify_all("pg_summary", ""); + dout(10) << "done." << dendl; + m.reset(); + + if (!digest_received) { + digest_received = true; + digest_cond.notify_all(); + } +} + +std::map<std::string, std::string> Mgr::get_services() const +{ + std::lock_guard l(lock); + + return py_module_registry->get_services(); +} + +int Mgr::call( + std::string_view admin_command, + const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + bufferlist& out) +{ + try { + if (admin_command == "mgr_status") { + f->open_object_section("mgr_status"); + cluster_state.with_mgrmap( + [f](const MgrMap& mm) { + f->dump_unsigned("mgrmap_epoch", mm.get_epoch()); + }); + f->dump_bool("initialized", initialized); + f->close_section(); + return 0; + } else { + return -ENOSYS; + } + } catch (const TOPNSPC::common::bad_cmd_get& e) { + errss << e.what(); + return -EINVAL; + } + return 0; +} diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h new file mode 100644 index 000000000..28a7da93d --- /dev/null +++ b/src/mgr/Mgr.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef CEPH_MGR_H_ +#define CEPH_MGR_H_ + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/Finisher.h" +#include "mon/MgrMap.h" + +#include "DaemonServer.h" +#include "PyModuleRegistry.h" + +#include "DaemonState.h" +#include "ClusterState.h" + +class MCommand; +class MMgrDigest; +class MLog; +class MServiceMap; +class Objecter; +class Client; + +class Mgr : public AdminSocketHook { +protected: + MonClient *monc; + Objecter *objecter; + Client *client; + Messenger *client_messenger; + + mutable ceph::mutex lock = ceph::make_mutex("Mgr::lock"); + Finisher finisher; + + // Track receipt of initial data during startup + ceph::condition_variable fs_map_cond; + bool digest_received; + ceph::condition_variable digest_cond; + + PyModuleRegistry *py_module_registry; + DaemonStateIndex daemon_state; + ClusterState cluster_state; + + DaemonServer server; + + LogChannelRef clog; + LogChannelRef audit_clog; + + std::map<std::string, std::string> pre_init_store; + + void load_all_metadata(); + std::map<std::string, std::string> load_store(); + void init(); + + bool initialized; + bool initializing; + +public: + Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_); + ~Mgr(); + + bool is_initialized() const {return initialized;} + entity_addrvec_t get_server_addrs() const { + return server.get_myaddrs(); + } + + void handle_mgr_digest(ceph::ref_t<MMgrDigest> m); + void handle_fs_map(ceph::ref_t<MFSMap> m); + void handle_osd_map(); + void handle_log(ceph::ref_t<MLog> m); + void handle_service_map(ceph::ref_t<MServiceMap> m); + void handle_mon_map(); + + bool got_mgr_map(const MgrMap& m); + + bool ms_dispatch2(const ceph::ref_t<Message>& m); + + void background_init(Context *completion); + void shutdown(); + + void handle_signal(int signum); + + 
std::map<std::string, std::string> get_services() const; + + int call( + std::string_view command, + const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + ceph::buffer::list& out) override; +}; + +/** + * Context for completion of metadata mon commands: take + * the result and stash it in DaemonStateIndex + */ +class MetadataUpdate : public Context +{ + +private: + DaemonStateIndex &daemon_state; + DaemonKey key; + + std::map<std::string, std::string> defaults; + +public: + bufferlist outbl; + std::string outs; + + MetadataUpdate(DaemonStateIndex &daemon_state_, const DaemonKey &key_) + : daemon_state(daemon_state_), key(key_) + { + daemon_state.notify_updating(key); + } + + void set_default(const std::string &k, const std::string &v) + { + defaults[k] = v; + } + + void finish(int r) override; +}; + + +#endif diff --git a/src/mgr/MgrCap.cc b/src/mgr/MgrCap.cc new file mode 100644 index 000000000..cba758083 --- /dev/null +++ b/src/mgr/MgrCap.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <boost/config/warning_disable.hpp> +#include <boost/spirit/include/qi_uint.hpp> +#include <boost/spirit/include/qi.hpp> +#include <boost/fusion/include/std_pair.hpp> +#include <boost/spirit/include/phoenix.hpp> +#include <boost/fusion/adapted/struct/adapt_struct.hpp> +#include <boost/fusion/include/adapt_struct.hpp> +#include <boost/algorithm/string/predicate.hpp> + +#include "MgrCap.h" +#include "include/stringify.h" +#include "include/ipaddr.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include <algorithm> +#include <regex> + +#include "include/ceph_assert.h" + +static inline bool is_not_alnum_space(char c) { + return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_')); +} + +static std::string maybe_quote_string(const std::string& str) { + if (find_if(str.begin(), str.end(), is_not_alnum_space) == str.end()) + return str; + return std::string("\"") + str + std::string("\""); +} + +#define dout_subsys ceph_subsys_mgr + +std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p) { + if (p == MGR_CAP_ANY) + return out << "*"; + + if (p & MGR_CAP_R) + out << "r"; + if (p & MGR_CAP_W) + out << "w"; + if (p & MGR_CAP_X) + out << "x"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c) { + switch (c.match_type) { + case MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + out << "="; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + out << " prefix "; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + out << " regex "; + break; + default: + break; + } + out << maybe_quote_string(c.value); + return out; +} + +std::ostream& operator<<(std::ostream& out, const MgrCapGrant& m) { + if (!m.profile.empty()) { + out << "profile " << maybe_quote_string(m.profile); + } else { + out << "allow"; + if (!m.service.empty()) { + out << " service " << maybe_quote_string(m.service); + } else if (!m.module.empty()) { + out << " module " << maybe_quote_string(m.module); + } else if (!m.command.empty()) { + out << " command " << maybe_quote_string(m.command); + } + } + + if (!m.arguments.empty()) { + out << (!m.profile.empty() ? 
"" : " with"); + for (auto& [key, constraint] : m.arguments) { + out << " " << maybe_quote_string(key) << constraint; + } + } + + if (m.allow != 0) { + out << " " << m.allow; + } + + if (m.network.size()) { + out << " network " << m.network; + } + return out; +} + +// <magic> +// fusion lets us easily populate structs via the qi parser. + +typedef std::map<std::string, MgrCapGrantConstraint> kvmap; + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrant, + (std::string, service) + (std::string, module) + (std::string, profile) + (std::string, command) + (kvmap, arguments) + (mgr_rwxa_t, allow) + (std::string, network)) + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrantConstraint, + (MgrCapGrantConstraint::MatchType, match_type) + (std::string, value)) + +// </magic> + +void MgrCapGrant::parse_network() { + network_valid = ::parse_network(network.c_str(), &network_parsed, + &network_prefix); +} + +void MgrCapGrant::expand_profile(std::ostream *err) const { + // only generate this list once + if (!profile_grants.empty()) { + return; + } + + if (profile == "read-only") { + // grants READ-ONLY caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_R}}); + return; + } + + if (profile == "read-write") { + // grants READ-WRITE caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, + mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}}); + return; + } + + if (profile == "crash") { + profile_grants.push_back({{}, {}, {}, "crash post", {}, {}}); + return; + } + + if (profile == "osd") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "mds") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "rbd" || profile == "rbd-read-only") { + Arguments filtered_arguments; + for (auto& [key, constraint] : arguments) { + if (key == "pool" || key == "namespace") { + filtered_arguments[key] = std::move(constraint); + } else { + if (err != nullptr) { + *err << "profile '" << profile << "' does not recognize key '" << key + << "'"; + } + return; + } + } + + mgr_rwxa_t perms = mgr_rwxa_t{MGR_CAP_R}; + if (profile == "rbd") { + perms = mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}; + } + + // allow all 'rbd_support' commands (restricted by optional + // pool/namespace constraints) + profile_grants.push_back({{}, "rbd_support", {}, {}, + std::move(filtered_arguments), perms}); + return; + } + + if (err != nullptr) { + *err << "unrecognized profile '" << profile << "'"; + } +} + +bool MgrCapGrant::validate_arguments( + const std::map<std::string, std::string>& args) const { + for (auto& [key, constraint] : arguments) { + auto q = args.find(key); + + // argument must be present if a constraint exists + if (q == args.end()) { + return false; + } + + switch (constraint.match_type) { + case MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + if (constraint.value != q->second) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + if (q->second.find(constraint.value) != 0) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + try { + std::regex pattern(constraint.value, std::regex::extended); + if (!std::regex_match(q->second, pattern)) { + return false; + } + } catch(const std::regex_error&) { + return false; + } + break; + default: + return false; + } + } + + return true; +} + +mgr_rwxa_t MgrCapGrant::get_allowed( + CephContext *cct, EntityName name, const std::string& s, + const std::string& m, const std::string& c, + 
const std::map<std::string, std::string>& args) const { + if (!profile.empty()) { + expand_profile(nullptr); + mgr_rwxa_t a; + for (auto& grant : profile_grants) { + a = a | grant.get_allowed(cct, name, s, m, c, args); + } + return a; + } + + if (!service.empty()) { + if (service != s) { + return mgr_rwxa_t{}; + } + return allow; + } + + if (!module.empty()) { + if (module != m) { + return mgr_rwxa_t{}; + } + + // don't test module arguments when validating a specific command + if (c.empty() && !validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return allow; + } + + if (!command.empty()) { + if (command != c) { + return mgr_rwxa_t{}; + } + if (!validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return mgr_rwxa_t{MGR_CAP_ANY}; + } + + return allow; +} + +std::ostream& operator<<(std::ostream&out, const MgrCap& m) { + bool first = true; + for (auto& grant : m.grants) { + if (!first) { + out << ", "; + } + first = false; + + out << grant; + } + return out; +} + +bool MgrCap::is_allow_all() const { + for (auto& grant : grants) { + if (grant.is_allow_all()) { + return true; + } + } + return false; +} + +void MgrCap::set_allow_all() { + grants.clear(); + grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_ANY}}); + text = "allow *"; +} + +bool MgrCap::is_capable( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const { + if (cct) { + ldout(cct, 20) << "is_capable service=" << service << " " + << "module=" << module << " " + << "command=" << command + << (op_may_read ? " read":"") + << (op_may_write ? " write":"") + << (op_may_exec ? 
" exec":"") + << " addr " << addr + << " on cap " << *this + << dendl; + } + + mgr_rwxa_t allow; + for (auto& grant : grants) { + if (cct) + ldout(cct, 20) << " allow so far " << allow << ", doing grant " << grant + << dendl; + + if (grant.network.size() && + (!grant.network_valid || + !network_contains(grant.network_parsed, + grant.network_prefix, + addr))) { + continue; + } + + if (grant.is_allow_all()) { + if (cct) { + ldout(cct, 20) << " allow all" << dendl; + } + return true; + } + + // check enumerated caps + allow = allow | grant.get_allowed(cct, name, service, module, command, + command_args); + if ((!op_may_read || (allow & MGR_CAP_R)) && + (!op_may_write || (allow & MGR_CAP_W)) && + (!op_may_exec || (allow & MGR_CAP_X))) { + if (cct) { + ldout(cct, 20) << " match" << dendl; + } + return true; + } + } + return false; +} + +void MgrCap::encode(ceph::buffer::list& bl) const { + // remain backwards compatible w/ MgrCap + ENCODE_START(4, 4, bl); + encode(text, bl); + ENCODE_FINISH(bl); +} + +void MgrCap::decode(ceph::buffer::list::const_iterator& bl) { + // remain backwards compatible w/ MgrCap + std::string s; + DECODE_START(4, bl); + decode(s, bl); + DECODE_FINISH(bl); + parse(s, NULL); +} + +void MgrCap::dump(ceph::Formatter *f) const { + f->dump_string("text", text); +} + +void MgrCap::generate_test_instances(std::list<MgrCap*>& ls) { + ls.push_back(new MgrCap); + ls.push_back(new MgrCap); + ls.back()->parse("allow *"); + ls.push_back(new MgrCap); + ls.back()->parse("allow rwx"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo r, allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow module bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("profile rbd pool=rbd"); +} + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template <typename Iterator> +struct MgrCapParser : qi::grammar<Iterator, MgrCap()> { + MgrCapParser() : MgrCapParser::base_type(mgrcap) { + using qi::char_; + using qi::int_; + using qi::ulong_long; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + + spaces = +(lit(' ') | lit('\n') | lit('\t')); + + // key <=|prefix|regex> value[ ...] + str_match = -spaces >> lit('=') >> -spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_EQUAL) >> str; + str_prefix = spaces >> lit("prefix") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_PREFIX) >> str; + str_regex = spaces >> lit("regex") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_REGEX) >> str; + kv_pair = str >> (str_match | str_prefix | str_regex); + kv_map %= kv_pair >> *(spaces >> kv_pair); + + // command := command[=]cmd [k1=v1 k2=v2 ...] 
+ command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // service foo rwxa + service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::map<std::string, MgrCapGrantConstraint>()) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // module foo rwxa + module_match %= -spaces >> lit("allow") >> spaces >> lit("module") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // profile foo + profile_match %= -spaces >> -(lit("allow") >> spaces) + >> lit("profile") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> -(spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa + rwxa_match %= -spaces >> lit("allow") >> spaces + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::map<std::string,MgrCapGrantConstraint>()) + >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa := * | [r][w][x] + rwxa = + (lit("*")[_val = MGR_CAP_ANY]) | + (lit("all")[_val = MGR_CAP_ANY]) | + ( eps[_val = 0] >> + ( lit('r')[_val |= MGR_CAP_R] || + lit('w')[_val |= MGR_CAP_W] || + lit('x')[_val |= MGR_CAP_X] + ) + ); + + // grant := allow ... + grant = -spaces >> (rwxa_match | profile_match | service_match | + module_match | command_match) >> -spaces; + + // mgrcap := grant [grant ...] 
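+    //   e.g. "allow *", "allow service foo r, allow command bar x",
+    //   "profile rbd pool=rbd" (cf. MgrCap::generate_test_instances() above)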
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + mgrcap = grants [_val = phoenix::construct<MgrCap>(_1)]; + } + + qi::rule<Iterator> spaces; + qi::rule<Iterator, unsigned()> rwxa; + qi::rule<Iterator, std::string()> quoted_string; + qi::rule<Iterator, std::string()> unquoted_word; + qi::rule<Iterator, std::string()> str, network_str; + + qi::rule<Iterator, MgrCapGrantConstraint()> str_match, str_prefix, str_regex; + qi::rule<Iterator, std::pair<std::string, MgrCapGrantConstraint>()> kv_pair; + qi::rule<Iterator, std::map<std::string, MgrCapGrantConstraint>()> kv_map; + + qi::rule<Iterator, MgrCapGrant()> rwxa_match; + qi::rule<Iterator, MgrCapGrant()> command_match; + qi::rule<Iterator, MgrCapGrant()> service_match; + qi::rule<Iterator, MgrCapGrant()> module_match; + qi::rule<Iterator, MgrCapGrant()> profile_match; + qi::rule<Iterator, MgrCapGrant()> grant; + qi::rule<Iterator, std::vector<MgrCapGrant>()> grants; + qi::rule<Iterator, MgrCap()> mgrcap; +}; + +bool MgrCap::parse(const std::string& str, std::ostream *err) { + auto iter = str.begin(); + auto end = str.end(); + + MgrCapParser<std::string::const_iterator> exp; + bool r = qi::parse(iter, end, exp, *this); + if (r && iter == end) { + text = str; + + std::stringstream profile_err; + for (auto& g : grants) { + g.parse_network(); + + if (!g.profile.empty()) { + g.expand_profile(&profile_err); + } + } + + if (!profile_err.str().empty()) { + if (err != nullptr) { + *err << "mgr capability parse failed during profile evaluation: " + << profile_err.str(); + } + return false; + } + return true; + } + + // Make sure no grants are kept after parsing failed! + grants.clear(); + + if (err) { + if (iter != end) + *err << "mgr capability parse failed, stopped at '" + << std::string(iter, end) << "' of '" << str << "'"; + else + *err << "mgr capability parse failed, stopped at end of '" << str << "'"; + } + + return false; +} diff --git a/src/mgr/MgrCap.h b/src/mgr/MgrCap.h new file mode 100644 index 000000000..f7a8bd5f8 --- /dev/null +++ b/src/mgr/MgrCap.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGRCAP_H +#define CEPH_MGRCAP_H + +#include <iosfwd> + +#include "include/common_fwd.h" +#include "include/types.h" +#include "common/entity_name.h" + +static const __u8 MGR_CAP_R = (1 << 1); // read +static const __u8 MGR_CAP_W = (1 << 2); // write +static const __u8 MGR_CAP_X = (1 << 3); // execute +static const __u8 MGR_CAP_ANY = 0xff; // * + +struct mgr_rwxa_t { + __u8 val = 0U; + + mgr_rwxa_t() {} + explicit mgr_rwxa_t(__u8 v) : val(v) {} + + mgr_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p); + +struct MgrCapGrantConstraint { + enum MatchType { + MATCH_TYPE_NONE, + MATCH_TYPE_EQUAL, + MATCH_TYPE_PREFIX, + MATCH_TYPE_REGEX + }; + + MatchType match_type = MATCH_TYPE_NONE; + std::string value; + + MgrCapGrantConstraint() {} + MgrCapGrantConstraint(MatchType match_type, std::string value) + : match_type(match_type), value(value) { + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c); + +struct MgrCapGrant { + /* + * A grant can come in one of four forms: + * + * - a blanket allow ('allow rw', 'allow *') + * - this will match against any service and the read/write/exec flags + * in the mgr code. semantics of what X means are somewhat ad hoc. 
+ * + * - a service allow ('allow service mds rw') + * - this will match against a specific service and the r/w/x flags. + * + * - a module allow ('allow module rbd_support rw, allow module rbd_support with pool=rbd rw') + * - this will match against a specific python add-on module and the r/w/x + * flags. + * + * - a profile ('profile read-only, profile rbd pool=rbd') + * - this will match against specific MGR-enforced semantics of what + * this type of user should need to do. examples include 'read-write', + * 'read-only', 'crash'. + * + * - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2') + * this includes the command name (the prefix string) + * + * The command, module, and profile caps can also accept an optional + * key/value map. If not provided, all command arguments and module + * meta-arguments are allowed. If a key/value pair is specified, that + * argument must be present and must match the provided constraint. + */ + typedef std::map<std::string, MgrCapGrantConstraint> Arguments; + + std::string service; + std::string module; + std::string profile; + std::string command; + Arguments arguments; + + // restrict by network + std::string network; + + // these are filled in by parse_network(), called by MgrCap::parse() + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + void parse_network(); + + mgr_rwxa_t allow; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() (via is_match()) and cached here. + mutable std::list<MgrCapGrant> profile_grants; + + void expand_profile(std::ostream *err=nullptr) const; + + MgrCapGrant() : allow(0) {} + MgrCapGrant(std::string&& service, + std::string&& module, + std::string&& profile, + std::string&& command, + Arguments&& arguments, + mgr_rwxa_t allow) + : service(std::move(service)), module(std::move(module)), + profile(std::move(profile)), command(std::move(command)), + arguments(std::move(arguments)), allow(allow) { + } + + bool validate_arguments( + const std::map<std::string, std::string>& arguments) const; + + /** + * check if given request parameters match our constraints + * + * @param cct context + * @param name entity name + * @param service service (if any) + * @param module module (if any) + * @param command command (if any) + * @param arguments profile/module/command args (if any) + * @return bits we allow + */ + mgr_rwxa_t get_allowed( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& arguments) const; + + bool is_allow_all() const { + return (allow == MGR_CAP_ANY && + service.empty() && + module.empty() && + profile.empty() && + command.empty()); + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrant& g); + +struct MgrCap { + std::string text; + std::vector<MgrCapGrant> grants; + + MgrCap() {} + explicit MgrCap(const std::vector<MgrCapGrant> &g) : grants(g) {} + + std::string get_str() const { + return text; + } + + bool is_allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, std::ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. 
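+ *
+ * For example (illustrative): a cap parsed from "allow service foo r"
+ * would permit a read-only operation tagged with service "foo", but not
+ * one that also requires op_may_write or op_may_exec.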
+ * + * @param service service name + * @param module module name + * @param command command id + * @param arguments + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param op_may_exec whether the operation may exec + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& arguments, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<MgrCap*>& ls); +}; +WRITE_CLASS_ENCODER(MgrCap) + +std::ostream& operator<<(std::ostream& out, const MgrCap& cap); + +#endif // CEPH_MGRCAP_H diff --git a/src/mgr/MgrClient.cc b/src/mgr/MgrClient.cc new file mode 100644 index 000000000..6230b3387 --- /dev/null +++ b/src/mgr/MgrClient.cc @@ -0,0 +1,662 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "MgrClient.h" + +#include "mgr/MgrContext.h" +#include "mon/MonMap.h" + +#include "msg/Messenger.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrReport.h" +#include "messages/MMgrOpen.h" +#include "messages/MMgrUpdate.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MMgrCommand.h" +#include "messages/MMgrCommandReply.h" +#include "messages/MPGStats.h" + +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::make_message; +using ceph::ref_cast; +using ceph::ref_t; + +#define dout_subsys ceph_subsys_mgrc +#undef dout_prefix +#define dout_prefix *_dout << "mgrc " << __func__ << " " + +MgrClient::MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap_) + : Dispatcher(cct_), + cct(cct_), + msgr(msgr_), + monmap(monmap_), + timer(cct_, lock) +{ + ceph_assert(cct != nullptr); +} + +void MgrClient::init() +{ + std::lock_guard l(lock); + + ceph_assert(msgr != nullptr); + + timer.init(); + initialized = true; +} + +void MgrClient::shutdown() +{ + std::unique_lock l(lock); + ldout(cct, 10) << dendl; + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + // forget about in-flight commands if we are prematurely shut down + // (e.g., by control-C) + command_table.clear(); + if (service_daemon && + session && + session->con && + HAVE_FEATURE(session->con->get_features(), SERVER_MIMIC)) { + ldout(cct, 10) << "closing mgr session" << dendl; + auto m = make_message<MMgrClose>(); + m->daemon_name = daemon_name; + m->service_name = service_name; + session->con->send_message2(m); + auto timeout = ceph::make_timespan(cct->_conf.get_val<double>( + "mgr_client_service_daemon_unregister_timeout")); + shutdown_cond.wait_for(l, timeout); + } + + timer.shutdown(); + if (session) { + session->con->mark_down(); + session.reset(); + 
} +} + +bool MgrClient::ms_dispatch2(const ref_t<Message>& m) +{ + std::lock_guard l(lock); + + switch(m->get_type()) { + case MSG_MGR_MAP: + return handle_mgr_map(ref_cast<MMgrMap>(m)); + case MSG_MGR_CONFIGURE: + return handle_mgr_configure(ref_cast<MMgrConfigure>(m)); + case MSG_MGR_CLOSE: + return handle_mgr_close(ref_cast<MMgrClose>(m)); + case MSG_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_MGR) { + MCommandReply *c = static_cast<MCommandReply*>(m.get()); + handle_command_reply(c->get_tid(), c->get_data(), c->rs, c->r); + return true; + } else { + return false; + } + case MSG_MGR_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_MGR) { + MMgrCommandReply *c = static_cast<MMgrCommandReply*>(m.get()); + handle_command_reply(c->get_tid(), c->get_data(), c->rs, c->r); + return true; + } else { + return false; + } + default: + ldout(cct, 30) << "Not handling " << *m << dendl; + return false; + } +} + +void MgrClient::reconnect() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + if (session) { + ldout(cct, 4) << "Terminating session with " + << session->con->get_peer_addr() << dendl; + session->con->mark_down(); + session.reset(); + stats_period = 0; + if (report_callback != nullptr) { + timer.cancel_event(report_callback); + report_callback = nullptr; + } + } + + if (!map.get_available()) { + ldout(cct, 4) << "No active mgr available yet" << dendl; + return; + } + + if (!clock_t::is_zero(last_connect_attempt)) { + auto now = clock_t::now(); + auto when = last_connect_attempt + + ceph::make_timespan( + cct->_conf.get_val<double>("mgr_connect_retry_interval")); + if (now < when) { + if (!connect_retry_callback) { + connect_retry_callback = timer.add_event_at( + when, + new LambdaContext([this](int r){ + connect_retry_callback = nullptr; + reconnect(); + })); + } + ldout(cct, 4) << "waiting to retry connect until " << when << dendl; + return; + } + } + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + ldout(cct, 4) << "Starting new session with " << map.get_active_addrs() + << dendl; + last_connect_attempt = clock_t::now(); + + session.reset(new MgrSessionState()); + session->con = msgr->connect_to(CEPH_ENTITY_TYPE_MGR, + map.get_active_addrs()); + + if (service_daemon) { + daemon_dirty_status = true; + } + task_dirty_status = true; + + // Don't send an open if we're just a client (i.e. doing + // command-sending, not stats etc) + if (msgr->get_mytype() != CEPH_ENTITY_TYPE_CLIENT || service_daemon) { + _send_open(); + } + + // resend any pending commands + auto p = command_table.get_commands().begin(); + while (p != command_table.get_commands().end()) { + auto tid = p->first; + auto& op = p->second; + ldout(cct,10) << "resending " << tid << (op.tell ? " (tell)":" (cli)") << dendl; + MessageRef m; + if (op.tell) { + if (op.name.size() && op.name != map.active_name) { + ldout(cct, 10) << "active mgr " << map.active_name << " != target " + << op.name << dendl; + if (op.on_finish) { + op.on_finish->complete(-ENXIO); + } + ++p; + command_table.erase(tid); + continue; + } + // Set fsid argument to signal that this is really a tell message (and + // we are not a legacy client sending a non-tell command via MCommand). 
+ m = op.get_message(monmap->fsid, false); + } else { + m = op.get_message( + {}, + HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS)); + } + ceph_assert(session); + ceph_assert(session->con); + session->con->send_message2(std::move(m)); + ++p; + } +} + +void MgrClient::_send_open() +{ + if (session && session->con) { + auto open = make_message<MMgrOpen>(); + if (!service_name.empty()) { + open->service_name = service_name; + open->daemon_name = daemon_name; + } else { + open->daemon_name = cct->_conf->name.get_id(); + } + if (service_daemon) { + open->service_daemon = service_daemon; + open->daemon_metadata = daemon_metadata; + } + cct->_conf.get_config_bl(0, &open->config_bl, &last_config_bl_version); + cct->_conf.get_defaults_bl(&open->config_defaults_bl); + session->con->send_message2(open); + } +} + +void MgrClient::_send_update() +{ + if (session && session->con) { + auto update = make_message<MMgrUpdate>(); + if (!service_name.empty()) { + update->service_name = service_name; + update->daemon_name = daemon_name; + } else { + update->daemon_name = cct->_conf->name.get_id(); + } + if (need_metadata_update) { + update->daemon_metadata = daemon_metadata; + } + update->need_metadata_update = need_metadata_update; + session->con->send_message2(update); + } +} + +bool MgrClient::handle_mgr_map(ref_t<MMgrMap> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << *m << dendl; + + map = m->get_map(); + ldout(cct, 4) << "Got map version " << map.epoch << dendl; + + ldout(cct, 4) << "Active mgr is now " << map.get_active_addrs() << dendl; + + // Reset session? + if (!session || + session->con->get_peer_addrs() != map.get_active_addrs()) { + reconnect(); + } + + return true; +} + +bool MgrClient::ms_handle_reset(Connection *con) +{ + std::lock_guard l(lock); + if (session && con == session->con) { + ldout(cct, 4) << __func__ << " con " << con << dendl; + reconnect(); + return true; + } + return false; +} + +bool MgrClient::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +void MgrClient::_send_stats() +{ + _send_report(); + _send_pgstats(); + if (stats_period != 0) { + report_callback = timer.add_event_after( + stats_period, + new LambdaContext([this](int) { + _send_stats(); + })); + } +} + +void MgrClient::_send_report() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + ceph_assert(session); + report_callback = nullptr; + + auto report = make_message<MMgrReport>(); + auto pcc = cct->get_perfcounters_collection(); + + pcc->with_counters([this, report]( + const PerfCountersCollectionImpl::CounterMap &by_path) + { + // Helper for checking whether a counter should be included + auto include_counter = [this]( + const PerfCounters::perf_counter_data_any_d &ctr, + const PerfCounters &perf_counters) + { + return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold; + }; + + // Helper for cases where we want to forget a counter + auto undeclare = [report, this](const std::string &path) + { + report->undeclare_types.push_back(path); + ldout(cct,20) << " undeclare " << path << dendl; + session->declared.erase(path); + }; + + ENCODE_START(1, 1, report->packed); + + // Find counters that no longer exist, and undeclare them + for (auto p = session->declared.begin(); p != session->declared.end(); ) { + const auto &path = *(p++); + if (by_path.count(path) == 0) { + undeclare(path); + } + } + + for (const auto &i : by_path) { + auto& path = i.first; + auto& data = *(i.second.data); + auto& perf_counters = 
*(i.second.perf_counters); + + // Find counters that still exist, but are no longer permitted by + // stats_threshold + if (!include_counter(data, perf_counters)) { + if (session->declared.count(path)) { + undeclare(path); + } + continue; + } + + if (session->declared.count(path) == 0) { + ldout(cct,20) << " declare " << path << dendl; + PerfCounterType type; + type.path = path; + if (data.description) { + type.description = data.description; + } + if (data.nick) { + type.nick = data.nick; + } + type.type = data.type; + type.priority = perf_counters.get_adjusted_priority(data.prio); + type.unit = data.unit; + report->declare_types.push_back(std::move(type)); + session->declared.insert(path); + } + + encode(static_cast<uint64_t>(data.u64), report->packed); + if (data.type & PERFCOUNTER_LONGRUNAVG) { + encode(static_cast<uint64_t>(data.avgcount), report->packed); + encode(static_cast<uint64_t>(data.avgcount2), report->packed); + } + } + ENCODE_FINISH(report->packed); + + ldout(cct, 20) << "sending " << session->declared.size() << " counters (" + "of possible " << by_path.size() << "), " + << report->declare_types.size() << " new, " + << report->undeclare_types.size() << " removed" + << dendl; + }); + + ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl; + + if (daemon_name.size()) { + report->daemon_name = daemon_name; + } else { + report->daemon_name = cct->_conf->name.get_id(); + } + report->service_name = service_name; + + if (daemon_dirty_status) { + report->daemon_status = daemon_status; + daemon_dirty_status = false; + } + + if (task_dirty_status) { + report->task_status = task_status; + task_dirty_status = false; + } + + report->daemon_health_metrics = std::move(daemon_health_metrics); + + cct->_conf.get_config_bl(last_config_bl_version, &report->config_bl, + &last_config_bl_version); + + if (get_perf_report_cb) { + MetricPayload payload = get_perf_report_cb(); + MetricReportMessage message(payload); + report->metric_report_message = message; + } + + session->con->send_message2(report); +} + +void MgrClient::send_pgstats() +{ + std::lock_guard l(lock); + _send_pgstats(); +} + +void MgrClient::_send_pgstats() +{ + if (pgstats_cb && session) { + session->con->send_message(pgstats_cb()); + } +} + +bool MgrClient::handle_mgr_configure(ref_t<MMgrConfigure> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << *m << dendl; + + if (!session) { + lderr(cct) << "dropping unexpected configure message" << dendl; + return true; + } + + ldout(cct, 4) << "stats_period=" << m->stats_period << dendl; + + if (stats_threshold != m->stats_threshold) { + ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl; + stats_threshold = m->stats_threshold; + } + + if (!m->osd_perf_metric_queries.empty()) { + handle_config_payload(m->osd_perf_metric_queries); + } else if (m->metric_config_message) { + const MetricConfigMessage &message = *m->metric_config_message; + boost::apply_visitor(HandlePayloadVisitor(this), message.payload); + } + + bool starting = (stats_period == 0) && (m->stats_period != 0); + stats_period = m->stats_period; + if (starting) { + _send_stats(); + } + + return true; +} + +bool MgrClient::handle_mgr_close(ref_t<MMgrClose> m) +{ + service_daemon = false; + shutdown_cond.notify_all(); + return true; +} + +int MgrClient::start_command(const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) +{ + std::lock_guard l(lock); + + ldout(cct, 20) << "cmd: " << cmd << dendl; + + if 
(map.epoch == 0 && mgr_optional) { + ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl; + return -EACCES; + } + + auto &op = command_table.start_command(); + op.cmd = cmd; + op.inbl = inbl; + op.outbl = outbl; + op.outs = outs; + op.on_finish = onfinish; + + if (session && session->con) { + // Leaving fsid argument null because it isn't used historically, and + // we can use it as a signal that we are sending a non-tell command. + auto m = op.get_message( + {}, + HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS)); + session->con->send_message2(std::move(m)); + } else { + ldout(cct, 5) << "no mgr session (no running mgr daemon?), waiting" << dendl; + } + return 0; +} + +int MgrClient::start_tell_command( + const string& name, + const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) +{ + std::lock_guard l(lock); + + ldout(cct, 20) << "target: " << name << " cmd: " << cmd << dendl; + + if (map.epoch == 0 && mgr_optional) { + ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl; + return -EACCES; + } + + auto &op = command_table.start_command(); + op.tell = true; + op.name = name; + op.cmd = cmd; + op.inbl = inbl; + op.outbl = outbl; + op.outs = outs; + op.on_finish = onfinish; + + if (session && session->con && (name.size() == 0 || map.active_name == name)) { + // Set fsid argument to signal that this is really a tell message (and + // we are not a legacy client sending a non-tell command via MCommand). + auto m = op.get_message(monmap->fsid, false); + session->con->send_message2(std::move(m)); + } else { + ldout(cct, 5) << "no mgr session (no running mgr daemon?), or " + << name << " not active mgr, waiting" << dendl; + } + return 0; +} + +bool MgrClient::handle_command_reply( + uint64_t tid, + bufferlist& data, + const std::string& rs, + int r) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << "tid " << tid << " r " << r << dendl; + + if (!command_table.exists(tid)) { + ldout(cct, 4) << "handle_command_reply tid " << tid + << " not found" << dendl; + return true; + } + + auto &op = command_table.get_command(tid); + if (op.outbl) { + *op.outbl = std::move(data); + } + + if (op.outs) { + *(op.outs) = rs; + } + + if (op.on_finish) { + op.on_finish->complete(r); + } + + command_table.erase(tid); + return true; +} + +int MgrClient::update_daemon_metadata( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata) +{ + std::lock_guard l(lock); + if (service_daemon) { + return -EEXIST; + } + ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl; + service_name = service; + daemon_name = name; + daemon_metadata = metadata; + daemon_dirty_status = true; + + if (need_metadata_update && + !daemon_metadata.empty()) { + _send_update(); + need_metadata_update = false; + } + + return 0; +} + +int MgrClient::service_daemon_register( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata) +{ + std::lock_guard l(lock); + if (service_daemon) { + return -EEXIST; + } + ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl; + service_daemon = true; + service_name = service; + daemon_name = name; + daemon_metadata = metadata; + daemon_dirty_status = true; + + // late register? 
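+ // (i.e. we registered after the mgr session was already established;
+ // client-type messengers skip _send_open() in reconnect() unless they are
+ // service daemons, so send the open message now rather than waiting for
+ // the next reconnect)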
+ if (msgr->get_mytype() == CEPH_ENTITY_TYPE_CLIENT && session && session->con) { + _send_open(); + } + + return 0; +} + +int MgrClient::service_daemon_update_status( + std::map<std::string,std::string>&& status) +{ + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + daemon_status = std::move(status); + daemon_dirty_status = true; + return 0; +} + +int MgrClient::service_daemon_update_task_status( + std::map<std::string,std::string> &&status) { + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + task_status = std::move(status); + task_dirty_status = true; + return 0; +} + +void MgrClient::update_daemon_health(std::vector<DaemonHealthMetric>&& metrics) +{ + std::lock_guard l(lock); + daemon_health_metrics = std::move(metrics); +} + diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h new file mode 100644 index 000000000..1668d8da0 --- /dev/null +++ b/src/mgr/MgrClient.h @@ -0,0 +1,215 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CLIENT_H_ +#define MGR_CLIENT_H_ + +#include <boost/variant.hpp> + +#include "msg/Connection.h" +#include "msg/Dispatcher.h" +#include "mon/MgrMap.h" +#include "mgr/DaemonHealthMetric.h" + +#include "messages/MMgrReport.h" +#include "mgr/MetricTypes.h" + +#include "common/perf_counters.h" +#include "common/Timer.h" +#include "common/CommandTable.h" + +class MMgrMap; +class MMgrConfigure; +class MMgrClose; +class Messenger; +class MCommandReply; +class MPGStats; +class MonMap; + +class MgrSessionState +{ + public: + // Which performance counters have we already transmitted schema for? 
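+ // (a path is added here the first time its PerfCounterType is pushed via
+ // MMgrReport::declare_types, and removed when it is undeclared; the set
+ // starts empty whenever a new session is created on reconnect)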
+ std::set<std::string> declared; + + // Our connection to the mgr + ConnectionRef con; +}; + +class MgrCommand : public CommandOp +{ + public: + std::string name; + bool tell = false; + + explicit MgrCommand(ceph_tid_t t) : CommandOp(t) {} + MgrCommand() : CommandOp() {} +}; + +class MgrClient : public Dispatcher +{ +protected: + CephContext *cct; + MgrMap map; + Messenger *msgr; + MonMap *monmap; + + std::unique_ptr<MgrSessionState> session; + + ceph::mutex lock = ceph::make_mutex("MgrClient::lock"); + ceph::condition_variable shutdown_cond; + + uint32_t stats_period = 0; + uint32_t stats_threshold = 0; + SafeTimer timer; + + CommandTable<MgrCommand> command_table; + + using clock_t = ceph::real_clock; + clock_t::time_point last_connect_attempt; + + uint64_t last_config_bl_version = 0; + + Context *report_callback = nullptr; + Context *connect_retry_callback = nullptr; + + // If provided, use this to compose an MPGStats to send with + // our reports (hook for use by OSD) + std::function<MPGStats*()> pgstats_cb; + std::function<void(const ConfigPayload &)> set_perf_queries_cb; + std::function<MetricPayload()> get_perf_report_cb; + + // for service registration and beacon + bool service_daemon = false; + bool daemon_dirty_status = false; + bool task_dirty_status = false; + bool need_metadata_update = true; + std::string service_name, daemon_name; + std::map<std::string,std::string> daemon_metadata; + std::map<std::string,std::string> daemon_status; + std::map<std::string,std::string> task_status; + std::vector<DaemonHealthMetric> daemon_health_metrics; + + void reconnect(); + void _send_open(); + void _send_update(); + + // In pre-luminous clusters, the ceph-mgr service is absent or optional, + // so we must not block in start_command waiting for it. 
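+ // (set via set_mgr_optional(); when true, start_command() and
+ // start_tell_command() fail fast with -EACCES instead of waiting if no
+ // MgrMap has been received yet)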
+ bool mgr_optional = false; + +public: + MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap); + + void set_messenger(Messenger *msgr_) { msgr = msgr_; } + + void init(); + void shutdown(); + + void set_mgr_optional(bool optional_) {mgr_optional = optional_;} + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + bool handle_mgr_map(ceph::ref_t<MMgrMap> m); + bool handle_mgr_configure(ceph::ref_t<MMgrConfigure> m); + bool handle_mgr_close(ceph::ref_t<MMgrClose> m); + bool handle_command_reply( + uint64_t tid, + ceph::buffer::list& data, + const std::string& rs, + int r); + + void set_perf_metric_query_cb( + std::function<void(const ConfigPayload &)> cb_set, + std::function<MetricPayload()> cb_get) + { + std::lock_guard l(lock); + set_perf_queries_cb = cb_set; + get_perf_report_cb = cb_get; + } + + void send_pgstats(); + void set_pgstats_cb(std::function<MPGStats*()>&& cb_) + { + std::lock_guard l(lock); + pgstats_cb = std::move(cb_); + } + + int start_command( + const std::vector<std::string>& cmd, const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, std::string *outs, + Context *onfinish); + int start_tell_command( + const std::string& name, + const std::vector<std::string>& cmd, const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, std::string *outs, + Context *onfinish); + + int update_daemon_metadata( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata); + int service_daemon_register( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata); + int service_daemon_update_status( + std::map<std::string,std::string>&& status); + int service_daemon_update_task_status( + std::map<std::string,std::string> &&task_status); + void update_daemon_health(std::vector<DaemonHealthMetric>&& metrics); + + bool is_initialized() const { return initialized; } + +private: + void handle_config_payload(const OSDConfigPayload &payload) { + if (set_perf_queries_cb) { + set_perf_queries_cb(payload); + } + } + + void handle_config_payload(const MDSConfigPayload &payload) { + if (set_perf_queries_cb) { + set_perf_queries_cb(payload); + } + } + + void handle_config_payload(const UnknownConfigPayload &payload) { + ceph_abort(); + } + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + MgrClient *mgrc; + + HandlePayloadVisitor(MgrClient *mgrc) + : mgrc(mgrc) { + } + + template <typename ConfigPayload> + inline void operator()(const ConfigPayload &payload) const { + mgrc->handle_config_payload(payload); + } + }; + + void _send_stats(); + void _send_pgstats(); + void _send_report(); + + bool initialized = false; +}; + +#endif diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h new file mode 100644 index 000000000..bc3350da4 --- /dev/null +++ b/src/mgr/MgrCommands.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* no guard; may be included multiple times */ + +// see MonCommands.h + +COMMAND("pg stat", "show placement group status.", + "pg", "r") +COMMAND("pg getmap", "get binary pg map to -o/stdout", "pg", "r") + +COMMAND("pg dump " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \ + "show human-readable versions of pg map (only 'all' valid with plain)", 
"pg", "r") +COMMAND("pg dump_json " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \ + "show human-readable version of pg map in json only",\ + "pg", "r") +COMMAND("pg dump_pools_json", "show pg pools info in json only",\ + "pg", "r") + +COMMAND("pg ls-by-pool " \ + "name=poolstr,type=CephString " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with pool = [poolname]", "pg", "r") +COMMAND("pg ls-by-primary " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with primary = [osd]", "pg", "r") +COMMAND("pg ls-by-osd " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg on osd [osd]", "pg", "r") +COMMAND("pg ls " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with specific pool, osd, state", "pg", "r") +COMMAND("pg dump_stuck " \ + "name=stuckops,type=CephChoices,strings=inactive|unclean|stale|undersized|degraded,n=N,req=false " \ + "name=threshold,type=CephInt,req=false", + "show information about stuck pgs",\ + "pg", "r") +COMMAND("pg debug " \ + "name=debugop,type=CephChoices,strings=unfound_objects_exist|degraded_pgs_exist", \ + "show debug info about pgs", "pg", "r") + +COMMAND("pg scrub name=pgid,type=CephPgid", "start scrub on <pgid>", \ + "pg", "rw") +COMMAND("pg deep-scrub name=pgid,type=CephPgid", "start deep-scrub on <pgid>", \ + "pg", "rw") +COMMAND("pg repair name=pgid,type=CephPgid", "start repair on <pgid>", \ + "pg", "rw") + +COMMAND("pg force-recovery name=pgid,type=CephPgid,n=N", "force recovery of <pgid> first", \ + "pg", "rw") +COMMAND("pg force-backfill name=pgid,type=CephPgid,n=N", "force backfill of <pgid> first", \ + "pg", "rw") +COMMAND("pg cancel-force-recovery name=pgid,type=CephPgid,n=N", "restore normal recovery priority of <pgid>", \ + "pg", "rw") +COMMAND("pg cancel-force-backfill name=pgid,type=CephPgid,n=N", "restore normal backfill priority of <pgid>", \ + "pg", "rw") + +// stuff in osd namespace +COMMAND("osd perf", \ + "print dump of OSD perf summary stats", \ + "osd", \ + "r") +COMMAND("osd df " \ + "name=output_method,type=CephChoices,strings=plain|tree,req=false " \ + "name=filter_by,type=CephChoices,strings=class|name,req=false " \ + "name=filter,type=CephString,req=false", \ + "show OSD utilization", "osd", "r") +COMMAND("osd blocked-by", \ + "print histogram of which OSDs are blocking their peers", \ + "osd", "r") +COMMAND("osd pool stats " \ + "name=pool_name,type=CephPoolname,req=false", + "obtain stats from all pools, or from specified pool", + "osd", "r") +COMMAND("osd pool scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate scrub on pool <who>", \ + "osd", "rw") +COMMAND("osd pool deep-scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate deep-scrub on pool <who>", \ + "osd", "rw") +COMMAND("osd pool repair " \ + "name=who,type=CephPoolname,n=N", \ + "initiate repair on pool <who>", \ + "osd", "rw") +COMMAND("osd pool force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "force recovery of specified pool <who> first", \ + "osd", "rw") +COMMAND("osd pool force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "force backfill of specified pool <who> first", \ + "osd", "rw") +COMMAND("osd pool cancel-force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool <who>", \ + "osd", "rw") +COMMAND("osd 
pool cancel-force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool <who>", \ + "osd", "rw") +COMMAND("osd reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ + "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephBool,req=false",\ + "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "r") +COMMAND("osd reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "r") + +COMMAND("osd destroy " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "mark osd as being destroyed. Keeps the ID intact (allowing reuse), " \ + "but removes cephx keys, config-key data and lockbox keys, "\ + "rendering data permanently unreadable.", \ + "osd", "rw") +COMMAND("osd purge " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "purge all osd data from the monitors including the OSD id " \ + "and CRUSH position", \ + "osd", "rw") + +COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N", + "check whether osd(s) can be safely destroyed without reducing data durability", + "osd", "r") +COMMAND("osd ok-to-stop name=ids,type=CephString,n=N "\ + "name=max,type=CephInt,req=false", + "check whether osd(s) can be safely stopped without reducing immediate"\ + " data availability", "osd", "r") + +COMMAND("osd scrub " \ + "name=who,type=CephString", \ + "initiate scrub on osd <who>, or use <all|any> to scrub all", \ + "osd", "rw") +COMMAND("osd deep-scrub " \ + "name=who,type=CephString", \ + "initiate deep scrub on osd <who>, or use <all|any> to deep scrub all", \ + "osd", "rw") +COMMAND("osd repair " \ + "name=who,type=CephString", \ + "initiate repair on osd <who>, or use <all|any> to repair all", \ + "osd", "rw") + +COMMAND("service dump", + "dump service map", "service", "r") +COMMAND("service status", + "dump service state", "service", "r") + +COMMAND("config show " \ + "name=who,type=CephString name=key,type=CephString,req=False", + "Show running configuration", + "mgr", "r") +COMMAND("config show-with-defaults " \ + "name=who,type=CephString", + "Show running configuration (including compiled-in defaults)", + "mgr", "r") + +COMMAND("device ls", + "Show devices", + "mgr", "r") +COMMAND("device info name=devid,type=CephString", + "Show information about 
a device", + "mgr", "r") +COMMAND("device ls-by-daemon name=who,type=CephString", + "Show devices associated with a daemon", + "mgr", "r") +COMMAND("device ls-by-host name=host,type=CephString", + "Show devices on a host", + "mgr", "r") +COMMAND("device set-life-expectancy name=devid,type=CephString "\ + "name=from,type=CephString "\ + "name=to,type=CephString,req=False", + "Set predicted device life expectancy", + "mgr", "rw") +COMMAND("device rm-life-expectancy name=devid,type=CephString", + "Clear predicted device life expectancy", + "mgr", "rw") diff --git a/src/mgr/MgrContext.h b/src/mgr/MgrContext.h new file mode 100644 index 000000000..a5490bef3 --- /dev/null +++ b/src/mgr/MgrContext.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CONTEXT_H_ +#define MGR_CONTEXT_H_ + +#include <memory> + +#include "common/ceph_json.h" +#include "common/Cond.h" +#include "mon/MonClient.h" + +class Command +{ +protected: + C_SaferCond cond; +public: + ceph::buffer::list outbl; + std::string outs; + int r; + + void run(MonClient *monc, const std::string &command) + { + monc->start_mon_command({command}, {}, + &outbl, &outs, &cond); + } + + void run(MonClient *monc, const std::string &command, const ceph::buffer::list &inbl) + { + monc->start_mon_command({command}, inbl, + &outbl, &outs, &cond); + } + + virtual void wait() + { + r = cond.wait(); + } + + virtual ~Command() {} +}; + + +class JSONCommand : public Command +{ +public: + json_spirit::mValue json_result; + + void wait() override + { + Command::wait(); + + if (r == 0) { + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + r = -EINVAL; + } + } + } +}; + +#endif + diff --git a/src/mgr/MgrSession.h b/src/mgr/MgrSession.h new file mode 100644 index 000000000..40b50220b --- /dev/null +++ b/src/mgr/MgrSession.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MGRSESSION_H +#define CEPH_MGR_MGRSESSION_H + +#include "common/RefCountedObj.h" +#include "common/entity_name.h" +#include "msg/msg_types.h" +#include "MgrCap.h" + + +/** + * Session state associated with the Connection. 
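+ *
+ * Holds the authenticated entity's name and its MgrCap; callers are
+ * expected to consult caps.is_capable() (see MgrCap.h) when authorizing
+ * requests received on this connection.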
+ */ +struct MgrSession : public RefCountedObject { + uint64_t global_id = 0; + EntityName entity_name; + entity_inst_t inst; + + int osd_id = -1; ///< osd id (if an osd) + + MgrCap caps; + + std::set<std::string> declared_types; + + const entity_addr_t& get_peer_addr() const { + return inst.addr; + } + +private: + FRIEND_MAKE_REF(MgrSession); + explicit MgrSession(CephContext *cct) : RefCountedObject(cct) {} + ~MgrSession() override = default; +}; + +using MgrSessionRef = ceph::ref_t<MgrSession>; + + +#endif diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc new file mode 100644 index 000000000..2821bf4cf --- /dev/null +++ b/src/mgr/MgrStandby.cc @@ -0,0 +1,503 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include <Python.h> + +#include "common/errno.h" +#include "common/signal.h" +#include "include/compat.h" + +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#include "mgr/MgrContext.h" +#include "mgr/mgr_commands.h" +#include "mgr/mgr_perf_counters.h" + +#include "messages/MMgrBeacon.h" +#include "messages/MMgrMap.h" +#include "Mgr.h" + +#include "MgrStandby.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +MgrStandby::MgrStandby(int argc, const char **argv) : + Dispatcher(g_ceph_context), + monc{g_ceph_context, poolctx}, + client_messenger(Messenger::create( + g_ceph_context, + cct->_conf.get_val<std::string>("ms_type"), + entity_name_t::MGR(), + "mgr", + Messenger::get_pid_nonce())), + objecter{g_ceph_context, client_messenger.get(), &monc, poolctx}, + client{client_messenger.get(), &monc, &objecter}, + mgrc(g_ceph_context, client_messenger.get(), &monc.monmap), + log_client(g_ceph_context, client_messenger.get(), &monc.monmap, LogClient::NO_FLAGS), + clog(log_client.create_channel(CLOG_CHANNEL_CLUSTER)), + audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)), + finisher(g_ceph_context, "MgrStandby", "mgrsb-fin"), + timer(g_ceph_context, lock), + py_module_registry(clog), + active_mgr(nullptr), + orig_argc(argc), + orig_argv(argv), + available_in_map(false) +{ +} + +MgrStandby::~MgrStandby() = default; + +const char** MgrStandby::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "mgr_standby_modules", + "host", + "fsid", + NULL + }; + return KEYS; +} + +void MgrStandby::handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + _update_log_config(); + } + if (changed.count("mgr_standby_modules") && !active_mgr) { + if 
(g_conf().get_val<bool>("mgr_standby_modules") != py_module_registry.have_standby_modules()) { + dout(1) << "mgr_standby_modules now " + << (int)g_conf().get_val<bool>("mgr_standby_modules") + << ", standby modules are " + << (py_module_registry.have_standby_modules() ? "":"not ") + << "active, respawning" + << dendl; + respawn(); + } + } +} + +int MgrStandby::init() +{ + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + + cct->_conf.add_observer(this); + + std::lock_guard l(lock); + + // Start finisher + finisher.start(); + + // Initialize Messenger + client_messenger->add_dispatcher_tail(this); + client_messenger->add_dispatcher_head(&objecter); + client_messenger->add_dispatcher_tail(&client); + client_messenger->start(); + + poolctx.start(2); + + // Initialize MonClient + if (monc.build_initial_monmap() < 0) { + client_messenger->shutdown(); + client_messenger->wait(); + return -1; + } + + monc.sub_want("mgrmap", 0, 0); + + monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD + |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR); + monc.set_messenger(client_messenger.get()); + + // We must register our config callback before calling init(), so + // that we see the initial configuration message + monc.register_config_callback([this](const std::string &k, const std::string &v){ + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "config_callback: " << k << " : " << v << dendl; + dout(10) << "config_callback: " << k << " : " << dendl; + if (k.substr(0, 4) == "mgr/") { + py_module_registry.handle_config(k, v); + return true; + } + return false; + }); + monc.register_config_notify_callback([this]() { + py_module_registry.handle_config_notify(); + }); + dout(4) << "Registered monc callback" << dendl; + + int r = monc.init(); + if (r < 0) { + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + mgrc.init(); + client_messenger->add_dispatcher_tail(&mgrc); + + r = monc.authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify a mgr ID with a valid keyring?" << dendl; + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + // only forward monmap updates after authentication finishes, otherwise + // monc.authenticate() will be waiting for MgrStandy::ms_dispatch() + // to acquire the lock forever, as it is already locked in the beginning of + // this method. + monc.set_passthrough_monmap(); + + client_t whoami = monc.get_global_id(); + client_messenger->set_myname(entity_name_t::MGR(whoami.v)); + monc.set_log_client(&log_client); + _update_log_config(); + objecter.set_client_incarnation(0); + objecter.init(); + objecter.start(); + client.init(); + timer.init(); + + py_module_registry.init(); + mgr_perf_start(g_ceph_context); + + + tick(); + + dout(4) << "Complete." << dendl; + return 0; +} + +void MgrStandby::send_beacon() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + dout(20) << state_str() << dendl; + + auto modules = py_module_registry.get_modules(); + + // Construct a list of the info about each loaded module + // which we will transmit to the monitor. 
+ std::vector<MgrMap::ModuleInfo> module_info; + for (const auto &module : modules) { + MgrMap::ModuleInfo info; + info.name = module->get_name(); + info.error_string = module->get_error_string(); + info.can_run = module->get_can_run(); + info.module_options = module->get_options(); + module_info.push_back(std::move(info)); + } + + auto clients = py_module_registry.get_clients(); + for (const auto& client : clients) { + dout(15) << "noting RADOS client for blocklist: " << client << dendl; + } + + // Whether I think I am available (request MgrMonitor to set me + // as available in the map) + bool available = active_mgr != nullptr && active_mgr->is_initialized(); + + auto addrs = available ? active_mgr->get_server_addrs() : entity_addrvec_t(); + dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl; + + map<string,string> metadata; + metadata["addr"] = client_messenger->get_myaddr_legacy().ip_only_to_str(); + metadata["addrs"] = stringify(client_messenger->get_myaddrs()); + collect_sys_info(&metadata, g_ceph_context); + + auto m = ceph::make_message<MMgrBeacon>(monc.get_fsid(), + monc.get_global_id(), + g_conf()->name.get_id(), + addrs, + available, + std::move(module_info), + std::move(metadata), + std::move(clients), + CEPH_FEATURES_ALL); + + if (available) { + if (!available_in_map) { + // We are informing the mon that we are done initializing: inform + // it of our command set. This has to happen after init() because + // it needs the python modules to have loaded. + std::vector<MonCommand> commands = mgr_commands; + std::vector<MonCommand> py_commands = py_module_registry.get_commands(); + commands.insert(commands.end(), py_commands.begin(), py_commands.end()); + m->set_command_descs(commands); + dout(4) << "going active, including " << m->get_command_descs().size() + << " commands in beacon" << dendl; + } + + m->set_services(active_mgr->get_services()); + } + + monc.send_mon_message(std::move(m)); +} + +void MgrStandby::tick() +{ + dout(10) << __func__ << dendl; + send_beacon(); + + timer.add_event_after( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count(), + new LambdaContext([this](int r){ + tick(); + } + )); +} + +void MgrStandby::shutdown() +{ + finisher.queue(new LambdaContext([&](int) { + std::lock_guard l(lock); + + dout(4) << "Shutting down" << dendl; + + py_module_registry.shutdown(); + // stop sending beacon first, I use monc to talk with monitors + timer.shutdown(); + // client uses monc and objecter + client.shutdown(); + mgrc.shutdown(); + // Stop asio threads, so leftover events won't call into shut down + // monclient/objecter. + poolctx.finish(); + // stop monc, so mon won't be able to instruct me to shutdown/activate after + // the active_mgr is stopped + monc.shutdown(); + if (active_mgr) { + active_mgr->shutdown(); + } + // objecter is used by monc and active_mgr + objecter.shutdown(); + // client_messenger is used by all of them, so stop it in the end + client_messenger->shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); + mgr_perf_stop(g_ceph_context); +} + +void MgrStandby::respawn() +{ + // --- WARNING TO FUTURE COPY/PASTERS --- + // You must also add a call like + // + // ceph_pthread_setname(pthread_self(), "ceph-mgr"); + // + // to main() so that /proc/$pid/stat field 2 contains "(ceph-mgr)" + // instead of "(exe)", so that killall (and log rotation) will work. 
+ + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i<orig_argc; i++) { + new_argv[i] = (char *)orig_argv[i]; + dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl; + } + new_argv[orig_argc] = NULL; + + /* Determine the path to our executable, test if Linux /proc/self/exe exists. + * This allows us to exec the same executable even if it has since been + * unlinked. + */ + char exe_path[PATH_MAX] = ""; + if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) == -1) { + /* Print CWD for the user's interest */ + char buf[PATH_MAX]; + char *cwd = getcwd(buf, sizeof(buf)); + ceph_assert(cwd); + dout(1) << " cwd " << cwd << dendl; + + /* Fall back to a best-effort: just running in our CWD */ + strncpy(exe_path, orig_argv[0], PATH_MAX-1); + } else { + dout(1) << "respawning with exe " << exe_path << dendl; + strcpy(exe_path, PROCPREFIX "/proc/self/exe"); + } + + dout(1) << " exe_path " << exe_path << dendl; + + unblock_all_signals(NULL); + execv(exe_path, new_argv); + + derr << "respawn execv " << orig_argv[0] + << " failed with " << cpp_strerror(errno) << dendl; + ceph_abort(); +} + +void MgrStandby::_update_log_config() +{ + map<string,string> log_to_monitors; + map<string,string> log_to_syslog; + map<string,string> log_channel; + map<string,string> log_prio; + map<string,string> log_to_graylog; + map<string,string> log_to_graylog_host; + map<string,string> log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(cct, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host) == 0) { + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + audit_clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + } +} + +void MgrStandby::handle_mgr_map(ref_t<MMgrMap> mmap) +{ + auto &map = mmap->get_map(); + dout(4) << "received map epoch " << map.get_epoch() << dendl; + const bool active_in_map = map.active_gid == monc.get_global_id(); + dout(4) << "active in map: " << active_in_map + << " active is " << map.active_gid << dendl; + + // PyModuleRegistry may ask us to respawn if it sees that + // this MgrMap is changing its set of enabled modules + bool need_respawn = py_module_registry.handle_mgr_map(map); + if (need_respawn) { + dout(1) << "respawning because set of enabled modules changed!" << dendl; + respawn(); + } + + if (active_in_map) { + if (!active_mgr) { + dout(1) << "Activating!" << dendl; + active_mgr.reset(new Mgr(&monc, map, &py_module_registry, + client_messenger.get(), &objecter, + &client, clog, audit_clog)); + active_mgr->background_init(new LambdaContext( + [this](int r){ + // Advertise our active-ness ASAP instead of waiting for + // next tick. 
+ std::lock_guard l(lock); + send_beacon(); + })); + dout(1) << "I am now activating" << dendl; + } else { + dout(10) << "I was already active" << dendl; + bool need_respawn = active_mgr->got_mgr_map(map); + if (need_respawn) { + respawn(); + } + } + + if (!available_in_map && map.get_available()) { + dout(4) << "Map now says I am available" << dendl; + available_in_map = true; + } + } else if (active_mgr != nullptr) { + derr << "I was active but no longer am" << dendl; + respawn(); + } else { + if (map.active_gid != 0 && map.active_name != g_conf()->name.get_id()) { + // I am the standby and someone else is active, start modules + // in standby mode to do redirects if needed + if (!py_module_registry.is_standby_running() && + g_conf().get_val<bool>("mgr_standby_modules")) { + py_module_registry.standby_start(monc, finisher); + } + } + } +} + +bool MgrStandby::ms_dispatch2(const ref_t<Message>& m) +{ + std::lock_guard l(lock); + dout(10) << state_str() << " " << *m << dendl; + + if (m->get_type() == MSG_MGR_MAP) { + handle_mgr_map(ref_cast<MMgrMap>(m)); + } + bool handled = false; + if (active_mgr) { + auto am = active_mgr; + lock.unlock(); + handled = am->ms_dispatch2(m); + lock.lock(); + } + if (m->get_type() == MSG_MGR_MAP) { + // let this pass through for mgrc + handled = false; + } + return handled; +} + + +bool MgrStandby::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +int MgrStandby::main(vector<const char *> args) +{ + client_messenger->wait(); + + // Disable signal handlers + unregister_async_signal_handler(SIGHUP, sighup_handler); + shutdown_async_signal_handler(); + + return 0; +} + + +std::string MgrStandby::state_str() +{ + if (active_mgr == nullptr) { + return "standby"; + } else if (active_mgr->is_initialized()) { + return "active"; + } else { + return "active (starting)"; + } +} diff --git a/src/mgr/MgrStandby.h b/src/mgr/MgrStandby.h new file mode 100644 index 000000000..cac31a576 --- /dev/null +++ b/src/mgr/MgrStandby.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#ifndef MGR_STANDBY_H_ +#define MGR_STANDBY_H_ + +#include "auth/Auth.h" +#include "common/async/context_pool.h" +#include "common/Finisher.h" +#include "common/Timer.h" +#include "common/LogClient.h" + +#include "client/Client.h" +#include "mon/MonClient.h" +#include "osdc/Objecter.h" +#include "PyModuleRegistry.h" +#include "MgrClient.h" + +class MMgrMap; +class Mgr; +class PyModuleConfig; + +class MgrStandby : public Dispatcher, + public md_config_obs_t { +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + +protected: + ceph::async::io_context_pool poolctx; + MonClient monc; + std::unique_ptr<Messenger> client_messenger; + Objecter objecter; + Client client; + + MgrClient mgrc; + + LogClient log_client; + LogChannelRef clog, audit_clog; + + ceph::mutex lock = ceph::make_mutex("MgrStandby::lock"); + Finisher finisher; + SafeTimer timer; + + PyModuleRegistry py_module_registry; + std::shared_ptr<Mgr> active_mgr; + + int orig_argc; + const char **orig_argv; + + std::string state_str(); + + void handle_mgr_map(ceph::ref_t<MMgrMap> m); + void _update_log_config(); + void send_beacon(); + + bool available_in_map; + +public: + MgrStandby(int argc, const char **argv); + ~MgrStandby() override; + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + int init(); + void shutdown(); + void respawn(); + int main(vector<const char *> args); + void tick(); +}; + +#endif + diff --git a/src/mgr/OSDPerfMetricCollector.cc b/src/mgr/OSDPerfMetricCollector.cc new file mode 100644 index 000000000..eb548ce70 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "messages/MMgrReport.h" +#include "OSDPerfMetricCollector.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.osd_perf_metric_collector " << __func__ << " " + +OSDPerfMetricCollector::OSDPerfMetricCollector(MetricListener &listener) + : MetricCollector<OSDPerfMetricQuery, + OSDPerfMetricLimit, + OSDPerfMetricKey, + OSDPerfMetricReport>(listener) { +} + +void OSDPerfMetricCollector::process_reports(const MetricPayload &payload) { + const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = + boost::get<OSDMetricPayload>(payload).report; + + std::lock_guard locker(lock); + process_reports_generic( + reports, [](PerformanceCounter *counter, const PerformanceCounter &update) { + counter->first += update.first; + counter->second += update.second; + }); +} + +int OSDPerfMetricCollector::get_counters(PerfCollector *collector) { + OSDPerfCollector *c = static_cast<OSDPerfCollector *>(collector); + + std::lock_guard locker(lock); + return get_counters_generic(c->query_id, &c->counters); +} diff --git a/src/mgr/OSDPerfMetricCollector.h b/src/mgr/OSDPerfMetricCollector.h new file mode 100644 index 000000000..c531dbf63 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_COLLECTOR_H_ +#define 
OSD_PERF_METRIC_COLLECTOR_H_ + +#include "mgr/MetricCollector.h" +#include "mgr/OSDPerfMetricTypes.h" + +/** + * OSD performance query class. + */ +class OSDPerfMetricCollector + : public MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey, + OSDPerfMetricReport> { +public: + OSDPerfMetricCollector(MetricListener &listener); + + void process_reports(const MetricPayload &payload) override; + int get_counters(PerfCollector *collector) override; +}; + +#endif // OSD_PERF_METRIC_COLLECTOR_H_ diff --git a/src/mgr/OSDPerfMetricTypes.cc b/src/mgr/OSDPerfMetricTypes.cc new file mode 100644 index 000000000..bce95e0ae --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.cc @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/OSDPerfMetricTypes.h" + +#include <ostream> + +using ceph::bufferlist; + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d) { + switch(d.type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + os << "client_id"; + break; + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + os << "client_address"; + break; + case OSDPerfMetricSubKeyType::POOL_ID: + os << "pool_id"; + break; + case OSDPerfMetricSubKeyType::NAMESPACE: + os << "namespace"; + break; + case OSDPerfMetricSubKeyType::OSD_ID: + os << "osd_id"; + break; + case OSDPerfMetricSubKeyType::PG_ID: + os << "pg_id"; + break; + case OSDPerfMetricSubKeyType::OBJECT_NAME: + os << "object_name"; + break; + case OSDPerfMetricSubKeyType::SNAP_ID: + os << "snap_id"; + break; + default: + os << "unknown (" << static_cast<int>(d.type) << ")"; + } + return os << "~/" << d.regex_str << "/"; +} + +void PerformanceCounterDescriptor::pack_counter(const PerformanceCounter &c, + bufferlist *bl) const { + using ceph::encode; + encode(c.first, *bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + encode(c.second, *bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +void PerformanceCounterDescriptor::unpack_counter( + bufferlist::const_iterator& bl, PerformanceCounter *c) const { + using ceph::decode; + decode(c->first, bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + decode(c->second, bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d) { + switch(d.type) { + case PerformanceCounterType::OPS: + return os << "ops"; + case PerformanceCounterType::WRITE_OPS: + return os << "write ops"; + case PerformanceCounterType::READ_OPS: + return os << "read ops"; + case PerformanceCounterType::BYTES: + return os << "bytes"; + case PerformanceCounterType::WRITE_BYTES: + return os << "write bytes"; + case PerformanceCounterType::READ_BYTES: + return os << "read bytes"; + case PerformanceCounterType::LATENCY: 
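The pack_counter()/unpack_counter() pair defined just above serializes a PerformanceCounter asymmetrically: the first field always travels on the wire, the second only for the latency counter types. A minimal round-trip sketch of that behaviour, assuming PerformanceCounter is the pair of 64-bit values the collector aggregates (the concrete numbers and the helper name are invented for illustration, not part of this patch):

#include "mgr/OSDPerfMetricTypes.h"
#include "include/buffer.h"
#include <cassert>

static void pack_unpack_roundtrip()
{
  PerformanceCounterDescriptor d(PerformanceCounterType::WRITE_LATENCY);
  PerformanceCounter in{1500000, 42};   // e.g. summed latency and sample count -- assumed semantics
  ceph::buffer::list bl;
  d.pack_counter(in, &bl);              // encodes .first, and .second because this is a latency type
  PerformanceCounter out;
  auto it = bl.cbegin();
  d.unpack_counter(it, &out);           // symmetric decode
  assert(in == out);                    // for OPS/BYTES types only .first would survive the trip
}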
+ return os << "latency"; + case PerformanceCounterType::WRITE_LATENCY: + return os << "write latency"; + case PerformanceCounterType::READ_LATENCY: + return os << "read latency"; + default: + return os << "unknown (" << static_cast<int>(d.type) << ")"; + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit) { + return os << "{order_by=" << limit.order_by << ", max_count=" + << limit.max_count << "}"; +} + +void OSDPerfMetricQuery::pack_counters(const PerformanceCounters &counters, + bufferlist *bl) const { + auto it = counters.begin(); + for (auto &descriptor : performance_counter_descriptors) { + if (it == counters.end()) { + descriptor.pack_counter(PerformanceCounter(), bl); + } else { + descriptor.pack_counter(*it, bl); + it++; + } + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query) { + return os << "{key=" << query.key_descriptor << ", counters=" + << query.performance_counter_descriptors << "}"; +} diff --git a/src/mgr/OSDPerfMetricTypes.h b/src/mgr/OSDPerfMetricTypes.h new file mode 100644 index 000000000..1b5904e13 --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.h @@ -0,0 +1,360 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_H_ +#define OSD_PERF_METRIC_H_ + +#include "include/denc.h" +#include "include/stringify.h" + +#include "mgr/Types.h" + +#include <regex> + +typedef std::vector<std::string> OSDPerfMetricSubKey; // array of regex match +typedef std::vector<OSDPerfMetricSubKey> OSDPerfMetricKey; + +enum class OSDPerfMetricSubKeyType : uint8_t { + CLIENT_ID = 0, + CLIENT_ADDRESS = 1, + POOL_ID = 2, + NAMESPACE = 3, + OSD_ID = 4, + PG_ID = 5, + OBJECT_NAME = 6, + SNAP_ID = 7, +}; + +struct OSDPerfMetricSubKeyDescriptor { + OSDPerfMetricSubKeyType type = static_cast<OSDPerfMetricSubKeyType>(-1); + std::string regex_str; + std::regex regex; + + bool is_supported() const { + switch (type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + case OSDPerfMetricSubKeyType::POOL_ID: + case OSDPerfMetricSubKeyType::NAMESPACE: + case OSDPerfMetricSubKeyType::OSD_ID: + case OSDPerfMetricSubKeyType::PG_ID: + case OSDPerfMetricSubKeyType::OBJECT_NAME: + case OSDPerfMetricSubKeyType::SNAP_ID: + return true; + default: + return false; + } + } + + OSDPerfMetricSubKeyDescriptor() { + } + + OSDPerfMetricSubKeyDescriptor(OSDPerfMetricSubKeyType type, + const std::string regex) + : type(type), regex_str(regex) { + } + + bool operator<(const OSDPerfMetricSubKeyDescriptor &other) const { + if (type < other.type) { + return true; + } + if (type > other.type) { + return false; + } + return regex_str < other.regex_str; + } + + DENC(OSDPerfMetricSubKeyDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.regex_str, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricSubKeyDescriptor) + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d); + +typedef std::vector<OSDPerfMetricSubKeyDescriptor> OSDPerfMetricKeyDescriptor; + +template<> +struct denc_traits<OSDPerfMetricKeyDescriptor> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const OSDPerfMetricKeyDescriptor& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } 
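WRITE_CLASS_DENC above generates plain encode()/decode() wrappers for the sub-key descriptor, while the denc_traits specialization for the whole key descriptor re-validates and recompiles each regex on decode, dropping the descriptor entirely if any pattern is invalid or has no capture group. A hedged round-trip sketch, assuming the tree's generic denc-based encode()/decode() helpers apply to these types as they do elsewhere (illustrative only, not part of the patch):

#include "mgr/OSDPerfMetricTypes.h"
#include "include/buffer.h"

static OSDPerfMetricKeyDescriptor roundtrip_key_descriptor()
{
  OSDPerfMetricKeyDescriptor in;
  in.emplace_back(OSDPerfMetricSubKeyType::POOL_ID, "(\\d+)");          // one capture group, as decode() requires
  in.emplace_back(OSDPerfMetricSubKeyType::OBJECT_NAME, "(rbd_data\\..*)");

  ceph::buffer::list bl;
  encode(in, bl);            // denc-based encode via the vector specialization above

  OSDPerfMetricKeyDescriptor out;
  auto it = bl.cbegin();
  decode(out, it);           // recompiles each regex; an invalid or group-less
                             // pattern would leave `out` empty
  return out;
}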
+ static void encode(const OSDPerfMetricKeyDescriptor& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(OSDPerfMetricKeyDescriptor& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + OSDPerfMetricSubKeyDescriptor d; + denc(d, p); + if (!d.is_supported()) { + v.clear(); + return; + } + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + v.clear(); + return; + } + if (d.regex.mark_count() == 0) { + v.clear(); + return; + } + v.push_back(std::move(d)); + } + } +}; + +enum class PerformanceCounterType : uint8_t { + OPS = 0, + WRITE_OPS = 1, + READ_OPS = 2, + BYTES = 3, + WRITE_BYTES = 4, + READ_BYTES = 5, + LATENCY = 6, + WRITE_LATENCY = 7, + READ_LATENCY = 8, +}; + +struct PerformanceCounterDescriptor { + PerformanceCounterType type = static_cast<PerformanceCounterType>(-1); + + bool is_supported() const { + switch (type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + return true; + default: + return false; + } + } + + PerformanceCounterDescriptor() { + } + + PerformanceCounterDescriptor(PerformanceCounterType type) : type(type) { + } + + bool operator<(const PerformanceCounterDescriptor &other) const { + return type < other.type; + } + + bool operator==(const PerformanceCounterDescriptor &other) const { + return type == other.type; + } + + bool operator!=(const PerformanceCounterDescriptor &other) const { + return type != other.type; + } + + DENC(PerformanceCounterDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + DENC_FINISH(p); + } + + void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const; + void unpack_counter(ceph::buffer::list::const_iterator& bl, + PerformanceCounter *c) const; +}; +WRITE_CLASS_DENC(PerformanceCounterDescriptor) + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d); + +typedef std::vector<PerformanceCounterDescriptor> PerformanceCounterDescriptors; + +template<> +struct denc_traits<PerformanceCounterDescriptors> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const PerformanceCounterDescriptors& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const PerformanceCounterDescriptors& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(PerformanceCounterDescriptors& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + PerformanceCounterDescriptor d; + denc(d, p); + if (d.is_supported()) { + v.push_back(std::move(d)); + } + } + } +}; + +struct OSDPerfMetricLimit { + PerformanceCounterDescriptor order_by; + uint64_t max_count = 0; + + OSDPerfMetricLimit() { + } + + OSDPerfMetricLimit(const 
PerformanceCounterDescriptor &order_by, + uint64_t max_count) + : order_by(order_by), max_count(max_count) { + } + + bool operator<(const OSDPerfMetricLimit &other) const { + if (order_by != other.order_by) { + return order_by < other.order_by; + } + return max_count < other.max_count; + } + + DENC(OSDPerfMetricLimit, v, p) { + DENC_START(1, 1, p); + denc(v.order_by, p); + denc(v.max_count, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricLimit) + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit); + +typedef std::set<OSDPerfMetricLimit> OSDPerfMetricLimits; + +struct OSDPerfMetricQuery { + bool operator<(const OSDPerfMetricQuery &other) const { + if (key_descriptor < other.key_descriptor) { + return true; + } + if (key_descriptor > other.key_descriptor) { + return false; + } + return (performance_counter_descriptors < + other.performance_counter_descriptors); + } + + OSDPerfMetricQuery() { + } + + OSDPerfMetricQuery( + const OSDPerfMetricKeyDescriptor &key_descriptor, + const PerformanceCounterDescriptors &performance_counter_descriptors) + : key_descriptor(key_descriptor), + performance_counter_descriptors(performance_counter_descriptors) { + } + + template <typename L> + bool get_key(L&& get_sub_key, OSDPerfMetricKey *key) const { + for (auto &sub_key_descriptor : key_descriptor) { + OSDPerfMetricSubKey sub_key; + if (!get_sub_key(sub_key_descriptor, &sub_key)) { + return false; + } + key->push_back(sub_key); + } + return true; + } + + DENC(OSDPerfMetricQuery, v, p) { + DENC_START(1, 1, p); + denc(v.key_descriptor, p); + denc(v.performance_counter_descriptors, p); + DENC_FINISH(p); + } + + void get_performance_counter_descriptors( + PerformanceCounterDescriptors *descriptors) const { + *descriptors = performance_counter_descriptors; + } + + template <typename L> + void update_counters(L &&update_counter, + PerformanceCounters *counters) const { + auto it = counters->begin(); + for (auto &descriptor : performance_counter_descriptors) { + // TODO: optimize + if (it == counters->end()) { + counters->push_back(PerformanceCounter()); + it = std::prev(counters->end()); + } + update_counter(descriptor, &(*it)); + it++; + } + } + + void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; + + OSDPerfMetricKeyDescriptor key_descriptor; + PerformanceCounterDescriptors performance_counter_descriptors; +}; +WRITE_CLASS_DENC(OSDPerfMetricQuery) + +struct OSDPerfCollector : PerfCollector { + std::map<OSDPerfMetricKey, PerformanceCounters> counters; + + OSDPerfCollector(MetricQueryID query_id) + : PerfCollector(query_id) { + } +}; + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query); + +struct OSDPerfMetricReport { + PerformanceCounterDescriptors performance_counter_descriptors; + std::map<OSDPerfMetricKey, ceph::buffer::list> group_packed_performance_counters; + + DENC(OSDPerfMetricReport, v, p) { + DENC_START(1, 1, p); + denc(v.performance_counter_descriptors, p); + denc(v.group_packed_performance_counters, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricReport) + +#endif // OSD_PERF_METRIC_H_ + diff --git a/src/mgr/PyFormatter.cc b/src/mgr/PyFormatter.cc new file mode 100644 index 000000000..8e58f6e9a --- /dev/null +++ b/src/mgr/PyFormatter.cc @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat Inc + * + * Author: John Spray <john.spray@redhat.com> 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "PyFormatter.h" +#include <fstream> + +#define LARGE_SIZE 1024 + + +void PyFormatter::open_array_section(std::string_view name) +{ + PyObject *list = PyList_New(0); + dump_pyobject(name, list); + stack.push(cursor); + cursor = list; +} + +void PyFormatter::open_object_section(std::string_view name) +{ + PyObject *dict = PyDict_New(); + dump_pyobject(name, dict); + stack.push(cursor); + cursor = dict; +} + +void PyFormatter::dump_unsigned(std::string_view name, uint64_t u) +{ + PyObject *p = PyLong_FromUnsignedLong(u); + ceph_assert(p); + dump_pyobject(name, p); +} + +void PyFormatter::dump_int(std::string_view name, int64_t u) +{ + PyObject *p = PyLong_FromLongLong(u); + ceph_assert(p); + dump_pyobject(name, p); +} + +void PyFormatter::dump_float(std::string_view name, double d) +{ + dump_pyobject(name, PyFloat_FromDouble(d)); +} + +void PyFormatter::dump_string(std::string_view name, std::string_view s) +{ + dump_pyobject(name, PyUnicode_FromString(s.data())); +} + +void PyFormatter::dump_bool(std::string_view name, bool b) +{ + if (b) { + Py_INCREF(Py_True); + dump_pyobject(name, Py_True); + } else { + Py_INCREF(Py_False); + dump_pyobject(name, Py_False); + } +} + +std::ostream& PyFormatter::dump_stream(std::string_view name) +{ + // Give the caller an ostream, construct a PyString, + // and remember the association between the two. On flush, + // we'll read from the ostream into the PyString + auto ps = std::make_shared<PendingStream>(); + ps->cursor = cursor; + ps->name = name; + + pending_streams.push_back(ps); + + return ps->stream; +} + +void PyFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) +{ + char buf[LARGE_SIZE]; + vsnprintf(buf, LARGE_SIZE, fmt, ap); + + dump_pyobject(name, PyUnicode_FromString(buf)); +} + +/** + * Steals reference to `p` + */ +void PyFormatter::dump_pyobject(std::string_view name, PyObject *p) +{ + if (PyList_Check(cursor)) { + PyList_Append(cursor, p); + Py_DECREF(p); + } else if (PyDict_Check(cursor)) { + PyObject *key = PyUnicode_DecodeUTF8(name.data(), name.size(), nullptr); + PyDict_SetItem(cursor, key, p); + Py_DECREF(key); + Py_DECREF(p); + } else { + ceph_abort(); + } +} + +void PyFormatter::finish_pending_streams() +{ + for (const auto &i : pending_streams) { + PyObject *tmp_cur = cursor; + cursor = i->cursor; + dump_pyobject( + i->name.c_str(), + PyUnicode_FromString(i->stream.str().c_str())); + cursor = tmp_cur; + } + + pending_streams.clear(); +} + +PyObject* PyJSONFormatter::get() +{ + if(json_formatter::stack_size()) { + close_section(); + } + ceph_assert(!json_formatter::stack_size()); + std::ostringstream ss; + flush(ss); + std::string s = ss.str(); + PyObject* obj = PyBytes_FromStringAndSize(std::move(s.c_str()), s.size()); + return obj; +} diff --git a/src/mgr/PyFormatter.h b/src/mgr/PyFormatter.h new file mode 100644 index 000000000..5e4c0a679 --- /dev/null +++ b/src/mgr/PyFormatter.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat Inc + * + * Author: John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser 
General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PY_FORMATTER_H_ +#define PY_FORMATTER_H_ + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include <stack> +#include <string> +#include <string_view> +#include <sstream> +#include <memory> +#include <list> + +#include "common/Formatter.h" +#include "include/ceph_assert.h" + +class PyFormatter : public ceph::Formatter +{ +public: + PyFormatter (const PyFormatter&) = delete; + PyFormatter& operator= (const PyFormatter&) = delete; + PyFormatter(bool pretty = false, bool array = false) + { + // It is forbidden to instantiate me outside of the GIL, + // because I construct python objects right away + + // Initialise cursor to an empty dict + if (!array) { + root = cursor = PyDict_New(); + } else { + root = cursor = PyList_New(0); + } + } + + ~PyFormatter() override + { + cursor = NULL; + Py_DECREF(root); + root = NULL; + } + + // Obscure, don't care. + void open_array_section_in_ns(std::string_view name, const char *ns) override + {ceph_abort();} + void open_object_section_in_ns(std::string_view name, const char *ns) override + {ceph_abort();} + + void reset() override + { + const bool array = PyList_Check(root); + Py_DECREF(root); + if (array) { + root = cursor = PyList_New(0); + } else { + root = cursor = PyDict_New(); + } + } + + void set_status(int status, const char* status_name) override {} + void output_header() override {}; + void output_footer() override {}; + void enable_line_break() override {}; + + void open_array_section(std::string_view name) override; + void open_object_section(std::string_view name) override; + void close_section() override + { + ceph_assert(cursor != root); + ceph_assert(!stack.empty()); + cursor = stack.top(); + stack.pop(); + } + void dump_bool(std::string_view name, bool b) override; + void dump_unsigned(std::string_view name, uint64_t u) override; + void dump_int(std::string_view name, int64_t u) override; + void dump_float(std::string_view name, double d) override; + void dump_string(std::string_view name, std::string_view s) override; + std::ostream& dump_stream(std::string_view name) override; + void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override; + + void flush(std::ostream& os) override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + } + + int get_len() const override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + return 0; + } + + void write_raw_data(const char *data) override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + } + + PyObject *get() + { + finish_pending_streams(); + + Py_INCREF(root); + return root; + } + + void finish_pending_streams(); + +private: + PyObject *root; + PyObject *cursor; + std::stack<PyObject *> stack; + + void dump_pyobject(std::string_view name, PyObject *p); + + class PendingStream { + public: + PyObject *cursor; + std::string name; + std::stringstream stream; + }; + + std::list<std::shared_ptr<PendingStream> > pending_streams; + +}; + +class PyJSONFormatter : public JSONFormatter { +public: + PyObject *get(); + PyJSONFormatter (const PyJSONFormatter&) = default; + PyJSONFormatter(bool pretty=false, bool is_array=false) : JSONFormatter(pretty) { + if(is_array) { + open_array_section(""); + } else { + open_object_section(""); + } +} + +private: + using json_formatter = JSONFormatter; + 
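PyFormatter implements the common Formatter interface but materialises Python objects directly, which is how the mgr hands structured data (for example the LogEntry dumped in notify_clog) across to Python without an intermediate JSON string. A minimal usage sketch; it assumes the caller already holds the GIL, and the field names are invented:

#include "mgr/PyFormatter.h"

static PyObject* make_status_dict()
{
  PyFormatter f;                      // root is a dict (pass array=true for a list root)
  f.dump_string("state", "active");
  f.open_array_section("modules");
  f.dump_string("", "balancer");      // names are ignored while the cursor is a list
  f.dump_string("", "devicehealth");
  f.close_section();
  f.dump_unsigned("epoch", 42);
  return f.get();                     // returns a new reference; caller owns it
}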
template <class T> void add_value(std::string_view name, T val); + void add_value(std::string_view name, std::string_view val, bool quoted); +}; + +#endif + diff --git a/src/mgr/PyModule.cc b/src/mgr/PyModule.cc new file mode 100644 index 000000000..19d02332d --- /dev/null +++ b/src/mgr/PyModule.cc @@ -0,0 +1,729 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "BaseMgrModule.h" +#include "BaseMgrStandbyModule.h" +#include "PyOSDMap.h" +#include "MgrContext.h" +#include "PyUtil.h" + +#include "PyModule.h" + +#include "common/debug.h" +#include "common/errno.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#undef dout_prefix +#define dout_prefix *_dout << "mgr[py] " + +// definition for non-const static member +std::string PyModule::mgr_store_prefix = "mgr/"; + +// Courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text +#define BOOST_BIND_GLOBAL_PLACEHOLDERS +// Boost apparently can't be bothered to fix its own usage of its own +// deprecated features. +#include <boost/python/extract.hpp> +#include <boost/python/import.hpp> +#include <boost/python/object.hpp> +#undef BOOST_BIND_GLOBAL_PLACEHOLDERS +#include <boost/algorithm/string/predicate.hpp> +#include "include/ceph_assert.h" // boost clobbers this +// decode a Python exception into a string +std::string handle_pyerror() +{ + using namespace boost::python; + using namespace boost; + + PyObject *exc, *val, *tb; + object formatted_list, formatted; + PyErr_Fetch(&exc, &val, &tb); + PyErr_NormalizeException(&exc, &val, &tb); + handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb)); + object traceback(import("traceback")); + if (!tb) { + object format_exception_only(traceback.attr("format_exception_only")); + try { + formatted_list = format_exception_only(hexc, hval); + } catch (error_already_set const &) { + // error while processing exception object + // returning only the exception string value + PyObject *name_attr = PyObject_GetAttrString(exc, "__name__"); + std::stringstream ss; + ss << PyUnicode_AsUTF8(name_attr) << ": " << PyUnicode_AsUTF8(val); + Py_XDECREF(name_attr); + ss << "\nError processing exception object: " << peek_pyerror(); + return ss.str(); + } + } else { + object format_exception(traceback.attr("format_exception")); + try { + formatted_list = format_exception(hexc, hval, htb); + } catch (error_already_set const &) { + // error while processing exception object + // returning only the exception string value + PyObject *name_attr = PyObject_GetAttrString(exc, "__name__"); + std::stringstream ss; + ss << PyUnicode_AsUTF8(name_attr) << ": " << PyUnicode_AsUTF8(val); + Py_XDECREF(name_attr); + ss << "\nError processing exception object: " << peek_pyerror(); + return ss.str(); + } + } + formatted = str("").join(formatted_list); + return extract<std::string>(formatted); +} + +/** + * Get the single-line exception message, without clearing any + * exception state. 
+ */ +std::string peek_pyerror() +{ + PyObject *ptype, *pvalue, *ptraceback; + PyErr_Fetch(&ptype, &pvalue, &ptraceback); + ceph_assert(ptype); + ceph_assert(pvalue); + PyObject *pvalue_str = PyObject_Str(pvalue); + std::string exc_msg = PyUnicode_AsUTF8(pvalue_str); + Py_DECREF(pvalue_str); + PyErr_Restore(ptype, pvalue, ptraceback); + + return exc_msg; +} + + +namespace { + PyObject* log_write(PyObject*, PyObject* args) { + char* m = nullptr; + if (PyArg_ParseTuple(args, "s", &m)) { + auto len = strlen(m); + if (len && m[len-1] == '\n') { + m[len-1] = '\0'; + } + dout(4) << m << dendl; + } + Py_RETURN_NONE; + } + + PyObject* log_flush(PyObject*, PyObject*){ + Py_RETURN_NONE; + } + + static PyMethodDef log_methods[] = { + {"write", log_write, METH_VARARGS, "write stdout and stderr"}, + {"flush", log_flush, METH_VARARGS, "flush"}, + {nullptr, nullptr, 0, nullptr} + }; + + static PyModuleDef ceph_logger_module = { + PyModuleDef_HEAD_INIT, + "ceph_logger", + nullptr, + -1, + log_methods, + }; +} + +PyModuleConfig::PyModuleConfig() = default; + +PyModuleConfig::PyModuleConfig(PyModuleConfig &mconfig) + : config(mconfig.config) +{} + +PyModuleConfig::~PyModuleConfig() = default; + + +void PyModuleConfig::set_config( + MonClient *monc, + const std::string &module_name, + const std::string &key, const boost::optional<std::string>& val) +{ + const std::string global_key = "mgr/" + module_name + "/" + key; + Command set_cmd; + { + std::ostringstream cmd_json; + JSONFormatter jf; + jf.open_object_section("cmd"); + if (val) { + jf.dump_string("prefix", "config set"); + jf.dump_string("value", *val); + } else { + jf.dump_string("prefix", "config rm"); + } + jf.dump_string("who", "mgr"); + jf.dump_string("name", global_key); + jf.close_section(); + jf.flush(cmd_json); + set_cmd.run(monc, cmd_json.str()); + } + set_cmd.wait(); + + if (set_cmd.r == 0) { + std::lock_guard l(lock); + if (val) { + config[global_key] = *val; + } else { + config.erase(global_key); + } + } else { + if (val) { + dout(0) << "`config set mgr " << global_key << " " << val << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + } else { + dout(0) << "`config rm mgr " << global_key << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + } + dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl; + } +} + +std::string PyModule::get_site_packages() +{ + std::stringstream site_packages; + + // CPython doesn't auto-add site-packages dirs to sys.path for us, + // but it does provide a module that we can ask for them. + auto site_module = PyImport_ImportModule("site"); + ceph_assert(site_module); + + auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages"); + if (site_packages_fn != nullptr) { + auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr); + ceph_assert(site_packages_list); + + auto n = PyList_Size(site_packages_list); + for (Py_ssize_t i = 0; i < n; ++i) { + if (i != 0) { + site_packages << ":"; + } + site_packages << PyUnicode_AsUTF8(PyList_GetItem(site_packages_list, i)); + } + + Py_DECREF(site_packages_list); + Py_DECREF(site_packages_fn); + } else { + // Fall back to generating our own site-packages paths by imitating + // what the standard site.py does. 
This is annoying but it lets us + // run inside virtualenvs :-/ + + auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages"); + ceph_assert(site_packages_fn); + + auto known_paths = PySet_New(nullptr); + auto pArgs = PyTuple_Pack(1, known_paths); + PyObject_CallObject(site_packages_fn, pArgs); + Py_DECREF(pArgs); + Py_DECREF(known_paths); + Py_DECREF(site_packages_fn); + + auto sys_module = PyImport_ImportModule("sys"); + ceph_assert(sys_module); + auto sys_path = PyObject_GetAttrString(sys_module, "path"); + ceph_assert(sys_path); + + dout(1) << "sys.path:" << dendl; + auto n = PyList_Size(sys_path); + bool first = true; + for (Py_ssize_t i = 0; i < n; ++i) { + dout(1) << " " << PyUnicode_AsUTF8(PyList_GetItem(sys_path, i)) << dendl; + if (first) { + first = false; + } else { + site_packages << ":"; + } + site_packages << PyUnicode_AsUTF8(PyList_GetItem(sys_path, i)); + } + + Py_DECREF(sys_path); + Py_DECREF(sys_module); + } + + Py_DECREF(site_module); + + return site_packages.str(); +} + +PyObject* PyModule::init_ceph_logger() +{ + auto py_logger = PyModule_Create(&ceph_logger_module); + PySys_SetObject("stderr", py_logger); + PySys_SetObject("stdout", py_logger); + return py_logger; +} + +PyObject* PyModule::init_ceph_module() +{ + static PyMethodDef module_methods[] = { + {nullptr, nullptr, 0, nullptr} + }; + static PyModuleDef ceph_module_def = { + PyModuleDef_HEAD_INIT, + "ceph_module", + nullptr, + -1, + module_methods, + nullptr, + nullptr, + nullptr, + nullptr + }; + PyObject *ceph_module = PyModule_Create(&ceph_module_def); + ceph_assert(ceph_module != nullptr); + std::map<const char*, PyTypeObject*> classes{ + {{"BaseMgrModule", &BaseMgrModuleType}, + {"BaseMgrStandbyModule", &BaseMgrStandbyModuleType}, + {"BasePyOSDMap", &BasePyOSDMapType}, + {"BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType}, + {"BasePyCRUSH", &BasePyCRUSHType}} + }; + for (auto [name, type] : classes) { + type->tp_new = PyType_GenericNew; + if (PyType_Ready(type) < 0) { + ceph_abort(); + } + Py_INCREF(type); + + PyModule_AddObject(ceph_module, name, (PyObject *)type); + } + return ceph_module; +} + +int PyModule::load(PyThreadState *pMainThreadState) +{ + ceph_assert(pMainThreadState != nullptr); + + // Configure sub-interpreter + { + SafeThreadState sts(pMainThreadState); + Gil gil(sts); + + auto thread_state = Py_NewInterpreter(); + if (thread_state == nullptr) { + derr << "Failed to create python sub-interpreter for '" << module_name << '"' << dendl; + return -EINVAL; + } else { + pMyThreadState.set(thread_state); + // Some python modules do not cope with an unpopulated argv, so lets + // fake one. This step also picks up site-packages into sys.path. 
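Once load() below has created the per-module sub-interpreter, every call into the module follows the same pattern: take that interpreter's GIL via the RAII Gil helper, invoke a method through the C API, and turn any Python exception into a loggable string with handle_pyerror(). A condensed sketch of that pattern (the "self_test"-style method argument and the error-string out-parameter are illustrative, not taken from this patch):

#include "Gil.h"
#include "PyModule.h"     // PyModuleRef, handle_pyerror()

#include <cerrno>
#include <string>

static int call_module_method(const PyModuleRef& mod, PyObject *instance,
                              const char *method, std::string *err)
{
  Gil gil(mod->pMyThreadState, true);          // new thread state, as the active-module code does
  PyObject *r = PyObject_CallMethod(instance,
      const_cast<char*>(method), nullptr);     // call with no arguments
  if (r == nullptr) {
    *err = handle_pyerror();                   // formats and clears the pending exception
    return -EINVAL;
  }
  Py_DECREF(r);
  return 0;
}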
+ const wchar_t *argv[] = {L"ceph-mgr"}; + PySys_SetArgv(1, (wchar_t**)argv); + // Configure sys.path to include mgr_module_path + string paths = (g_conf().get_val<std::string>("mgr_module_path") + ':' + + get_site_packages() + ':'); + wstring sys_path(wstring(begin(paths), end(paths)) + Py_GetPath()); + PySys_SetPath(const_cast<wchar_t*>(sys_path.c_str())); + dout(10) << "Computed sys.path '" + << string(begin(sys_path), end(sys_path)) << "'" << dendl; + } + } + // Environment is all good, import the external module + { + Gil gil(pMyThreadState); + + int r; + r = load_subclass_of("MgrModule", &pClass); + if (r) { + derr << "Class not found in module '" << module_name << "'" << dendl; + return r; + } + + r = load_commands(); + if (r != 0) { + derr << "Missing or invalid COMMANDS attribute in module '" + << module_name << "'" << dendl; + error_string = "Missing or invalid COMMANDS attribute"; + return r; + } + + register_options(pClass); + r = load_options(); + if (r != 0) { + derr << "Missing or invalid MODULE_OPTIONS attribute in module '" + << module_name << "'" << dendl; + error_string = "Missing or invalid MODULE_OPTIONS attribute"; + return r; + } + + load_notify_types(); + + // We've imported the module and found a MgrModule subclass, at this + // point the module is considered loaded. It might still not be + // runnable though, can_run populated later... + loaded = true; + + r = load_subclass_of("MgrStandbyModule", &pStandbyClass); + if (!r) { + dout(4) << "Standby mode available in module '" << module_name + << "'" << dendl; + register_options(pStandbyClass); + } else { + dout(4) << "Standby mode not provided by module '" << module_name + << "'" << dendl; + } + + // Populate can_run by interrogating the module's callback that + // may check for dependencies etc + PyObject *pCanRunTuple = PyObject_CallMethod(pClass, + const_cast<char*>("can_run"), const_cast<char*>("()")); + if (pCanRunTuple != nullptr) { + if (PyTuple_Check(pCanRunTuple) && PyTuple_Size(pCanRunTuple) == 2) { + PyObject *pCanRun = PyTuple_GetItem(pCanRunTuple, 0); + PyObject *can_run_str = PyTuple_GetItem(pCanRunTuple, 1); + if (!PyBool_Check(pCanRun) || !PyUnicode_Check(can_run_str)) { + derr << "Module " << get_name() + << " returned wrong type in can_run" << dendl; + error_string = "wrong type returned from can_run"; + can_run = false; + } else { + can_run = (pCanRun == Py_True); + if (!can_run) { + error_string = PyUnicode_AsUTF8(can_run_str); + dout(4) << "Module " << get_name() + << " reported that it cannot run: " + << error_string << dendl; + } + } + } else { + derr << "Module " << get_name() + << " returned wrong type in can_run" << dendl; + error_string = "wrong type returned from can_run"; + can_run = false; + } + + Py_DECREF(pCanRunTuple); + } else { + derr << "Exception calling can_run on " << get_name() << dendl; + derr << handle_pyerror() << dendl; + can_run = false; + } + } + return 0; +} + +int PyModule::walk_dict_list( + const std::string &attr_name, + std::function<int(PyObject*)> fn) +{ + PyObject *command_list = PyObject_GetAttrString(pClass, attr_name.c_str()); + if (command_list == nullptr) { + derr << "Module " << get_name() << " has missing " << attr_name + << " member" << dendl; + return -EINVAL; + } + if (!PyObject_TypeCheck(command_list, &PyList_Type)) { + // Relatively easy mistake for human to make, e.g. 
defining COMMANDS + // as a {} instead of a [] + derr << "Module " << get_name() << " has " << attr_name + << " member of wrong type (should be a list)" << dendl; + return -EINVAL; + } + + // Invoke fn on each item in the list + int r = 0; + const size_t list_size = PyList_Size(command_list); + for (size_t i = 0; i < list_size; ++i) { + PyObject *command = PyList_GetItem(command_list, i); + ceph_assert(command != nullptr); + + if (!PyDict_Check(command)) { + derr << "Module " << get_name() << " has non-dict entry " + << "in " << attr_name << " list" << dendl; + return -EINVAL; + } + + r = fn(command); + if (r != 0) { + break; + } + } + Py_DECREF(command_list); + + return r; +} + +int PyModule::register_options(PyObject *cls) +{ + PyObject *pRegCmd = PyObject_CallMethod( + cls, + const_cast<char*>("_register_options"), const_cast<char*>("(s)"), + module_name.c_str()); + if (pRegCmd != nullptr) { + Py_DECREF(pRegCmd); + } else { + derr << "Exception calling _register_options on " << get_name() + << dendl; + derr << handle_pyerror() << dendl; + } + return 0; +} + +int PyModule::load_notify_types() +{ + PyObject *ls = PyObject_GetAttrString(pClass, "NOTIFY_TYPES"); + if (ls == nullptr) { + derr << "Module " << get_name() << " has missing NOTIFY_TYPES member" << dendl; + return -EINVAL; + } + if (!PyObject_TypeCheck(ls, &PyList_Type)) { + // Relatively easy mistake for human to make, e.g. defining COMMANDS + // as a {} instead of a [] + derr << "Module " << get_name() << " has NOTIFY_TYPES that is not a list" << dendl; + return -EINVAL; + } + + const size_t list_size = PyList_Size(ls); + for (size_t i = 0; i < list_size; ++i) { + PyObject *notify_type = PyList_GetItem(ls, i); + ceph_assert(notify_type != nullptr); + + if (!PyObject_TypeCheck(notify_type, &PyUnicode_Type)) { + derr << "Module " << get_name() << " has non-string entry in NOTIFY_TYPES list" + << dendl; + return -EINVAL; + } + + notify_types.insert(PyUnicode_AsUTF8(notify_type)); + } + Py_DECREF(ls); + dout(10) << "Module " << get_name() << " notify_types " << notify_types << dendl; + + return 0; +} + +int PyModule::load_commands() +{ + PyObject *pRegCmd = PyObject_CallMethod(pClass, + const_cast<char*>("_register_commands"), const_cast<char*>("(s)"), + module_name.c_str()); + if (pRegCmd != nullptr) { + Py_DECREF(pRegCmd); + } else { + derr << "Exception calling _register_commands on " << get_name() + << dendl; + derr << handle_pyerror() << dendl; + } + + int r = walk_dict_list("COMMANDS", [this](PyObject *pCommand) -> int { + ModuleCommand command; + + PyObject *pCmd = PyDict_GetItemString(pCommand, "cmd"); + ceph_assert(pCmd != nullptr); + command.cmdstring = PyUnicode_AsUTF8(pCmd); + + dout(20) << "loaded command " << command.cmdstring << dendl; + + PyObject *pDesc = PyDict_GetItemString(pCommand, "desc"); + ceph_assert(pDesc != nullptr); + command.helpstring = PyUnicode_AsUTF8(pDesc); + + PyObject *pPerm = PyDict_GetItemString(pCommand, "perm"); + ceph_assert(pPerm != nullptr); + command.perm = PyUnicode_AsUTF8(pPerm); + + command.polling = false; + if (PyObject *pPoll = PyDict_GetItemString(pCommand, "poll"); + pPoll && PyObject_IsTrue(pPoll)) { + command.polling = true; + } + + command.module_name = module_name; + + commands.push_back(std::move(command)); + + return 0; + }); + + dout(10) << "loaded " << commands.size() << " commands" << dendl; + + return r; +} + +int PyModule::load_options() +{ + int r = walk_dict_list("MODULE_OPTIONS", [this](PyObject *pOption) -> int { + MgrMap::ModuleOption option; + PyObject *p; + p = 
PyDict_GetItemString(pOption, "name"); + ceph_assert(p != nullptr); + option.name = PyUnicode_AsUTF8(p); + option.type = Option::TYPE_STR; + p = PyDict_GetItemString(pOption, "type"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + std::string s = PyUnicode_AsUTF8(p); + int t = Option::str_to_type(s); + if (t >= 0) { + option.type = t; + } + } + p = PyDict_GetItemString(pOption, "desc"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + option.desc = PyUnicode_AsUTF8(p); + } + p = PyDict_GetItemString(pOption, "long_desc"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + option.long_desc = PyUnicode_AsUTF8(p); + } + p = PyDict_GetItemString(pOption, "default"); + if (p) { + auto q = PyObject_Str(p); + option.default_value = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "min"); + if (p) { + auto q = PyObject_Str(p); + option.min = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "max"); + if (p) { + auto q = PyObject_Str(p); + option.max = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "enum_allowed"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (unsigned i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q) { + auto r = PyObject_Str(q); + option.enum_allowed.insert(PyUnicode_AsUTF8(r)); + Py_DECREF(r); + } + } + } + p = PyDict_GetItemString(pOption, "see_also"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (unsigned i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q && PyObject_TypeCheck(q, &PyUnicode_Type)) { + option.see_also.insert(PyUnicode_AsUTF8(q)); + } + } + } + p = PyDict_GetItemString(pOption, "tags"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (unsigned i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q && PyObject_TypeCheck(q, &PyUnicode_Type)) { + option.tags.insert(PyUnicode_AsUTF8(q)); + } + } + } + p = PyDict_GetItemString(pOption, "runtime"); + if (p && PyObject_TypeCheck(p, &PyBool_Type)) { + if (p == Py_True) { + option.flags |= Option::FLAG_RUNTIME; + } + if (p == Py_False) { + option.flags &= ~Option::FLAG_RUNTIME; + } + } + dout(20) << "loaded module option " << option.name << dendl; + options[option.name] = std::move(option); + return 0; + }); + + dout(10) << "loaded " << options.size() << " options" << dendl; + + return r; +} + +bool PyModule::is_option(const std::string &option_name) +{ + std::lock_guard l(lock); + return options.count(option_name) > 0; +} + +PyObject *PyModule::get_typed_option_value(const std::string& name, + const std::string& value) +{ + // we don't need to hold a lock here because these MODULE_OPTIONS + // are set up exactly once during startup. 
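load_options() above walks MODULE_OPTIONS and reads a fixed set of keys from each dict ("name" is required, the rest optional). For illustration, the equivalent dict built with the raw C API; real modules declare this in Python, and the option name and values here are invented:

#include <Python.h>

static PyObject* example_module_option()
{
  // Py_BuildValue keeps the refcounting tidy; "O" adds a reference to Py_True.
  return Py_BuildValue("{s:s,s:s,s:i,s:s,s:O}",
                       "name", "scrape_interval",
                       "type", "secs",
                       "default", 15,           // read back via PyObject_Str(), so any printable value works
                       "desc", "how often to poll",
                       "runtime", Py_True);
}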
+ auto p = options.find(name); + if (p != options.end()) { + return get_python_typed_option_value((Option::type_t)p->second.type, value); + } + return PyUnicode_FromString(value.c_str()); +} + +int PyModule::load_subclass_of(const char* base_class, PyObject** py_class) +{ + // load the base class + PyObject *mgr_module = PyImport_ImportModule("mgr_module"); + if (!mgr_module) { + error_string = peek_pyerror(); + derr << "Module not found: 'mgr_module'" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } + auto mgr_module_type = PyObject_GetAttrString(mgr_module, base_class); + Py_DECREF(mgr_module); + if (!mgr_module_type) { + error_string = peek_pyerror(); + derr << "Unable to import MgrModule from mgr_module" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } + + // find the sub class + PyObject *plugin_module = PyImport_ImportModule(module_name.c_str()); + if (!plugin_module) { + error_string = peek_pyerror(); + derr << "Module not found: '" << module_name << "'" << dendl; + derr << handle_pyerror() << dendl; + return -ENOENT; + } + auto locals = PyModule_GetDict(plugin_module); + Py_DECREF(plugin_module); + PyObject *key, *value; + Py_ssize_t pos = 0; + *py_class = nullptr; + while (PyDict_Next(locals, &pos, &key, &value)) { + if (!PyType_Check(value)) { + continue; + } + if (!PyObject_IsSubclass(value, mgr_module_type)) { + continue; + } + if (PyObject_RichCompareBool(value, mgr_module_type, Py_EQ)) { + continue; + } + auto class_name = PyUnicode_AsUTF8(key); + if (*py_class) { + derr << __func__ << ": ignoring '" + << module_name << "." << class_name << "'" + << ": only one '" << base_class + << "' class is loaded from each plugin" << dendl; + continue; + } + *py_class = value; + dout(4) << __func__ << ": found class: '" + << module_name << "." << class_name << "'" << dendl; + } + Py_DECREF(mgr_module_type); + + return *py_class ? 0 : -EINVAL; +} + +PyModule::~PyModule() +{ + if (pMyThreadState.ts != nullptr) { + Gil gil(pMyThreadState, true); + Py_XDECREF(pClass); + Py_XDECREF(pStandbyClass); + } +} + diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h new file mode 100644 index 000000000..fe2e16238 --- /dev/null +++ b/src/mgr/PyModule.h @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <vector> +#include <boost/optional.hpp> +#include "common/ceph_mutex.h" +#include "Python.h" +#include "Gil.h" +#include "mon/MgrMap.h" + + +class MonClient; + +std::string handle_pyerror(); + +std::string peek_pyerror(); + +/** + * A Ceph CLI command description provided from a Python module + */ +class ModuleCommand { +public: + std::string cmdstring; + std::string helpstring; + std::string perm; + bool polling; + + // Call the ActivePyModule of this name to handle the command + std::string module_name; +}; + +class PyModule +{ + mutable ceph::mutex lock = ceph::make_mutex("PyModule::lock"); +private: + const std::string module_name; + std::string get_site_packages(); + int load_subclass_of(const char* class_name, PyObject** py_class); + + // Did the MgrMap identify this module as one that should run? + bool enabled = false; + + // Did the MgrMap flag this module as always on? + bool always_on = false; + + // Did we successfully import this python module and look up symbols? + // (i.e. is it possible to instantiate a MgrModule subclass instance?) + bool loaded = false; + + // Did the module identify itself as being able to run? + // (i.e. should we expect instantiating and calling serve() to work?) + bool can_run = false; + + // Did the module encounter an unexpected error while running? + // (e.g. throwing an exception from serve()) + bool failed = false; + + // Populated if loaded, can_run or failed indicates a problem + std::string error_string; + + // Helper for loading MODULE_OPTIONS and COMMANDS members + int walk_dict_list( + const std::string &attr_name, + std::function<int(PyObject*)> fn); + + int load_commands(); + std::vector<ModuleCommand> commands; + + int register_options(PyObject *cls); + int load_options(); + std::map<std::string, MgrMap::ModuleOption> options; + + int load_notify_types(); + std::set<std::string> notify_types; + +public: + static std::string mgr_store_prefix; + + SafeThreadState pMyThreadState; + PyObject *pClass = nullptr; + PyObject *pStandbyClass = nullptr; + + explicit PyModule(const std::string &module_name_) + : module_name(module_name_) + { + } + + ~PyModule(); + + bool is_option(const std::string &option_name); + const std::map<std::string,MgrMap::ModuleOption>& get_options() const { + return options; + } + + PyObject *get_typed_option_value( + const std::string& option, + const std::string& value); + + int load(PyThreadState *pMainThreadState); + static PyObject* init_ceph_logger(); + static PyObject* init_ceph_module(); + + void set_enabled(const bool enabled_) + { + enabled = enabled_; + } + + void set_always_on(const bool always_on_) { + always_on = always_on_; + } + + /** + * Extend `out` with the contents of `this->commands` + */ + void get_commands(std::vector<ModuleCommand> *out) const + { + std::lock_guard l(lock); + ceph_assert(out != nullptr); + out->insert(out->end(), commands.begin(), commands.end()); + } + + + /** + * Mark the module as failed, recording the reason in the error + * string. 
+ */ + void fail(const std::string &reason) + { + std::lock_guard l(lock); + failed = true; + error_string = reason; + } + + bool is_enabled() const { + std::lock_guard l(lock); + return enabled || always_on; + } + + bool is_failed() const { std::lock_guard l(lock) ; return failed; } + bool is_loaded() const { std::lock_guard l(lock) ; return loaded; } + bool is_always_on() const { std::lock_guard l(lock) ; return always_on; } + + bool should_notify(const std::string& notify_type) const { + return notify_types.count(notify_type); + } + + const std::string &get_name() const { + std::lock_guard l(lock) ; return module_name; + } + const std::string &get_error_string() const { + std::lock_guard l(lock) ; return error_string; + } + bool get_can_run() const { + std::lock_guard l(lock) ; return can_run; + } +}; + +typedef std::shared_ptr<PyModule> PyModuleRef; + +class PyModuleConfig { +public: + mutable ceph::mutex lock = ceph::make_mutex("PyModuleConfig::lock"); + std::map<std::string, std::string> config; + + PyModuleConfig(); + + PyModuleConfig(PyModuleConfig &mconfig); + + ~PyModuleConfig(); + + void set_config( + MonClient *monc, + const std::string &module_name, + const std::string &key, const boost::optional<std::string>& val); + +}; diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc new file mode 100644 index 000000000..1ae44143c --- /dev/null +++ b/src/mgr/PyModuleRegistry.cc @@ -0,0 +1,454 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "PyModuleRegistry.h" + +#if __has_include(<filesystem>) +#include <filesystem> +namespace fs = std::filesystem; +#elif __has_include(<experimental/filesystem>) +#include <experimental/filesystem> +namespace fs = std::experimental::filesystem; +#else +#error std::filesystem not available! +#endif + +#include "include/stringify.h" +#include "common/errno.h" +#include "common/split.h" + +#include "BaseMgrModule.h" +#include "PyOSDMap.h" +#include "BaseMgrStandbyModule.h" +#include "Gil.h" +#include "MgrContext.h" +#include "mgr/mgr_commands.h" + +#include "ActivePyModules.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#undef dout_prefix +#define dout_prefix *_dout << "mgr[py] " + +std::set<std::string> obsolete_modules = { + "orchestrator_cli", +}; + +void PyModuleRegistry::init() +{ + std::lock_guard locker(lock); + + // Set up global python interpreter +#define WCHAR(s) L ## #s + Py_SetProgramName(const_cast<wchar_t*>(WCHAR(MGR_PYTHON_EXECUTABLE))); +#undef WCHAR + // Add more modules + if (g_conf().get_val<bool>("daemonize")) { + PyImport_AppendInittab("ceph_logger", PyModule::init_ceph_logger); + } + PyImport_AppendInittab("ceph_module", PyModule::init_ceph_module); + Py_InitializeEx(0); +#if PY_VERSION_HEX < 0x03090000 + // Let CPython know that we will be calling it back from other + // threads in future. + if (! 
PyEval_ThreadsInitialized()) { + PyEval_InitThreads(); + } +#endif + // Drop the GIL and remember the main thread state (current + // thread state becomes NULL) + pMainThreadState = PyEval_SaveThread(); + ceph_assert(pMainThreadState != nullptr); + + std::list<std::string> failed_modules; + + const std::string module_path = g_conf().get_val<std::string>("mgr_module_path"); + std::set<std::string> module_names = probe_modules(module_path); + // Load python code + for (const auto& module_name : module_names) { + dout(1) << "Loading python module '" << module_name << "'" << dendl; + + // Everything starts disabled, set enabled flag on module + // when we see first MgrMap + auto mod = std::make_shared<PyModule>(module_name); + int r = mod->load(pMainThreadState); + if (r != 0) { + // Don't use handle_pyerror() here; we don't have the GIL + // or the right thread state (this is deliberate). + derr << "Error loading module '" << module_name << "': " + << cpp_strerror(r) << dendl; + failed_modules.push_back(module_name); + // Don't drop out here, load the other modules + } + + // Record the module even if the load failed, so that we can + // report its loading error + modules[module_name] = std::move(mod); + } + if (module_names.empty()) { + clog->error() << "No ceph-mgr modules found in " << module_path; + } + if (!failed_modules.empty()) { + clog->error() << "Failed to load ceph-mgr modules: " << joinify( + failed_modules.begin(), failed_modules.end(), std::string(", ")); + } +} + +bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_) +{ + std::lock_guard l(lock); + + if (mgr_map.epoch == 0) { + mgr_map = mgr_map_; + + // First time we see MgrMap, set the enabled flags on modules + // This should always happen before someone calls standby_start + // or active_start + for (const auto &[module_name, module] : modules) { + const bool enabled = (mgr_map.modules.count(module_name) > 0); + module->set_enabled(enabled); + const bool always_on = (mgr_map.get_always_on_modules().count(module_name) > 0); + module->set_always_on(always_on); + } + + return false; + } else { + bool modules_changed = mgr_map_.modules != mgr_map.modules || + mgr_map_.always_on_modules != mgr_map.always_on_modules; + mgr_map = mgr_map_; + + if (standby_modules != nullptr) { + standby_modules->handle_mgr_map(mgr_map_); + } + + return modules_changed; + } +} + + + +void PyModuleRegistry::standby_start(MonClient &mc, Finisher &f) +{ + std::lock_guard l(lock); + ceph_assert(active_modules == nullptr); + ceph_assert(standby_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + dout(4) << "Starting modules in standby mode" << dendl; + + standby_modules.reset(new StandbyPyModules( + mgr_map, module_config, clog, mc, f)); + + std::set<std::string> failed_modules; + for (const auto &i : modules) { + if (!(i.second->is_enabled() && i.second->get_can_run())) { + // report always_on modules with a standby mode that won't run + if (i.second->is_always_on() && i.second->pStandbyClass) { + failed_modules.insert(i.second->get_name()); + } + continue; + } + + if (i.second->pStandbyClass) { + dout(4) << "starting module " << i.second->get_name() << dendl; + standby_modules->start_one(i.second); + } else { + dout(4) << "skipping module '" << i.second->get_name() << "' because " + "it does not implement a standby mode" << dendl; + } + } + + if (!failed_modules.empty()) { + clog->error() << "Failed to execute ceph-mgr module(s) in 
standby mode: " + << joinify(failed_modules.begin(), failed_modules.end(), + std::string(", ")); + } +} + +void PyModuleRegistry::active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map<std::string, std::string> &kv_store, + bool mon_provides_kv_sub, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server) +{ + std::lock_guard locker(lock); + + dout(4) << "Starting modules in active mode" << dendl; + + ceph_assert(active_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + active_modules.reset( + new ActivePyModules( + module_config, + kv_store, mon_provides_kv_sub, + ds, cs, mc, + clog_, audit_clog_, objecter_, client_, f, server, + *this)); + + for (const auto &i : modules) { + // Anything we're skipping because of !can_run will be flagged + // to the user separately via get_health_checks + if (!(i.second->is_enabled() && i.second->is_loaded())) { + continue; + } + + dout(4) << "Starting " << i.first << dendl; + active_modules->start_one(i.second); + } +} + +void PyModuleRegistry::active_shutdown() +{ + std::lock_guard locker(lock); + + if (active_modules != nullptr) { + active_modules->shutdown(); + active_modules.reset(); + } +} + +void PyModuleRegistry::shutdown() +{ + std::lock_guard locker(lock); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + // Ideally, now, we'd be able to do this for all modules: + // + // Py_EndInterpreter(pMyThreadState); + // PyThreadState_Swap(pMainThreadState); + // + // Unfortunately, if the module has any other *python* threads active + // at this point, Py_EndInterpreter() will abort with: + // + // Fatal Python error: Py_EndInterpreter: not the last thread + // + // This can happen when using CherryPy in a module, becuase CherryPy + // runs an extra thread as a timeout monitor, which spends most of its + // life inside a time.sleep(60). Unless you are very, very lucky with + // the timing calling this destructor, that thread will still be stuck + // in a sleep, and Py_EndInterpreter() will abort. + // + // This could of course also happen with a poorly written module which + // made no attempt to clean up any additional threads it created. + // + // The safest thing to do is just not call Py_EndInterpreter(), and + // let Py_Finalize() kill everything after all modules are shut down. 
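For reference, the embedding lifecycle the registry drives can be condensed to the sequence below: one global interpreter set up in init(), the GIL parked with PyEval_SaveThread(), one sub-interpreter per module, and a single Py_Finalize() at shutdown instead of per-module Py_EndInterpreter() calls, for exactly the reason in the comment above. A stand-alone, hedged sketch (not Ceph code), relying on Py_Finalize() reaping sub-interpreters that were never explicitly ended:

#include <Python.h>

int main()
{
  Py_InitializeEx(0);                            // no signal handlers, as in init()
  PyThreadState *main_ts = PyEval_SaveThread();  // drop the GIL; other threads may now take it

  PyEval_RestoreThread(main_ts);                 // each call site re-takes the GIL...
  PyThreadState *sub = Py_NewInterpreter();      // ...and gets its own sub-interpreter
  PyRun_SimpleString("x = 1 + 1");
  // Deliberately no Py_EndInterpreter(sub): a lingering Python thread would
  // make it abort, which is the trade-off described above.
  PyThreadState_Swap(main_ts);
  main_ts = PyEval_SaveThread();
  (void)sub;

  PyEval_RestoreThread(main_ts);
  Py_Finalize();                                 // tears down the sub-interpreter as well
  return 0;
}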
+ + modules.clear(); + + PyEval_RestoreThread(pMainThreadState); + Py_Finalize(); +} + +std::set<std::string> PyModuleRegistry::probe_modules(const std::string &path) const +{ + const auto opt = g_conf().get_val<std::string>("mgr_disabled_modules"); + const auto disabled_modules = ceph::split(opt); + + std::set<std::string> modules; + for (const auto& entry: fs::directory_iterator(path)) { + if (!fs::is_directory(entry)) { + continue; + } + const std::string name = entry.path().filename(); + if (std::count(disabled_modules.begin(), disabled_modules.end(), name)) { + dout(10) << "ignoring disabled module " << name << dendl; + continue; + } + auto module_path = entry.path() / "module.py"; + if (fs::exists(module_path)) { + modules.emplace(name); + } + } + return modules; +} + +int PyModuleRegistry::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + if (active_modules) { + return active_modules->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); + } else { + // We do not expect to be called before active modules is up, but + // it's straightfoward to handle this case so let's do it. + return -EAGAIN; + } +} + +std::vector<ModuleCommand> PyModuleRegistry::get_py_commands() const +{ + std::lock_guard l(lock); + + std::vector<ModuleCommand> result; + for (const auto& i : modules) { + i.second->get_commands(&result); + } + + return result; +} + +std::vector<MonCommand> PyModuleRegistry::get_commands() const +{ + std::vector<ModuleCommand> commands = get_py_commands(); + std::vector<MonCommand> result; + for (auto &pyc: commands) { + uint64_t flags = MonCommand::FLAG_MGR; + if (pyc.polling) { + flags |= MonCommand::FLAG_POLL; + } + result.push_back({pyc.cmdstring, pyc.helpstring, "mgr", + pyc.perm, flags}); + } + return result; +} + +void PyModuleRegistry::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + + // Only the active mgr reports module issues + if (active_modules) { + active_modules->get_health_checks(checks); + + std::map<std::string, std::string> dependency_modules; + std::map<std::string, std::string> failed_modules; + + /* + * Break up broken modules into two categories: + * - can_run=false: the module is working fine but explicitly + * telling you that a dependency is missing. Advise the user to + * read the message from the module and install what's missing. + * - failed=true or loaded=false: something unexpected is broken, + * either at runtime (from serve()) or at load time. This indicates + * a bug and the user should be guided to inspect the mgr log + * to investigate and gather evidence. + */ + + for (const auto &i : modules) { + auto module = i.second; + if (module->is_enabled() && !module->get_can_run()) { + dependency_modules[module->get_name()] = module->get_error_string(); + } else if ((module->is_enabled() && !module->is_loaded()) + || (module->is_failed() && module->get_can_run())) { + // - Unloadable modules are only reported if they're enabled, + // to avoid spamming users about modules they don't have the + // dependencies installed for because they don't use it. 
+ // - Failed modules are only reported if they passed the can_run + // checks (to avoid outputting two health messages about a + // module that said can_run=false but we tried running it anyway) + failed_modules[module->get_name()] = module->get_error_string(); + } + } + + // report failed always_on modules as health errors + for (const auto& name : mgr_map.get_always_on_modules()) { + if (obsolete_modules.count(name)) { + continue; + } + if (active_modules->is_pending(name)) { + continue; + } + if (!active_modules->module_exists(name)) { + if (failed_modules.find(name) == failed_modules.end() && + dependency_modules.find(name) == dependency_modules.end()) { + failed_modules[name] = "Not found or unloadable"; + } + } + } + + if (!dependency_modules.empty()) { + std::ostringstream ss; + if (dependency_modules.size() == 1) { + auto iter = dependency_modules.begin(); + ss << "Module '" << iter->first << "' has failed dependency: " + << iter->second; + } else if (dependency_modules.size() > 1) { + ss << dependency_modules.size() + << " mgr modules have failed dependencies"; + } + auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str(), + dependency_modules.size()); + for (auto& i : dependency_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed dependency: " << i.second; + d.detail.push_back(ss.str()); + } + } + + if (!failed_modules.empty()) { + std::ostringstream ss; + if (failed_modules.size() == 1) { + auto iter = failed_modules.begin(); + ss << "Module '" << iter->first << "' has failed: " << iter->second; + } else if (failed_modules.size() > 1) { + ss << failed_modules.size() << " mgr modules have failed"; + } + auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str(), + failed_modules.size()); + for (auto& i : failed_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed: " << i.second; + d.detail.push_back(ss.str()); + } + } + } +} + +void PyModuleRegistry::handle_config(const std::string &k, const std::string &v) +{ + std::lock_guard l(module_config.lock); + + if (!v.empty()) { + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "Loaded module_config entry " << k << ":" << v << dendl; + dout(10) << "Loaded module_config entry " << k << ":" << dendl; + module_config.config[k] = v; + } else { + module_config.config.erase(k); + } +} + +void PyModuleRegistry::handle_config_notify() +{ + std::lock_guard l(lock); + if (active_modules) { + active_modules->config_notify(); + } +} diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h new file mode 100644 index 000000000..6c72af893 --- /dev/null +++ b/src/mgr/PyModuleRegistry.h @@ -0,0 +1,231 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#pragma once + +// First because it includes Python.h +#include "PyModule.h" + +#include <string> +#include <map> +#include <set> +#include <memory> + +#include "common/LogClient.h" + +#include "ActivePyModules.h" +#include "StandbyPyModules.h" + +class MgrSession; + +/** + * This class is responsible for setting up the python runtime environment + * and importing the python modules. + * + * It is *not* responsible for constructing instances of their BaseMgrModule + * subclasses: that is the job of ActiveMgrModule, which consumes the class + * references that we load here. + */ +class PyModuleRegistry +{ +private: + mutable ceph::mutex lock = ceph::make_mutex("PyModuleRegistry::lock"); + LogChannelRef clog; + + std::map<std::string, PyModuleRef> modules; + std::multimap<std::string, entity_addrvec_t> clients; + + std::unique_ptr<ActivePyModules> active_modules; + std::unique_ptr<StandbyPyModules> standby_modules; + + PyThreadState *pMainThreadState; + + // We have our own copy of MgrMap, because we are constructed + // before ClusterState exists. + MgrMap mgr_map; + + /** + * Discover python modules from local disk + */ + std::set<std::string> probe_modules(const std::string &path) const; + + PyModuleConfig module_config; + +public: + void handle_config(const std::string &k, const std::string &v); + void handle_config_notify(); + + void update_kv_data( + const std::string prefix, + bool incremental, + const map<std::string, boost::optional<bufferlist>, std::less<>>& data) { + ceph_assert(active_modules); + active_modules->update_kv_data(prefix, incremental, data); + } + + /** + * Get references to all modules (whether they have loaded and/or + * errored) or not. + */ + auto get_modules() const + { + std::vector<PyModuleRef> modules_out; + std::lock_guard l(lock); + for (const auto &i : modules) { + modules_out.push_back(i.second); + } + + return modules_out; + } + + explicit PyModuleRegistry(LogChannelRef clog_) + : clog(clog_) + {} + + /** + * @return true if the mgrmap has changed such that the service needs restart + */ + bool handle_mgr_map(const MgrMap &mgr_map_); + + bool have_standby_modules() const { + return !!standby_modules; + } + + void init(); + + void upgrade_config( + MonClient *monc, + const std::map<std::string, std::string> &old_config); + + void active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map<std::string, std::string> &kv_store, + bool mon_provides_kv_sub, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server); + void standby_start(MonClient &mc, Finisher &f); + + bool is_standby_running() const + { + return standby_modules != nullptr; + } + + void active_shutdown(); + void shutdown(); + + std::vector<MonCommand> get_commands() const; + std::vector<ModuleCommand> get_py_commands() const; + + /** + * Get the specified module. The module does not have to be + * loaded or runnable. + * + * Returns an empty reference if it does not exist. + */ + PyModuleRef get_module(const std::string &module_name) + { + std::lock_guard l(lock); + auto module_iter = modules.find(module_name); + if (module_iter == modules.end()) { + return {}; + } + return module_iter->second; + } + + /** + * Pass through command to the named module for execution. + * + * The command must exist in the COMMANDS reported by the module. If it + * doesn't then this will abort. + * + * If ActivePyModules has not been instantiated yet then this will + * return EAGAIN. 
+ */ + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + /** + * Pass through health checks reported by modules, and report any + * modules that have failed (i.e. unhandled exceptions in serve()) + */ + void get_health_checks(health_check_map_t *checks); + + void get_progress_events(map<std::string,ProgressEvent> *events) { + if (active_modules) { + active_modules->get_progress_events(events); + } + } + + // FIXME: breaking interface so that I don't have to go rewrite all + // the places that call into these (for now) + // >>> + void notify_all(const std::string ¬ify_type, + const std::string ¬ify_id) + { + if (active_modules) { + active_modules->notify_all(notify_type, notify_id); + } + } + + void notify_all(const LogEntry &log_entry) + { + if (active_modules) { + active_modules->notify_all(log_entry); + } + } + + bool should_notify(const std::string& name, + const std::string& notify_type) { + return modules.at(name)->should_notify(notify_type); + } + + std::map<std::string, std::string> get_services() const + { + ceph_assert(active_modules); + return active_modules->get_services(); + } + + void register_client(std::string_view name, entity_addrvec_t addrs) + { + clients.emplace(std::string(name), std::move(addrs)); + } + void unregister_client(std::string_view name, const entity_addrvec_t& addrs) + { + auto itp = clients.equal_range(std::string(name)); + for (auto it = itp.first; it != itp.second; ++it) { + if (it->second == addrs) { + clients.erase(it); + return; + } + } + } + + auto get_clients() const + { + std::scoped_lock l(lock); + std::vector<entity_addrvec_t> v; + for (const auto& p : clients) { + v.push_back(p.second); + } + return v; + } + + // <<< (end of ActivePyModules cheeky call-throughs) +}; diff --git a/src/mgr/PyModuleRunner.cc b/src/mgr/PyModuleRunner.cc new file mode 100644 index 000000000..e27f7f405 --- /dev/null +++ b/src/mgr/PyModuleRunner.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include "PyModule.h" + +#include "common/debug.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + + +PyModuleRunner::~PyModuleRunner() +{ + Gil gil(py_module->pMyThreadState, true); + + if (pClassInstance) { + Py_XDECREF(pClassInstance); + pClassInstance = nullptr; + } +} + +int PyModuleRunner::serve() +{ + ceph_assert(pClassInstance != nullptr); + + // This method is called from a separate OS thread (i.e. a thread not + // created by Python), so tell Gil to wrap this in a new thread state. + Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("serve"), nullptr); + + int r = 0; + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + // This is not a very informative log message because it's an + // unknown/unexpected exception that we can't say much about. 
+ + + // Get short exception message for the cluster log, before + // dumping the full backtrace to the local log. + std::string exc_msg = peek_pyerror(); + + clog->error() << "Unhandled exception from module '" << get_name() + << "' while running on mgr." << g_conf()->name.get_id() + << ": " << exc_msg; + derr << get_name() << ".serve:" << dendl; + derr << handle_pyerror() << dendl; + + py_module->fail(exc_msg); + + return -EINVAL; + } + + return r; +} + +void PyModuleRunner::shutdown() +{ + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("shutdown"), nullptr); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << "Failed to invoke shutdown() on " << get_name() << dendl; + derr << handle_pyerror() << dendl; + } + + dead = true; +} + +void PyModuleRunner::log(const std::string &record) +{ +#undef dout_prefix +#define dout_prefix *_dout + dout(0) << record << dendl; +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " +} + +void* PyModuleRunner::PyModuleRunnerThread::entry() +{ + // No need to acquire the GIL here; the module does it. + dout(4) << "Entering thread for " << mod->get_name() << dendl; + mod->serve(); + return nullptr; +} diff --git a/src/mgr/PyModuleRunner.h b/src/mgr/PyModuleRunner.h new file mode 100644 index 000000000..88d9f755a --- /dev/null +++ b/src/mgr/PyModuleRunner.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#pragma once + +#include "common/Thread.h" +#include "common/LogClient.h" +#include "mgr/Gil.h" + +#include "PyModule.h" + +/** + * Implement the pattern of calling serve() on a module in a thread, + * until shutdown() is called. 
+ */ +class PyModuleRunner +{ +public: + // Info about the module we're going to run + PyModuleRef py_module; + +protected: + // Populated by descendent class + PyObject *pClassInstance = nullptr; + + LogChannelRef clog; + + class PyModuleRunnerThread : public Thread + { + PyModuleRunner *mod; + + public: + explicit PyModuleRunnerThread(PyModuleRunner *mod_) + : mod(mod_) {} + + void *entry() override; + }; + + bool is_dead() const { return dead; } + + std::string thread_name; + +public: + int serve(); + void shutdown(); + void log(const std::string &record); + + const char *get_thread_name() const + { + return thread_name.c_str(); + } + + PyModuleRunner( + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + py_module(py_module_), + clog(clog_), + thread(this) + { + // Shortened name for use as thread name, because thread names + // required to be <16 chars + thread_name = py_module->get_name().substr(0, 15); + + ceph_assert(py_module != nullptr); + } + + ~PyModuleRunner(); + + PyModuleRunnerThread thread; + + std::string const &get_name() const { return py_module->get_name(); } + +private: + bool dead = false; +}; + + diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc new file mode 100644 index 000000000..70813ca52 --- /dev/null +++ b/src/mgr/PyOSDMap.cc @@ -0,0 +1,682 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Mgr.h" + +#include "osd/OSDMap.h" +#include "common/errno.h" +#include "common/version.h" +#include "include/stringify.h" + +#include "PyOSDMap.h" +#include "PyFormatter.h" +#include "Gil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + + +typedef struct { + PyObject_HEAD + OSDMap *osdmap; +} BasePyOSDMap; + +typedef struct { + PyObject_HEAD + OSDMap::Incremental *inc; +} BasePyOSDMapIncremental; + +typedef struct { + PyObject_HEAD + std::shared_ptr<CrushWrapper> crush; +} BasePyCRUSH; + +// ---------- + +static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj) +{ + return PyLong_FromLong(self->osdmap->get_epoch()); +} + +static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj) +{ + return PyLong_FromLong(self->osdmap->get_crush_version()); +} + +static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj) +{ + PyFormatter f; + self->osdmap->dump(&f); + return f.get(); +} + +static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj) +{ + OSDMap::Incremental *inc = new OSDMap::Incremental; + + inc->fsid = self->osdmap->get_fsid(); + inc->epoch = self->osdmap->get_epoch() + 1; + // always include latest crush map here... this is okay since we never + // actually use this map in the real world (and even if we did it would + // be a no-op). + self->osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL); + dout(10) << __func__ << " " << inc << dendl; + + return construct_with_capsule("mgr_module", "OSDMapIncremental", + (void*)(inc)); +} + +static PyObject *osdmap_apply_incremental(BasePyOSDMap *self, + BasePyOSDMapIncremental *incobj) +{ + if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) { + derr << "Wrong type in osdmap_apply_incremental!" 
<< dendl; + return nullptr; + } + + bufferlist bl; + self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + OSDMap *next = new OSDMap; + next->decode(bl); + next->apply_incremental(*(incobj->inc)); + dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc + << " next " << next << dendl; + + return construct_with_capsule("mgr_module", "OSDMap", (void*)next); +} + +static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj) +{ + return construct_with_capsule("mgr_module", "CRUSHMap", + (void*)(&(self->osdmap->crush))); +} + +static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args) +{ + int take; + if (!PyArg_ParseTuple(args, "i:get_pools_by_take", + &take)) { + return nullptr; + } + + PyFormatter f; + f.open_array_section("pools"); + for (auto& p : self->osdmap->get_pools()) { + if (self->osdmap->crush->rule_has_take(p.second.crush_rule, take)) { + f.dump_int("pool", p.first); + } + } + f.close_section(); + return f.get(); +} + +static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args) +{ + PyObject *pool_list; + BasePyOSDMapIncremental *incobj; + int max_deviation = 0; + int max_iterations = 0; + if (!PyArg_ParseTuple(args, "OiiO:calc_pg_upmaps", + &incobj, &max_deviation, + &max_iterations, &pool_list)) { + return nullptr; + } + if (!PyList_CheckExact(pool_list)) { + derr << __func__ << " pool_list not a list" << dendl; + return nullptr; + } + set<int64_t> pools; + for (auto i = 0; i < PyList_Size(pool_list); ++i) { + PyObject *pool_name = PyList_GET_ITEM(pool_list, i); + if (!PyUnicode_Check(pool_name)) { + derr << __func__ << " " << pool_name << " not a string" << dendl; + return nullptr; + } + auto pool_id = self->osdmap->lookup_pg_pool_name( + PyUnicode_AsUTF8(pool_name)); + if (pool_id < 0) { + derr << __func__ << " pool '" << PyUnicode_AsUTF8(pool_name) + << "' does not exist" << dendl; + return nullptr; + } + pools.insert(pool_id); + } + + dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc + << " max_deviation " << max_deviation + << " max_iterations " << max_iterations + << " pools " << pools + << dendl; + PyThreadState *tstate = PyEval_SaveThread(); + int r = self->osdmap->calc_pg_upmaps(g_ceph_context, + max_deviation, + max_iterations, + pools, + incobj->inc); + PyEval_RestoreThread(tstate); + dout(10) << __func__ << " r = " << r << dendl; + return PyLong_FromLong(r); +} + +static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args) +{ + int poolid; + if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up", + &poolid)) { + return nullptr; + } + auto pi = self->osdmap->get_pg_pool(poolid); + if (!pi) + return nullptr; + map<pg_t,vector<int>> pm; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, poolid); + self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr); + } + PyFormatter f; + for (auto p : pm) { + string pg = stringify(p.first); + f.open_array_section(pg.c_str()); + for (auto o : p.second) { + f.dump_int("osd", o); + } + f.close_section(); + } + return f.get(); +} + +static int +BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds) +{ + PyObject *osdmap_capsule = nullptr; + static const char *kwlist[] = {"osdmap_capsule", NULL}; + + if (! 
PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &osdmap_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type)); + + self->osdmap = (OSDMap*)PyCapsule_GetPointer( + osdmap_capsule, nullptr); + ceph_assert(self->osdmap); + + return 0; +} + + +static void +BasePyOSDMap_dealloc(BasePyOSDMap *self) +{ + if (self->osdmap) { + delete self->osdmap; + self->osdmap = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_pg_to_up_acting_osds(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + int ps = 0; + if (!PyArg_ParseTuple(args, "ii:pg_to_up_acting_osds", + &pool_id, &ps)) { + return nullptr; + } + + std::vector<int> up; + int up_primary; + std::vector<int> acting; + int acting_primary; + pg_t pg_id(ps, pool_id); + self->osdmap->pg_to_up_acting_osds(pg_id, + &up, &up_primary, + &acting, &acting_primary); + + // (Ab)use PyFormatter as a convenient way to generate a dict + PyFormatter f; + f.dump_int("up_primary", up_primary); + f.dump_int("acting_primary", acting_primary); + f.open_array_section("up"); + for (const auto &i : up) { + f.dump_int("osd", i); + } + f.close_section(); + f.open_array_section("acting"); + for (const auto &i : acting) { + f.dump_int("osd", i); + } + f.close_section(); + + return f.get(); +} + +static PyObject *osdmap_pool_raw_used_rate(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + if (!PyArg_ParseTuple(args, "i:pool_raw_used_rate", + &pool_id)) { + return nullptr; + } + + if (!self->osdmap->have_pg_pool(pool_id)) { + return nullptr; + } + + float rate = self->osdmap->pool_raw_used_rate(pool_id); + + return PyFloat_FromDouble(rate); +} + + +PyMethodDef BasePyOSDMap_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"}, + {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS, + "Get CRUSH version"}, + {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap::Incremental"}, + {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS, + "Create OSDMap::Incremental"}, + {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O, + "Apply OSDMap::Incremental and return the resulting OSDMap"}, + {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"}, + {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS, + "Get pools that have CRUSH rules that TAKE the given root"}, + {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS, + "Calculate new pg-upmap values"}, + {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS, + "Calculate up set mappings for all PGs in a pool"}, + {"_pg_to_up_acting_osds", (PyCFunction)osdmap_pg_to_up_acting_osds, METH_VARARGS, + "Calculate up+acting OSDs for a PG ID"}, + {"_pool_raw_used_rate", (PyCFunction)osdmap_pool_raw_used_rate, METH_VARARGS, + "Get raw space to logical space ratio"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyOSDMapType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMap", /* tp_name */ + sizeof(BasePyOSDMap), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMap_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 
0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMap", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMap_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMap_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + +// ---------- + + +static int +BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self, + PyObject *args, PyObject *kwds) +{ + PyObject *inc_capsule = nullptr; + static const char *kwlist[] = {"inc_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &inc_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type)); + + self->inc = (OSDMap::Incremental*)PyCapsule_GetPointer( + inc_capsule, nullptr); + ceph_assert(self->inc); + + return 0; +} + +static void +BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self) +{ + if (self->inc) { + delete self->inc; + self->inc = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + return PyLong_FromLong(self->inc->epoch); +} + +static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + PyFormatter f; + self->inc->dump(&f); + return f.get(); +} + +static int get_int_float_map(PyObject *obj, map<int,double> *out) +{ + PyObject *ls = PyDict_Items(obj); + for (int j = 0; j < PyList_Size(ls); ++j) { + PyObject *pair = PyList_GET_ITEM(ls, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << j << " not a tuple" << dendl; + Py_DECREF(ls); + return -1; + } + int k; + double v; + if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) { + derr << __func__ << " item " << j << " not a size 2 tuple" << dendl; + Py_DECREF(ls); + return -1; + } + (*out)[k] = v; + } + + Py_DECREF(ls); + return 0; +} + +static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self, + PyObject *weightobj) +{ + map<int,double> wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + for (auto i : wm) { + self->inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000; + } + Py_RETURN_NONE; +} + +static PyObject *osdmap_inc_set_compat_weight_set_weights( + BasePyOSDMapIncremental *self, PyObject *weightobj) +{ + map<int,double> wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + CrushWrapper crush; + ceph_assert(self->inc->crush.length()); // see new_incremental + auto p = self->inc->crush.cbegin(); + decode(crush, p); + crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1); + for (auto i : wm) { + crush.choose_args_adjust_item_weightf( + g_ceph_context, + crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS), + i.first, + { i.second }, + nullptr); + } + self->inc->crush.clear(); + crush.encode(self->inc->crush, CEPH_FEATURES_ALL); + Py_RETURN_NONE; +} + +PyMethodDef BasePyOSDMapIncremental_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS, + "Get OSDMap::Incremental epoch"}, + {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS, + "Dump OSDMap::Incremental"}, + {"_set_osd_reweights", 
(PyCFunction)osdmap_inc_set_osd_reweights, + METH_O, "Set osd reweight values"}, + {"_set_crush_compat_weight_set_weights", + (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O, + "Set weight values in the pending CRUSH compat weight-set"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyOSDMapIncrementalType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMapIncremental", /* tp_name */ + sizeof(BasePyOSDMapIncremental), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMapIncremental_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMapIncremental_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMapIncremental_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + + +// ---------- + +static int +BasePyCRUSH_init(BasePyCRUSH *self, + PyObject *args, PyObject *kwds) +{ + PyObject *crush_capsule = nullptr; + static const char *kwlist[] = {"crush_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &crush_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type)); + + auto ptr_ref = (std::shared_ptr<CrushWrapper>*)( + PyCapsule_GetPointer(crush_capsule, nullptr)); + + // We passed a pointer to a shared pointer, which is weird, but + // just enough to get it into the constructor: this is a real shared + // pointer construction now, and then we throw away that pointer to + // the shared pointer. 
+ self->crush = *ptr_ref; + ceph_assert(self->crush); + + return 0; +} + +static void +BasePyCRUSH_dealloc(BasePyCRUSH *self) +{ + self->crush.reset(); + Py_TYPE(self)->tp_free(self); +} + +static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj) +{ + PyFormatter f; + self->crush->dump(&f); + return f.get(); +} + +static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyUnicode_FromString(self->crush->get_item_name(item)); +} + +static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyFloat_FromDouble(self->crush->get_item_weightf(item)); +} + +static PyObject *crush_find_roots(BasePyCRUSH *self) +{ + set<int> roots; + self->crush->find_roots(&roots); + PyFormatter f; + f.open_array_section("roots"); + for (auto root : roots) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj) +{ + set<int> takes; + self->crush->find_takes(&takes); + PyFormatter f; + f.open_array_section("takes"); + for (auto root : takes) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args) +{ + int root; + if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map", + &root)) { + return nullptr; + } + map<int,float> wmap; + + if (!self->crush->item_exists(root)) { + return nullptr; + } + + self->crush->get_take_weight_osd_map(root, &wmap); + PyFormatter f; + f.open_object_section("weights"); + for (auto& p : wmap) { + string n = stringify(p.first); // ick + f.dump_float(n.c_str(), p.second); + } + f.close_section(); + return f.get(); +} + +PyMethodDef BasePyCRUSH_methods[] = { + {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"}, + {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS, + "Get item name"}, + {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS, + "Get item weight"}, + {"_find_roots", (PyCFunction)crush_find_roots, METH_NOARGS, + "Find all tree roots"}, + {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS, + "Find distinct TAKE roots"}, + {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map, + METH_VARARGS, "Get OSD weight map for a given TAKE root node"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyCRUSHType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyCRUSH", /* tp_name */ + sizeof(BasePyCRUSH), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyCRUSH_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyCRUSH_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* 
tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyCRUSH_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h new file mode 100644 index 000000000..2cc30dfe2 --- /dev/null +++ b/src/mgr/PyOSDMap.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <Python.h> + +#include <string> + +extern PyTypeObject BasePyOSDMapType; +extern PyTypeObject BasePyOSDMapIncrementalType; +extern PyTypeObject BasePyCRUSHType; + +PyObject *construct_with_capsule( + const std::string &module, + const std::string &clsname, + void *wrapped); + diff --git a/src/mgr/PyUtil.cc b/src/mgr/PyUtil.cc new file mode 100644 index 000000000..a8efc2f28 --- /dev/null +++ b/src/mgr/PyUtil.cc @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <Python.h> + +#include "PyUtil.h" + +PyObject *get_python_typed_option_value( + Option::type_t type, + const std::string& value) +{ + switch (type) { + case Option::TYPE_INT: + case Option::TYPE_UINT: + case Option::TYPE_SIZE: + return PyLong_FromString((char *)value.c_str(), nullptr, 0); + case Option::TYPE_SECS: + case Option::TYPE_MILLISECS: + case Option::TYPE_FLOAT: + { + PyObject *s = PyUnicode_FromString(value.c_str()); + PyObject *f = PyFloat_FromString(s); + Py_DECREF(s); + return f; + } + case Option::TYPE_BOOL: + if (value == "1" || value == "true" || value == "True" || + value == "on" || value == "yes") { + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } + case Option::TYPE_STR: + case Option::TYPE_ADDR: + case Option::TYPE_ADDRVEC: + case Option::TYPE_UUID: + break; + } + return PyUnicode_FromString(value.c_str()); +} diff --git a/src/mgr/PyUtil.h b/src/mgr/PyUtil.h new file mode 100644 index 000000000..188b3d28f --- /dev/null +++ b/src/mgr/PyUtil.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> + +#include <Python.h> + +#include "common/options.h" + +PyObject *get_python_typed_option_value( + Option::type_t type, + const std::string& value); diff --git a/src/mgr/ServiceMap.cc b/src/mgr/ServiceMap.cc new file mode 100644 index 000000000..b6f8ad97c --- /dev/null +++ b/src/mgr/ServiceMap.cc @@ -0,0 +1,244 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/ServiceMap.h" + +#include <experimental/iterator> +#include <fmt/format.h> +#include <regex> + +#include "common/Formatter.h" + +using ceph::bufferlist; +using ceph::Formatter; + +// Daemon + +void ServiceMap::Daemon::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 1, bl); + encode(gid, bl); + encode(addr, bl, features); + encode(start_epoch, bl); + encode(start_stamp, bl); + encode(metadata, bl); + encode(task_status, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Daemon::decode(bufferlist::const_iterator& p) +{ + DECODE_START(2, p); + decode(gid, p); + decode(addr, p); + decode(start_epoch, p); + decode(start_stamp, p); + decode(metadata, p); + if (struct_v >= 2) { + decode(task_status, p); + } + DECODE_FINISH(p); +} + +void ServiceMap::Daemon::dump(Formatter *f) const +{ + f->dump_unsigned("start_epoch", start_epoch); + f->dump_stream("start_stamp") << start_stamp; + f->dump_unsigned("gid", gid); + f->dump_string("addr", 
addr.get_legacy_str()); + f->open_object_section("metadata"); + for (auto& p : metadata) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); + f->open_object_section("task_status"); + for (auto& p : task_status) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Daemon::generate_test_instances(std::list<Daemon*>& ls) +{ + ls.push_back(new Daemon); + ls.push_back(new Daemon); + ls.back()->gid = 222; + ls.back()->metadata["this"] = "that"; + ls.back()->task_status["task1"] = "running"; +} + +// Service + +std::string ServiceMap::Service::get_summary() const +{ + if (!summary.empty()) { + return summary; + } + if (daemons.empty()) { + return "no daemons active"; + } + + // If "daemon_type" is present, this will be used in place of "daemon" when + // reporting the count (e.g., "${N} daemons"). + // + // We will additional break down the count by various groupings, based + // on the following keys: + // + // "hostname" -> host(s) + // "zone_id" -> zone(s) + // + // The `ceph -s` will be something likes: + // iscsi: 3 portals active (3 hosts) + // rgw: 3 gateways active (3 hosts, 1 zone) + + std::map<std::string, std::set<std::string>> groupings; + std::string type("daemon"); + int num = 0; + for (auto& d : daemons) { + ++num; + if (auto p = d.second.metadata.find("daemon_type"); + p != d.second.metadata.end()) { + type = p->second; + } + for (auto k : {make_pair("zone", "zone_id"), + make_pair("host", "hostname")}) { + auto p = d.second.metadata.find(k.second); + if (p != d.second.metadata.end()) { + groupings[k.first].insert(p->second); + } + } + } + + std::ostringstream ss; + ss << num << " " << type << (num > 1 ? "s" : "") << " active"; + if (groupings.size()) { + ss << " ("; + for (auto i = groupings.begin(); i != groupings.end(); ++i) { + if (i != groupings.begin()) { + ss << ", "; + } + ss << i->second.size() << " " << i->first << (i->second.size() ? 
"s" : ""); + } + ss << ")"; + } + + return ss.str(); +} + +bool ServiceMap::Service::has_running_tasks() const +{ + return std::any_of(daemons.begin(), daemons.end(), [](auto& daemon) { + return !daemon.second.task_status.empty(); + }); +} + +std::string ServiceMap::Service::get_task_summary(const std::string_view task_prefix) const +{ + // contruct a map similar to: + // {"service1 status" -> {"service1.0" -> "running"}} + // {"service2 status" -> {"service2.0" -> "idle"}, + // {"service2.1" -> "running"}} + std::map<std::string, std::map<std::string, std::string>> by_task; + for (const auto& [service_id, daemon] : daemons) { + for (const auto& [task_name, status] : daemon.task_status) { + by_task[task_name].emplace(fmt::format("{}.{}", task_prefix, service_id), + status); + } + } + std::stringstream ss; + for (const auto &[task_name, status_by_service] : by_task) { + ss << "\n " << task_name << ":"; + for (auto& [service, status] : status_by_service) { + ss << "\n " << service << ": " << status; + } + } + return ss.str(); +} + +void ServiceMap::Service::count_metadata(const std::string& field, + std::map<std::string,int> *out) const +{ + for (auto& p : daemons) { + auto q = p.second.metadata.find(field); + if (q == p.second.metadata.end()) { + (*out)["unknown"]++; + } else { + (*out)[q->second]++; + } + } +} + +void ServiceMap::Service::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(daemons, bl, features); + encode(summary, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Service::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(daemons, p); + decode(summary, p); + DECODE_FINISH(p); +} + +void ServiceMap::Service::dump(Formatter *f) const +{ + f->open_object_section("daemons"); + f->dump_string("summary", summary); + for (auto& p : daemons) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Service::generate_test_instances(std::list<Service*>& ls) +{ + ls.push_back(new Service); + ls.push_back(new Service); + ls.back()->daemons["one"].gid = 1; + ls.back()->daemons["two"].gid = 2; +} + +// ServiceMap + +void ServiceMap::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(modified, bl); + encode(services, bl, features); + ENCODE_FINISH(bl); +} + +void ServiceMap::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(epoch, p); + decode(modified, p); + decode(services, p); + DECODE_FINISH(p); +} + +void ServiceMap::dump(Formatter *f) const +{ + f->dump_unsigned("epoch", epoch); + f->dump_stream("modified") << modified; + f->open_object_section("services"); + for (auto& p : services) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::generate_test_instances(std::list<ServiceMap*>& ls) +{ + ls.push_back(new ServiceMap); + ls.push_back(new ServiceMap); + ls.back()->epoch = 123; + ls.back()->services["rgw"].daemons["one"].gid = 123; + ls.back()->services["rgw"].daemons["two"].gid = 344; + ls.back()->services["iscsi"].daemons["foo"].gid = 3222; +} diff --git a/src/mgr/ServiceMap.h b/src/mgr/ServiceMap.h new file mode 100644 index 000000000..ed027907c --- /dev/null +++ b/src/mgr/ServiceMap.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> +#include <list> +#include <sstream> + +#include "include/utime.h" +#include "include/buffer.h" +#include 
"msg/msg_types.h" + +namespace ceph { + class Formatter; +} + +struct ServiceMap { + struct Daemon { + uint64_t gid = 0; + entity_addr_t addr; + epoch_t start_epoch = 0; ///< epoch first registered + utime_t start_stamp; ///< timestamp daemon started/registered + std::map<std::string,std::string> metadata; ///< static metadata + std::map<std::string,std::string> task_status; ///< running task status + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Daemon*>& ls); + }; + + struct Service { + std::map<std::string,Daemon> daemons; + std::string summary; ///< summary status std::string for 'ceph -s' + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Service*>& ls); + + std::string get_summary() const; + bool has_running_tasks() const; + std::string get_task_summary(const std::string_view task_prefix) const; + void count_metadata(const std::string& field, + std::map<std::string,int> *out) const; + }; + + epoch_t epoch = 0; + utime_t modified; + std::map<std::string,Service> services; + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ServiceMap*>& ls); + + std::pair<Daemon*,bool> get_daemon(const std::string& service, + const std::string& daemon) { + auto& s = services[service]; + auto [d, added] = s.daemons.try_emplace(daemon); + return {&d->second, added}; + } + + bool rm_daemon(const std::string& service, + const std::string& daemon) { + auto p = services.find(service); + if (p == services.end()) { + return false; + } + auto q = p->second.daemons.find(daemon); + if (q == p->second.daemons.end()) { + return false; + } + p->second.daemons.erase(q); + if (p->second.daemons.empty()) { + services.erase(p); + } + return true; + } + + static inline bool is_normal_ceph_entity(std::string_view type) { + if (type == "osd" || + type == "client" || + type == "mon" || + type == "mds" || + type == "mgr") { + return true; + } + + return false; + } +}; +WRITE_CLASS_ENCODER_FEATURES(ServiceMap) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Service) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Daemon) diff --git a/src/mgr/StandbyPyModules.cc b/src/mgr/StandbyPyModules.cc new file mode 100644 index 000000000..86ee8550c --- /dev/null +++ b/src/mgr/StandbyPyModules.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "StandbyPyModules.h" + +#include "common/Finisher.h" +#include "common/debug.h" +#include "common/errno.h" + +#include "mgr/MgrContext.h" +#include "mgr/Gil.h" + +// For ::mgr_store_prefix +#include "PyModuleRegistry.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +StandbyPyModules::StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc_, + Finisher &f) + : state(module_config, monc_), + clog(clog_), + finisher(f) +{ + state.set_mgr_map(mgr_map_); +} + +// FIXME: completely identical to ActivePyModules +void StandbyPyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Signal modules to drop out of serve() and/or tear down resources + for (auto &i : modules) { + auto module = i.second.get(); + const auto& name = i.first; + dout(10) << "waiting for module " << name << " to shutdown" << dendl; + lock.unlock(); + module->shutdown(); + lock.lock(); + dout(10) << "module " << name << " shutdown" << dendl; + } + + // For modules implementing serve(), finish the threads where we + // were running that. + for (auto &i : modules) { + lock.unlock(); + dout(10) << "joining thread for module " << i.first << dendl; + i.second->thread.join(); + dout(10) << "joined thread for module " << i.first << dendl; + lock.lock(); + } + + modules.clear(); +} + +void StandbyPyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + const auto name = py_module->get_name(); + auto standby_module = new StandbyPyModule(state, py_module, clog); + + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + finisher.queue(new LambdaContext([this, standby_module, name](int) { + int r = standby_module->load(); + if (r != 0) { + derr << "Failed to run module in standby mode ('" << name << "')" + << dendl; + delete standby_module; + } else { + std::lock_guard l(lock); + auto em = modules.emplace(name, standby_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + standby_module->thread.create(standby_module->get_thread_name()); + } + })); +} + +int StandbyPyModule::load() +{ + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. 
+ auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + ceph_assert(pThisPtr != nullptr); + auto pModuleName = PyUnicode_FromString(get_name().c_str()); + ceph_assert(pModuleName != nullptr); + auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr); + Py_DECREF(pThisPtr); + Py_DECREF(pModuleName); + + pClassInstance = PyObject_CallObject(py_module->pStandbyClass, pArgs); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + return 0; + } +} + +bool StandbyPyModule::get_config(const std::string &key, + std::string *value) const +{ + const std::string global_key = "mgr/" + get_name() + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + return state.with_config([global_key, value](const PyModuleConfig &config){ + if (config.config.count(global_key)) { + *value = config.config.at(global_key); + return true; + } else { + return false; + } + }); +} + +bool StandbyPyModule::get_store(const std::string &key, + std::string *value) const +{ + + const std::string global_key = PyModule::mgr_store_prefix + + get_name() + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + // Active modules use a cache of store values (kept up to date + // as writes pass through the active mgr), but standbys + // fetch values synchronously to get an up to date value. + // It's an acceptable cost because standby modules should not be + // doing a lot. + + MonClient &monc = state.get_monc(); + + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" + << global_key << "\"}"; + + bufferlist outbl; + std::string outs; + C_SaferCond c; + monc.start_mon_command( + {cmd_json.str()}, + {}, + &outbl, + &outs, + &c); + + int r = c.wait(); + if (r == -ENOENT) { + return false; + } else if (r != 0) { + // This is some internal error, not meaningful to python modules, + // so let them just see no value. + derr << __func__ << " error fetching store key '" << global_key << "': " + << cpp_strerror(r) << " " << outs << dendl; + return false; + } else { + *value = outbl.to_str(); + return true; + } +} + +std::string StandbyPyModule::get_active_uri() const +{ + std::string result; + state.with_mgr_map([&result, this](const MgrMap &mgr_map){ + auto iter = mgr_map.services.find(get_name()); + if (iter != mgr_map.services.end()) { + result = iter->second; + } + }); + + return result; +} + diff --git a/src/mgr/StandbyPyModules.h b/src/mgr/StandbyPyModules.h new file mode 100644 index 000000000..501dfc8c7 --- /dev/null +++ b/src/mgr/StandbyPyModules.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#pragma once + +#include <string> +#include <map> + +#include <Python.h> + +#include "common/Thread.h" +#include "common/ceph_mutex.h" + +#include "mgr/Gil.h" +#include "mon/MonClient.h" +#include "mon/MgrMap.h" +#include "mgr/PyModuleRunner.h" + +class Finisher; + +/** + * State that is read by all modules running in standby mode + */ +class StandbyPyModuleState +{ + mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModuleState::lock"); + + MgrMap mgr_map; + PyModuleConfig &module_config; + MonClient &monc; + +public: + + + StandbyPyModuleState(PyModuleConfig &module_config_, MonClient &monc_) + : module_config(module_config_), monc(monc_) + {} + + void set_mgr_map(const MgrMap &mgr_map_) + { + std::lock_guard l(lock); + + mgr_map = mgr_map_; + } + + // MonClient does all its own locking so we're happy to hand out + // references. + MonClient &get_monc() {return monc;}; + + template<typename Callback, typename...Args> + void with_mgr_map(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_config(Callback&& cb, Args&&... args) const -> + decltype(cb(module_config, std::forward<Args>(args)...)) { + std::lock_guard l(lock); + + return std::forward<Callback>(cb)(module_config, std::forward<Args>(args)...); + } +}; + + +class StandbyPyModule : public PyModuleRunner +{ + StandbyPyModuleState &state; + + public: + + StandbyPyModule( + StandbyPyModuleState &state_, + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + PyModuleRunner(py_module_, clog_), + state(state_) + { + } + + bool get_config(const std::string &key, std::string *value) const; + bool get_store(const std::string &key, std::string *value) const; + std::string get_active_uri() const; + entity_addrvec_t get_myaddrs() const { + return state.get_monc().get_myaddrs(); + } + + int load(); +}; + +class StandbyPyModules +{ +private: + mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModules::lock"); + std::map<std::string, std::unique_ptr<StandbyPyModule>> modules; + + StandbyPyModuleState state; + + LogChannelRef clog; + + Finisher &finisher; + +public: + + StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc, + Finisher &f); + + void start_one(PyModuleRef py_module); + + void shutdown(); + + void handle_mgr_map(const MgrMap &mgr_map) + { + state.set_mgr_map(mgr_map); + } + +}; diff --git a/src/mgr/TTLCache.cc b/src/mgr/TTLCache.cc new file mode 100644 index 000000000..05fe95987 --- /dev/null +++ b/src/mgr/TTLCache.cc @@ -0,0 +1,100 @@ +#include "TTLCache.h" + +#include <chrono> +#include <functional> +#include <string> + +#include "PyUtil.h" + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::insert(Key key, Value value) { + auto now = std::chrono::steady_clock::now(); + + if (!ttl) return; + int16_t random_ttl_offset = + ttl * ttl_spread_ratio * (2l * rand() / float(RAND_MAX) - 1); + // in order not to have spikes of misses we increase or decrease by 25% of + // the ttl + int16_t spreaded_ttl = ttl + random_ttl_offset; + auto expiration_date = now + std::chrono::seconds(spreaded_ttl); + cache::insert(key, {value, expiration_date}); +} + +template <class Key, class Value> Value TTLCacheBase<Key, Value>::get(Key key) { + if (!exists(key)) { + throw_key_not_found(key); + } + if (expired(key)) { + erase(key); + throw_key_not_found(key); + } + Value value = {get_value(key)}; + return 
value; +} + +template <class Key> PyObject* TTLCache<Key, PyObject*>::get(Key key) { + if (!this->exists(key)) { + this->throw_key_not_found(key); + } + if (this->expired(key)) { + this->erase(key); + this->throw_key_not_found(key); + } + PyObject* cached_value = this->get_value(key); + Py_INCREF(cached_value); + return cached_value; +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::erase(Key key) { + cache::erase(key); +} + +template <class Key> void TTLCache<Key, PyObject*>::erase(Key key) { + Py_DECREF(this->get_value(key, false)); + ttl_base::erase(key); +} + +template <class Key, class Value> +bool TTLCacheBase<Key, Value>::expired(Key key) { + ttl_time_point expiration_date = get_value_time_point(key); + auto now = std::chrono::steady_clock::now(); + if (now >= expiration_date) { + return true; + } else { + return false; + } +} + +template <class Key, class Value> void TTLCacheBase<Key, Value>::clear() { + cache::clear(); +} + +template <class Key, class Value> +Value TTLCacheBase<Key, Value>::get_value(Key key, bool count_hit) { + value_type stored_value = cache::get(key, count_hit); + Value value = std::get<0>(stored_value); + return value; +} + +template <class Key, class Value> +ttl_time_point TTLCacheBase<Key, Value>::get_value_time_point(Key key) { + value_type stored_value = cache::get(key, false); + ttl_time_point tp = std::get<1>(stored_value); + return tp; +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::set_ttl(uint16_t ttl) { + this->ttl = ttl; +} + +template <class Key, class Value> +bool TTLCacheBase<Key, Value>::exists(Key key) { + return cache::exists(key); +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::throw_key_not_found(Key key) { + cache::throw_key_not_found(key); +} diff --git a/src/mgr/TTLCache.h b/src/mgr/TTLCache.h new file mode 100644 index 000000000..a6d5ddf2e --- /dev/null +++ b/src/mgr/TTLCache.h @@ -0,0 +1,124 @@ +#pragma once + +#include <atomic> +#include <chrono> +#include <functional> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "PyUtil.h" + +using namespace std; + +template <class Key, class Value> class Cache { + private: + std::atomic<uint64_t> hits, misses; + + protected: + unsigned int capacity; + Cache(unsigned int size = UINT16_MAX) : hits{0}, misses{0}, capacity{size} {}; + std::map<Key, Value> content; + std::vector<string> allowed_keys = {"osd_map", "pg_dump", "pg_stats"}; + + void mark_miss() { + misses++; + } + + void mark_hit() { + hits++; + } + + unsigned int get_misses() { return misses; } + unsigned int get_hits() { return hits; } + void throw_key_not_found(Key key) { + std::stringstream ss; + ss << "Key " << key << " couldn't be found\n"; + throw std::out_of_range(ss.str()); + } + + public: + void insert(Key key, Value value) { + mark_miss(); + if (content.size() < capacity) { + content.insert({key, value}); + } + } + Value get(Key key, bool count_hit = true) { + if (count_hit) { + mark_hit(); + } + return content[key]; + } + void erase(Key key) { content.erase(content.find(key)); } + void clear() { content.clear(); } + bool exists(Key key) { return content.find(key) != content.end(); } + std::pair<uint64_t, uint64_t> get_hit_miss_ratio() { + return std::make_pair(hits.load(), misses.load()); + } + bool is_cacheable(Key key) { + for (auto k : allowed_keys) { + if (key == k) return true; + } + return false; + } + int size() { return content.size(); } + + ~Cache(){}; +}; + +using ttl_time_point = 
std::chrono::time_point<std::chrono::steady_clock>; +template <class Key, class Value> +class TTLCacheBase : public Cache<Key, std::pair<Value, ttl_time_point>> { + private: + uint16_t ttl; + float ttl_spread_ratio; + using value_type = std::pair<Value, ttl_time_point>; + using cache = Cache<Key, value_type>; + + protected: + Value get_value(Key key, bool count_hit = true); + ttl_time_point get_value_time_point(Key key); + bool exists(Key key); + bool expired(Key key); + void finish_get(Key key); + void finish_erase(Key key); + void throw_key_not_found(Key key); + + public: + TTLCacheBase(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, + float spread = 0.25) + : Cache<Key, value_type>(size), ttl{ttl_}, ttl_spread_ratio{spread} {} + ~TTLCacheBase(){}; + void insert(Key key, Value value); + Value get(Key key); + void erase(Key key); + void clear(); + uint16_t get_ttl() { return ttl; }; + void set_ttl(uint16_t ttl); +}; + +template <class Key, class Value> +class TTLCache : public TTLCacheBase<Key, Value> { + public: + TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25) + : TTLCacheBase<Key, Value>(ttl_, size, spread) {} + ~TTLCache(){}; +}; + +template <class Key> +class TTLCache<Key, PyObject*> : public TTLCacheBase<Key, PyObject*> { + public: + TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25) + : TTLCacheBase<Key, PyObject*>(ttl_, size, spread) {} + ~TTLCache(){}; + PyObject* get(Key key); + void erase(Key key); + + private: + using ttl_base = TTLCacheBase<Key, PyObject*>; +}; + +#include "TTLCache.cc" + diff --git a/src/mgr/Types.h b/src/mgr/Types.h new file mode 100644 index 000000000..ab90bbbe9 --- /dev/null +++ b/src/mgr/Types.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_TYPES_H +#define CEPH_MGR_TYPES_H + +typedef int MetricQueryID; + +typedef std::pair<uint64_t,uint64_t> PerformanceCounter; +typedef std::vector<PerformanceCounter> PerformanceCounters; + +struct MetricListener { + virtual ~MetricListener() { + } + + virtual void handle_query_updated() = 0; +}; + +struct PerfCollector { + MetricQueryID query_id; + PerfCollector(MetricQueryID query_id) + : query_id(query_id) { + } +}; + +#endif // CEPH_MGR_TYPES_H diff --git a/src/mgr/mgr_commands.cc b/src/mgr/mgr_commands.cc new file mode 100644 index 000000000..206d1126a --- /dev/null +++ b/src/mgr/mgr_commands.cc @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr_commands.h" + +/* The set of statically defined (C++-handled) commands. 
This + * does not include the Python-defined commands, which are loaded + * in PyModules */ +const std::vector<MonCommand> mgr_commands = { +#define COMMAND(parsesig, helptext, module, perm) \ + {parsesig, helptext, module, perm, 0}, +#include "MgrCommands.h" +#undef COMMAND +}; diff --git a/src/mgr/mgr_commands.h b/src/mgr/mgr_commands.h new file mode 100644 index 000000000..c6ed6c68d --- /dev/null +++ b/src/mgr/mgr_commands.h @@ -0,0 +1,9 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "mon/MonCommand.h" +#include <vector> + +extern const std::vector<MonCommand> mgr_commands; diff --git a/src/mgr/mgr_perf_counters.cc b/src/mgr/mgr_perf_counters.cc new file mode 100644 index 000000000..1b5585f9e --- /dev/null +++ b/src/mgr/mgr_perf_counters.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "mgr_perf_counters.h" +#include "common/perf_counters.h" +#include "common/ceph_context.h" + +PerfCounters *perfcounter = NULL; + +int mgr_perf_start(CephContext *cct) +{ + PerfCountersBuilder plb(cct, "mgr", l_mgr_first, l_mgr_last); + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + plb.add_u64_counter(l_mgr_cache_hit, "cache_hit", "Cache hits"); + plb.add_u64_counter(l_mgr_cache_miss, "cache_miss", "Cache miss"); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); + return 0; +} + +void mgr_perf_stop(CephContext *cct) +{ + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; +} diff --git a/src/mgr/mgr_perf_counters.h b/src/mgr/mgr_perf_counters.h new file mode 100644 index 000000000..d695d905f --- /dev/null +++ b/src/mgr/mgr_perf_counters.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include "include/common_fwd.h" + +extern PerfCounters* perfcounter; + +extern int mgr_perf_start(CephContext* cct); +extern void mgr_perf_stop(CephContext* cct); + +enum { + l_mgr_first, + + l_mgr_cache_hit, + l_mgr_cache_miss, + + l_mgr_last, +}; + |