summaryrefslogtreecommitdiffstats
path: root/src/mgr
diff options
context:
space:
mode:
Diffstat (limited to 'src/mgr')
-rw-r--r--src/mgr/ActivePyModule.cc275
-rw-r--r--src/mgr/ActivePyModule.h102
-rw-r--r--src/mgr/ActivePyModules.cc1513
-rw-r--r--src/mgr/ActivePyModules.h228
-rw-r--r--src/mgr/BaseMgrModule.cc1623
-rw-r--r--src/mgr/BaseMgrModule.h7
-rw-r--r--src/mgr/BaseMgrStandbyModule.cc269
-rw-r--r--src/mgr/BaseMgrStandbyModule.h6
-rw-r--r--src/mgr/CMakeLists.txt46
-rw-r--r--src/mgr/ClusterState.cc384
-rw-r--r--src/mgr/ClusterState.h163
-rw-r--r--src/mgr/DaemonHealthMetric.h76
-rw-r--r--src/mgr/DaemonHealthMetricCollector.cc101
-rw-r--r--src/mgr/DaemonHealthMetricCollector.h32
-rw-r--r--src/mgr/DaemonKey.cc35
-rw-r--r--src/mgr/DaemonKey.h24
-rw-r--r--src/mgr/DaemonServer.cc3146
-rw-r--r--src/mgr/DaemonServer.h316
-rw-r--r--src/mgr/DaemonState.cc381
-rw-r--r--src/mgr/DaemonState.h409
-rw-r--r--src/mgr/Gil.cc114
-rw-r--r--src/mgr/Gil.h114
-rw-r--r--src/mgr/MDSPerfMetricCollector.cc64
-rw-r--r--src/mgr/MDSPerfMetricCollector.h28
-rw-r--r--src/mgr/MDSPerfMetricTypes.cc153
-rw-r--r--src/mgr/MDSPerfMetricTypes.h367
-rw-r--r--src/mgr/MetricCollector.cc191
-rw-r--r--src/mgr/MetricCollector.h85
-rw-r--r--src/mgr/MetricTypes.h277
-rw-r--r--src/mgr/Mgr.cc795
-rw-r--r--src/mgr/Mgr.h143
-rw-r--r--src/mgr/MgrCap.cc580
-rw-r--r--src/mgr/MgrCap.h201
-rw-r--r--src/mgr/MgrClient.cc662
-rw-r--r--src/mgr/MgrClient.h215
-rw-r--r--src/mgr/MgrCommands.h211
-rw-r--r--src/mgr/MgrContext.h73
-rw-r--r--src/mgr/MgrSession.h40
-rw-r--r--src/mgr/MgrStandby.cc503
-rw-r--r--src/mgr/MgrStandby.h89
-rw-r--r--src/mgr/OSDPerfMetricCollector.cc39
-rw-r--r--src/mgr/OSDPerfMetricCollector.h23
-rw-r--r--src/mgr/OSDPerfMetricTypes.cc134
-rw-r--r--src/mgr/OSDPerfMetricTypes.h360
-rw-r--r--src/mgr/PyFormatter.cc140
-rw-r--r--src/mgr/PyFormatter.h163
-rw-r--r--src/mgr/PyModule.cc729
-rw-r--r--src/mgr/PyModule.h191
-rw-r--r--src/mgr/PyModuleRegistry.cc454
-rw-r--r--src/mgr/PyModuleRegistry.h231
-rw-r--r--src/mgr/PyModuleRunner.cc110
-rw-r--r--src/mgr/PyModuleRunner.h89
-rw-r--r--src/mgr/PyOSDMap.cc682
-rw-r--r--src/mgr/PyOSDMap.h18
-rw-r--r--src/mgr/PyUtil.cc42
-rw-r--r--src/mgr/PyUtil.h14
-rw-r--r--src/mgr/ServiceMap.cc244
-rw-r--r--src/mgr/ServiceMap.h97
-rw-r--r--src/mgr/StandbyPyModules.cc200
-rw-r--r--src/mgr/StandbyPyModules.h133
-rw-r--r--src/mgr/TTLCache.cc100
-rw-r--r--src/mgr/TTLCache.h124
-rw-r--r--src/mgr/Types.h26
-rw-r--r--src/mgr/mgr_commands.cc14
-rw-r--r--src/mgr/mgr_commands.h9
-rw-r--r--src/mgr/mgr_perf_counters.cc28
-rw-r--r--src/mgr/mgr_perf_counters.h20
67 files changed, 18455 insertions, 0 deletions
diff --git a/src/mgr/ActivePyModule.cc b/src/mgr/ActivePyModule.cc
new file mode 100644
index 000000000..c776acfd0
--- /dev/null
+++ b/src/mgr/ActivePyModule.cc
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "PyFormatter.h"
+
+#include "common/debug.h"
+#include "mon/MonCommand.h"
+
+#include "ActivePyModule.h"
+#include "MgrSession.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// Construct the module's Python class instance under the module's own
+// Python thread state.
+//
+// The class is called with three positional arguments: the module name and
+// two capsules wrapping `py_modules` and `this`.
+//
+// Returns 0 on success, or -EINVAL if the constructor call failed (the
+// Python error is logged).
+int ActivePyModule::load(ActivePyModules *py_modules)
+{
+  ceph_assert(py_modules);
+  Gil gil(py_module->pMyThreadState, true);
+
+  // We tell the module how we name it, so that it can be consistent
+  // with us in logging etc.
+  auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+  auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr);
+  auto pModuleName = PyUnicode_FromString(get_name().c_str());
+  auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr);
+
+  pClassInstance = PyObject_CallObject(py_module->pClass, pArgs);
+
+  // PyTuple_Pack() takes its own reference to every item, so all three
+  // references created above are still ours to release. Releasing only
+  // the name would leak both capsules.
+  Py_DECREF(pModuleName);
+  Py_DECREF(pPyModules);
+  Py_DECREF(pThisPtr);
+  Py_DECREF(pArgs);
+
+  if (pClassInstance == nullptr) {
+    derr << "Failed to construct class in '" << get_name() << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  } else {
+    dout(1) << "Constructed class from module: " << get_name() << dendl;
+  }
+
+  return 0;
+}
+
+// Deliver a (type, id) notification to the module by calling its Python
+// `notify` method. Does nothing except log when the module is dead.
+// A Python-side failure is logged and otherwise ignored.
+void ActivePyModule::notify(const std::string &notify_type, const std::string &notify_id)
+{
+  if (is_dead()) {
+    dout(5) << "cancelling notify " << notify_type << " " << notify_id << dendl;
+    return;
+  }
+
+  ceph_assert(pClassInstance != nullptr);
+
+  Gil gil(py_module->pMyThreadState, true);
+
+  PyObject *result = PyObject_CallMethod(
+    pClassInstance,
+    const_cast<char*>("notify"), const_cast<char*>("(ss)"),
+    notify_type.c_str(), notify_id.c_str());
+
+  if (result == nullptr) {
+    derr << get_name() << ".notify:" << dendl;
+    derr << handle_pyerror() << dendl;
+    // FIXME: callers can't be expected to handle a python module
+    // that has spontaneously broken, but Mgr() should provide
+    // a hook to unload misbehaving modules when they have an
+    // error somewhere like this
+    return;
+  }
+
+  Py_DECREF(result);
+}
+
+// Deliver a cluster log entry to the module: the entry is dumped through a
+// PyFormatter and passed to the Python `notify` method as ("clog", entry).
+// Does nothing except log when the module is dead.
+void ActivePyModule::notify_clog(const LogEntry &log_entry)
+{
+  if (is_dead()) {
+    dout(5) << "cancelling notify_clog" << dendl;
+    return;
+  }
+
+  ceph_assert(pClassInstance != nullptr);
+
+  Gil gil(py_module->pMyThreadState, true);
+
+  // Python-ized copy of the LogEntry
+  PyFormatter formatter;
+  log_entry.dump(&formatter);
+  auto py_entry = formatter.get();
+
+  // The "N" format unit passes our reference to py_entry along with the
+  // call, so there is no matching Py_DECREF for it here.
+  PyObject *result = PyObject_CallMethod(
+    pClassInstance,
+    const_cast<char*>("notify"), const_cast<char*>("(sN)"),
+    "clog", py_entry);
+
+  if (result == nullptr) {
+    derr << get_name() << ".notify_clog:" << dendl;
+    derr << handle_pyerror() << dendl;
+    // FIXME: callers can't be expected to handle a python module
+    // that has spontaneously broken, but Mgr() should provide
+    // a hook to unload misbehaving modules when they have an
+    // error somewhere like this
+    return;
+  }
+
+  Py_DECREF(result);
+}
+
+// Report whether the module's class instance has an attribute named
+// `method`. Only the attribute lookup is performed; the attribute is not
+// called and is not checked for being callable.
+bool ActivePyModule::method_exists(const std::string &method) const
+{
+  Gil gil(py_module->pMyThreadState, true);
+
+  auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str());
+  if (boundMethod == nullptr) {
+    // A failed lookup leaves a Python exception pending on this thread
+    // state. "Not found" is an ordinary answer here, so clear it rather
+    // than let it surface in an unrelated later Python call.
+    PyErr_Clear();
+    return false;
+  }
+
+  Py_DECREF(boundMethod);
+  return true;
+}
+
+// Call `method` on this module's class instance with the given positional
+// and keyword arguments, on behalf of another module.
+//
+// Returns the call's result, or nullptr on failure, in which case *err
+// receives the text produced by handle_pyerror(). The caller must already
+// have verified that the method exists (see method_exists()).
+PyObject *ActivePyModule::dispatch_remote(
+    const std::string &method,
+    PyObject *args,
+    PyObject *kwargs,
+    std::string *err)
+{
+  ceph_assert(err != nullptr);
+
+  // Rather than serializing arguments, pass the CPython objects.
+  // Works because we happen to know that the subinterpreter
+  // implementation shares a GIL, allocator, deallocator and GC state, so
+  // it's okay to pass the objects between subinterpreters.
+  // But in future this might involve serialization to support a CSP-aware
+  // future Python interpreter a la PEP554
+
+  Gil gil(py_module->pMyThreadState, true);
+
+  // Look up the receiving method
+  PyObject *callable = PyObject_GetAttrString(pClassInstance, method.c_str());
+
+  // Caller should have done method_exists check first!
+  ceph_assert(callable != nullptr);
+
+  dout(20) << "Calling " << py_module->get_name()
+           << "." << method << "..." << dendl;
+
+  PyObject *result = PyObject_Call(callable, args, kwargs);
+  Py_DECREF(callable);
+
+  if (result != nullptr) {
+    dout(20) << "Success calling '" << method << "'" << dendl;
+  } else {
+    // Because the caller is in a different context, we can't let this
+    // exception bubble up, need to re-raise it from the caller's
+    // context later.
+    *err = handle_pyerror();
+  }
+
+  return result;
+}
+
+// Invoke the module's Python `_config_notify` method with no arguments.
+// Does nothing except log when the module is dead. A Python-side failure
+// is logged and otherwise ignored.
+void ActivePyModule::config_notify()
+{
+  if (is_dead()) {
+    dout(5) << "cancelling config_notify" << dendl;
+    return;
+  }
+
+  Gil gil(py_module->pMyThreadState, true);
+  dout(20) << "Calling " << py_module->get_name() << "._config_notify..."
+           << dendl;
+  auto remoteResult = PyObject_CallMethod(pClassInstance,
+                                          const_cast<char*>("_config_notify"),
+                                          (char*)NULL);
+  if (remoteResult != nullptr) {
+    Py_DECREF(remoteResult);
+  } else {
+    // Report the failure the same way notify() does, instead of silently
+    // dropping it and leaving the exception pending on this thread state.
+    derr << get_name() << "._config_notify:" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+}
+
+// Run a command through the module's Python `_handle_command` method.
+//
+// The command map is converted to a Python object and passed together
+// with the raw input buffer. The handler is expected to return a 3-tuple
+// (return code, output text, status text): the output text is appended to
+// *ds and the status text to *ss.
+//
+// While the Python call is in flight, m_session and m_command_perms are
+// set so that is_authorized() can be answered; both are reset afterwards.
+//
+// Returns the handler's return code, or -EINVAL if the module is not
+// instantiated, the handler raised, or its result is not a well-formed
+// 3-tuple.
+int ActivePyModule::handle_command(
+    const ModuleCommand& module_command,
+    const MgrSession& session,
+    const cmdmap_t &cmdmap,
+    const bufferlist &inbuf,
+    std::stringstream *ds,
+    std::stringstream *ss)
+{
+  ceph_assert(ss != nullptr);
+  ceph_assert(ds != nullptr);
+
+  if (pClassInstance == nullptr) {
+    // Not the friendliest error string, but we could only
+    // hit this in quite niche cases, if at all.
+    *ss << "Module not instantiated";
+    return -EINVAL;
+  }
+
+  Gil gil(py_module->pMyThreadState, true);
+
+  PyFormatter f;
+  TOPNSPC::common::cmdmap_dump(cmdmap, &f);
+  PyObject *py_cmd = f.get();
+  string instr;
+  inbuf.begin().copy(inbuf.length(), instr);
+
+  ceph_assert(m_session == nullptr);
+  m_command_perms = module_command.perm;
+  m_session = &session;
+
+  auto pResult = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("_handle_command"), const_cast<char*>("s#O"),
+      instr.c_str(), instr.length(), py_cmd);
+
+  m_command_perms.clear();
+  m_session = nullptr;
+  Py_DECREF(py_cmd);
+
+  if (pResult == nullptr) {
+    derr << "module '" << py_module->get_name() << "' command handler "
+            "threw exception: " << peek_pyerror() << dendl;
+    *ds << "";
+    *ss << handle_pyerror();
+    return -EINVAL;
+  }
+
+  int r = -EINVAL;
+  bool well_formed = false;
+
+  // PyTuple_Size() raises on a non-tuple, so check the type first.
+  if (PyTuple_Check(pResult) && PyTuple_Size(pResult) == 3) {
+    // Convert all three items before touching the output streams: each
+    // conversion can fail (signalled by a pending Python error or a null
+    // pointer), and a null const char* must never be streamed.
+    const long rc = PyLong_AsLong(PyTuple_GetItem(pResult, 0));
+    const bool rc_ok = !(rc == -1 && PyErr_Occurred());
+    const char *out =
+      rc_ok ? PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 1)) : nullptr;
+    const char *status =
+      out != nullptr ? PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 2)) : nullptr;
+
+    if (out != nullptr && status != nullptr) {
+      r = rc;
+      *ds << out;
+      *ss << status;
+      well_formed = true;
+    }
+  }
+
+  if (!well_formed) {
+    derr << "module '" << py_module->get_name() << "' command handler "
+            "returned wrong type!" << dendl;
+    // Drop whatever error a failed conversion left pending.
+    PyErr_Clear();
+  }
+
+  Py_DECREF(pResult);
+  return r;
+}
+
+// Merge this module's stored health checks into *checks.
+// A dead module contributes nothing.
+void ActivePyModule::get_health_checks(health_check_map_t *checks)
+{
+  if (!is_dead()) {
+    checks->merge(health_checks);
+    return;
+  }
+
+  dout(5) << "cancelling get_health_checks" << dendl;
+}
+
+// Check whether the session of the command currently being handled is
+// capable of acting on this module with the given arguments.
+// Returns false when no command is in flight (m_session is unset).
+bool ActivePyModule::is_authorized(
+    const std::map<std::string, std::string>& arguments) const {
+  if (m_session == nullptr) {
+    return false;
+  }
+
+  // No need to pass command prefix here since that would have already been
+  // tested before command invocation. Instead, only test for service/module
+  // arguments as defined by the module itself.
+  MonCommand mon_command {"", "", "", m_command_perms};
+  const auto need_r = mon_command.requires_perm('r');
+  const auto need_w = mon_command.requires_perm('w');
+  const auto need_x = mon_command.requires_perm('x');
+
+  return m_session->caps.is_capable(
+    nullptr, m_session->entity_name, "py",
+    py_module->get_name(), "", arguments,
+    need_r, need_w, need_x,
+    m_session->get_peer_addr());
+}
diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h
new file mode 100644
index 000000000..1cbf6d18a
--- /dev/null
+++ b/src/mgr/ActivePyModule.h
@@ -0,0 +1,102 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "Python.h"
+
+#include "common/cmdparse.h"
+#include "common/LogEntry.h"
+#include "common/Thread.h"
+#include "mon/health_check.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#include <vector>
+#include <string>
+
+
+class ActivePyModule;
+class ActivePyModules;
+class MgrSession;
+class ModuleCommand;
+
+// A PyModuleRunner that additionally carries the state a module needs in
+// active mode: its health checks, an optional URI, and the session and
+// permissions of the command it is currently handling.
+class ActivePyModule : public PyModuleRunner
+{
+private:
+  // Health checks most recently supplied via set_health_checks()
+  health_check_map_t health_checks;
+
+  // Optional, URI exposed by plugins that implement serve()
+  std::string uri;
+
+  // Set by handle_command() for the duration of the Python call and read
+  // by is_authorized(); cleared / null at all other times.
+  std::string m_command_perms;
+  const MgrSession* m_session = nullptr;
+
+public:
+  ActivePyModule(const PyModuleRef &py_module_,
+                 LogChannelRef clog_)
+    : PyModuleRunner(py_module_, clog_)
+  {}
+
+  // Construct the Python class instance; 0 on success, negative on error.
+  int load(ActivePyModules *py_modules);
+
+  // Forward a notification / cluster log entry to the Python `notify`.
+  void notify(const std::string &notify_type, const std::string &notify_id);
+  void notify_clog(const LogEntry &le);
+
+  // True if the class instance has an attribute called `method`.
+  bool method_exists(const std::string &method) const;
+
+  // Call `method` for another module; on failure returns nullptr and
+  // fills *err.
+  PyObject *dispatch_remote(
+    const std::string &method,
+    PyObject *args,
+    PyObject *kwargs,
+    std::string *err);
+
+  // Run a command through the Python `_handle_command`; output goes to
+  // *ds and status text to *ss.
+  int handle_command(
+    const ModuleCommand& module_command,
+    const MgrSession& session,
+    const cmdmap_t &cmdmap,
+    const bufferlist &inbuf,
+    std::stringstream *ds,
+    std::stringstream *ss);
+
+
+  // Replace the stored health checks with `c`.
+  // Returns true if the new checks differ from the previous ones.
+  bool set_health_checks(health_check_map_t&& c) {
+    // when health checks change a report is immediately sent to the monitors.
+    // currently modules have static health check details, but this equality
+    // test could be made smarter if too much noise shows up in the future.
+    const bool differs = health_checks != c;
+    health_checks = std::move(c);
+    return differs;
+  }
+  void get_health_checks(health_check_map_t *checks);
+  void config_notify();
+
+  void set_uri(const std::string &str) { uri = str; }
+
+  std::string get_uri() const { return uri; }
+
+  bool is_authorized(const std::map<std::string, std::string>& arguments) const;
+
+};
+
+std::string handle_pyerror();
+
diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc
new file mode 100644
index 000000000..e62e93b30
--- /dev/null
+++ b/src/mgr/ActivePyModules.cc
@@ -0,0 +1,1513 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+// Include this first to get python headers earlier
+#include "Gil.h"
+
+#include "common/errno.h"
+#include "include/stringify.h"
+
+#include "PyFormatter.h"
+
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+#include "osd/osd_types.h"
+#include "mgr/MgrContext.h"
+#include "mgr/TTLCache.h"
+#include "mgr/mgr_perf_counters.h"
+
+// For ::mgr_store_prefix
+#include "PyModule.h"
+#include "PyModuleRegistry.h"
+#include "PyUtil.h"
+
+#include "ActivePyModules.h"
+#include "DaemonKey.h"
+#include "DaemonServer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// Wire up references to the mgr's shared services, seed the key/value
+// store cache from `store_data`, refresh the config map and start the
+// dedicated command finisher.
+ActivePyModules::ActivePyModules(
+  PyModuleConfig &module_config_,
+  std::map<std::string, std::string> store_data,
+  bool mon_provides_kv_sub,
+  DaemonStateIndex &ds, ClusterState &cs,
+  MonClient &mc, LogChannelRef clog_,
+  LogChannelRef audit_clog_, Objecter &objecter_,
+  Client &client_, Finisher &f, DaemonServer &server,
+  PyModuleRegistry &pmr)
+  : module_config(module_config_),
+    daemon_state(ds),
+    cluster_state(cs),
+    monc(mc),
+    clog(clog_),
+    audit_clog(audit_clog_),
+    objecter(objecter_),
+    client(client_),
+    finisher(f),
+    cmd_finisher(g_ceph_context, "cmd_finisher", "cmdfin"),
+    server(server),
+    py_module_registry(pmr)
+{
+  store_cache = std::move(store_data);
+
+  // we can only trust our ConfigMap if the mon cluster has provided
+  // kv sub since our startup.
+  have_local_config_map = mon_provides_kv_sub;
+  _refresh_config_map();
+
+  cmd_finisher.start();
+}
+
+ActivePyModules::~ActivePyModules() = default; // compiler-generated; no custom teardown is done here (see shutdown())
+
+// Dump one server into `f`: its hostname, a "services" array with one
+// entry per daemon in `dmc`, and a server-level "ceph_version".
+//
+// The version reported is simply the one from the last daemon whose
+// metadata carried a "ceph_version"; each service entry reports whatever
+// value has been seen up to that point in the iteration.
+void ActivePyModules::dump_server(const std::string &hostname,
+                                  const DaemonStateCollection &dmc,
+                                  Formatter *f)
+{
+  f->dump_string("hostname", hostname);
+  f->open_array_section("services");
+  std::string ceph_version;
+
+  for (const auto &[key, state] : dmc) {
+    std::string id;
+    // Read the daemon's metadata under its own lock, with the GIL dropped.
+    without_gil([&ceph_version, &id, state=state] {
+      std::lock_guard l(state->lock);
+      const auto &md = state->metadata;
+      // TODO: pick the highest version, and make sure that
+      // somewhere else (during health reporting?) we are
+      // indicating to the user if we see mixed versions
+      if (auto ver = md.find("ceph_version"); ver != md.end()) {
+        ceph_version = ver->second;
+      }
+      if (auto name = md.find("id"); name != md.end()) {
+        id = name->second;
+      }
+    });
+
+    f->open_object_section("service");
+    f->dump_string("type", key.type);
+    f->dump_string("id", key.name);
+    f->dump_string("ceph_version", ceph_version);
+    if (!id.empty()) {
+      f->dump_string("name", id);
+    }
+    f->close_section();
+  }
+  f->close_section();
+
+  f->dump_string("ceph_version", ceph_version);
+}
+
+// Return a Python object describing the server `hostname` and the daemons
+// known on it (see dump_server()).
+PyObject *ActivePyModules::get_server_python(const std::string &hostname)
+{
+  // The daemon lookup takes our lock, so do it with the GIL released.
+  const auto daemons = without_gil([&]{
+    std::lock_guard l(lock);
+    dout(10) << " (" << hostname << ")" << dendl;
+    return daemon_state.get_by_server(hostname);
+  });
+
+  PyFormatter formatter;
+  dump_server(hostname, daemons, &formatter);
+  return formatter.get();
+}
+
+
+// Return a Python object with one "server" section per known host, each
+// filled in by dump_server().
+PyObject *ActivePyModules::list_servers_python()
+{
+  dout(10) << " >" << dendl;
+
+  // Drop the GIL while entering with_daemons_by_server(); it is taken
+  // back inside the callback before any Python object is built.
+  without_gil_t no_gil;
+  return daemon_state.with_daemons_by_server(
+    [this, &no_gil](const std::map<std::string, DaemonStateCollection> &all) {
+      with_gil_t with_gil{no_gil};
+      PyFormatter formatter(false, true);
+      for (const auto &[hostname, daemons] : all) {
+        formatter.open_object_section("server");
+        dump_server(hostname, daemons, &formatter);
+        formatter.close_section();
+      }
+      return formatter.get();
+    });
+}
+
+// Return a Python object holding the hostname and all metadata key/value
+// pairs of the daemon svc_type.svc_id, or None if no such daemon is known.
+PyObject *ActivePyModules::get_metadata_python(
+  const std::string &svc_type,
+  const std::string &svc_id)
+{
+  auto daemon = daemon_state.get(DaemonKey{svc_type, svc_id});
+  if (daemon == nullptr) {
+    derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // Acquire our lock with the GIL released; the guard is returned out of
+  // the helper and held until this function returns.
+  auto guard = without_gil([&] {
+    return std::lock_guard(lock);
+  });
+
+  PyFormatter formatter;
+  formatter.dump_string("hostname", daemon->hostname);
+  for (const auto &[name, value] : daemon->metadata) {
+    formatter.dump_string(name, value);
+  }
+
+  return formatter.get();
+}
+
+// Return a Python object holding the service_status key/value pairs of
+// the daemon svc_type.svc_id, or None if no such daemon is known.
+PyObject *ActivePyModules::get_daemon_status_python(
+  const std::string &svc_type,
+  const std::string &svc_id)
+{
+  auto daemon = daemon_state.get(DaemonKey{svc_type, svc_id});
+  if (daemon == nullptr) {
+    derr << "Requested missing service " << svc_type << "." << svc_id << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // Acquire our lock with the GIL released; the guard is returned out of
+  // the helper and held until this function returns.
+  auto guard = without_gil([&] {
+    return std::lock_guard(lock);
+  });
+
+  PyFormatter formatter;
+  for (const auto &[name, status] : daemon->service_status) {
+    formatter.dump_string(name, status);
+  }
+  return formatter.get();
+}
+
+// Publish the TTL cache's current hit/miss figures to the mgr perf
+// counters (first element -> l_mgr_cache_hit, second -> l_mgr_cache_miss).
+void ActivePyModules::update_cache_metrics() {
+  const auto ratio = ttl_cache.get_hit_miss_ratio();
+  perfcounter->set(l_mgr_cache_hit, ratio.first);
+  perfcounter->set(l_mgr_cache_miss, ratio.second);
+}
+
+// Like get_python(), but consults the TTL cache first when
+// mgr_ttl_cache_expire_seconds is non-zero. On a miss the freshly built
+// object is stored in the cache if `what` is cacheable. Cache metrics are
+// refreshed on every path.
+PyObject *ActivePyModules::cacheable_get_python(const std::string &what)
+{
+  const uint64_t ttl_seconds =
+    g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds");
+  const bool cache_enabled = ttl_seconds > 0;
+
+  if (cache_enabled) {
+    ttl_cache.set_ttl(ttl_seconds);
+    try {
+      PyObject *hit = ttl_cache.get(what);
+      update_cache_metrics();
+      return hit;
+    } catch (std::out_of_range&) {
+      // not cached: fall through and build it
+    }
+  }
+
+  PyObject *fresh = get_python(what);
+  if (cache_enabled && ttl_cache.is_cacheable(what)) {
+    ttl_cache.insert(what, fresh);
+    // extra reference taken alongside the cache insertion
+    Py_INCREF(fresh);
+  }
+  update_cache_metrics();
+  return fresh;
+}
+
+PyObject *ActivePyModules::get_python(const std::string &what)
+{ // Build a Python object for the cluster data set named by `what`; unknown names are logged and yield None.
+ uint64_t ttl_seconds = g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds");
+
+ PyFormatter pf;
+ PyJSONFormatter jf;
+ // Use PyJSONFormatter if TTL cache is enabled (ttl_seconds != 0), PyFormatter otherwise; every branch below writes through `f`.
+ Formatter &f = ttl_seconds ? (Formatter&)jf : (Formatter&)pf;
+
+ if (what == "fs_map") {
+ without_gil_t no_gil;
+ cluster_state.with_fsmap([&](const FSMap &fsmap) {
+ no_gil.acquire_gil();
+ fsmap.dump(&f);
+ });
+ } else if (what == "osdmap_crush_map_text") {
+ without_gil_t no_gil;
+ bufferlist rdata;
+ cluster_state.with_osdmap([&](const OSDMap &osd_map){
+ osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ });
+ std::string crush_text = rdata.to_str();
+ with_gil_t with_gil{no_gil};
+ return PyUnicode_FromString(crush_text.c_str()); // returned directly; neither formatter is used for this key
+ } else if (what.substr(0, 7) == "osd_map") { // prefix match: handles osd_map, osd_map_tree and osd_map_crush; any other osd_map* name dumps nothing
+ without_gil_t no_gil;
+ cluster_state.with_osdmap([&](const OSDMap &osd_map){
+ no_gil.acquire_gil();
+ if (what == "osd_map") {
+ osd_map.dump(&f);
+ } else if (what == "osd_map_tree") {
+ osd_map.print_tree(&f, nullptr);
+ } else if (what == "osd_map_crush") {
+ osd_map.crush->dump(&f);
+ }
+ });
+ } else if (what == "modified_config_options") {
+ without_gil_t no_gil;
+ auto all_daemons = daemon_state.get_all();
+ set<string> names;
+ for (auto& [key, daemon] : all_daemons) {
+ std::lock_guard l(daemon->lock);
+ for (auto& [name, valmap] : daemon->config) {
+ names.insert(name);
+ }
+ }
+ with_gil_t with_gil{no_gil};
+ f.open_array_section("options");
+ for (auto& name : names) {
+ f.dump_string("name", name);
+ }
+ f.close_section();
+ } else if (what.substr(0, 6) == "config") { // prefix match: handles config_options and config; any other config* name dumps nothing
+ if (what == "config_options") {
+ g_conf().config_options(&f);
+ } else if (what == "config") {
+ g_conf().show_config(&f);
+ }
+ } else if (what == "mon_map") {
+ without_gil_t no_gil;
+ cluster_state.with_monmap([&](const MonMap &monmap) {
+ no_gil.acquire_gil();
+ monmap.dump(&f);
+ });
+ } else if (what == "service_map") {
+ without_gil_t no_gil;
+ cluster_state.with_servicemap([&](const ServiceMap &service_map) {
+ no_gil.acquire_gil();
+ service_map.dump(&f);
+ });
+ } else if (what == "osd_metadata") {
+ without_gil_t no_gil;
+ auto dmc = daemon_state.get_by_service("osd");
+ for (const auto &[key, state] : dmc) {
+ std::lock_guard l(state->lock);
+ with_gil(no_gil, [&f, &name=key.name, state=state] {
+ f.open_object_section(name.c_str());
+ f.dump_string("hostname", state->hostname);
+ for (const auto &[name, val] : state->metadata) {
+ f.dump_string(name.c_str(), val);
+ }
+ f.close_section();
+ });
+ }
+ } else if (what == "mds_metadata") {
+ without_gil_t no_gil;
+ auto dmc = daemon_state.get_by_service("mds");
+ for (const auto &[key, state] : dmc) {
+ std::lock_guard l(state->lock);
+ with_gil(no_gil, [&f, &name=key.name, state=state] {
+ f.open_object_section(name.c_str());
+ f.dump_string("hostname", state->hostname);
+ for (const auto &[name, val] : state->metadata) {
+ f.dump_string(name.c_str(), val);
+ }
+ f.close_section();
+ });
+ }
+ } else if (what == "pg_summary") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap(
+ [&f, &no_gil](const PGMap &pg_map) {
+ std::map<std::string, std::map<std::string, uint32_t> > osds;
+ std::map<std::string, std::map<std::string, uint32_t> > pools;
+ std::map<std::string, uint32_t> all;
+ for (const auto &i : pg_map.pg_stat) {
+ const auto pool = i.first.m_pool;
+ const std::string state = pg_state_string(i.second.state);
+ // Insert to per-pool map
+ pools[stringify(pool)][state]++;
+ for (const auto &osd_id : i.second.acting) {
+ osds[stringify(osd_id)][state]++;
+ }
+ all[state]++;
+ }
+ with_gil_t with_gil{no_gil};
+ f.open_object_section("by_osd");
+ for (const auto &i : osds) {
+ f.open_object_section(i.first.c_str());
+ for (const auto &j : i.second) {
+ f.dump_int(j.first.c_str(), j.second);
+ }
+ f.close_section();
+ }
+ f.close_section();
+ f.open_object_section("by_pool");
+ for (const auto &i : pools) {
+ f.open_object_section(i.first.c_str());
+ for (const auto &j : i.second) {
+ f.dump_int(j.first.c_str(), j.second);
+ }
+ f.close_section();
+ }
+ f.close_section();
+ f.open_object_section("all");
+ for (const auto &i : all) {
+ f.dump_int(i.first.c_str(), i.second);
+ }
+ f.close_section();
+ f.open_object_section("pg_stats_sum");
+ pg_map.pg_sum.dump(&f);
+ f.close_section();
+ }
+ );
+ } else if (what == "pg_status") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap(
+ [&](const PGMap &pg_map) {
+ with_gil_t with_gil{no_gil};
+ pg_map.print_summary(&f, nullptr);
+ }
+ );
+ } else if (what == "pg_dump") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap(
+ [&](const PGMap &pg_map) {
+ with_gil_t with_gil{no_gil};
+ pg_map.dump(&f, false);
+ }
+ );
+ } else if (what == "devices") {
+ without_gil_t no_gil;
+ daemon_state.with_devices2(
+ [&] {
+ with_gil(no_gil, [&] { f.open_array_section("devices"); });
+ },
+ [&](const DeviceState &dev) {
+ with_gil(no_gil, [&] { f.dump_object("device", dev); });
+ });
+ with_gil(no_gil, [&] {
+ f.close_section();
+ });
+ } else if (what.size() > 7 &&
+ what.substr(0, 7) == "device ") { // "device <devid>": everything after the space is the device id
+ without_gil_t no_gil;
+ string devid = what.substr(7);
+ if (!daemon_state.with_device(devid,
+ [&] (const DeviceState& dev) {
+ with_gil_t with_gil{no_gil};
+ f.dump_object("device", dev);
+ })) {
+ // device not found: nothing is dumped into f
+ }
+ } else if (what == "io_rate") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap(
+ [&](const PGMap &pg_map) {
+ with_gil_t with_gil{no_gil};
+ pg_map.dump_delta(&f);
+ }
+ );
+ } else if (what == "df") {
+ without_gil_t no_gil;
+ cluster_state.with_osdmap_and_pgmap(
+ [&](
+ const OSDMap& osd_map,
+ const PGMap &pg_map) {
+ with_gil_t with_gil{no_gil};
+ pg_map.dump_cluster_stats(nullptr, &f, true);
+ pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true);
+ });
+ } else if (what == "pg_stats") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap([&](const PGMap &pg_map) {
+ no_gil.acquire_gil();
+ pg_map.dump_pg_stats(&f, false);
+ });
+ } else if (what == "pool_stats") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap([&](const PGMap &pg_map) {
+ no_gil.acquire_gil();
+ pg_map.dump_pool_stats(&f);
+ });
+ } else if (what == "pg_ready") {
+ server.dump_pg_ready(&f);
+ } else if (what == "pg_progress") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap([&](const PGMap &pg_map) {
+ no_gil.acquire_gil();
+ pg_map.dump_pg_progress(&f);
+ server.dump_pg_ready(&f);
+ });
+ } else if (what == "osd_stats") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap([&](const PGMap &pg_map) {
+ no_gil.acquire_gil();
+ pg_map.dump_osd_stats(&f, false);
+ });
+ } else if (what == "osd_ping_times") {
+ without_gil_t no_gil;
+ cluster_state.with_pgmap([&](const PGMap &pg_map) {
+ no_gil.acquire_gil();
+ pg_map.dump_osd_ping_times(&f);
+ });
+ } else if (what == "osd_pool_stats") {
+ int64_t poolid = -ENOENT;
+ without_gil_t no_gil;
+ cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap,
+ const PGMap& pg_map) {
+ with_gil_t with_gil{no_gil};
+ f.open_array_section("pool_stats");
+ for (auto &p : osdmap.get_pools()) {
+ poolid = p.first;
+ pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, &f, nullptr);
+ }
+ f.close_section();
+ });
+ } else if (what == "health") {
+ without_gil_t no_gil;
+ cluster_state.with_health([&](const ceph::bufferlist &health_json) {
+ no_gil.acquire_gil();
+ f.dump_string("json", health_json.to_str());
+ });
+ } else if (what == "mon_status") {
+ without_gil_t no_gil;
+ cluster_state.with_mon_status(
+ [&](const ceph::bufferlist &mon_status_json) {
+ with_gil_t with_gil{no_gil};
+ f.dump_string("json", mon_status_json.to_str());
+ });
+ } else if (what == "mgr_map") {
+ without_gil_t no_gil;
+ cluster_state.with_mgrmap([&](const MgrMap &mgr_map) {
+ no_gil.acquire_gil();
+ mgr_map.dump(&f);
+ });
+ } else if (what == "mgr_ips") {
+ entity_addrvec_t myaddrs = server.get_myaddrs();
+ f.open_array_section("ips");
+ std::set<std::string> did;
+ for (auto& i : myaddrs.v) {
+ std::string ip = i.ip_only_to_str();
+ if (auto [where, inserted] = did.insert(ip); inserted) {
+ f.dump_string("ip", ip);
+ }
+ }
+ f.close_section();
+ } else if (what == "have_local_config_map") {
+ f.dump_bool("have_local_config_map", have_local_config_map);
+ } else if (what == "active_clean_pgs"){
+ without_gil_t no_gil;
+ cluster_state.with_pgmap(
+ [&](const PGMap &pg_map) {
+ with_gil_t with_gil{no_gil};
+ f.open_array_section("pg_stats");
+ for (auto &i : pg_map.pg_stat) {
+ const auto state = i.second.state;
+ const auto pgid_raw = i.first;
+ const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed);
+ const auto reported_epoch = i.second.reported_epoch;
+ if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) {
+ f.open_object_section("pg_stat");
+ f.dump_string("pgid", pgid);
+ f.dump_string("state", pg_state_string(state));
+ f.dump_unsigned("reported_epoch", reported_epoch);
+ f.close_section();
+ }
+ }
+ f.close_section();
+ const auto num_pg = pg_map.num_pg;
+ f.dump_unsigned("total_num_pgs", num_pg);
+ });
+ } else {
+ derr << "Python module requested unknown data '" << what << "'" << dendl;
+ Py_RETURN_NONE;
+ }
+ if(ttl_seconds) { // must mirror the formatter choice made for `f` at the top
+ return jf.get();
+ } else {
+ return pf.get();
+ }
+}
+
+// Begin activating `py_module`: record it as pending and queue its load
+// on the finisher. Once loaded successfully the module is registered in
+// `modules` and its thread is created; a failed load is only logged.
+void ActivePyModules::start_one(PyModuleRef py_module)
+{
+  std::lock_guard l(lock);
+
+  const auto name = py_module->get_name();
+  auto active_module = std::make_shared<ActivePyModule>(py_module, clog);
+
+  pending_modules.insert(name);
+
+  // Send all python calls down a Finisher to avoid blocking
+  // C++ code, and avoid any potential lock cycles.
+  finisher.queue(new LambdaContext([this, active_module, name](int) {
+    const int rc = active_module->load(this);
+
+    std::lock_guard l(lock);
+    pending_modules.erase(name);
+
+    if (rc != 0) {
+      derr << "Failed to run module in active mode ('" << name << "')"
+           << dendl;
+      return;
+    }
+
+    auto inserted = modules.emplace(name, active_module);
+    ceph_assert(inserted.second); // actually inserted
+
+    dout(4) << "Starting thread for " << name << dendl;
+    active_module->thread.create(active_module->get_thread_name());
+  }));
+}
+
+// Stop every active module: call shutdown() on each, join each module
+// thread, drain and stop the command finisher, then forget the modules.
+//
+// `lock` is released around each individual shutdown() and join() call
+// and re-taken afterwards.
+void ActivePyModules::shutdown()
+{
+  std::lock_guard locker(lock);
+
+  // Pass 1: signal modules to drop out of serve() and/or tear down
+  // resources
+  for (auto& [name, module] : modules) {
+    lock.unlock();
+    dout(10) << "calling module " << name << " shutdown()" << dendl;
+    module->shutdown();
+    dout(10) << "module " << name << " shutdown() returned" << dendl;
+    lock.lock();
+  }
+
+  // Pass 2: for modules implementing serve(), finish the threads where
+  // we were running that.
+  for (auto& [name, module] : modules) {
+    lock.unlock();
+    dout(10) << "joining module " << name << dendl;
+    module->thread.join();
+    dout(10) << "joined module " << name << dendl;
+    lock.lock();
+  }
+
+  cmd_finisher.wait_for_empty();
+  cmd_finisher.stop();
+
+  modules.clear();
+}
+
+// Queue a (type, id) notification for every active module that the
+// registry says should receive `notify_type`.
+void ActivePyModules::notify_all(const std::string &notify_type,
+                                 const std::string &notify_id)
+{
+  std::lock_guard l(lock);
+
+  dout(10) << __func__ << ": notify_all " << notify_type << dendl;
+  for (auto& [name, module] : modules) {
+    const bool wanted = py_module_registry.should_notify(name, notify_type);
+    if (wanted) {
+      // Send all python calls down a Finisher to avoid blocking
+      // C++ code, and avoid any potential lock cycles.
+      dout(15) << "queuing notify (" << notify_type << ") to " << name << dendl;
+      // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984
+      finisher.queue(new LambdaContext(
+        [module=module, notify_type, notify_id](int r) {
+          module->notify(notify_type, notify_id);
+        }));
+    }
+  }
+}
+
+// Queue a cluster log notification for every active module that the
+// registry says should receive "clog" notifications.
+void ActivePyModules::notify_all(const LogEntry &log_entry)
+{
+  std::lock_guard l(lock);
+
+  dout(10) << __func__ << ": notify_all (clog)" << dendl;
+  for (auto& [name, module] : modules) {
+    const bool wanted = py_module_registry.should_notify(name, "clog");
+    if (wanted) {
+      // Send all python calls down a Finisher to avoid blocking
+      // C++ code, and avoid any potential lock cycles.
+      //
+      // Note intentional use of non-reference lambda binding on
+      // log_entry: we take a copy because caller's instance is
+      // probably ephemeral.
+      dout(15) << "queuing notify (clog) to " << name << dendl;
+      // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984
+      finisher.queue(new LambdaContext(
+        [module=module, log_entry](int r) {
+          module->notify_clog(log_entry);
+        }));
+    }
+  }
+}
+
+// Look up `key` for `module_name` in the in-memory store cache.
+// On a hit, copies the value into *val and returns true; on a miss
+// returns false and leaves *val untouched.
+bool ActivePyModules::get_store(const std::string &module_name,
+    const std::string &key, std::string *val) const
+{
+  without_gil_t no_gil;
+  std::lock_guard l(lock);
+
+  const std::string global_key = PyModule::mgr_store_prefix
+    + module_name + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+  const auto entry = store_cache.find(global_key);
+  if (entry == store_cache.end()) {
+    return false;
+  }
+
+  *val = entry->second;
+  return true;
+}
+
+// Forward a remote method call to the active module named `other_module`.
+// The target module must be present in `modules` (asserted).
+PyObject *ActivePyModules::dispatch_remote(
+    const std::string &other_module,
+    const std::string &method,
+    PyObject *args,
+    PyObject *kwargs,
+    std::string *err)
+{
+  const auto target = modules.find(other_module);
+  ceph_assert(target != modules.end());
+
+  auto &module = target->second;
+  return module->dispatch_remote(method, args, kwargs, err);
+}
+
+/**
+ * Look up a module option in module_config.
+ *
+ * The lookup key is "mgr/" + module_name + "/" + key, and the search runs
+ * under module_config.lock.
+ *
+ * @param module_name module owning the option
+ * @param key         option name within the module
+ * @param val         receives the raw value when found; untouched otherwise
+ * @return true if the option was present
+ */
+bool ActivePyModules::get_config(const std::string &module_name,
+                                 const std::string &key, std::string *val) const
+{
+  const std::string global_key = "mgr/" + module_name + "/" + key;
+
+  dout(20) << " key: " << global_key << dendl;
+
+  std::lock_guard config_locker(module_config.lock);
+
+  const auto entry = module_config.config.find(global_key);
+  if (entry == module_config.config.end()) {
+    return false;
+  }
+  *val = entry->second;
+  return true;
+}
+
+/**
+ * Fetch a module option and convert it to a typed Python object.
+ *
+ * When @p prefix is non-empty, "<prefix>/<key>" is tried first and the bare
+ * @p key is used as the fallback. The lookups run with the GIL dropped
+ * (without_gil_t); it is taken again before any Python object is produced.
+ *
+ * @param module_name module owning the option
+ * @param key         option name
+ * @param prefix      optional prefix tried before the bare key
+ * @return the module's typed value for the option, or None when the option
+ *         is not set or the module is not available
+ */
+PyObject *ActivePyModules::get_typed_config(
+  const std::string &module_name,
+  const std::string &key,
+  const std::string &prefix) const
+{
+  without_gil_t no_gil;
+  const bool has_prefix = !prefix.empty();
+  std::string value;
+  std::string final_key;
+  bool found = false;
+
+  if (has_prefix) {
+    final_key = prefix + "/" + key;
+    found = get_config(module_name, final_key, &value);
+  }
+  if (!found) {
+    final_key = key;
+    found = get_config(module_name, final_key, &value);
+  }
+
+  if (!found) {
+    if (has_prefix) {
+      dout(10) << " [" << prefix << "/]" << key << " not found "
+               << dendl;
+    } else {
+      dout(10) << " " << key << " not found " << dendl;
+    }
+    with_gil_t gil_holder{no_gil};
+    Py_RETURN_NONE;
+  }
+
+  // Resolve the module before re-taking the GIL.
+  PyModuleRef py_module = py_module_registry.get_module(module_name);
+  with_gil_t gil_holder{no_gil};
+  if (!py_module) {
+    derr << "Module '" << module_name << "' is not available" << dendl;
+    Py_RETURN_NONE;
+  }
+  // The value itself is intentionally kept out of the log so that
+  // sensitive data does not end up in mgr logs. For debugging only:
+  // dout(10) << __func__ << " " << final_key << " found: " << value << dendl;
+  dout(10) << __func__ << " " << final_key << " found" << dendl;
+  return py_module->get_typed_option_value(key, value);
+}
+
+/**
+ * Return every store entry of a module whose key starts with @p prefix.
+ *
+ * Keys in the result are reported relative to the module's namespace, i.e.
+ * with PyModule::mgr_store_prefix + module_name + "/" stripped off.
+ *
+ * @param module_name module owning the keys
+ * @param prefix      key prefix inside the module's namespace
+ * @return Python object built by PyFormatter from the matching entries
+ */
+PyObject *ActivePyModules::get_store_prefix(const std::string &module_name,
+                                            const std::string &prefix) const
+{
+  without_gil_t no_gil;
+  std::lock_guard locker(lock);
+  std::lock_guard config_locker(module_config.lock);
+
+  const std::string base_prefix =
+    PyModule::mgr_store_prefix + module_name + "/";
+  const std::string global_prefix = base_prefix + prefix;
+  dout(4) << __func__ << " prefix: " << global_prefix << dendl;
+
+  // The formatter is created, filled and drained while holding the GIL.
+  return with_gil(no_gil, [&] {
+    PyFormatter f;
+    auto entry = store_cache.lower_bound(global_prefix);
+    while (entry != store_cache.end() &&
+           entry->first.find(global_prefix) == 0) {
+      f.dump_string(entry->first.c_str() + base_prefix.size(), entry->second);
+      ++entry;
+    }
+    return f.get();
+  });
+}
+
+/**
+ * Set or delete a module's key in the mon config-key store.
+ *
+ * The local store_cache is updated first, then a "config-key set" (value
+ * present) or "config-key del" (value absent) command is sent to the mon
+ * and waited for. The wait happens after ActivePyModules::lock has been
+ * released.
+ *
+ * @param module_name module owning the key
+ * @param key         key within that module's namespace
+ * @param val         new value, or boost::none to delete the key
+ */
+void ActivePyModules::set_store(const std::string &module_name,
+    const std::string &key, const boost::optional<std::string>& val)
+{
+  const std::string global_key = PyModule::mgr_store_prefix
+                                   + module_name + "/" + key;
+  // Name of the config-key sub-command actually issued below.
+  const char *verb = val ? "set" : "del";
+
+  Command set_cmd;
+  {
+    std::lock_guard l(lock);
+
+    // NOTE: this isn't strictly necessary since we'll also get an MKVData
+    // update from the mon due to our subscription *before* our command is acked.
+    if (val) {
+      store_cache[global_key] = *val;
+    } else {
+      store_cache.erase(global_key);
+    }
+
+    std::ostringstream cmd_json;
+    JSONFormatter jf;
+    jf.open_object_section("cmd");
+    if (val) {
+      jf.dump_string("prefix", "config-key set");
+    } else {
+      jf.dump_string("prefix", "config-key del");
+    }
+    jf.dump_string("key", global_key);
+    if (val) {
+      jf.dump_string("val", *val);
+    }
+    jf.close_section();
+    jf.flush(cmd_json);
+    set_cmd.run(&monc, cmd_json.str());
+  }
+  set_cmd.wait();
+
+  if (set_cmd.r != 0) {
+    // config-key set will fail if mgr's auth key has insufficient
+    // permission to set config keys
+    // FIXME: should this somehow raise an exception back into Python land?
+    //
+    // The value is deliberately not logged: store entries may hold
+    // sensitive data, and get_typed_config() keeps values out of the
+    // log for the same reason.
+    dout(0) << "`config-key " << verb << " " << global_key << "` failed: "
+            << cpp_strerror(set_cmd.r) << dendl;
+    dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+  }
+}
+
+/**
+ * Set or clear a module option.
+ *
+ * Pure delegation: module_config does the work and is handed monc.
+ *
+ * @param module_name module owning the option
+ * @param key         option name
+ * @param val         new value, forwarded unchanged (may be boost::none)
+ */
+void ActivePyModules::set_config(
+  const std::string &module_name,
+  const std::string &key,
+  const boost::optional<std::string>& val)
+{
+  module_config.set_config(&monc, module_name, key, val);
+}
+
+/**
+ * Collect the URI published by each active module.
+ *
+ * @return map of module name to URI; modules whose get_uri() is empty
+ *         are left out
+ */
+std::map<std::string, std::string> ActivePyModules::get_services() const
+{
+  std::map<std::string, std::string> services;
+  std::lock_guard locker(lock);
+  for (const auto& [mod_name, mod] : modules) {
+    if (std::string uri = mod->get_uri(); !uri.empty()) {
+      services[mod_name] = uri;
+    }
+  }
+
+  return services;
+}
+
+/**
+ * Apply a key/value update to store_cache.
+ *
+ * A non-incremental update first drops every cached key that starts with
+ * @p prefix. Each entry of @p data is then stored (value present) or erased
+ * (value absent). If any touched key starts with "config/", the derived
+ * config_map is rebuilt.
+ *
+ * @param prefix      key prefix the update covers
+ * @param incremental false for a full replacement of the prefix
+ * @param data        keys to set or remove
+ */
+void ActivePyModules::update_kv_data(
+  const std::string prefix,
+  bool incremental,
+  const map<std::string, boost::optional<bufferlist>, std::less<>>& data)
+{
+  std::lock_guard locker(lock);
+
+  if (incremental) {
+    dout(10) << "incremental update on " << prefix << dendl;
+  } else {
+    dout(10) << "full update on " << prefix << dendl;
+    for (auto stale = store_cache.lower_bound(prefix);
+         stale != store_cache.end() && stale->first.find(prefix) == 0; ) {
+      dout(20) << " rm prior " << stale->first << dendl;
+      stale = store_cache.erase(stale);
+    }
+  }
+
+  bool config_touched = false;
+  for (const auto& [key, value] : data) {
+    if (value) {
+      dout(20) << " set " << key << " = " << value->to_str() << dendl;
+      store_cache[key] = value->to_str();
+    } else {
+      dout(20) << " rm " << key << dendl;
+      store_cache.erase(key);
+    }
+    if (key.find("config/") == 0) {
+      config_touched = true;
+    }
+  }
+
+  if (config_touched) {
+    _refresh_config_map();
+  }
+}
+
+/**
+ * Rebuild config_map from the "config/" keys held in store_cache.
+ *
+ * Keys under "config/mgr/" (module options) are skipped. Options unknown
+ * to g_conf() are registered as stray string options. Entries with an
+ * invalid mask, or whose option carries FLAG_NO_MON_UPDATE, are logged and
+ * left out.
+ *
+ * This function does not take ActivePyModules::lock itself;
+ * update_kv_data() calls it with the lock already held.
+ */
+void ActivePyModules::_refresh_config_map()
+{
+  // Prefix under which the config keys are stored in store_cache.
+  const std::string config_prefix = "config/";
+
+  dout(10) << dendl;
+  config_map.clear();
+  for (auto p = store_cache.lower_bound(config_prefix);
+       p != store_cache.end() && p->first.find(config_prefix) == 0;
+       ++p) {
+    const std::string key = p->first.substr(config_prefix.size());
+    if (key.find("mgr/") == 0) {
+      // NOTE: for now, we ignore module options.  see also ceph_foreign_option_get().
+      continue;
+    }
+    std::string value = p->second;
+    std::string name;
+    std::string who;
+    config_map.parse_key(key, &name, &who);
+
+    const Option *opt = g_conf().find_option(name);
+    if (!opt) {
+      // Unknown option: keep it around as a stray string option.
+      config_map.stray_options.push_back(
+        std::unique_ptr<Option>(
+          new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN)));
+      opt = config_map.stray_options.back().get();
+    }
+
+    std::string err;
+    int r = opt->pre_validate(&value, &err);
+    if (r < 0) {
+      // Validation failure is only logged; the entry is still processed.
+      dout(10) << __func__ << " pre-validate failed on '" << name << "' = '"
+               << value << "' for " << who << ": " << err << dendl;
+    }
+
+    MaskedOption mopt(opt);
+    mopt.raw_value = value;
+    std::string section_name;
+    if (who.size() &&
+        !ConfigMap::parse_mask(who, &section_name, &mopt.mask)) {
+      derr << __func__ << " invalid mask for key " << key << dendl;
+    } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+      dout(10) << __func__ << " NO_MON_UPDATE option '"
+               << name << "' = '" << value << "' for " << who
+               << dendl;
+    } else {
+      // A section name containing '.' selects a by-id section, any other
+      // non-global name a by-type section.
+      Section *section = &config_map.global;
+      if (section_name.size() && section_name != "global") {
+        if (section_name.find('.') != std::string::npos) {
+          section = &config_map.by_id[section_name];
+        } else {
+          section = &config_map.by_type[section_name];
+        }
+      }
+      section->options.insert(std::make_pair(name, std::move(mopt)));
+    }
+  }
+}
+
+/**
+ * Run @p fct on one perf counter of one daemon and return what it dumped.
+ *
+ * The result is an array section named after @p path. @p fct is invoked
+ * with the GIL held, on copies of the counter instance and type, and only
+ * when both the daemon and the counter exist; otherwise the array stays
+ * empty. Because fct runs with the GIL held it must not acquire locks.
+ *
+ * @param fct      callback that dumps the counter into the formatter
+ * @param svc_name daemon type
+ * @param svc_id   daemon id
+ * @param path     perf counter path
+ * @return Python object produced by the formatter
+ */
+PyObject* ActivePyModules::with_perf_counters(
+    std::function<void(PerfCounterInstance& counter_instance, PerfCounterType& counter_type, PyFormatter& f)> fct,
+    const std::string &svc_name,
+    const std::string &svc_id,
+    const std::string &path) const
+{
+  PyFormatter f;
+  f.open_array_section(path);
+  {
+    // Daemon state is inspected with the GIL dropped; it is re-taken only
+    // around the callback.
+    without_gil_t no_gil;
+    std::lock_guard l(lock);
+    auto metadata = daemon_state.get(DaemonKey{svc_name, svc_id});
+    if (metadata) {
+      std::lock_guard l2(metadata->lock);
+      const auto& instances = metadata->perf_counters.instances;
+      if (auto found = instances.find(path); found != instances.end()) {
+        // The callback works on copies, as it takes mutable references.
+        auto counter_instance = found->second;
+        auto counter_type = metadata->perf_counters.types.at(path);
+        with_gil(no_gil, [&] {
+          fct(counter_instance, counter_type, f);
+        });
+      } else {
+        dout(4) << "Missing counter: '" << path << "' ("
+                << svc_name << "." << svc_id << ")" << dendl;
+        dout(20) << "Paths are:" << dendl;
+        for (const auto &i : instances) {
+          dout(20) << i.first << dendl;
+        }
+      }
+    } else {
+      dout(4) << "No daemon state for " << svc_name << "." << svc_id
+              << dendl;
+    }
+  }
+  f.close_section();
+  return f.get();
+}
+
+/**
+ * Return all recorded data points of one perf counter.
+ *
+ * Each data point becomes a "datapoint" array: (t, s, c) for counters
+ * flagged PERFCOUNTER_LONGRUNAVG, (t, v) for all others.
+ *
+ * @param svc_name daemon type
+ * @param svc_id   daemon id
+ * @param path     perf counter path
+ * @return result of with_perf_counters() for this extractor
+ */
+PyObject* ActivePyModules::get_counter_python(
+    const std::string &svc_name,
+    const std::string &svc_id,
+    const std::string &path)
+{
+  auto extract_counters = [](
+      PerfCounterInstance& counter_instance,
+      PerfCounterType& counter_type,
+      PyFormatter& f)
+  {
+    const bool is_longrunavg = counter_type.type & PERFCOUNTER_LONGRUNAVG;
+    if (!is_longrunavg) {
+      for (const auto &point : counter_instance.get_data()) {
+        f.open_array_section("datapoint");
+        f.dump_float("t", point.t);
+        f.dump_unsigned("v", point.v);
+        f.close_section();
+      }
+      return;
+    }
+
+    for (const auto &point : counter_instance.get_data_avg()) {
+      f.open_array_section("datapoint");
+      f.dump_float("t", point.t);
+      f.dump_unsigned("s", point.s);
+      f.dump_unsigned("c", point.c);
+      f.close_section();
+    }
+  };
+  return with_perf_counters(extract_counters, svc_name, svc_id, path);
+}
+
+/**
+ * Return only the most recent data point of one perf counter.
+ *
+ * Dumps (t, s, c) for counters flagged PERFCOUNTER_LONGRUNAVG and (t, v)
+ * for all others, directly into the array opened by with_perf_counters().
+ *
+ * @param svc_name daemon type
+ * @param svc_id   daemon id
+ * @param path     perf counter path
+ * @return result of with_perf_counters() for this extractor
+ */
+PyObject* ActivePyModules::get_latest_counter_python(
+    const std::string &svc_name,
+    const std::string &svc_id,
+    const std::string &path)
+{
+  auto extract_latest_counters = [](
+      PerfCounterInstance& counter_instance,
+      PerfCounterType& counter_type,
+      PyFormatter& f)
+  {
+    const bool is_longrunavg = counter_type.type & PERFCOUNTER_LONGRUNAVG;
+    if (!is_longrunavg) {
+      const auto &latest = counter_instance.get_latest_data();
+      f.dump_float("t", latest.t);
+      f.dump_unsigned("v", latest.v);
+      return;
+    }
+
+    const auto &latest = counter_instance.get_latest_data_avg();
+    f.dump_float("t", latest.t);
+    f.dump_unsigned("s", latest.s);
+    f.dump_unsigned("c", latest.c);
+  };
+  return with_perf_counters(extract_latest_counters, svc_name, svc_id, path);
+}
+
+/**
+ * Describe the perf counters known for a set of daemons.
+ *
+ * Daemon selection: every daemon when @p svc_type is empty, every daemon
+ * of the service when only @p svc_id is empty, otherwise the single
+ * daemon (svc_type, svc_id) if it exists.
+ *
+ * For each daemon an object section keyed by its name is emitted, with
+ * one sub-section per counter carrying description, optional nick, type,
+ * priority and units.
+ *
+ * @param svc_type daemon type, or "" for all
+ * @param svc_id   daemon id, or "" for all daemons of the type
+ * @return Python object produced by the formatter
+ */
+PyObject* ActivePyModules::get_perf_schema_python(
+    const std::string &svc_type,
+    const std::string &svc_id)
+{
+  without_gil_t no_gil;
+  std::lock_guard l(lock);
+
+  DaemonStateCollection daemons;
+
+  if (svc_type == "") {
+    daemons = daemon_state.get_all();
+  } else if (svc_id.empty()) {
+    daemons = daemon_state.get_by_service(svc_type);
+  } else {
+    auto key = DaemonKey{svc_type, svc_id};
+    // so that the below can be a loop in all cases
+    auto got = daemon_state.get(key);
+    if (got != nullptr) {
+      daemons[key] = got;
+    }
+  }
+
+  auto f = with_gil(no_gil, [&] {
+    return PyFormatter();
+  });
+  if (!daemons.empty()) {
+    for (auto& [key, state] : daemons) {
+      // Per-daemon lock is taken before the GIL, never the other way round.
+      std::lock_guard l(state->lock);
+      with_gil(no_gil, [&, key=ceph::to_string(key), state=state] {
+        f.open_object_section(key.c_str());
+        // Iterate and look up by reference: copying each instance and type
+        // per iteration is unnecessary work while holding both locks.
+        for (const auto& ctr_inst : state->perf_counters.instances) {
+          const auto &counter_name = ctr_inst.first;
+          f.open_object_section(counter_name.c_str());
+          const auto &type = state->perf_counters.types[counter_name];
+          f.dump_string("description", type.description);
+          if (!type.nick.empty()) {
+            f.dump_string("nick", type.nick);
+          }
+          f.dump_unsigned("type", type.type);
+          f.dump_unsigned("priority", type.priority);
+          f.dump_unsigned("units", type.unit);
+          f.close_section();
+        }
+        f.close_section();
+      });
+    }
+  } else {
+    dout(4) << __func__ << ": No daemon state found for "
+            << svc_type << "." << svc_id << dendl;
+  }
+  // f.get() hands back a Python object, so take the GIL for it the same
+  // way get_store_prefix() does.
+  return with_gil(no_gil, [&] {
+    return f.get();
+  });
+}
+
+/**
+ * Wrap the global CephContext in a Python capsule.
+ *
+ * @return result of PyCapsule_New() on g_ceph_context, created with no
+ *         name and no destructor
+ */
+PyObject *ActivePyModules::get_context()
+{
+  // The lock guard is constructed inside without_gil() and then held
+  // until this function returns.
+  auto locker = without_gil([&] {
+    return std::lock_guard(lock);
+  });
+  // The context's ref count is neither incremented nor decremented:
+  // it is the global one and it has process lifetime.
+  return PyCapsule_New(g_ceph_context, nullptr, nullptr);
+}
+
+/**
+ * Helper for our wrapped types that take a capsule in their constructor.
+ *
+ * Imports @p module_name, looks up the class @p clsname in it and calls
+ * that class with a single argument: a capsule holding @p wrapped. No
+ * destructor is registered on the capsule. Any failure is logged and then
+ * trips ceph_assert.
+ *
+ * Uses the Python C API throughout, so the caller is expected to hold the
+ * GIL.
+ *
+ * @param module_name Python module that defines the wrapper class
+ * @param clsname     name of the wrapper class
+ * @param wrapped     pointer stored in the capsule
+ * @return the newly constructed wrapper instance
+ */
+PyObject *construct_with_capsule(
+    const std::string &module_name,
+    const std::string &clsname,
+    void *wrapped)
+{
+  // Look up the wrapper type which we will construct
+  PyObject *module = PyImport_ImportModule(module_name.c_str());
+  if (!module) {
+    derr << "Failed to import python module '" << module_name << "':" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  ceph_assert(module);
+
+  PyObject *wrapper_type = PyObject_GetAttrString(module, clsname.c_str());
+  if (!wrapper_type) {
+    derr << "Failed to get python type '" << clsname << "':" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  ceph_assert(wrapper_type);
+
+  // Construct a capsule containing the wrapped pointer.
+  auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr);
+  ceph_assert(wrapped_capsule);
+
+  // Construct the python wrapper object
+  auto pArgs = PyTuple_Pack(1, wrapped_capsule);
+  ceph_assert(pArgs);
+  auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs);
+  if (wrapper_instance == nullptr) {
+    derr << "Failed to construct python " << clsname << ":" << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+  ceph_assert(wrapper_instance != nullptr);
+
+  // Drop the temporary references taken above.
+  Py_DECREF(pArgs);
+  Py_DECREF(wrapped_capsule);
+
+  Py_DECREF(wrapper_type);
+  Py_DECREF(module);
+
+  return wrapper_instance;
+}
+
+/**
+ * Hand Python a private copy of the current OSDMap.
+ *
+ * A new OSDMap is allocated and filled via deepish_copy_from() inside
+ * without_gil(), then wrapped in the "OSDMap" class of the "mgr_module"
+ * Python module.
+ *
+ * @return the Python wrapper built by construct_with_capsule()
+ */
+PyObject *ActivePyModules::get_osdmap()
+{
+  auto copied_map = without_gil([&] {
+    auto *fresh = new OSDMap;
+    cluster_state.with_osdmap([fresh](const OSDMap& current) {
+      fresh->deepish_copy_from(current);
+    });
+    return fresh;
+  });
+  return construct_with_capsule("mgr_module", "OSDMap",
+                                static_cast<void*>(copied_map));
+}
+
+/**
+ * Return the value of a built-in option as it applies to another entity.
+ *
+ * When no local ConfigMap is available (have_local_config_map is false)
+ * the value is fetched with a "config get" mon command. Otherwise it is
+ * computed locally by generating the entity's config from config_map,
+ * falling back to the option's daemon default (non-client entities that
+ * have one) or its plain default.
+ *
+ * @param who  entity the option is evaluated for, e.g. "osd.0" or "osd"
+ * @param name option name; only built-in options are supported
+ * @return typed Python value, or nullptr with KeyError set when the option
+ *         or the entity is not recognised
+ */
+PyObject *ActivePyModules::get_foreign_config(
+  const std::string& who,
+  const std::string& name)
+{
+  dout(10) << "ceph_foreign_option_get " << who << " " << name << dendl;
+
+  // NOTE: for now this will only work with build-in options, not module options.
+  const Option *opt = g_conf().find_option(name);
+  if (!opt) {
+    dout(4) << "ceph_foreign_option_get " << name << " not found " << dendl;
+    PyErr_Format(PyExc_KeyError, "option not found: %s", name.c_str());
+    return nullptr;
+  }
+
+  // If the monitors are not yet running pacific, we cannot rely on our local
+  // ConfigMap
+  if (!have_local_config_map) {
+    dout(20) << "mon cluster wasn't pacific when we started: falling back to 'config get'"
+             << dendl;
+    without_gil_t no_gil;
+    Command cmd;
+    {
+      std::lock_guard l(lock);
+      // Build the command with a formatter, as set_store() does, so that
+      // `who` and `name` are escaped properly.
+      std::ostringstream cmd_json;
+      JSONFormatter jf;
+      jf.open_object_section("cmd");
+      jf.dump_string("prefix", "config get");
+      jf.dump_string("who", who);
+      jf.dump_string("key", name);
+      jf.close_section();
+      jf.flush(cmd_json);
+      cmd.run(&monc, cmd_json.str());
+    }
+    cmd.wait();
+    dout(10) << "ceph_foreign_option_get (mon command) " << who << " " << name << " = "
+             << cmd.outbl.to_str() << dendl;
+    with_gil_t gil(no_gil);
+    return get_python_typed_option_value(opt->type, cmd.outbl.to_str());
+  }
+
+  // mimic the behavor of mon/ConfigMonitor's 'config get' command
+  EntityName entity;
+  if (!entity.from_str(who) &&
+      !entity.from_str(who + ".")) {
+    dout(5) << "unrecognized entity '" << who << "'" << dendl;
+    PyErr_Format(PyExc_KeyError, "invalid entity: %s", who.c_str());
+    return nullptr;
+  }
+
+  without_gil_t no_gil;
+  std::string value;
+  {
+    // Scoped guard: the lock is released before the GIL is taken again,
+    // and also if anything below throws.
+    std::lock_guard locker(lock);
+
+    // FIXME: this is super inefficient, since we generate the entire daemon
+    // config just to extract one value from it!
+
+    std::map<std::string,std::string,std::less<>> config;
+    cluster_state.with_osdmap([&](const OSDMap &osdmap) {
+      std::map<std::string,std::string> crush_location;
+      std::string device_class;
+      if (entity.is_osd()) {
+        osdmap.crush->get_full_location(who, &crush_location);
+        int id = atoi(entity.get_id().c_str());
+        const char *c = osdmap.crush->get_item_class(id);
+        if (c) {
+          device_class = c;
+        }
+        dout(10) << __func__ << " crush_location " << crush_location
+                 << " class " << device_class << dendl;
+      }
+
+      std::map<std::string,std::pair<std::string,const MaskedOption*>> src;
+      config = config_map.generate_entity_map(
+        entity,
+        crush_location,
+        osdmap.crush.get(),
+        device_class,
+        &src);
+    });
+
+    // get a single value
+    auto p = config.find(name);
+    if (p != config.end()) {
+      value = p->second;
+    } else {
+      if (!entity.is_client() &&
+          !boost::get<boost::blank>(&opt->daemon_value)) {
+        value = Option::to_str(opt->daemon_value);
+      } else {
+        value = Option::to_str(opt->value);
+      }
+    }
+
+    dout(10) << "ceph_foreign_option_get (configmap) " << who << " " << name << " = "
+             << value << dendl;
+  }
+  with_gil_t with_gil(no_gil);
+  return get_python_typed_option_value(opt->type, value);
+}
+
+/**
+ * Replace the health checks reported by one module.
+ *
+ * If the module exists and its set_health_checks() reports a change, a
+ * tick is scheduled on the DaemonServer. A module that is not active is
+ * silently ignored.
+ *
+ * @param module_name module reporting the checks
+ * @param checks      new set of checks; moved into the module
+ */
+void ActivePyModules::set_health_checks(const std::string& module_name,
+                                        health_check_map_t&& checks)
+{
+  bool changed = false;
+
+  {
+    // Scoped guard so that the lock is dropped even if the module throws.
+    std::lock_guard locker(lock);
+    auto p = modules.find(module_name);
+    if (p != modules.end()) {
+      changed = p->second->set_health_checks(std::move(checks));
+    }
+  }
+
+  // immediately schedule a report to be sent to the monitors with the new
+  // health checks that have changed. This is done asynchronusly to avoid
+  // blocking python land. ActivePyModules::lock needs to be dropped to make
+  // lockdep happy:
+  //
+  //   send_report callers: DaemonServer::lock -> PyModuleRegistery::lock
+  //   active_start: PyModuleRegistry::lock -> ActivePyModules::lock
+  //
+  // if we don't release this->lock before calling schedule_tick a cycle is
+  // formed with the addition of ActivePyModules::lock -> DaemonServer::lock.
+  // This is still correct as send_report is run asynchronously under
+  // DaemonServer::lock.
+  if (changed) {
+    server.schedule_tick(0);
+  }
+}
+
+/**
+ * Route a command to the module that registered it.
+ *
+ * The module is looked up under ActivePyModules::lock, but its
+ * handle_command() is called after the lock has been released.
+ *
+ * @param module_command command descriptor; names the target module
+ * @param session        session issuing the command, passed through
+ * @param cmdmap         parsed command arguments, passed through
+ * @param inbuf          input payload, passed through
+ * @param ds             output stream, passed through
+ * @param ss             status stream; receives the error text when the
+ *                       module is not available
+ * @return -ENOENT if the module is not active, otherwise the module's
+ *         own return value
+ */
+int ActivePyModules::handle_command(
+  const ModuleCommand& module_command,
+  const MgrSession& session,
+  const cmdmap_t &cmdmap,
+  const bufferlist &inbuf,
+  std::stringstream *ds,
+  std::stringstream *ss)
+{
+  // Take our own reference while the lock is held: the map iterator must
+  // not be used once the lock has been dropped.
+  std::shared_ptr<ActivePyModule> target;
+  {
+    std::lock_guard locker(lock);
+    auto mod_iter = modules.find(module_command.module_name);
+    if (mod_iter == modules.end()) {
+      *ss << "Module '" << module_command.module_name << "' is not available";
+      return -ENOENT;
+    }
+    target = mod_iter->second;
+  }
+
+  return target->handle_command(module_command, session, cmdmap,
+                                inbuf, ds, ss);
+}
+
+/**
+ * Ask every active module for its health checks.
+ *
+ * @param checks output map, handed to each module's get_health_checks()
+ *               in turn
+ */
+void ActivePyModules::get_health_checks(health_check_map_t *checks)
+{
+  std::lock_guard locker(lock);
+  for (auto& entry : modules) {
+    dout(15) << "getting health checks for " << entry.first << dendl;
+    entry.second->get_health_checks(checks);
+  }
+}
+
+/**
+ * Create or update the progress event stored under @p evid.
+ *
+ * @param evid          event id; a new event is created if it is unknown
+ * @param desc          message stored on the event
+ * @param progress      progress value stored on the event
+ * @param add_to_ceph_s flag stored on the event
+ */
+void ActivePyModules::update_progress_event(
+  const std::string& evid,
+  const std::string& desc,
+  float progress,
+  bool add_to_ceph_s)
+{
+  std::lock_guard locker(lock);
+  // operator[] creates the entry on first use.
+  auto& event = progress_events[evid];
+  event.message = desc;
+  event.progress = progress;
+  event.add_to_ceph_s = add_to_ceph_s;
+}
+
+/**
+ * Drop the progress event stored under @p evid, if any.
+ *
+ * @param evid id of the event to remove
+ */
+void ActivePyModules::complete_progress_event(const std::string& evid)
+{
+  std::lock_guard locker(lock);
+  progress_events.erase(evid);
+}
+
+/**
+ * Drop every stored progress event.
+ */
+void ActivePyModules::clear_all_progress_events()
+{
+  std::lock_guard locker(lock);
+  progress_events.clear();
+}
+
+/**
+ * Copy the current progress events out under the lock.
+ *
+ * @param events overwritten with a copy of all stored events
+ */
+void ActivePyModules::get_progress_events(
+  std::map<std::string,ProgressEvent> *events)
+{
+  std::lock_guard locker(lock);
+  *events = progress_events;
+}
+
+/**
+ * Queue a config_notify() call for every active module.
+ *
+ * The calls are handed to the finisher rather than made inline.
+ */
+void ActivePyModules::config_notify()
+{
+  std::lock_guard locker(lock);
+  for (auto& [mod_name, mod] : modules) {
+    // Python calls always go down a Finisher so that C++ code is not
+    // blocked and no lock cycle can be created.
+    dout(15) << "notify (config) " << mod_name << dendl;
+    // The init-capture of the structured binding is a workaround for
+    // https://bugs.llvm.org/show_bug.cgi?id=35984
+    finisher.queue(new LambdaContext(
+      [target=mod](int r) {
+        target->config_notify();
+      }));
+  }
+}
+
+/**
+ * Record the URI published by a module.
+ *
+ * The module is fetched with map::at(), so an unknown @p module_name
+ * raises std::out_of_range.
+ *
+ * @param module_name module publishing the URI
+ * @param uri         URI to store on the module
+ */
+void ActivePyModules::set_uri(const std::string& module_name,
+                              const std::string &uri)
+{
+  std::lock_guard locker(lock);
+
+  dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl;
+
+  auto& target = modules.at(module_name);
+  target->set_uri(uri);
+}
+
+/**
+ * Record a device's wear level locally and publish its metadata to the mon.
+ *
+ * The wear level is set on the device through daemon_state.with_device();
+ * the device's metadata map is then written as a JSON object to the
+ * config-key "device/<devid>". A failing mon command is logged.
+ *
+ * @param devid      device identifier
+ * @param wear_level wear level to store on the device
+ */
+void ActivePyModules::set_device_wear_level(const std::string& devid,
+                                            float wear_level)
+{
+  // update mgr state
+  std::map<std::string,std::string> meta;
+  daemon_state.with_device(
+    devid,
+    [wear_level, &meta] (DeviceState& dev) {
+      dev.set_wear_level(wear_level);
+      meta = dev.metadata;
+    });
+
+  // tell mon
+  json_spirit::Object json_object;
+  for (auto& i : meta) {
+    json_spirit::Config::add(json_object, i.first, i.second);
+  }
+  bufferlist json;
+  json.append(json_spirit::write(json_object));
+
+  // Build the command with a formatter, as set_store() does, so that the
+  // device id is escaped properly.
+  std::ostringstream cmd_json;
+  JSONFormatter jf;
+  jf.open_object_section("cmd");
+  jf.dump_string("prefix", "config-key set");
+  jf.dump_string("key", "device/" + devid);
+  jf.close_section();
+  jf.flush(cmd_json);
+
+  Command set_cmd;
+  set_cmd.run(&monc, cmd_json.str(), json);
+  set_cmd.wait();
+
+  if (set_cmd.r != 0) {
+    dout(0) << "`config-key set device/" << devid << "` failed: "
+            << cpp_strerror(set_cmd.r) << dendl;
+  }
+}
+
+/**
+ * Register an OSD perf query; pure delegation to the DaemonServer.
+ *
+ * @param query query to register
+ * @param limit optional limit, forwarded unchanged
+ * @return the id returned by the server
+ */
+MetricQueryID ActivePyModules::add_osd_perf_query(
+  const OSDPerfMetricQuery &query,
+  const std::optional<OSDPerfMetricLimit> &limit)
+{
+  const MetricQueryID query_id = server.add_osd_perf_query(query, limit);
+  return query_id;
+}
+
+/**
+ * Unregister an OSD perf query. A failure is logged, not reported.
+ *
+ * @param query_id id returned by add_osd_perf_query()
+ */
+void ActivePyModules::remove_osd_perf_query(MetricQueryID query_id)
+{
+  const int r = server.remove_osd_perf_query(query_id);
+  if (r >= 0) {
+    return;
+  }
+  dout(0) << "remove_osd_perf_query for query_id=" << query_id << " failed: "
+          << cpp_strerror(r) << dendl;
+}
+
+/**
+ * Fetch the counters collected for an OSD perf query.
+ *
+ * Output layout: a "counters" array with one object per key, holding "k"
+ * (array of sub-key arrays of strings) and "c" (array of value pairs).
+ *
+ * @param query_id id returned by add_osd_perf_query()
+ * @return Python object produced by the formatter, or None if the server
+ *         call fails (the failure is logged)
+ */
+PyObject *ActivePyModules::get_osd_perf_counters(MetricQueryID query_id)
+{
+  OSDPerfCollector collector(query_id);
+  if (const int r = server.get_osd_perf_counters(&collector); r < 0) {
+    dout(0) << "get_osd_perf_counters for query_id=" << query_id << " failed: "
+            << cpp_strerror(r) << dendl;
+    Py_RETURN_NONE;
+  }
+
+  PyFormatter f;
+  const std::map<OSDPerfMetricKey, PerformanceCounters> &collected =
+    collector.counters;
+
+  f.open_array_section("counters");
+  for (const auto &[metric_key, values] : collected) {
+    f.open_object_section("i");
+
+    f.open_array_section("k");
+    for (const auto &part : metric_key) {
+      f.open_array_section("s");
+      for (size_t idx = 0; idx < part.size(); ++idx) {
+        f.dump_string(stringify(idx).c_str(), part[idx]);
+      }
+      f.close_section(); // s
+    }
+    f.close_section(); // k
+
+    f.open_array_section("c");
+    for (const auto &value_pair : values) {
+      f.open_array_section("p");
+      f.dump_unsigned("0", value_pair.first);
+      f.dump_unsigned("1", value_pair.second);
+      f.close_section(); // p
+    }
+    f.close_section(); // c
+
+    f.close_section(); // i
+  }
+  f.close_section(); // counters
+
+  return f.get();
+}
+
+/**
+ * Register an MDS perf query; pure delegation to the DaemonServer.
+ *
+ * @param query query to register
+ * @param limit optional limit, forwarded unchanged
+ * @return the id returned by the server
+ */
+MetricQueryID ActivePyModules::add_mds_perf_query(
+  const MDSPerfMetricQuery &query,
+  const std::optional<MDSPerfMetricLimit> &limit)
+{
+  const MetricQueryID query_id = server.add_mds_perf_query(query, limit);
+  return query_id;
+}
+
+/**
+ * Unregister an MDS perf query. A failure is logged, not reported.
+ *
+ * @param query_id id returned by add_mds_perf_query()
+ */
+void ActivePyModules::remove_mds_perf_query(MetricQueryID query_id)
+{
+  const int r = server.remove_mds_perf_query(query_id);
+  if (r >= 0) {
+    return;
+  }
+  dout(0) << "remove_mds_perf_query for query_id=" << query_id << " failed: "
+          << cpp_strerror(r) << dendl;
+}
+
+/**
+ * Ask the DaemonServer to re-register the MDS perf queries.
+ * Pure delegation; nothing is done locally.
+ */
+void ActivePyModules::reregister_mds_perf_queries()
+{
+  auto& daemon_server = server;
+  daemon_server.reregister_mds_perf_queries();
+}
+
+/**
+ * Fetch the counters collected for an MDS perf query.
+ *
+ * Output layout: a "metrics" array holding "delayed_ranks" (the stringified
+ * rank set), "counters" (same per-key layout as get_osd_perf_counters())
+ * and "last_updated" (the collector's last_updated_mono).
+ *
+ * @param query_id id returned by add_mds_perf_query()
+ * @return Python object produced by the formatter, or None if the server
+ *         call fails (the failure is logged)
+ */
+PyObject *ActivePyModules::get_mds_perf_counters(MetricQueryID query_id)
+{
+  MDSPerfCollector collector(query_id);
+  if (const int r = server.get_mds_perf_counters(&collector); r < 0) {
+    dout(0) << "get_mds_perf_counters for query_id=" << query_id << " failed: "
+            << cpp_strerror(r) << dendl;
+    Py_RETURN_NONE;
+  }
+
+  PyFormatter f;
+  const std::map<MDSPerfMetricKey, PerformanceCounters> &collected =
+    collector.counters;
+
+  f.open_array_section("metrics");
+
+  f.open_array_section("delayed_ranks");
+  f.dump_string("ranks", stringify(collector.delayed_ranks).c_str());
+  f.close_section(); // delayed_ranks
+
+  f.open_array_section("counters");
+  for (const auto &[metric_key, values] : collected) {
+    f.open_object_section("i");
+
+    f.open_array_section("k");
+    for (const auto &part : metric_key) {
+      f.open_array_section("s");
+      for (size_t idx = 0; idx < part.size(); ++idx) {
+        f.dump_string(stringify(idx).c_str(), part[idx]);
+      }
+      f.close_section(); // s
+    }
+    f.close_section(); // k
+
+    f.open_array_section("c");
+    for (const auto &value_pair : values) {
+      f.open_array_section("p");
+      f.dump_unsigned("0", value_pair.first);
+      f.dump_unsigned("1", value_pair.second);
+      f.close_section(); // p
+    }
+    f.close_section(); // c
+
+    f.close_section(); // i
+  }
+  f.close_section(); // counters
+
+  f.open_array_section("last_updated");
+  f.dump_float("last_updated_mono", collector.last_updated_mono);
+  f.close_section(); // last_updated
+
+  f.close_section(); // metrics
+
+  return f.get();
+}
+
+/**
+ * Emit a message on a cluster log channel.
+ *
+ * A channel is created from the MonClient's log client. If
+ * parse_log_client_options() succeeds (returns 0), the channel is
+ * configured with the parsed options before the message is logged; the
+ * message is logged either way.
+ *
+ * @param channel name of the log channel
+ * @param prio    log priority
+ * @param message text to log
+ */
+void ActivePyModules::cluster_log(const std::string &channel, clog_type prio,
+                                  const std::string &message)
+{
+  std::lock_guard locker(lock);
+
+  auto log_channel_ref = monc.get_log_client()->create_channel(channel);
+
+  // Output parameters of parse_log_client_options().
+  map<string,string> log_to_monitors;
+  map<string,string> log_to_syslog;
+  map<string,string> log_channel;
+  map<string,string> log_prio;
+  map<string,string> log_to_graylog;
+  map<string,string> log_to_graylog_host;
+  map<string,string> log_to_graylog_port;
+  uuid_d fsid;
+  string host;
+
+  const int parse_result = parse_log_client_options(
+    g_ceph_context, log_to_monitors, log_to_syslog,
+    log_channel, log_prio, log_to_graylog,
+    log_to_graylog_host, log_to_graylog_port,
+    fsid, host);
+  if (parse_result == 0) {
+    log_channel_ref->update_config(
+      log_to_monitors, log_to_syslog,
+      log_channel, log_prio, log_to_graylog,
+      log_to_graylog_host, log_to_graylog_port,
+      fsid, host);
+  }
+  log_channel_ref->do_log(prio, message);
+}
+
+/**
+ * Register a messenger client handle with the module registry.
+ *
+ * @param name  name the client is registered under
+ * @param addrs textual address vector; parsed with entity_addrvec_t::parse(),
+ *              whose result is not checked here
+ */
+void ActivePyModules::register_client(std::string_view name, std::string addrs)
+{
+  std::lock_guard locker(lock);
+
+  entity_addrvec_t parsed_addrs;
+  parsed_addrs.parse(addrs.c_str());
+
+  dout(7) << "registering msgr client handle " << parsed_addrs << dendl;
+  py_module_registry.register_client(name, std::move(parsed_addrs));
+}
+
+/**
+ * Remove a messenger client handle from the module registry.
+ *
+ * @param name  name the client was registered under
+ * @param addrs textual address vector; parsed with entity_addrvec_t::parse(),
+ *              whose result is not checked here
+ */
+void ActivePyModules::unregister_client(std::string_view name, std::string addrs)
+{
+  std::lock_guard locker(lock);
+
+  entity_addrvec_t parsed_addrs;
+  parsed_addrs.parse(addrs.c_str());
+
+  dout(7) << "unregistering msgr client handle " << parsed_addrs << dendl;
+  py_module_registry.unregister_client(name, parsed_addrs);
+}
diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h
new file mode 100644
index 000000000..d916bdcca
--- /dev/null
+++ b/src/mgr/ActivePyModules.h
@@ -0,0 +1,228 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include "ActivePyModule.h"
+
+#include "common/Finisher.h"
+#include "common/ceph_mutex.h"
+
+#include "PyFormatter.h"
+
+#include "osdc/Objecter.h"
+#include "client/Client.h"
+#include "common/LogClient.h"
+#include "mon/MgrMap.h"
+#include "mon/MonCommand.h"
+#include "mon/mon_types.h"
+#include "mon/ConfigMap.h"
+#include "mgr/TTLCache.h"
+
+#include "DaemonState.h"
+#include "ClusterState.h"
+#include "OSDPerfMetricTypes.h"
+
+class health_check_map_t;
+class DaemonServer;
+class MgrSession;
+class ModuleCommand;
+class PyModuleRegistry;
+
+/**
+ * Container for the Python modules that are running in the active mgr,
+ * plus the C++ side of the services they use: the key/value store cache,
+ * module and foreign config lookup, perf counters, health checks,
+ * progress events and notifications.
+ *
+ * Most state is guarded by @c lock; see the individual members.
+ */
+class ActivePyModules
+{
+  // module class instances not yet created
+  std::set<std::string, std::less<>> pending_modules;
+  // module class instances already created
+  std::map<std::string, std::shared_ptr<ActivePyModule>> modules;
+  PyModuleConfig &module_config;
+  bool have_local_config_map = false;
+  // local copy of the key/value data, fed by update_kv_data()/set_store()
+  std::map<std::string, std::string> store_cache;
+  ConfigMap config_map;  ///< derived from store_cache config/ keys
+  DaemonStateIndex &daemon_state;
+  ClusterState &cluster_state;
+  MonClient &monc;
+  LogChannelRef clog, audit_clog;
+  Objecter &objecter;
+  Client &client;
+  // finisher used to run module notifications asynchronously
+  Finisher &finisher;
+  TTLCache<std::string, PyObject*> ttl_cache;
+public:
+  Finisher cmd_finisher;
+private:
+  DaemonServer &server;
+  PyModuleRegistry &py_module_registry;
+
+  // progress events keyed by event id
+  std::map<std::string,ProgressEvent> progress_events;
+
+  mutable ceph::mutex lock = ceph::make_mutex("ActivePyModules::lock");
+
+public:
+  ActivePyModules(
+    PyModuleConfig &module_config,
+    std::map<std::string, std::string> store_data,
+    bool mon_provides_kv_sub,
+    DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+    LogChannelRef clog_, LogChannelRef audit_clog_, Objecter &objecter_, Client &client_,
+    Finisher &f, DaemonServer &server, PyModuleRegistry &pmr);
+
+  ~ActivePyModules();
+
+  // FIXME: wrap for send_command?
+  MonClient &get_monc() {return monc;}
+  Objecter  &get_objecter() {return objecter;}
+  Client    &get_client() {return client;}
+
+  // --- cluster state exposed to Python ---
+  PyObject *cacheable_get_python(const std::string &what);
+  PyObject *get_python(const std::string &what);
+  PyObject *get_server_python(const std::string &hostname);
+  PyObject *list_servers_python();
+  PyObject *get_metadata_python(
+    const std::string &svc_type, const std::string &svc_id);
+  PyObject *get_daemon_status_python(
+    const std::string &svc_type, const std::string &svc_id);
+  /// All recorded data points of one perf counter.
+  PyObject *get_counter_python(
+    const std::string &svc_type,
+    const std::string &svc_id,
+    const std::string &path);
+  /// Only the most recent data point of one perf counter.
+  PyObject *get_latest_counter_python(
+    const std::string &svc_type,
+    const std::string &svc_id,
+    const std::string &path);
+  /// Description of the perf counters of the selected daemons.
+  PyObject *get_perf_schema_python(
+    const std::string &svc_type,
+    const std::string &svc_id);
+  /// Capsule wrapping the global CephContext.
+  PyObject *get_context();
+  /// Python wrapper around a private copy of the current OSDMap.
+  PyObject *get_osdmap();
+  /// @note @c fct is not allowed to acquire locks when holding GIL
+  PyObject *with_perf_counters(
+      std::function<void(
+        PerfCounterInstance& counter_instance,
+        PerfCounterType& counter_type,
+        PyFormatter& f)> fct,
+      const std::string &svc_name,
+      const std::string &svc_id,
+      const std::string &path) const;
+
+  // --- OSD perf queries (delegated to DaemonServer) ---
+  MetricQueryID add_osd_perf_query(
+      const OSDPerfMetricQuery &query,
+      const std::optional<OSDPerfMetricLimit> &limit);
+  void remove_osd_perf_query(MetricQueryID query_id);
+  PyObject *get_osd_perf_counters(MetricQueryID query_id);
+
+  // --- MDS perf queries (delegated to DaemonServer) ---
+  MetricQueryID add_mds_perf_query(
+      const MDSPerfMetricQuery &query,
+      const std::optional<MDSPerfMetricLimit> &limit);
+  void remove_mds_perf_query(MetricQueryID query_id);
+  void reregister_mds_perf_queries();
+  PyObject *get_mds_perf_counters(MetricQueryID query_id);
+
+  // --- per-module key/value store ---
+  /// @return true and fill @c val if the key is in the local store cache
+  bool get_store(const std::string &module_name,
+      const std::string &key, std::string *val) const;
+  PyObject *get_store_prefix(const std::string &module_name,
+			      const std::string &prefix) const;
+  /// Set the key, or delete it when @c val is boost::none.
+  void set_store(const std::string &module_name,
+      const std::string &key, const boost::optional<std::string> &val);
+
+  // --- module options ---
+  /// @return true and fill @c val if the option is set
+  bool get_config(const std::string &module_name,
+      const std::string &key, std::string *val) const;
+  void set_config(const std::string &module_name,
+      const std::string &key, const boost::optional<std::string> &val);
+
+  /// Typed option value; "<prefix>/<key>" is tried before the bare key.
+  PyObject *get_typed_config(const std::string &module_name,
+			     const std::string &key,
+			     const std::string &prefix = "") const;
+  /// Value of a built-in option as it applies to the entity @c who.
+  PyObject *get_foreign_config(
+    const std::string& who,
+    const std::string& name);
+
+  // --- health checks ---
+  void set_health_checks(const std::string& module_name,
+			 health_check_map_t&& checks);
+  void get_health_checks(health_check_map_t *checks);
+
+  // --- progress events ---
+  void update_progress_event(const std::string& evid,
+			     const std::string& desc,
+			     float progress,
+			     bool add_to_ceph_s);
+  void complete_progress_event(const std::string& evid);
+  void clear_all_progress_events();
+  void get_progress_events(std::map<std::string,ProgressEvent>* events);
+
+  // --- messenger client handles ---
+  void register_client(std::string_view name, std::string addrs);
+  void unregister_client(std::string_view name, std::string addrs);
+
+  /// Queue a config_notify() call for every module.
+  void config_notify();
+
+  void set_uri(const std::string& module_name, const std::string &uri);
+  void set_device_wear_level(const std::string& devid, float wear_level);
+
+  /// @return -ENOENT if the target module is not active
+  int handle_command(
+    const ModuleCommand& module_command,
+    const MgrSession& session,
+    const cmdmap_t &cmdmap,
+    const bufferlist &inbuf,
+    std::stringstream *ds,
+    std::stringstream *ss);
+
+  /// Module name -> URI, for modules that published a non-empty URI.
+  std::map<std::string, std::string> get_services() const;
+
+  /// Apply a full or incremental key/value update to the store cache.
+  void update_kv_data(
+    const std::string prefix,
+    bool incremental,
+    const std::map<std::string, boost::optional<bufferlist>, std::less<>>& data);
+  /// Rebuild config_map from the config/ keys in the store cache.
+  void _refresh_config_map();
+
+  // Public so that MonCommandCompletion can use it
+  // FIXME: for send_command completion notifications,
+  // send it to only the module that sent the command, not everyone
+  void notify_all(const std::string &notify_type,
+                  const std::string &notify_id);
+  void notify_all(const LogEntry &log_entry);
+
+  bool is_pending(std::string_view name) const {
+    return pending_modules.count(name) > 0;
+  }
+  bool module_exists(const std::string &name) const
+  {
+    return modules.count(name) > 0;
+  }
+
+  /// Uses map::at(): throws std::out_of_range if the module is unknown.
+  bool method_exists(
+      const std::string &module_name,
+      const std::string &method_name) const
+  {
+    return modules.at(module_name)->method_exists(method_name);
+  }
+
+  /// The target module must exist (asserted in the implementation).
+  PyObject *dispatch_remote(
+      const std::string &other_module,
+      const std::string &method,
+      PyObject *args,
+      PyObject *kwargs,
+      std::string *err);
+
+  int init();
+  void shutdown();
+
+  void start_one(PyModuleRef py_module);
+
+  void dump_server(const std::string &hostname,
+                   const DaemonStateCollection &dmc,
+                   Formatter *f);
+
+  void cluster_log(const std::string &channel, clog_type prio,
+    const std::string &message);
+
+  bool inject_python_on() const;
+  void update_cache_metrics();
+};
+
diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc
new file mode 100644
index 000000000..3f49976d8
--- /dev/null
+++ b/src/mgr/BaseMgrModule.cc
@@ -0,0 +1,1623 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+/**
+ * The interface we present to python code that runs within
+ * ceph-mgr. This is implemented as a Python class from which
+ * all modules must inherit -- access to the Ceph state is then
+ * available as methods on that object.
+ */
+
+#include "Python.h"
+
+#include "Mgr.h"
+
+#include "mon/MonClient.h"
+#include "common/errno.h"
+#include "common/version.h"
+#include "mgr/Types.h"
+
+#include "PyUtil.h"
+#include "BaseMgrModule.h"
+#include "Gil.h"
+
+#include <algorithm>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#define PLACEHOLDER ""
+
+
+typedef struct { // `self` type of every Python-callable function in this file
+ PyObject_HEAD
+ ActivePyModules *py_modules; // used by the functions below to reach cluster state and services
+ ActivePyModule *this_module; // the calling module; supplies get_name() and log() below
+} BaseMgrModule;
+
+// Completion context for a command issued through ceph_send_command().
+//
+// Holds a reference to the python completion object from construction until
+// finish() (or, if never completed, destruction).  finish() calls
+// completion.complete(r, outbl, outs) and then sends a "command"
+// notification carrying `tag` to the modules.
+class MonCommandCompletion : public Context
+{
+  ActivePyModules *py_modules;
+  PyObject *python_completion;
+  const std::string tag;
+  SafeThreadState pThreadState;
+
+  // Build a Python str from command output.  As with PyUnicode_FromString(),
+  // only the bytes before the first NUL are used; unlike it, invalid UTF-8
+  // sequences are replaced instead of making the conversion fail, so output
+  // that is not clean UTF-8 can still be delivered.  Caller must hold the GIL.
+  // Returns a new reference, or nullptr on allocation failure.
+  static PyObject *to_py_str(const std::string &s)
+  {
+    const char *p = s.c_str();
+    const auto len = std::char_traits<char>::length(p);
+    return PyUnicode_DecodeUTF8(p, static_cast<Py_ssize_t>(len), "replace");
+  }
+
+public:
+  // Filled in by the command machinery before finish() runs.
+  std::string outs;
+  bufferlist outbl;
+
+  MonCommandCompletion(
+      ActivePyModules *py_modules_, PyObject* ev,
+      const std::string &tag_, PyThreadState *ts_)
+    : py_modules(py_modules_), python_completion(ev),
+      tag(tag_), pThreadState(ts_)
+  {
+    ceph_assert(python_completion != nullptr);
+    Py_INCREF(python_completion);
+  }
+
+  ~MonCommandCompletion() override
+  {
+    if (python_completion) {
+      // Usually do this in finish(): this path is only for if we're
+      // being destroyed without completing.
+      Gil gil(pThreadState, true);
+      Py_DECREF(python_completion);
+      python_completion = nullptr;
+    }
+  }
+
+  void finish(int r) override
+  {
+    ceph_assert(python_completion != nullptr);
+
+    dout(10) << "MonCommandCompletion::finish()" << dendl;
+    {
+      // Scoped so the Gil is released before calling notify_all()
+      // Create new thread state because this is called via the MonClient
+      // Finisher, not the PyModules finisher.
+      Gil gil(pThreadState, true);
+
+      auto set_fn = PyObject_GetAttrString(python_completion, "complete");
+      ceph_assert(set_fn != nullptr);
+
+      PyObject *pyR = PyLong_FromLong(r);
+      PyObject *pyOutBl = to_py_str(outbl.to_str());
+      PyObject *pyOutS = to_py_str(outs);
+
+      // Only build the argument tuple when every element was created:
+      // PyTuple_Pack() must not be handed a null pointer.
+      PyObject *args = nullptr;
+      if (pyR != nullptr && pyOutBl != nullptr && pyOutS != nullptr) {
+        args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS);
+      }
+      Py_XDECREF(pyR);
+      Py_XDECREF(pyOutBl);
+      Py_XDECREF(pyOutS);
+
+      PyObject *rtn = nullptr;
+      if (args != nullptr) {
+        rtn = PyObject_CallObject(set_fn, args);
+      }
+      if (rtn != nullptr) {
+        Py_DECREF(rtn);
+      } else {
+        // Either the arguments could not be built or complete() raised.
+        // Nobody above us can handle a python exception, so log and drop it
+        // rather than leaving it pending on this thread state.
+        derr << "MonCommandCompletion::finish(): could not deliver result"
+             << " for tag '" << tag << "'" << dendl;
+        PyErr_Clear();
+      }
+      Py_XDECREF(args);
+      Py_DECREF(set_fn);
+
+      Py_DECREF(python_completion);
+      python_completion = nullptr;
+    }
+    py_modules->notify_all("command", tag);
+  }
+};
+
+
+// Python-callable: submit a command to a mon, OSD, MDS or PG.
+//
+// Args tuple: (completion, type, name, cmd_json, tag, inbuf)
+//   completion  object with a callable `complete` attribute; it is invoked
+//               by MonCommandCompletion::finish() when the command ends
+//   type        "mon", "osd", "mds" or "pg"
+//   name        target within that type (osd id, mds name, pgid, ...)
+//   cmd_json    the command itself
+//   tag         passed back in the "command" notification
+//   inbuf       input payload, or None
+//
+// Returns None once the command has been submitted.  Raises AttributeError /
+// TypeError for an unusable completion object, ValueError for an invalid
+// osd id, pgid or service type, and RuntimeError if the MDS command could
+// not be sent.
+static PyObject*
+ceph_send_command(BaseMgrModule *self, PyObject *args)
+{
+  // Like mon, osd, mds
+  char *type = nullptr;
+
+  // Like "23" for an OSD or "myid" for an MDS
+  char *name = nullptr;
+
+  char *cmd_json = nullptr;
+  char *tag = nullptr;
+  char *inbuf_ptr = nullptr;
+  Py_ssize_t inbuf_len = 0;
+  bufferlist inbuf = {};
+
+  PyObject *completion = nullptr;
+  if (!PyArg_ParseTuple(args, "Ossssz#:ceph_send_command",
+        &completion, &type, &name, &cmd_json, &tag, &inbuf_ptr, &inbuf_len)) {
+    return nullptr;
+  }
+
+  if (inbuf_ptr) {
+    inbuf.append(inbuf_ptr, static_cast<unsigned>(inbuf_len));
+  }
+
+  // Validate the completion object up front and report problems to the
+  // caller as python exceptions instead of aborting the daemon.
+  auto set_fn = PyObject_GetAttrString(completion, "complete");
+  if (set_fn == nullptr) {
+    // PyObject_GetAttrString() has already set AttributeError.
+    return nullptr;
+  }
+  const bool callable = PyCallable_Check(set_fn) != 0;
+  Py_DECREF(set_fn);
+  if (!callable) {
+    PyErr_SetString(PyExc_TypeError, "completion.complete is not callable");
+    return nullptr;
+  }
+
+  MonCommandCompletion *command_c = new MonCommandCompletion(self->py_modules,
+      completion, tag, PyThreadState_Get());
+
+  // Drop the GIL while talking to the cluster; every return path below
+  // restores it before touching python state again.
+  PyThreadState *tstate = PyEval_SaveThread();
+
+  const std::string svc_type(type);
+  if (svc_type == "mon") {
+
+    // Wait for the latest OSDMap after each command we send to
+    // the mons.  This is a heavy-handed hack to make life simpler
+    // for python module authors, so that they know whenever they
+    // run a command they've got a fresh OSDMap afterwards.
+    // TODO: enhance MCommand interface so that it returns
+    // latest cluster map versions on completion, and callers
+    // can wait for those.
+    auto c = new LambdaContext([command_c, self](int command_r){
+      self->py_modules->get_objecter().wait_for_latest_osdmap(
+        [command_c, command_r](boost::system::error_code) {
+          command_c->complete(command_r);
+        });
+    });
+
+    self->py_modules->get_monc().start_mon_command(
+        name,
+        {cmd_json},
+        inbuf,
+        &command_c->outbl,
+        &command_c->outs,
+        new C_OnFinisher(c, &self->py_modules->cmd_finisher));
+  } else if (svc_type == "osd") {
+    std::string err;
+    uint64_t osd_id = strict_strtoll(name, 10, &err);
+    if (!err.empty()) {
+      delete command_c;
+      std::string msg("invalid osd_id: ");
+      msg.append("\"").append(name).append("\"");
+      PyEval_RestoreThread(tstate);
+      PyErr_SetString(PyExc_ValueError, msg.c_str());
+      return nullptr;
+    }
+
+    ceph_tid_t tid;
+    self->py_modules->get_objecter().osd_command(
+        osd_id,
+        {cmd_json},
+        inbuf,
+        &tid,
+        [command_c, f = &self->py_modules->cmd_finisher]
+        (boost::system::error_code ec, std::string s, ceph::buffer::list bl) {
+          command_c->outs = std::move(s);
+          command_c->outbl = std::move(bl);
+          f->queue(command_c);
+        });
+  } else if (svc_type == "mds") {
+    int r = self->py_modules->get_client().mds_command(
+        name,
+        {cmd_json},
+        inbuf,
+        &command_c->outbl,
+        &command_c->outs,
+        new C_OnFinisher(command_c, &self->py_modules->cmd_finisher));
+    if (r != 0) {
+      // NOTE(review): command_c and the C_OnFinisher wrapping it are not
+      // released on this path.  Whether mds_command() takes ownership of
+      // the context on failure is not visible from here -- confirm before
+      // adding a delete.
+      std::string msg("failed to send command to mds: ");
+      msg.append(cpp_strerror(r));
+      PyEval_RestoreThread(tstate);
+      PyErr_SetString(PyExc_RuntimeError, msg.c_str());
+      return nullptr;
+    }
+  } else if (svc_type == "pg") {
+    pg_t pgid;
+    if (!pgid.parse(name)) {
+      delete command_c;
+      std::string msg("invalid pgid: ");
+      msg.append("\"").append(name).append("\"");
+      PyEval_RestoreThread(tstate);
+      PyErr_SetString(PyExc_ValueError, msg.c_str());
+      return nullptr;
+    }
+
+    ceph_tid_t tid;
+    self->py_modules->get_objecter().pg_command(
+        pgid,
+        {cmd_json},
+        inbuf,
+        &tid,
+        [command_c, f = &self->py_modules->cmd_finisher]
+        (boost::system::error_code ec, std::string s, ceph::buffer::list bl) {
+          command_c->outs = std::move(s);
+          command_c->outbl = std::move(bl);
+          f->queue(command_c);
+        });
+    // Successful submit: fall through to the common "return None" below.
+    // (Returning nullptr here without an exception set makes the
+    // interpreter raise SystemError in the caller.)
+  } else {
+    delete command_c;
+    std::string msg("unknown service type: ");
+    msg.append(type);
+    PyEval_RestoreThread(tstate);
+    PyErr_SetString(PyExc_ValueError, msg.c_str());
+    return nullptr;
+  }
+
+  PyEval_RestoreThread(tstate);
+  Py_RETURN_NONE;
+}
+
+// Python-callable: replace the calling module's health checks.
+//
+// Takes one argument, a dict mapping check name -> info dict.  Recognised
+// info keys are "severity" ("warning" / "error"), "summary" (str),
+// "count" (int) and "detail" (list of str).  Malformed entries are logged
+// and skipped rather than rejected.  Returns None; if the argument is not a
+// dict this is logged and nothing is changed.
+static PyObject*
+ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
+{
+  PyObject *checks = NULL;
+  if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) {
+    return NULL;
+  }
+  if (!PyDict_Check(checks)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_NONE;
+  }
+  // PyDict_Items() returns a NEW reference; it is released after the loop.
+  PyObject *checksls = PyDict_Items(checks);
+  if (checksls == nullptr) {
+    return nullptr;
+  }
+  health_check_map_t out_checks;
+  for (int i = 0; i < PyList_Size(checksls); ++i) {
+    PyObject *kv = PyList_GET_ITEM(checksls, i);
+    char *check_name = nullptr;
+    PyObject *check_info = nullptr;
+    if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) {
+      // We skip the entry, so the exception set by the failed parse must
+      // not be left pending when we eventually return None.
+      PyErr_Clear();
+      derr << __func__ << " dict item " << i
+           << " not a size 2 tuple" << dendl;
+      continue;
+    }
+    if (!PyDict_Check(check_info)) {
+      derr << __func__ << " item " << i << " " << check_name
+           << " value not a dict" << dendl;
+      continue;
+    }
+    health_status_t severity = HEALTH_OK;
+    string summary;
+    list<string> detail;
+    int64_t count = 0;
+    // New reference again; released once this check's fields are parsed.
+    PyObject *infols = PyDict_Items(check_info);
+    if (infols == nullptr) {
+      Py_DECREF(checksls);
+      return nullptr;
+    }
+    for (int j = 0; j < PyList_Size(infols); ++j) {
+      PyObject *pair = PyList_GET_ITEM(infols, j);
+      if (!PyTuple_Check(pair)) {
+        derr << __func__ << " item " << i << " pair " << j
+             << " not a tuple" << dendl;
+        continue;
+      }
+      char *k = nullptr;
+      PyObject *v = nullptr;
+      if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) {
+        PyErr_Clear();
+        derr << __func__ << " item " << i << " pair " << j
+             << " not a size 2 tuple" << dendl;
+        continue;
+      }
+      string ks(k);
+      if (ks == "severity") {
+        if (!PyUnicode_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " severity value not string" << dendl;
+          continue;
+        }
+        // Any value other than "warning" / "error" leaves HEALTH_OK.
+        if (const string vs = PyUnicode_AsUTF8(v); vs == "warning") {
+          severity = HEALTH_WARN;
+        } else if (vs == "error") {
+          severity = HEALTH_ERR;
+        }
+      } else if (ks == "summary") {
+        if (!PyUnicode_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " summary value not [unicode] string" << dendl;
+          continue;
+        } else {
+          summary = PyUnicode_AsUTF8(v);
+        }
+      } else if (ks == "count") {
+        if (PyLong_Check(v)) {
+          count = PyLong_AsLong(v);
+        } else {
+          derr << __func__ << " check " << check_name
+               << " count value not int" << dendl;
+          continue;
+        }
+      } else if (ks == "detail") {
+        if (!PyList_Check(v)) {
+          derr << __func__ << " check " << check_name
+               << " detail value not list" << dendl;
+          continue;
+        }
+        for (int di_idx = 0; di_idx < PyList_Size(v); ++di_idx) {
+          PyObject *di = PyList_GET_ITEM(v, di_idx);
+          if (!PyUnicode_Check(di)) {
+            derr << __func__ << " check " << check_name
+                 << " detail item " << di_idx << " not a [unicode] string" << dendl;
+            continue;
+          } else {
+            detail.push_back(PyUnicode_AsUTF8(di));
+          }
+        }
+      } else {
+        derr << __func__ << " check " << check_name
+             << " unexpected key " << k << dendl;
+      }
+    }
+    Py_DECREF(infols);
+    auto& d = out_checks.add(check_name, severity, summary, count);
+    d.detail.swap(detail);
+  }
+  Py_DECREF(checksls);
+
+  JSONFormatter jf(true);
+  dout(10) << "module " << self->this_module->get_name()
+           << " health checks:\n";
+  out_checks.dump(&jf);
+  jf.flush(*_dout);
+  *_dout << dendl;
+
+  // Hand the result over with the GIL released.
+  PyThreadState *tstate = PyEval_SaveThread();
+  self->py_modules->set_health_checks(self->this_module->get_name(),
+                                      std::move(out_checks));
+  PyEval_RestoreThread(tstate);
+
+  Py_RETURN_NONE;
+}
+
+
+// Python-callable: takes one string and returns whatever
+// ActivePyModules::cacheable_get_python() produces for it.
+static PyObject*
+ceph_state_get(BaseMgrModule *self, PyObject *args)
+{
+  char *state_name = nullptr;
+  if (PyArg_ParseTuple(args, "s:ceph_state_get", &state_name) == 0) {
+    return nullptr;
+  }
+
+  ActivePyModules *const mods = self->py_modules;
+  return mods->cacheable_get_python(state_name);
+}
+
+
+// Python-callable: takes a hostname or None.  With a hostname, returns
+// get_server_python(hostname); with None, returns list_servers_python().
+static PyObject*
+ceph_get_server(BaseMgrModule *self, PyObject *args)
+{
+  char *host = nullptr;
+  if (PyArg_ParseTuple(args, "z:ceph_get_server", &host) == 0) {
+    return nullptr;
+  }
+
+  // The "z" format leaves the pointer null when None was passed.
+  if (host == nullptr) {
+    return self->py_modules->list_servers_python();
+  }
+  return self->py_modules->get_server_python(host);
+}
+
+// Python-callable: returns the id part of this daemon's configured name as
+// a str.  `args` is not used.
+static PyObject*
+ceph_get_mgr_id(BaseMgrModule *self, PyObject *args)
+{
+  const auto &mgr_id = g_conf()->name.get_id();
+  return PyUnicode_FromString(mgr_id.c_str());
+}
+
+// Python-callable: look up a ceph config option by name and return its
+// value converted by get_python_typed_option_value().
+// Raises KeyError for an unknown option, MemoryError / ValueError when
+// get_val() reports -ENOMEM / -ENAMETOOLONG.  Any other non-zero result
+// from get_val() trips an assertion.
+static PyObject*
+ceph_option_get(BaseMgrModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (PyArg_ParseTuple(args, "s:ceph_option_get", &what) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  const Option *opt = g_conf().find_option(std::string(what));
+  if (opt == nullptr) {
+    dout(4) << "ceph_option_get " << what << " not found " << dendl;
+    PyErr_Format(PyExc_KeyError, "option not found: %s", what);
+    return nullptr;
+  }
+
+  std::string value;
+  const int r = g_conf().get_val(std::string(what), &value);
+  if (r == -ENOMEM) {
+    PyErr_NoMemory();
+    return nullptr;
+  }
+  if (r == -ENAMETOOLONG) {
+    PyErr_SetString(PyExc_ValueError, "value too long");
+    return nullptr;
+  }
+  ceph_assert(r == 0);
+
+  dout(10) << "ceph_option_get " << what << " found: " << value << dendl;
+  return get_python_typed_option_value(opt->type, value);
+}
+
+// Python-callable: takes (who, what) strings and returns
+// ActivePyModules::get_foreign_config(who, what).
+static PyObject*
+ceph_foreign_option_get(BaseMgrModule *self, PyObject *args)
+{
+  char *entity = nullptr;
+  char *option = nullptr;
+  const int parsed = PyArg_ParseTuple(
+    args, "ss:ceph_foreign_option_get", &entity, &option);
+  if (parsed == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+  return self->py_modules->get_foreign_config(entity, option);
+}
+
+// Python-callable: takes (module, key[, prefix]) and returns
+// ActivePyModules::get_typed_config(module, key, prefix).  An omitted
+// prefix is passed on as the empty string.
+static PyObject*
+ceph_get_module_option(BaseMgrModule *self, PyObject *args)
+{
+  char *module_name = nullptr;
+  char *option_key = nullptr;
+  char *raw_prefix = nullptr;
+  if (PyArg_ParseTuple(args, "ss|s:ceph_get_module_option",
+                       &module_name, &option_key, &raw_prefix) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  // The optional third argument stays null when it was not supplied.
+  const std::string str_prefix = (raw_prefix != nullptr) ? raw_prefix : "";
+
+  assert(self->this_module->py_module);
+  return self->py_modules->get_typed_config(module_name, option_key,
+                                            str_prefix);
+}
+
+// Python-callable: takes a prefix string and returns
+// ActivePyModules::get_store_prefix() for the calling module's name.
+static PyObject*
+ceph_store_get_prefix(BaseMgrModule *self, PyObject *args)
+{
+  char *key_prefix = nullptr;
+  if (PyArg_ParseTuple(args, "s:ceph_store_get_prefix", &key_prefix) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  const auto &owner = self->this_module->get_name();
+  return self->py_modules->get_store_prefix(owner, key_prefix);
+}
+
+// Python-callable: takes (module, key, value-or-None) and forwards to
+// ActivePyModules::set_config().  None is passed on as an empty optional.
+// The call is made with the GIL released.  Returns None.
+static PyObject*
+ceph_set_module_option(BaseMgrModule *self, PyObject *args)
+{
+  char *module_name = nullptr;
+  char *option_key = nullptr;
+  char *raw_value = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "ssz:ceph_set_module_option",
+                                      &module_name, &option_key, &raw_value);
+  if (parsed == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  boost::optional<std::string> new_value;
+  if (raw_value != nullptr) {
+    new_value = std::string(raw_value);
+  }
+
+  PyThreadState *saved = PyEval_SaveThread();
+  self->py_modules->set_config(module_name, option_key, new_value);
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes a key and returns the calling module's stored
+// value for it as a str, or None when get_store() reports no such key.
+static PyObject*
+ceph_store_get(BaseMgrModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (PyArg_ParseTuple(args, "s:ceph_store_get", &what) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  std::string value;
+  const bool found = self->py_modules->get_store(
+    self->this_module->get_name(), what, &value);
+  if (!found) {
+    dout(4) << "ceph_store_get " << what << " not found " << dendl;
+    Py_RETURN_NONE;
+  }
+
+  dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl;
+  return PyUnicode_FromString(value.c_str());
+}
+
+// Python-callable: takes (key, value-or-None) and forwards to
+// ActivePyModules::set_store() under the calling module's name.  None is
+// passed on as an empty optional.  The call is made with the GIL released.
+// Returns None.
+static PyObject*
+ceph_store_set(BaseMgrModule *self, PyObject *args)
+{
+  char *store_key = nullptr;
+  char *raw_value = nullptr;
+  if (PyArg_ParseTuple(args, "sz:ceph_store_set", &store_key, &raw_value) == 0) {
+    return nullptr;
+  }
+
+  boost::optional<std::string> new_value;
+  if (raw_value != nullptr) {
+    new_value = std::string(raw_value);
+  }
+
+  PyThreadState *saved = PyEval_SaveThread();
+  self->py_modules->set_store(self->this_module->get_name(), store_key,
+                              new_value);
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes (svc_name, svc_id) strings and returns
+// ActivePyModules::get_metadata_python() for them.
+static PyObject*
+get_metadata(BaseMgrModule *self, PyObject *args)
+{
+  char *service = nullptr;
+  char *service_id = nullptr;
+  const int parsed =
+    PyArg_ParseTuple(args, "ss:get_metadata", &service, &service_id);
+  if (parsed == 0) {
+    return nullptr;
+  }
+  return self->py_modules->get_metadata_python(service, service_id);
+}
+
+// Python-callable: takes (svc_name, svc_id) strings and returns
+// ActivePyModules::get_daemon_status_python() for them.
+static PyObject*
+get_daemon_status(BaseMgrModule *self, PyObject *args)
+{
+  char *service = nullptr;
+  char *service_id = nullptr;
+  const int parsed =
+    PyArg_ParseTuple(args, "ss:get_daemon_status", &service, &service_id);
+  if (parsed == 0) {
+    return nullptr;
+  }
+  return self->py_modules->get_daemon_status_python(service, service_id);
+}
+
+// Python-callable: takes one string and passes it to the calling module's
+// log().  Asserts that this_module is set.  Returns None.
+static PyObject*
+ceph_log(BaseMgrModule *self, PyObject *args)
+{
+  char *text = nullptr;
+  if (PyArg_ParseTuple(args, "s:log", &text) == 0) {
+    return nullptr;
+  }
+
+  ActivePyModule *const owner = self->this_module;
+  ceph_assert(owner);
+  owner->log(text);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes (channel, prio, message) and forwards to
+// ActivePyModules::cluster_log(), converting prio to clog_type.  The call
+// is made with the GIL released.  Returns None.
+static PyObject*
+ceph_cluster_log(BaseMgrModule *self, PyObject *args)
+{
+  char *log_channel = nullptr;
+  int log_prio = 0;
+  char *log_message = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "sis:ceph_cluster_log",
+                                      &log_channel, &log_prio, &log_message);
+  if (parsed == 0) {
+    return nullptr;
+  }
+
+  PyThreadState *saved = PyEval_SaveThread();
+  self->py_modules->cluster_log(log_channel,
+                                static_cast<clog_type>(log_prio),
+                                log_message);
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: returns pretty_version_to_str() as a str.  Neither
+// argument is used.
+static PyObject *
+ceph_get_version(BaseMgrModule *self, PyObject *args)
+{
+  const auto version = pretty_version_to_str();
+  return PyUnicode_FromString(version.c_str());
+}
+
+// Python-callable: returns g_conf().get_conf_path() as a str.  Neither
+// argument is used.
+static PyObject *
+ceph_get_ceph_conf_path(BaseMgrModule *self, PyObject *args)
+{
+  const auto &conf_path = g_conf().get_conf_path();
+  return PyUnicode_FromString(conf_path.c_str());
+}
+
+// Python-callable: returns ceph_release_to_str() as a str.  Neither
+// argument is used.
+static PyObject *
+ceph_get_release_name(BaseMgrModule *self, PyObject *args)
+{
+  const char *release = ceph_release_to_str();
+  return PyUnicode_FromString(release);
+}
+
+// Python-callable: takes an int and returns ceph_release_name() for it as
+// a str.
+static PyObject *
+ceph_lookup_release_name(BaseMgrModule *self, PyObject *args)
+{
+  int release_major = 0;
+  if (PyArg_ParseTuple(args, "i:ceph_lookup_release_name",
+                       &release_major) == 0) {
+    return nullptr;
+  }
+  const char *release = ceph_release_name(release_major);
+  return PyUnicode_FromString(release);
+}
+
+// Python-callable (no arguments): returns ActivePyModules::get_context().
+static PyObject *
+ceph_get_context(BaseMgrModule *self)
+{
+  ActivePyModules *const mods = self->py_modules;
+  return mods->get_context();
+}
+
+// Python-callable: takes (svc_name, svc_id, counter_path) strings and
+// returns ActivePyModules::get_counter_python() for them.
+static PyObject*
+get_counter(BaseMgrModule *self, PyObject *args)
+{
+  char *service = nullptr;
+  char *service_id = nullptr;
+  char *path = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "sss:get_counter",
+                                      &service, &service_id, &path);
+  if (parsed == 0) {
+    return nullptr;
+  }
+  return self->py_modules->get_counter_python(service, service_id, path);
+}
+
+// Python-callable: takes (svc_name, svc_id, counter_path) strings and
+// returns ActivePyModules::get_latest_counter_python() for them.
+static PyObject*
+get_latest_counter(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_name = nullptr;
+  char *svc_id = nullptr;
+  char *counter_path = nullptr;
+  // The text after ':' is the function name python puts in argument-error
+  // messages; it previously said "get_counter", pointing at the wrong call.
+  if (!PyArg_ParseTuple(args, "sss:get_latest_counter", &svc_name,
+                        &svc_id, &counter_path)) {
+    return nullptr;
+  }
+  return self->py_modules->get_latest_counter_python(
+      svc_name, svc_id, counter_path);
+}
+
+// Python-callable: takes (type, svc_id) strings and returns
+// ActivePyModules::get_perf_schema_python() for them.
+static PyObject*
+get_perf_schema(BaseMgrModule *self, PyObject *args)
+{
+  char *daemon_type = nullptr;
+  char *daemon_id = nullptr;
+  const int parsed =
+    PyArg_ParseTuple(args, "ss:get_perf_schema", &daemon_type, &daemon_id);
+  if (parsed == 0) {
+    return nullptr;
+  }
+
+  return self->py_modules->get_perf_schema_python(daemon_type, daemon_id);
+}
+
+// Python-callable: returns ActivePyModules::get_osdmap().  `args` is not
+// used.
+static PyObject *
+ceph_get_osdmap(BaseMgrModule *self, PyObject *args)
+{
+  ActivePyModules *const mods = self->py_modules;
+  return mods->get_osdmap();
+}
+
+// Python-callable: takes a URI string and records it for the calling
+// module via ActivePyModules::set_uri().  The call is made with the GIL
+// released.  Returns None.
+static PyObject*
+ceph_set_uri(BaseMgrModule *self, PyObject *args)
+{
+  char *svc_str = nullptr;
+  // The text after ':' is the function name python puts in argument-error
+  // messages; it previously carried the stale name "ceph_advertize_service".
+  if (!PyArg_ParseTuple(args, "s:ceph_set_uri",
+                        &svc_str)) {
+    return nullptr;
+  }
+
+  // We call down into PyModules even though we have a MgrPyModule
+  // reference here, because MgrPyModule's fields are protected
+  // by PyModules' lock.
+  PyThreadState *tstate = PyEval_SaveThread();
+  self->py_modules->set_uri(self->this_module->get_name(), svc_str);
+  PyEval_RestoreThread(tstate);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes (devid, wear_level) and forwards to
+// ActivePyModules::set_device_wear_level().  The call is made with the GIL
+// released.  Returns None.
+static PyObject*
+ceph_set_wear_level(BaseMgrModule *self, PyObject *args)
+{
+  char *device_id = nullptr;
+  float level = 0.0f;
+  const int parsed = PyArg_ParseTuple(args, "sf:ceph_set_wear_level",
+                                      &device_id, &level);
+  if (parsed == 0) {
+    return nullptr;
+  }
+
+  PyThreadState *saved = PyEval_SaveThread();
+  self->py_modules->set_device_wear_level(device_id, level);
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: returns True when the MonClient reports is_connected(),
+// False otherwise.  `args` is not used.
+static PyObject*
+ceph_have_mon_connection(BaseMgrModule *self, PyObject *args)
+{
+  const bool connected = self->py_modules->get_monc().is_connected();
+  return PyBool_FromLong(connected ? 1 : 0);
+}
+
+// Python-callable: takes (evid, desc, progress, add_to_ceph_s) and forwards
+// to ActivePyModules::update_progress_event().  The call is made with the
+// GIL released.  Returns None.
+static PyObject*
+ceph_update_progress_event(BaseMgrModule *self, PyObject *args)
+{
+  char *evid = nullptr;
+  char *desc = nullptr;
+  float progress = 0.0;
+  // The "b" format unit stores an unsigned char.  Parse into one and
+  // convert afterwards instead of handing PyArg_ParseTuple the address of a
+  // bool, which could end up holding a value other than 0 or 1.
+  unsigned char add_to_ceph_s_raw = 0;
+  if (!PyArg_ParseTuple(args, "ssfb:ceph_update_progress_event",
+                        &evid, &desc, &progress, &add_to_ceph_s_raw)) {
+    return nullptr;
+  }
+  const bool add_to_ceph_s = (add_to_ceph_s_raw != 0);
+
+  PyThreadState *tstate = PyEval_SaveThread();
+  self->py_modules->update_progress_event(evid, desc, progress, add_to_ceph_s);
+  PyEval_RestoreThread(tstate);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes an event id string and forwards to
+// ActivePyModules::complete_progress_event().  The call is made with the
+// GIL released.  Returns None.
+static PyObject*
+ceph_complete_progress_event(BaseMgrModule *self, PyObject *args)
+{
+  char *event_id = nullptr;
+  const int parsed =
+    PyArg_ParseTuple(args, "s:ceph_complete_progress_event", &event_id);
+  if (parsed == 0) {
+    return nullptr;
+  }
+
+  PyThreadState *saved = PyEval_SaveThread();
+  self->py_modules->complete_progress_event(event_id);
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+// Python-callable: calls ActivePyModules::clear_all_progress_events() with
+// the GIL released.  `args` is not used.  Returns None.
+static PyObject*
+ceph_clear_all_progress_events(BaseMgrModule *self, PyObject *args)
+{
+  ActivePyModules *const mods = self->py_modules;
+
+  PyThreadState *saved = PyEval_SaveThread();
+  mods->clear_all_progress_events();
+  PyEval_RestoreThread(saved);
+
+  Py_RETURN_NONE;
+}
+
+
+
+// Python-callable: invoke `method` on another module.
+//
+// Args tuple: (other_module, method, args, kwargs).  Returns whatever
+// ActivePyModules::dispatch_remote() returns.  Raises ImportError when the
+// module is unknown, NameError when it has no such method, and RuntimeError
+// (carrying the reported error text) when dispatch_remote() returns nullptr.
+static PyObject *
+ceph_dispatch_remote(BaseMgrModule *self, PyObject *args)
+{
+  char *other_module = nullptr;
+  char *method = nullptr;
+  PyObject *remote_args = nullptr;
+  PyObject *remote_kwargs = nullptr;
+  const int parsed = PyArg_ParseTuple(
+    args, "ssOO:ceph_dispatch_remote",
+    &other_module, &method, &remote_args, &remote_kwargs);
+  if (parsed == 0) {
+    return nullptr;
+  }
+
+  ActivePyModules *const mods = self->py_modules;
+
+  // Early error handling, because if the module doesn't exist then we
+  // won't be able to use its thread state to set python error state
+  // inside dispatch_remote().
+  if (!mods->module_exists(other_module)) {
+    derr << "no module '" << other_module << "'" << dendl;
+    PyErr_SetString(PyExc_ImportError, "Module not found");
+    return nullptr;
+  }
+
+  // Drop GIL from calling python thread state, it will be taken
+  // both for checking for method existence and for executing method.
+  PyThreadState *saved = PyEval_SaveThread();
+
+  const bool have_method = mods->method_exists(other_module, method);
+  if (!have_method) {
+    PyEval_RestoreThread(saved);
+    PyErr_SetString(PyExc_NameError, "Method not found");
+    return nullptr;
+  }
+
+  std::string err;
+  PyObject *result = mods->dispatch_remote(other_module, method,
+                                           remote_args, remote_kwargs, &err);
+
+  PyEval_RestoreThread(saved);
+
+  if (result != nullptr) {
+    return result;
+  }
+
+  // Failure: surface the remote error text to the caller and to the log.
+  const std::string msg = std::string("Remote method threw exception: ") + err;
+  PyErr_SetString(PyExc_RuntimeError, msg.c_str());
+  derr << msg << dendl;
+  return nullptr;
+}
+
+// Python-callable: register an OSD perf query described by a dict.
+//
+// Expected shape of the single argument:
+//
+// {
+//   'key_descriptor': [
+//     {'type': subkey_type, 'regex': regex_pattern},
+//     ...
+//   ],
+//   'performance_counter_descriptors': [
+//     list, of, descriptor, types
+//   ],
+//   'limit': {'order_by': performance_counter_type, 'max_count': n},
+// }
+//
+// Returns the query id from ActivePyModules::add_osd_perf_query() as an
+// int.  Any problem with the dict is logged and None is returned instead.
+static PyObject*
+ceph_add_osd_perf_query(BaseMgrModule *self, PyObject *args)
+{
+  static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor";
+  static const std::string NAME_COUNTERS_DESCRIPTORS =
+    "performance_counter_descriptors";
+  static const std::string NAME_LIMIT = "limit";
+  static const std::string NAME_SUB_KEY_TYPE = "type";
+  static const std::string NAME_SUB_KEY_REGEX = "regex";
+  static const std::string NAME_LIMIT_ORDER_BY = "order_by";
+  static const std::string NAME_LIMIT_MAX_COUNT = "max_count";
+  static const std::map<std::string, OSDPerfMetricSubKeyType> sub_key_types = {
+    {"client_id", OSDPerfMetricSubKeyType::CLIENT_ID},
+    {"client_address", OSDPerfMetricSubKeyType::CLIENT_ADDRESS},
+    {"pool_id", OSDPerfMetricSubKeyType::POOL_ID},
+    {"namespace", OSDPerfMetricSubKeyType::NAMESPACE},
+    {"osd_id", OSDPerfMetricSubKeyType::OSD_ID},
+    {"pg_id", OSDPerfMetricSubKeyType::PG_ID},
+    {"object_name", OSDPerfMetricSubKeyType::OBJECT_NAME},
+    {"snap_id", OSDPerfMetricSubKeyType::SNAP_ID},
+  };
+  static const std::map<std::string, PerformanceCounterType> counter_types = {
+    {"ops", PerformanceCounterType::OPS},
+    {"write_ops", PerformanceCounterType::WRITE_OPS},
+    {"read_ops", PerformanceCounterType::READ_OPS},
+    {"bytes", PerformanceCounterType::BYTES},
+    {"write_bytes", PerformanceCounterType::WRITE_BYTES},
+    {"read_bytes", PerformanceCounterType::READ_BYTES},
+    {"latency", PerformanceCounterType::LATENCY},
+    {"write_latency", PerformanceCounterType::WRITE_LATENCY},
+    {"read_latency", PerformanceCounterType::READ_LATENCY},
+  };
+
+  // PyDict_Items() hands back a NEW reference.  This function has many
+  // early returns, so each such list is tied to a scope guard that drops
+  // the reference on every exit path.  The GIL is held throughout.
+  struct OwnedRef {
+    explicit OwnedRef(PyObject *o) : obj(o) {}
+    OwnedRef(const OwnedRef&) = delete;
+    OwnedRef& operator=(const OwnedRef&) = delete;
+    ~OwnedRef() { Py_XDECREF(obj); }
+    PyObject *obj;
+  };
+
+  PyObject *py_query = nullptr;
+  if (!PyArg_ParseTuple(args, "O:ceph_add_osd_perf_query", &py_query)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+  if (!PyDict_Check(py_query)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_NONE;
+  }
+
+  PyObject *query_params = PyDict_Items(py_query);
+  if (query_params == nullptr) {
+    return nullptr;
+  }
+  OwnedRef query_params_ref(query_params);
+  OSDPerfMetricQuery query;
+  std::optional<OSDPerfMetricLimit> limit;
+
+  for (int i = 0; i < PyList_Size(query_params); ++i) {
+    PyObject *kv = PyList_GET_ITEM(query_params, i);
+    char *query_param_name = nullptr;
+    PyObject *query_param_val = nullptr;
+    if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) {
+      // We report the problem by returning None, so the exception set by
+      // the failed parse must not be left pending.
+      PyErr_Clear();
+      derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl;
+      Py_RETURN_NONE;
+    }
+    if (query_param_name == NAME_KEY_DESCRIPTOR) {
+      if (!PyList_Check(query_param_val)) {
+        derr << __func__ << " " << query_param_name << " not a list" << dendl;
+        Py_RETURN_NONE;
+      }
+      for (int j = 0; j < PyList_Size(query_param_val); j++) {
+        PyObject *sub_key = PyList_GET_ITEM(query_param_val, j);
+        if (!PyDict_Check(sub_key)) {
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " not a dict" << dendl;
+          Py_RETURN_NONE;
+        }
+        OSDPerfMetricSubKeyDescriptor d;
+        PyObject *sub_key_params = PyDict_Items(sub_key);
+        if (sub_key_params == nullptr) {
+          return nullptr;
+        }
+        OwnedRef sub_key_params_ref(sub_key_params);
+        for (int k = 0; k < PyList_Size(sub_key_params); ++k) {
+          PyObject *pair = PyList_GET_ITEM(sub_key_params, k);
+          if (!PyTuple_Check(pair)) {
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " pair " << k << " not a tuple" << dendl;
+            Py_RETURN_NONE;
+          }
+          char *param_name = nullptr;
+          PyObject *param_value = nullptr;
+          if (!PyArg_ParseTuple(pair, "sO:pair", &param_name, &param_value)) {
+            PyErr_Clear();
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " pair " << k << " not a size 2 tuple" << dendl;
+            Py_RETURN_NONE;
+          }
+          if (param_name == NAME_SUB_KEY_TYPE) {
+            if (!PyUnicode_Check(param_value)) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid param " << param_name << dendl;
+              Py_RETURN_NONE;
+            }
+            auto type = PyUnicode_AsUTF8(param_value);
+            auto it = sub_key_types.find(type);
+            if (it == sub_key_types.end()) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid type " << dendl;
+              Py_RETURN_NONE;
+            }
+            d.type = it->second;
+          } else if (param_name == NAME_SUB_KEY_REGEX) {
+            if (!PyUnicode_Check(param_value)) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid param " << param_name << dendl;
+              Py_RETURN_NONE;
+            }
+            d.regex_str = PyUnicode_AsUTF8(param_value);
+            try {
+              d.regex = d.regex_str.c_str();
+            } catch (const std::regex_error& e) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid regex " << d.regex_str << dendl;
+              Py_RETURN_NONE;
+            }
+            // The regex must capture something to be usable as a sub key.
+            if (d.regex.mark_count() == 0) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " regex " << d.regex_str << ": no capturing groups"
+                   << dendl;
+              Py_RETURN_NONE;
+            }
+          } else {
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " contains invalid param " << param_name << dendl;
+            Py_RETURN_NONE;
+          }
+        }
+        // Both 'type' and 'regex' must have been supplied.
+        if (d.type == static_cast<OSDPerfMetricSubKeyType>(-1) ||
+            d.regex_str.empty()) {
+          // Report the index of the offending key_descriptor entry (j),
+          // not the index of the enclosing query parameter.
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " invalid" << dendl;
+          Py_RETURN_NONE;
+        }
+        query.key_descriptor.push_back(d);
+      }
+    } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) {
+      if (!PyList_Check(query_param_val)) {
+        derr << __func__ << " " << query_param_name << " not a list" << dendl;
+        Py_RETURN_NONE;
+      }
+      for (int j = 0; j < PyList_Size(query_param_val); j++) {
+        PyObject *py_type = PyList_GET_ITEM(query_param_val, j);
+        if (!PyUnicode_Check(py_type)) {
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " not a string" << dendl;
+          Py_RETURN_NONE;
+        }
+        auto type = PyUnicode_AsUTF8(py_type);
+        auto it = counter_types.find(type);
+        if (it == counter_types.end()) {
+          derr << __func__ << " query " << query_param_name << " item " << type
+               << " is not valid type" << dendl;
+          Py_RETURN_NONE;
+        }
+        query.performance_counter_descriptors.push_back(it->second);
+      }
+    } else if (query_param_name == NAME_LIMIT) {
+      if (!PyDict_Check(query_param_val)) {
+        derr << __func__ << " query " << query_param_name << " not a dict"
+             << dendl;
+        Py_RETURN_NONE;
+      }
+
+      limit = OSDPerfMetricLimit();
+      PyObject *limit_params = PyDict_Items(query_param_val);
+      if (limit_params == nullptr) {
+        return nullptr;
+      }
+      OwnedRef limit_params_ref(limit_params);
+
+      for (int j = 0; j < PyList_Size(limit_params); ++j) {
+        PyObject *limit_kv = PyList_GET_ITEM(limit_params, j);
+        char *limit_param_name = nullptr;
+        PyObject *limit_param_val = nullptr;
+        if (!PyArg_ParseTuple(limit_kv, "sO:pair", &limit_param_name,
+                              &limit_param_val)) {
+          PyErr_Clear();
+          derr << __func__ << " limit item " << j << " not a size 2 tuple"
+               << dendl;
+          Py_RETURN_NONE;
+        }
+
+        if (limit_param_name == NAME_LIMIT_ORDER_BY) {
+          if (!PyUnicode_Check(limit_param_val)) {
+            derr << __func__ << " " << limit_param_name << " not a string"
+                 << dendl;
+            Py_RETURN_NONE;
+          }
+          auto order_by = PyUnicode_AsUTF8(limit_param_val);
+          auto it = counter_types.find(order_by);
+          if (it == counter_types.end()) {
+            derr << __func__ << " limit " << limit_param_name
+                 << " not a valid counter type" << dendl;
+            Py_RETURN_NONE;
+          }
+          limit->order_by = it->second;
+        } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) {
+          if (!PyLong_Check(limit_param_val)) {
+            derr << __func__ << " " << limit_param_name << " not an int"
+                 << dendl;
+            Py_RETURN_NONE;
+          }
+          limit->max_count = PyLong_AsLong(limit_param_val);
+        } else {
+          derr << __func__ << " unknown limit param: " << limit_param_name
+               << dendl;
+          Py_RETURN_NONE;
+        }
+      }
+    } else {
+      derr << __func__ << " unknown query param: " << query_param_name << dendl;
+      Py_RETURN_NONE;
+    }
+  }
+
+  if (query.key_descriptor.empty() ||
+      query.performance_counter_descriptors.empty()) {
+    derr << __func__ << " invalid query" << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // A limit may only order by a counter the query actually collects.
+  if (limit) {
+    auto &ds = query.performance_counter_descriptors;
+    if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) {
+      derr << __func__ << " limit order_by " << limit->order_by
+           << " not in performance_counter_descriptors" << dendl;
+      Py_RETURN_NONE;
+    }
+  }
+
+  auto query_id = self->py_modules->add_osd_perf_query(query, limit);
+  return PyLong_FromLong(query_id);
+}
+
+// Python-callable: takes a query id and forwards to
+// ActivePyModules::remove_osd_perf_query().  Returns None.
+static PyObject*
+ceph_remove_osd_perf_query(BaseMgrModule *self, PyObject *args)
+{
+  MetricQueryID id{};
+  if (PyArg_ParseTuple(args, "i:ceph_remove_osd_perf_query", &id) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  ActivePyModules *const mods = self->py_modules;
+  mods->remove_osd_perf_query(id);
+  Py_RETURN_NONE;
+}
+
+// Python-callable: takes a query id and returns
+// ActivePyModules::get_osd_perf_counters() for it.
+static PyObject*
+ceph_get_osd_perf_counters(BaseMgrModule *self, PyObject *args)
+{
+  MetricQueryID id{};
+  if (PyArg_ParseTuple(args, "i:ceph_get_osd_perf_counters", &id) == 0) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  ActivePyModules *const mods = self->py_modules;
+  return mods->get_osd_perf_counters(id);
+}
+
+// MDS perf query interface -- mostly follows ceph_add_osd_perf_query()
+// style
+
+// Python binding: register an MDS perf metric query.
+//
+// Takes one argument, a dict shaped like:
+//
+//   {
+//     'key_descriptor': [
+//       {'type': subkey_type, 'regex': regex_pattern},
+//       ...
+//     ],
+//     'performance_counter_descriptors': [
+//       list, of, descriptor, types
+//     ],
+//     'limit': {'order_by': performance_counter_type, 'max_count': n},
+//   }
+//
+// Returns the new query id as a Python int.  Every validation failure is
+// logged with derr and reported to the caller as None with no Python
+// exception left pending.  nullptr (exception set) is returned only when
+// the argument tuple cannot be parsed or a dict's items cannot be listed.
+static PyObject*
+ceph_add_mds_perf_query(BaseMgrModule *self, PyObject *args)
+{
+  // PyDict_Items() hands back a *new* reference; this guard releases it on
+  // every exit path so the many early returns below cannot leak the list.
+  struct OwnedRef {
+    PyObject *obj;
+    explicit OwnedRef(PyObject *o) : obj(o) {}
+    ~OwnedRef() { Py_XDECREF(obj); }
+    OwnedRef(const OwnedRef&) = delete;
+    OwnedRef& operator=(const OwnedRef&) = delete;
+  };
+
+  static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor";
+  static const std::string NAME_COUNTERS_DESCRIPTORS =
+    "performance_counter_descriptors";
+  static const std::string NAME_LIMIT = "limit";
+  static const std::string NAME_SUB_KEY_TYPE = "type";
+  static const std::string NAME_SUB_KEY_REGEX = "regex";
+  static const std::string NAME_LIMIT_ORDER_BY = "order_by";
+  static const std::string NAME_LIMIT_MAX_COUNT = "max_count";
+  static const std::map<std::string, MDSPerfMetricSubKeyType> sub_key_types = {
+    {"mds_rank", MDSPerfMetricSubKeyType::MDS_RANK},
+    {"client_id", MDSPerfMetricSubKeyType::CLIENT_ID},
+  };
+  static const std::map<std::string, MDSPerformanceCounterType> counter_types = {
+    {"cap_hit", MDSPerformanceCounterType::CAP_HIT_METRIC},
+    {"read_latency", MDSPerformanceCounterType::READ_LATENCY_METRIC},
+    {"write_latency", MDSPerformanceCounterType::WRITE_LATENCY_METRIC},
+    {"metadata_latency", MDSPerformanceCounterType::METADATA_LATENCY_METRIC},
+    {"dentry_lease", MDSPerformanceCounterType::DENTRY_LEASE_METRIC},
+    {"opened_files", MDSPerformanceCounterType::OPENED_FILES_METRIC},
+    {"pinned_icaps", MDSPerformanceCounterType::PINNED_ICAPS_METRIC},
+    {"opened_inodes", MDSPerformanceCounterType::OPENED_INODES_METRIC},
+    {"read_io_sizes", MDSPerformanceCounterType::READ_IO_SIZES_METRIC},
+    {"write_io_sizes", MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC},
+    {"avg_read_latency", MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC},
+    {"stdev_read_latency", MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC},
+    {"avg_write_latency", MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC},
+    {"stdev_write_latency", MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC},
+    {"avg_metadata_latency", MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC},
+    {"stdev_metadata_latency", MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC},
+  };
+
+  PyObject *py_query = nullptr;
+  if (!PyArg_ParseTuple(args, "O:ceph_add_mds_perf_query", &py_query)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+  if (!PyDict_Check(py_query)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_NONE;
+  }
+
+  OwnedRef query_params(PyDict_Items(py_query));
+  if (!query_params.obj) {
+    return nullptr; // PyDict_Items() has already set the exception
+  }
+  MDSPerfMetricQuery query;
+  std::optional<MDSPerfMetricLimit> limit;
+
+  for (Py_ssize_t i = 0; i < PyList_Size(query_params.obj); ++i) {
+    PyObject *kv = PyList_GET_ITEM(query_params.obj, i);
+    char *query_param_name = nullptr;
+    PyObject *query_param_val = nullptr;
+    if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) {
+      // we answer with None, so the parse error must not stay pending
+      PyErr_Clear();
+      derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl;
+      Py_RETURN_NONE;
+    }
+    if (query_param_name == NAME_KEY_DESCRIPTOR) {
+      if (!PyList_Check(query_param_val)) {
+        derr << __func__ << " " << query_param_name << " not a list" << dendl;
+        Py_RETURN_NONE;
+      }
+      for (Py_ssize_t j = 0; j < PyList_Size(query_param_val); j++) {
+        PyObject *sub_key = PyList_GET_ITEM(query_param_val, j);
+        if (!PyDict_Check(sub_key)) {
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " not a dict" << dendl;
+          Py_RETURN_NONE;
+        }
+        MDSPerfMetricSubKeyDescriptor d;
+        OwnedRef sub_key_params(PyDict_Items(sub_key));
+        if (!sub_key_params.obj) {
+          return nullptr; // exception already set
+        }
+        for (Py_ssize_t k = 0; k < PyList_Size(sub_key_params.obj); ++k) {
+          PyObject *pair = PyList_GET_ITEM(sub_key_params.obj, k);
+          if (!PyTuple_Check(pair)) {
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " pair " << k << " not a tuple" << dendl;
+            Py_RETURN_NONE;
+          }
+          char *param_name = nullptr;
+          PyObject *param_value = nullptr;
+          if (!PyArg_ParseTuple(pair, "sO:pair", &param_name, &param_value)) {
+            PyErr_Clear();
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " pair " << k << " not a size 2 tuple" << dendl;
+            Py_RETURN_NONE;
+          }
+          if (param_name == NAME_SUB_KEY_TYPE) {
+            if (!PyUnicode_Check(param_value)) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid param " << param_name << dendl;
+              Py_RETURN_NONE;
+            }
+            // PyUnicode_AsUTF8() may fail and return NULL; never build a
+            // std::string key from that.
+            const char *type = PyUnicode_AsUTF8(param_value);
+            auto it = type ? sub_key_types.find(type) : sub_key_types.end();
+            if (it == sub_key_types.end()) {
+              PyErr_Clear();
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid type " << dendl;
+              Py_RETURN_NONE;
+            }
+            d.type = it->second;
+          } else if (param_name == NAME_SUB_KEY_REGEX) {
+            if (!PyUnicode_Check(param_value)) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid param " << param_name << dendl;
+              Py_RETURN_NONE;
+            }
+            const char *regex_str = PyUnicode_AsUTF8(param_value);
+            if (!regex_str) {
+              PyErr_Clear();
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid param " << param_name << dendl;
+              Py_RETURN_NONE;
+            }
+            d.regex_str = regex_str;
+            try {
+              d.regex = d.regex_str.c_str();
+            } catch (const std::regex_error& e) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " contains invalid regex " << d.regex_str << dendl;
+              Py_RETURN_NONE;
+            }
+            // the sub key value is taken from a capture group, so a regex
+            // without one cannot yield anything
+            if (d.regex.mark_count() == 0) {
+              derr << __func__ << " query " << query_param_name << " item " << j
+                   << " regex " << d.regex_str << ": no capturing groups"
+                   << dendl;
+              Py_RETURN_NONE;
+            }
+          } else {
+            derr << __func__ << " query " << query_param_name << " item " << j
+                 << " contains invalid param " << param_name << dendl;
+            Py_RETURN_NONE;
+          }
+        }
+        // both 'type' and 'regex' are mandatory for a sub key
+        if (d.type == static_cast<MDSPerfMetricSubKeyType>(-1) ||
+            d.regex_str.empty()) {
+          // report the index of the offending sub key (j), not of the
+          // enclosing top-level query parameter
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " invalid" << dendl;
+          Py_RETURN_NONE;
+        }
+        query.key_descriptor.push_back(d);
+      }
+    } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) {
+      if (!PyList_Check(query_param_val)) {
+        derr << __func__ << " " << query_param_name << " not a list" << dendl;
+        Py_RETURN_NONE;
+      }
+      for (Py_ssize_t j = 0; j < PyList_Size(query_param_val); j++) {
+        PyObject *py_type = PyList_GET_ITEM(query_param_val, j);
+        if (!PyUnicode_Check(py_type)) {
+          derr << __func__ << " query " << query_param_name << " item " << j
+               << " not a string" << dendl;
+          Py_RETURN_NONE;
+        }
+        const char *type = PyUnicode_AsUTF8(py_type);
+        auto it = type ? counter_types.find(type) : counter_types.end();
+        if (it == counter_types.end()) {
+          PyErr_Clear();
+          derr << __func__ << " query " << query_param_name << " item "
+               << (type ? type : "(undecodable)")
+               << " is not valid type" << dendl;
+          Py_RETURN_NONE;
+        }
+        query.performance_counter_descriptors.push_back(it->second);
+      }
+    } else if (query_param_name == NAME_LIMIT) {
+      if (!PyDict_Check(query_param_val)) {
+        derr << __func__ << " query " << query_param_name << " not a dict"
+             << dendl;
+        Py_RETURN_NONE;
+      }
+
+      limit = MDSPerfMetricLimit();
+      OwnedRef limit_params(PyDict_Items(query_param_val));
+      if (!limit_params.obj) {
+        return nullptr; // exception already set
+      }
+
+      for (Py_ssize_t j = 0; j < PyList_Size(limit_params.obj); ++j) {
+        PyObject *limit_kv = PyList_GET_ITEM(limit_params.obj, j);
+        char *limit_param_name = nullptr;
+        PyObject *limit_param_val = nullptr;
+        if (!PyArg_ParseTuple(limit_kv, "sO:pair", &limit_param_name,
+                              &limit_param_val)) {
+          PyErr_Clear();
+          derr << __func__ << " limit item " << j << " not a size 2 tuple"
+               << dendl;
+          Py_RETURN_NONE;
+        }
+
+        if (limit_param_name == NAME_LIMIT_ORDER_BY) {
+          if (!PyUnicode_Check(limit_param_val)) {
+            derr << __func__ << " " << limit_param_name << " not a string"
+                 << dendl;
+            Py_RETURN_NONE;
+          }
+          const char *order_by = PyUnicode_AsUTF8(limit_param_val);
+          auto it = order_by ? counter_types.find(order_by)
+                             : counter_types.end();
+          if (it == counter_types.end()) {
+            PyErr_Clear();
+            derr << __func__ << " limit " << limit_param_name
+                 << " not a valid counter type" << dendl;
+            Py_RETURN_NONE;
+          }
+          limit->order_by = it->second;
+        } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) {
+#if PY_MAJOR_VERSION <= 2
+          if (!PyInt_Check(limit_param_val) && !PyLong_Check(limit_param_val)) {
+#else
+          if (!PyLong_Check(limit_param_val)) {
+#endif
+            derr << __func__ << " " << limit_param_name << " not an int"
+                 << dendl;
+            Py_RETURN_NONE;
+          }
+          // PyLong_AsLong() signals failure (e.g. overflow) with -1 plus a
+          // pending exception
+          const long max_count = PyLong_AsLong(limit_param_val);
+          if (max_count == -1 && PyErr_Occurred()) {
+            PyErr_Clear();
+            derr << __func__ << " " << limit_param_name << " out of range"
+                 << dendl;
+            Py_RETURN_NONE;
+          }
+          limit->max_count = max_count;
+        } else {
+          derr << __func__ << " unknown limit param: " << limit_param_name
+               << dendl;
+          Py_RETURN_NONE;
+        }
+      }
+    } else {
+      derr << __func__ << " unknown query param: " << query_param_name << dendl;
+      Py_RETURN_NONE;
+    }
+  }
+
+  if (query.key_descriptor.empty()) {
+    derr << __func__ << " invalid query" << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // a limit can only order by a counter the query actually collects
+  if (limit) {
+    auto &ds = query.performance_counter_descriptors;
+    if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) {
+      derr << __func__ << " limit order_by " << limit->order_by
+           << " not in performance_counter_descriptors" << dendl;
+      Py_RETURN_NONE;
+    }
+  }
+
+  auto query_id = self->py_modules->add_mds_perf_query(query, limit);
+  return PyLong_FromLong(query_id);
+}
+
+// Python binding: remove an MDS perf query.
+// Takes the integer query id; returns None, or nullptr when the argument
+// cannot be parsed.
+static PyObject*
+ceph_remove_mds_perf_query(BaseMgrModule *self, PyObject *args)
+{
+  MetricQueryID query_id;
+  const int parsed =
+    PyArg_ParseTuple(args, "i:ceph_remove_mds_perf_query", &query_id);
+  if (parsed) {
+    self->py_modules->remove_mds_perf_query(query_id);
+    Py_RETURN_NONE;
+  }
+
+  derr << "Invalid args!" << dendl;
+  return nullptr;
+}
+
+// Python binding: ask ActivePyModules to re-register the MDS perf queries.
+// Registered as METH_NOARGS, so `args` is not inspected.  Always returns None.
+static PyObject*
+ceph_reregister_mds_perf_queries(BaseMgrModule *self, PyObject *args)
+{
+  auto *modules = self->py_modules;
+  modules->reregister_mds_perf_queries();
+  Py_RETURN_NONE;
+}
+
+// Python binding: fetch the counters collected for one MDS perf query.
+// Takes the integer query id and returns whatever
+// ActivePyModules::get_mds_perf_counters() produces; nullptr when the
+// argument cannot be parsed.
+static PyObject*
+ceph_get_mds_perf_counters(BaseMgrModule *self, PyObject *args)
+{
+  MetricQueryID query_id;
+  const int parsed =
+    PyArg_ParseTuple(args, "i:ceph_get_mds_perf_counters", &query_id);
+  if (parsed) {
+    return self->py_modules->get_mds_perf_counters(query_id);
+  }
+
+  derr << "Invalid args!" << dendl;
+  return nullptr;
+}
+
+// Python binding: check whether the calling module is authorized for the
+// given arguments.
+//
+// Takes one dict of str -> str.  Entries that are not (str, str) pairs are
+// logged and skipped.  Returns True/False from
+// ActivePyModule::is_authorized(); False as well when the argument is not a
+// dict.  nullptr (exception set) only when the argument tuple cannot be
+// parsed or the dict's items cannot be listed.
+static PyObject*
+ceph_is_authorized(BaseMgrModule *self, PyObject *args)
+{
+  PyObject *args_dict = NULL;
+  if (!PyArg_ParseTuple(args, "O:ceph_is_authorized", &args_dict)) {
+    return nullptr;
+  }
+
+  if (!PyDict_Check(args_dict)) {
+    derr << __func__ << " arg not a dict" << dendl;
+    Py_RETURN_FALSE;
+  }
+
+  std::map<std::string, std::string> arguments;
+
+  // PyDict_Items() returns a new reference that we must release
+  PyObject *args_list = PyDict_Items(args_dict);
+  if (!args_list) {
+    return nullptr; // exception already set
+  }
+  const Py_ssize_t num_items = PyList_Size(args_list);
+  for (Py_ssize_t i = 0; i < num_items; ++i) {
+    PyObject *kv = PyList_GET_ITEM(args_list, i);
+
+    char *arg_key = nullptr;
+    char *arg_value = nullptr;
+    if (!PyArg_ParseTuple(kv, "ss:pair", &arg_key, &arg_value)) {
+      // The entry is skipped on purpose, so drop the parse error; otherwise
+      // we would return a bool with an exception still pending.
+      PyErr_Clear();
+      derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl;
+      continue;
+    }
+
+    // copies both strings, so nothing below depends on args_list
+    arguments[arg_key] = arg_value;
+  }
+  Py_DECREF(args_list);
+
+  // drop the GIL while the authorization check runs
+  PyThreadState *tstate = PyEval_SaveThread();
+  bool r = self->this_module->is_authorized(arguments);
+  PyEval_RestoreThread(tstate);
+
+  if (r) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+
+// Python binding: register a client address string under this module's
+// name via ActivePyModules::register_client().  The GIL is released for the
+// duration of that call.  Returns None; nullptr if the argument is not a
+// string.
+static PyObject*
+ceph_register_client(BaseMgrModule *self, PyObject *args)
+{
+  char *addrs = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "s:ceph_register_client", &addrs);
+  if (!parsed) {
+    return nullptr;
+  }
+
+  PyThreadState *saved_state = PyEval_SaveThread();
+  self->py_modules->register_client(self->this_module->get_name(), addrs);
+  PyEval_RestoreThread(saved_state);
+
+  Py_RETURN_NONE;
+}
+
+// Python binding: counterpart of ceph_register_client; forwards the address
+// string to ActivePyModules::unregister_client() with the GIL released.
+// Returns None; nullptr if the argument is not a string.
+static PyObject*
+ceph_unregister_client(BaseMgrModule *self, PyObject *args)
+{
+  char *addrs = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "s:ceph_unregister_client", &addrs);
+  if (!parsed) {
+    return nullptr;
+  }
+
+  PyThreadState *saved_state = PyEval_SaveThread();
+  self->py_modules->unregister_client(self->this_module->get_name(), addrs);
+  PyEval_RestoreThread(saved_state);
+
+  Py_RETURN_NONE;
+}
+
+// Method table of ceph_module.BaseMgrModule (wired in through tp_methods).
+// Each entry is {python name, C function, calling convention, doc string};
+// the all-NULL entry terminates the table.
+PyMethodDef BaseMgrModule_methods[] = {
+  {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
+   "Get a cluster object"},
+
+  {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS,
+   "Get a server object"},
+
+  {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS,
+   "Get a service's metadata"},
+
+  {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS,
+   "Get a service's status"},
+
+  {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS,
+   "Send a mon command"},
+
+  {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS,
+   "Set health checks for this module"},
+
+  {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+   "Get the name of the Mgr daemon where we are running"},
+
+  {"_ceph_get_ceph_conf_path", (PyCFunction)ceph_get_ceph_conf_path, METH_NOARGS,
+   "Get path to ceph.conf"},
+
+  {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS,
+   "Get a native configuration option value"},
+
+  {"_ceph_get_foreign_option", (PyCFunction)ceph_foreign_option_get, METH_VARARGS,
+   "Get a native configuration option value for another entity"},
+
+  {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS,
+   "Get a module configuration option value"},
+
+  {"_ceph_get_store_prefix", (PyCFunction)ceph_store_get_prefix, METH_VARARGS,
+   "Get all KV store values with a given prefix"},
+
+  {"_ceph_set_module_option", (PyCFunction)ceph_set_module_option, METH_VARARGS,
+   "Set a module configuration option value"},
+
+  {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS,
+   "Get a stored field"},
+
+  {"_ceph_set_store", (PyCFunction)ceph_store_set, METH_VARARGS,
+   "Set a stored field"},
+
+  {"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS,
+   "Get a performance counter"},
+
+  {"_ceph_get_latest_counter", (PyCFunction)get_latest_counter, METH_VARARGS,
+   "Get the latest performance counter"},
+
+  {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS,
+   "Get the performance counter schema"},
+
+  {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+   "Emit a (local) log message"},
+
+  {"_ceph_cluster_log", (PyCFunction)ceph_cluster_log, METH_VARARGS,
+   "Emit a cluster log message"},
+
+  {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_NOARGS,
+   "Get the ceph version of this process"},
+
+  {"_ceph_get_release_name", (PyCFunction)ceph_get_release_name, METH_NOARGS,
+   "Get the ceph release name of this process"},
+
+  {"_ceph_lookup_release_name", (PyCFunction)ceph_lookup_release_name, METH_VARARGS,
+   "Get the ceph release name for a given major number"},
+
+  {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS,
+   "Get a CephContext* in a python capsule"},
+
+  {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS,
+   "Get an OSDMap* in a python capsule"},
+
+  {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS,
+   "Advertize a service URI served by this module"},
+
+  {"_ceph_set_device_wear_level", (PyCFunction)ceph_set_wear_level, METH_VARARGS,
+   "Set device wear_level value"},
+
+  {"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection,
+   METH_NOARGS, "Find out whether this mgr daemon currently has "
+                "a connection to a monitor"},
+
+  {"_ceph_update_progress_event", (PyCFunction)ceph_update_progress_event,
+   METH_VARARGS, "Update status of a progress event"},
+  {"_ceph_complete_progress_event", (PyCFunction)ceph_complete_progress_event,
+   METH_VARARGS, "Complete a progress event"},
+  {"_ceph_clear_all_progress_events", (PyCFunction)ceph_clear_all_progress_events,
+   METH_NOARGS, "Clear all progress events"},
+
+  {"_ceph_dispatch_remote", (PyCFunction)ceph_dispatch_remote,
+   METH_VARARGS, "Dispatch a call to another module"},
+
+  {"_ceph_add_osd_perf_query", (PyCFunction)ceph_add_osd_perf_query,
+   METH_VARARGS, "Add an osd perf query"},
+
+  {"_ceph_remove_osd_perf_query", (PyCFunction)ceph_remove_osd_perf_query,
+   METH_VARARGS, "Remove an osd perf query"},
+
+  {"_ceph_get_osd_perf_counters", (PyCFunction)ceph_get_osd_perf_counters,
+   METH_VARARGS, "Get osd perf counters"},
+
+  // The three mds entries below used to carry doc strings copy-pasted from
+  // their osd counterparts.
+  {"_ceph_add_mds_perf_query", (PyCFunction)ceph_add_mds_perf_query,
+   METH_VARARGS, "Add an mds perf query"},
+
+  {"_ceph_remove_mds_perf_query", (PyCFunction)ceph_remove_mds_perf_query,
+   METH_VARARGS, "Remove an mds perf query"},
+
+  {"_ceph_reregister_mds_perf_queries", (PyCFunction)ceph_reregister_mds_perf_queries,
+   METH_NOARGS, "Re-register mds perf queries"},
+
+  {"_ceph_get_mds_perf_counters", (PyCFunction)ceph_get_mds_perf_counters,
+   METH_VARARGS, "Get mds perf counters"},
+
+  {"_ceph_is_authorized", (PyCFunction)ceph_is_authorized,
+   METH_VARARGS, "Verify the current session caps are valid"},
+
+  {"_ceph_register_client", (PyCFunction)ceph_register_client,
+   METH_VARARGS, "Register RADOS instance for potential blocklisting"},
+
+  {"_ceph_unregister_client", (PyCFunction)ceph_unregister_client,
+   METH_VARARGS, "Unregister RADOS instance for potential blocklisting"},
+
+  {NULL, NULL, 0, NULL}
+};
+
+
+// tp_new slot: allocate an instance through the type's tp_alloc.  `args`
+// and `kwds` are ignored here; the fields are filled in by
+// BaseMgrModule_init.
+static PyObject *
+BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+  BaseMgrModule *instance = (BaseMgrModule *)type->tp_alloc(type, 0);
+  return (PyObject *)instance;
+}
+
+// tp_init slot: takes two capsules ("py_modules", "this_module"), unwraps
+// them and stores the ActivePyModules* / ActivePyModule* on the instance.
+// Returns 0 on success and -1 if the arguments cannot be parsed; a capsule
+// that yields a null pointer trips ceph_assert.
+static int
+BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds)
+{
+  static const char *kwlist[] = {"py_modules", "this_module", NULL};
+  PyObject *py_modules_capsule = nullptr;
+  PyObject *this_module_capsule = nullptr;
+
+  const int parsed = PyArg_ParseTupleAndKeywords(
+    args, kwds, "OO", const_cast<char**>(kwlist),
+    &py_modules_capsule, &this_module_capsule);
+  if (!parsed) {
+    return -1;
+  }
+
+  void *py_modules_ptr = PyCapsule_GetPointer(py_modules_capsule, nullptr);
+  self->py_modules = static_cast<ActivePyModules*>(py_modules_ptr);
+  ceph_assert(self->py_modules);
+
+  void *this_module_ptr = PyCapsule_GetPointer(this_module_capsule, nullptr);
+  self->this_module = static_cast<ActivePyModule*>(this_module_ptr);
+  ceph_assert(self->this_module);
+
+  return 0;
+}
+
+ PyTypeObject BaseMgrModuleType = { // type object for ceph_module.BaseMgrModule; slots initialised to 0 are not supplied here
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BaseMgrModule", /* tp_name */
+ sizeof(BaseMgrModule), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags: BASETYPE lets Python classes derive from this type */
+ "ceph-mgr Python Plugin", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BaseMgrModule_methods, /* tp_methods: the _ceph_* method table */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BaseMgrModule_init, /* tp_init: unwraps the py_modules / this_module capsules */
+ 0, /* tp_alloc */
+ BaseMgrModule_new, /* tp_new: allocates through type->tp_alloc */
+ };
diff --git a/src/mgr/BaseMgrModule.h b/src/mgr/BaseMgrModule.h
new file mode 100644
index 000000000..2c2e5deb3
--- /dev/null
+++ b/src/mgr/BaseMgrModule.h
@@ -0,0 +1,7 @@
+
+#pragma once
+
+#include "Python.h"
+
+ extern PyTypeObject BaseMgrModuleType; // Python type object named "ceph_module.BaseMgrModule"; defined in BaseMgrModule.cc
+
diff --git a/src/mgr/BaseMgrStandbyModule.cc b/src/mgr/BaseMgrStandbyModule.cc
new file mode 100644
index 000000000..6f35088d0
--- /dev/null
+++ b/src/mgr/BaseMgrStandbyModule.cc
@@ -0,0 +1,269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "BaseMgrStandbyModule.h"
+
+#include "StandbyPyModules.h"
+#include "PyFormatter.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+// Instance layout of ceph_module.BaseMgrStandbyModule: the standard Python
+// object header followed by the StandbyPyModule this object fronts for
+// (set in BaseMgrStandbyModule_init).
+struct BaseMgrStandbyModule {
+  PyObject_HEAD
+  StandbyPyModule *this_module;
+};
+
+// tp_new slot: allocate an instance through the type's tp_alloc.  `args`
+// and `kwds` are ignored here; this_module is set by
+// BaseMgrStandbyModule_init.
+static PyObject *
+BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+  BaseMgrStandbyModule *instance =
+    (BaseMgrStandbyModule *)type->tp_alloc(type, 0);
+  return (PyObject *)instance;
+}
+
+// tp_init slot: takes one capsule ("this_module"), unwraps it and stores
+// the StandbyPyModule* on the instance.  Returns 0 on success and -1 if the
+// arguments cannot be parsed; a capsule that yields a null pointer trips
+// ceph_assert.
+static int
+BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds)
+{
+  static const char *kwlist[] = {"this_module", NULL};
+  PyObject *this_module_capsule = nullptr;
+
+  const int parsed = PyArg_ParseTupleAndKeywords(
+    args, kwds, "O", const_cast<char**>(kwlist), &this_module_capsule);
+  if (!parsed) {
+    return -1;
+  }
+
+  void *module_ptr = PyCapsule_GetPointer(this_module_capsule, nullptr);
+  self->this_module = static_cast<StandbyPyModule*>(module_ptr);
+  ceph_assert(self->this_module);
+
+  return 0;
+}
+
+// Python binding: return the id part of this daemon's configured name as a
+// Python str.
+static PyObject*
+ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args)
+{
+  const auto& mgr_id = g_conf()->name.get_id();
+  return PyUnicode_FromString(mgr_id.c_str());
+}
+
+// Python binding: look up a module option.
+//
+// Arguments: the option name and an optional prefix.  With a prefix,
+// "<prefix>/<name>" is tried first and the bare name is the fallback.  The
+// lookups run with the GIL released.  Returns the value converted by
+// get_typed_option_value(), None if neither key exists, or nullptr when the
+// arguments cannot be parsed.
+static PyObject*
+ceph_get_module_option(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  char *prefix = nullptr;
+  if (!PyArg_ParseTuple(args, "s|s:ceph_get_module_option", &what, &prefix)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  PyThreadState *saved_state = PyEval_SaveThread();
+  std::string final_key;
+  std::string value;
+  bool found = false;
+  if (prefix) {
+    // prefixed key takes precedence
+    final_key = std::string(prefix) + "/" + what;
+    found = self->this_module->get_config(final_key, &value);
+  }
+  if (!found) {
+    final_key = what;
+    found = self->this_module->get_config(final_key, &value);
+  }
+  PyEval_RestoreThread(saved_state);
+
+  if (!found) {
+    if (prefix) {
+      dout(4) << __func__ << " [" << prefix << "/]" << what << " not found "
+              << dendl;
+    } else {
+      dout(4) << __func__ << " " << what << " not found " << dendl;
+    }
+    Py_RETURN_NONE;
+  }
+
+  dout(10) << __func__ << " " << final_key << " found: " << value
+           << dendl;
+  return self->this_module->py_module->get_typed_option_value(what, value);
+}
+
+// Python binding: read a native configuration option by name.
+// Returns its value as a Python str, None when get_val() reports an error,
+// or nullptr when the argument cannot be parsed.
+static PyObject*
+ceph_option_get(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  std::string value;
+  const int r = g_conf().get_val(std::string(what), &value);
+  if (r < 0) {
+    dout(4) << "ceph_option_get " << what << " not found " << dendl;
+    Py_RETURN_NONE;
+  }
+
+  dout(10) << "ceph_option_get " << what << " found: " << value << dendl;
+  return PyUnicode_FromString(value.c_str());
+}
+
+// Python binding: fetch one KV store value by key.
+// The lookup runs with the GIL released.  Returns the value as a Python
+// str, None if the key is absent, or nullptr when the argument cannot be
+// parsed.
+static PyObject*
+ceph_store_get(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *what = nullptr;
+  if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) {
+    derr << "Invalid args!" << dendl;
+    return nullptr;
+  }
+
+  // Drop GIL for blocking mon command execution
+  PyThreadState *saved_state = PyEval_SaveThread();
+  std::string value;
+  const bool found = self->this_module->get_store(what, &value);
+  PyEval_RestoreThread(saved_state);
+
+  if (!found) {
+    dout(4) << "ceph_store_get " << what << " not found " << dendl;
+    Py_RETURN_NONE;
+  }
+
+  dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl;
+  return PyUnicode_FromString(value.c_str());
+}
+
+// Python binding: return StandbyPyModule::get_active_uri() as a Python str.
+static PyObject*
+ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args)
+{
+  const auto& active_uri = self->this_module->get_active_uri();
+  return PyUnicode_FromString(active_uri.c_str());
+}
+
+// Python binding: forward one log record string to StandbyPyModule::log().
+// Returns None; nullptr if the argument is not a string.
+static PyObject*
+ceph_log(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *message = nullptr;
+  const int parsed = PyArg_ParseTuple(args, "s:log", &message);
+  if (!parsed) {
+    return nullptr;
+  }
+
+  StandbyPyModule *module = self->this_module;
+  ceph_assert(module);
+  module->log(message);
+
+  Py_RETURN_NONE;
+}
+
+// Python binding behind "_ceph_get" on a standby module.
+//
+// Takes the name of the data set to fetch.  Only "mgr_ips" is known: it
+// yields an "ips" array holding each distinct IP (without port) of this
+// daemon's addresses.  Any other name is logged and answered with None.
+// nullptr is returned if the argument is not a string.
+static PyObject*
+ceph_standby_state_get(BaseMgrStandbyModule *self, PyObject *args)
+{
+  char *whatc = NULL;
+  if (!PyArg_ParseTuple(args, "s:ceph_state_get", &whatc)) {
+    return NULL;
+  }
+  const std::string what(whatc);
+
+  PyFormatter f;
+
+  // Run without the GIL from here on; every path below re-takes it (via
+  // with_gil_t) before it touches the PyFormatter or returns.
+  without_gil_t no_gil;
+
+  if (what != "mgr_ips") {
+    derr << "Python module requested unknown data '" << what << "'" << dendl;
+    with_gil_t with_gil{no_gil};
+    Py_RETURN_NONE;
+  }
+
+  entity_addrvec_t myaddrs = self->this_module->get_myaddrs();
+  with_gil_t with_gil{no_gil};
+  f.open_array_section("ips");
+  std::set<std::string> seen;
+  for (auto& addr : myaddrs.v) {
+    std::string ip = addr.ip_only_to_str();
+    // emit every address only once
+    const bool first_time = seen.insert(ip).second;
+    if (first_time) {
+      f.dump_string("ip", ip);
+    }
+  }
+  f.close_section();
+  return f.get();
+}
+
+
+ PyMethodDef BaseMgrStandbyModule_methods[] = { // method table of BaseMgrStandbyModule: {python name, C function, calling convention, doc string}
+ {"_ceph_get", (PyCFunction)ceph_standby_state_get, METH_VARARGS,
+ "Get a cluster object (standby)"},
+
+ {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS,
+ "Get the name of the Mgr daemon where we are running"},
+
+ {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS,
+ "Get a module configuration option value"},
+
+ {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS,
+ "Get a native configuration option value"},
+
+ {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS,
+ "Get a KV store value"},
+
+ {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS,
+ "Get the URI of the active instance of this module, if any"},
+
+ {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS,
+ "Emit a log message"},
+
+ {NULL, NULL, 0, NULL} // all-NULL terminator entry
+ };
+
+ PyTypeObject BaseMgrStandbyModuleType = { // type object for ceph_module.BaseMgrStandbyModule; slots initialised to 0 are not supplied here
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BaseMgrStandbyModule", /* tp_name */
+ sizeof(BaseMgrStandbyModule), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags: BASETYPE lets Python classes derive from this type */
+ "ceph-mgr Standby Python Plugin", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BaseMgrStandbyModule_methods, /* tp_methods: the standby _ceph_* method table */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BaseMgrStandbyModule_init, /* tp_init: unwraps the this_module capsule */
+ 0, /* tp_alloc */
+ BaseMgrStandbyModule_new, /* tp_new: allocates through type->tp_alloc */
+ };
diff --git a/src/mgr/BaseMgrStandbyModule.h b/src/mgr/BaseMgrStandbyModule.h
new file mode 100644
index 000000000..82bda9105
--- /dev/null
+++ b/src/mgr/BaseMgrStandbyModule.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <Python.h>
+
+ extern PyTypeObject BaseMgrStandbyModuleType; // Python type object named "ceph_module.BaseMgrStandbyModule"; defined in BaseMgrStandbyModule.cc
+
diff --git a/src/mgr/CMakeLists.txt b/src/mgr/CMakeLists.txt
new file mode 100644
index 000000000..8e152e060
--- /dev/null
+++ b/src/mgr/CMakeLists.txt
@@ -0,0 +1,46 @@
+ add_library(mgr_cap_obj OBJECT # object library; declared outside the WITH_MGR guard, so it is defined either way
+ MgrCap.cc)
+
+ if(WITH_MGR) # everything up to endif() builds and installs the ceph-mgr executable
+ set(mgr_srcs
+ ${CMAKE_SOURCE_DIR}/src/ceph_mgr.cc
+ ${CMAKE_SOURCE_DIR}/src/mon/PGMap.cc
+ ${CMAKE_SOURCE_DIR}/src/mon/ConfigMap.cc
+ ActivePyModule.cc
+ ActivePyModules.cc
+ BaseMgrModule.cc
+ BaseMgrStandbyModule.cc
+ ClusterState.cc
+ DaemonHealthMetricCollector.cc
+ DaemonKey.cc
+ DaemonServer.cc
+ DaemonState.cc
+ Gil.cc
+ Mgr.cc
+ mgr_perf_counters.cc
+ MgrStandby.cc
+ MetricCollector.cc
+ OSDPerfMetricTypes.cc
+ OSDPerfMetricCollector.cc
+ MDSPerfMetricTypes.cc
+ MDSPerfMetricCollector.cc
+ PyFormatter.cc
+ PyUtil.cc
+ PyModule.cc
+ PyModuleRegistry.cc
+ PyModuleRunner.cc
+ PyOSDMap.cc
+ StandbyPyModules.cc
+ mgr_commands.cc
+ $<TARGET_OBJECTS:mgr_cap_obj>) # reuse the MgrCap.cc objects compiled by mgr_cap_obj above
+ add_executable(ceph-mgr ${mgr_srcs})
+ target_compile_definitions(ceph-mgr PRIVATE PY_SSIZE_T_CLEAN) # PRIVATE: the define applies to ceph-mgr's own sources only
+ target_link_libraries(ceph-mgr
+ osdc client heap_profiler
+ global-static ceph-common
+ Boost::python${MGR_PYTHON_VERSION_MAJOR}${MGR_PYTHON_VERSION_MINOR}
+ Python3::Python ${CMAKE_DL_LIBS} ${GSSAPI_LIBRARIES})
+ set_target_properties(ceph-mgr PROPERTIES
+ POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) # PIC setting follows EXE_LINKER_USE_PIE
+ install(TARGETS ceph-mgr DESTINATION bin)
+ endif()
diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc
new file mode 100644
index 000000000..28340d56d
--- /dev/null
+++ b/src/mgr/ClusterState.cc
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "messages/MMgrDigest.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MPGStats.h"
+
+#include "mgr/ClusterState.h"
+#include <time.h>
+#include <boost/range/adaptor/reversed.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+// Stores the MonClient and Objecter pointers, copies the initial MgrMap and
+// starts without an admin socket hook (one is created in final_init()).
+ClusterState::ClusterState(MonClient *monc_,
+                           Objecter *objecter_,
+                           const MgrMap& mgrmap)
+  : monc(monc_), objecter(objecter_), mgr_map(mgrmap), asok_hook(nullptr)
+{
+}
+
+// Replace the Objecter pointer while holding `lock`.
+void ClusterState::set_objecter(Objecter *objecter_)
+{
+  std::lock_guard locker(lock);
+  objecter = objecter_;
+}
+
+// Replace the cached FSMap with a copy of `new_fsmap` while holding `lock`.
+void ClusterState::set_fsmap(FSMap const &new_fsmap)
+{
+  std::lock_guard locker(lock);
+  fsmap = new_fsmap;
+}
+
+// Replace the cached MgrMap with a copy of `new_mgrmap` while holding `lock`.
+void ClusterState::set_mgr_map(MgrMap const &new_mgrmap)
+{
+  std::lock_guard locker(lock);
+
+  mgr_map = new_mgrmap;
+}
+
+// Replace the cached ServiceMap with a copy of `new_service_map` while
+// holding `lock`.
+void ClusterState::set_service_map(ServiceMap const &new_service_map)
+{
+  std::lock_guard locker(lock);
+
+  servicemap = new_service_map;
+}
+
+// Take over the health and mon-status JSON blobs from a digest message.
+// Both are moved out of `m`, so the message no longer holds them afterwards.
+void ClusterState::load_digest(MMgrDigest *m)
+{
+  std::lock_guard locker(lock);
+
+  mon_status_json = std::move(m->mon_status_json);
+  health_json = std::move(m->health_json);
+}
+
+// Fold one MPGStats report into pending_inc (under `lock`).
+//
+// - OSD stats: taken as reported when the sending OSD is "in" according to
+//   the OSDMap, otherwise replaced by an empty osd_stat_t that only keeps
+//   the reported seq.
+// - PG stats: dropped when the pool is not in existing_pools, when the
+//   placement seed is >= the pool's pg_num, or when pg_map already holds a
+//   newer version for that PG.
+// - Pool statfs: recorded per (pool, reporting osd).
+void ClusterState::ingest_pgstats(ref_t<MPGStats> stats)
+{
+  std::lock_guard l(lock);
+
+  const int from = stats->get_orig_source().num();
+  bool is_in = with_osdmap([from](const OSDMap& osdmap) {
+    return osdmap.is_in(from);
+  });
+
+  if (is_in) {
+    pending_inc.update_stat(from, std::move(stats->osd_stat));
+  } else {
+    osd_stat_t empty_stat;
+    empty_stat.seq = stats->osd_stat.seq;
+    pending_inc.update_stat(from, std::move(empty_stat));
+  }
+
+  // iterate by const reference: the entries are only read here, so there
+  // is no reason to copy each one per iteration
+  for (const auto &p : stats->pg_stat) {
+    const pg_t &pgid = p.first;
+    const auto &pg_stats = p.second;
+
+    // In case we're hearing about a PG that according to last
+    // OSDMap update should not exist
+    auto r = existing_pools.find(pgid.pool());
+    if (r == existing_pools.end()) {
+      dout(15) << " got " << pgid
+               << " reported at " << pg_stats.reported_epoch << ":"
+               << pg_stats.reported_seq
+               << " state " << pg_state_string(pg_stats.state)
+               << " but pool not in " << existing_pools
+               << dendl;
+      continue;
+    }
+    if (pgid.ps() >= r->second) {
+      dout(15) << " got " << pgid
+               << " reported at " << pg_stats.reported_epoch << ":"
+               << pg_stats.reported_seq
+               << " state " << pg_state_string(pg_stats.state)
+               << " but > pg_num " << r->second
+               << dendl;
+      continue;
+    }
+    // In case we already heard about more recent stats from this PG
+    // from another OSD
+    const auto q = pg_map.pg_stat.find(pgid);
+    if (q != pg_map.pg_stat.end() &&
+        q->second.get_version_pair() > pg_stats.get_version_pair()) {
+      dout(15) << " had " << pgid << " from "
+               << q->second.reported_epoch << ":"
+               << q->second.reported_seq << dendl;
+      continue;
+    }
+
+    pending_inc.pg_stat_updates[pgid] = pg_stats;
+  }
+  for (const auto &p : stats->pool_stat) {
+    pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second;
+  }
+}
+
+// Stamp pending_inc, dump pg_map and pending_inc at debug level 30, apply
+// the incremental to pg_map and start over with an empty incremental.
+void ClusterState::update_delta_stats()
+{
+  pending_inc.stamp = ceph_clock_now();
+  // apply_incremental wants exactly the next version
+  pending_inc.version = pg_map.version + 1;
+  dout(10) << " v" << pending_inc.version << dendl;
+
+  // The statements between each dout(30) and its "*_dout << dendl" sit
+  // inside the block the dout macro opens, which is why each dump can
+  // declare its own formatter.
+  dout(30) << " pg_map before:\n";
+    JSONFormatter before_fmt(true);
+    before_fmt.dump_object("pg_map", pg_map);
+    before_fmt.flush(*_dout);
+  *_dout << dendl;
+
+  dout(30) << " incremental:\n";
+    JSONFormatter inc_fmt(true);
+    inc_fmt.dump_object("pending_inc", pending_inc);
+    inc_fmt.flush(*_dout);
+  *_dout << dendl;
+
+  pg_map.apply_incremental(g_ceph_context, pending_inc);
+  pending_inc = PGMap::Incremental();
+}
+
+// React to a new OSDMap; the caller must already hold `lock`.
+//
+// Lets PGMapUpdater reconcile pg_map with the map, refreshes
+// existing_pools (pool id -> pg_num) which ingest_pgstats() filters
+// against, re-checks down PGs, then applies and resets pending_inc.
+void ClusterState::notify_osdmap(const OSDMap &osd_map)
+{
+  assert(ceph_mutex_is_locked(lock));
+
+  pending_inc.stamp = ceph_clock_now();
+  // apply_incremental wants exactly the next version
+  pending_inc.version = pg_map.version + 1;
+  dout(10) << " v" << pending_inc.version << dendl;
+
+  PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc);
+
+  // update our list of pools that exist, so that we can filter pg_map updates
+  // in synchrony with this OSDMap.
+  existing_pools.clear();
+  for (auto& pool_entry : osd_map.get_pools()) {
+    const auto& pool = pool_entry.second;
+    existing_pools[pool_entry.first] = pool.get_pg_num();
+  }
+
+  // brute force this for now (don't bother being clever by only
+  // checking osds that went up/down)
+  set<int> need_check_down_pg_osds;
+  PGMapUpdater::check_down_pgs(osd_map, pg_map, true,
+                               need_check_down_pg_osds, &pending_inc);
+
+  // The statements between each dout(30) and its "*_dout << dendl" sit
+  // inside the block the dout macro opens, which is why each dump can
+  // declare its own formatter.
+  dout(30) << " pg_map before:\n";
+    JSONFormatter before_fmt(true);
+    before_fmt.dump_object("pg_map", pg_map);
+    before_fmt.flush(*_dout);
+  *_dout << dendl;
+
+  dout(30) << " incremental:\n";
+    JSONFormatter inc_fmt(true);
+    inc_fmt.dump_object("pending_inc", pending_inc);
+    inc_fmt.flush(*_dout);
+  *_dout << dendl;
+
+  pg_map.apply_incremental(g_ceph_context, pending_inc);
+  pending_inc = PGMap::Incremental();
+  // TODO: Complete the separation of PG state handling so
+  // that a cut-down set of functionality remains in PGMonitor
+  // while the full-blown PGMap lives only here.
+}
+
+// Admin socket hook that hands commands to ClusterState::asok_command().
+// The command's text output is appended to `out`; a bad_cmd_get thrown
+// while handling the command is turned into -EINVAL with its message
+// written to `errss`.
+class ClusterSocketHook : public AdminSocketHook {
+  ClusterState *cluster_state;
+public:
+  explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {}
+
+  int call(std::string_view admin_command, const cmdmap_t& cmdmap,
+           Formatter *f,
+           std::ostream& errss,
+           bufferlist& out) override {
+    stringstream outss;
+    try {
+      const int result =
+        cluster_state->asok_command(admin_command, cmdmap, f, outss);
+      out.append(outss);
+      return result;
+    } catch (const TOPNSPC::common::bad_cmd_get& e) {
+      errss << e.what();
+      return -EINVAL;
+    }
+  }
+};
+
+// Create the admin socket hook and register the "dump_osd_network" command
+// (optional integer argument "value") with it.  Registration must succeed.
+void ClusterState::final_init()
+{
+  asok_hook = new ClusterSocketHook(this);
+
+  AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+  const int r = admin_socket->register_command(
+    "dump_osd_network name=value,type=CephInt,req=false",
+    asok_hook,
+    "Dump osd heartbeat network ping times");
+  ceph_assert(r == 0);
+}
+
+ // Unregister every command served by the hook, then destroy the hook.
+ void ClusterState::shutdown()
+ {
+   AdminSocket *asok = g_ceph_context->get_admin_socket();
+   asok->unregister_commands(asok_hook);
+   delete asok_hook;
+   asok_hook = nullptr;
+ }
+
+ /**
+  * Serve an admin socket command for the cluster state.
+  *
+  * Only "dump_osd_network" is implemented; any other command aborts because
+  * it means the asok registration and this handler disagree.
+  *
+  * "dump_osd_network" dumps the heartbeat ping statistics held in pg_map for
+  * every interface whose largest average is at or above a threshold.  The
+  * optional "value" argument gives that threshold in milliseconds (0 dumps
+  * every entry).  Without it the threshold comes from
+  * mon_warn_on_slow_ping_time or, when that yields 0, from
+  * osd_heartbeat_grace scaled by mon_warn_on_slow_ping_ratio.
+  *
+  * @param admin_command  name of the command being run
+  * @param cmdmap         parsed command arguments
+  * @param f              formatter receiving the "network_ping_times" section
+  * @param ss             not used by this handler
+  * @return always true
+  */
+ bool ClusterState::asok_command(
+   std::string_view admin_command,
+   const cmdmap_t& cmdmap,
+   Formatter *f,
+   ostream& ss)
+ {
+   std::lock_guard l(lock);
+   if (admin_command == "dump_osd_network") {
+     int64_t value = 0;
+     // Default to health warning level if nothing specified
+     if (!(TOPNSPC::common::cmd_getval(cmdmap, "value", value))) {
+       // Convert milliseconds to microseconds.  Scale before truncating so
+       // that a fractional millisecond setting is not rounded down to 0.
+       value = static_cast<int64_t>(
+         g_ceph_context->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000);
+       if (value == 0) {
+         double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+         value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+         value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+       }
+     } else {
+       // Convert user input to microseconds
+       value *= 1000;
+     }
+     if (value < 0)
+       value = 0;
+
+     // One reported interface (back or front) of one osd -> osd heartbeat.
+     struct mgr_ping_time_t {
+       uint32_t pingtime;
+       int from;
+       int to;
+       bool back;
+       std::array<uint32_t,3> times;
+       std::array<uint32_t,3> min;
+       std::array<uint32_t,3> max;
+       uint32_t last;
+       uint32_t last_update;
+
+       // Order by pingtime, then from, then to, then back before front.
+       bool operator<(const mgr_ping_time_t& rhs) const {
+         if (pingtime != rhs.pingtime)
+           return pingtime < rhs.pingtime;
+         if (from != rhs.from)
+           return from < rhs.from;
+         if (to != rhs.to)
+           return to < rhs.to;
+         // Must be false for equal items to stay a strict weak ordering.
+         return back && !rhs.back;
+       }
+     };
+
+     set<mgr_ping_time_t> sorted;
+
+     // Queue one interface's statistics when its largest average reaches the
+     // threshold (a threshold of 0 accepts everything).
+     auto add_if_slow = [&sorted, value](int from, int to, bool back,
+                                         const auto& avg, const auto& lo,
+                                         const auto& hi, uint32_t last,
+                                         uint32_t last_update) {
+       mgr_ping_time_t item;
+       item.pingtime = std::max(std::max(avg[0], avg[1]), avg[2]);
+       if (value && item.pingtime < value)
+         return;
+       item.from = from;
+       item.to = to;
+       item.back = back;
+       for (int k = 0; k < 3; ++k) {
+         item.times[k] = avg[k];
+         item.min[k] = lo[k];
+         item.max[k] = hi[k];
+       }
+       item.last = last;
+       item.last_update = last_update;
+       sorted.emplace(item);
+     };
+
+     // Loop-invariant settings and timestamp, looked up once.
+     const auto stale_time =
+       g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale");
+     const auto stale =
+       g_ceph_context->_conf.get_val<int64_t>("osd_heartbeat_stale");
+     utime_t now = ceph_clock_now();
+
+     // Iterate by reference: the per-osd stats are not cheap to copy.
+     for (const auto& i : pg_map.osd_stat) {
+       for (const auto& j : i.second.hb_pingtime) {
+         const auto& hb = j.second;
+
+         if (hb.last_update == 0)
+           continue;
+         if (now.sec() - hb.last_update > stale_time) {
+           dout(20) << __func__ << " time out heartbeat for osd " << i.first
+                    << " last_update " << hb.last_update << dendl;
+           continue;
+         }
+         add_if_slow(i.first, j.first, true, hb.back_pingtime, hb.back_min,
+                     hb.back_max, hb.back_last, hb.last_update);
+
+         if (hb.front_last == 0)
+           continue;
+         add_if_slow(i.first, j.first, false, hb.front_pingtime, hb.front_min,
+                     hb.front_max, hb.front_last, hb.last_update);
+       }
+     }
+
+     // Network ping times (1min 5min 15min)
+     f->open_object_section("network_ping_times");
+     f->dump_int("threshold", value / 1000);
+     f->open_array_section("entries");
+     // Largest ping times first.
+     for (const auto &sitem : boost::adaptors::reverse(sorted)) {
+       ceph_assert(!value || sitem.pingtime >= value);
+
+       f->open_object_section("entry");
+
+       const time_t lu(sitem.last_update);
+       char buffer[26];
+       string lustr;
+       // ctime_r() may return a null pointer on failure; keep the string
+       // empty in that case instead of constructing from null.
+       if (const char *stamp = ctime_r(&lu, buffer); stamp != nullptr) {
+         lustr = stamp;
+         if (!lustr.empty())
+           lustr.pop_back(); // Remove trailing \n
+       }
+       f->dump_string("last update", lustr);
+       f->dump_bool("stale", now.sec() - sitem.last_update > stale);
+       f->dump_int("from osd", sitem.from);
+       f->dump_int("to osd", sitem.to);
+       f->dump_string("interface", (sitem.back ? "back" : "front"));
+       f->open_object_section("average");
+       f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
+       f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
+       f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
+       f->close_section(); // average
+       f->open_object_section("min");
+       f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
+       f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
+       f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
+       f->close_section(); // min
+       f->open_object_section("max");
+       f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
+       f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
+       f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
+       f->close_section(); // max
+       f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
+       f->close_section(); // entry
+     }
+     f->close_section(); // entries
+     f->close_section(); // network_ping_times
+   } else {
+     ceph_abort_msg("broken asok registration");
+   }
+   return true;
+ }
diff --git a/src/mgr/ClusterState.h b/src/mgr/ClusterState.h
new file mode 100644
index 000000000..eeff1f76b
--- /dev/null
+++ b/src/mgr/ClusterState.h
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CLUSTER_STATE_H_
+#define CLUSTER_STATE_H_
+
+#include "mds/FSMap.h"
+#include "mon/MgrMap.h"
+#include "common/ceph_mutex.h"
+
+#include "osdc/Objecter.h"
+#include "mon/MonClient.h"
+#include "mon/PGMap.h"
+#include "mgr/ServiceMap.h"
+
+class MMgrDigest;
+class MMonMgrReport;
+class MPGStats;
+
+
+/**
+ * Cluster-scope state (things like cluster maps) as opposed
+ * to daemon-level state (things like perf counters and smart)
+ */
+class ClusterState
+{
+protected:
+ MonClient *monc;
+ Objecter *objecter;
+ FSMap fsmap;
+ ServiceMap servicemap;
+ mutable ceph::mutex lock = ceph::make_mutex("ClusterState");
+
+ MgrMap mgr_map;
+
+ map<int64_t,unsigned> existing_pools; ///< pools that exist, and pg_num, as of PGMap epoch
+ PGMap pg_map;
+ PGMap::Incremental pending_inc;
+
+ bufferlist health_json;
+ bufferlist mon_status_json;
+
+ class ClusterSocketHook *asok_hook;
+
+public:
+
+ void load_digest(MMgrDigest *m);
+ void ingest_pgstats(ceph::ref_t<MPGStats> stats);
+
+ void update_delta_stats();
+
+ ClusterState(MonClient *monc_, Objecter *objecter_, const MgrMap& mgrmap);
+
+ void set_objecter(Objecter *objecter_);
+ void set_fsmap(FSMap const &new_fsmap);
+ void set_mgr_map(MgrMap const &new_mgrmap);
+ void set_service_map(ServiceMap const &new_service_map);
+
+ void notify_osdmap(const OSDMap &osd_map);
+
+ bool have_fsmap() const {
+ std::lock_guard l(lock);
+ return fsmap.get_epoch() > 0;
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_servicemap(Callback&& cb, Args&&...args) const
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(servicemap, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_fsmap(Callback&& cb, Args&&...args) const
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(fsmap, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_mgrmap(Callback&& cb, Args&&...args) const
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_pgmap(Callback&& cb, Args&&...args) const ->
+ decltype(cb(pg_map, std::forward<Args>(args)...))
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_mutable_pgmap(Callback&& cb, Args&&...args) ->
+ decltype(cb(pg_map, std::forward<Args>(args)...))
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ auto with_monmap(Args &&... args) const
+ {
+ std::lock_guard l(lock);
+ ceph_assert(monc != nullptr);
+ return monc->with_monmap(std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ auto with_osdmap(Args &&... args) const ->
+ decltype(objecter->with_osdmap(std::forward<Args>(args)...))
+ {
+ ceph_assert(objecter != nullptr);
+ return objecter->with_osdmap(std::forward<Args>(args)...);
+ }
+
+ // call cb(osdmap, pg_map, ...args) with the appropriate locks
+ template <typename Callback, typename ...Args>
+ auto with_osdmap_and_pgmap(Callback&& cb, Args&& ...args) const {
+ ceph_assert(objecter != nullptr);
+ std::lock_guard l(lock);
+ return objecter->with_osdmap(
+ std::forward<Callback>(cb),
+ pg_map,
+ std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_health(Callback&& cb, Args&&...args) const
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(health_json, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_mon_status(Callback&& cb, Args&&...args) const
+ {
+ std::lock_guard l(lock);
+ return std::forward<Callback>(cb)(mon_status_json, std::forward<Args>(args)...);
+ }
+
+ void final_init();
+ void shutdown();
+ bool asok_command(std::string_view admin_command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ ostream& ss);
+};
+
+#endif
+
diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h
new file mode 100644
index 000000000..ad3ea29ef
--- /dev/null
+++ b/src/mgr/DaemonHealthMetric.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <ostream>
+#include "include/denc.h"
+
+ // Kinds of health metric a daemon can report.  The numeric values are
+ // spelled out because DaemonHealthMetric passes the type to denc().
+ enum class daemon_metric : uint8_t {
+   SLOW_OPS = 0,
+   PENDING_CREATING_PGS = 1,
+   NONE = 2,
+ };
+
+ // Printable name of a metric type; values outside the enum give "???".
+ static inline const char *daemon_metric_name(daemon_metric t) {
+   if (t == daemon_metric::SLOW_OPS) {
+     return "SLOW_OPS";
+   }
+   if (t == daemon_metric::PENDING_CREATING_PGS) {
+     return "PENDING_CREATING_PGS";
+   }
+   if (t == daemon_metric::NONE) {
+     return "NONE";
+   }
+   return "???";
+ }
+
+ // Metric payload: either one 64-bit number (n) or two 32-bit numbers
+ // (n1, n2) sharing the same storage.
+ union daemon_metric_t {
+   struct {
+     uint32_t n1;
+     uint32_t n2;
+   };
+   uint64_t n;
+   // Two-counter form.
+   daemon_metric_t(uint32_t x, uint32_t y)
+   {
+     n1 = x;
+     n2 = y;
+   }
+   // Single-counter form; also the default constructor (value 0).
+   daemon_metric_t(uint64_t x = 0)
+   {
+     n = x;
+   }
+ };
+
+ // One health metric reported by a daemon: a type tag plus its value, which
+ // can be read as a single 64-bit number or as two 32-bit numbers.
+ class DaemonHealthMetric
+ {
+ public:
+   DaemonHealthMetric() = default;
+   DaemonHealthMetric(daemon_metric type_, uint64_t n)
+     : type(type_),
+       value(n)
+   {}
+   DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2)
+     : type(type_),
+       value(n1, n2)
+   {}
+
+   daemon_metric get_type() const { return type; }
+   uint64_t get_n() const { return value.n; }
+   uint32_t get_n1() const { return value.n1; }
+   uint32_t get_n2() const { return value.n2; }
+
+   // Encoded as the type followed by the 64-bit view of the value.
+   DENC(DaemonHealthMetric, v, p) {
+     DENC_START(1, 1, p);
+     denc(v.type, p);
+     denc(v.value.n, p);
+     DENC_FINISH(p);
+   }
+
+   // Prints NAME(n|(n1,n2)).
+   friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) {
+     out << daemon_metric_name(m.get_type()) << "(";
+     out << m.get_n() << "|(" << m.get_n1() << "," << m.get_n2() << "))";
+     return out;
+   }
+ private:
+   daemon_metric type = daemon_metric::NONE;
+   daemon_metric_t value;
+ };
+WRITE_CLASS_DENC(DaemonHealthMetric)
diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc
new file mode 100644
index 000000000..53c0b78a6
--- /dev/null
+++ b/src/mgr/DaemonHealthMetricCollector.cc
@@ -0,0 +1,101 @@
+#include <fmt/format.h>
+
+#include "include/health.h"
+#include "include/types.h"
+#include "DaemonHealthMetricCollector.h"
+
+namespace {
+
+ // Collector for SLOW_OPS metrics: value.n1 sums the reported slow op counts
+ // and value.n2 keeps the largest reported blocked time.
+ class SlowOps final : public DaemonHealthMetricCollector {
+   bool _is_relevant(daemon_metric type) const override {
+     return daemon_metric::SLOW_OPS == type;
+   }
+   health_check_t& _get_check(health_check_map_t& cm) const override {
+     return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1);
+   }
+   // Fold one daemon's metric in; returns true when it reported anything.
+   bool _update(const DaemonKey& daemon,
+                const DaemonHealthMetric& metric) override {
+     const auto slow_count = metric.get_n1();
+     const auto blocked_for = metric.get_n2();
+     value.n1 += slow_count;
+     value.n2 = std::max(value.n2, blocked_for);
+     if (!slow_count && !blocked_for) {
+       return false;
+     }
+     daemons.push_back(daemon);
+     return true;
+   }
+   void _summarize(health_check_t& check) const override {
+     if (daemons.empty()) {
+       return;
+     }
+     // Note this message format is used in mgr/prometheus, so any change in format
+     // requires a corresponding change in the mgr/prometheus module.
+     ostringstream ss;
+     const auto affected = daemons.size();
+     if (affected == 1) {
+       ss << daemons.front() << " has slow ops";
+     } else if (affected > 10) {
+       // Only name the first ten daemons.
+       ss << "daemons " << vector<DaemonKey>(daemons.begin(), daemons.begin()+10)
+          << "..." << " have slow ops.";
+     } else {
+       ss << "daemons " << daemons << " have slow ops.";
+     }
+     check.summary =
+       fmt::format("{} slow ops, oldest one blocked for {} sec, {}",
+                   value.n1, value.n2, ss.str());
+     // No detail
+   }
+   vector<DaemonKey> daemons;
+ };
+
+
+ // Collector for PENDING_CREATING_PGS metrics: value.n sums the reported
+ // counts and `osds` lists the daemons that reported a non-zero count.
+ class PendingPGs final : public DaemonHealthMetricCollector {
+   bool _is_relevant(daemon_metric type) const override {
+     return daemon_metric::PENDING_CREATING_PGS == type;
+   }
+   health_check_t& _get_check(health_check_map_t& cm) const override {
+     return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1);
+   }
+   // Fold one daemon's metric in; returns true when its count is non-zero.
+   bool _update(const DaemonKey& osd,
+                const DaemonHealthMetric& metric) override {
+     const auto pending = metric.get_n();
+     value.n += pending;
+     if (!pending) {
+       return false;
+     }
+     osds.push_back(osd);
+     return true;
+   }
+   void _summarize(health_check_t& check) const override {
+     if (osds.empty()) {
+       return;
+     }
+     check.summary = fmt::format("{} PGs pending on creation", value.n);
+     ostringstream ss;
+     if (osds.size() == 1) {
+       ss << osds.front() << " has pending PGs";
+     } else {
+       ss << "osds " << osds << " have pending PGs.";
+     }
+     check.detail.push_back(ss.str());
+   }
+   vector<DaemonKey> osds;
+ };
+
+} // anonymous namespace
+
+ // Build the collector matching a metric type; types without a collector
+ // yield an empty pointer.
+ unique_ptr<DaemonHealthMetricCollector>
+ DaemonHealthMetricCollector::create(daemon_metric m)
+ {
+   if (m == daemon_metric::SLOW_OPS) {
+     return std::make_unique<SlowOps>();
+   }
+   if (m == daemon_metric::PENDING_CREATING_PGS) {
+     return std::make_unique<PendingPGs>();
+   }
+   return {};
+ }
diff --git a/src/mgr/DaemonHealthMetricCollector.h b/src/mgr/DaemonHealthMetricCollector.h
new file mode 100644
index 000000000..558f4e334
--- /dev/null
+++ b/src/mgr/DaemonHealthMetricCollector.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "DaemonHealthMetric.h"
+#include "DaemonKey.h"
+#include "mon/health_check.h"
+
+ // Base class for per-metric-type aggregation of daemon health metrics.
+ // Subclasses supply the four private hooks below.
+ class DaemonHealthMetricCollector {
+ public:
+   static std::unique_ptr<DaemonHealthMetricCollector> create(daemon_metric m);
+
+   // Feed one daemon's metric in; metrics of other types are ignored.
+   void update(const DaemonKey& daemon, const DaemonHealthMetric& metric) {
+     if (!_is_relevant(metric.get_type())) {
+       return;
+     }
+     if (_update(daemon, metric)) {
+       reported = true;
+     }
+   }
+
+   // Write the summary into `cm`, but only if some update reported a value.
+   void summarize(health_check_map_t& cm) {
+     if (!reported) {
+       return;
+     }
+     _summarize(_get_check(cm));
+   }
+
+   virtual ~DaemonHealthMetricCollector() = default;
+ private:
+   virtual bool _is_relevant(daemon_metric type) const = 0;
+   virtual health_check_t& _get_check(health_check_map_t& cm) const = 0;
+   virtual bool _update(const DaemonKey& daemon, const DaemonHealthMetric& metric) = 0;
+   virtual void _summarize(health_check_t& check) const = 0;
+ protected:
+   daemon_metric_t value;
+   bool reported = false;
+ };
diff --git a/src/mgr/DaemonKey.cc b/src/mgr/DaemonKey.cc
new file mode 100644
index 000000000..5501ac106
--- /dev/null
+++ b/src/mgr/DaemonKey.cc
@@ -0,0 +1,35 @@
+#include "DaemonKey.h"
+
+ // Split "type.name" at the first '.'.  The bool is false, with an empty
+ // key, when the string contains no '.'.
+ std::pair<DaemonKey, bool> DaemonKey::parse(const std::string& s)
+ {
+   const auto dot = s.find('.');
+   if (dot != std::string::npos) {
+     return {DaemonKey{s.substr(0, dot), s.substr(dot + 1)}, true};
+   }
+   return {{}, false};
+ }
+
+ // Order keys by type first, then by name.
+ bool operator<(const DaemonKey& lhs, const DaemonKey& rhs)
+ {
+   const int by_type = lhs.type.compare(rhs.type);
+   if (by_type != 0) {
+     return by_type < 0;
+   }
+   return lhs.name < rhs.name;
+ }
+
+ // Print a key as "type.name".
+ std::ostream& operator<<(std::ostream& os, const DaemonKey& key)
+ {
+   os << key.type;
+   os << '.';
+   os << key.name;
+   return os;
+ }
+
+ namespace ceph {
+ // Render a key as the string "type.name".
+ std::string to_string(const DaemonKey& key)
+ {
+   std::string text = key.type;
+   text += '.';
+   text += key.name;
+   return text;
+ }
+ }
+
diff --git a/src/mgr/DaemonKey.h b/src/mgr/DaemonKey.h
new file mode 100644
index 000000000..92bacd649
--- /dev/null
+++ b/src/mgr/DaemonKey.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <utility>
+
+ // Unique reference to a daemon within a cluster, written as "type.name"
+ struct DaemonKey
+ {
+ std::string type; // service type, like "osd", "mon"
+ std::string name; // service id / name, like "1", "a"
+ static std::pair<DaemonKey, bool> parse(const std::string& s); // second is false when s has no '.'
+ };
+
+bool operator<(const DaemonKey& lhs, const DaemonKey& rhs);
+std::ostream& operator<<(std::ostream& os, const DaemonKey& key);
+
+namespace ceph {
+ std::string to_string(const DaemonKey& key);
+}
+
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
new file mode 100644
index 000000000..430911f6f
--- /dev/null
+++ b/src/mgr/DaemonServer.cc
@@ -0,0 +1,3146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "DaemonServer.h"
+#include <boost/algorithm/string.hpp>
+#include "mgr/Mgr.h"
+
+#include "include/stringify.h"
+#include "include/str_list.h"
+#include "auth/RotatingKeyRing.h"
+#include "json_spirit/json_spirit_writer.h"
+
+#include "mgr/mgr_commands.h"
+#include "mgr/DaemonHealthMetricCollector.h"
+#include "mgr/OSDPerfMetricCollector.h"
+#include "mgr/MDSPerfMetricCollector.h"
+#include "mon/MonCommand.h"
+
+#include "messages/MMgrOpen.h"
+#include "messages/MMgrUpdate.h"
+#include "messages/MMgrClose.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MMgrCommand.h"
+#include "messages/MMgrCommandReply.h"
+#include "messages/MPGStats.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MOSDForceRecovery.h"
+#include "common/errno.h"
+#include "common/pick_address.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr.server " << __func__ << " "
+using namespace TOPNSPC::common;
+ namespace {
+   // True when both maps have the same number of entries and every pair of
+   // entries, taken in iteration order, has an equal key and an equal value.
+   template <typename Map>
+   bool map_compare(Map const &lhs, Map const &rhs) {
+     return lhs.size() == rhs.size()
+       && std::equal(lhs.begin(), lhs.end(), rhs.begin(),
+                     // take entries by reference: no per-element copies
+                     [] (const auto& a, const auto& b) {
+                       return a.first == b.first && a.second == b.second;
+                     });
+   }
+ }
+
+ /**
+  * Set up the server's throttles and collaborators.
+  *
+  * One byte throttle and one message throttle is created per peer class
+  * (client, osd, mds, mon), each sized from the mgr_* option of the same
+  * purpose.  The messenger is left null here; init() creates it.  The new
+  * object registers itself as a config observer.
+  *
+  * NOTE(review): the osd/mds/mon message throttle names are spelled
+  * "messsages" (three 's'); left untouched because they are runtime names.
+  */
+ DaemonServer::DaemonServer(MonClient *monc_,
+                            Finisher &finisher_,
+                            DaemonStateIndex &daemon_state_,
+                            ClusterState &cluster_state_,
+                            PyModuleRegistry &py_modules_,
+                            LogChannelRef clog_,
+                            LogChannelRef audit_clog_)
+   : Dispatcher(g_ceph_context),
+     // clients
+     client_byte_throttler(
+       new Throttle(g_ceph_context, "mgr_client_bytes",
+                    g_conf().get_val<Option::size_t>("mgr_client_bytes"))),
+     client_msg_throttler(
+       new Throttle(g_ceph_context, "mgr_client_messages",
+                    g_conf().get_val<uint64_t>("mgr_client_messages"))),
+     // osds
+     osd_byte_throttler(
+       new Throttle(g_ceph_context, "mgr_osd_bytes",
+                    g_conf().get_val<Option::size_t>("mgr_osd_bytes"))),
+     osd_msg_throttler(
+       new Throttle(g_ceph_context, "mgr_osd_messsages",
+                    g_conf().get_val<uint64_t>("mgr_osd_messages"))),
+     // mdss
+     mds_byte_throttler(
+       new Throttle(g_ceph_context, "mgr_mds_bytes",
+                    g_conf().get_val<Option::size_t>("mgr_mds_bytes"))),
+     mds_msg_throttler(
+       new Throttle(g_ceph_context, "mgr_mds_messsages",
+                    g_conf().get_val<uint64_t>("mgr_mds_messages"))),
+     // mons
+     mon_byte_throttler(
+       new Throttle(g_ceph_context, "mgr_mon_bytes",
+                    g_conf().get_val<Option::size_t>("mgr_mon_bytes"))),
+     mon_msg_throttler(
+       new Throttle(g_ceph_context, "mgr_mon_messsages",
+                    g_conf().get_val<uint64_t>("mgr_mon_messages"))),
+     msgr(nullptr),
+     monc(monc_),
+     finisher(finisher_),
+     daemon_state(daemon_state_),
+     cluster_state(cluster_state_),
+     py_modules(py_modules_),
+     clog(clog_),
+     audit_clog(audit_clog_),
+     pgmap_ready(false),
+     timer(g_ceph_context, lock),
+     shutting_down(false),
+     tick_event(nullptr),
+     osd_perf_metric_collector_listener(this),
+     osd_perf_metric_collector(osd_perf_metric_collector_listener),
+     mds_perf_metric_collector_listener(this),
+     mds_perf_metric_collector(mds_perf_metric_collector_listener)
+ {
+   g_conf().add_observer(this);
+ }
+
+ // Destroy the messenger, then stop observing config changes.
+ DaemonServer::~DaemonServer()
+ {
+   delete msgr;
+   g_conf().remove_observer(this);
+ }
+
+ /**
+  * Create, configure, bind and start the messenger, then arm the tick timer.
+  *
+  * @param gid           id used for this mgr's entity name
+  * @param client_addrs  addresses passed to Messenger::set_addr_unknowns()
+  * @return 0 on success, or the negative result of pick_addresses() or
+  *         bindv() on failure
+  */
+ int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs)
+ {
+   // Initialize Messenger; ms_public_type wins over ms_type when it is set
+   std::string public_msgr_type = g_conf()->ms_public_type;
+   if (public_msgr_type.empty()) {
+     public_msgr_type = g_conf().get_val<std::string>("ms_type");
+   }
+   msgr = Messenger::create(g_ceph_context, public_msgr_type,
+                            entity_name_t::MGR(gid),
+                            "mgr",
+                            Messenger::get_pid_nonce());
+   msgr->set_default_policy(Messenger::Policy::stateless_server(0));
+
+   msgr->set_auth_client(monc);
+
+   // throttle clients
+   msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
+                               client_byte_throttler.get(),
+                               client_msg_throttler.get());
+
+   // servers
+   msgr->set_policy_throttlers(entity_name_t::TYPE_OSD,
+                               osd_byte_throttler.get(),
+                               osd_msg_throttler.get());
+   msgr->set_policy_throttlers(entity_name_t::TYPE_MDS,
+                               mds_byte_throttler.get(),
+                               mds_msg_throttler.get());
+   msgr->set_policy_throttlers(entity_name_t::TYPE_MON,
+                               mon_byte_throttler.get(),
+                               mon_msg_throttler.get());
+
+   entity_addrvec_t bind_addrs;
+   const int picked = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &bind_addrs);
+   if (picked < 0) {
+     return picked;
+   }
+   dout(20) << __func__ << " will bind to " << bind_addrs << dendl;
+   const int bound = msgr->bindv(bind_addrs);
+   if (bound < 0) {
+     derr << "unable to bind mgr to " << bind_addrs << dendl;
+     return bound;
+   }
+
+   msgr->set_myname(entity_name_t::MGR(gid));
+   msgr->set_addr_unknowns(client_addrs);
+
+   msgr->start();
+   msgr->add_dispatcher_tail(this);
+
+   msgr->set_auth_server(monc);
+   monc->set_handle_authentication_dispatcher(this);
+
+   started_at = ceph_clock_now();
+
+   // The timer and the first tick are set up under the server lock.
+   std::lock_guard l(lock);
+   timer.init();
+
+   const auto period = g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
+   schedule_tick_locked(period.count());
+
+   return 0;
+ }
+
+ // Addresses the messenger reports as its own.
+ entity_addrvec_t DaemonServer::get_myaddrs() const
+ {
+   const auto& mine = msgr->get_myaddrs();
+   return mine;
+ }
+
+ /**
+  * Attach a fresh MgrSession to an authenticated connection and load the
+  * peer's capabilities into it.  OSD connections are also recorded in
+  * osd_cons under the OSD id parsed from the entity name.
+  *
+  * @return 1 on success, -EACCES when the caps cannot be decoded or parsed
+  */
+ int DaemonServer::ms_handle_authentication(Connection *con)
+ {
+   auto s = ceph::make_ref<MgrSession>(cct);
+   con->set_priv(s);
+   s->inst.addr = con->get_peer_addr();
+   s->entity_name = con->peer_name;
+   dout(10) << __func__ << " new session " << s << " con " << con
+            << " entity " << con->peer_name
+            << " addr " << con->get_peer_addrs()
+            << dendl;
+
+   AuthCapsInfo &caps_info = con->get_peer_caps_info();
+   if (caps_info.allow_all) {
+     dout(10) << " session " << s << " " << s->entity_name
+              << " allow_all" << dendl;
+     s->caps.set_allow_all();
+   } else if (caps_info.caps.length() > 0) {
+     // The caps arrive as an encoded string that still has to be parsed.
+     string caps_str;
+     auto it = caps_info.caps.cbegin();
+     try {
+       decode(caps_str, it);
+     }
+     catch (buffer::error& e) {
+       dout(10) << " session " << s << " " << s->entity_name
+                << " failed to decode caps" << dendl;
+       return -EACCES;
+     }
+     const bool parsed = s->caps.parse(caps_str);
+     if (!parsed) {
+       dout(10) << " session " << s << " " << s->entity_name
+                << " failed to parse caps '" << caps_str << "'" << dendl;
+       return -EACCES;
+     }
+     dout(10) << " session " << s << " " << s->entity_name
+              << " has caps " << s->caps << " '" << caps_str << "'" << dendl;
+   }
+
+   const bool from_osd = con->get_peer_type() == CEPH_ENTITY_TYPE_OSD;
+   if (from_osd) {
+     std::lock_guard l(lock);
+     s->osd_id = atoi(s->entity_name.get_id().c_str());
+     dout(10) << "registering osd." << s->osd_id << " session "
+              << s << " con " << con << dendl;
+     osd_cons[s->osd_id].insert(con);
+   }
+
+   return 1;
+ }
+
+ // Forget a reset OSD connection: drop it from osd_cons and from
+ // daemon_connections.  Connections of other peer types, or without a
+ // session attached, are left alone.  Always returns false.
+ bool DaemonServer::ms_handle_reset(Connection *con)
+ {
+   if (con->get_peer_type() != CEPH_ENTITY_TYPE_OSD) {
+     return false;
+   }
+
+   auto priv = con->get_priv();
+   auto session = static_cast<MgrSession*>(priv.get());
+   if (!session) {
+     return false;
+   }
+
+   std::lock_guard l(lock);
+   dout(10) << "unregistering osd." << session->osd_id
+            << " session " << session << " con " << con << dendl;
+   osd_cons[session->osd_id].erase(con);
+
+   if (auto known = daemon_connections.find(con);
+       known != daemon_connections.end()) {
+     daemon_connections.erase(known);
+   }
+   return false;
+ }
+
+ // Refused connections are not acted upon for now; always returns false.
+ bool DaemonServer::ms_handle_refused(Connection *con)
+ {
+   return false;
+ }
+
+ // Route an incoming message to its handler by message type.  Returns the
+ // handler's result, or false for a type that is not handled here.
+ bool DaemonServer::ms_dispatch2(const ref_t<Message>& m)
+ {
+   // Note that we do *not* take ::lock here, in order to avoid
+   // serializing all message handling.  It's up to each handler
+   // to take whatever locks it needs.
+   const auto msg_type = m->get_type();
+   switch (msg_type) {
+   case MSG_PGSTATS:
+     cluster_state.ingest_pgstats(ref_cast<MPGStats>(m));
+     maybe_ready(m->get_source().num());
+     return true;
+   case MSG_MGR_REPORT:
+     return handle_report(ref_cast<MMgrReport>(m));
+   case MSG_MGR_OPEN:
+     return handle_open(ref_cast<MMgrOpen>(m));
+   case MSG_MGR_UPDATE:
+     return handle_update(ref_cast<MMgrUpdate>(m));
+   case MSG_MGR_CLOSE:
+     return handle_close(ref_cast<MMgrClose>(m));
+   case MSG_COMMAND:
+     return handle_command(ref_cast<MCommand>(m));
+   case MSG_MGR_COMMAND:
+     return handle_command(ref_cast<MMgrCommand>(m));
+   default:
+     break;
+   }
+
+   dout(1) << "Unhandled message type " << msg_type << dendl;
+   return false;
+ }
+
+ // Emit the current pgmap_ready flag as the boolean field "pg_ready".
+ void DaemonServer::dump_pg_ready(ceph::Formatter *f)
+ {
+   const bool ready = pgmap_ready.load();
+   f->dump_bool("pg_ready", ready);
+ }
+
+ // Note that `osd_id` has sent PG stats.  Once every OSD that is up in the
+ // osdmap has reported, mark the PGMap ready and send a report right away.
+ void DaemonServer::maybe_ready(int32_t osd_id)
+ {
+   if (pgmap_ready.load()) {
+     // Fast path: we don't need to take lock because pgmap_ready
+     // is already set
+     return;
+   }
+
+   std::lock_guard l(lock);
+
+   if (reported_osds.find(osd_id) != reported_osds.end()) {
+     // Not this OSD's first report; nothing changes.
+     return;
+   }
+
+   dout(4) << "initial report from osd " << osd_id << dendl;
+   reported_osds.insert(osd_id);
+
+   std::set<int32_t> up_osds;
+   cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+     osdmap.get_up_osds(up_osds);
+   });
+
+   // up OSDs that have not reported yet
+   std::set<int32_t> unreported_osds;
+   std::set_difference(up_osds.begin(), up_osds.end(),
+                       reported_osds.begin(), reported_osds.end(),
+                       std::inserter(unreported_osds, unreported_osds.begin()));
+
+   if (unreported_osds.size() != 0) {
+     dout(4) << "still waiting for " << unreported_osds.size() << " osds"
+       " to report in before PGMap is ready" << dendl;
+     return;
+   }
+
+   dout(4) << "all osds have reported, sending PG state to mon" << dendl;
+   pgmap_ready = true;
+   reported_osds.clear();
+   // Avoid waiting for next tick
+   send_report();
+ }
+
+ // Periodic work: send a report, adjust PGs, then re-arm the timer for
+ // another mgr_tick_period.
+ void DaemonServer::tick()
+ {
+   dout(10) << dendl;
+   send_report();
+   adjust_pgs();
+
+   const auto period = g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
+   schedule_tick_locked(period.count());
+ }
+
+ // Currently modules do not set health checks in response to events delivered
+ // to all modules (e.g. notify), so we do not risk a thundering herd situation
+ // here.  If this pattern emerges in the future, this scheduler could be
+ // modified to fire after all modules have had a chance to set their health
+ // checks.
+ //
+ // Replaces any pending tick with one firing after delay_sec seconds.  The
+ // caller must hold `lock`.
+ void DaemonServer::schedule_tick_locked(double delay_sec)
+ {
+   ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+   // At most one tick is pending at a time.
+   if (tick_event != nullptr) {
+     timer.cancel_event(tick_event);
+     tick_event = nullptr;
+   }
+
+   // on shutdown start rejecting explicit requests to send reports that may
+   // originate from python land which may still be running.
+   if (!shutting_down) {
+     auto *on_tick = new LambdaContext([this](int r) {
+       tick();
+     });
+     tick_event = timer.add_event_after(delay_sec, on_tick);
+   }
+ }
+
+ // Locking wrapper around schedule_tick_locked().
+ void DaemonServer::schedule_tick(double delay_sec)
+ {
+   std::lock_guard guard(lock);
+   schedule_tick_locked(delay_sec);
+ }
+
+ // Queue a finisher job that re-sends the configuration to every tracked
+ // OSD connection.
+ void DaemonServer::handle_osd_perf_metric_query_updated()
+ {
+   dout(10) << dendl;
+
+   // Send a fresh MMgrConfigure to all clients, so that they can follow
+   // the new policy for transmitting stats
+   auto *reconfigure = new LambdaContext([this](int r) {
+     std::lock_guard l(lock);
+     for (auto &c : daemon_connections) {
+       if (!c->peer_is_osd()) {
+         continue;
+       }
+       _send_configure(c);
+     }
+   });
+   finisher.queue(reconfigure);
+ }
+
+ // Queue a finisher job that re-sends the configuration to every tracked
+ // MDS connection.
+ void DaemonServer::handle_mds_perf_metric_query_updated()
+ {
+   dout(10) << dendl;
+
+   // Send a fresh MMgrConfigure to all clients, so that they can follow
+   // the new policy for transmitting stats
+   auto *reconfigure = new LambdaContext([this](int r) {
+     std::lock_guard l(lock);
+     for (auto &c : daemon_connections) {
+       if (!c->peer_is_mds()) {
+         continue;
+       }
+       _send_configure(c);
+     }
+   });
+   finisher.queue(reconfigure);
+ }
+
+ // Stop the messenger and wait for it, shut the cluster state down, then,
+ // under the lock, set shutting_down and stop the timer.
+ void DaemonServer::shutdown()
+ {
+   dout(10) << "begin" << dendl;
+   msgr->shutdown();
+   msgr->wait();
+   cluster_state.shutdown();
+   dout(10) << "done" << dendl;
+
+   std::lock_guard guard(lock);
+   shutting_down = true;
+   timer.shutdown();
+ }
+
+ // Build the key for a daemon: keyed by its service name when one is given,
+ // otherwise by the name of its peer entity type.
+ static DaemonKey key_from_service(
+   const std::string& service_name,
+   int peer_type,
+   const std::string& daemon_name)
+ {
+   if (service_name.empty()) {
+     return DaemonKey{ceph_entity_type_name(peer_type), daemon_name};
+   }
+   return DaemonKey{service_name, daemon_name};
+ }
+
+ // Ask the monitors for the metadata of an osd, mds or mon daemon.  Nothing
+ // is sent for other daemon types or when daemon_state.is_updating(key) is
+ // already true.
+ void DaemonServer::fetch_missing_metadata(const DaemonKey& key,
+                                           const entity_addr_t& addr)
+ {
+   if (daemon_state.is_updating(key)) {
+     return;
+   }
+   const bool is_osd = key.type == "osd";
+   const bool is_mds = key.type == "mds";
+   const bool is_mon = key.type == "mon";
+   if (!(is_osd || is_mds || is_mon)) {
+     return;
+   }
+
+   // Build the JSON mon command for this daemon type.
+   std::ostringstream cmd;
+   auto c = new MetadataUpdate(daemon_state, key);
+   if (is_osd) {
+     cmd << "{\"prefix\": \"osd metadata\", \"id\": "
+         << key.name << "}";
+   } else if (is_mds) {
+     c->set_default("addr", stringify(addr));
+     cmd << "{\"prefix\": \"mds metadata\", \"who\": \""
+         << key.name << "\"}";
+   } else if (is_mon) {
+     cmd << "{\"prefix\": \"mon metadata\", \"id\": \""
+         << key.name << "\"}";
+   } else {
+     ceph_abort();
+   }
+   monc->start_mon_command({cmd.str()}, {}, &c->outbl, &c->outs, c);
+ }
+
+ bool DaemonServer::handle_open(const ref_t<MMgrOpen>& m) // handles an MMgrOpen session-open message; returns true on every path
+ {
+ std::unique_lock l(lock); // unique_lock (not lock_guard) because one path below unlocks early
+
+ DaemonKey key = key_from_service(m->service_name,
+ m->get_connection()->get_peer_type(),
+ m->daemon_name);
+
+ auto con = m->get_connection();
+ dout(10) << "from " << key << " " << con->get_peer_addr() << dendl;
+
+ _send_configure(con);
+
+ DaemonStatePtr daemon;
+ if (daemon_state.exists(key)) {
+ dout(20) << "updating existing DaemonState for " << key << dendl;
+ daemon = daemon_state.get(key);
+ }
+ if (!daemon) { // no state known for this key yet
+ if (m->service_daemon) {
+ dout(4) << "constructing new DaemonState for " << key << dendl;
+ daemon = std::make_shared<DaemonState>(daemon_state.types);
+ daemon->key = key;
+ daemon->service_daemon = true;
+ daemon_state.insert(daemon);
+ } else {
+ /* A normal Ceph daemon has connected but we are or should be waiting on
+ * metadata for it. Close the session so that it tries to reconnect.
+ */
+ dout(2) << "ignoring open from " << key << " " << con->get_peer_addr()
+ << "; not ready for session (expect reconnect)" << dendl;
+ con->mark_down();
+ l.unlock(); // release the server lock before fetch_missing_metadata()
+ fetch_missing_metadata(key, m->get_source_addr());
+ return true;
+ }
+ }
+ if (daemon) {
+ if (m->service_daemon) {
+ // update the metadata through the daemon state index to
+ // ensure it's kept up-to-date
+ daemon_state.update_metadata(daemon, m->daemon_metadata);
+ }
+
+ std::lock_guard l(daemon->lock); // shadows the outer `l`; holds daemon->lock until this block ends
+ daemon->perf_counters.clear();
+
+ daemon->service_daemon = m->service_daemon;
+ if (m->service_daemon) {
+ daemon->service_status = m->daemon_status;
+
+ utime_t now = ceph_clock_now();
+ auto [d, added] = pending_service_map.get_daemon(m->service_name,
+ m->daemon_name);
+ if (added || d->gid != (uint64_t)m->get_source().num()) { // new entry, or the gid differs from the stored one
+ dout(10) << "registering " << key << " in pending_service_map" << dendl;
+ d->gid = m->get_source().num();
+ d->addr = m->get_source_addr();
+ d->start_epoch = pending_service_map.epoch;
+ d->start_stamp = now;
+ d->metadata = m->daemon_metadata;
+ pending_service_map_dirty = pending_service_map.epoch; // remember the epoch at which the pending map changed
+ }
+ }
+
+ auto p = m->config_bl.cbegin();
+ if (p != m->config_bl.end()) { // decode only when config_bl holds data
+ decode(daemon->config, p);
+ decode(daemon->ignored_mon_config, p);
+ dout(20) << " got config " << daemon->config
+ << " ignored " << daemon->ignored_mon_config << dendl;
+ }
+ daemon->config_defaults_bl = m->config_defaults_bl;
+ daemon->config_defaults.clear(); // drop the previous defaults; only the raw buffer is stored here
+ dout(20) << " got config_defaults_bl " << daemon->config_defaults_bl.length()
+ << " bytes" << dendl;
+ }
+
+ if (con->get_peer_type() != entity_name_t::TYPE_CLIENT &&
+ m->service_name.empty())
+ {
+ // Store in set of the daemon/service connections, i.e. those
+ // connections that require an update in the event of stats
+ // configuration changes.
+ daemon_connections.insert(con);
+ }
+
+ return true;
+ }
+
+/**
+ * Handle an MMgrUpdate message.
+ *
+ * Update requests from plain clients (client peer type with no service
+ * name) are rejected and the connection is marked down. Otherwise, when a
+ * DaemonState exists for the sender and the message both asks for a
+ * metadata update and carries non-empty metadata, the metadata is refreshed
+ * through the daemon state index.
+ *
+ * @param m  the update message
+ * @return always true
+ */
+bool DaemonServer::handle_update(const ref_t<MMgrUpdate>& m)
+{
+  DaemonKey key;
+  if (!m->service_name.empty()) {
+    key.type = m->service_name;
+  } else {
+    key.type = ceph_entity_type_name(m->get_connection()->get_peer_type());
+  }
+  key.name = m->daemon_name;
+
+  dout(10) << "from " << m->get_connection() << " " << key << dendl;
+
+  if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT &&
+      m->service_name.empty()) {
+    // Clients should not be sending us update requests
+    dout(10) << "rejecting update request from non-daemon client " << m->daemon_name
+             << dendl;
+    // The cluster log names the message kind that was actually rejected
+    // (an update request, not a report), matching the debug log above.
+    clog->warn() << "rejecting update request from non-daemon client " << m->daemon_name
+                 << " at " << m->get_connection()->get_peer_addrs();
+    m->get_connection()->mark_down();
+    return true;
+  }
+
+  std::unique_lock locker(lock);
+
+  // Look up the DaemonState; unknown daemons are silently ignored
+  if (daemon_state.exists(key)) {
+    dout(20) << "updating existing DaemonState for " << key << dendl;
+
+    DaemonStatePtr daemon = daemon_state.get(key);
+    if (m->need_metadata_update &&
+        !m->daemon_metadata.empty()) {
+      daemon_state.update_metadata(daemon, m->daemon_metadata);
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Handle an MMgrClose message.
+ *
+ * Removes the sender's DaemonState (if any) from the index, drops a service
+ * daemon from pending_service_map, and echoes the message back as the reply.
+ *
+ * @param m  the close message
+ * @return always true
+ */
+bool DaemonServer::handle_close(const ref_t<MMgrClose>& m)
+{
+  std::lock_guard l(lock);
+
+  DaemonKey key = key_from_service(m->service_name,
+                                   m->get_connection()->get_peer_type(),
+                                   m->daemon_name);
+  dout(4) << "from " << m->get_connection() << " " << key << dendl;
+
+  if (daemon_state.exists(key)) {
+    // Keep a reference so the state can still be inspected after it has
+    // been removed from the index.
+    DaemonStatePtr closing = daemon_state.get(key);
+    daemon_state.rm(key);
+
+    std::lock_guard daemon_locker(closing->lock);
+    if (closing->service_daemon) {
+      pending_service_map.rm_daemon(m->service_name, m->daemon_name);
+      pending_service_map_dirty = pending_service_map.epoch;
+    }
+  }
+
+  // send same message back as a reply
+  m->get_connection()->send_message2(m);
+  return true;
+}
+
+/**
+ * Record a daemon's task status in pending_service_map.
+ *
+ * The map is marked dirty only when the stored status differs from the
+ * reported one.
+ *
+ * @param key          daemon (type/name) the status belongs to
+ * @param task_status  task status reported by that daemon
+ */
+void DaemonServer::update_task_status(
+  DaemonKey key,
+  const std::map<std::string,std::string>& task_status)
+{
+  dout(10) << "got task status from " << key << dendl;
+
+  [[maybe_unused]] auto [svc_daemon, is_new] =
+    pending_service_map.get_daemon(key.type, key.name);
+  if (svc_daemon->task_status == task_status) {
+    return;  // nothing changed
+  }
+  svc_daemon->task_status = task_status;
+  pending_service_map_dirty = pending_service_map.epoch;
+}
+
+/**
+ * Handle an MMgrReport message (perf counters, config, status, health).
+ *
+ * Reports from plain clients (client peer type with no service name) are
+ * rejected and the connection is marked down. If no DaemonState exists for
+ * the sender, a metadata fetch is started, the session is torn down and the
+ * report is dropped. Otherwise the daemon's perf counters, config, service
+ * status, task status and health metrics are updated, and OSD perf metric
+ * reports and any metric report payload are processed.
+ *
+ * @param m  the report message
+ * @return false when the report is dropped because the daemon is unknown;
+ *         true otherwise (including the rejected-client case)
+ */
+bool DaemonServer::handle_report(const ref_t<MMgrReport>& m)
+{
+ DaemonKey key;
+ if (!m->service_name.empty()) {
+ key.type = m->service_name;
+ } else {
+ key.type = ceph_entity_type_name(m->get_connection()->get_peer_type());
+ }
+ key.name = m->daemon_name;
+
+ dout(10) << "from " << m->get_connection() << " " << key << dendl;
+
+ if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT &&
+ m->service_name.empty()) {
+ // Clients should not be sending us stats unless they are declaring
+ // themselves to be a daemon for some service.
+ dout(10) << "rejecting report from non-daemon client " << m->daemon_name
+ << dendl;
+ clog->warn() << "rejecting report from non-daemon client " << m->daemon_name
+ << " at " << m->get_connection()->get_peer_addrs();
+ m->get_connection()->mark_down();
+ return true;
+ }
+
+
+ {
+ std::unique_lock locker(lock);
+
+ DaemonStatePtr daemon;
+ // Look up the DaemonState
+ if (daemon_state.exists(key)) {
+ dout(20) << "updating existing DaemonState for " << key << dendl;
+ daemon = daemon_state.get(key);
+ } else {
+ // the server lock is dropped around the metadata fetch and re-taken
+ // before the session bookkeeping below
+ locker.unlock();
+
+ // we don't know the hostname at this stage, reject MMgrReport here.
+ dout(5) << "rejecting report from " << key << ", since we do not have its metadata now."
+ << dendl;
+ // issue metadata request in background
+ fetch_missing_metadata(key, m->get_source_addr());
+
+ locker.lock();
+
+ // kill session
+ auto priv = m->get_connection()->get_priv();
+ auto session = static_cast<MgrSession*>(priv.get());
+ if (!session) {
+ return false;
+ }
+ m->get_connection()->mark_down();
+
+ dout(10) << "unregistering osd." << session->osd_id
+ << " session " << session << " con " << m->get_connection() << dendl;
+
+ // forget this connection in both connection indexes
+ if (osd_cons.find(session->osd_id) != osd_cons.end()) {
+ osd_cons[session->osd_id].erase(m->get_connection());
+ }
+
+ auto iter = daemon_connections.find(m->get_connection());
+ if (iter != daemon_connections.end()) {
+ daemon_connections.erase(iter);
+ }
+
+ return false;
+ }
+
+ // Update the DaemonState
+ ceph_assert(daemon != nullptr);
+ {
+ // per-daemon lock, taken while the server lock is still held
+ std::lock_guard l(daemon->lock);
+ auto &daemon_counters = daemon->perf_counters;
+ daemon_counters.update(*m.get());
+
+ // config_bl is optional; decode only when it carries data
+ auto p = m->config_bl.cbegin();
+ if (p != m->config_bl.end()) {
+ decode(daemon->config, p);
+ decode(daemon->ignored_mon_config, p);
+ dout(20) << " got config " << daemon->config
+ << " ignored " << daemon->ignored_mon_config << dendl;
+ }
+
+ utime_t now = ceph_clock_now();
+ if (daemon->service_daemon) {
+ if (m->daemon_status) {
+ daemon->service_status_stamp = now;
+ daemon->service_status = *m->daemon_status;
+ }
+ daemon->last_service_beacon = now;
+ } else if (m->daemon_status) {
+ derr << "got status from non-daemon " << key << dendl;
+ }
+ // update task status
+ if (m->task_status) {
+ update_task_status(key, *m->task_status);
+ daemon->last_service_beacon = now;
+ }
+ if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) {
+ // only OSD and MON send health_checks to me now
+ daemon->daemon_health_metrics = std::move(m->daemon_health_metrics);
+ dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics
+ << dendl;
+ }
+ }
+ }
+
+ // if there are any schema updates, notify the python modules
+ /* no users currently
+ if (!m->declare_types.empty() || !m->undeclare_types.empty()) {
+ py_modules.notify_all("perf_schema_update", ceph::to_string(key));
+ }
+ */
+
+ // the remaining processing runs after the server lock scope has ended
+ if (m->get_connection()->peer_is_osd()) {
+ osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports);
+ }
+
+ if (m->metric_report_message) {
+ const MetricReportMessage &message = *m->metric_report_message;
+ boost::apply_visitor(HandlePayloadVisitor(this), message.payload);
+ }
+
+ return true;
+}
+
+
+/**
+ * Flatten a parsed command map into string key/value pairs.
+ *
+ * "prefix" is skipped. A "caps" argument that can be read as a string
+ * vector with an even number of entries is expanded pairwise into
+ * "caps_<name>" -> <value>; every other argument (including a "caps" that
+ * does not fit that shape) is stored under its own name, stringified.
+ *
+ * @param cmdmap         parsed command arguments
+ * @param param_str_map  output map; entries are added or overwritten
+ */
+void DaemonServer::_generate_command_map(
+  cmdmap_t& cmdmap,
+  map<string,string> &param_str_map)
+{
+  for (const auto& [arg_name, arg_value] : cmdmap) {
+    if (arg_name == "prefix") {
+      continue;
+    }
+
+    vector<string> cap_list;
+    const bool expand_caps =
+      arg_name == "caps" &&
+      cmd_getval(cmdmap, "caps", cap_list) &&
+      cap_list.size() % 2 == 0;
+
+    if (!expand_caps) {
+      param_str_map[arg_name] = cmd_vartype_stringify(arg_value);
+      continue;
+    }
+
+    // entries alternate: name, value, name, value, ...
+    for (unsigned idx = 0; idx < cap_list.size(); idx += 2) {
+      param_str_map[string("caps_") + cap_list[idx]] = cap_list[idx + 1];
+    }
+  }
+}
+
+/**
+ * Find the first command whose cmdstring begins with the given prefix.
+ *
+ * @param cmd_prefix  command prefix to look for
+ * @param cmds        command table to search, in order
+ * @return pointer to the matching entry inside cmds, or nullptr if none
+ */
+const MonCommand *DaemonServer::_get_mgrcommand(
+  const string &cmd_prefix,
+  const std::vector<MonCommand> &cmds)
+{
+  for (const auto &candidate : cmds) {
+    const bool starts_with_prefix =
+      candidate.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0;
+    if (starts_with_prefix) {
+      return &candidate;
+    }
+  }
+  return nullptr;
+}
+
+/**
+ * Decide whether a session may run a command.
+ *
+ * Monitors are always allowed. For everyone else the session's caps are
+ * checked against the r/w/x permissions the command requires.
+ *
+ * @param s              session issuing the command
+ * @param service        service name passed to the capability check
+ * @param module         module name passed to the capability check
+ * @param prefix         command prefix
+ * @param cmdmap         parsed command arguments (not read here)
+ * @param param_str_map  stringified command arguments
+ * @param this_cmd       command descriptor; dereferenced for non-mon
+ *                       sessions, so it must not be null there
+ * @return true if the command is permitted
+ */
+bool DaemonServer::_allowed_command(
+  MgrSession *s,
+  const string &service,
+  const string &module,
+  const string &prefix,
+  const cmdmap_t& cmdmap,
+  const map<string,string>& param_str_map,
+  const MonCommand *this_cmd) {
+
+  // mon is all-powerful.  even when it is forwarding commands on behalf of
+  // old clients; we expect the mon is validating commands before proxying!
+  if (s->entity_name.is_mon()) {
+    return true;
+  }
+
+  const bool needs_read = this_cmd->requires_perm('r');
+  const bool needs_write = this_cmd->requires_perm('w');
+  const bool needs_exec = this_cmd->requires_perm('x');
+
+  const bool capable = s->caps.is_capable(
+    g_ceph_context,
+    s->entity_name,
+    service, module, prefix, param_str_map,
+    needs_read, needs_write, needs_exec,
+    s->get_peer_addr());
+
+  dout(10) << " " << s->entity_name << " "
+           << (capable ? "" : "not ") << "capable" << dendl;
+  return capable;
+}
+
+/**
+ * Working state for one MCommand / MMgrCommand being processed.
+ *
+ * Kept in its own class so it can be handed to other threads and completed
+ * outside of the thread/locks that called handle_command. Exactly one of
+ * m_tell / m_mgr is set, depending on which constructor was used.
+ */
+class CommandContext {
+public:
+  ceph::ref_t<MCommand> m_tell;
+  ceph::ref_t<MMgrCommand> m_mgr;
+  const std::vector<std::string>& cmd;  ///< ref into m_tell or m_mgr
+  const bufferlist& data;               ///< ref into m_tell or m_mgr
+  bufferlist odata;
+  cmdmap_t cmdmap;
+
+  /// Wrap a tell-style MCommand.
+  explicit CommandContext(ceph::ref_t<MCommand> m)
+    : m_tell{std::move(m)},
+      cmd(m_tell->cmd),
+      data(m_tell->get_data()) {
+  }
+  /// Wrap an MMgrCommand.
+  explicit CommandContext(ceph::ref_t<MMgrCommand> m)
+    : m_mgr{std::move(m)},
+      cmd(m_mgr->cmd),
+      data(m_mgr->get_data()) {
+  }
+
+  /// Convenience overload: reply with the contents of a stringstream.
+  void reply(int r, const std::stringstream &ss) {
+    reply(r, ss.str());
+  }
+
+  /**
+   * Send the result of the command back to the requester.
+   *
+   * @param r   return code (0 on success)
+   * @param rs  status string; odata is attached as the reply payload
+   */
+  void reply(int r, const std::string &rs) {
+    ConnectionRef con = m_tell ? m_tell->get_connection()
+                               : m_mgr->get_connection();
+    // Let the connection drop as soon as we've sent our response
+    if (con) {
+      con->mark_disposable();
+    }
+
+    if (r != 0) {
+      derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl;
+    } else {
+      dout(20) << "success" << dendl;
+    }
+
+    if (!con) {
+      return;  // nobody to answer
+    }
+
+    // The reply type mirrors the type of the request.
+    if (m_tell) {
+      auto tell_reply = new MCommandReply(r, rs);
+      tell_reply->set_tid(m_tell->get_tid());
+      tell_reply->set_data(odata);
+      con->send_message(tell_reply);
+    } else {
+      auto mgr_reply = new MMgrCommandReply(r, rs);
+      mgr_reply->set_tid(m_mgr->get_tid());
+      mgr_reply->set_data(odata);
+      con->send_message(mgr_reply);
+    }
+  }
+};
+
+/**
+ * Completion that collects a bufferlist and status string from a
+ * background operation and, once finished, forwards them as the reply of
+ * the CommandContext it was created for.
+ */
+class ReplyOnFinish : public Context {
+  std::shared_ptr<CommandContext> ctx;
+
+public:
+  bufferlist from_mon;  ///< output payload filled in by the operation
+  string outs;          ///< status string filled in by the operation
+
+  explicit ReplyOnFinish(const std::shared_ptr<CommandContext> &cmdctx_)
+    : ctx(cmdctx_)
+  {}
+
+  /// Append the collected payload to the command output and reply with r.
+  void finish(int r) override {
+    ctx->odata.claim_append(from_mon);
+    ctx->reply(r, outs);
+  }
+};
+
+/**
+ * Entry point for an MCommand: wraps it in a CommandContext and dispatches
+ * it under the server lock. A bad_cmd_get raised during dispatch is turned
+ * into an -EINVAL reply.
+ *
+ * @param m  the command message
+ * @return the result of _handle_command(), or true after an -EINVAL reply
+ */
+bool DaemonServer::handle_command(const ref_t<MCommand>& m)
+{
+  std::lock_guard l(lock);
+  auto ctx = std::make_shared<CommandContext>(m);
+  bool handled = true;
+  try {
+    handled = _handle_command(ctx);
+  } catch (const bad_cmd_get& e) {
+    ctx->reply(-EINVAL, e.what());
+  }
+  return handled;
+}
+
+/**
+ * Entry point for an MMgrCommand: wraps it in a CommandContext and
+ * dispatches it under the server lock. A bad_cmd_get raised during dispatch
+ * is turned into an -EINVAL reply.
+ *
+ * @param m  the command message
+ * @return the result of _handle_command(), or true after an -EINVAL reply
+ */
+bool DaemonServer::handle_command(const ref_t<MMgrCommand>& m)
+{
+  std::lock_guard l(lock);
+  auto ctx = std::make_shared<CommandContext>(m);
+  bool handled = true;
+  try {
+    handled = _handle_command(ctx);
+  } catch (const bad_cmd_get& e) {
+    ctx->reply(-EINVAL, e.what());
+  }
+  return handled;
+}
+
+/**
+ * Report a denied command: writes a debug line, an audit log entry naming
+ * the session and command, and appends a user-facing hint to ss.
+ *
+ * @param cmdctx   context of the command that was denied
+ * @param session  session that issued the command
+ * @param ss       stream receiving the message returned to the caller
+ */
+void DaemonServer::log_access_denied(
+  std::shared_ptr<CommandContext>& cmdctx,
+  MgrSession* session, std::stringstream& ss) {
+  dout(1) << " access denied" << dendl;
+
+  audit_clog->info()
+    << "from='" << session->inst
+    << "' entity='" << session->entity_name
+    << "' cmd=" << cmdctx->cmd
+    << ": access denied";
+
+  ss << "access denied: does your client key have mgr caps? See "
+        "http://docs.ceph.com/en/latest/mgr/administrator/#client-authentication";
+}
+
+/**
+ * Classify every PG that would be affected by taking a set of OSDs offline.
+ *
+ * PGs with state 0 are recorded as unknown. For the remaining PGs, the
+ * members that would survive are collected (from avail_no_missing when the
+ * PG is degraded, from acting otherwise); PGs that do not involve any of
+ * the given OSDs are ignored. An affected PG is "not ok" when its pool is
+ * missing from the OSDMap, when it is already inactive, or when fewer than
+ * the pool's min_size members would remain; otherwise it is "ok" and
+ * additionally recorded as becoming degraded or more degraded.
+ *
+ * @param osds    OSDs that would go offline
+ * @param osdmap  OSD map used for pool lookups
+ * @param pgmap   PG map supplying per-PG state and membership
+ * @param report  output; reset and refilled on every call
+ */
+void DaemonServer::_check_offlines_pgs(
+  const set<int>& osds,
+  const OSDMap& osdmap,
+  const PGMap& pgmap,
+  offline_pg_report *report)
+{
+  // reset output
+  *report = offline_pg_report();
+  report->osds = osds;
+
+  for (const auto& q : pgmap.pg_stat) {
+    set<int32_t> pg_acting;  // net acting sets (with no missing if degraded)
+    bool found = false;
+    if (q.second.state == 0) {
+      report->unknown.insert(q.first);
+      continue;
+    }
+    if (q.second.state & PG_STATE_DEGRADED) {
+      for (auto& anm : q.second.avail_no_missing) {
+        if (osds.count(anm.osd)) {
+          found = true;
+          continue;
+        }
+        if (anm.osd != CRUSH_ITEM_NONE) {
+          pg_acting.insert(anm.osd);
+        }
+      }
+    } else {
+      for (auto& a : q.second.acting) {
+        if (osds.count(a)) {
+          found = true;
+          continue;
+        }
+        if (a != CRUSH_ITEM_NONE) {
+          pg_acting.insert(a);
+        }
+      }
+    }
+    if (!found) {
+      continue;
+    }
+    const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
+    bool dangerous = false;
+    if (!pi) {
+      report->bad_no_pool.insert(q.first); // pool is creating or deleting
+      dangerous = true;
+    }
+    if (!(q.second.state & PG_STATE_ACTIVE)) {
+      report->bad_already_inactive.insert(q.first);
+      dangerous = true;
+    }
+    // Without a pool there is no min_size to compare against; the PG has
+    // already been flagged through bad_no_pool, so do not dereference pi.
+    if (pi && pg_acting.size() < pi->min_size) {
+      report->bad_become_inactive.insert(q.first);
+      dangerous = true;
+    }
+    if (dangerous) {
+      report->not_ok.insert(q.first);
+    } else {
+      report->ok.insert(q.first);
+      if (q.second.state & PG_STATE_DEGRADED) {
+        report->ok_become_more_degraded.insert(q.first);
+      } else {
+        report->ok_become_degraded.insert(q.first);
+      }
+    }
+  }
+  dout(20) << osds << " -> " << report->ok.size() << " ok, "
+           << report->not_ok.size() << " not ok, "
+           << report->unknown.size() << " unknown"
+           << dendl;
+}
+
+/**
+ * Try to grow a set of OSDs that is ok to stop, up to a maximum size.
+ *
+ * The original set is checked first and its report is always written to
+ * out_report. If that set is ok to stop and smaller than max, the CRUSH
+ * hierarchy is walked upwards from the first OSD of the set; at each level
+ * the up OSDs beneath the current ancestor are added one at a time, keeping
+ * an addition only when the enlarged set is still ok to stop. The search
+ * ends when max is reached, when the hierarchy cannot be walked further, or
+ * after a level at which some candidate had to be rejected.
+ *
+ * @param orig_osds   OSDs the caller wants to stop
+ * @param max         maximum size of the resulting set
+ * @param osdmap      OSD map (CRUSH hierarchy and up state)
+ * @param pgmap       PG map passed to the per-set check
+ * @param out_report  report for the largest acceptable set found; its
+ *                    osds member holds that set
+ */
+void DaemonServer::_maximize_ok_to_stop_set(
+  const set<int>& orig_osds,
+  unsigned max,
+  const OSDMap& osdmap,
+  const PGMap& pgmap,
+  offline_pg_report *out_report)
+{
+  dout(20) << "orig_osds " << orig_osds << " max " << max << dendl;
+  _check_offlines_pgs(orig_osds, osdmap, pgmap, out_report);
+  if (!out_report->ok_to_stop()) {
+    return;
+  }
+  if (orig_osds.size() >= max) {
+    // already at max
+    return;
+  }
+  if (orig_osds.empty()) {
+    // no starting point in the CRUSH tree; dereferencing begin() below
+    // would be undefined behaviour
+    return;
+  }
+
+  // semi-arbitrarily start with the first osd in the set
+  offline_pg_report report;
+  set<int> osds = orig_osds;
+  int parent = *osds.begin();
+  set<int> children;
+
+  while (true) {
+    // identify the next parent
+    int r = osdmap.crush->get_immediate_parent_id(parent, &parent);
+    if (r < 0) {
+      return;  // just go with what we have so far!
+    }
+
+    // get candidate additions that are beneath this point in the tree
+    children.clear();
+    r = osdmap.crush->get_all_children(parent, &children);
+    if (r < 0) {
+      return;  // just go with what we have so far!
+    }
+    dout(20) << " parent " << parent << " children " << children << dendl;
+
+    // try adding in more osds
+    int failed = 0;  // how many children we failed to add to our set
+    for (auto o : children) {
+      if (o >= 0 && osdmap.is_up(o) && osds.count(o) == 0) {
+        osds.insert(o);
+        _check_offlines_pgs(osds, osdmap, pgmap, &report);
+        if (!report.ok_to_stop()) {
+          osds.erase(o);
+          ++failed;
+          continue;
+        }
+        *out_report = report;
+        if (osds.size() == max) {
+          dout(20) << " hit max" << dendl;
+          return;  // yay, we hit the max
+        }
+      }
+    }
+
+    if (failed) {
+      // we hit some failures; go with what we have
+      dout(20) << " hit some peer failures" << dendl;
+      return;
+    }
+  }
+}
+
+bool DaemonServer::_handle_command(
+ std::shared_ptr<CommandContext>& cmdctx)
+{
+ MessageRef m;
+ bool admin_socket_cmd = false;
+ if (cmdctx->m_tell) {
+ m = cmdctx->m_tell;
+ // a blank fsid in MCommand signals a legacy client sending a "mon-mgr" CLI
+ // command.
+ admin_socket_cmd = (cmdctx->m_tell->fsid != uuid_d());
+ } else {
+ m = cmdctx->m_mgr;
+ }
+ auto priv = m->get_connection()->get_priv();
+ auto session = static_cast<MgrSession*>(priv.get());
+ if (!session) {
+ return true;
+ }
+ if (session->inst.name == entity_name_t()) {
+ session->inst.name = m->get_source();
+ }
+
+ map<string,string> param_str_map;
+ std::stringstream ss;
+ int r = 0;
+
+ if (!cmdmap_from_json(cmdctx->cmd, &(cmdctx->cmdmap), ss)) {
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+
+ string prefix;
+ cmd_getval(cmdctx->cmdmap, "prefix", prefix);
+ dout(10) << "decoded-size=" << cmdctx->cmdmap.size() << " prefix=" << prefix << dendl;
+
+ boost::scoped_ptr<Formatter> f;
+ {
+ std::string format;
+ if (boost::algorithm::ends_with(prefix, "_json")) {
+ format = "json";
+ } else {
+ cmd_getval(cmdctx->cmdmap, "format", format, string("plain"));
+ }
+ f.reset(Formatter::create(format));
+ }
+
+ // this is just for mgr commands - admin socket commands will fall
+ // through and use the admin socket version of
+ // get_command_descriptions
+ if (prefix == "get_command_descriptions" && !admin_socket_cmd) {
+ dout(10) << "reading commands from python modules" << dendl;
+ const auto py_commands = py_modules.get_commands();
+
+ int cmdnum = 0;
+ JSONFormatter f;
+ f.open_object_section("command_descriptions");
+
+ auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){
+ ostringstream secname;
+ secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+ dump_cmddesc_to_json(&f, m->get_connection()->get_features(),
+ secname.str(), mc.cmdstring, mc.helpstring,
+ mc.module, mc.req_perms, 0);
+ cmdnum++;
+ };
+
+ for (const auto &pyc : py_commands) {
+ dump_cmd(pyc);
+ }
+
+ for (const auto &mgr_cmd : mgr_commands) {
+ dump_cmd(mgr_cmd);
+ }
+
+ f.close_section(); // command_descriptions
+ f.flush(cmdctx->odata);
+ cmdctx->reply(0, ss);
+ return true;
+ }
+
+ // lookup command
+ const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands);
+ _generate_command_map(cmdctx->cmdmap, param_str_map);
+
+ bool is_allowed = false;
+ ModuleCommand py_command;
+ if (admin_socket_cmd) {
+ // admin socket commands require all capabilities
+ is_allowed = session->caps.is_allow_all();
+ } else if (!mgr_cmd) {
+ // Resolve the command to the name of the module that will
+ // handle it (if the command exists)
+ auto py_commands = py_modules.get_py_commands();
+ for (const auto &pyc : py_commands) {
+ auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
+ if (pyc_prefix == prefix) {
+ py_command = pyc;
+ break;
+ }
+ }
+
+ MonCommand pyc = {"", "", "py", py_command.perm};
+ is_allowed = _allowed_command(session, "py", py_command.module_name,
+ prefix, cmdctx->cmdmap, param_str_map,
+ &pyc);
+ } else {
+ // validate user's permissions for requested command
+ is_allowed = _allowed_command(session, mgr_cmd->module, "",
+ prefix, cmdctx->cmdmap, param_str_map, mgr_cmd);
+ }
+
+ if (!is_allowed) {
+ log_access_denied(cmdctx, session, ss);
+ cmdctx->reply(-EACCES, ss);
+ return true;
+ }
+
+ audit_clog->debug()
+ << "from='" << session->inst << "' "
+ << "entity='" << session->entity_name << "' "
+ << "cmd=" << cmdctx->cmd << ": dispatch";
+
+ if (admin_socket_cmd) {
+ cct->get_admin_socket()->queue_tell_command(cmdctx->m_tell);
+ return true;
+ }
+
+ // ----------------
+ // service map commands
+ if (prefix == "service dump") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ cluster_state.with_servicemap([&](const ServiceMap &service_map) {
+ f->dump_object("service_map", service_map);
+ });
+ f->flush(cmdctx->odata);
+ cmdctx->reply(0, ss);
+ return true;
+ }
+ if (prefix == "service status") {
+ if (!f)
+ f.reset(Formatter::create("json-pretty"));
+ // only include state from services that are in the persisted service map
+ f->open_object_section("service_status");
+ for (auto& [type, service] : pending_service_map.services) {
+ if (ServiceMap::is_normal_ceph_entity(type)) {
+ continue;
+ }
+
+ f->open_object_section(type.c_str());
+ for (auto& q : service.daemons) {
+ f->open_object_section(q.first.c_str());
+ DaemonKey key{type, q.first};
+ ceph_assert(daemon_state.exists(key));
+ auto daemon = daemon_state.get(key);
+ std::lock_guard l(daemon->lock);
+ f->dump_stream("status_stamp") << daemon->service_status_stamp;
+ f->dump_stream("last_beacon") << daemon->last_service_beacon;
+ f->open_object_section("status");
+ for (auto& r : daemon->service_status) {
+ f->dump_string(r.first.c_str(), r.second);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->flush(cmdctx->odata);
+ cmdctx->reply(0, ss);
+ return true;
+ }
+
+ if (prefix == "config set") {
+ std::string key;
+ std::string val;
+ cmd_getval(cmdctx->cmdmap, "key", key);
+ cmd_getval(cmdctx->cmdmap, "value", val);
+ r = cct->_conf.set_val(key, val, &ss);
+ if (r == 0) {
+ cct->_conf.apply_changes(nullptr);
+ }
+ cmdctx->reply(0, ss);
+ return true;
+ }
+
+ // -----------
+ // PG commands
+
+ if (prefix == "pg scrub" ||
+ prefix == "pg repair" ||
+ prefix == "pg deep-scrub") {
+ string scrubop = prefix.substr(3, string::npos);
+ pg_t pgid;
+ spg_t spgid;
+ string pgidstr;
+ cmd_getval(cmdctx->cmdmap, "pgid", pgidstr);
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "invalid pgid '" << pgidstr << "'";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ bool pg_exists = false;
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ pg_exists = osdmap.pg_exists(pgid);
+ });
+ if (!pg_exists) {
+ ss << "pg " << pgid << " does not exist";
+ cmdctx->reply(-ENOENT, ss);
+ return true;
+ }
+ int acting_primary = -1;
+ epoch_t epoch;
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ epoch = osdmap.get_epoch();
+ osdmap.get_primary_shard(pgid, &acting_primary, &spgid);
+ });
+ if (acting_primary == -1) {
+ ss << "pg " << pgid << " has no primary osd";
+ cmdctx->reply(-EAGAIN, ss);
+ return true;
+ }
+ auto p = osd_cons.find(acting_primary);
+ if (p == osd_cons.end()) {
+ ss << "pg " << pgid << " primary osd." << acting_primary
+ << " is not currently connected";
+ cmdctx->reply(-EAGAIN, ss);
+ return true;
+ }
+ for (auto& con : p->second) {
+ if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) {
+ vector<spg_t> pgs = { spgid };
+ con->send_message(new MOSDScrub2(monc->get_fsid(),
+ epoch,
+ pgs,
+ scrubop == "repair",
+ scrubop == "deep-scrub"));
+ } else {
+ vector<pg_t> pgs = { pgid };
+ con->send_message(new MOSDScrub(monc->get_fsid(),
+ pgs,
+ scrubop == "repair",
+ scrubop == "deep-scrub"));
+ }
+ }
+ ss << "instructing pg " << spgid << " on osd." << acting_primary
+ << " to " << scrubop;
+ cmdctx->reply(0, ss);
+ return true;
+ } else if (prefix == "osd scrub" ||
+ prefix == "osd deep-scrub" ||
+ prefix == "osd repair") {
+ string whostr;
+ cmd_getval(cmdctx->cmdmap, "who", whostr);
+ vector<string> pvec;
+ get_str_vec(prefix, pvec);
+
+ set<int> osds;
+ if (whostr == "*" || whostr == "all" || whostr == "any") {
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ for (int i = 0; i < osdmap.get_max_osd(); i++)
+ if (osdmap.is_up(i)) {
+ osds.insert(i);
+ }
+ });
+ } else {
+ long osd = parse_osd_id(whostr.c_str(), &ss);
+ if (osd < 0) {
+ ss << "invalid osd '" << whostr << "'";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ if (osdmap.is_up(osd)) {
+ osds.insert(osd);
+ }
+ });
+ if (osds.empty()) {
+ ss << "osd." << osd << " is not up";
+ cmdctx->reply(-EAGAIN, ss);
+ return true;
+ }
+ }
+ set<int> sent_osds, failed_osds;
+ for (auto osd : osds) {
+ vector<spg_t> spgs;
+ epoch_t epoch;
+ cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) {
+ epoch = osdmap.get_epoch();
+ auto p = pgmap.pg_by_osd.find(osd);
+ if (p != pgmap.pg_by_osd.end()) {
+ for (auto pgid : p->second) {
+ int primary;
+ spg_t spg;
+ osdmap.get_primary_shard(pgid, &primary, &spg);
+ if (primary == osd) {
+ spgs.push_back(spg);
+ }
+ }
+ }
+ });
+ auto p = osd_cons.find(osd);
+ if (p == osd_cons.end()) {
+ failed_osds.insert(osd);
+ } else {
+ sent_osds.insert(osd);
+ for (auto& con : p->second) {
+ if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) {
+ con->send_message(new MOSDScrub2(monc->get_fsid(),
+ epoch,
+ spgs,
+ pvec.back() == "repair",
+ pvec.back() == "deep-scrub"));
+ } else {
+ con->send_message(new MOSDScrub(monc->get_fsid(),
+ pvec.back() == "repair",
+ pvec.back() == "deep-scrub"));
+ }
+ }
+ }
+ }
+ if (failed_osds.size() == osds.size()) {
+ ss << "failed to instruct osd(s) " << osds << " to " << pvec.back()
+ << " (not connected)";
+ r = -EAGAIN;
+ } else {
+ ss << "instructed osd(s) " << sent_osds << " to " << pvec.back();
+ if (!failed_osds.empty()) {
+ ss << "; osd(s) " << failed_osds << " were not connected";
+ }
+ r = 0;
+ }
+ cmdctx->reply(0, ss);
+ return true;
+ } else if (prefix == "osd pool scrub" ||
+ prefix == "osd pool deep-scrub" ||
+ prefix == "osd pool repair") {
+ vector<string> pool_names;
+ cmd_getval(cmdctx->cmdmap, "who", pool_names);
+ if (pool_names.empty()) {
+ ss << "must specify one or more pool names";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ epoch_t epoch;
+ map<int32_t, vector<pg_t>> pgs_by_primary; // legacy
+ map<int32_t, vector<spg_t>> spgs_by_primary;
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ epoch = osdmap.get_epoch();
+ for (auto& pool_name : pool_names) {
+ auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ return;
+ }
+ auto pool_pg_num = osdmap.get_pg_num(pool_id);
+ for (int i = 0; i < pool_pg_num; i++) {
+ pg_t pg(i, pool_id);
+ int primary;
+ spg_t spg;
+ auto got = osdmap.get_primary_shard(pg, &primary, &spg);
+ if (!got)
+ continue;
+ pgs_by_primary[primary].push_back(pg);
+ spgs_by_primary[primary].push_back(spg);
+ }
+ }
+ });
+ if (r < 0) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ for (auto& it : spgs_by_primary) {
+ auto primary = it.first;
+ auto p = osd_cons.find(primary);
+ if (p == osd_cons.end()) {
+ ss << "osd." << primary << " is not currently connected";
+ cmdctx->reply(-EAGAIN, ss);
+ return true;
+ }
+ for (auto& con : p->second) {
+ if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) {
+ con->send_message(new MOSDScrub2(monc->get_fsid(),
+ epoch,
+ it.second,
+ prefix == "osd pool repair",
+ prefix == "osd pool deep-scrub"));
+ } else {
+ // legacy
+ auto q = pgs_by_primary.find(primary);
+ ceph_assert(q != pgs_by_primary.end());
+ con->send_message(new MOSDScrub(monc->get_fsid(),
+ q->second,
+ prefix == "osd pool repair",
+ prefix == "osd pool deep-scrub"));
+ }
+ }
+ }
+ cmdctx->reply(0, "");
+ return true;
+ } else if (prefix == "osd reweight-by-pg" ||
+ prefix == "osd reweight-by-utilization" ||
+ prefix == "osd test-reweight-by-pg" ||
+ prefix == "osd test-reweight-by-utilization") {
+ bool by_pg =
+ prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
+ bool dry_run =
+ prefix == "osd test-reweight-by-pg" ||
+ prefix == "osd test-reweight-by-utilization";
+ int64_t oload;
+ cmd_getval(cmdctx->cmdmap, "oload", oload, int64_t(120));
+ set<int64_t> pools;
+ vector<string> poolnames;
+ cmd_getval(cmdctx->cmdmap, "pools", poolnames);
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ for (const auto& poolname : poolnames) {
+ int64_t pool = osdmap.lookup_pg_pool_name(poolname);
+ if (pool < 0) {
+ ss << "pool '" << poolname << "' does not exist";
+ r = -ENOENT;
+ }
+ pools.insert(pool);
+ }
+ });
+ if (r) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+
+ double max_change = g_conf().get_val<double>("mon_reweight_max_change");
+ cmd_getval(cmdctx->cmdmap, "max_change", max_change);
+ if (max_change <= 0.0) {
+ ss << "max_change " << max_change << " must be positive";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ int64_t max_osds = g_conf().get_val<int64_t>("mon_reweight_max_osds");
+ cmd_getval(cmdctx->cmdmap, "max_osds", max_osds);
+ if (max_osds <= 0) {
+ ss << "max_osds " << max_osds << " must be positive";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ bool no_increasing = false;
+ cmd_getval(cmdctx->cmdmap, "no_increasing", no_increasing);
+ string out_str;
+ mempool::osdmap::map<int32_t, uint32_t> new_weights;
+ r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) {
+ return reweight::by_utilization(osdmap, pgmap,
+ oload,
+ max_change,
+ max_osds,
+ by_pg,
+ pools.empty() ? NULL : &pools,
+ no_increasing,
+ &new_weights,
+ &ss, &out_str, f.get());
+ });
+ if (r >= 0) {
+ dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
+ }
+ if (f) {
+ f->flush(cmdctx->odata);
+ } else {
+ cmdctx->odata.append(out_str);
+ }
+ if (r < 0) {
+ ss << "FAILED reweight-by-pg";
+ cmdctx->reply(r, ss);
+ return true;
+ } else if (r == 0 || dry_run) {
+ ss << "no change";
+ cmdctx->reply(r, ss);
+ return true;
+ } else {
+ json_spirit::Object json_object;
+ for (const auto& osd_weight : new_weights) {
+ json_spirit::Config::add(json_object,
+ std::to_string(osd_weight.first),
+ std::to_string(osd_weight.second));
+ }
+ string s = json_spirit::write(json_object);
+ std::replace(begin(s), end(s), '\"', '\'');
+ const string cmd =
+ "{"
+ "\"prefix\": \"osd reweightn\", "
+ "\"weights\": \"" + s + "\""
+ "}";
+ auto on_finish = new ReplyOnFinish(cmdctx);
+ monc->start_mon_command({cmd}, {},
+ &on_finish->from_mon, &on_finish->outs, on_finish);
+ return true;
+ }
+ } else if (prefix == "osd df") {
+ string method, filter;
+ cmd_getval(cmdctx->cmdmap, "output_method", method);
+ cmd_getval(cmdctx->cmdmap, "filter", filter);
+ stringstream rs;
+ r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) {
+ // sanity check filter(s)
+ if (!filter.empty() &&
+ osdmap.lookup_pg_pool_name(filter) < 0 &&
+ !osdmap.crush->class_exists(filter) &&
+ !osdmap.crush->name_exists(filter)) {
+ rs << "'" << filter << "' not a pool, crush node or device class name";
+ return -EINVAL;
+ }
+ print_osd_utilization(osdmap, pgmap, ss,
+ f.get(), method == "tree", filter);
+ cmdctx->odata.append(ss);
+ return 0;
+ });
+ cmdctx->reply(r, rs);
+ return true;
+ } else if (prefix == "osd pool stats") {
+ string pool_name;
+ cmd_getval(cmdctx->cmdmap, "pool_name", pool_name);
+ int64_t poolid = -ENOENT;
+ bool one_pool = false;
+ r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
+ if (!pool_name.empty()) {
+ poolid = osdmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ ceph_assert(poolid == -ENOENT);
+ ss << "unrecognized pool '" << pool_name << "'";
+ return -ENOENT;
+ }
+ one_pool = true;
+ }
+ stringstream rs;
+ if (f)
+ f->open_array_section("pool_stats");
+ else {
+ if (osdmap.get_pools().empty()) {
+ ss << "there are no pools!";
+ goto stats_out;
+ }
+ }
+ for (auto &p : osdmap.get_pools()) {
+ if (!one_pool) {
+ poolid = p.first;
+ }
+ pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs);
+ if (one_pool) {
+ break;
+ }
+ }
+ stats_out:
+ if (f) {
+ f->close_section();
+ f->flush(cmdctx->odata);
+ } else {
+ cmdctx->odata.append(rs.str());
+ }
+ return 0;
+ });
+ if (r != -EOPNOTSUPP) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ } else if (prefix == "osd safe-to-destroy" ||
+ prefix == "osd destroy" ||
+ prefix == "osd purge") {
+ set<int> osds;
+ int r = 0;
+ if (prefix == "osd safe-to-destroy") {
+ vector<string> ids;
+ cmd_getval(cmdctx->cmdmap, "ids", ids);
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+ });
+ if (!r && osds.empty()) {
+ ss << "must specify one or more OSDs";
+ r = -EINVAL;
+ }
+ } else {
+ int64_t id;
+ if (!cmd_getval(cmdctx->cmdmap, "id", id)) {
+ r = -EINVAL;
+ ss << "must specify OSD id";
+ } else {
+ osds.insert(id);
+ }
+ }
+ if (r < 0) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ set<int> active_osds, missing_stats, stored_pgs, safe_to_destroy;
+ int affected_pgs = 0;
+ cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
+ if (pg_map.num_pg_unknown > 0) {
+ ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw"
+ << " any conclusions";
+ r = -EAGAIN;
+ return;
+ }
+ int num_active_clean = 0;
+ for (auto& p : pg_map.num_pg_by_state) {
+ unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN;
+ if ((p.first & want) == want) {
+ num_active_clean += p.second;
+ }
+ }
+ for (auto osd : osds) {
+ if (!osdmap.exists(osd)) {
+ safe_to_destroy.insert(osd);
+ continue; // clearly safe to destroy
+ }
+ auto q = pg_map.num_pg_by_osd.find(osd);
+ if (q != pg_map.num_pg_by_osd.end()) {
+ if (q->second.acting > 0 || q->second.up_not_acting > 0) {
+ active_osds.insert(osd);
+ // XXX: For overlapping PGs, this counts them again
+ affected_pgs += q->second.acting + q->second.up_not_acting;
+ continue;
+ }
+ }
+ if (num_active_clean < pg_map.num_pg) {
+ // all pgs aren't active+clean; we need to be careful.
+ auto p = pg_map.osd_stat.find(osd);
+ if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) {
+ missing_stats.insert(osd);
+ continue;
+ } else if (p->second.num_pgs > 0) {
+ stored_pgs.insert(osd);
+ continue;
+ }
+ }
+ safe_to_destroy.insert(osd);
+ }
+ });
+ if (r && prefix == "osd safe-to-destroy") {
+ cmdctx->reply(r, ss); // regardless of formatter
+ return true;
+ }
+ if (!r && (!active_osds.empty() ||
+ !missing_stats.empty() || !stored_pgs.empty())) {
+ if (!safe_to_destroy.empty()) {
+ ss << "OSD(s) " << safe_to_destroy
+ << " are safe to destroy without reducing data durability. ";
+ }
+ if (!active_osds.empty()) {
+ ss << "OSD(s) " << active_osds << " have " << affected_pgs
+ << " pgs currently mapped to them. ";
+ }
+ if (!missing_stats.empty()) {
+ ss << "OSD(s) " << missing_stats << " have no reported stats, and not all"
+ << " PGs are active+clean; we cannot draw any conclusions. ";
+ }
+ if (!stored_pgs.empty()) {
+ ss << "OSD(s) " << stored_pgs << " last reported they still store some PG"
+ << " data, and not all PGs are active+clean; we cannot be sure they"
+ << " aren't still needed.";
+ }
+ if (!active_osds.empty() || !stored_pgs.empty()) {
+ r = -EBUSY;
+ } else {
+ r = -EAGAIN;
+ }
+ }
+
+ if (prefix == "osd safe-to-destroy") {
+ if (!r) {
+ ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
+ << " durability.";
+ }
+ if (f) {
+ f->open_object_section("osd_status");
+ f->open_array_section("safe_to_destroy");
+ for (auto i : safe_to_destroy)
+ f->dump_int("osd", i);
+ f->close_section();
+ f->open_array_section("active");
+ for (auto i : active_osds)
+ f->dump_int("osd", i);
+ f->close_section();
+ f->open_array_section("missing_stats");
+ for (auto i : missing_stats)
+ f->dump_int("osd", i);
+ f->close_section();
+ f->open_array_section("stored_pgs");
+ for (auto i : stored_pgs)
+ f->dump_int("osd", i);
+ f->close_section();
+ f->close_section(); // osd_status
+ f->flush(cmdctx->odata);
+ r = 0;
+ std::stringstream().swap(ss);
+ }
+ cmdctx->reply(r, ss);
+ return true;
+ }
+
+ if (r) {
+ bool force = false;
+ cmd_getval(cmdctx->cmdmap, "force", force);
+ if (!force) {
+ // Backward compat
+ cmd_getval(cmdctx->cmdmap, "yes_i_really_mean_it", force);
+ }
+ if (!force) {
+ ss << "\nYou can proceed by passing --force, but be warned that"
+ " this will likely mean real, permanent data loss.";
+ } else {
+ r = 0;
+ }
+ }
+ if (r) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ const string cmd =
+ "{"
+ "\"prefix\": \"" + prefix + "-actual\", "
+ "\"id\": " + stringify(osds) + ", "
+ "\"yes_i_really_mean_it\": true"
+ "}";
+ auto on_finish = new ReplyOnFinish(cmdctx);
+ monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish);
+ return true;
+ } else if (prefix == "osd ok-to-stop") {
+ vector<string> ids;
+ cmd_getval(cmdctx->cmdmap, "ids", ids);
+ set<int> osds;
+ int64_t max = 1;
+ cmd_getval(cmdctx->cmdmap, "max", max);
+ int r;
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+ });
+ if (!r && osds.empty()) {
+ ss << "must specify one or more OSDs";
+ r = -EINVAL;
+ }
+ if (max < (int)osds.size()) {
+ max = osds.size();
+ }
+ if (r < 0) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ offline_pg_report out_report;
+ cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
+ _maximize_ok_to_stop_set(
+ osds, max, osdmap, pg_map,
+ &out_report);
+ });
+ if (!f) {
+ f.reset(Formatter::create("json"));
+ }
+ f->dump_object("ok_to_stop", out_report);
+ f->flush(cmdctx->odata);
+ cmdctx->odata.append("\n");
+ if (!out_report.unknown.empty()) {
+ ss << out_report.unknown.size() << " pgs have unknown state; "
+ << "cannot draw any conclusions";
+ cmdctx->reply(-EAGAIN, ss);
+ }
+ if (!out_report.ok_to_stop()) {
+ ss << "unsafe to stop osd(s) at this time (" << out_report.not_ok.size() << " PGs are or would become offline)";
+ cmdctx->reply(-EBUSY, ss);
+ } else {
+ cmdctx->reply(0, ss);
+ }
+ return true;
+ } else if (prefix == "pg force-recovery" ||
+ prefix == "pg force-backfill" ||
+ prefix == "pg cancel-force-recovery" ||
+ prefix == "pg cancel-force-backfill" ||
+ prefix == "osd pool force-recovery" ||
+ prefix == "osd pool force-backfill" ||
+ prefix == "osd pool cancel-force-recovery" ||
+ prefix == "osd pool cancel-force-backfill") {
+ vector<string> vs;
+ get_str_vec(prefix, vs);
+ auto& granularity = vs.front();
+ auto& forceop = vs.back();
+ vector<pg_t> pgs;
+
+ // figure out actual op just once
+ int actual_op = 0;
+ if (forceop == "force-recovery") {
+ actual_op = OFR_RECOVERY;
+ } else if (forceop == "force-backfill") {
+ actual_op = OFR_BACKFILL;
+ } else if (forceop == "cancel-force-backfill") {
+ actual_op = OFR_BACKFILL | OFR_CANCEL;
+ } else if (forceop == "cancel-force-recovery") {
+ actual_op = OFR_RECOVERY | OFR_CANCEL;
+ }
+
+ set<pg_t> candidates; // deduped
+ if (granularity == "pg") {
+ // convert pg names to pgs, discard any invalid ones while at it
+ vector<string> pgids;
+ cmd_getval(cmdctx->cmdmap, "pgid", pgids);
+ for (auto& i : pgids) {
+ pg_t pgid;
+ if (!pgid.parse(i.c_str())) {
+ ss << "invlaid pgid '" << i << "'; ";
+ r = -EINVAL;
+ continue;
+ }
+ candidates.insert(pgid);
+ }
+ } else {
+ // per pool
+ vector<string> pool_names;
+ cmd_getval(cmdctx->cmdmap, "who", pool_names);
+ if (pool_names.empty()) {
+ ss << "must specify one or more pool names";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ for (auto& pool_name : pool_names) {
+ auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ return;
+ }
+ auto pool_pg_num = osdmap.get_pg_num(pool_id);
+ for (int i = 0; i < pool_pg_num; i++)
+ candidates.insert({(unsigned int)i, (uint64_t)pool_id});
+ }
+ });
+ if (r < 0) {
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ }
+
+ cluster_state.with_pgmap([&](const PGMap& pg_map) {
+ for (auto& i : candidates) {
+ auto it = pg_map.pg_stat.find(i);
+ if (it == pg_map.pg_stat.end()) {
+ ss << "pg " << i << " does not exist; ";
+ r = -ENOENT;
+ continue;
+ }
+ auto state = it->second.state;
+ // discard pgs for which user requests are pointless
+ switch (actual_op) {
+ case OFR_RECOVERY:
+ if ((state & (PG_STATE_DEGRADED |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_RECOVERING)) == 0) {
+ // don't return error, user script may be racing with cluster.
+ // not fatal.
+ ss << "pg " << i << " doesn't require recovery; ";
+ continue;
+ } else if (state & PG_STATE_FORCED_RECOVERY) {
+ ss << "pg " << i << " recovery already forced; ";
+ // return error, as it may be a bug in user script
+ r = -EINVAL;
+ continue;
+ }
+ break;
+ case OFR_BACKFILL:
+ if ((state & (PG_STATE_DEGRADED |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILLING)) == 0) {
+ ss << "pg " << i << " doesn't require backfilling; ";
+ continue;
+ } else if (state & PG_STATE_FORCED_BACKFILL) {
+ ss << "pg " << i << " backfill already forced; ";
+ r = -EINVAL;
+ continue;
+ }
+ break;
+ case OFR_BACKFILL | OFR_CANCEL:
+ if ((state & PG_STATE_FORCED_BACKFILL) == 0) {
+ ss << "pg " << i << " backfill not forced; ";
+ continue;
+ }
+ break;
+ case OFR_RECOVERY | OFR_CANCEL:
+ if ((state & PG_STATE_FORCED_RECOVERY) == 0) {
+ ss << "pg " << i << " recovery not forced; ";
+ continue;
+ }
+ break;
+ default:
+ ceph_abort_msg("actual_op value is not supported");
+ }
+ pgs.push_back(i);
+ } // for
+ });
+
+ // respond with error only when no pgs are correct
+ // yes, in case of mixed errors, only the last one will be emitted,
+ // but the message presented will be fine
+ if (pgs.size() != 0) {
+ // clear error to not confuse users/scripts
+ r = 0;
+ }
+
+ // optimize the command -> messages conversion, use only one
+ // message per distinct OSD
+ cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+ // group pgs to process by osd
+ map<int, vector<spg_t>> osdpgs;
+ for (auto& pgid : pgs) {
+ int primary;
+ spg_t spg;
+ if (osdmap.get_primary_shard(pgid, &primary, &spg)) {
+ osdpgs[primary].push_back(spg);
+ }
+ }
+ for (auto& i : osdpgs) {
+ if (osdmap.is_up(i.first)) {
+ auto p = osd_cons.find(i.first);
+ if (p == osd_cons.end()) {
+ ss << "osd." << i.first << " is not currently connected";
+ r = -EAGAIN;
+ continue;
+ }
+ for (auto& con : p->second) {
+ con->send_message(
+ new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op));
+ }
+ ss << "instructing pg(s) " << i.second << " on osd." << i.first
+ << " to " << forceop << "; ";
+ }
+ }
+ });
+ ss << std::endl;
+ cmdctx->reply(r, ss);
+ return true;
+ } else if (prefix == "config show" ||
+ prefix == "config show-with-defaults") {
+ string who;
+ cmd_getval(cmdctx->cmdmap, "who", who);
+ auto [key, valid] = DaemonKey::parse(who);
+ if (!valid) {
+ ss << "invalid daemon name: use <type>.<id>";
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+ DaemonStatePtr daemon = daemon_state.get(key);
+ if (!daemon) {
+ ss << "no config state for daemon " << who;
+ cmdctx->reply(-ENOENT, ss);
+ return true;
+ }
+
+ std::lock_guard l(daemon->lock);
+
+ int r = 0;
+ string name;
+ if (cmd_getval(cmdctx->cmdmap, "key", name)) {
+ // handle special options
+ if (name == "fsid") {
+ cmdctx->odata.append(stringify(monc->get_fsid()) + "\n");
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ auto p = daemon->config.find(name);
+ if (p != daemon->config.end() &&
+ !p->second.empty()) {
+ cmdctx->odata.append(p->second.rbegin()->second + "\n");
+ } else {
+ auto& defaults = daemon->_get_config_defaults();
+ auto q = defaults.find(name);
+ if (q != defaults.end()) {
+ cmdctx->odata.append(q->second + "\n");
+ } else {
+ r = -ENOENT;
+ }
+ }
+ } else if (daemon->config_defaults_bl.length() > 0) {
+ TextTable tbl;
+ if (f) {
+ f->open_array_section("config");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("OVERRIDES", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT);
+ }
+ if (prefix == "config show") {
+ // show
+ for (auto& i : daemon->config) {
+ dout(20) << " " << i.first << " -> " << i.second << dendl;
+ if (i.second.empty()) {
+ continue;
+ }
+ if (f) {
+ f->open_object_section("value");
+ f->dump_string("name", i.first);
+ f->dump_string("value", i.second.rbegin()->second);
+ f->dump_string("source", ceph_conf_level_name(
+ i.second.rbegin()->first));
+ if (i.second.size() > 1) {
+ f->open_array_section("overrides");
+ auto j = i.second.rend();
+ for (--j; j != i.second.rbegin(); --j) {
+ f->open_object_section("value");
+ f->dump_string("source", ceph_conf_level_name(j->first));
+ f->dump_string("value", j->second);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ if (daemon->ignored_mon_config.count(i.first)) {
+ f->dump_string("ignores", "mon");
+ }
+ f->close_section();
+ } else {
+ tbl << i.first;
+ tbl << i.second.rbegin()->second;
+ tbl << ceph_conf_level_name(i.second.rbegin()->first);
+ if (i.second.size() > 1) {
+ list<string> ov;
+ auto j = i.second.rend();
+ for (--j; j != i.second.rbegin(); --j) {
+ if (j->second == i.second.rbegin()->second) {
+ ov.push_front(string("(") + ceph_conf_level_name(j->first) +
+ string("[") + j->second + string("]") +
+ string(")"));
+ } else {
+ ov.push_front(ceph_conf_level_name(j->first) +
+ string("[") + j->second + string("]"));
+
+ }
+ }
+ tbl << ov;
+ } else {
+ tbl << "";
+ }
+ tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : "");
+ tbl << TextTable::endrow;
+ }
+ }
+ } else {
+ // show-with-defaults
+ auto& defaults = daemon->_get_config_defaults();
+ for (auto& i : defaults) {
+ if (f) {
+ f->open_object_section("value");
+ f->dump_string("name", i.first);
+ } else {
+ tbl << i.first;
+ }
+ auto j = daemon->config.find(i.first);
+ if (j != daemon->config.end() && !j->second.empty()) {
+ // have config
+ if (f) {
+ f->dump_string("value", j->second.rbegin()->second);
+ f->dump_string("source", ceph_conf_level_name(
+ j->second.rbegin()->first));
+ if (j->second.size() > 1) {
+ f->open_array_section("overrides");
+ auto k = j->second.rend();
+ for (--k; k != j->second.rbegin(); --k) {
+ f->open_object_section("value");
+ f->dump_string("source", ceph_conf_level_name(k->first));
+ f->dump_string("value", k->second);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ if (daemon->ignored_mon_config.count(i.first)) {
+ f->dump_string("ignores", "mon");
+ }
+ f->close_section();
+ } else {
+ tbl << j->second.rbegin()->second;
+ tbl << ceph_conf_level_name(j->second.rbegin()->first);
+ if (j->second.size() > 1) {
+ list<string> ov;
+ auto k = j->second.rend();
+ for (--k; k != j->second.rbegin(); --k) {
+ if (k->second == j->second.rbegin()->second) {
+ ov.push_front(string("(") + ceph_conf_level_name(k->first) +
+ string("[") + k->second + string("]") +
+ string(")"));
+ } else {
+ ov.push_front(ceph_conf_level_name(k->first) +
+ string("[") + k->second + string("]"));
+ }
+ }
+ tbl << ov;
+ } else {
+ tbl << "";
+ }
+ tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : "");
+ tbl << TextTable::endrow;
+ }
+ } else {
+ // only have default
+ if (f) {
+ f->dump_string("value", i.second);
+ f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT));
+ f->close_section();
+ } else {
+ tbl << i.second;
+ tbl << ceph_conf_level_name(CONF_DEFAULT);
+ tbl << "";
+ tbl << "";
+ tbl << TextTable::endrow;
+ }
+ }
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(cmdctx->odata);
+ } else {
+ cmdctx->odata.append(stringify(tbl));
+ }
+ }
+ cmdctx->reply(r, ss);
+ return true;
+ } else if (prefix == "device ls") {
+ set<string> devids;
+ TextTable tbl;
+ if (f) {
+ f->open_array_section("devices");
+ daemon_state.with_devices([&f](const DeviceState& dev) {
+ f->dump_object("device", dev);
+ });
+ f->close_section();
+ f->flush(cmdctx->odata);
+ } else {
+ tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("WEAR", TextTable::RIGHT, TextTable::RIGHT);
+ tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT);
+ auto now = ceph_clock_now();
+ daemon_state.with_devices([&tbl, now](const DeviceState& dev) {
+ string h;
+ for (auto& i : dev.attachments) {
+ if (h.size()) {
+ h += " ";
+ }
+ h += std::get<0>(i) + ":" + std::get<1>(i);
+ }
+ string d;
+ for (auto& i : dev.daemons) {
+ if (d.size()) {
+ d += " ";
+ }
+ d += to_string(i);
+ }
+ char wear_level_str[16] = {0};
+ if (dev.wear_level >= 0) {
+ snprintf(wear_level_str, sizeof(wear_level_str)-1, "%d%%",
+ (int)(100.1 * dev.wear_level));
+ }
+ tbl << dev.devid
+ << h
+ << d
+ << wear_level_str
+ << dev.get_life_expectancy_str(now)
+ << TextTable::endrow;
+ });
+ cmdctx->odata.append(stringify(tbl));
+ }
+ cmdctx->reply(0, ss);
+ return true;
+ } else if (prefix == "device ls-by-daemon") {
+ string who;
+ cmd_getval(cmdctx->cmdmap, "who", who);
+ if (auto [k, valid] = DaemonKey::parse(who); !valid) {
+ ss << who << " is not a valid daemon name";
+ r = -EINVAL;
+ } else {
+ auto dm = daemon_state.get(k);
+ if (dm) {
+ if (f) {
+ f->open_array_section("devices");
+ for (auto& i : dm->devices) {
+ daemon_state.with_device(i.first, [&f] (const DeviceState& dev) {
+ f->dump_object("device", dev);
+ });
+ }
+ f->close_section();
+ f->flush(cmdctx->odata);
+ } else {
+ TextTable tbl;
+ tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("EXPECTED FAILURE", TextTable::LEFT,
+ TextTable::LEFT);
+ auto now = ceph_clock_now();
+ for (auto& i : dm->devices) {
+ daemon_state.with_device(
+ i.first, [&tbl, now] (const DeviceState& dev) {
+ string h;
+ for (auto& i : dev.attachments) {
+ if (h.size()) {
+ h += " ";
+ }
+ h += std::get<0>(i) + ":" + std::get<1>(i);
+ }
+ tbl << dev.devid
+ << h
+ << dev.get_life_expectancy_str(now)
+ << TextTable::endrow;
+ });
+ }
+ cmdctx->odata.append(stringify(tbl));
+ }
+ } else {
+ r = -ENOENT;
+ ss << "daemon " << who << " not found";
+ }
+ cmdctx->reply(r, ss);
+ }
+ } else if (prefix == "device ls-by-host") {
+ string host;
+ cmd_getval(cmdctx->cmdmap, "host", host);
+ set<string> devids;
+ daemon_state.list_devids_by_server(host, &devids);
+ if (f) {
+ f->open_array_section("devices");
+ for (auto& devid : devids) {
+ daemon_state.with_device(
+ devid, [&f] (const DeviceState& dev) {
+ f->dump_object("device", dev);
+ });
+ }
+ f->close_section();
+ f->flush(cmdctx->odata);
+ } else {
+ TextTable tbl;
+ tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT);
+ auto now = ceph_clock_now();
+ for (auto& devid : devids) {
+ daemon_state.with_device(
+ devid, [&tbl, &host, now] (const DeviceState& dev) {
+ string n;
+ for (auto& j : dev.attachments) {
+ if (std::get<0>(j) == host) {
+ if (n.size()) {
+ n += " ";
+ }
+ n += std::get<1>(j);
+ }
+ }
+ string d;
+ for (auto& i : dev.daemons) {
+ if (d.size()) {
+ d += " ";
+ }
+ d += to_string(i);
+ }
+ tbl << dev.devid
+ << n
+ << d
+ << dev.get_life_expectancy_str(now)
+ << TextTable::endrow;
+ });
+ }
+ cmdctx->odata.append(stringify(tbl));
+ }
+ cmdctx->reply(0, ss);
+ return true;
+ } else if (prefix == "device info") {
+ string devid;
+ cmd_getval(cmdctx->cmdmap, "devid", devid);
+ int r = 0;
+ ostringstream rs;
+ if (!daemon_state.with_device(devid,
+ [&f, &rs] (const DeviceState& dev) {
+ if (f) {
+ f->dump_object("device", dev);
+ } else {
+ dev.print(rs);
+ }
+ })) {
+ ss << "device " << devid << " not found";
+ r = -ENOENT;
+ } else {
+ if (f) {
+ f->flush(cmdctx->odata);
+ } else {
+ cmdctx->odata.append(rs.str());
+ }
+ }
+ cmdctx->reply(r, ss);
+ return true;
+ } else if (prefix == "device set-life-expectancy") {
+ string devid;
+ cmd_getval(cmdctx->cmdmap, "devid", devid);
+ string from_str, to_str;
+ cmd_getval(cmdctx->cmdmap, "from", from_str);
+ cmd_getval(cmdctx->cmdmap, "to", to_str);
+ utime_t from, to;
+ if (!from.parse(from_str)) {
+ ss << "unable to parse datetime '" << from_str << "'";
+ r = -EINVAL;
+ cmdctx->reply(r, ss);
+ } else if (to_str.size() && !to.parse(to_str)) {
+ ss << "unable to parse datetime '" << to_str << "'";
+ r = -EINVAL;
+ cmdctx->reply(r, ss);
+ } else {
+ map<string,string> meta;
+ daemon_state.with_device_create(
+ devid,
+ [from, to, &meta] (DeviceState& dev) {
+ dev.set_life_expectancy(from, to, ceph_clock_now());
+ meta = dev.metadata;
+ });
+ json_spirit::Object json_object;
+ for (auto& i : meta) {
+ json_spirit::Config::add(json_object, i.first, i.second);
+ }
+ bufferlist json;
+ json.append(json_spirit::write(json_object));
+ const string cmd =
+ "{"
+ "\"prefix\": \"config-key set\", "
+ "\"key\": \"device/" + devid + "\""
+ "}";
+ auto on_finish = new ReplyOnFinish(cmdctx);
+ monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish);
+ }
+ return true;
+ } else if (prefix == "device rm-life-expectancy") {
+ string devid;
+ cmd_getval(cmdctx->cmdmap, "devid", devid);
+ map<string,string> meta;
+ if (daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) {
+ dev.rm_life_expectancy();
+ meta = dev.metadata;
+ })) {
+ string cmd;
+ bufferlist json;
+ if (meta.empty()) {
+ cmd =
+ "{"
+ "\"prefix\": \"config-key rm\", "
+ "\"key\": \"device/" + devid + "\""
+ "}";
+ } else {
+ json_spirit::Object json_object;
+ for (auto& i : meta) {
+ json_spirit::Config::add(json_object, i.first, i.second);
+ }
+ json.append(json_spirit::write(json_object));
+ cmd =
+ "{"
+ "\"prefix\": \"config-key set\", "
+ "\"key\": \"device/" + devid + "\""
+ "}";
+ }
+ auto on_finish = new ReplyOnFinish(cmdctx);
+ monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish);
+ } else {
+ cmdctx->reply(0, ss);
+ }
+ return true;
+ } else {
+ if (!pgmap_ready) {
+ ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n";
+ }
+ if (f) {
+ f->open_object_section("pg_info");
+ f->dump_bool("pg_ready", pgmap_ready);
+ }
+
+ // fall back to feeding command to PGMap
+ r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
+ return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap,
+ f.get(), &ss, &cmdctx->odata);
+ });
+
+ if (f) {
+ f->close_section();
+ }
+ if (r != -EOPNOTSUPP) {
+ if (f) {
+ f->flush(cmdctx->odata);
+ }
+ cmdctx->reply(r, ss);
+ return true;
+ }
+ }
+
+ // Was the command unfound?
+ if (py_command.cmdstring.empty()) {
+ ss << "No handler found for '" << prefix << "'";
+ dout(4) << "No handler found for '" << prefix << "'" << dendl;
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ }
+
+ dout(10) << "passing through command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl;
+ finisher.queue(new LambdaContext([this, cmdctx, session, py_command, prefix]
+ (int r_) mutable {
+ std::stringstream ss;
+
+ dout(10) << "dispatching command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl;
+
+ // Validate that the module is enabled
+ auto& py_handler_name = py_command.module_name;
+ PyModuleRef module = py_modules.get_module(py_handler_name);
+ ceph_assert(module);
+ if (!module->is_enabled()) {
+ ss << "Module '" << py_handler_name << "' is not enabled (required by "
+ "command '" << prefix << "'): use `ceph mgr module enable "
+ << py_handler_name << "` to enable it";
+ dout(4) << ss.str() << dendl;
+ cmdctx->reply(-EOPNOTSUPP, ss);
+ return;
+ }
+
+ // Hack: allow the self-test method to run on unhealthy modules.
+ // Fix this in future by creating a special path for self test rather
+ // than having the hook be a normal module command.
+ std::string self_test_prefix = py_handler_name + " " + "self-test";
+
+ // Validate that the module is healthy
+ bool accept_command;
+ if (module->is_loaded()) {
+ if (module->get_can_run() && !module->is_failed()) {
+ // Healthy module
+ accept_command = true;
+ } else if (self_test_prefix == prefix) {
+ // Unhealthy, but allow because it's a self test command
+ accept_command = true;
+ } else {
+ accept_command = false;
+ ss << "Module '" << py_handler_name << "' has experienced an error and "
+ "cannot handle commands: " << module->get_error_string();
+ }
+ } else {
+ // Module not loaded
+ accept_command = false;
+ ss << "Module '" << py_handler_name << "' failed to load and "
+ "cannot handle commands: " << module->get_error_string();
+ }
+
+ if (!accept_command) {
+ dout(4) << ss.str() << dendl;
+ cmdctx->reply(-EIO, ss);
+ return;
+ }
+
+ std::stringstream ds;
+ bufferlist inbl = cmdctx->data;
+ int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap,
+ inbl, &ds, &ss);
+ if (r == -EACCES) {
+ log_access_denied(cmdctx, session, ss);
+ }
+
+ cmdctx->odata.append(ds);
+ cmdctx->reply(r, ss);
+ dout(10) << " command returned " << r << dendl;
+ }));
+ return true;
+}
+
+ // Remove entries from pending_service_map that are no longer backed by
+ // current daemon state.
+ //
+ // A daemon entry is erased when either
+ //  - daemon_state has no record for it and its service type is a normal Ceph
+ //    entity (ServiceMap::is_normal_ceph_entity), or
+ //  - its last_service_beacon is older than "now" minus the
+ //    mgr_service_beacon_grace option.
+ // A service whose daemon list becomes empty is erased too.  Every erase
+ // stores the current epoch in pending_service_map_dirty.
+ void DaemonServer::_prune_pending_service_map()
+ {
+   utime_t cutoff = ceph_clock_now();
+   cutoff -= g_conf().get_val<double>("mgr_service_beacon_grace");
+   auto p = pending_service_map.services.begin();
+   while (p != pending_service_map.services.end()) {
+     auto q = p->second.daemons.begin();
+     while (q != p->second.daemons.end()) {
+       DaemonKey key{p->first, q->first};
+       // Look the daemon up exactly once and test the result.  A separate
+       // exists() check followed by get() does two lookups, and if the record
+       // were removed in between, get() would yield a null pointer that was
+       // dereferenced below without a check.
+       auto daemon = daemon_state.get(key);
+       if (!daemon) {
+         if (ServiceMap::is_normal_ceph_entity(p->first)) {
+           dout(10) << "daemon " << key << " in service map but not in daemon state "
+                    << "index -- force pruning" << dendl;
+           // erase() returns the iterator to the next element
+           q = p->second.daemons.erase(q);
+           pending_service_map_dirty = pending_service_map.epoch;
+         } else {
+           // not a normal ceph entity: keep the entry, only log it
+           derr << "missing key " << key << dendl;
+           ++q;
+         }
+
+         continue;
+       }
+
+       std::lock_guard l(daemon->lock);
+       if (daemon->last_service_beacon == utime_t()) {
+         // we must have just restarted; assume they are alive now.
+         daemon->last_service_beacon = ceph_clock_now();
+         ++q;
+         continue;
+       }
+       if (daemon->last_service_beacon < cutoff) {
+         dout(10) << "pruning stale " << p->first << "." << q->first
+                  << " last_beacon " << daemon->last_service_beacon << dendl;
+         q = p->second.daemons.erase(q);
+         pending_service_map_dirty = pending_service_map.epoch;
+       } else {
+         ++q;
+       }
+     }
+     // drop the whole service once its last daemon is gone
+     if (p->second.daemons.empty()) {
+       p = pending_service_map.services.erase(p);
+       pending_service_map_dirty = pending_service_map.epoch;
+     } else {
+       ++p;
+     }
+   }
+ }
+
+ // Build an MMonMgrReport and hand it to the monitor client.
+ //
+ // While pgmap_ready is false nothing is sent, unless more than four times
+ // the mgr_stats_period option has elapsed since started_at; in that case
+ // pgmap_ready is forced to true and reported_osds is cleared.
+ //
+ // The report carries:
+ //  - health checks and progress events collected from py_modules,
+ //  - the encoded pending_service_map, when it is dirty for the current epoch
+ //    (the epoch is incremented after encoding),
+ //  - the PGMap digest and the PGMap health checks,
+ //  - health checks summarized from the daemon_health_metrics of the "osd"
+ //    and "mon" daemons.
+ void DaemonServer::send_report()
+ {
+   if (!pgmap_ready) {
+     if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) {
+       pgmap_ready = true;
+       reported_osds.clear();
+       dout(1) << "Giving up on OSDs that haven't reported yet, sending "
+               << "potentially incomplete PG state to mon" << dendl;
+     } else {
+       dout(1) << "Not sending PG status to monitor yet, waiting for OSDs"
+               << dendl;
+       return;
+     }
+   }
+
+   auto m = ceph::make_message<MMonMgrReport>();
+   m->gid = monc->get_global_id();
+   py_modules.get_health_checks(&m->health_checks);
+   py_modules.get_progress_events(&m->progress_events);
+
+   cluster_state.with_mutable_pgmap([&](PGMap& pg_map) {
+     cluster_state.update_delta_stats();
+
+     if (pending_service_map.epoch) {
+       _prune_pending_service_map();
+       // only ship the service map if something changed in this epoch
+       if (pending_service_map_dirty >= pending_service_map.epoch) {
+         pending_service_map.modified = ceph_clock_now();
+         encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL);
+         dout(10) << "sending service_map e" << pending_service_map.epoch
+                  << dendl;
+         pending_service_map.epoch++;
+       }
+     }
+
+     cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+       // FIXME: no easy way to get mon features here. this will do for
+       // now, though, as long as we don't make a backward-incompat change.
+       pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL);
+       dout(10) << pg_map << dendl;
+
+       pg_map.get_health_checks(g_ceph_context, osdmap,
+                                &m->health_checks);
+
+       dout(10) << m->health_checks.checks.size() << " health checks"
+                << dendl;
+       // The dout(20) entry below is intentionally not closed on its own
+       // line: the formatter output is streamed into *_dout and the entry
+       // is finished by the trailing dendl.
+       dout(20) << "health checks:\n";
+       JSONFormatter jf(true);
+       jf.dump_object("health_checks", m->health_checks);
+       jf.flush(*_dout);
+       *_dout << dendl;
+       if (osdmap.require_osd_release >= ceph_release_t::luminous) {
+         clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map;
+       }
+     });
+   });
+
+   // one collector per metric type, fed with the metrics of every daemon
+   map<daemon_metric, unique_ptr<DaemonHealthMetricCollector>> accumulated;
+   for (auto service : {"osd", "mon"} ) {
+     auto daemons = daemon_state.get_by_service(service);
+     for (const auto& [key,state] : daemons) {
+       std::lock_guard l{state->lock};
+       for (const auto& metric : state->daemon_health_metrics) {
+         auto acc = accumulated.find(metric.get_type());
+         if (acc == accumulated.end()) {
+           auto collector = DaemonHealthMetricCollector::create(metric.get_type());
+           if (!collector) {
+             // Widen to unsigned before printing: a uint8_t is inserted into
+             // a stream as a character, so std::hex would have no effect and
+             // a raw byte would end up in the log.
+             derr << __func__ << " " << key
+                  << " sent me an unknown health metric: "
+                  << std::hex << static_cast<unsigned>(metric.get_type())
+                  << std::dec << dendl;
+             continue;
+           }
+           dout(20) << " + " << state->key << " "
+                    << metric << dendl;
+           tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
+                                                       std::move(collector));
+         }
+         acc->second->update(key, metric);
+       }
+     }
+   }
+   for (const auto& acc : accumulated) {
+     acc.second->summarize(m->health_checks);
+   }
+   // TODO? We currently do not notify the PyModules
+   // TODO: respect needs_send, so we send the report only if we are asked to do
+   // so, or the state is updated.
+   monc->send_mon_message(std::move(m));
+ }
+
+void DaemonServer::adjust_pgs()
+{
+ dout(20) << dendl;
+ unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
+ double max_misplaced = g_conf().get_val<double>("target_max_misplaced_ratio");
+ bool aggro = g_conf().get_val<bool>("mgr_debug_aggressive_pg_num_changes");
+
+ map<string,unsigned> pg_num_to_set;
+ map<string,unsigned> pgp_num_to_set;
+ set<pg_t> upmaps_to_clear;
+ cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
+ unsigned creating_or_unknown = 0;
+ for (auto& i : pg_map.num_pg_by_state) {
+ if ((i.first & (PG_STATE_CREATING)) ||
+ i.first == 0) {
+ creating_or_unknown += i.second;
+ }
+ }
+ unsigned left = max;
+ if (creating_or_unknown >= max) {
+ return;
+ }
+ left -= creating_or_unknown;
+ dout(10) << "creating_or_unknown " << creating_or_unknown
+ << " max_creating " << max
+ << " left " << left
+ << dendl;
+
+ // FIXME: These checks are fundamentally racy given that adjust_pgs()
+ // can run more frequently than we get updated pg stats from OSDs. We
+ // may make multiple adjustments with stale information.
+ double misplaced_ratio, degraded_ratio;
+ double inactive_pgs_ratio, unknown_pgs_ratio;
+ pg_map.get_recovery_stats(&misplaced_ratio, &degraded_ratio,
+ &inactive_pgs_ratio, &unknown_pgs_ratio);
+ dout(20) << "misplaced_ratio " << misplaced_ratio
+ << " degraded_ratio " << degraded_ratio
+ << " inactive_pgs_ratio " << inactive_pgs_ratio
+ << " unknown_pgs_ratio " << unknown_pgs_ratio
+ << "; target_max_misplaced_ratio " << max_misplaced
+ << dendl;
+
+ for (auto& i : osdmap.get_pools()) {
+ const pg_pool_t& p = i.second;
+
+ // adjust pg_num?
+ if (p.get_pg_num_target() != p.get_pg_num()) {
+ dout(20) << "pool " << i.first
+ << " pg_num " << p.get_pg_num()
+ << " target " << p.get_pg_num_target()
+ << dendl;
+ if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " - still creating initial pgs"
+ << dendl;
+ } else if (p.get_pg_num_target() < p.get_pg_num()) {
+ // pg_num decrease (merge)
+ pg_t merge_source(p.get_pg_num() - 1, i.first);
+ pg_t merge_target = merge_source.get_parent();
+ bool ok = true;
+
+ if (p.get_pg_num() != p.get_pg_num_pending()) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " - decrease and pg_num_pending != pg_num, waiting"
+ << dendl;
+ ok = false;
+ } else if (p.get_pg_num() == p.get_pgp_num()) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " - decrease blocked by pgp_num "
+ << p.get_pgp_num()
+ << dendl;
+ ok = false;
+ }
+ vector<int32_t> source_acting;
+ for (auto &merge_participant : {merge_source, merge_target}) {
+ bool is_merge_source = merge_participant == merge_source;
+ if (osdmap.have_pg_upmaps(merge_participant)) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << (is_merge_source ? " - merge source " : " - merge target ")
+ << merge_participant
+ << " has upmap" << dendl;
+ upmaps_to_clear.insert(merge_participant);
+ ok = false;
+ }
+ auto q = pg_map.pg_stat.find(merge_participant);
+ if (q == pg_map.pg_stat.end()) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " - no state for " << merge_participant
+ << (is_merge_source ? " (merge source)" : " (merge target)")
+ << dendl;
+ ok = false;
+ } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) !=
+ (PG_STATE_ACTIVE | PG_STATE_CLEAN)) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << (is_merge_source ? " - merge source " : " - merge target ")
+ << merge_participant
+ << " not clean (" << pg_state_string(q->second.state)
+ << ")" << dendl;
+ ok = false;
+ }
+ if (is_merge_source) {
+ source_acting = q->second.acting;
+ } else if (ok && q->second.acting != source_acting) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << (is_merge_source ? " - merge source " : " - merge target ")
+ << merge_participant
+ << " acting does not match (source " << source_acting
+ << " != target " << q->second.acting
+ << ")" << dendl;
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ unsigned target = p.get_pg_num() - 1;
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " -> " << target
+ << " (merging " << merge_source
+ << " and " << merge_target
+ << ")" << dendl;
+ pg_num_to_set[osdmap.get_pool_name(i.first)] = target;
+ continue;
+ }
+ } else if (p.get_pg_num_target() > p.get_pg_num()) {
+ // pg_num increase (split)
+ bool active = true;
+ auto q = pg_map.num_pg_by_pool_state.find(i.first);
+ if (q != pg_map.num_pg_by_pool_state.end()) {
+ for (auto& j : q->second) {
+ if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) {
+ dout(20) << "pool " << i.first << " has " << j.second
+ << " pgs in " << pg_state_string(j.first)
+ << dendl;
+ active = false;
+ break;
+ }
+ }
+ } else {
+ active = false;
+ }
+ unsigned pg_gap = p.get_pg_num() - p.get_pgp_num();
+ unsigned max_jump = cct->_conf->mgr_max_pg_num_change;
+ if (!active) {
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " - not all pgs active"
+ << dendl;
+ } else if (pg_gap >= max_jump) {
+ dout(10) << "pool " << i.first
+ << " pg_num " << p.get_pg_num()
+ << " - pgp_num " << p.get_pgp_num()
+ << " gap >= max_pg_num_change " << max_jump
+ << " - must scale pgp_num first"
+ << dendl;
+ } else {
+ unsigned add = std::min(
+ std::min(left, max_jump - pg_gap),
+ p.get_pg_num_target() - p.get_pg_num());
+ unsigned target = p.get_pg_num() + add;
+ left -= add;
+ dout(10) << "pool " << i.first
+ << " pg_num_target " << p.get_pg_num_target()
+ << " pg_num " << p.get_pg_num()
+ << " -> " << target << dendl;
+ pg_num_to_set[osdmap.get_pool_name(i.first)] = target;
+ }
+ }
+ }
+
+ // adjust pgp_num?
+ unsigned target = std::min(p.get_pg_num_pending(),
+ p.get_pgp_num_target());
+ if (target != p.get_pgp_num()) {
+ dout(20) << "pool " << i.first
+ << " pgp_num_target " << p.get_pgp_num_target()
+ << " pgp_num " << p.get_pgp_num()
+ << " -> " << target << dendl;
+ if (target > p.get_pgp_num() &&
+ p.get_pgp_num() == p.get_pg_num()) {
+ dout(10) << "pool " << i.first
+ << " pgp_num_target " << p.get_pgp_num_target()
+ << " pgp_num " << p.get_pgp_num()
+ << " - increase blocked by pg_num " << p.get_pg_num()
+ << dendl;
+ } else if (!aggro && (inactive_pgs_ratio > 0 ||
+ degraded_ratio > 0 ||
+ unknown_pgs_ratio > 0)) {
+ dout(10) << "pool " << i.first
+ << " pgp_num_target " << p.get_pgp_num_target()
+ << " pgp_num " << p.get_pgp_num()
+ << " - inactive|degraded|unknown pgs, deferring pgp_num"
+ << " update" << dendl;
+ } else if (!aggro && (misplaced_ratio > max_misplaced)) {
+ dout(10) << "pool " << i.first
+ << " pgp_num_target " << p.get_pgp_num_target()
+ << " pgp_num " << p.get_pgp_num()
+ << " - misplaced_ratio " << misplaced_ratio
+ << " > max " << max_misplaced
+ << ", deferring pgp_num update" << dendl;
+ } else {
+ // NOTE: this calculation assumes objects are
+ // basically uniformly distributed across all PGs
+ // (regardless of pool), which is probably not
+ // perfectly correct, but it's a start. make no
+ // single adjustment that's more than half of the
+ // max_misplaced, to somewhat limit the magnitude of
+ // our potential error here.
+ int next;
+ static constexpr unsigned MAX_NUM_OBJECTS_PER_PG_FOR_LEAP = 1;
+ pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first);
+ if (aggro ||
+ // pool is (virtually) empty; just jump to final pgp_num?
+ (p.get_pgp_num_target() > p.get_pgp_num() &&
+ s.stats.sum.num_objects <= (MAX_NUM_OBJECTS_PER_PG_FOR_LEAP *
+ p.get_pgp_num_target()))) {
+ next = target;
+ } else {
+ double room =
+ std::min<double>(max_misplaced - misplaced_ratio,
+ max_misplaced / 2.0);
+ unsigned estmax = std::max<unsigned>(
+ (double)p.get_pg_num() * room, 1u);
+ unsigned next_min = 0;
+ if (p.get_pgp_num() > estmax) {
+ next_min = p.get_pgp_num() - estmax;
+ }
+ next = std::clamp(target,
+ next_min,
+ p.get_pgp_num() + estmax);
+ dout(20) << " room " << room << " estmax " << estmax
+ << " delta " << (target-p.get_pgp_num())
+ << " next " << next << dendl;
+ if (p.get_pgp_num_target() == p.get_pg_num_target() &&
+ p.get_pgp_num_target() < p.get_pg_num()) {
+ // since pgp_num is tracking pg_num, ceph is handling
+ // pgp_num. so, be responsible: don't let pgp_num get
+ // too far out ahead of merges (if we are merging).
+ // this avoids moving lots of unmerged pgs onto a
+ // small number of OSDs where we might blow out the
+ // per-osd pg max.
+ unsigned max_outpace_merges =
+ std::max<unsigned>(8, p.get_pg_num() * max_misplaced);
+ if (next + max_outpace_merges < p.get_pg_num()) {
+ next = p.get_pg_num() - max_outpace_merges;
+ dout(10) << " using next " << next
+ << " to avoid outpacing merges (max_outpace_merges "
+ << max_outpace_merges << ")" << dendl;
+ }
+ }
+ }
+ if (next != p.get_pgp_num()) {
+ dout(10) << "pool " << i.first
+ << " pgp_num_target " << p.get_pgp_num_target()
+ << " pgp_num " << p.get_pgp_num()
+ << " -> " << next << dendl;
+ pgp_num_to_set[osdmap.get_pool_name(i.first)] = next;
+ }
+ }
+ }
+ if (left == 0) {
+ return;
+ }
+ }
+ });
+ for (auto i : pg_num_to_set) {
+ const string cmd =
+ "{"
+ "\"prefix\": \"osd pool set\", "
+ "\"pool\": \"" + i.first + "\", "
+ "\"var\": \"pg_num_actual\", "
+ "\"val\": \"" + stringify(i.second) + "\""
+ "}";
+ monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
+ }
+ for (auto i : pgp_num_to_set) {
+ const string cmd =
+ "{"
+ "\"prefix\": \"osd pool set\", "
+ "\"pool\": \"" + i.first + "\", "
+ "\"var\": \"pgp_num_actual\", "
+ "\"val\": \"" + stringify(i.second) + "\""
+ "}";
+ monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
+ }
+ for (auto pg : upmaps_to_clear) {
+ const string cmd =
+ "{"
+ "\"prefix\": \"osd rm-pg-upmap\", "
+ "\"pgid\": \"" + stringify(pg) + "\""
+ "}";
+ monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr);
+ const string cmd2 =
+ "{"
+ "\"prefix\": \"osd rm-pg-upmap-items\", "
+ "\"pgid\": \"" + stringify(pg) + "\"" +
+ "}";
+ monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr);
+ }
+}
+
+/**
+ * React to a newly published ServiceMap.
+ *
+ * If our pending (uncommitted) map is not strictly newer than the
+ * published one, adopt the published map and stage the next epoch.
+ * Afterwards reconcile daemon_state with the daemons listed in
+ * pending_service_map for every service type that is not a normal
+ * ceph entity.
+ */
+void DaemonServer::got_service_map()
+{
+  std::lock_guard l(lock);
+
+  cluster_state.with_servicemap([&](const ServiceMap& published) {
+    const bool never_initialized = (pending_service_map.epoch == 0);
+    if (!never_initialized && pending_service_map.epoch > published.epoch) {
+      // we are already active and therefore must have persisted it,
+      // which means ours is the same or newer.
+      dout(10) << "got updated map e" << published.epoch << dendl;
+      return;
+    }
+
+    if (never_initialized) {
+      // we just started up
+      dout(10) << "got initial map e" << published.epoch << dendl;
+    } else {
+      // we just started up, but received one more map that is not our own
+      dout(10) << "got newer initial map e" << published.epoch << dendl;
+    }
+    // both start-up cases adopt the published map and stage the next epoch
+    ceph_assert(pending_service_map_dirty == 0);
+    pending_service_map = published;
+    pending_service_map.epoch = published.epoch + 1;
+  });
+
+  // cull missing daemons, populate new ones
+  std::set<std::string> service_types;
+  for (auto& [type, service] : pending_service_map.services) {
+    if (ServiceMap::is_normal_ceph_entity(type)) {
+      continue;
+    }
+    service_types.insert(type);
+
+    std::set<std::string> daemon_names;
+    for (auto& entry : service.daemons) {
+      daemon_names.insert(entry.first);
+      DaemonKey key{type, entry.first};
+      if (daemon_state.exists(key)) {
+        continue;
+      }
+      // not tracked yet: create a placeholder state from the map's metadata
+      auto daemon = std::make_shared<DaemonState>(daemon_state.types);
+      daemon->key = key;
+      daemon->set_metadata(entry.second.metadata);
+      daemon->service_daemon = true;
+      daemon_state.insert(daemon);
+      dout(10) << "added missing " << key << dendl;
+    }
+    daemon_state.cull(type, daemon_names);
+  }
+  daemon_state.cull_services(service_types);
+}
+
+/**
+ * React to a new MgrMap.
+ *
+ * For the active mgr and every standby, request metadata from the mon
+ * when the daemon is neither known to daemon_state nor already being
+ * updated; then cull "mgr" entries that are no longer in the map.
+ */
+void DaemonServer::got_mgr_map()
+{
+  std::lock_guard l(lock);
+  std::set<std::string> have;
+  cluster_state.with_mgrmap([&](const MgrMap& mgrmap) {
+    // send a "mgr metadata" mon command; the MetadataUpdate context is
+    // handed to start_mon_command together with its output buffers
+    auto md_update = [&] (DaemonKey key) {
+      std::ostringstream cmd;
+      auto c = new MetadataUpdate(daemon_state, key);
+      // FIXME remove post-nautilus: include 'id' for luminous mons
+      cmd << "{\"prefix\": \"mgr metadata\", \"who\": \""
+          << key.name << "\", \"id\": \"" << key.name << "\"}";
+      monc->start_mon_command({cmd.str()}, {}, &c->outbl, &c->outs, c);
+    };
+    // remember the mgr by name and fetch its metadata if it is unknown
+    auto note_mgr = [&] (const std::string& name) {
+      DaemonKey key{"mgr", name};
+      have.insert(name);
+      if (daemon_state.exists(key) || daemon_state.is_updating(key)) {
+        return;
+      }
+      md_update(key);
+      dout(10) << "triggered addition of " << key << " via metadata update" << dendl;
+    };
+
+    if (!mgrmap.active_name.empty()) {
+      note_mgr(mgrmap.active_name);
+    }
+    for (auto& standby : mgrmap.standbys) {
+      note_mgr(standby.second.name);
+    }
+  });
+  daemon_state.cull("mgr", have);
+}
+
+/**
+ * @return nullptr-terminated array of the config keys that
+ *         handle_conf_change() reacts to.
+ */
+const char** DaemonServer::get_tracked_conf_keys() const
+{
+  // static storage: the returned pointer stays valid after the call
+  static const char *tracked_keys[] = {
+    "mgr_stats_threshold",
+    "mgr_stats_period",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * Config observer callback.
+ *
+ * When either stats key changed, queue a finisher task that takes the
+ * lock and re-sends the configuration to every daemon connection.
+ *
+ * @param conf     unused here
+ * @param changed  names of the config keys that changed
+ */
+void DaemonServer::handle_conf_change(const ConfigProxy& conf,
+                                      const std::set <std::string> &changed)
+{
+  const bool stats_policy_changed =
+    changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period");
+  if (!stats_policy_changed) {
+    return;
+  }
+
+  dout(4) << "Updating stats threshold/period on "
+          << daemon_connections.size() << " clients" << dendl;
+  // Send a fresh MMgrConfigure to all clients, so that they can follow
+  // the new policy for transmitting stats
+  finisher.queue(new LambdaContext([this](int r) {
+    std::lock_guard l(lock);
+    for (auto &con : daemon_connections) {
+      _send_configure(con);
+    }
+  }));
+}
+
+/**
+ * Build an MMgrConfigure carrying the current stats period/threshold and
+ * send it on connection c.  OSD peers also get the OSD perf metric
+ * queries; MDS peers get the MDS queries wrapped in a MetricConfigMessage.
+ *
+ * The caller must hold lock (asserted).
+ */
+void DaemonServer::_send_configure(ConnectionRef c)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+  auto msg = make_message<MMgrConfigure>();
+  msg->stats_period = g_conf().get_val<int64_t>("mgr_stats_period");
+  msg->stats_threshold = g_conf().get_val<int64_t>("mgr_stats_threshold");
+
+  // attach the perf queries that match the kind of peer
+  if (c->peer_is_osd()) {
+    msg->osd_perf_metric_queries = osd_perf_metric_collector.get_queries();
+  } else if (c->peer_is_mds()) {
+    msg->metric_config_message = MetricConfigMessage(
+      MDSConfigPayload(mds_perf_metric_collector.get_queries()));
+  }
+
+  c->send_message2(msg);
+}
+
+/**
+ * Forward an OSD perf metric query to the OSD perf metric collector.
+ *
+ * @param query  query to register
+ * @param limit  optional limit, passed through unchanged
+ * @return the id returned by the collector
+ */
+MetricQueryID DaemonServer::add_osd_perf_query(
+  const OSDPerfMetricQuery &query,
+  const std::optional<OSDPerfMetricLimit> &limit)
+{
+  const MetricQueryID query_id =
+    osd_perf_metric_collector.add_query(query, limit);
+  return query_id;
+}
+
+/**
+ * Forward removal of an OSD perf metric query to the collector.
+ *
+ * @param query_id  id of the query to remove
+ * @return the collector's result code
+ */
+int DaemonServer::remove_osd_perf_query(MetricQueryID query_id)
+{
+  const int rc = osd_perf_metric_collector.remove_query(query_id);
+  return rc;
+}
+
+/**
+ * Ask the OSD perf metric collector to fill in the caller's collector.
+ *
+ * @param collector  passed through to get_counters()
+ * @return the collector's result code
+ */
+int DaemonServer::get_osd_perf_counters(OSDPerfCollector *collector)
+{
+  const int rc = osd_perf_metric_collector.get_counters(collector);
+  return rc;
+}
+
+/**
+ * Forward an MDS perf metric query to the MDS perf metric collector.
+ *
+ * @param query  query to register
+ * @param limit  optional limit, passed through unchanged
+ * @return the id returned by the collector
+ */
+MetricQueryID DaemonServer::add_mds_perf_query(
+  const MDSPerfMetricQuery &query,
+  const std::optional<MDSPerfMetricLimit> &limit)
+{
+  const MetricQueryID query_id =
+    mds_perf_metric_collector.add_query(query, limit);
+  return query_id;
+}
+
+/**
+ * Forward removal of an MDS perf metric query to the collector.
+ *
+ * @param query_id  id of the query to remove
+ * @return the collector's result code
+ */
+int DaemonServer::remove_mds_perf_query(MetricQueryID query_id)
+{
+  const int rc = mds_perf_metric_collector.remove_query(query_id);
+  return rc;
+}
+
+/**
+ * Ask the MDS perf metric collector to re-register its queries.
+ */
+void DaemonServer::reregister_mds_perf_queries()
+{
+  auto &mds_collector = mds_perf_metric_collector;
+  mds_collector.reregister_queries();
+}
+
+/**
+ * Ask the MDS perf metric collector to fill in the caller's collector.
+ *
+ * @param collector  passed through to get_counters()
+ * @return the collector's result code
+ */
+int DaemonServer::get_mds_perf_counters(MDSPerfCollector *collector)
+{
+  const int rc = mds_perf_metric_collector.get_counters(collector);
+  return rc;
+}
diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h
new file mode 100644
index 000000000..a4cf990bd
--- /dev/null
+++ b/src/mgr/DaemonServer.h
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef DAEMON_SERVER_H_
+#define DAEMON_SERVER_H_
+
+#include "PyModuleRegistry.h"
+
+#include <set>
+#include <string>
+#include <boost/variant.hpp>
+
+#include "common/ceph_mutex.h"
+#include "common/LogClient.h"
+#include "common/Timer.h"
+
+#include <msg/Messenger.h>
+#include <mon/MonClient.h>
+
+#include "ServiceMap.h"
+#include "MgrSession.h"
+#include "DaemonState.h"
+#include "MetricCollector.h"
+#include "OSDPerfMetricCollector.h"
+#include "MDSPerfMetricCollector.h"
+
+class MMgrReport;
+class MMgrOpen;
+class MMgrUpdate;
+class MMgrClose;
+class MMonMgrReport;
+class MCommand;
+class MMgrCommand;
+struct MonCommand;
+class CommandContext;
+struct OSDPerfMetricQuery;
+struct MDSPerfMetricQuery;
+
+
+/**
+ * Result of checking which PGs would be affected by taking a set of
+ * OSDs offline.  PGs are sorted into ok / not_ok / unknown, with finer
+ * grained sub-sets kept alongside for reporting.
+ */
+struct offline_pg_report {
+  set<int> osds;
+  set<pg_t> ok, not_ok, unknown;
+  set<pg_t> ok_become_degraded, ok_become_more_degraded; // ok
+  set<pg_t> bad_no_pool, bad_already_inactive, bad_become_inactive; // not ok
+
+  /// true only when no PG is classified as not_ok or unknown
+  bool ok_to_stop() const {
+    if (!not_ok.empty()) {
+      return false;
+    }
+    return unknown.empty();
+  }
+
+  /// Emit the report through f; empty PG sets produce no section at all.
+  void dump(Formatter *f) const {
+    // one array section of "pg" entries, skipped when the set is empty
+    auto dump_pgs_if_any = [f](const char *section, const set<pg_t>& pgs) {
+      if (pgs.empty()) {
+        return;
+      }
+      f->open_array_section(section);
+      for (auto pg : pgs) {
+        f->dump_stream("pg") << pg;
+      }
+      f->close_section();
+    };
+
+    f->dump_bool("ok_to_stop", ok_to_stop());
+    f->open_array_section("osds");
+    for (auto osd : osds) {
+      f->dump_int("osd", osd);
+    }
+    f->close_section();
+    f->dump_unsigned("num_ok_pgs", ok.size());
+    f->dump_unsigned("num_not_ok_pgs", not_ok.size());
+
+    // ambiguous
+    dump_pgs_if_any("unknown_pgs", unknown);
+
+    // bad news
+    dump_pgs_if_any("bad_no_pool_pgs", bad_no_pool);
+    dump_pgs_if_any("bad_already_inactive", bad_already_inactive);
+    dump_pgs_if_any("bad_become_inactive", bad_become_inactive);
+
+    // informative
+    dump_pgs_if_any("ok_become_degraded", ok_become_degraded);
+    dump_pgs_if_any("ok_become_more_degraded", ok_become_more_degraded);
+  }
+};
+
+
+/**
+ * Server used in ceph-mgr to communicate with Ceph daemons like
+ * MDSs and OSDs.
+ *
+ * As a Dispatcher it declares the messenger hooks (ms_dispatch2 and the
+ * ms_handle_* callbacks); as an md_config_obs_t it declares
+ * get_tracked_conf_keys() and handle_conf_change().  It also holds the
+ * uncommitted ServiceMap and the OSD/MDS perf metric collectors.
+ */
+class DaemonServer : public Dispatcher, public md_config_obs_t
+{
+protected:
+ boost::scoped_ptr<Throttle> client_byte_throttler;
+ boost::scoped_ptr<Throttle> client_msg_throttler;
+ boost::scoped_ptr<Throttle> osd_byte_throttler;
+ boost::scoped_ptr<Throttle> osd_msg_throttler;
+ boost::scoped_ptr<Throttle> mds_byte_throttler;
+ boost::scoped_ptr<Throttle> mds_msg_throttler;
+ boost::scoped_ptr<Throttle> mon_byte_throttler;
+ boost::scoped_ptr<Throttle> mon_msg_throttler;
+
+ Messenger *msgr;
+ MonClient *monc;
+ Finisher &finisher;
+ DaemonStateIndex &daemon_state;
+ ClusterState &cluster_state;
+ PyModuleRegistry &py_modules;
+ LogChannelRef clog, audit_clog;
+
+ // Connections for daemons, and clients with service names set
+ // (i.e. those MgrClients that are allowed to send MMgrReports)
+ std::set<ConnectionRef> daemon_connections;
+
+ /// connections for osds
+ ceph::unordered_map<int,set<ConnectionRef>> osd_cons;
+
+ ServiceMap pending_service_map; // uncommitted
+
+ epoch_t pending_service_map_dirty = 0;
+
+ ceph::mutex lock = ceph::make_mutex("DaemonServer"); // taken by got_service_map()/got_mgr_map(); _send_configure() asserts it is held
+
+ static void _generate_command_map(cmdmap_t& cmdmap,
+ map<string,string> &param_str_map);
+ static const MonCommand *_get_mgrcommand(const string &cmd_prefix,
+ const std::vector<MonCommand> &commands);
+ bool _allowed_command(
+ MgrSession *s, const string &service, const string &module,
+ const string &prefix, const cmdmap_t& cmdmap,
+ const map<string,string>& param_str_map,
+ const MonCommand *this_cmd);
+
+private:
+ friend class ReplyOnFinish;
+ bool _reply(MCommand* m,
+ int ret, const std::string& s, const bufferlist& payload);
+
+ void _prune_pending_service_map();
+
+ void _check_offlines_pgs(
+ const set<int>& osds,
+ const OSDMap& osdmap,
+ const PGMap& pgmap,
+ offline_pg_report *report);
+ void _maximize_ok_to_stop_set(
+ const set<int>& orig_osds,
+ unsigned max,
+ const OSDMap& osdmap,
+ const PGMap& pgmap,
+ offline_pg_report *report);
+
+ utime_t started_at;
+ std::atomic<bool> pgmap_ready;
+ std::set<int32_t> reported_osds;
+ void maybe_ready(int32_t osd_id);
+
+ SafeTimer timer;
+ bool shutting_down;
+ Context *tick_event;
+ void tick();
+ void schedule_tick_locked(double delay_sec);
+
+ class OSDPerfMetricCollectorListener : public MetricListener { // relays query updates to the owning DaemonServer
+ public:
+ OSDPerfMetricCollectorListener(DaemonServer *server)
+ : server(server) {
+ }
+ void handle_query_updated() override {
+ server->handle_osd_perf_metric_query_updated();
+ }
+ private:
+ DaemonServer *server;
+ };
+ OSDPerfMetricCollectorListener osd_perf_metric_collector_listener;
+ OSDPerfMetricCollector osd_perf_metric_collector;
+ void handle_osd_perf_metric_query_updated();
+
+ class MDSPerfMetricCollectorListener : public MetricListener { // relays query updates to the owning DaemonServer
+ public:
+ MDSPerfMetricCollectorListener(DaemonServer *server)
+ : server(server) {
+ }
+ void handle_query_updated() override {
+ server->handle_mds_perf_metric_query_updated();
+ }
+ private:
+ DaemonServer *server;
+ };
+ MDSPerfMetricCollectorListener mds_perf_metric_collector_listener;
+ MDSPerfMetricCollector mds_perf_metric_collector;
+ void handle_mds_perf_metric_query_updated();
+
+ void handle_metric_payload(const OSDMetricPayload &payload) { // OSD reports go to the OSD collector
+ osd_perf_metric_collector.process_reports(payload);
+ }
+
+ void handle_metric_payload(const MDSMetricPayload &payload) { // MDS reports go to the MDS collector
+ mds_perf_metric_collector.process_reports(payload);
+ }
+
+ void handle_metric_payload(const UnknownMetricPayload &payload) {
+ ceph_abort(); // an unknown payload type is treated as fatal
+ }
+
+ struct HandlePayloadVisitor : public boost::static_visitor<void> { // dispatches each payload type to the matching handle_metric_payload() overload
+ DaemonServer *server;
+
+ HandlePayloadVisitor(DaemonServer *server)
+ : server(server) {
+ }
+
+ template <typename MetricPayload>
+ inline void operator()(const MetricPayload &payload) const {
+ server->handle_metric_payload(payload);
+ }
+ };
+
+ void update_task_status(DaemonKey key,
+ const std::map<std::string,std::string>& task_status);
+
+public:
+ int init(uint64_t gid, entity_addrvec_t client_addrs);
+ void shutdown();
+
+ entity_addrvec_t get_myaddrs() const;
+
+ DaemonServer(MonClient *monc_,
+ Finisher &finisher_,
+ DaemonStateIndex &daemon_state_,
+ ClusterState &cluster_state_,
+ PyModuleRegistry &py_modules_,
+ LogChannelRef cl,
+ LogChannelRef auditcl);
+ ~DaemonServer() override;
+
+ bool ms_dispatch2(const ceph::ref_t<Message>& m) override;
+ int ms_handle_authentication(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override {} // no-op
+ bool ms_handle_refused(Connection *con) override;
+
+ void fetch_missing_metadata(const DaemonKey& key, const entity_addr_t& addr);
+ bool handle_open(const ceph::ref_t<MMgrOpen>& m);
+ bool handle_update(const ceph::ref_t<MMgrUpdate>& m);
+ bool handle_close(const ceph::ref_t<MMgrClose>& m);
+ bool handle_report(const ceph::ref_t<MMgrReport>& m);
+ bool handle_command(const ceph::ref_t<MCommand>& m);
+ bool handle_command(const ceph::ref_t<MMgrCommand>& m);
+ bool _handle_command(std::shared_ptr<CommandContext>& cmdctx);
+ void send_report();
+ void got_service_map(); // takes lock; syncs pending_service_map and daemon_state with the published ServiceMap
+ void got_mgr_map(); // takes lock; requests metadata for unknown mgrs and culls stale "mgr" entries
+ void adjust_pgs();
+
+ void _send_configure(ConnectionRef c); // asserts that lock is held by the caller
+
+ MetricQueryID add_osd_perf_query(
+ const OSDPerfMetricQuery &query,
+ const std::optional<OSDPerfMetricLimit> &limit);
+ int remove_osd_perf_query(MetricQueryID query_id);
+ int get_osd_perf_counters(OSDPerfCollector *collector);
+
+ MetricQueryID add_mds_perf_query(const MDSPerfMetricQuery &query,
+ const std::optional<MDSPerfMetricLimit> &limit);
+ int remove_mds_perf_query(MetricQueryID query_id);
+ void reregister_mds_perf_queries();
+ int get_mds_perf_counters(MDSPerfCollector *collector);
+
+ virtual const char** get_tracked_conf_keys() const override; // returns "mgr_stats_threshold" and "mgr_stats_period"
+ virtual void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+
+ void schedule_tick(double delay_sec);
+
+ void log_access_denied(std::shared_ptr<CommandContext>& cmdctx,
+ MgrSession* session, std::stringstream& ss);
+ void dump_pg_ready(ceph::Formatter *f);
+};
+
+#endif
+
diff --git a/src/mgr/DaemonState.cc b/src/mgr/DaemonState.cc
new file mode 100644
index 000000000..32cbbe3b9
--- /dev/null
+++ b/src/mgr/DaemonState.cc
@@ -0,0 +1,381 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "DaemonState.h"
+
+#include <experimental/iterator>
+
+#include "MgrSession.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+/**
+ * Replace the device metadata and refresh the cached fields that are
+ * derived from it.  A cached field is only touched when its key is
+ * present in the new metadata.
+ *
+ * @param m  new metadata map; moved from
+ */
+void DeviceState::set_metadata(map<string,string>&& m)
+{
+  metadata = std::move(m);
+
+  if (auto it = metadata.find("life_expectancy_min"); it != metadata.end()) {
+    life_expectancy.first.parse(it->second);
+  }
+  if (auto it = metadata.find("life_expectancy_max"); it != metadata.end()) {
+    life_expectancy.second.parse(it->second);
+  }
+  if (auto it = metadata.find("life_expectancy_stamp"); it != metadata.end()) {
+    life_expectancy_stamp.parse(it->second);
+  }
+  if (auto it = metadata.find("wear_level"); it != metadata.end()) {
+    wear_level = atof(it->second.c_str());
+  }
+}
+
+/**
+ * Record a life expectancy interval and mirror it into the metadata map.
+ * A default-constructed utime_t is stored as an empty string.
+ *
+ * @param from  lower bound of the interval
+ * @param to    upper bound of the interval
+ * @param now   stamp recorded with the prediction
+ */
+void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now)
+{
+  // unset times are written as "", everything else via stringify()
+  auto as_text = [](const utime_t& t) {
+    return t != utime_t() ? stringify(t) : std::string();
+  };
+
+  life_expectancy = std::make_pair(from, to);
+  life_expectancy_stamp = now;
+  metadata["life_expectancy_min"] = as_text(from);
+  metadata["life_expectancy_max"] = as_text(to);
+  metadata["life_expectancy_stamp"] = as_text(now);
+}
+
+/**
+ * Forget the life expectancy: reset the cached interval and stamp and
+ * drop the three life_expectancy_* metadata keys.
+ */
+void DeviceState::rm_life_expectancy()
+{
+  life_expectancy = std::make_pair(utime_t(), utime_t());
+  life_expectancy_stamp = utime_t();
+  for (const char *key : {"life_expectancy_min",
+                          "life_expectancy_max",
+                          "life_expectancy_stamp"}) {
+    metadata.erase(key);
+  }
+}
+
+/**
+ * Store the wear level.  Values that are not >= 0 remove the
+ * "wear_level" metadata key instead of setting it.
+ *
+ * @param wear  new wear level
+ */
+void DeviceState::set_wear_level(float wear)
+{
+  wear_level = wear;
+  // written as !(>= 0) so that NaN also takes the erase path
+  if (!(wear >= 0)) {
+    metadata.erase("wear_level");
+    return;
+  }
+  metadata["wear_level"] = stringify(wear);
+}
+
+/**
+ * Describe the remaining life expectancy relative to now.
+ *
+ * @param now  reference time
+ * @return ""          when no lower bound is set,
+ *         "now"       when the lower bound has been reached,
+ *         ">SPAN"     when only the lower bound is set,
+ *         "SPAN"      when both bounds format to the same text,
+ *         "A to B"    otherwise.
+ */
+string DeviceState::get_life_expectancy_str(utime_t now) const
+{
+  const utime_t& earliest = life_expectancy.first;
+  const utime_t& latest = life_expectancy.second;
+
+  if (earliest == utime_t()) {
+    return string();
+  }
+  if (now >= earliest) {
+    return "now";
+  }
+
+  string lower = timespan_str(make_timespan(earliest - now));
+  if (latest == utime_t()) {
+    return string(">") + lower;
+  }
+
+  string upper = timespan_str(make_timespan(latest - now));
+  if (lower == upper) {
+    return lower;
+  }
+  return lower + " to " + upper;
+}
+
+/**
+ * Dump this device through f: id, attachments ("location"), daemons,
+ * and - only when set - life expectancy and wear level.
+ */
+void DeviceState::dump(Formatter *f) const
+{
+  f->dump_string("devid", devid);
+
+  f->open_array_section("location");
+  for (const auto& [host, dev, path] : attachments) {
+    f->open_object_section("attachment");
+    f->dump_string("host", host);
+    f->dump_string("dev", dev);
+    f->dump_string("path", path);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("daemons");
+  for (const auto& daemon : daemons) {
+    f->dump_stream("daemon") << daemon;
+  }
+  f->close_section();
+
+  // life expectancy is reported only when a lower bound is set
+  const bool has_life_expectancy = (life_expectancy.first != utime_t());
+  if (has_life_expectancy) {
+    f->dump_stream("life_expectancy_min") << life_expectancy.first;
+    f->dump_stream("life_expectancy_max") << life_expectancy.second;
+    f->dump_stream("life_expectancy_stamp") << life_expectancy_stamp;
+  }
+  if (wear_level >= 0) {
+    f->dump_float("wear_level", wear_level);
+  }
+}
+
+/**
+ * Write a plain-text description of this device to out: id, one
+ * paragraph per attachment, the comma-separated daemon list, and - only
+ * when set - life expectancy and wear level.
+ */
+void DeviceState::print(ostream& out) const
+{
+  out << "device " << devid << "\n";
+  for (const auto& [host, dev, path] : attachments) {
+    out << "attachment " << host << " " << dev << " " << path << "\n";
+    out << "\n";
+  }
+
+  // comma-joined daemon keys, no trailing separator
+  bool first_daemon = true;
+  for (const auto& daemon : daemons) {
+    if (!first_daemon) {
+      out << ",";
+    }
+    out << daemon;
+    first_daemon = false;
+  }
+  out << '\n';
+
+  if (life_expectancy.first != utime_t()) {
+    out << "life_expectancy " << life_expectancy.first << " to "
+        << life_expectancy.second
+        << " (as of " << life_expectancy_stamp << ")\n";
+  }
+  if (wear_level >= 0) {
+    out << "wear_level " << wear_level << "\n";
+  }
+}
+
+/**
+ * Locked entry point for _insert(): takes the index lock exclusively.
+ */
+void DaemonStateIndex::insert(DaemonStatePtr dm)
+{
+  std::unique_lock write_guard{lock};
+  _insert(std::move(dm));
+}
+
+/**
+ * Add dm to the index, replacing any existing entry with the same key,
+ * and register it with each of its devices.  Takes no lock itself.
+ */
+void DaemonStateIndex::_insert(DaemonStatePtr dm)
+{
+  if (all.find(dm->key) != all.end()) {
+    _erase(dm->key);
+  }
+
+  by_server[dm->hostname][dm->key] = dm;
+  all[dm->key] = dm;
+
+  for (const auto& [devid, devname] : dm->devices) {
+    auto device = _get_or_create_device(devid);
+    device->daemons.insert(dm->key);
+    // the by-path name is optional; an empty string stands in when absent
+    const auto bypath = dm->devices_bypath.find(devid);
+    const std::string path =
+      (bypath != dm->devices_bypath.end()) ? bypath->second : std::string();
+    device->attachments.insert(std::make_tuple(dm->hostname, devname, path));
+  }
+}
+
+/**
+ * Remove the daemon dmk from the index and from each of its devices,
+ * dropping devices and per-server collections that become empty.
+ *
+ * Requires the index lock to be held exclusively and dmk to be present
+ * (both asserted).
+ */
+void DaemonStateIndex::_erase(const DaemonKey& dmk)
+{
+  ceph_assert(ceph_mutex_is_wlocked(lock));
+
+  const auto entry = all.find(dmk);
+  ceph_assert(entry != all.end());
+  // copy the pointer: dm is still needed after the map entries are gone
+  const auto dm = entry->second;
+
+  for (const auto& [devid, devname] : dm->devices) {
+    auto device = _get_or_create_device(devid);
+    ceph_assert(device->daemons.count(dmk));
+    device->daemons.erase(dmk);
+    const auto bypath = dm->devices_bypath.find(devid);
+    const std::string path =
+      (bypath != dm->devices_bypath.end()) ? bypath->second : std::string();
+    device->attachments.erase(std::make_tuple(dm->hostname, devname, path));
+    if (device->empty()) {
+      _erase_device(device);
+    }
+  }
+
+  auto &on_host = by_server[dm->hostname];
+  on_host.erase(dm->key);
+  if (on_host.empty()) {
+    by_server.erase(dm->hostname);
+  }
+
+  all.erase(entry);
+}
+
+/**
+ * Collect every daemon whose key has type svc.
+ *
+ * @param svc  service type to select
+ * @return a copy of the matching (key, state) entries; empty if none
+ */
+DaemonStateCollection DaemonStateIndex::get_by_service(
+  const std::string& svc) const
+{
+  std::shared_lock l{lock};
+
+  DaemonStateCollection result;
+
+  // `all` is ordered by DaemonKey with entries of one type adjacent to
+  // each other (cull() relies on the same property), so seek to the
+  // first key of this type and stop at the first key of another type
+  // instead of scanning the whole index.
+  for (auto it = all.lower_bound({svc, ""});
+       it != all.end() && it->first.type == svc;
+       ++it) {
+    result[it->first] = it->second;
+  }
+
+  return result;
+}
+
+/**
+ * @param hostname  server to look up
+ * @return a copy of the daemons recorded for hostname, or an empty
+ *         collection when the host is unknown
+ */
+DaemonStateCollection DaemonStateIndex::get_by_server(
+  const std::string &hostname) const
+{
+  std::shared_lock l{lock};
+
+  const auto host_entry = by_server.find(hostname);
+  if (host_entry == by_server.end()) {
+    return {};
+  }
+  return host_entry->second;
+}
+
+/**
+ * @return true when a daemon with this key is present in the index
+ */
+bool DaemonStateIndex::exists(const DaemonKey &key) const
+{
+  std::shared_lock l{lock};
+
+  return all.find(key) != all.end();
+}
+
+/**
+ * @return the state recorded for key, or a null pointer when absent
+ */
+DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key)
+{
+  std::shared_lock l{lock};
+
+  const auto entry = all.find(key);
+  if (entry == all.end()) {
+    return nullptr;
+  }
+  return entry->second;
+}
+
+/**
+ * Locked entry point for _rm(): takes the index lock exclusively.
+ */
+void DaemonStateIndex::rm(const DaemonKey &key)
+{
+  std::unique_lock write_guard{lock};
+  _rm(key);
+}
+
+/**
+ * Erase key if present; silently does nothing otherwise.
+ * Takes no lock itself.
+ */
+void DaemonStateIndex::_rm(const DaemonKey &key)
+{
+  const bool present = all.find(key) != all.end();
+  if (present) {
+    _erase(key);
+  }
+}
+
+/**
+ * Drop every daemon of type svc_name whose name is not in names_exist.
+ *
+ * @param svc_name     service type to prune
+ * @param names_exist  daemon names that must be kept
+ */
+void DaemonStateIndex::cull(const std::string& svc_name,
+                            const std::set<std::string>& names_exist)
+{
+  std::vector<string> victims;
+
+  std::unique_lock l{lock};
+  // start at the first key of this type, stop at the first other type
+  for (auto it = all.lower_bound({svc_name, ""});
+       it != all.end() && it->first.type == svc_name;
+       ++it) {
+    const auto& name = it->first.name;
+    if (names_exist.count(name) == 0) {
+      victims.push_back(name);
+    }
+  }
+
+  // erase after the scan so the iteration above is never invalidated
+  for (const auto& name : victims) {
+    DaemonKey daemon_key{svc_name, name};
+    dout(4) << "Removing data for " << daemon_key << dendl;
+    _erase(daemon_key);
+  }
+}
+
+/**
+ * Drop every daemon flagged service_daemon whose type is not in
+ * types_exist.
+ *
+ * @param types_exist  service types that must be kept
+ */
+void DaemonStateIndex::cull_services(const std::set<std::string>& types_exist)
+{
+  std::set<DaemonKey> victims;
+
+  std::unique_lock l{lock};
+  for (const auto& [daemon_key, state] : all) {
+    const bool type_gone = types_exist.count(daemon_key.type) == 0;
+    if (state->service_daemon && type_gone) {
+      victims.insert(daemon_key);
+    }
+  }
+
+  // erase after the scan so the iteration above is never invalidated
+  for (const auto& victim : victims) {
+    dout(4) << "Removing data for " << victim << dendl;
+    _erase(victim);
+  }
+}
+
+/**
+ * Apply one MMgrReport: record type (un)declarations on the sender's
+ * session, then decode the packed counter values - one per path in the
+ * session's declared set, in that set's iteration order - and append
+ * them to the matching PerfCounterInstance.
+ */
+void DaemonPerfCounters::update(const MMgrReport& report)
+{
+  dout(20) << "loading " << report.declare_types.size() << " new types, "
+           << report.undeclare_types.size() << " old types, had "
+           << types.size() << " types, got "
+           << report.packed.length() << " bytes of data" << dendl;
+
+  // Retrieve session state
+  auto priv = report.get_connection()->get_priv();
+  auto session = static_cast<MgrSession*>(priv.get());
+
+  // Load any newly declared types (an already known path is left as is)
+  for (const auto &declared : report.declare_types) {
+    types.emplace(declared.path, declared);
+    session->declared_types.insert(declared.path);
+  }
+  // Remove any old types
+  for (const auto &path : report.undeclare_types) {
+    session->declared_types.erase(path);
+  }
+
+  const auto now = ceph_clock_now();
+
+  // Parse packed data according to declared set of types
+  auto cursor = report.packed.cbegin();
+  DECODE_START(1, cursor);
+  for (const auto &path : session->declared_types) {
+    const auto &counter_type = types.at(path);
+    // Always check the instance exists, as we don't prevent yet
+    // multiple sessions from daemons with the same name, and one
+    // session clearing stats created by another on open.
+    auto slot = instances.find(path);
+    if (slot == instances.end()) {
+      slot = instances.insert({path, counter_type.type}).first;
+    }
+    auto &instance = slot->second;
+
+    uint64_t val = 0;
+    decode(val, cursor);
+    if (counter_type.type & PERFCOUNTER_LONGRUNAVG) {
+      uint64_t avgcount = 0;
+      uint64_t avgcount2 = 0;
+      decode(avgcount, cursor);
+      // decoded so the cursor advances past it; the value is not stored
+      decode(avgcount2, cursor);
+      instance.push_avg(now, val, avgcount);
+    } else {
+      instance.push(now, val);
+    }
+  }
+  DECODE_FINISH(cursor);
+}
+
+/**
+ * Append a (time, value) sample to the plain value buffer.
+ */
+void PerfCounterInstance::push(utime_t t, uint64_t const &v)
+{
+  buffer.push_back(DataPoint(t, v));
+}
+
+/**
+ * Append a (time, sum, count) sample to the averaging buffer.
+ */
+void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s,
+                                   uint64_t const &c)
+{
+  avg_buffer.push_back(AvgDataPoint(t, s, c));
+}
diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h
new file mode 100644
index 000000000..8c21305a9
--- /dev/null
+++ b/src/mgr/DaemonState.h
@@ -0,0 +1,409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef DAEMON_STATE_H_
+#define DAEMON_STATE_H_
+
+#include <map>
+#include <string>
+#include <memory>
+#include <set>
+#include <boost/circular_buffer.hpp>
+
+#include "common/RWLock.h"
+#include "include/str_map.h"
+
+#include "msg/msg_types.h"
+
+// For PerfCounterType
+#include "messages/MMgrReport.h"
+#include "DaemonKey.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+// An instance of a performance counter type, within
+// a particular daemon.
+//
+// Samples live in one of two circular buffers: `buffer` holds plain values,
+// `avg_buffer` holds (sum, count) pairs. The constructor gives a capacity of
+// 20 only to the buffer matching the counter type; the other keeps its
+// default-constructed state.
+class PerfCounterInstance
+{
+ // One timestamped sample of a plain counter
+ class DataPoint
+ {
+ public:
+ utime_t t;
+ uint64_t v;
+ DataPoint(utime_t t_, uint64_t v_)
+ : t(t_), v(v_)
+ {}
+ };
+
+ // One timestamped sample of a long-running average: sum `s` and count `c`
+ class AvgDataPoint
+ {
+ public:
+ utime_t t;
+ uint64_t s;
+ uint64_t c;
+ AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_)
+ : t(t_), s(s_), c(c_)
+ {}
+ };
+
+ boost::circular_buffer<DataPoint> buffer;
+ boost::circular_buffer<AvgDataPoint> avg_buffer;
+
+ // Declared only; the definition is not in this header
+ uint64_t get_current() const;
+
+ public:
+ const boost::circular_buffer<DataPoint> & get_data() const
+ {
+ return buffer;
+ }
+ // Newest plain sample. Calls back() directly, so `buffer` must be non-empty.
+ const DataPoint& get_latest_data() const
+ {
+ return buffer.back();
+ }
+ const boost::circular_buffer<AvgDataPoint> & get_data_avg() const
+ {
+ return avg_buffer;
+ }
+ // Newest average sample. Calls back() directly, so `avg_buffer` must be
+ // non-empty.
+ const AvgDataPoint& get_latest_data_avg() const
+ {
+ return avg_buffer.back();
+ }
+ void push(utime_t t, uint64_t const &v);
+ void push_avg(utime_t t, uint64_t const &s, uint64_t const &c);
+
+ // Size whichever buffer matches `type` to hold the 20 most recent samples.
+ // NOTE(review): not explicit -- callers appear to rely on the implicit
+ // conversion from the type enum when inserting into a map; confirm before
+ // changing.
+ PerfCounterInstance(enum perfcounter_type_d type)
+ {
+ if (type & PERFCOUNTER_LONGRUNAVG)
+ avg_buffer = boost::circular_buffer<AvgDataPoint>(20);
+ else
+ buffer = boost::circular_buffer<DataPoint>(20);
+ };
+};
+
+
+typedef std::map<std::string, PerfCounterType> PerfCounterTypes;
+
+// Performance counters for one daemon
+//
+// Holds a reference to a PerfCounterTypes table owned elsewhere plus this
+// daemon's own map of counter path -> PerfCounterInstance.
+class DaemonPerfCounters
+{
+ public:
+ // The record of perf stat types, shared between daemons
+ // (stored by reference, so the referenced table must outlive this object)
+ PerfCounterTypes &types;
+
+ explicit DaemonPerfCounters(PerfCounterTypes &types_)
+ : types(types_)
+ {}
+
+ // Sample history keyed by counter path
+ std::map<std::string, PerfCounterInstance> instances;
+
+ // Fold the declarations and packed samples of one report into this object
+ void update(const MMgrReport& report);
+
+ // Drop all sample history; the shared `types` table is not touched
+ void clear()
+ {
+ instances.clear();
+ }
+};
+
+// The state that we store about one daemon
+class DaemonState
+{
+ public:
+ ceph::mutex lock = ceph::make_mutex("DaemonState::lock");
+
+ DaemonKey key;
+
+ // The hostname where daemon was last seen running (extracted
+ // from the metadata)
+ std::string hostname;
+
+ // The metadata (hostname, version, etc) sent from the daemon
+ std::map<std::string, std::string> metadata;
+
+ /// device ids -> devname, derived from metadata[device_ids]
+ std::map<std::string,std::string> devices;
+
+ /// device ids -> by-path, derived from metadata[device_ids]
+ std::map<std::string,std::string> devices_bypath;
+
+ // TODO: this can be generalized to other daemons
+ std::vector<DaemonHealthMetric> daemon_health_metrics;
+
+ // Ephemeral state
+ bool service_daemon = false;
+ utime_t service_status_stamp;
+ std::map<std::string, std::string> service_status;
+ utime_t last_service_beacon;
+
+ // running config
+ std::map<std::string,std::map<int32_t,std::string>> config;
+
+ // mon config values we failed to set
+ std::map<std::string,std::string> ignored_mon_config;
+
+ // compiled-in config defaults (rarely used, so we leave them encoded!)
+ bufferlist config_defaults_bl;
+ std::map<std::string,std::string> config_defaults;
+
+ // The perf counters received in MMgrReport messages
+ DaemonPerfCounters perf_counters;
+
+ explicit DaemonState(PerfCounterTypes &types_)
+ : perf_counters(types_)
+ {
+ }
+
+ // Replace the stored metadata with `m` and rebuild the derived device maps.
+ // `hostname` is overwritten only when `m` has a "hostname" key; otherwise
+ // the previous value is kept.
+ void set_metadata(const std::map<std::string,std::string>& m) {
+ devices.clear();
+ devices_bypath.clear();
+ metadata = m;
+ auto p = m.find("device_ids");
+ if (p != m.end()) {
+ map<std::string,std::string> devs, paths; // devname -> id or path
+ get_str_map(p->second, &devs, ",; ");
+ auto q = m.find("device_paths");
+ if (q != m.end()) {
+ get_str_map(q->second, &paths, ",; ");
+ }
+ for (auto& i : devs) {
+ if (i.second.size()) { // skip blank ids
+ devices[i.second] = i.first; // id -> devname
+ auto j = paths.find(i.first);
+ if (j != paths.end()) {
+ devices_bypath[i.second] = j->second; // id -> path
+ }
+ }
+ }
+ }
+ p = m.find("hostname");
+ if (p != m.end()) {
+ hostname = p->second;
+ }
+ }
+
+ // Return the config defaults, decoding config_defaults_bl on demand while
+ // config_defaults is still empty. A decode error is swallowed, so the map
+ // may be returned empty or partially filled in that case.
+ const std::map<std::string,std::string>& _get_config_defaults() {
+ if (config_defaults.empty() &&
+ config_defaults_bl.length()) {
+ auto p = config_defaults_bl.cbegin();
+ try {
+ decode(config_defaults, p);
+ } catch (buffer::error& e) {
+ }
+ }
+ return config_defaults;
+ }
+};
+
+typedef std::shared_ptr<DaemonState> DaemonStatePtr;
+typedef std::map<DaemonKey, DaemonStatePtr> DaemonStateCollection;
+
+
+// What is recorded about one physical device id: where it is attached, which
+// daemons use it, and its stored metadata / health predictions.
+struct DeviceState : public RefCountedObject
+{
+ std::string devid;
+ /// (server,devname,path)
+ std::set<std::tuple<std::string,std::string,std::string>> attachments;
+ std::set<DaemonKey> daemons;
+
+ std::map<string,string> metadata; ///< persistent metadata
+
+ pair<utime_t,utime_t> life_expectancy; ///< when device failure is expected
+ utime_t life_expectancy_stamp; ///< when life expectancy was recorded
+ float wear_level = -1; ///< SSD wear level (negative if unknown)
+
+ void set_metadata(map<string,string>&& m);
+
+ void set_life_expectancy(utime_t from, utime_t to, utime_t now);
+ void rm_life_expectancy();
+
+ void set_wear_level(float wear);
+
+ string get_life_expectancy_str(utime_t now) const;
+
+ /// true if we can be safely forgotten/removed from memory
+ bool empty() const {
+ return daemons.empty() && metadata.empty();
+ }
+
+ void dump(Formatter *f) const;
+ void print(ostream& out) const;
+
+private:
+ // Constructor is private; FRIEND_MAKE_REF grants the ref factory access.
+ FRIEND_MAKE_REF(DeviceState);
+ DeviceState(const std::string& n) : devid(n) {}
+};
+
+/**
+ * Fuse the collection of per-daemon metadata from Ceph into
+ * a view that can be queried by service type, ID or also
+ * by server (aka fqdn).
+ *
+ * Naming convention visible in this header: the inline public helpers take
+ * `lock` themselves, while underscore-prefixed helpers such as
+ * _get_or_create_device() and _erase_device() are called with it already held.
+ */
+class DaemonStateIndex
+{
+private:
+ mutable ceph::shared_mutex lock =
+ ceph::make_shared_mutex("DaemonStateIndex", true, true, true);
+
+ std::map<std::string, DaemonStateCollection> by_server;
+ DaemonStateCollection all;
+ std::set<DaemonKey> updating;
+
+ std::map<std::string,ceph::ref_t<DeviceState>> devices;
+
+ void _erase(const DaemonKey& dmk);
+
+ // Look up `dev`, creating an empty DeviceState for it on first use.
+ ceph::ref_t<DeviceState> _get_or_create_device(const std::string& dev) {
+ auto em = devices.try_emplace(dev, nullptr);
+ auto& d = em.first->second;
+ if (em.second) {
+ d = ceph::make_ref<DeviceState>(dev);
+ }
+ return d;
+ }
+ void _erase_device(const ceph::ref_t<DeviceState>& d) {
+ devices.erase(d->devid);
+ }
+
+public:
+ DaemonStateIndex() {}
+
+ // FIXME: shouldn't really be public, maybe construct DaemonState
+ // objects internally to avoid this.
+ PerfCounterTypes types;
+
+ void insert(DaemonStatePtr dm);
+ void _insert(DaemonStatePtr dm);
+ bool exists(const DaemonKey &key) const;
+ DaemonStatePtr get(const DaemonKey &key);
+ void rm(const DaemonKey &key);
+ void _rm(const DaemonKey &key);
+
+ // Note that these return by value rather than reference to avoid
+ // callers needing to stay in lock while using result. Callers must
+ // still take the individual DaemonState::lock on each entry though.
+ DaemonStateCollection get_by_server(const std::string &hostname) const;
+ DaemonStateCollection get_by_service(const std::string &svc_name) const;
+ // NOTE(review): this inline copy of `all` does not take `lock`, unlike the
+ // other inline accessors in this class -- confirm callers cannot race with
+ // writers.
+ DaemonStateCollection get_all() const {return all;}
+
+ // Invoke cb(by_server, args...) under a shared lock and return its result.
+ template<typename Callback, typename...Args>
+ auto with_daemons_by_server(Callback&& cb, Args&&... args) const ->
+ decltype(cb(by_server, std::forward<Args>(args)...)) {
+ std::shared_lock l{lock};
+
+ return std::forward<Callback>(cb)(by_server, std::forward<Args>(args)...);
+ }
+
+ // Invoke cb(device, args...) under a shared lock.
+ // Returns false, without calling cb, if `dev` is unknown.
+ template<typename Callback, typename...Args>
+ bool with_device(const std::string& dev,
+ Callback&& cb, Args&&... args) const {
+ std::shared_lock l{lock};
+ auto p = devices.find(dev);
+ if (p == devices.end()) {
+ return false;
+ }
+ std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...);
+ return true;
+ }
+
+ // Like with_device() but under an exclusive lock; a device that cb leaves
+ // empty() is erased from the index afterwards.
+ template<typename Callback, typename...Args>
+ bool with_device_write(const std::string& dev,
+ Callback&& cb, Args&&... args) {
+ std::unique_lock l{lock};
+ auto p = devices.find(dev);
+ if (p == devices.end()) {
+ return false;
+ }
+ std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...);
+ if (p->second->empty()) {
+ _erase_device(p->second);
+ }
+ return true;
+ }
+
+ // Invoke cb on the device under an exclusive lock, creating the entry first
+ // if it does not exist.
+ template<typename Callback, typename...Args>
+ void with_device_create(const std::string& dev,
+ Callback&& cb, Args&&... args) {
+ std::unique_lock l{lock};
+ auto d = _get_or_create_device(dev);
+ std::forward<Callback>(cb)(*d, std::forward<Args>(args)...);
+ }
+
+ // Invoke cb once per known device under a shared lock.
+ template<typename Callback, typename...Args>
+ void with_devices(Callback&& cb, Args&&... args) const {
+ std::shared_lock l{lock};
+ for (auto& i : devices) {
+ std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...);
+ }
+ }
+
+ // As with_devices(), but cbi() runs first, after the shared lock is taken.
+ template<typename CallbackInitial, typename Callback, typename...Args>
+ void with_devices2(CallbackInitial&& cbi, // with lock taken
+ Callback&& cb, // for each device
+ Args&&... args) const {
+ std::shared_lock l{lock};
+ cbi();
+ for (auto& i : devices) {
+ std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...);
+ }
+ }
+
+ // Add to *ls the device ids of every daemon recorded on `server`, taking
+ // each DaemonState::lock while its device map is read.
+ void list_devids_by_server(const std::string& server,
+ std::set<std::string> *ls) {
+ auto m = get_by_server(server);
+ for (auto& i : m) {
+ std::lock_guard l(i.second->lock);
+ for (auto& j : i.second->devices) {
+ ls->insert(j.first);
+ }
+ }
+ }
+
+ // The `updating` set is maintained by the three helpers below.
+ void notify_updating(const DaemonKey &k) {
+ std::unique_lock l{lock};
+ updating.insert(k);
+ }
+ void clear_updating(const DaemonKey &k) {
+ std::unique_lock l{lock};
+ updating.erase(k);
+ }
+ bool is_updating(const DaemonKey &k) {
+ std::shared_lock l{lock};
+ return updating.count(k) > 0;
+ }
+
+ // Replace a daemon's metadata. The index lock is held exclusively across
+ // the remove / set_metadata / re-insert sequence; state->lock is held only
+ // around set_metadata().
+ void update_metadata(DaemonStatePtr state,
+ const map<string,string>& meta) {
+ // remove and re-insert in case the device metadata changed
+ std::unique_lock l{lock};
+ _rm(state->key);
+ {
+ std::lock_guard l2{state->lock};
+ state->set_metadata(meta);
+ }
+ _insert(state);
+ }
+
+ /**
+ * Remove state for all daemons of this type whose names are
+ * not present in `names_exist`. Use this function when you have
+ * a cluster map and want to ensure that anything absent in the map
+ * is also absent in this class.
+ */
+ void cull(const std::string& svc_name,
+ const std::set<std::string>& names_exist);
+ void cull_services(const std::set<std::string>& types_exist);
+};
+
+#endif
+
diff --git a/src/mgr/Gil.cc b/src/mgr/Gil.cc
new file mode 100644
index 000000000..de27b9acd
--- /dev/null
+++ b/src/mgr/Gil.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "Python.h"
+
+#include "common/debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+#include "Gil.h"
+
+// Wrap a non-null Python thread state and remember which POSIX thread
+// constructed the wrapper.
+SafeThreadState::SafeThreadState(PyThreadState *ts_)
+  : ts(ts_),
+    thread(pthread_self())
+{
+  ceph_assert(ts != nullptr);
+}
+
+// Acquire the GIL using the given thread state. With new_thread == true a
+// fresh PyThreadState is created for the calling OS thread and made current;
+// otherwise the caller must be the thread recorded in `ts`.
+Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts)
+{
+ // Acquire the GIL, set the current thread state
+ PyEval_RestoreThread(pThreadState.ts);
+ dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl;
+
+ //
+ // If called from a separate OS thread (i.e. a thread not created
+ // by Python, that doesn't already have a python thread state that
+ // was created when that thread was active), we need to manually
+ // create and switch to a python thread state specifically for this
+ // OS thread.
+ //
+ // Note that instead of requiring the caller to set new_thread == true
+ // when calling this from a separate OS thread, we could figure out
+ // if this was necessary automatically, as follows:
+ //
+ // if (pThreadState->thread_id != PyThread_get_thread_ident()) {
+ //
+ // However, this means we're accessing pThreadState->thread_id, but
+ // the Python C API docs say that "The only public data member is
+ // PyInterpreterState *interp", i.e. doing this would violate
+ // something that's meant to be a black box.
+ //
+ if (new_thread) {
+ pNewThreadState = PyThreadState_New(pThreadState.ts->interp);
+ PyThreadState_Swap(pNewThreadState);
+ dout(20) << "Switched to new thread state " << pNewThreadState << dendl;
+ } else {
+ // Same-thread use: check we are on the thread recorded in the wrapper
+ ceph_assert(pthread_self() == pThreadState.thread);
+ }
+}
+
+// Undo the constructor: if a temporary thread state was created, switch back
+// to the original one and destroy the temporary, then release the GIL.
+Gil::~Gil()
+{
+ if (pNewThreadState != nullptr) {
+ dout(20) << "Destroying new thread state " << pNewThreadState << dendl;
+ // Swap back first so the state being cleared/deleted is no longer current
+ PyThreadState_Swap(pThreadState.ts);
+ PyThreadState_Clear(pNewThreadState);
+ PyThreadState_Delete(pNewThreadState);
+ }
+ // Release the GIL, reset the thread state to NULL
+ PyEval_SaveThread();
+ dout(25) << "GIL released for thread state " << pThreadState.ts << dendl;
+}
+
+// Release the GIL for the lifetime of this object. The caller must hold the
+// GIL on entry; this is checked with a plain assert(), so only in builds
+// where assert is active.
+without_gil_t::without_gil_t()
+{
+ assert(PyGILState_Check());
+ release_gil();
+}
+
+// Re-acquire the GIL on scope exit, unless it has already been re-acquired
+// (acquire_gil() resets `save` to nullptr).
+without_gil_t::~without_gil_t()
+{
+ if (save) {
+ acquire_gil();
+ }
+}
+
+// Release the GIL and keep the thread state returned by PyEval_SaveThread()
+// so that acquire_gil() can restore it.
+void without_gil_t::release_gil()
+{
+ save = PyEval_SaveThread();
+}
+
+// Re-acquire the GIL with the thread state saved by release_gil(), then clear
+// `save` so the destructor does not restore it a second time.
+void without_gil_t::acquire_gil()
+{
+ assert(save);
+ PyEval_RestoreThread(save);
+ save = nullptr;
+}
+
+// Temporarily take the GIL back inside a without_gil_t scope.
+with_gil_t::with_gil_t(without_gil_t& allow_threads)
+ : allow_threads{allow_threads}
+{
+ allow_threads.acquire_gil();
+}
+
+// Hand the GIL back, returning the enclosing without_gil_t to its released
+// state.
+with_gil_t::~with_gil_t()
+{
+ allow_threads.release_gil();
+}
diff --git a/src/mgr/Gil.h b/src/mgr/Gil.h
new file mode 100644
index 000000000..72675a503
--- /dev/null
+++ b/src/mgr/Gil.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cassert>
+#include <functional>
+
+struct _ts;
+typedef struct _ts PyThreadState;
+
+#include <pthread.h>
+
+
+/**
+ * Wrap PyThreadState to carry a record of which POSIX thread
+ * the thread state relates to. This allows the Gil class to
+ * validate that we're being used from the right thread.
+ */
+class SafeThreadState
+{
+ public:
+ explicit SafeThreadState(PyThreadState *ts_);
+
+ // Empty wrapper; fill it in later with set().
+ // NOTE(review): thread(0) assumes pthread_t can be initialised from an
+ // integer, which holds on Linux but is not guaranteed by POSIX.
+ SafeThreadState()
+ : ts(nullptr), thread(0)
+ {
+ }
+
+ PyThreadState *ts;
+ pthread_t thread;
+
+ // Store the thread state and record the calling thread as its owner.
+ // Unlike the explicit constructor, the inline code here does not check
+ // ts_ for null.
+ void set(PyThreadState *ts_)
+ {
+ ts = ts_;
+ thread = pthread_self();
+ }
+};
+
+//
+// Use one of these in any scope in which you need to hold Python's
+// Global Interpreter Lock.
+//
+// Do *not* nest these, as a second GIL acquire will deadlock (see
+// https://docs.python.org/2/c-api/init.html#c.PyEval_RestoreThread)
+//
+// If in doubt, explicitly put a scope around the block of code you
+// know you need the GIL in.
+//
+// See the comment in Gil::Gil for when to set new_thread == true
+//
+// Scoped GIL holder: the constructor acquires, the destructor releases.
+// Non-copyable.
+class Gil {
+public:
+ Gil(const Gil&) = delete;
+ Gil& operator=(const Gil&) = delete;
+
+ Gil(SafeThreadState &ts, bool new_thread = false);
+ ~Gil();
+
+private:
+ // Thread state supplied by the caller; held by reference
+ SafeThreadState &pThreadState;
+ // Initialised to nullptr; the destructor tears it down when non-null
+ PyThreadState *pNewThreadState = nullptr;
+};
+
+// because the Python runtime could relinquish the GIL when performing GC
+// and re-acquire it afterwards, we should enforce following locking policy:
+// 1. do not acquire locks when holding the GIL, use a without_gil or
+// without_gil_t to guard the code which acquires non-gil locks.
+// 2. always hold a GIL when calling python functions, for example, when
+// constructing a PyFormatter instance.
+//
+// a wrapper that provides a convenient RAII-style mechanism for releasing
+// and re-acquiring the GIL, like the macros Py_BEGIN_ALLOW_THREADS and
+// Py_END_ALLOW_THREADS.
+// Scope guard that runs with the GIL released.
+struct without_gil_t {
+ without_gil_t();
+ ~without_gil_t();
+ void release_gil();
+ void acquire_gil();
+private:
+ // Initialised to nullptr; the thread state handed between release_gil()
+ // and acquire_gil()
+ PyThreadState *save = nullptr;
+ friend struct with_gil_t;
+};
+
+// Scope guard that temporarily re-acquires the GIL inside a without_gil_t
+// scope and releases it again on destruction.
+struct with_gil_t {
+ with_gil_t(without_gil_t& allow_threads);
+ ~with_gil_t();
+private:
+ // The enclosing without_gil_t; held by reference, so it must outlive this
+ // object
+ without_gil_t& allow_threads;
+};
+
+// Call func with the GIL held for the duration of the call, handing it back
+// to the enclosing without_gil_t scope afterwards. Returns func's result by
+// value.
+template<typename Func>
+auto with_gil(without_gil_t& no_gil, Func&& func) {
+  with_gil_t held(no_gil);
+  return std::invoke(std::forward<Func>(func));
+}
+
+// Call func with the GIL released; the GIL is re-acquired when the guard goes
+// out of scope. Returns func's result by value.
+template<typename Func>
+auto without_gil(Func&& func) {
+  without_gil_t released;
+  return std::invoke(std::forward<Func>(func));
+}
diff --git a/src/mgr/MDSPerfMetricCollector.cc b/src/mgr/MDSPerfMetricCollector.cc
new file mode 100644
index 000000000..62298aba3
--- /dev/null
+++ b/src/mgr/MDSPerfMetricCollector.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "messages/MMgrReport.h"
+#include "mgr/MDSPerfMetricTypes.h"
+#include "mgr/MDSPerfMetricCollector.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr.mds_perf_metric_collector " << __func__ << " "
+
+// Forward the listener to the generic MetricCollector base, instantiated for
+// the MDS query/limit/key/metrics types.
+MDSPerfMetricCollector::MDSPerfMetricCollector(MetricListener &listener)
+ : MetricCollector<MDSPerfMetricQuery,
+ MDSPerfMetricLimit,
+ MDSPerfMetricKey,
+ MDSPerfMetrics>(listener) {
+}
+
+// Ingest one MDS metric payload: overwrite the stored counters with the
+// reported ones, replace the delayed-rank set, and stamp the update time.
+void MDSPerfMetricCollector::process_reports(const MetricPayload &payload) {
+ // NOTE(review): presumably MetricPayload is a boost::variant, in which case
+ // this reference form of boost::get throws boost::bad_get when the payload
+ // holds another alternative -- confirm callers only pass MDS payloads.
+ const MDSPerfMetricReport &metric_report = boost::get<MDSMetricPayload>(payload).metric_report;
+
+ std::lock_guard locker(lock);
+ // The lambda overwrites both counter fields rather than accumulating them
+ process_reports_generic(
+ metric_report.reports, [](PerformanceCounter *counter, const PerformanceCounter &update) {
+ counter->first = update.first;
+ counter->second = update.second;
+ });
+
+ // update delayed rank set
+ delayed_ranks = metric_report.rank_metrics_delayed;
+ dout(20) << ": delayed ranks=[" << delayed_ranks << "]" << dendl;
+
+ // The return value is not checked. CLOCK_MONOTONIC_COARSE is a
+ // Linux-specific clock id.
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &last_updated_mono);
+}
+
+// Fill an MDSPerfCollector with the counters for its query id, the current
+// delayed-rank set and the last update timestamp. Returns the non-zero result
+// of the generic lookup on failure, leaving the other fields untouched;
+// returns 0 on success.
+int MDSPerfMetricCollector::get_counters(PerfCollector *collector) {
+  auto *mds_collector = static_cast<MDSPerfCollector *>(collector);
+
+  std::lock_guard locker(lock);
+
+  if (const int r = get_counters_generic(mds_collector->query_id,
+                                         &mds_collector->counters);
+      r != 0) {
+    return r;
+  }
+
+  get_delayed_ranks(&mds_collector->delayed_ranks);
+  get_last_updated(&mds_collector->last_updated_mono);
+  return 0;
+}
+
+// Copy the current delayed-rank set into *ranks. The caller must hold `lock`.
+void MDSPerfMetricCollector::get_delayed_ranks(std::set<mds_rank_t> *ranks) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+ *ranks = delayed_ranks;
+}
+
+// Convert the stored timespec of the last report to a utime_t and write it to
+// *ts. The caller must hold `lock`.
+void MDSPerfMetricCollector::get_last_updated(utime_t *ts) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+ *ts = utime_t(last_updated_mono);
+}
diff --git a/src/mgr/MDSPerfMetricCollector.h b/src/mgr/MDSPerfMetricCollector.h
new file mode 100644
index 000000000..c72bce091
--- /dev/null
+++ b/src/mgr/MDSPerfMetricCollector.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_MDS_PERF_COLLECTOR_H
+#define CEPH_MGR_MDS_PERF_COLLECTOR_H
+
+#include "mgr/MetricCollector.h"
+#include "mgr/MDSPerfMetricTypes.h"
+
+// MDS performance query class
+//
+// Specialises the generic MetricCollector for MDS metrics and additionally
+// tracks which ranks reported late and when the last report was processed.
+class MDSPerfMetricCollector
+  : public MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey,
+                           MDSPerfMetrics> {
+private:
+  // Ranks flagged as delayed by the most recent report
+  std::set<mds_rank_t> delayed_ranks;
+  // Timestamp of the most recent process_reports() call. Value-initialised so
+  // that get_counters() issued before any report has arrived returns a zero
+  // timestamp instead of reading an indeterminate timespec.
+  struct timespec last_updated_mono = {};
+
+  // Both getters require the collector lock to be held by the caller
+  void get_delayed_ranks(std::set<mds_rank_t> *ranks);
+
+  void get_last_updated(utime_t *ts);
+public:
+  MDSPerfMetricCollector(MetricListener &listener);
+
+  void process_reports(const MetricPayload &payload) override;
+  int get_counters(PerfCollector *collector) override;
+};
+
+#endif // CEPH_MGR_MDS_PERF_COLLECTOR_H
diff --git a/src/mgr/MDSPerfMetricTypes.cc b/src/mgr/MDSPerfMetricTypes.cc
new file mode 100644
index 000000000..a16003774
--- /dev/null
+++ b/src/mgr/MDSPerfMetricTypes.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <ostream>
+#include "mgr/MDSPerfMetricTypes.h"
+
+// Print a sub-key descriptor as "<type name>~/<regex>/"; an unrecognised type
+// is printed as "unknown (<numeric value>)".
+std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d) {
+  if (d.type == MDSPerfMetricSubKeyType::MDS_RANK) {
+    os << "mds_rank";
+  } else if (d.type == MDSPerfMetricSubKeyType::CLIENT_ID) {
+    os << "client_id";
+  } else {
+    os << "unknown (" << static_cast<int>(d.type) << ")";
+  }
+
+  os << "~/" << d.regex_str << "/";
+  return os;
+}
+
+// Append both fields of counter `c` to *bl, then abort if this descriptor's
+// type is not one of the known counter types. The known set is the one
+// enumerated by is_supported().
+void MDSPerformanceCounterDescriptor::pack_counter(
+    const PerformanceCounter &c, bufferlist *bl) const {
+  using ceph::encode;
+  encode(c.first, *bl);
+  encode(c.second, *bl);
+  if (!is_supported()) {
+    ceph_abort_msg("unknown counter type");
+  }
+}
+
+// Read both fields of counter *c from `bl`, then abort if this descriptor's
+// type is not one of the known counter types. The known set is the one
+// enumerated by is_supported().
+void MDSPerformanceCounterDescriptor::unpack_counter(
+    bufferlist::const_iterator& bl, PerformanceCounter *c) const {
+  using ceph::decode;
+  decode(c->first, bl);
+  decode(c->second, bl);
+  if (!is_supported()) {
+    ceph_abort_msg("unknown counter type");
+  }
+}
+
+// Print the name of a counter descriptor's type. An unrecognised type (for
+// example the default-constructed -1) is printed as
+// "unknown (<numeric value>)", matching the sub-key descriptor printer,
+// instead of printing nothing.
+std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d) {
+  switch(d.type) {
+  case MDSPerformanceCounterType::CAP_HIT_METRIC:
+    os << "cap_hit_metric";
+    break;
+  case MDSPerformanceCounterType::READ_LATENCY_METRIC:
+    os << "read_latency_metric";
+    break;
+  case MDSPerformanceCounterType::WRITE_LATENCY_METRIC:
+    os << "write_latency_metric";
+    break;
+  case MDSPerformanceCounterType::METADATA_LATENCY_METRIC:
+    os << "metadata_latency_metric";
+    break;
+  case MDSPerformanceCounterType::DENTRY_LEASE_METRIC:
+    os << "dentry_lease_metric";
+    break;
+  case MDSPerformanceCounterType::OPENED_FILES_METRIC:
+    os << "opened_files_metric";
+    break;
+  case MDSPerformanceCounterType::PINNED_ICAPS_METRIC:
+    os << "pinned_icaps_metric";
+    break;
+  case MDSPerformanceCounterType::OPENED_INODES_METRIC:
+    os << "opened_inodes_metric";
+    break;
+  case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
+    os << "read_io_sizes_metric";
+    break;
+  case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
+    os << "write_io_sizes_metric";
+    break;
+  case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
+    os << "avg_read_latency";
+    break;
+  case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
+    os << "stdev_read_latency";
+    break;
+  case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
+    os << "avg_write_latency";
+    break;
+  case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
+    os << "stdev_write_latency";
+    break;
+  case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
+    os << "avg_metadata_latency";
+    break;
+  case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
+    os << "stdev_metadata_latency";
+    break;
+  default:
+    // Keep log lines informative for types this build does not know
+    os << "unknown (" << static_cast<int>(d.type) << ")";
+    break;
+  }
+
+  return os;
+}
+
+// Print a limit as "[order_by=<descriptor>, max_count=<n>]".
+std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit) {
+  os << "[order_by=" << limit.order_by;
+  os << ", max_count=" << limit.max_count << "]";
+  return os;
+}
+
+// Pack one counter per descriptor of this query into *bl, pairing descriptors
+// with `counters` in order. Once `counters` runs out, the remaining
+// descriptors pack a default-constructed counter.
+void MDSPerfMetricQuery::pack_counters(const PerformanceCounters &counters,
+                                       bufferlist *bl) const {
+  auto cur = counters.begin();
+  const auto last = counters.end();
+  for (const auto &desc : performance_counter_descriptors) {
+    if (cur != last) {
+      desc.pack_counter(*cur, bl);
+      ++cur;
+    } else {
+      desc.pack_counter(PerformanceCounter(), bl);
+    }
+  }
+}
+
+// Print a query as "[key=<key descriptor>, counter=<counter descriptors>]".
+std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query) {
+  os << "[key=" << query.key_descriptor;
+  os << ", counter=" << query.performance_counter_descriptors << "]";
+  return os;
+}
diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h
new file mode 100644
index 000000000..aa35b8cab
--- /dev/null
+++ b/src/mgr/MDSPerfMetricTypes.h
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_MDS_PERF_METRIC_TYPES_H
+#define CEPH_MGR_MDS_PERF_METRIC_TYPES_H
+
+#include <regex>
+#include <vector>
+#include <iostream>
+
+#include "include/denc.h"
+#include "include/stringify.h"
+
+#include "mds/mdstypes.h"
+#include "mgr/Types.h"
+
+typedef std::vector<std::string> MDSPerfMetricSubKey; // array of regex match
+typedef std::vector<MDSPerfMetricSubKey> MDSPerfMetricKey;
+
+// Kinds of sub-key a query key can be built from. The descriptor that holds
+// this enum encodes it through DENC, so the numeric values are part of the
+// encoded form.
+enum class MDSPerfMetricSubKeyType : uint8_t {
+ MDS_RANK = 0,
+ CLIENT_ID = 1,
+};
+
+// Describes one sub-key of a query key: which attribute it is taken from and
+// the regular expression applied to it. Only `type` and `regex_str` are
+// encoded and compared; the compiled `regex` is not.
+struct MDSPerfMetricSubKeyDescriptor {
+  // Defaults to an out-of-range value so a default-constructed descriptor
+  // reports !is_supported()
+  MDSPerfMetricSubKeyType type = static_cast<MDSPerfMetricSubKeyType>(-1);
+  std::string regex_str;
+  std::regex regex;
+
+  // True when `type` is one of the known sub-key types
+  bool is_supported() const {
+    return type == MDSPerfMetricSubKeyType::MDS_RANK ||
+           type == MDSPerfMetricSubKeyType::CLIENT_ID;
+  }
+
+  MDSPerfMetricSubKeyDescriptor() {
+  }
+  // Does not compile `regex`; only the pattern string is stored
+  MDSPerfMetricSubKeyDescriptor(MDSPerfMetricSubKeyType type, const std::string &regex_str)
+    : type(type), regex_str(regex_str) {
+  }
+
+  // Order by type first, then by pattern string
+  bool operator<(const MDSPerfMetricSubKeyDescriptor &other) const {
+    if (type != other.type) {
+      return type < other.type;
+    }
+    return regex_str < other.regex_str;
+  }
+
+  DENC(MDSPerfMetricSubKeyDescriptor, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.type, p);
+    denc(v.regex_str, p);
+    DENC_FINISH(p);
+  }
+};
+WRITE_CLASS_DENC(MDSPerfMetricSubKeyDescriptor)
+
+std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d);
+typedef std::vector<MDSPerfMetricSubKeyDescriptor> MDSPerfMetricKeyDescriptor;
+
+// DENC traits for a vector of sub-key descriptors.
+//
+// Decoding validates as it goes: if any element has an unsupported type, a
+// regex that fails to compile, or a regex without a capture group, the whole
+// vector is cleared and decoding stops.
+template<>
+struct denc_traits<MDSPerfMetricKeyDescriptor> {
+  static constexpr bool supported = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool need_contiguous = true;
+  // Upper bound on the encoded size. Each element carries a variable-length
+  // regex string, so the bound is accumulated per element; multiplying the
+  // first element's bound by the count would under-estimate whenever a later
+  // regex is longer than the first.
+  static void bound_encode(const MDSPerfMetricKeyDescriptor& v, size_t& p) {
+    p += sizeof(uint32_t);
+    for (const auto& d : v) {
+      denc(d, p);
+    }
+  }
+  static void encode(const MDSPerfMetricKeyDescriptor& v,
+                     ceph::buffer::list::contiguous_appender& p) {
+    denc_varint(v.size(), p);
+    for (auto& i : v) {
+      denc(i, p);
+    }
+  }
+  static void decode(MDSPerfMetricKeyDescriptor& v,
+                     ceph::buffer::ptr::const_iterator& p) {
+    unsigned num;
+    denc_varint(num, p);
+    v.clear();
+    // NOTE(review): `num` comes straight from the encoded data and is passed
+    // to reserve() unchecked -- confirm the source is trusted.
+    v.reserve(num);
+    for (unsigned i=0; i < num; ++i) {
+      MDSPerfMetricSubKeyDescriptor d;
+      denc(d, p);
+      if (!d.is_supported()) {
+        v.clear();
+        return;
+      }
+      // Only the pattern string is encoded, so compile the regex here
+      try {
+        d.regex = d.regex_str.c_str();
+      } catch (const std::regex_error& e) {
+        v.clear();
+        return;
+      }
+      // A usable pattern must contain at least one capture group
+      if (d.regex.mark_count() == 0) {
+        v.clear();
+        return;
+      }
+      v.push_back(std::move(d));
+    }
+  }
+};
+
+// Counter types an MDS performance query can request. The descriptor that
+// holds this enum encodes it through DENC, so the numeric values are part of
+// the encoded form.
+enum class MDSPerformanceCounterType : uint8_t {
+ CAP_HIT_METRIC = 0,
+ READ_LATENCY_METRIC = 1,
+ WRITE_LATENCY_METRIC = 2,
+ METADATA_LATENCY_METRIC = 3,
+ DENTRY_LEASE_METRIC = 4,
+ OPENED_FILES_METRIC = 5,
+ PINNED_ICAPS_METRIC = 6,
+ OPENED_INODES_METRIC = 7,
+ READ_IO_SIZES_METRIC = 8,
+ WRITE_IO_SIZES_METRIC = 9,
+ AVG_READ_LATENCY_METRIC = 10,
+ STDEV_READ_LATENCY_METRIC = 11,
+ AVG_WRITE_LATENCY_METRIC = 12,
+ STDEV_WRITE_LATENCY_METRIC = 13,
+ AVG_METADATA_LATENCY_METRIC = 14,
+ STDEV_METADATA_LATENCY_METRIC = 15,
+};
+
+// Identifies one performance counter type within a query; compares and
+// encodes by `type` alone.
+struct MDSPerformanceCounterDescriptor {
+ // Defaults to an out-of-range value so a default-constructed descriptor
+ // reports !is_supported()
+ MDSPerformanceCounterType type = static_cast<MDSPerformanceCounterType>(-1);
+
+ // True when `type` is one of the enumerators listed below
+ bool is_supported() const {
+ switch(type) {
+ case MDSPerformanceCounterType::CAP_HIT_METRIC:
+ case MDSPerformanceCounterType::READ_LATENCY_METRIC:
+ case MDSPerformanceCounterType::WRITE_LATENCY_METRIC:
+ case MDSPerformanceCounterType::METADATA_LATENCY_METRIC:
+ case MDSPerformanceCounterType::DENTRY_LEASE_METRIC:
+ case MDSPerformanceCounterType::OPENED_FILES_METRIC:
+ case MDSPerformanceCounterType::PINNED_ICAPS_METRIC:
+ case MDSPerformanceCounterType::OPENED_INODES_METRIC:
+ case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
+ case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
+ case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
+ case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
+ case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
+ case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
+ case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
+ case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ MDSPerformanceCounterDescriptor() {
+ }
+ // Not explicit: a bare MDSPerformanceCounterType converts implicitly
+ MDSPerformanceCounterDescriptor(MDSPerformanceCounterType type) : type(type) {
+ }
+
+ bool operator<(const MDSPerformanceCounterDescriptor &other) const {
+ return type < other.type;
+ }
+
+ bool operator==(const MDSPerformanceCounterDescriptor &other) const {
+ return type == other.type;
+ }
+
+ bool operator!=(const MDSPerformanceCounterDescriptor &other) const {
+ return type != other.type;
+ }
+
+ DENC(MDSPerformanceCounterDescriptor, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ DENC_FINISH(p);
+ }
+
+ // Serialise / deserialise a counter value for this descriptor (declared
+ // here, defined out of line)
+ void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const;
+ void unpack_counter(ceph::buffer::list::const_iterator& bl, PerformanceCounter *c) const;
+};
+WRITE_CLASS_DENC(MDSPerformanceCounterDescriptor)
+
+std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d);
+typedef std::vector<MDSPerformanceCounterDescriptor> MDSPerformanceCounterDescriptors;
+
+// DENC traits for a vector of counter descriptors. Decoding drops elements
+// whose type is not supported and keeps the rest.
+template<>
+struct denc_traits<MDSPerformanceCounterDescriptors> {
+  static constexpr bool supported = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool need_contiguous = true;
+  // Bound: length prefix plus (bound of the first element) * element count
+  static void bound_encode(const MDSPerformanceCounterDescriptors& v, size_t& p) {
+    p += sizeof(uint32_t);
+    if (v.empty()) {
+      return;
+    }
+    size_t per_element = 0;
+    denc(v.front(), per_element);
+    p += per_element * v.size();
+  }
+  static void encode(const MDSPerformanceCounterDescriptors& v,
+                     ceph::buffer::list::contiguous_appender& p) {
+    denc_varint(v.size(), p);
+    for (auto& desc : v) {
+      denc(desc, p);
+    }
+  }
+  static void decode(MDSPerformanceCounterDescriptors& v,
+                     ceph::buffer::ptr::const_iterator& p) {
+    unsigned count;
+    denc_varint(count, p);
+    v.clear();
+    v.reserve(count);
+    for (unsigned idx = 0; idx < count; ++idx) {
+      MDSPerformanceCounterDescriptor desc;
+      denc(desc, p);
+      if (!desc.is_supported()) {
+        continue;  // skip counter types this build does not know
+      }
+      v.push_back(std::move(desc));
+    }
+  }
+};
+
+// Pairs a performance counter descriptor (`order_by`) with a count
+// (`max_count`). Limits are ordered by (order_by, max_count) so they can be
+// kept in a std::set (see MDSPerfMetricLimits).
+struct MDSPerfMetricLimit {
+  MDSPerformanceCounterDescriptor order_by;
+  // Zero-initialised so that a default-constructed limit never exposes an
+  // indeterminate value to operator< or to the encoder.
+  uint64_t max_count = 0;
+
+  MDSPerfMetricLimit() {
+  }
+  MDSPerfMetricLimit(const MDSPerformanceCounterDescriptor &order_by, uint64_t max_count)
+    : order_by(order_by), max_count(max_count) {
+  }
+
+  // Orders by descriptor first; max_count only breaks ties.
+  bool operator<(const MDSPerfMetricLimit &other) const {
+    if (order_by != other.order_by) {
+      return order_by < other.order_by;
+    }
+
+    return max_count < other.max_count;
+  }
+
+  // denc body: (1, 1) versioned frame holding order_by then max_count.
+  DENC(MDSPerfMetricLimit, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.order_by, p);
+    denc(v.max_count, p);
+    DENC_FINISH(p);
+  }
+};
+WRITE_CLASS_DENC(MDSPerfMetricLimit)
+
+std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit);
+typedef std::set<MDSPerfMetricLimit> MDSPerfMetricLimits;
+
+struct MDSPerfMetricQuery { // a query: a key descriptor plus the list of performance counters to report for each key
+ MDSPerfMetricKeyDescriptor key_descriptor;
+ MDSPerformanceCounterDescriptors performance_counter_descriptors;
+
+ MDSPerfMetricQuery() {
+ }
+ MDSPerfMetricQuery(const MDSPerfMetricKeyDescriptor &key_descriptor,
+ const MDSPerformanceCounterDescriptors &performance_counter_descriptors)
+ : key_descriptor(key_descriptor),
+ performance_counter_descriptors(performance_counter_descriptors)
+ {
+ }
+
+ bool operator<(const MDSPerfMetricQuery &other) const { // orders by key_descriptor, then by the counter descriptor list
+ if (key_descriptor < other.key_descriptor) {
+ return true;
+ }
+ if (key_descriptor > other.key_descriptor) {
+ return false;
+ }
+ return performance_counter_descriptors < other.performance_counter_descriptors;
+ }
+
+ template <typename L>
+ bool get_key(L&& get_sub_key, MDSPerfMetricKey *key) const { // appends one sub key per sub key descriptor to *key; returns false as soon as get_sub_key fails
+ for (auto &sub_key_descriptor : key_descriptor) {
+ MDSPerfMetricSubKey sub_key;
+ if (!get_sub_key(sub_key_descriptor, &sub_key)) {
+ return false; // sub keys appended before the failure are left in *key
+ }
+ key->push_back(sub_key);
+ }
+ return true;
+ }
+
+ void get_performance_counter_descriptors(MDSPerformanceCounterDescriptors *descriptors) const { // copies the descriptor list into *descriptors
+ *descriptors = performance_counter_descriptors;
+ }
+
+ template <typename L>
+ void update_counters(L &&update_counter, PerformanceCounters *counters) const { // calls update_counter(descriptor, counter) pairwise, growing *counters when it is shorter than the descriptor list
+ auto it = counters->begin();
+ for (auto &descriptor : performance_counter_descriptors) {
+ // TODO: optimize
+ if (it == counters->end()) {
+ counters->push_back(PerformanceCounter());
+ it = std::prev(counters->end()); // re-derive the iterator after the append so it refers to the new last element
+ }
+ update_counter(descriptor, &(*it));
+ it++;
+ }
+ }
+
+ DENC(MDSPerfMetricQuery, v, p) { // denc body: (1, 1) versioned frame holding key_descriptor then the counter descriptors
+ DENC_START(1, 1, p);
+ denc(v.key_descriptor, p);
+ denc(v.performance_counter_descriptors, p);
+ DENC_FINISH(p);
+ }
+
+ void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; // declaration only; defined elsewhere
+};
+WRITE_CLASS_DENC(MDSPerfMetricQuery)
+
+std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query);
+
+struct MDSPerfCollector : PerfCollector { // PerfCollector extended with MDS-specific result fields
+ std::map<MDSPerfMetricKey, PerformanceCounters> counters; // counters per metric key
+ std::set<mds_rank_t> delayed_ranks;
+ utime_t last_updated_mono;
+
+ MDSPerfCollector(MetricQueryID query_id)
+ : PerfCollector(query_id) { // forwards the query id to the base class; the other fields are default-initialised
+ }
+};
+
+struct MDSPerfMetrics { // per-query report body: the descriptors that were packed, plus one packed counter buffer per key
+ MDSPerformanceCounterDescriptors performance_counter_descriptors;
+ std::map<MDSPerfMetricKey, ceph::buffer::list> group_packed_performance_counters;
+
+ DENC(MDSPerfMetrics, v, p) { // denc body: (1, 1) versioned frame holding the descriptors then the packed counters
+ DENC_START(1, 1, p);
+ denc(v.performance_counter_descriptors, p);
+ denc(v.group_packed_performance_counters, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct MDSPerfMetricReport { // full MDS report: metrics per query plus the set of ranks with delayed metrics
+ std::map<MDSPerfMetricQuery, MDSPerfMetrics> reports;
+ // set of active ranks that have delayed (stale) metrics
+ std::set<mds_rank_t> rank_metrics_delayed;
+
+ DENC(MDSPerfMetricReport, v, p) { // denc body: (1, 1) versioned frame holding reports then rank_metrics_delayed
+ DENC_START(1, 1, p);
+ denc(v.reports, p);
+ denc(v.rank_metrics_delayed, p);
+ DENC_FINISH(p);
+ }
+};
+
+WRITE_CLASS_DENC(MDSPerfMetrics)
+WRITE_CLASS_DENC(MDSPerfMetricReport)
+
+#endif // CEPH_MGR_MDS_PERF_METRIC_TYPES_H
diff --git a/src/mgr/MetricCollector.cc b/src/mgr/MetricCollector.cc
new file mode 100644
index 000000000..c31dcf0b9
--- /dev/null
+++ b/src/mgr/MetricCollector.cc
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "mgr/MetricCollector.h"
+#include "mgr/OSDPerfMetricTypes.h"
+#include "mgr/MDSPerfMetricTypes.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr.metric_collector " << __func__ << ": "
+
+template <typename Query, typename Limit, typename Key, typename Report>
+MetricCollector<Query, Limit, Key, Report>::MetricCollector(MetricListener &listener) // stores a reference to the listener that is told when the query set changes
+ : listener(listener)
+{
+}
+
+// Register `query` (optionally limited by `limit`) under a freshly allocated
+// query id and create an empty counter map for that id.
+//
+// The listener is notified when the query was not registered before, or when
+// it was and at least one of its existing registrations carries a limit. The
+// notification is sent after the collector lock has been released.
+//
+// @return the id allocated for this registration
+template <typename Query, typename Limit, typename Key, typename Report>
+MetricQueryID MetricCollector<Query, Limit, Key, Report>::add_query(
+    const Query &query,
+    const std::optional<Limit> &limit) {
+  dout(20) << "query=" << query << ", limit=" << limit << dendl;
+  uint64_t query_id;
+  bool notify;
+
+  {
+    std::lock_guard locker(lock);
+
+    query_id = next_query_id++;
+
+    // one lookup: either finds the existing registration map or creates an
+    // empty one for a query seen for the first time
+    auto [entry, inserted] = queries.try_emplace(query);
+    // the limited check looks at the registrations present *before* this
+    // one is added
+    notify = inserted || is_limited(entry->second);
+
+    entry->second.emplace(query_id, limit);
+    counters.emplace(query_id, std::map<Key, PerformanceCounters>{});
+  }
+
+  dout(10) << query << " " << (limit ? stringify(*limit) : "unlimited")
+           << " query_id=" << query_id << dendl;
+
+  if (notify) {
+    listener.handle_query_updated();
+  }
+
+  return query_id;
+}
+
+// Drop the registration identified by `query_id` together with its counters.
+//
+// The listener is notified when the owning query lost its last registration
+// (and was removed), or when the remaining registrations include a limit.
+//
+// @return 0 on success, -ENOENT when no query holds `query_id`
+template <typename Query, typename Limit, typename Key, typename Report>
+int MetricCollector<Query, Limit, Key, Report>::remove_query(MetricQueryID query_id) {
+  dout(20) << "query_id=" << query_id << dendl;
+  bool found = false;
+  bool notify = false;
+
+  {
+    std::lock_guard locker(lock);
+
+    for (auto query_it = queries.begin(); query_it != queries.end(); ++query_it) {
+      auto &registrations = query_it->second;
+      auto reg_it = registrations.find(query_id);
+      if (reg_it == registrations.end()) {
+        continue;
+      }
+
+      found = true;
+      registrations.erase(reg_it);
+      if (registrations.empty()) {
+        // last registration gone: forget the query itself
+        queries.erase(query_it);
+        notify = true;
+      } else if (is_limited(registrations)) {
+        notify = true;
+      }
+      // an id belongs to a single query, so stop at the first hit
+      break;
+    }
+    // counters are erased even if no query referenced the id
+    counters.erase(query_id);
+  }
+
+  if (!found) {
+    dout(10) << query_id << " not found" << dendl;
+    return -ENOENT;
+  }
+
+  dout(10) << query_id << dendl;
+
+  if (notify) {
+    listener.handle_query_updated();
+  }
+
+  return 0;
+}
+
+// Drop every registered query and the counters collected for them.
+//
+// The listener is notified (outside the lock) only if at least one query was
+// registered before the call.
+template <typename Query, typename Limit, typename Key, typename Report>
+void MetricCollector<Query, Limit, Key, Report>::remove_all_queries() {
+  dout(20) << dendl;
+  bool notify;
+
+  {
+    std::lock_guard locker(lock);
+
+    notify = !queries.empty();
+    queries.clear();
+    // remove_query() erases the counters of the id it removes; do the same
+    // here so counters of removed queries are neither kept alive nor
+    // returned by get_counters_generic()
+    counters.clear();
+  }
+
+  if (notify) {
+    listener.handle_query_updated();
+  }
+}
+
+template <typename Query, typename Limit, typename Key, typename Report>
+void MetricCollector<Query, Limit, Key, Report>::reregister_queries() { // unconditionally tells the listener that the query set was updated; does not take the lock or touch state
+ dout(20) << dendl;
+ listener.handle_query_updated();
+}
+
+// Hand the counters collected so far for `query_id` over to `*c` and leave
+// the stored map empty. The caller must hold `lock`.
+//
+// @return 0 on success, -ENOENT when no counters exist for `query_id`
+template <typename Query, typename Limit, typename Key, typename Report>
+int MetricCollector<Query, Limit, Key, Report>::get_counters_generic(
+    MetricQueryID query_id, std::map<Key, PerformanceCounters> *c) {
+  dout(20) << dendl;
+  ceph_assert(ceph_mutex_is_locked(lock));
+
+  if (auto found = counters.find(query_id); found != counters.end()) {
+    auto &collected = found->second;
+    *c = std::move(collected);
+    // make the moved-from map well-defined (empty) for later reports
+    collected.clear();
+    return 0;
+  }
+
+  dout(10) << "counters for " << query_id << " not found" << dendl;
+  return -ENOENT;
+}
+
+// Merge freshly received reports into the per-query-id counters.
+//
+// For every (query, report) pair and every key in the report, the packed
+// counters are unpacked one by one and `callback(stored, received)` is
+// invoked for each query id registered for that query. The caller must hold
+// `lock`.
+//
+// Reports for queries that are not registered are ignored.
+template <typename Query, typename Limit, typename Key, typename Report>
+void MetricCollector<Query, Limit, Key, Report>::process_reports_generic(
+    const std::map<Query, Report> &reports, UpdateCallback callback) {
+  ceph_assert(ceph_mutex_is_locked(lock));
+
+  if (reports.empty()) {
+    return;
+  }
+
+  for (auto& [query, report] : reports) {
+    dout(10) << "report for " << query << " query: "
+             << report.group_packed_performance_counters.size() << " records"
+             << dendl;
+
+    // Look the query up rather than using queries[query]: operator[] would
+    // insert an empty entry for an unregistered query, and a later
+    // add_query() for it would then find that entry and skip notifying the
+    // listener.
+    auto query_it = queries.find(query);
+    if (query_it == queries.end()) {
+      dout(10) << "ignoring report for unregistered query " << query << dendl;
+      continue;
+    }
+    const auto &registrations = query_it->second;
+    const auto &descriptors = query.performance_counter_descriptors;
+
+    for (auto& [key, bl] : report.group_packed_performance_counters) {
+      auto bl_it = bl.cbegin();
+
+      // make sure every registered id has a zeroed slot per descriptor
+      for (auto& p : registrations) {
+        auto &key_counters = counters[p.first][key];
+        if (key_counters.empty()) {
+          key_counters.resize(descriptors.size(), {0, 0});
+        }
+      }
+
+      // the report may carry only a subset of the query's descriptors;
+      // query descriptors missing from the report are skipped
+      auto desc_it = report.performance_counter_descriptors.begin();
+      for (size_t i = 0; i < descriptors.size(); i++) {
+        if (desc_it == report.performance_counter_descriptors.end()) {
+          break;
+        }
+        if (*desc_it != descriptors[i]) {
+          continue;
+        }
+        PerformanceCounter c;
+        desc_it->unpack_counter(bl_it, &c);
+        dout(20) << "counter " << key << " " << *desc_it << ": " << c << dendl;
+
+        for (auto& p : registrations) {
+          auto &key_counters = counters[p.first][key];
+          callback(&key_counters[i], c);
+        }
+        desc_it++;
+      }
+    }
+  }
+}
+
+template class
+MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey, OSDPerfMetricReport>;
+template class
+MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey, MDSPerfMetrics>;
diff --git a/src/mgr/MetricCollector.h b/src/mgr/MetricCollector.h
new file mode 100644
index 000000000..91fa78781
--- /dev/null
+++ b/src/mgr/MetricCollector.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_METRIC_COLLECTOR_H
+#define CEPH_MGR_METRIC_COLLECTOR_H
+
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+#include "common/ceph_mutex.h"
+#include "msg/Message.h"
+#include "mgr/Types.h"
+#include "mgr/MetricTypes.h"
+
+class MMgrReport;
+
+template <typename Query, typename Limit, typename Key, typename Report>
+class MetricCollector { // tracks registered metric queries and the counters collected per query id; subclasses supply report processing
+public:
+ virtual ~MetricCollector() {
+ }
+
+ using Limits = std::set<Limit>;
+
+ MetricCollector(MetricListener &listener);
+
+ MetricQueryID add_query(const Query &query, const std::optional<Limit> &limit); // returns the id allocated for the new registration
+
+ int remove_query(MetricQueryID query_id); // 0 on success, -ENOENT when the id is unknown
+
+ void remove_all_queries();
+
+ void reregister_queries();
+
+ std::map<Query, Limits> get_queries() const { // snapshot, taken under the lock, of every query and the set of limits attached to it
+ std::lock_guard locker(lock);
+
+ std::map<Query, Limits> result;
+ for (auto& [query, limits] : queries) {
+ auto result_it = result.insert({query, {}}).first; // every query appears in the result, even with an empty limit set
+ if (is_limited(limits)) {
+ for (auto& limit : limits) {
+ if (limit.second) {
+ result_it->second.insert(*limit.second); // only registrations that carry a limit contribute
+ }
+ }
+ }
+ }
+
+ return result;
+ }
+
+ virtual void process_reports(const MetricPayload &payload) = 0;
+ virtual int get_counters(PerfCollector *collector) = 0;
+
+protected:
+ typedef std::optional<Limit> OptionalLimit;
+ typedef std::map<MetricQueryID, OptionalLimit> QueryIDLimit;
+ typedef std::map<Query, QueryIDLimit> Queries;
+ typedef std::map<MetricQueryID, std::map<Key, PerformanceCounters>> Counters;
+ typedef std::function<void(PerformanceCounter *, const PerformanceCounter &)> UpdateCallback; // (stored counter, received counter)
+
+ mutable ceph::mutex lock = ceph::make_mutex("mgr::metric::collector::lock"); // mutable so the const get_queries() can lock it
+
+ Queries queries;
+ Counters counters;
+
+ void process_reports_generic(const std::map<Query, Report> &reports, UpdateCallback callback); // asserts that lock is held
+ int get_counters_generic(MetricQueryID query_id, std::map<Key, PerformanceCounters> *counters); // asserts that lock is held
+
+private:
+ MetricListener &listener;
+ MetricQueryID next_query_id = 0;
+
+ bool is_limited(const std::map<MetricQueryID, OptionalLimit> &limits) const { // true when at least one registration carries a limit
+ return std::any_of(begin(limits), end(limits),
+ [](auto &limits) { return limits.second.has_value(); }); // the lambda parameter shadows the outer `limits`; here it is one (id, limit) pair
+ }
+};
+
+#endif // CEPH_MGR_METRIC_COLLECTOR_H
diff --git a/src/mgr/MetricTypes.h b/src/mgr/MetricTypes.h
new file mode 100644
index 000000000..586c470ca
--- /dev/null
+++ b/src/mgr/MetricTypes.h
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_METRIC_TYPES_H
+#define CEPH_MGR_METRIC_TYPES_H
+
+#include <boost/variant.hpp>
+#include "include/denc.h"
+#include "include/ceph_features.h"
+#include "mgr/OSDPerfMetricTypes.h"
+#include "mgr/MDSPerfMetricTypes.h"
+
+enum class MetricReportType { // type tag encoded as uint32_t ahead of a report payload (see EncodeMetricPayloadVisitor)
+ METRIC_REPORT_TYPE_OSD = 0,
+ METRIC_REPORT_TYPE_MDS = 1,
+};
+
+struct OSDMetricPayload { // report payload carrying one OSD perf metric report per query
+ static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_OSD; // type tag written before this payload
+ std::map<OSDPerfMetricQuery, OSDPerfMetricReport> report;
+
+ OSDMetricPayload() {
+ }
+ OSDMetricPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &report)
+ : report(report) { // copies the given map
+ }
+
+ DENC(OSDMetricPayload, v, p) { // denc body: (1, 1) versioned frame around `report`
+ DENC_START(1, 1, p);
+ denc(v.report, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct MDSMetricPayload { // report payload carrying a single MDSPerfMetricReport
+ static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_MDS; // type tag written before this payload
+ MDSPerfMetricReport metric_report;
+
+ MDSMetricPayload() {
+ }
+ MDSMetricPayload(const MDSPerfMetricReport &metric_report)
+ : metric_report(metric_report) { // copies the given report
+ }
+
+ DENC(MDSMetricPayload, v, p) { // denc body: (1, 1) versioned frame around `metric_report`
+ DENC_START(1, 1, p);
+ denc(v.metric_report, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct UnknownMetricPayload { // placeholder alternative: the default of MetricReportMessage and the choice for unrecognised type tags on decode
+ static const MetricReportType METRIC_REPORT_TYPE = static_cast<MetricReportType>(-1);
+
+ UnknownMetricPayload() { }
+
+ DENC(UnknownMetricPayload, v, p) { // NOTE(review): this body aborts, so encoding or decoding this alternative presumably terminates the process - confirm that is intended for unknown tags
+ ceph_abort();
+ }
+};
+
+WRITE_CLASS_DENC(OSDMetricPayload)
+WRITE_CLASS_DENC(MDSMetricPayload)
+WRITE_CLASS_DENC(UnknownMetricPayload)
+
+typedef boost::variant<OSDMetricPayload,
+ MDSMetricPayload,
+ UnknownMetricPayload> MetricPayload;
+
+class EncodeMetricPayloadVisitor : public boost::static_visitor<void> { // variant visitor that appends "type tag + payload" to a buffer list
+public:
+ explicit EncodeMetricPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { // keeps a reference; bl must outlive the visitor
+ }
+
+ template <typename MetricPayload>
+ inline void operator()(const MetricPayload &payload) const { // the template parameter shadows the MetricPayload variant typedef; here it is the concrete alternative
+ using ceph::encode;
+ encode(static_cast<uint32_t>(MetricPayload::METRIC_REPORT_TYPE), m_bl); // tag first, so decode can pick the alternative
+ encode(payload, m_bl);
+ }
+
+private:
+ ceph::buffer::list &m_bl;
+};
+
+// Variant visitor that decodes, from the given buffer iterator, into whichever
+// payload alternative the visited variant currently holds. The type tag must
+// already have been consumed (MetricReportMessage::decode does this).
+class DecodeMetricPayloadVisitor : public boost::static_visitor<void> {
+public:
+  // explicit, matching EncodeMetricPayloadVisitor: an iterator should never
+  // convert to a visitor implicitly. Keeps a reference; `iter` must outlive
+  // the visitor and is advanced by the decode.
+  explicit DecodeMetricPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) {
+  }
+
+  // The template parameter shadows the MetricPayload variant typedef; here it
+  // names the concrete alternative being decoded.
+  template <typename MetricPayload>
+  inline void operator()(MetricPayload &payload) const {
+    using ceph::decode;
+    decode(payload, m_iter);
+  }
+
+private:
+  ceph::buffer::list::const_iterator &m_iter;
+};
+
+struct MetricReportMessage { // envelope around a MetricPayload variant; wire form is a uint32_t type tag followed by the payload
+ MetricPayload payload;
+
+ MetricReportMessage(const MetricPayload &payload = UnknownMetricPayload())
+ : payload(payload) { // defaults to the Unknown alternative
+ }
+
+ bool should_encode(uint64_t features) const { // false only for an MDS payload when the peer lacks SERVER_PACIFIC
+ if (!HAVE_FEATURE(features, SERVER_PACIFIC) &&
+ boost::get<MDSMetricPayload>(&payload)) { // pointer form of boost::get: null unless the variant holds an MDS payload
+ return false;
+ }
+ return true;
+ }
+
+ void encode(ceph::buffer::list &bl) const { // appends tag and payload via EncodeMetricPayloadVisitor
+ boost::apply_visitor(EncodeMetricPayloadVisitor(bl), payload);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &iter) { // reads the tag, selects the matching alternative, then decodes into it
+ using ceph::decode;
+
+ uint32_t metric_report_type;
+ decode(metric_report_type, iter);
+
+ switch (static_cast<MetricReportType>(metric_report_type)) {
+ case MetricReportType::METRIC_REPORT_TYPE_OSD:
+ payload = OSDMetricPayload();
+ break;
+ case MetricReportType::METRIC_REPORT_TYPE_MDS:
+ payload = MDSMetricPayload();
+ break;
+ default:
+ payload = UnknownMetricPayload(); // NOTE(review): the decode below then runs UnknownMetricPayload's DENC body, which calls ceph_abort() - confirm
+ break;
+ }
+
+ boost::apply_visitor(DecodeMetricPayloadVisitor(iter), payload);
+ }
+};
+
+WRITE_CLASS_ENCODER(MetricReportMessage);
+
+// variant for sending configure message to mgr clients
+
+enum MetricConfigType { // type tag encoded as uint32_t ahead of a config payload; unscoped, unlike MetricReportType
+ METRIC_CONFIG_TYPE_OSD = 0,
+ METRIC_CONFIG_TYPE_MDS = 1,
+};
+
+struct OSDConfigPayload { // config payload: the set of limits per OSD perf metric query
+ static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_OSD; // type tag written before this payload
+ std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> config;
+
+ OSDConfigPayload() {
+ }
+ OSDConfigPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &config)
+ : config(config) { // copies the given map
+ }
+
+ DENC(OSDConfigPayload, v, p) { // denc body: (1, 1) versioned frame around `config`
+ DENC_START(1, 1, p);
+ denc(v.config, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct MDSConfigPayload { // config payload: the set of limits per MDS perf metric query
+ static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_MDS; // type tag written before this payload
+ std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> config;
+
+ MDSConfigPayload() {
+ }
+ MDSConfigPayload(const std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> &config)
+ : config(config) { // copies the given map
+ }
+
+ DENC(MDSConfigPayload, v, p) { // denc body: (1, 1) versioned frame around `config`
+ DENC_START(1, 1, p);
+ denc(v.config, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct UnknownConfigPayload { // placeholder alternative: the default of MetricConfigMessage and the choice for unrecognised type tags on decode
+ static const MetricConfigType METRIC_CONFIG_TYPE = static_cast<MetricConfigType>(-1);
+
+ UnknownConfigPayload() { }
+
+ DENC(UnknownConfigPayload, v, p) { // NOTE(review): this body aborts, so encoding or decoding this alternative presumably terminates the process - confirm that is intended for unknown tags
+ ceph_abort();
+ }
+};
+
+WRITE_CLASS_DENC(OSDConfigPayload)
+WRITE_CLASS_DENC(MDSConfigPayload)
+WRITE_CLASS_DENC(UnknownConfigPayload)
+
+typedef boost::variant<OSDConfigPayload,
+ MDSConfigPayload,
+ UnknownConfigPayload> ConfigPayload;
+
+class EncodeConfigPayloadVisitor : public boost::static_visitor<void> { // variant visitor that appends "type tag + payload" to a buffer list
+public:
+ explicit EncodeConfigPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { // keeps a reference; bl must outlive the visitor
+ }
+
+ template <typename ConfigPayload>
+ inline void operator()(const ConfigPayload &payload) const { // the template parameter shadows the ConfigPayload variant typedef; here it is the concrete alternative
+ using ceph::encode;
+ encode(static_cast<uint32_t>(ConfigPayload::METRIC_CONFIG_TYPE), m_bl); // tag first, so decode can pick the alternative
+ encode(payload, m_bl);
+ }
+
+private:
+ ceph::buffer::list &m_bl;
+};
+
+// Variant visitor that decodes, from the given buffer iterator, into whichever
+// payload alternative the visited variant currently holds. The type tag must
+// already have been consumed (MetricConfigMessage::decode does this).
+class DecodeConfigPayloadVisitor : public boost::static_visitor<void> {
+public:
+  // explicit, matching EncodeConfigPayloadVisitor: an iterator should never
+  // convert to a visitor implicitly. Keeps a reference; `iter` must outlive
+  // the visitor and is advanced by the decode.
+  explicit DecodeConfigPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) {
+  }
+
+  // The template parameter shadows the ConfigPayload variant typedef; here it
+  // names the concrete alternative being decoded.
+  template <typename ConfigPayload>
+  inline void operator()(ConfigPayload &payload) const {
+    using ceph::decode;
+    decode(payload, m_iter);
+  }
+
+private:
+  ceph::buffer::list::const_iterator &m_iter;
+};
+
+struct MetricConfigMessage { // envelope around a ConfigPayload variant; wire form is a uint32_t type tag followed by the payload
+ ConfigPayload payload;
+
+ MetricConfigMessage(const ConfigPayload &payload = UnknownConfigPayload())
+ : payload(payload) { // defaults to the Unknown alternative
+ }
+
+ bool should_encode(uint64_t features) const { // false only for an MDS payload when the peer lacks SERVER_PACIFIC
+ if (!HAVE_FEATURE(features, SERVER_PACIFIC) &&
+ boost::get<MDSConfigPayload>(&payload)) { // pointer form of boost::get: null unless the variant holds an MDS payload
+ return false;
+ }
+ return true;
+ }
+
+ void encode(ceph::buffer::list &bl) const { // appends tag and payload via EncodeConfigPayloadVisitor
+ boost::apply_visitor(EncodeConfigPayloadVisitor(bl), payload);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &iter) { // reads the tag, selects the matching alternative, then decodes into it
+ using ceph::decode;
+
+ uint32_t metric_config_type;
+ decode(metric_config_type, iter);
+
+ switch (metric_config_type) { // switches on the raw integer; the unscoped enumerators convert implicitly
+ case MetricConfigType::METRIC_CONFIG_TYPE_OSD:
+ payload = OSDConfigPayload();
+ break;
+ case MetricConfigType::METRIC_CONFIG_TYPE_MDS:
+ payload = MDSConfigPayload();
+ break;
+ default:
+ payload = UnknownConfigPayload(); // NOTE(review): the decode below then runs UnknownConfigPayload's DENC body, which calls ceph_abort() - confirm
+ break;
+ }
+
+ boost::apply_visitor(DecodeConfigPayloadVisitor(iter), payload);
+ }
+};
+
+WRITE_CLASS_ENCODER(MetricConfigMessage);
+
+#endif // CEPH_MGR_METRIC_TYPES_H
diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc
new file mode 100644
index 000000000..bf9eae2e7
--- /dev/null
+++ b/src/mgr/Mgr.cc
@@ -0,0 +1,795 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include <Python.h>
+
+#include "osdc/Objecter.h"
+#include "client/Client.h"
+#include "common/errno.h"
+#include "mon/MonClient.h"
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/signal_handler.h"
+
+#include "mgr/MgrContext.h"
+
+#include "DaemonServer.h"
+#include "messages/MMgrDigest.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MLog.h"
+#include "messages/MServiceMap.h"
+#include "messages/MKVData.h"
+#include "PyModule.h"
+#include "Mgr.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap, // stores the collaborators and builds the finisher, cluster state and daemon server; nothing is started here
+ PyModuleRegistry *py_module_registry_,
+ Messenger *clientm_, Objecter *objecter_,
+ Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) :
+ monc(monc_),
+ objecter(objecter_),
+ client(client_),
+ client_messenger(clientm_),
+ finisher(g_ceph_context, "Mgr", "mgr-fin"),
+ digest_received(false),
+ py_module_registry(py_module_registry_),
+ cluster_state(monc, nullptr, mgrmap), // built with a null objecter; the real one is supplied in the body
+ server(monc, finisher, daemon_state, cluster_state, *py_module_registry,
+ clog_, audit_clog_),
+ clog(clog_),
+ audit_clog(audit_clog_),
+ initialized(false),
+ initializing(false)
+{
+ cluster_state.set_objecter(objecter);
+}
+
+
+Mgr::~Mgr() // empty: no explicit teardown is performed here
+{
+}
+
+void MetadataUpdate::finish(int r) // completion for a mon metadata request: parses the JSON in outbl and updates or creates the DaemonState for `key`
+{
+ daemon_state.clear_updating(key); // cleared first, so it happens on every exit path
+ if (r == 0) {
+ if (key.type == "mds" || key.type == "osd" ||
+ key.type == "mgr" || key.type == "mon") {
+ json_spirit::mValue json_result;
+ bool read_ok = json_spirit::read(
+ outbl.to_str(), json_result);
+ if (!read_ok) {
+ dout(1) << "mon returned invalid JSON for " << key << dendl;
+ return;
+ }
+ if (json_result.type() != json_spirit::obj_type) {
+ dout(1) << "mon returned valid JSON " << key
+ << " but not an object: '" << outbl.to_str() << "'" << dendl;
+ return;
+ }
+ dout(4) << "mon returned valid metadata JSON for " << key << dendl;
+
+ json_spirit::mObject daemon_meta = json_result.get_obj();
+
+ // Skip daemon who doesn't have hostname yet
+ if (daemon_meta.count("hostname") == 0) {
+ dout(1) << "Skipping incomplete metadata entry for " << key << dendl;
+ return;
+ }
+
+ // Apply any defaults
+ for (const auto &i : defaults) {
+ if (daemon_meta.find(i.first) == daemon_meta.end()) {
+ daemon_meta[i.first] = i.second;
+ }
+ }
+
+ if (daemon_state.exists(key)) { // known daemon: refresh hostname and metadata on the existing state
+ DaemonStatePtr state = daemon_state.get(key);
+ map<string,string> m;
+ {
+ std::lock_guard l(state->lock); // held only while hostname is written and m is built
+ state->hostname = daemon_meta.at("hostname").get_str();
+
+ if (key.type == "mds" || key.type == "mgr" || key.type == "mon") {
+ daemon_meta.erase("name"); // the identifying field and hostname are dropped from the stored metadata map
+ } else if (key.type == "osd") {
+ daemon_meta.erase("id");
+ }
+ daemon_meta.erase("hostname");
+ for (const auto &[key, val] : daemon_meta) { // this `key` shadows the member `key` inside the loop
+ m.emplace(key, val.get_str());
+ }
+ }
+ daemon_state.update_metadata(state, m);
+ } else { // unknown daemon: build a new state and insert it
+ auto state = std::make_shared<DaemonState>(daemon_state.types);
+ state->key = key;
+ state->hostname = daemon_meta.at("hostname").get_str();
+
+ if (key.type == "mds" || key.type == "mgr" || key.type == "mon") {
+ daemon_meta.erase("name");
+ } else if (key.type == "osd") {
+ daemon_meta.erase("id");
+ }
+ daemon_meta.erase("hostname");
+
+ map<string,string> m;
+ for (const auto &[key, val] : daemon_meta) { // this `key` shadows the member `key` inside the loop
+ m.emplace(key, val.get_str());
+ }
+ state->set_metadata(m);
+
+ daemon_state.insert(state);
+ }
+ } else {
+ ceph_abort(); // any other daemon type is treated as a programming error
+ }
+ } else {
+ dout(1) << "mon failed to return metadata for " << key
+ << ": " << cpp_strerror(r) << dendl;
+ }
+}
+
+void Mgr::background_init(Context *completion) // starts the finisher and queues init() on it; `completion` is completed with 0 once init() returns
+{
+ std::lock_guard l(lock);
+ ceph_assert(!initializing); // must be called at most once, before any initialisation
+ ceph_assert(!initialized);
+ initializing = true;
+
+ finisher.start();
+
+ finisher.queue(new LambdaContext([this, completion](int r){ // r is unused
+ init();
+ completion->complete(0);
+ }));
+}
+
+// Fetch the mgr's persistent key/value data from the mon's config-key store.
+//
+// Lists every config-key, then fetches the value of each key that starts with
+// either "device/" or PyModule::mgr_store_prefix. Must be called with `lock`
+// held; the lock is dropped while waiting for each mon command and re-taken
+// afterwards. Asserts that the listing succeeded; a failed per-key fetch is
+// tolerated and that key is simply left out of the result.
+//
+// @return map from key name to the raw value returned by the mon
+std::map<std::string, std::string> Mgr::load_store()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+  dout(10) << "listing keys" << dendl;
+  JSONCommand cmd;
+  cmd.run(monc, "{\"prefix\": \"config-key ls\"}");
+  lock.unlock();
+  cmd.wait();
+  lock.lock();
+  ceph_assert(cmd.r == 0);
+
+  // The prefixes are the same for every key, so build them once rather than
+  // on each loop iteration.
+  const std::string store_prefix = PyModule::mgr_store_prefix;
+  const std::string device_prefix = "device/";
+  // In-place prefix test; avoids the temporary string substr() would create.
+  const auto has_prefix = [](const std::string &s, const std::string &prefix) {
+    return s.compare(0, prefix.size(), prefix) == 0;
+  };
+
+  std::map<std::string, std::string> loaded;
+
+  for (auto &key_str : cmd.json_result.get_array()) {
+    const std::string &key = key_str.get_str();
+
+    dout(20) << "saw key '" << key << "'" << dendl;
+
+    if (has_prefix(key, device_prefix) || has_prefix(key, store_prefix)) {
+      dout(20) << "fetching '" << key << "'" << dendl;
+      Command get_cmd;
+      std::ostringstream cmd_json;
+      // NOTE(review): key is spliced into the JSON without escaping; a key
+      // containing '"' or '\' would presumably yield a malformed command -
+      // confirm whether such keys can occur.
+      cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}";
+      get_cmd.run(monc, cmd_json.str());
+      lock.unlock();
+      get_cmd.wait();
+      lock.lock();
+      if (get_cmd.r == 0) { // tolerate racing config-key change
+        loaded[key] = get_cmd.outbl.to_str();
+      }
+    }
+  }
+
+  return loaded;
+}
+
+void Mgr::handle_signal(int signum) // only SIGINT and SIGTERM are accepted (asserted); either one triggers shutdown()
+{
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ shutdown();
+}
+
+static void handle_mgr_signal(int signum) // handler registered by Mgr::init for SIGINT/SIGTERM: logs the signal and exits at once, without calling Mgr::shutdown()
+{
+ derr << " *** Got signal " << sig_str(signum) << " ***" << dendl;
+
+ // The python modules don't reliably shut down, so don't even
+ // try. The mon will blocklist us (and all of our rados/cephfs
+ // clients) anyway. Just exit!
+
+ _exit(0); // exit with 0 result code, as if we had done an orderly shutdown
+}
+
+void Mgr::init() // full active-mgr start-up: subscriptions, OSDMap wait, daemon server, metadata preload, device state, python modules, admin socket command
+{
+ std::unique_lock l(lock); // `l` is what the condition waits below use; the temporary releases further down go through `lock` directly
+ ceph_assert(initializing);
+ ceph_assert(!initialized);
+
+ // Enable signal handlers
+ register_async_signal_handler_oneshot(SIGINT, handle_mgr_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_mgr_signal);
+
+ // Only pacific+ monitors support subscribe to kv updates
+ bool mon_allows_kv_sub = false;
+ monc->with_monmap(
+ [&](const MonMap &monmap) {
+ if (monmap.get_required_features().contains_all(
+ ceph::features::mon::FEATURE_PACIFIC)) {
+ mon_allows_kv_sub = true;
+ }
+ });
+ if (!mon_allows_kv_sub) {
+ // mons are still pre-pacific. wait long enough to ensure our
+ // next beacon is processed so that our module options are
+ // propagated. See https://tracker.ceph.com/issues/49778
+ lock.unlock();
+ dout(10) << "waiting a bit for the pre-pacific mon to process our beacon" << dendl;
+ sleep(g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count() * 3); // three tick periods, slept with the lock released
+ lock.lock();
+ }
+
+ // subscribe to all the maps
+ monc->sub_want("log-info", 0, 0);
+ monc->sub_want("mgrdigest", 0, 0);
+ monc->sub_want("fsmap", 0, 0);
+ monc->sub_want("servicemap", 0, 0);
+ if (mon_allows_kv_sub) {
+ monc->sub_want("kv:config/", 0, 0);
+ monc->sub_want("kv:mgr/", 0, 0);
+ monc->sub_want("kv:device/", 0, 0);
+ }
+
+ dout(4) << "waiting for OSDMap..." << dendl;
+ // Subscribe to OSDMap update to pass on to ClusterState
+ objecter->maybe_request_map();
+
+ // reset the mon session. we get these maps through subscriptions which
+ // are stateful with the connection, so even if *we* don't have them a
+ // previous incarnation sharing the same MonClient may have.
+ monc->reopen_session();
+
+ // Start Objecter and wait for OSD map
+ lock.unlock(); // Drop lock because OSDMap dispatch calls into my ms_dispatch
+ epoch_t e; // assigned by the with_mgrmap callback just below
+ cluster_state.with_mgrmap([&e](const MgrMap& m) {
+ e = m.last_failure_osd_epoch;
+ });
+ /* wait for any blocklists to be applied to previous mgr instance */
+ dout(4) << "Waiting for new OSDMap (e=" << e
+ << ") that may blocklist prior active." << dendl;
+ objecter->wait_for_osd_map(e);
+ lock.lock();
+
+ // Start communicating with daemons to learn statistics etc
+ int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs());
+ if (r < 0) {
+ derr << "Initialize server fail: " << cpp_strerror(r) << dendl;
+ // This is typically due to a bind() failure, so let's let
+ // systemd restart us.
+ exit(1);
+ }
+ dout(4) << "Initialized server at " << server.get_myaddrs() << dendl;
+
+ // Preload all daemon metadata (will subsequently keep this
+ // up to date by watching maps, so do the initial load before
+ // we subscribe to any maps)
+ dout(4) << "Loading daemon metadata..." << dendl;
+ load_all_metadata(); // drops and re-takes `lock` internally while waiting for the mon
+
+ // Populate PGs in ClusterState
+ cluster_state.with_osdmap_and_pgmap([this](const OSDMap &osd_map,
+ const PGMap& pg_map) { // pg_map is unused
+ cluster_state.notify_osdmap(osd_map);
+ });
+
+ // Wait for FSMap
+ dout(4) << "waiting for FSMap..." << dendl;
+ fs_map_cond.wait(l, [this] { return cluster_state.have_fsmap();});
+
+ // Wait for MgrDigest...
+ dout(4) << "waiting for MgrDigest..." << dendl;
+ digest_cond.wait(l, [this] { return digest_received; });
+
+ if (!mon_allows_kv_sub) {
+ dout(4) << "loading config-key data from pre-pacific mon cluster..." << dendl;
+ pre_init_store = load_store(); // in the kv-subscription case this function does not fill pre_init_store itself
+ }
+
+ dout(4) << "initializing device state..." << dendl;
+ // Note: we only have to do this during startup because once we are
+ // active the only changes to this state will originate from one of our
+ // own modules.
+ for (auto p = pre_init_store.lower_bound("device/");
+ p != pre_init_store.end() && p->first.find("device/") == 0; // stops at the first key that no longer starts with "device/"
+ ++p) {
+ string devid = p->first.substr(7); // 7 == strlen("device/")
+ dout(10) << " updating " << devid << dendl;
+ map<string,string> meta;
+ ostringstream ss;
+ int r = get_json_str_map(p->second, ss, &meta, false); // this `r` shadows the outer one for the loop body
+ if (r < 0) {
+ derr << __func__ << " failed to parse " << p->second << ": " << ss.str()
+ << dendl;
+ } else {
+ daemon_state.with_device_create(
+ devid, [&meta] (DeviceState& dev) {
+ dev.set_metadata(std::move(meta));
+ });
+ }
+ }
+
+ // assume finisher already initialized in background_init
+ dout(4) << "starting python modules..." << dendl;
+ py_module_registry->active_start(
+ daemon_state, cluster_state,
+ pre_init_store, mon_allows_kv_sub,
+ *monc, clog, audit_clog, *objecter, *client,
+ finisher, server);
+
+ cluster_state.final_init();
+
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ r = admin_socket->register_command(
+ "mgr_status", this,
+ "Dump mgr status");
+ ceph_assert(r == 0);
+
+ dout(4) << "Complete." << dendl;
+ initializing = false;
+ initialized = true;
+}
+
+// Preload the metadata of every MDS, OSD and mon known to the monitors and
+// insert one DaemonState per daemon into `daemon_state`.
+//
+// Must be called with `lock` held. The three mon commands are issued first
+// and then awaited with the lock released; all three are asserted to have
+// succeeded. Entries without a "hostname" field are skipped. The identifying
+// field ("name", or "id" for OSDs) and "hostname" are removed from the map
+// before the rest is stored as the daemon's metadata.
+void Mgr::load_all_metadata()
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+  JSONCommand mds_cmd;
+  mds_cmd.run(monc, "{\"prefix\": \"mds metadata\"}");
+  JSONCommand osd_cmd;
+  osd_cmd.run(monc, "{\"prefix\": \"osd metadata\"}");
+  JSONCommand mon_cmd;
+  mon_cmd.run(monc, "{\"prefix\": \"mon metadata\"}");
+
+  lock.unlock();
+  mds_cmd.wait();
+  osd_cmd.wait();
+  mon_cmd.wait();
+  lock.lock();
+
+  ceph_assert(mds_cmd.r == 0);
+  ceph_assert(mon_cmd.r == 0);
+  ceph_assert(osd_cmd.r == 0);
+
+  for (auto &metadata_val : mds_cmd.json_result.get_array()) {
+    // copied on purpose: fields are erased from it below
+    json_spirit::mObject daemon_meta = metadata_val.get_obj();
+    if (daemon_meta.count("hostname") == 0) {
+      dout(1) << "Skipping incomplete metadata entry" << dendl;
+      continue;
+    }
+
+    DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types);
+    dm->key = DaemonKey{"mds",
+                        daemon_meta.at("name").get_str()};
+    dm->hostname = daemon_meta.at("hostname").get_str();
+
+    daemon_meta.erase("name");
+    daemon_meta.erase("hostname");
+
+    // Go through set_metadata() like the mon and osd branches below (and
+    // MetadataUpdate::finish) instead of filling dm->metadata directly, so
+    // all daemon types are populated the same way.
+    map<string,string> m;
+    for (const auto &[key, val] : daemon_meta) {
+      m.emplace(key, val.get_str());
+    }
+    dm->set_metadata(m);
+
+    daemon_state.insert(dm);
+  }
+
+  for (auto &metadata_val : mon_cmd.json_result.get_array()) {
+    json_spirit::mObject daemon_meta = metadata_val.get_obj();
+    if (daemon_meta.count("hostname") == 0) {
+      dout(1) << "Skipping incomplete metadata entry" << dendl;
+      continue;
+    }
+
+    DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types);
+    dm->key = DaemonKey{"mon",
+                        daemon_meta.at("name").get_str()};
+    dm->hostname = daemon_meta.at("hostname").get_str();
+
+    daemon_meta.erase("name");
+    daemon_meta.erase("hostname");
+
+    map<string,string> m;
+    for (const auto &[key, val] : daemon_meta) {
+      m.emplace(key, val.get_str());
+    }
+    dm->set_metadata(m);
+
+    daemon_state.insert(dm);
+  }
+
+  for (auto &osd_metadata_val : osd_cmd.json_result.get_array()) {
+    json_spirit::mObject osd_metadata = osd_metadata_val.get_obj();
+    if (osd_metadata.count("hostname") == 0) {
+      dout(1) << "Skipping incomplete metadata entry" << dendl;
+      continue;
+    }
+    dout(4) << osd_metadata.at("hostname").get_str() << dendl;
+
+    DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types);
+    // OSDs are identified by an integer "id" rather than a "name" string
+    dm->key = DaemonKey{"osd",
+                        stringify(osd_metadata.at("id").get_int())};
+    dm->hostname = osd_metadata.at("hostname").get_str();
+
+    osd_metadata.erase("id");
+    osd_metadata.erase("hostname");
+
+    map<string,string> m;
+    for (const auto &i : osd_metadata) {
+      m[i.first] = i.second.get_str();
+    }
+    dm->set_metadata(m);
+
+    daemon_state.insert(dm);
+  }
+}
+
+
+/**
+ * Shut the active mgr down.
+ *
+ * Queues one final context on the finisher that stops the DaemonServer
+ * (while holding Mgr::lock) and then calls active_shutdown() on the Python
+ * module registry; afterwards waits for the finisher queue to drain and
+ * stops the finisher.
+ */
+void Mgr::shutdown()
+{
+ dout(10) << "mgr shutdown init" << dendl;
+ // NOTE: the lambda captures by reference; this relies on the
+ // wait_for_empty() call below not returning before the context has run.
+ finisher.queue(new LambdaContext([&](int) {
+ {
+ std::lock_guard l(lock);
+ // First stop the server so that we're not taking any more incoming
+ // requests
+ server.shutdown();
+ }
+ // after the messenger is stopped, signal modules to shutdown via finisher
+ py_module_registry->active_shutdown();
+ }));
+
+ // Then stop the finisher to ensure its enqueued contexts aren't going
+ // to touch references to the things we're about to tear down
+ finisher.wait_for_empty();
+ finisher.stop();
+}
+
+/**
+ * React to a new OSDMap: request fresh metadata for OSDs that are unknown
+ * to daemon_state or that came up in this very epoch, notify cluster_state
+ * of the new map, and cull daemon_state entries for OSDs that no longer
+ * exist. Must be called with Mgr::lock held.
+ */
+void Mgr::handle_osd_map()
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+ // ids (as strings) of every OSD present in the map; used for culling below
+ std::set<std::string> names_exist;
+
+ /**
+ * When we see a new OSD map, inspect the entity addrs to
+ * see if they have changed (service restart), and if so
+ * reload the metadata.
+ */
+ cluster_state.with_osdmap_and_pgmap([this, &names_exist](const OSDMap &osd_map,
+ const PGMap &pg_map) {
+ for (int osd_id = 0; osd_id < osd_map.get_max_osd(); ++osd_id) {
+ if (!osd_map.exists(osd_id)) {
+ continue;
+ }
+
+ // Remember which OSDs exist so that we can cull any that don't
+ names_exist.insert(stringify(osd_id));
+
+ // Consider whether to update the daemon metadata (new/restarted daemon)
+ const auto k = DaemonKey{"osd", std::to_string(osd_id)};
+ // a metadata request for this daemon is already in flight
+ if (daemon_state.is_updating(k)) {
+ continue;
+ }
+
+ bool update_meta = false;
+ if (daemon_state.exists(k)) {
+ // known daemon: refresh only if it came up in exactly this epoch
+ if (osd_map.get_up_from(osd_id) == osd_map.get_epoch()) {
+ dout(4) << "Mgr::handle_osd_map: osd." << osd_id
+ << " joined cluster at " << "e" << osd_map.get_epoch()
+ << dendl;
+ update_meta = true;
+ }
+ } else {
+ // never seen before: always fetch
+ update_meta = true;
+ }
+ if (update_meta) {
+ // MetadataUpdate marks the key as updating in its constructor and is
+ // handed to the mon command as its completion context
+ auto c = new MetadataUpdate(daemon_state, k);
+ std::ostringstream cmd;
+ cmd << "{\"prefix\": \"osd metadata\", \"id\": "
+ << osd_id << "}";
+ monc->start_mon_command(
+ {cmd.str()},
+ {}, &c->outbl, &c->outs, c);
+ }
+ }
+
+ cluster_state.notify_osdmap(osd_map);
+ });
+
+ // TODO: same culling for MonMap
+ daemon_state.cull("osd", names_exist);
+}
+
+/**
+ * Hand every entry carried by a received MLog message to the Python
+ * module registry, in message order.
+ */
+void Mgr::handle_log(ref_t<MLog> m)
+{
+  const auto &log_entries = m->entries;
+  for (auto it = log_entries.begin(); it != log_entries.end(); ++it) {
+    py_module_registry->notify_all(*it);
+  }
+}
+
+/**
+ * Record a new ServiceMap: acknowledge the subscription at the map's
+ * epoch, store the map in cluster_state and tell the DaemonServer.
+ */
+void Mgr::handle_service_map(ref_t<MServiceMap> m)
+{
+  const auto &incoming = m->service_map;
+  const auto epoch = incoming.epoch;
+
+  dout(10) << "e" << epoch << dendl;
+  monc->sub_got("servicemap", epoch);
+  cluster_state.set_service_map(incoming);
+  server.got_service_map();
+}
+
+/**
+ * React to a new MonMap: request metadata for every monitor named in the
+ * map (unless a request for it is already in flight) and cull daemon_state
+ * entries for monitors that are no longer present.
+ *
+ * Must be called with Mgr::lock held.
+ */
+void Mgr::handle_mon_map()
+{
+  dout(20) << __func__ << dendl;
+  // ceph_assert (not plain assert) so the lock check matches
+  // handle_osd_map()/handle_fs_map() and is not compiled out under NDEBUG.
+  ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+  // names of all monitors in the current map
+  std::set<std::string> names_exist;
+  cluster_state.with_monmap([&] (auto &monmap) {
+    for (unsigned int i = 0; i < monmap.size(); i++) {
+      names_exist.insert(monmap.get_name(i));
+    }
+  });
+
+  for (const auto& name : names_exist) {
+    const auto k = DaemonKey{"mon", name};
+    if (daemon_state.is_updating(k)) {
+      continue;
+    }
+    // MetadataUpdate flags the key as updating in its constructor and is
+    // passed to the mon command as the completion context.
+    auto c = new MetadataUpdate(daemon_state, k);
+    const char* cmd = R"({{"prefix": "mon metadata", "id": "{}"}})";
+    monc->start_mon_command({fmt::format(cmd, name)}, {},
+                            &c->outbl, &c->outs, c);
+  }
+  daemon_state.cull("mon", names_exist);
+}
+
+/**
+ * Dispatch an incoming message while holding Mgr::lock.
+ *
+ * @param m the received message
+ * @return true if the message type was handled here; false for unknown
+ *         types and for CEPH_MSG_FS_MAP, which is processed but
+ *         deliberately reported as unhandled so it passes through.
+ */
+bool Mgr::ms_dispatch2(const ref_t<Message>& m)
+{
+ dout(10) << *m << dendl;
+ std::lock_guard l(lock);
+
+ switch (m->get_type()) {
+ case MSG_MGR_DIGEST:
+ handle_mgr_digest(ref_cast<MMgrDigest>(m));
+ break;
+ case CEPH_MSG_MON_MAP:
+ py_module_registry->notify_all("mon_map", "");
+ handle_mon_map();
+ break;
+ case CEPH_MSG_FS_MAP:
+ py_module_registry->notify_all("fs_map", "");
+ handle_fs_map(ref_cast<MFSMap>(m));
+ return false; // I shall let this pass through for Client
+ case CEPH_MSG_OSD_MAP:
+ handle_osd_map();
+
+ py_module_registry->notify_all("osd_map", "");
+
+ // Continuous subscribe, so that we can generate notifications
+ // for our MgrPyModules
+ objecter->maybe_request_map();
+ break;
+ case MSG_SERVICE_MAP:
+ handle_service_map(ref_cast<MServiceMap>(m));
+ //no users: py_module_registry->notify_all("service_map", "");
+ break;
+ case MSG_LOG:
+ handle_log(ref_cast<MLog>(m));
+ break;
+ case MSG_KV_DATA:
+ {
+ auto msg = ref_cast<MKVData>(m);
+ monc->sub_got("kv:"s + msg->prefix, msg->version);
+ // messages without data only advance the subscription version
+ if (!msg->data.empty()) {
+ if (initialized) {
+ py_module_registry->update_kv_data(
+ msg->prefix,
+ msg->incremental,
+ msg->data
+ );
+ } else {
+ // before we have created the ActivePyModules, we need to
+ // track the store regions we're monitoring
+ if (!msg->incremental) {
+ dout(10) << "full update on " << msg->prefix << dendl;
+ // a full update replaces the region: drop every cached key that
+ // starts with the prefix before applying the new data
+ auto p = pre_init_store.lower_bound(msg->prefix);
+ while (p != pre_init_store.end() && p->first.find(msg->prefix) == 0) {
+ dout(20) << " rm prior " << p->first << dendl;
+ p = pre_init_store.erase(p);
+ }
+ } else {
+ dout(10) << "incremental update on " << msg->prefix << dendl;
+ }
+ // an entry with a value sets the key; an entry without one removes it
+ for (auto& i : msg->data) {
+ if (i.second) {
+ dout(20) << " set " << i.first << " = " << i.second->to_str() << dendl;
+ pre_init_store[i.first] = i.second->to_str();
+ } else {
+ dout(20) << " rm " << i.first << dendl;
+ pre_init_store.erase(i.first);
+ }
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ return false;
+ }
+ return true;
+}
+
+
+/**
+ * React to a new FSMap: acknowledge the subscription, wake anyone waiting
+ * on fs_map_cond, store the map in cluster_state, request metadata for MDS
+ * daemons that are new or whose recorded "addr" no longer matches the map,
+ * and cull daemon_state entries for MDS daemons that are gone.
+ *
+ * Must be called with Mgr::lock held.
+ */
+void Mgr::handle_fs_map(ref_t<MFSMap> m)
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+ // names of all MDS daemons in the new map; used for culling below
+ std::set<std::string> names_exist;
+ const FSMap &new_fsmap = m->get_fsmap();
+
+ monc->sub_got("fsmap", m->epoch);
+
+ fs_map_cond.notify_all();
+
+ // TODO: callers (e.g. from python land) are potentially going to see
+ // the new fsmap before we've bothered populating all the resulting
+ // daemon_state. Maybe we should block python land while we're making
+ // this kind of update?
+
+ cluster_state.set_fsmap(new_fsmap);
+
+ auto mds_info = new_fsmap.get_mds_info();
+ for (const auto &i : mds_info) {
+ const auto &info = i.second;
+
+ if (!new_fsmap.gid_exists(i.first)){
+ continue;
+ }
+
+ // Remember which MDS exists so that we can cull any that don't
+ names_exist.insert(info.name);
+
+ const auto k = DaemonKey{"mds", info.name};
+ // a metadata request for this daemon is already in flight
+ if (daemon_state.is_updating(k)) {
+ continue;
+ }
+
+ bool update = false;
+ if (daemon_state.exists(k)) {
+ auto metadata = daemon_state.get(k);
+ std::lock_guard l(metadata->lock);
+ if (metadata->metadata.empty() ||
+ metadata->metadata.count("addr") == 0) {
+ // nothing to compare against: fetch
+ update = true;
+ } else {
+ // refresh when the stored address string differs from the map's
+ auto metadata_addrs = metadata->metadata.at("addr");
+ const auto map_addrs = info.addrs;
+ update = metadata_addrs != stringify(map_addrs);
+ if (update) {
+ dout(4) << "MDS[" << info.name << "] addr change " << metadata_addrs
+ << " != " << stringify(map_addrs) << dendl;
+ }
+ }
+ } else {
+ update = true;
+ }
+
+ if (update) {
+ auto c = new MetadataUpdate(daemon_state, k);
+
+ // Older MDS daemons don't have addr in the metadata, so
+ // fake it if the returned metadata doesn't have the field.
+ c->set_default("addr", stringify(info.addrs));
+
+ std::ostringstream cmd;
+ cmd << "{\"prefix\": \"mds metadata\", \"who\": \""
+ << info.name << "\"}";
+ monc->start_mon_command(
+ {cmd.str()},
+ {}, &c->outbl, &c->outs, c);
+ }
+ }
+ daemon_state.cull("mds", names_exist);
+}
+
+/**
+ * Accept a new MgrMap.
+ *
+ * @param m the newly received map
+ * @return true if the set of modules differs from the map currently held
+ *         (the new map is then NOT stored and a respawn is logged);
+ *         false once the map has been stored and the server notified.
+ */
+bool Mgr::got_mgr_map(const MgrMap& m)
+{
+  std::lock_guard locker(lock);
+  dout(10) << m << dendl;
+
+  // snapshot the module list of the map we are holding right now
+  set<string> prev_modules;
+  cluster_state.with_mgrmap([&prev_modules](const MgrMap& current) {
+    prev_modules = current.modules;
+  });
+
+  const bool modules_changed = (m.modules != prev_modules);
+  if (!modules_changed) {
+    cluster_state.set_mgr_map(m);
+    server.got_mgr_map();
+    return false;
+  }
+
+  derr << "mgrmap module list changed to (" << m.modules << "), respawn"
+       << dendl;
+  return true;
+}
+
+/**
+ * Load a received digest into cluster_state, notify Python modules of
+ * "health" and "pg_summary", and on the first digest ever seen set
+ * digest_received and wake waiters on digest_cond.
+ */
+void Mgr::handle_mgr_digest(ref_t<MMgrDigest> m)
+{
+  dout(10) << m->mon_status_json.length() << dendl;
+  dout(10) << m->health_json.length() << dendl;
+
+  cluster_state.load_digest(m.get());
+
+  //no users: py_module_registry->notify_all("mon_status", "");
+  py_module_registry->notify_all("health", "");
+  // The digest doubles as a periodic tick: tell python-land the pgmap may
+  // have changed since the previous digest.
+  py_module_registry->notify_all("pg_summary", "");
+
+  dout(10) << "done." << dendl;
+  m.reset();
+
+  if (digest_received) {
+    return;
+  }
+  digest_received = true;
+  digest_cond.notify_all();
+}
+
+/**
+ * Return the service map reported by the Python module registry,
+ * queried while holding Mgr::lock.
+ */
+std::map<std::string, std::string> Mgr::get_services() const
+{
+  std::lock_guard locker(lock);
+  return py_module_registry->get_services();
+}
+
+/**
+ * Admin socket hook entry point.
+ *
+ * Only "mgr_status" is implemented: it dumps the current mgrmap epoch and
+ * the `initialized` flag as an object section named "mgr_status".
+ *
+ * @param admin_command command name
+ * @param cmdmap        parsed command arguments (unused by "mgr_status")
+ * @param f             formatter receiving the output
+ * @param errss         stream receiving the message of a bad_cmd_get
+ * @param out           unused here
+ * @return 0 on success, -ENOSYS for any other command, -EINVAL if a
+ *         bad_cmd_get exception is caught
+ */
+int Mgr::call(
+  std::string_view admin_command,
+  const cmdmap_t& cmdmap,
+  Formatter *f,
+  std::ostream& errss,
+  bufferlist& out)
+{
+  try {
+    if (admin_command != "mgr_status") {
+      return -ENOSYS;
+    }
+
+    f->open_object_section("mgr_status");
+    cluster_state.with_mgrmap(
+      [f](const MgrMap& mm) {
+        f->dump_unsigned("mgrmap_epoch", mm.get_epoch());
+      });
+    f->dump_bool("initialized", initialized);
+    f->close_section();
+    return 0;
+  } catch (const TOPNSPC::common::bad_cmd_get& e) {
+    errss << e.what();
+    return -EINVAL;
+  }
+  // every path above returns; the former trailing `return 0;` was unreachable
+}
diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h
new file mode 100644
index 000000000..28a7da93d
--- /dev/null
+++ b/src/mgr/Mgr.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_MGR_H_
+#define CEPH_MGR_H_
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include <Python.h>
+
+#include "mds/FSMap.h"
+#include "messages/MFSMap.h"
+#include "msg/Messenger.h"
+#include "auth/Auth.h"
+#include "common/Finisher.h"
+#include "mon/MgrMap.h"
+
+#include "DaemonServer.h"
+#include "PyModuleRegistry.h"
+
+#include "DaemonState.h"
+#include "ClusterState.h"
+
+class MCommand;
+class MMgrDigest;
+class MLog;
+class MServiceMap;
+class Objecter;
+class Client;
+
+/**
+ * The active manager: receives cluster maps and digests through
+ * ms_dispatch2(), keeps DaemonStateIndex/ClusterState up to date, owns the
+ * DaemonServer, and answers admin socket commands via call().
+ */
+class Mgr : public AdminSocketHook {
+protected:
+ // collaborators handed in through the constructor
+ MonClient *monc;
+ Objecter *objecter;
+ Client *client;
+ Messenger *client_messenger;
+
+ // taken by ms_dispatch2(), got_mgr_map() and get_services()
+ mutable ceph::mutex lock = ceph::make_mutex("Mgr::lock");
+ Finisher finisher;
+
+ // Track receipt of initial data during startup
+ ceph::condition_variable fs_map_cond;
+ bool digest_received;
+ ceph::condition_variable digest_cond;
+
+ PyModuleRegistry *py_module_registry;
+ DaemonStateIndex daemon_state;
+ ClusterState cluster_state;
+
+ DaemonServer server;
+
+ LogChannelRef clog;
+ LogChannelRef audit_clog;
+
+ // KV data received via MSG_KV_DATA before `initialized` is set
+ std::map<std::string, std::string> pre_init_store;
+
+ void load_all_metadata();
+ std::map<std::string, std::string> load_store();
+ void init();
+
+ bool initialized;
+ bool initializing;
+
+public:
+ Mgr(MonClient *monc_, const MgrMap& mgrmap,
+ PyModuleRegistry *py_module_registry_,
+ Messenger *clientm_, Objecter *objecter_,
+ Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_);
+ ~Mgr();
+
+ bool is_initialized() const {return initialized;}
+ entity_addrvec_t get_server_addrs() const {
+ return server.get_myaddrs();
+ }
+
+ // per-message handlers, invoked from ms_dispatch2()
+ void handle_mgr_digest(ceph::ref_t<MMgrDigest> m);
+ void handle_fs_map(ceph::ref_t<MFSMap> m);
+ void handle_osd_map();
+ void handle_log(ceph::ref_t<MLog> m);
+ void handle_service_map(ceph::ref_t<MServiceMap> m);
+ void handle_mon_map();
+
+ // returns true when the module list changed (the caller is expected to
+ // respawn), false when the map was accepted
+ bool got_mgr_map(const MgrMap& m);
+
+ // returns true if the message was handled here
+ bool ms_dispatch2(const ceph::ref_t<Message>& m);
+
+ void background_init(Context *completion);
+ void shutdown();
+
+ void handle_signal(int signum);
+
+ std::map<std::string, std::string> get_services() const;
+
+ // AdminSocketHook interface
+ int call(
+ std::string_view command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ ceph::buffer::list& out) override;
+};
+
+/**
+ * Context for completion of metadata mon commands: take
+ * the result and stash it in DaemonStateIndex
+ *
+ * Constructing one marks `key` as updating in the index right away, so
+ * callers check DaemonStateIndex::is_updating() to avoid issuing a second
+ * request for the same daemon.
+ */
+class MetadataUpdate : public Context
+{
+
+private:
+ DaemonStateIndex &daemon_state;
+ DaemonKey key;
+
+ // values registered through set_default(); callers use them for fields
+ // the returned metadata may lack
+ std::map<std::string, std::string> defaults;
+
+public:
+ // output buffers handed to start_mon_command() by the caller
+ bufferlist outbl;
+ std::string outs;
+
+ MetadataUpdate(DaemonStateIndex &daemon_state_, const DaemonKey &key_)
+ : daemon_state(daemon_state_), key(key_)
+ {
+ daemon_state.notify_updating(key);
+ }
+
+ // register (or overwrite) a default value for metadata key `k`
+ void set_default(const std::string &k, const std::string &v)
+ {
+ defaults[k] = v;
+ }
+
+ void finish(int r) override;
+};
+
+
+#endif
diff --git a/src/mgr/MgrCap.cc b/src/mgr/MgrCap.cc
new file mode 100644
index 000000000..cba758083
--- /dev/null
+++ b/src/mgr/MgrCap.cc
@@ -0,0 +1,580 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/spirit/include/qi_uint.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+#include <boost/fusion/adapted/struct/adapt_struct.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "MgrCap.h"
+#include "include/stringify.h"
+#include "include/ipaddr.h"
+#include "common/debug.h"
+#include "common/Formatter.h"
+
+#include <algorithm>
+#include <regex>
+
+#include "include/ceph_assert.h"
+
+/**
+ * Return true if `c` is anything other than an ASCII letter, a digit,
+ * '-' or '_', i.e. a character that forces a string to be quoted.
+ *
+ * The value is widened through unsigned char first: calling the <cctype>
+ * classifiers with a negative char (any byte >= 0x80 where char is
+ * signed) is undefined behaviour.
+ */
+static inline bool is_not_alnum_space(char c) {
+  const unsigned char uc = static_cast<unsigned char>(c);
+  return !(isalpha(uc) || isdigit(uc) || (c == '-') || (c == '_'));
+}
+
+/**
+ * Return `str` unchanged when it consists solely of letters, digits, '-'
+ * and '_'; otherwise return it wrapped in double quotes. Embedded quote
+ * characters are not escaped.
+ */
+static std::string maybe_quote_string(const std::string& str) {
+  const bool needs_quotes =
+    std::any_of(str.begin(), str.end(), is_not_alnum_space);
+  if (!needs_quotes) {
+    return str;
+  }
+
+  std::string quoted;
+  quoted.reserve(str.size() + 2);
+  quoted += '"';
+  quoted += str;
+  quoted += '"';
+  return quoted;
+}
+
+#define dout_subsys ceph_subsys_mgr
+
+/**
+ * Print a permission mask: "*" for MGR_CAP_ANY, otherwise the
+ * concatenation of "r", "w" and "x" for each bit that is set (nothing at
+ * all for an empty mask).
+ */
+std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p) {
+  if (p == MGR_CAP_ANY) {
+    out << "*";
+    return out;
+  }
+
+  static const struct {
+    __u8 bit;
+    const char *symbol;
+  } flags[] = {
+    {MGR_CAP_R, "r"},
+    {MGR_CAP_W, "w"},
+    {MGR_CAP_X, "x"},
+  };
+  for (const auto& flag : flags) {
+    if (p & flag.bit) {
+      out << flag.symbol;
+    }
+  }
+  return out;
+}
+
+/**
+ * Print a constraint as it appears in a cap string: "=", " prefix " or
+ * " regex " (nothing for any other match type) followed by the value,
+ * quoted if necessary.
+ */
+std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c) {
+  const char *op = nullptr;
+  if (c.match_type == MgrCapGrantConstraint::MATCH_TYPE_EQUAL) {
+    op = "=";
+  } else if (c.match_type == MgrCapGrantConstraint::MATCH_TYPE_PREFIX) {
+    op = " prefix ";
+  } else if (c.match_type == MgrCapGrantConstraint::MATCH_TYPE_REGEX) {
+    op = " regex ";
+  }
+
+  // unknown match types emit no operator at all
+  if (op != nullptr) {
+    out << op;
+  }
+  return out << maybe_quote_string(c.value);
+}
+
+/**
+ * Print one grant in cap-string form: either "profile NAME" or "allow"
+ * plus at most one of service/module/command, then the argument
+ * constraints (introduced by " with" unless this is a profile), the
+ * permission mask if non-zero, and the network restriction if any.
+ */
+std::ostream& operator<<(std::ostream& out, const MgrCapGrant& m) {
+  const bool is_profile = !m.profile.empty();
+
+  if (is_profile) {
+    out << "profile " << maybe_quote_string(m.profile);
+  } else {
+    out << "allow";
+    // service wins over module, module over command
+    if (!m.service.empty()) {
+      out << " service " << maybe_quote_string(m.service);
+    } else if (!m.module.empty()) {
+      out << " module " << maybe_quote_string(m.module);
+    } else if (!m.command.empty()) {
+      out << " command " << maybe_quote_string(m.command);
+    }
+  }
+
+  if (!m.arguments.empty()) {
+    out << (is_profile ? "" : " with");
+    for (const auto& entry : m.arguments) {
+      out << " " << maybe_quote_string(entry.first) << entry.second;
+    }
+  }
+
+  if (m.allow != 0) {
+    out << " " << m.allow;
+  }
+
+  if (!m.network.empty()) {
+    out << " network " << m.network;
+  }
+  return out;
+}
+
+// <magic>
+// fusion lets us easily populate structs via the qi parser.
+
+typedef std::map<std::string, MgrCapGrantConstraint> kvmap;
+
+BOOST_FUSION_ADAPT_STRUCT(MgrCapGrant,
+ (std::string, service)
+ (std::string, module)
+ (std::string, profile)
+ (std::string, command)
+ (kvmap, arguments)
+ (mgr_rwxa_t, allow)
+ (std::string, network))
+
+BOOST_FUSION_ADAPT_STRUCT(MgrCapGrantConstraint,
+ (MgrCapGrantConstraint::MatchType, match_type)
+ (std::string, value))
+
+// </magic>
+
+/**
+ * Parse the textual `network` restriction with the global ::parse_network()
+ * helper, filling network_parsed and network_prefix and recording the
+ * helper's result in network_valid.
+ */
+void MgrCapGrant::parse_network() {
+ network_valid = ::parse_network(network.c_str(), &network_parsed,
+ &network_prefix);
+}
+
+/**
+ * Expand a `profile` grant into the explicit grants it stands for and
+ * cache them in profile_grants.
+ *
+ * Profiles that expand to nothing ("osd", "mds", an rbd profile with an
+ * unsupported key, or an unknown profile) leave the cache empty, so they
+ * are re-evaluated on every call.
+ *
+ * @param err optional stream; receives a message when the profile name is
+ *            not recognized or an rbd profile carries an argument key
+ *            other than "pool"/"namespace"
+ */
+void MgrCapGrant::expand_profile(std::ostream *err) const {
+  // only generate this list once
+  if (!profile_grants.empty()) {
+    return;
+  }
+
+  if (profile == "read-only") {
+    // grants READ-ONLY caps MGR-wide
+    profile_grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_R}});
+    return;
+  }
+
+  if (profile == "read-write") {
+    // grants READ-WRITE caps MGR-wide
+    profile_grants.push_back({{}, {}, {}, {}, {},
+                              mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}});
+    return;
+  }
+
+  if (profile == "crash") {
+    profile_grants.push_back({{}, {}, {}, "crash post", {}, {}});
+    return;
+  }
+
+  if (profile == "osd" || profile == "mds") {
+    // these are documented profiles (so we need to accept them as valid),
+    // but they currently don't do anything
+    return;
+  }
+
+  if (profile == "rbd" || profile == "rbd-read-only") {
+    Arguments filtered_arguments;
+    for (const auto& [key, constraint] : arguments) {
+      if (key == "pool" || key == "namespace") {
+        // `arguments` is const in this method, so the constraint can only
+        // be copied (the former std::move() silently degraded to a copy)
+        filtered_arguments[key] = constraint;
+      } else {
+        if (err != nullptr) {
+          *err << "profile '" << profile << "' does not recognize key '" << key
+               << "'";
+        }
+        return;
+      }
+    }
+
+    mgr_rwxa_t perms = mgr_rwxa_t{MGR_CAP_R};
+    if (profile == "rbd") {
+      perms = mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W};
+    }
+
+    // allow all 'rbd_support' commands (restricted by optional
+    // pool/namespace constraints)
+    profile_grants.push_back({{}, "rbd_support", {}, {},
+                              std::move(filtered_arguments), perms});
+    return;
+  }
+
+  if (err != nullptr) {
+    *err << "unrecognized profile '" << profile << "'";
+  }
+}
+
+/**
+ * Check the supplied arguments against this grant's constraints.
+ *
+ * Every constrained key must be present in `args` and its value must
+ * satisfy the constraint (exact match, prefix match, or full match of a
+ * POSIX extended regex). Keys in `args` without a constraint are ignored.
+ *
+ * @param args argument name -> value
+ * @return true if all constraints are satisfied; false on a missing key,
+ *         a mismatch, an invalid regex, or an unknown match type
+ */
+bool MgrCapGrant::validate_arguments(
+  const std::map<std::string, std::string>& args) const {
+  for (const auto& [key, constraint] : arguments) {
+    auto q = args.find(key);
+
+    // argument must be present if a constraint exists
+    if (q == args.end()) {
+      return false;
+    }
+
+    switch (constraint.match_type) {
+    case MgrCapGrantConstraint::MATCH_TYPE_EQUAL:
+      if (constraint.value != q->second)
+        return false;
+      break;
+    case MgrCapGrantConstraint::MATCH_TYPE_PREFIX:
+      // compare only the leading bytes; find() would keep scanning the
+      // whole value after a mismatch at position 0
+      if (q->second.compare(0, constraint.value.size(),
+                            constraint.value) != 0)
+        return false;
+      break;
+    case MgrCapGrantConstraint::MATCH_TYPE_REGEX:
+      try {
+        std::regex pattern(constraint.value, std::regex::extended);
+        if (!std::regex_match(q->second, pattern)) {
+          return false;
+        }
+      } catch(const std::regex_error&) {
+        // an unparsable pattern never matches
+        return false;
+      }
+      break;
+    default:
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Compute the permission bits this grant contributes to a request.
+ *
+ * Exactly one of the following applies, tested in this order: profile
+ * (union of the expanded grants), service, module, command, and finally
+ * the blanket grant.
+ *
+ * @param s    requested service
+ * @param m    requested module
+ * @param c    requested command
+ * @param args request arguments, checked against the grant's constraints
+ * @return the allowed bits; an empty mask when the grant does not match
+ */
+mgr_rwxa_t MgrCapGrant::get_allowed(
+  CephContext *cct, EntityName name, const std::string& s,
+  const std::string& m, const std::string& c,
+  const std::map<std::string, std::string>& args) const {
+  const mgr_rwxa_t nothing{};
+
+  if (!profile.empty()) {
+    expand_profile(nullptr);
+    mgr_rwxa_t combined;
+    for (const auto& expanded : profile_grants) {
+      combined = combined | expanded.get_allowed(cct, name, s, m, c, args);
+    }
+    return combined;
+  }
+
+  if (!service.empty()) {
+    return (service == s) ? allow : nothing;
+  }
+
+  if (!module.empty()) {
+    if (module != m) {
+      return nothing;
+    }
+    // don't test module arguments when validating a specific command
+    const bool must_check_args = c.empty();
+    if (must_check_args && !validate_arguments(args)) {
+      return nothing;
+    }
+    return allow;
+  }
+
+  if (!command.empty()) {
+    if (command == c && validate_arguments(args)) {
+      return mgr_rwxa_t{MGR_CAP_ANY};
+    }
+    return nothing;
+  }
+
+  return allow;
+}
+
+/**
+ * Print all grants of a capability, separated by ", ".
+ */
+std::ostream& operator<<(std::ostream&out, const MgrCap& m) {
+  auto it = m.grants.begin();
+  const auto last = m.grants.end();
+  if (it == last) {
+    return out;
+  }
+
+  out << *it;
+  for (++it; it != last; ++it) {
+    out << ", " << *it;
+  }
+  return out;
+}
+
+/**
+ * @return true if at least one grant is an unrestricted "allow *".
+ */
+bool MgrCap::is_allow_all() const {
+  return std::any_of(grants.begin(), grants.end(),
+                     [](const MgrCapGrant& g) { return g.is_allow_all(); });
+}
+
+/**
+ * Replace all grants with a single blanket MGR_CAP_ANY grant and set the
+ * textual form to "allow *".
+ */
+void MgrCap::set_allow_all() {
+  grants.clear();
+  grants.emplace_back(std::string(), std::string(), std::string(),
+                      std::string(), MgrCapGrant::Arguments(),
+                      mgr_rwxa_t{MGR_CAP_ANY});
+  text = "allow *";
+}
+
+/**
+ * Decide whether this capability permits an operation.
+ *
+ * Grants are evaluated in order and their permission bits accumulate; the
+ * check succeeds as soon as a blanket "allow *" grant is seen or the
+ * accumulated bits cover every requested access mode.
+ *
+ * @param cct          context used for debug logging; may be null
+ * @param name         requesting entity, forwarded to the grants
+ * @param service      service name
+ * @param module       module name
+ * @param command      command name
+ * @param command_args command arguments
+ * @param op_may_read  operation needs read permission
+ * @param op_may_write operation needs write permission
+ * @param op_may_exec  operation needs exec permission
+ * @param addr         peer address, matched against network-restricted grants
+ * @return true if the operation is allowed
+ */
+bool MgrCap::is_capable(
+ CephContext *cct,
+ EntityName name,
+ const std::string& service,
+ const std::string& module,
+ const std::string& command,
+ const std::map<std::string, std::string>& command_args,
+ bool op_may_read, bool op_may_write, bool op_may_exec,
+ const entity_addr_t& addr) const {
+ if (cct) {
+ ldout(cct, 20) << "is_capable service=" << service << " "
+ << "module=" << module << " "
+ << "command=" << command
+ << (op_may_read ? " read":"")
+ << (op_may_write ? " write":"")
+ << (op_may_exec ? " exec":"")
+ << " addr " << addr
+ << " on cap " << *this
+ << dendl;
+ }
+
+ // bits granted so far, accumulated across all applicable grants
+ mgr_rwxa_t allow;
+ for (auto& grant : grants) {
+ if (cct)
+ ldout(cct, 20) << " allow so far " << allow << ", doing grant " << grant
+ << dendl;
+
+ // skip network-restricted grants whose network failed to parse or does
+ // not contain the peer address
+ if (grant.network.size() &&
+ (!grant.network_valid ||
+ !network_contains(grant.network_parsed,
+ grant.network_prefix,
+ addr))) {
+ continue;
+ }
+
+ if (grant.is_allow_all()) {
+ if (cct) {
+ ldout(cct, 20) << " allow all" << dendl;
+ }
+ return true;
+ }
+
+ // check enumerated caps
+ allow = allow | grant.get_allowed(cct, name, service, module, command,
+ command_args);
+ // each requested mode must be covered; modes not requested are ignored
+ if ((!op_may_read || (allow & MGR_CAP_R)) &&
+ (!op_may_write || (allow & MGR_CAP_W)) &&
+ (!op_may_exec || (allow & MGR_CAP_X))) {
+ if (cct) {
+ ldout(cct, 20) << " match" << dendl;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * Encode the capability. Only the textual form is written; the grants
+ * are rebuilt from it by decode().
+ */
+void MgrCap::encode(ceph::buffer::list& bl) const {
+ // remain backwards compatible w/ MgrCap
+ ENCODE_START(4, 4, bl);
+ encode(text, bl);
+ ENCODE_FINISH(bl);
+}
+
+/**
+ * Decode the textual form and re-parse it into grants.
+ *
+ * NOTE(review): the result of parse() is ignored here, so a string that
+ * fails to parse is not reported to the caller.
+ */
+void MgrCap::decode(ceph::buffer::list::const_iterator& bl) {
+ // remain backwards compatible w/ MgrCap
+ std::string s;
+ DECODE_START(4, bl);
+ decode(s, bl);
+ DECODE_FINISH(bl);
+ parse(s, NULL);
+}
+
+/**
+ * Dump the capability as a single string field named "text".
+ */
+void MgrCap::dump(ceph::Formatter *f) const {
+ f->dump_string("text", text);
+}
+
+/**
+ * Append sample instances to `ls`: one default-constructed capability
+ * followed by one per representative cap string. The caller owns the
+ * allocated objects.
+ */
+void MgrCap::generate_test_instances(std::list<MgrCap*>& ls) {
+  // the unparsed, default-constructed instance comes first
+  ls.push_back(new MgrCap);
+
+  static const char *const cap_texts[] = {
+    "allow *",
+    "allow rwx",
+    "allow service foo x",
+    "allow command bar x",
+    "allow service foo r, allow command bar x",
+    "allow command bar with k1=v1 x",
+    "allow command bar with k1=v1 k2=v2 x",
+    "allow module bar with k1=v1 k2=v2 x",
+    "profile rbd pool=rbd",
+  };
+  for (const char *cap_text : cap_texts) {
+    ls.push_back(new MgrCap);
+    ls.back()->parse(cap_text);
+  }
+}
+
+// grammar
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+/**
+ * Boost.Spirit Qi grammar for mgr capability strings.
+ *
+ * A capability is a list of grants separated by ';' or ','. Each grant
+ * rule synthesizes a MgrCapGrant through the fusion adaptation, so the
+ * attributes of every *_match rule are listed in the adapted member order:
+ * service, module, profile, command, arguments, allow, network. Members a
+ * given form does not set are filled with qi::attr() placeholders.
+ */
+template <typename Iterator>
+struct MgrCapParser : qi::grammar<Iterator, MgrCap()> {
+ MgrCapParser() : MgrCapParser::base_type(mgrcap) {
+ using qi::char_;
+ using qi::int_;
+ using qi::ulong_long;
+ using qi::lexeme;
+ using qi::alnum;
+ using qi::_val;
+ using qi::_1;
+ using qi::_2;
+ using qi::_3;
+ using qi::eps;
+ using qi::lit;
+
+ // a string is either quoted ("..." or '...', non-empty, no escapes) or a
+ // bare word made of letters, digits and "_./-"
+ quoted_string %=
+ lexeme['"' >> +(char_ - '"') >> '"'] |
+ lexeme['\'' >> +(char_ - '\'') >> '\''];
+ unquoted_word %= +char_("a-zA-Z0-9_./-");
+ str %= quoted_string | unquoted_word;
+ network_str %= +char_("/.:a-fA-F0-9][");
+
+ spaces = +(lit(' ') | lit('\n') | lit('\t'));
+
+ // key <=|prefix|regex> value[ ...]
+ str_match = -spaces >> lit('=') >> -spaces >>
+ qi::attr(MgrCapGrantConstraint::MATCH_TYPE_EQUAL) >> str;
+ str_prefix = spaces >> lit("prefix") >> spaces >>
+ qi::attr(MgrCapGrantConstraint::MATCH_TYPE_PREFIX) >> str;
+ str_regex = spaces >> lit("regex") >> spaces >>
+ qi::attr(MgrCapGrantConstraint::MATCH_TYPE_REGEX) >> str;
+ kv_pair = str >> (str_match | str_prefix | str_regex);
+ kv_map %= kv_pair >> *(spaces >> kv_pair);
+
+ // command := command[=]cmd [k1=v1 k2=v2 ...]
+ command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces)
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> str
+ >> -(spaces >> lit("with") >> spaces >> kv_map)
+ >> qi::attr(0)
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // service foo rwxa
+ service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces)
+ >> str
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::map<std::string, MgrCapGrantConstraint>())
+ >> spaces >> rwxa
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // module foo rwxa
+ module_match %= -spaces >> lit("allow") >> spaces >> lit("module") >> (lit('=') | spaces)
+ >> qi::attr(std::string())
+ >> str
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> -(spaces >> lit("with") >> spaces >> kv_map)
+ >> spaces >> rwxa
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // profile foo
+ profile_match %= -spaces >> -(lit("allow") >> spaces)
+ >> lit("profile") >> (lit('=') | spaces)
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> str
+ >> qi::attr(std::string())
+ >> -(spaces >> kv_map)
+ >> qi::attr(0)
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // rwxa
+ rwxa_match %= -spaces >> lit("allow") >> spaces
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::string())
+ >> qi::attr(std::map<std::string,MgrCapGrantConstraint>())
+ >> rwxa
+ >> -(spaces >> lit("network") >> spaces >> network_str);
+
+ // rwxa := * | [r][w][x]
+ rwxa =
+ (lit("*")[_val = MGR_CAP_ANY]) |
+ (lit("all")[_val = MGR_CAP_ANY]) |
+ ( eps[_val = 0] >>
+ ( lit('r')[_val |= MGR_CAP_R] ||
+ lit('w')[_val |= MGR_CAP_W] ||
+ lit('x')[_val |= MGR_CAP_X]
+ )
+ );
+
+ // grant := allow ...
+ // alternatives are tried in the order listed
+ grant = -spaces >> (rwxa_match | profile_match | service_match |
+ module_match | command_match) >> -spaces;
+
+ // mgrcap := grant [grant ...]
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' ')));
+ mgrcap = grants [_val = phoenix::construct<MgrCap>(_1)];
+ }
+
+ qi::rule<Iterator> spaces;
+ qi::rule<Iterator, unsigned()> rwxa;
+ qi::rule<Iterator, std::string()> quoted_string;
+ qi::rule<Iterator, std::string()> unquoted_word;
+ qi::rule<Iterator, std::string()> str, network_str;
+
+ qi::rule<Iterator, MgrCapGrantConstraint()> str_match, str_prefix, str_regex;
+ qi::rule<Iterator, std::pair<std::string, MgrCapGrantConstraint>()> kv_pair;
+ qi::rule<Iterator, std::map<std::string, MgrCapGrantConstraint>()> kv_map;
+
+ qi::rule<Iterator, MgrCapGrant()> rwxa_match;
+ qi::rule<Iterator, MgrCapGrant()> command_match;
+ qi::rule<Iterator, MgrCapGrant()> service_match;
+ qi::rule<Iterator, MgrCapGrant()> module_match;
+ qi::rule<Iterator, MgrCapGrant()> profile_match;
+ qi::rule<Iterator, MgrCapGrant()> grant;
+ qi::rule<Iterator, std::vector<MgrCapGrant>()> grants;
+ qi::rule<Iterator, MgrCap()> mgrcap;
+};
+
+/**
+ * Parse a capability string into this object.
+ *
+ * On success `text` is set to `str`, every grant's network restriction is
+ * parsed and every profile grant is expanded.
+ *
+ * @param str capability string
+ * @param err optional stream receiving a description of the failure
+ * @return true if the whole string parsed and all profiles were accepted
+ *
+ * NOTE(review): when the grammar accepts the string but profile expansion
+ * reports an error, false is returned while `text` and `grants` keep the
+ * parsed content; only a grammar failure clears `grants`.
+ */
+bool MgrCap::parse(const std::string& str, std::ostream *err) {
+ auto iter = str.begin();
+ auto end = str.end();
+
+ MgrCapParser<std::string::const_iterator> exp;
+ bool r = qi::parse(iter, end, exp, *this);
+ // success requires the parser to have consumed the entire input
+ if (r && iter == end) {
+ text = str;
+
+ // collects messages from expand_profile() across all grants
+ std::stringstream profile_err;
+ for (auto& g : grants) {
+ g.parse_network();
+
+ if (!g.profile.empty()) {
+ g.expand_profile(&profile_err);
+ }
+ }
+
+ if (!profile_err.str().empty()) {
+ if (err != nullptr) {
+ *err << "mgr capability parse failed during profile evaluation: "
+ << profile_err.str();
+ }
+ return false;
+ }
+ return true;
+ }
+
+ // Make sure no grants are kept after parsing failed!
+ grants.clear();
+
+ if (err) {
+ if (iter != end)
+ *err << "mgr capability parse failed, stopped at '"
+ << std::string(iter, end) << "' of '" << str << "'";
+ else
+ *err << "mgr capability parse failed, stopped at end of '" << str << "'";
+ }
+
+ return false;
+}
diff --git a/src/mgr/MgrCap.h b/src/mgr/MgrCap.h
new file mode 100644
index 000000000..f7a8bd5f8
--- /dev/null
+++ b/src/mgr/MgrCap.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGRCAP_H
+#define CEPH_MGRCAP_H
+
+#include <iosfwd>
+
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/entity_name.h"
+
+static const __u8 MGR_CAP_R = (1 << 1); // read
+static const __u8 MGR_CAP_W = (1 << 2); // write
+static const __u8 MGR_CAP_X = (1 << 3); // execute
+static const __u8 MGR_CAP_ANY = 0xff; // *
+
+/**
+ * Permission bitmask built from the MGR_CAP_* constants. Construction from
+ * a raw value is explicit, while assignment from and conversion to __u8
+ * are implicit.
+ */
+struct mgr_rwxa_t {
+ __u8 val = 0U;
+
+ mgr_rwxa_t() {}
+ explicit mgr_rwxa_t(__u8 v) : val(v) {}
+
+ // assign a raw bitmask
+ mgr_rwxa_t& operator=(__u8 v) {
+ val = v;
+ return *this;
+ }
+ // read the raw bitmask; lets the type take part in &, | and comparisons
+ operator __u8() const {
+ return val;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p);
+
+/**
+ * A constraint on one argument of a grant: how the argument's value is
+ * compared (`match_type`) and what it is compared against (`value`).
+ */
+struct MgrCapGrantConstraint {
+  enum MatchType {
+    MATCH_TYPE_NONE,
+    MATCH_TYPE_EQUAL,
+    MATCH_TYPE_PREFIX,
+    MATCH_TYPE_REGEX
+  };
+
+  MatchType match_type = MATCH_TYPE_NONE;
+  std::string value;
+
+  MgrCapGrantConstraint() {}
+  // `value` is a by-value sink parameter: move it into the member instead
+  // of copying it a second time
+  MgrCapGrantConstraint(MatchType match_type, std::string value)
+    : match_type(match_type), value(std::move(value)) {
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c);
+
+struct MgrCapGrant {
+ /*
+ * A grant can come in one of five forms:
+ *
+ * - a blanket allow ('allow rw', 'allow *')
+ * - this will match against any service and the read/write/exec flags
+ * in the mgr code. semantics of what X means are somewhat ad hoc.
+ *
+ * - a service allow ('allow service mds rw')
+ * - this will match against a specific service and the r/w/x flags.
+ *
+ * - a module allow ('allow module rbd_support rw, allow module rbd_support with pool=rbd rw')
+ * - this will match against a specific python add-on module and the r/w/x
+ * flags.
+ *
+ * - a profile ('profile read-only, profile rbd pool=rbd')
+ * - this will match against specific MGR-enforced semantics of what
+ * this type of user should need to do. examples include 'read-write',
+ * 'read-only', 'crash'.
+ *
+ * - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2')
+ * this includes the command name (the prefix string)
+ *
+ * The command, module, and profile caps can also accept an optional
+ * key/value map. If not provided, all command arguments and module
+ * meta-arguments are allowed. If a key/value pair is specified, that
+ * argument must be present and must match the provided constraint.
+ */
+ typedef std::map<std::string, MgrCapGrantConstraint> Arguments;
+
+ // which form the grant takes; get_allowed() tests them in the order
+ // profile, service, module, command
+ std::string service;
+ std::string module;
+ std::string profile;
+ std::string command;
+ Arguments arguments;
+
+ // restrict by network
+ std::string network;
+
+ // these are filled in by parse_network(), called by MgrCap::parse()
+ entity_addr_t network_parsed;
+ unsigned network_prefix = 0;
+ bool network_valid = true;
+
+ void parse_network();
+
+ // permission bits granted (unused by command grants, which always yield
+ // MGR_CAP_ANY when they match)
+ mgr_rwxa_t allow;
+
+ // explicit grants that a profile grant expands to; populated as
+ // needed by expand_profile() (via get_allowed() and MgrCap::parse())
+ // and cached here.
+ mutable std::list<MgrCapGrant> profile_grants;
+
+ void expand_profile(std::ostream *err=nullptr) const;
+
+ MgrCapGrant() : allow(0) {}
+ MgrCapGrant(std::string&& service,
+ std::string&& module,
+ std::string&& profile,
+ std::string&& command,
+ Arguments&& arguments,
+ mgr_rwxa_t allow)
+ : service(std::move(service)), module(std::move(module)),
+ profile(std::move(profile)), command(std::move(command)),
+ arguments(std::move(arguments)), allow(allow) {
+ }
+
+ // true if every constrained argument is present and satisfies its
+ // constraint
+ bool validate_arguments(
+ const std::map<std::string, std::string>& arguments) const;
+
+ /**
+ * check if given request parameters match our constraints
+ *
+ * @param cct context
+ * @param name entity name
+ * @param service service (if any)
+ * @param module module (if any)
+ * @param command command (if any)
+ * @param arguments profile/module/command args (if any)
+ * @return bits we allow
+ */
+ mgr_rwxa_t get_allowed(
+ CephContext *cct,
+ EntityName name,
+ const std::string& service,
+ const std::string& module,
+ const std::string& command,
+ const std::map<std::string, std::string>& arguments) const;
+
+ // a blanket 'allow *': MGR_CAP_ANY with no service/module/profile/command
+ bool is_allow_all() const {
+ return (allow == MGR_CAP_ANY &&
+ service.empty() &&
+ module.empty() &&
+ profile.empty() &&
+ command.empty());
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const MgrCapGrant& g);
+
+/**
+ * A parsed mgr capability: the original text plus the grants it contains.
+ * Only `text` is encoded; decode() rebuilds `grants` by re-parsing it.
+ */
+struct MgrCap {
+ std::string text;
+ std::vector<MgrCapGrant> grants;
+
+ MgrCap() {}
+ // builds a capability from grants only; `text` stays empty
+ explicit MgrCap(const std::vector<MgrCapGrant> &g) : grants(g) {}
+
+ std::string get_str() const {
+ return text;
+ }
+
+ // true if any grant is a blanket 'allow *'
+ bool is_allow_all() const;
+ // replace all grants with a single 'allow *'
+ void set_allow_all();
+ // parse `str` into text/grants; returns false and, if `err` is given,
+ // describes the problem there on failure
+ bool parse(const std::string& str, std::ostream *err=NULL);
+
+ /**
+ * check if we are capable of something
+ *
+ * This method actually checks a description of a particular operation against
+ * what the capability has specified.
+ *
+ * @param cct context used for debug logging (may be null)
+ * @param name entity name of the requester
+ * @param service service name
+ * @param module module name
+ * @param command command id
+ * @param arguments
+ * @param op_may_read whether the operation may need to read
+ * @param op_may_write whether the operation may need to write
+ * @param op_may_exec whether the operation may exec
+ * @param addr peer address, checked against network-restricted grants
+ * @return true if the operation is allowed, false otherwise
+ */
+ bool is_capable(CephContext *cct,
+ EntityName name,
+ const std::string& service,
+ const std::string& module,
+ const std::string& command,
+ const std::map<std::string, std::string>& arguments,
+ bool op_may_read, bool op_may_write, bool op_may_exec,
+ const entity_addr_t& addr) const;
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<MgrCap*>& ls);
+};
+WRITE_CLASS_ENCODER(MgrCap)
+
+std::ostream& operator<<(std::ostream& out, const MgrCap& cap);
+
+#endif // CEPH_MGRCAP_H
diff --git a/src/mgr/MgrClient.cc b/src/mgr/MgrClient.cc
new file mode 100644
index 000000000..6230b3387
--- /dev/null
+++ b/src/mgr/MgrClient.cc
@@ -0,0 +1,662 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "MgrClient.h"
+
+#include "mgr/MgrContext.h"
+#include "mon/MonMap.h"
+
+#include "msg/Messenger.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrReport.h"
+#include "messages/MMgrOpen.h"
+#include "messages/MMgrUpdate.h"
+#include "messages/MMgrClose.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MMgrCommand.h"
+#include "messages/MMgrCommandReply.h"
+#include "messages/MPGStats.h"
+
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::make_message;
+using ceph::ref_cast;
+using ceph::ref_t;
+
+#define dout_subsys ceph_subsys_mgrc
+#undef dout_prefix
+#define dout_prefix *_dout << "mgrc " << __func__ << " "
+
+ MgrClient::MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap_) // Stores the supplied pointers; cct_ must be non-null
+ : Dispatcher(cct_),
+ cct(cct_),
+ msgr(msgr_),
+ monmap(monmap_),
+ timer(cct_, lock) // the timer is constructed with this client's lock
+ {
+ ceph_assert(cct != nullptr); // msgr is not checked here; init() asserts it
+ }
+
+ void MgrClient::init() // Starts the timer and marks the client as initialized
+ {
+ std::lock_guard l(lock);
+ // A messenger must have been supplied (constructor or set_messenger()) by now.
+ ceph_assert(msgr != nullptr);
+
+ timer.init();
+ initialized = true; // reported by is_initialized()
+ }
+
+ void MgrClient::shutdown() // Cancels the retry event, drops pending commands, optionally sends MMgrClose, then stops the timer and closes the session
+ {
+ std::unique_lock l(lock); // unique_lock (not lock_guard) because shutdown_cond.wait_for() below takes it
+ ldout(cct, 10) << dendl;
+ // Drop any scheduled reconnect retry.
+ if (connect_retry_callback) {
+ timer.cancel_event(connect_retry_callback);
+ connect_retry_callback = nullptr;
+ }
+
+ // forget about in-flight commands if we are prematurely shut down
+ // (e.g., by control-C)
+ command_table.clear();
+ if (service_daemon &&
+ session &&
+ session->con &&
+ HAVE_FEATURE(session->con->get_features(), SERVER_MIMIC)) { // a close is sent only by a registered service daemon to a peer advertising SERVER_MIMIC
+ ldout(cct, 10) << "closing mgr session" << dendl;
+ auto m = make_message<MMgrClose>();
+ m->daemon_name = daemon_name;
+ m->service_name = service_name;
+ session->con->send_message2(m);
+ auto timeout = ceph::make_timespan(cct->_conf.get_val<double>(
+ "mgr_client_service_daemon_unregister_timeout"));
+ shutdown_cond.wait_for(l, timeout); // handle_mgr_close() calls notify_all() on this condition; otherwise gives up after the timeout
+ }
+ // Stop the timer, then mark the connection down and drop the session.
+ timer.shutdown();
+ if (session) {
+ session->con->mark_down();
+ session.reset();
+ }
+ }
+
+ // Messenger dispatch entry point. Takes the client lock and routes mgr
+ // map/configure/close messages and command replies to their handlers.
+ // Command replies are accepted only when their source is a mgr; every other
+ // message type is logged at level 30 and reported as not handled (false).
+ bool MgrClient::ms_dispatch2(const ref_t<Message>& m)
+ {
+   std::lock_guard l(lock);
+
+   switch (m->get_type()) {
+   case MSG_MGR_MAP:
+     return handle_mgr_map(ref_cast<MMgrMap>(m));
+
+   case MSG_MGR_CONFIGURE:
+     return handle_mgr_configure(ref_cast<MMgrConfigure>(m));
+
+   case MSG_MGR_CLOSE:
+     return handle_mgr_close(ref_cast<MMgrClose>(m));
+
+   case MSG_COMMAND_REPLY: {
+     if (m->get_source().type() != CEPH_ENTITY_TYPE_MGR) {
+       return false;
+     }
+     MCommandReply *reply = static_cast<MCommandReply*>(m.get());
+     handle_command_reply(reply->get_tid(), reply->get_data(),
+                          reply->rs, reply->r);
+     return true;
+   }
+
+   case MSG_MGR_COMMAND_REPLY: {
+     if (m->get_source().type() != CEPH_ENTITY_TYPE_MGR) {
+       return false;
+     }
+     MMgrCommandReply *reply = static_cast<MMgrCommandReply*>(m.get());
+     handle_command_reply(reply->get_tid(), reply->get_data(),
+                          reply->rs, reply->r);
+     return true;
+   }
+
+   default:
+     ldout(cct, 30) << "Not handling " << *m << dendl;
+     return false;
+   }
+ }
+
+ void MgrClient::reconnect() // Drops the current session and, if an active mgr is known and the retry interval has passed, opens a new one and resends pending commands; caller holds lock
+ {
+ ceph_assert(ceph_mutex_is_locked_by_me(lock));
+ // Tear down the existing session (if any) and cancel its periodic report.
+ if (session) {
+ ldout(cct, 4) << "Terminating session with "
+ << session->con->get_peer_addr() << dendl;
+ session->con->mark_down();
+ session.reset();
+ stats_period = 0; // handle_mgr_configure() restarts reporting when it next sees a non-zero period
+ if (report_callback != nullptr) {
+ timer.cancel_event(report_callback);
+ report_callback = nullptr;
+ }
+ }
+
+ if (!map.get_available()) {
+ ldout(cct, 4) << "No active mgr available yet" << dendl;
+ return;
+ }
+ // Rate-limit: if the previous attempt is newer than mgr_connect_retry_interval, schedule a retry instead of connecting now.
+ if (!clock_t::is_zero(last_connect_attempt)) {
+ auto now = clock_t::now();
+ auto when = last_connect_attempt +
+ ceph::make_timespan(
+ cct->_conf.get_val<double>("mgr_connect_retry_interval"));
+ if (now < when) {
+ if (!connect_retry_callback) { // at most one retry event is kept scheduled
+ connect_retry_callback = timer.add_event_at(
+ when,
+ new LambdaContext([this](int r){
+ connect_retry_callback = nullptr;
+ reconnect();
+ }));
+ }
+ ldout(cct, 4) << "waiting to retry connect until " << when << dendl;
+ return;
+ }
+ }
+ // Connecting now, so a still-scheduled retry is no longer needed.
+ if (connect_retry_callback) {
+ timer.cancel_event(connect_retry_callback);
+ connect_retry_callback = nullptr;
+ }
+
+ ldout(cct, 4) << "Starting new session with " << map.get_active_addrs()
+ << dendl;
+ last_connect_attempt = clock_t::now();
+
+ session.reset(new MgrSessionState());
+ session->con = msgr->connect_to(CEPH_ENTITY_TYPE_MGR,
+ map.get_active_addrs());
+ // Flag status as dirty so _send_report() includes it in the next report on the new session.
+ if (service_daemon) {
+ daemon_dirty_status = true;
+ }
+ task_dirty_status = true;
+
+ // Don't send an open if we're just a client (i.e. doing
+ // command-sending, not stats etc)
+ if (msgr->get_mytype() != CEPH_ENTITY_TYPE_CLIENT || service_daemon) {
+ _send_open();
+ }
+
+ // resend any pending commands
+ auto p = command_table.get_commands().begin();
+ while (p != command_table.get_commands().end()) {
+ auto tid = p->first;
+ auto& op = p->second;
+ ldout(cct,10) << "resending " << tid << (op.tell ? " (tell)":" (cli)") << dendl;
+ MessageRef m;
+ if (op.tell) {
+ if (op.name.size() && op.name != map.active_name) { // a tell aimed at a named mgr that is not the active one is failed with -ENXIO rather than resent
+ ldout(cct, 10) << "active mgr " << map.active_name << " != target "
+ << op.name << dendl;
+ if (op.on_finish) {
+ op.on_finish->complete(-ENXIO);
+ }
+ ++p; // advance before erase(tid); presumably erase would invalidate an iterator to this entry
+ command_table.erase(tid);
+ continue;
+ }
+ // Set fsid argument to signal that this is really a tell message (and
+ // we are not a legacy client sending a non-tell command via MCommand).
+ m = op.get_message(monmap->fsid, false);
+ } else {
+ m = op.get_message(
+ {},
+ HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS));
+ }
+ ceph_assert(session);
+ ceph_assert(session->con);
+ session->con->send_message2(std::move(m));
+ ++p;
+ }
+ }
+
+ // Send an MMgrOpen on the current session connection. Does nothing when
+ // there is no session or no connection. The message carries the
+ // service/daemon identity, the daemon metadata (service daemons only), the
+ // current config and the config defaults.
+ void MgrClient::_send_open()
+ {
+   if (!session || !session->con) {
+     return;
+   }
+
+   auto msg = make_message<MMgrOpen>();
+
+   // Without a registered service name, identify by the configured entity id.
+   if (service_name.empty()) {
+     msg->daemon_name = cct->_conf->name.get_id();
+   } else {
+     msg->service_name = service_name;
+     msg->daemon_name = daemon_name;
+   }
+
+   if (service_daemon) {
+     msg->service_daemon = service_daemon;
+     msg->daemon_metadata = daemon_metadata;
+   }
+
+   // Version 0 is passed as the starting point; the version returned is
+   // remembered in last_config_bl_version for later reports.
+   cct->_conf.get_config_bl(0, &msg->config_bl, &last_config_bl_version);
+   cct->_conf.get_defaults_bl(&msg->config_defaults_bl);
+
+   session->con->send_message2(msg);
+ }
+
+ // Send an MMgrUpdate on the current session connection. Does nothing when
+ // there is no session or no connection. Daemon metadata is attached only
+ // while need_metadata_update is set; the flag itself is always copied into
+ // the message.
+ void MgrClient::_send_update()
+ {
+   if (!session || !session->con) {
+     return;
+   }
+
+   auto msg = make_message<MMgrUpdate>();
+
+   // Without a registered service name, identify by the configured entity id.
+   if (service_name.empty()) {
+     msg->daemon_name = cct->_conf->name.get_id();
+   } else {
+     msg->service_name = service_name;
+     msg->daemon_name = daemon_name;
+   }
+
+   if (need_metadata_update) {
+     msg->daemon_metadata = daemon_metadata;
+   }
+   msg->need_metadata_update = need_metadata_update;
+
+   session->con->send_message2(msg);
+ }
+
+ // Adopt a new MgrMap. If there is no session yet, or the session's peer
+ // addresses differ from the map's active mgr addresses, reconnect.
+ // Caller must hold lock. Always returns true.
+ bool MgrClient::handle_mgr_map(ref_t<MMgrMap> m)
+ {
+   ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+   ldout(cct, 20) << *m << dendl;
+
+   map = m->get_map();
+   ldout(cct, 4) << "Got map version " << map.epoch << dendl;
+
+   ldout(cct, 4) << "Active mgr is now " << map.get_active_addrs() << dendl;
+
+   // Reset session?
+   const bool stale_session =
+     !session ||
+     session->con->get_peer_addrs() != map.get_active_addrs();
+   if (stale_session) {
+     reconnect();
+   }
+
+   return true;
+ }
+
+ // Connection-reset notification. Only a reset of the current session's
+ // connection is acted upon (by reconnecting); anything else returns false.
+ bool MgrClient::ms_handle_reset(Connection *con)
+ {
+   std::lock_guard l(lock);
+
+   const bool ours = session && con == session->con;
+   if (!ours) {
+     return false;
+   }
+
+   ldout(cct, 4) << __func__ << " con " << con << dendl;
+   reconnect();
+   return true;
+ }
+
+ bool MgrClient::ms_handle_refused(Connection *con) // Connection-refused notification; con is unused
+ {
+ // do nothing for now
+ return false; // always returns false
+ }
+
+ // Send one report plus pg stats, then re-arm the timer so this runs again
+ // after stats_period. With stats_period == 0 nothing is re-armed and
+ // reporting stops.
+ void MgrClient::_send_stats()
+ {
+   _send_report();
+   _send_pgstats();
+
+   if (stats_period == 0) {
+     return;
+   }
+
+   auto *again = new LambdaContext([this](int) {
+     _send_stats();
+   });
+   report_callback = timer.add_event_after(stats_period, again);
+ }
+
+ void MgrClient::_send_report() // Builds an MMgrReport (perf counter schema changes, packed counter values, status, health metrics, config) and sends it on the session connection; caller holds lock and a session must exist
+ {
+ ceph_assert(ceph_mutex_is_locked_by_me(lock));
+ ceph_assert(session);
+ report_callback = nullptr; // _send_stats() sets it again if it re-arms the timer
+
+ auto report = make_message<MMgrReport>();
+ auto pcc = cct->get_perfcounters_collection();
+
+ pcc->with_counters([this, report](
+ const PerfCountersCollectionImpl::CounterMap &by_path)
+ {
+ // Helper for checking whether a counter should be included
+ auto include_counter = [this](
+ const PerfCounters::perf_counter_data_any_d &ctr,
+ const PerfCounters &perf_counters)
+ {
+ return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold;
+ };
+
+ // Helper for cases where we want to forget a counter
+ auto undeclare = [report, this](const std::string &path)
+ {
+ report->undeclare_types.push_back(path);
+ ldout(cct,20) << " undeclare " << path << dendl;
+ session->declared.erase(path);
+ };
+
+ ENCODE_START(1, 1, report->packed);
+
+ // Find counters that no longer exist, and undeclare them
+ for (auto p = session->declared.begin(); p != session->declared.end(); ) {
+ const auto &path = *(p++); // post-increment: p is already past this entry when undeclare() erases it
+ if (by_path.count(path) == 0) {
+ undeclare(path);
+ }
+ }
+
+ for (const auto &i : by_path) {
+ auto& path = i.first;
+ auto& data = *(i.second.data);
+ auto& perf_counters = *(i.second.perf_counters);
+
+ // Find counters that still exist, but are no longer permitted by
+ // stats_threshold
+ if (!include_counter(data, perf_counters)) {
+ if (session->declared.count(path)) {
+ undeclare(path);
+ }
+ continue;
+ }
+
+ if (session->declared.count(path) == 0) { // first time this counter is reported on this session: send its schema
+ ldout(cct,20) << " declare " << path << dendl;
+ PerfCounterType type;
+ type.path = path;
+ if (data.description) {
+ type.description = data.description;
+ }
+ if (data.nick) {
+ type.nick = data.nick;
+ }
+ type.type = data.type;
+ type.priority = perf_counters.get_adjusted_priority(data.prio);
+ type.unit = data.unit;
+ report->declare_types.push_back(std::move(type));
+ session->declared.insert(path);
+ }
+ // Values are appended in by_path iteration order without their paths.
+ encode(static_cast<uint64_t>(data.u64), report->packed);
+ if (data.type & PERFCOUNTER_LONGRUNAVG) {
+ encode(static_cast<uint64_t>(data.avgcount), report->packed);
+ encode(static_cast<uint64_t>(data.avgcount2), report->packed);
+ }
+ }
+ ENCODE_FINISH(report->packed);
+
+ ldout(cct, 20) << "sending " << session->declared.size() << " counters ("
+ "of possible " << by_path.size() << "), "
+ << report->declare_types.size() << " new, "
+ << report->undeclare_types.size() << " removed"
+ << dendl;
+ });
+
+ ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl;
+
+ if (daemon_name.size()) {
+ report->daemon_name = daemon_name;
+ } else {
+ report->daemon_name = cct->_conf->name.get_id();
+ }
+ report->service_name = service_name;
+ // Status maps are attached only when flagged dirty; the flag is cleared once attached.
+ if (daemon_dirty_status) {
+ report->daemon_status = daemon_status;
+ daemon_dirty_status = false;
+ }
+
+ if (task_dirty_status) {
+ report->task_status = task_status;
+ task_dirty_status = false;
+ }
+
+ report->daemon_health_metrics = std::move(daemon_health_metrics); // moves the stored metrics into the report; the member is left moved-from until update_daemon_health() assigns it again
+
+ cct->_conf.get_config_bl(last_config_bl_version, &report->config_bl,
+ &last_config_bl_version);
+
+ if (get_perf_report_cb) {
+ MetricPayload payload = get_perf_report_cb();
+ MetricReportMessage message(payload);
+ report->metric_report_message = message;
+ }
+
+ session->con->send_message2(report);
+ }
+
+ void MgrClient::send_pgstats() // Public, locking wrapper around _send_pgstats()
+ {
+ std::lock_guard l(lock);
+ _send_pgstats();
+ }
+
+ // Send the message produced by pgstats_cb on the session connection.
+ // Does nothing unless both a callback and a session exist.
+ void MgrClient::_send_pgstats()
+ {
+   if (!pgstats_cb || !session) {
+     return;
+   }
+   session->con->send_message(pgstats_cb());
+ }
+
+ // Apply an MMgrConfigure: update the stats threshold, forward any perf
+ // metric query configuration to the registered callback, and adopt the new
+ // stats period. Reporting is kicked off when the period goes from zero to
+ // non-zero. A configure message that arrives without a session is dropped.
+ // Caller must hold lock. Always returns true.
+ bool MgrClient::handle_mgr_configure(ref_t<MMgrConfigure> m)
+ {
+   ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+   ldout(cct, 20) << *m << dendl;
+
+   if (!session) {
+     lderr(cct) << "dropping unexpected configure message" << dendl;
+     return true;
+   }
+
+   ldout(cct, 4) << "stats_period=" << m->stats_period << dendl;
+
+   if (m->stats_threshold != stats_threshold) {
+     ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl;
+     stats_threshold = m->stats_threshold;
+   }
+
+   // osd_perf_metric_queries takes precedence; metric_config_message is
+   // consulted only when the former is empty.
+   if (m->osd_perf_metric_queries.empty()) {
+     if (m->metric_config_message) {
+       const MetricConfigMessage &config = *m->metric_config_message;
+       boost::apply_visitor(HandlePayloadVisitor(this), config.payload);
+     }
+   } else {
+     handle_config_payload(m->osd_perf_metric_queries);
+   }
+
+   const bool was_stopped = (stats_period == 0);
+   const bool starting = was_stopped && (m->stats_period != 0);
+   stats_period = m->stats_period;
+   if (starting) {
+     _send_stats();
+   }
+
+   return true;
+ }
+
+ // Handle an MMgrClose from the mgr: clear the service-daemon flag and wake
+ // any thread waiting on shutdown_cond (shutdown() waits there after sending
+ // its own MMgrClose). The message content is not inspected.
+ // Caller must hold lock. Always returns true.
+ bool MgrClient::handle_mgr_close(ref_t<MMgrClose> m)
+ {
+   // Same precondition as the other message handlers: ms_dispatch2() calls
+   // this with lock held, and shutdown() reads service_daemon under that lock.
+   ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+   service_daemon = false;
+   shutdown_cond.notify_all();
+   return true;
+ }
+
+ // Queue a (non-tell) mgr command and, if a session is up, send it at once.
+ // Without a session the command stays in command_table; reconnect()
+ // resends pending commands when a session is established.
+ //
+ // Returns -EACCES without queueing anything when no MgrMap has been seen
+ // (epoch 0) and the mgr is optional; otherwise returns 0.
+ int MgrClient::start_command(const vector<string>& cmd, const bufferlist& inbl,
+                              bufferlist *outbl, string *outs,
+                              Context *onfinish)
+ {
+   std::lock_guard l(lock);
+
+   ldout(cct, 20) << "cmd: " << cmd << dendl;
+
+   if (map.epoch == 0 && mgr_optional) {
+     ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl;
+     return -EACCES;
+   }
+
+   auto &pending = command_table.start_command();
+   pending.cmd = cmd;
+   pending.inbl = inbl;
+   pending.outbl = outbl;
+   pending.outs = outs;
+   pending.on_finish = onfinish;
+
+   if (!session || !session->con) {
+     ldout(cct, 5) << "no mgr session (no running mgr daemon?), waiting" << dendl;
+     return 0;
+   }
+
+   // Leaving fsid argument null because it isn't used historically, and
+   // we can use it as a signal that we are sending a non-tell command.
+   auto msg = pending.get_message(
+     {},
+     HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS));
+   session->con->send_message2(std::move(msg));
+   return 0;
+ }
+
+ // Queue a tell command aimed at the mgr called `name` (an empty name
+ // matches whichever mgr is active) and send it immediately when a session
+ // is up and the target is the active mgr. Otherwise the command stays in
+ // command_table for reconnect() to deal with.
+ //
+ // Returns -EACCES without queueing anything when no MgrMap has been seen
+ // (epoch 0) and the mgr is optional; otherwise returns 0.
+ int MgrClient::start_tell_command(
+   const string& name,
+   const vector<string>& cmd, const bufferlist& inbl,
+   bufferlist *outbl, string *outs,
+   Context *onfinish)
+ {
+   std::lock_guard l(lock);
+
+   ldout(cct, 20) << "target: " << name << " cmd: " << cmd << dendl;
+
+   if (map.epoch == 0 && mgr_optional) {
+     ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl;
+     return -EACCES;
+   }
+
+   auto &pending = command_table.start_command();
+   pending.tell = true;
+   pending.name = name;
+   pending.cmd = cmd;
+   pending.inbl = inbl;
+   pending.outbl = outbl;
+   pending.outs = outs;
+   pending.on_finish = onfinish;
+
+   const bool can_send_now =
+     session && session->con &&
+     (name.size() == 0 || map.active_name == name);
+   if (!can_send_now) {
+     ldout(cct, 5) << "no mgr session (no running mgr daemon?), or "
+                   << name << " not active mgr, waiting" << dendl;
+     return 0;
+   }
+
+   // Set fsid argument to signal that this is really a tell message (and
+   // we are not a legacy client sending a non-tell command via MCommand).
+   auto msg = pending.get_message(monmap->fsid, false);
+   session->con->send_message2(std::move(msg));
+   return 0;
+ }
+
+ // Complete the pending command identified by `tid`: hand the reply payload
+ // and status string to the caller-supplied output slots, fire the
+ // completion with `r`, and remove the command from the table. A reply for
+ // an unknown tid is logged and ignored. Caller must hold lock.
+ // Always returns true.
+ bool MgrClient::handle_command_reply(
+   uint64_t tid,
+   bufferlist& data,
+   const std::string& rs,
+   int r)
+ {
+   ceph_assert(ceph_mutex_is_locked_by_me(lock));
+
+   ldout(cct, 20) << "tid " << tid << " r " << r << dendl;
+
+   const bool known = command_table.exists(tid);
+   if (!known) {
+     ldout(cct, 4) << "handle_command_reply tid " << tid
+                   << " not found" << dendl;
+     return true;
+   }
+
+   auto &cmd = command_table.get_command(tid);
+
+   // Each output slot is optional.
+   if (cmd.outbl) {
+     *cmd.outbl = std::move(data);
+   }
+   if (cmd.outs) {
+     *cmd.outs = rs;
+   }
+   if (cmd.on_finish) {
+     cmd.on_finish->complete(r);
+   }
+
+   command_table.erase(tid);
+   return true;
+ }
+
+ // Record service name, daemon name and metadata for a daemon that has not
+ // registered as a service daemon, and send them in an MMgrUpdate the first
+ // time non-empty metadata is supplied.
+ //
+ // Returns -EEXIST if service_daemon is already set, 0 otherwise.
+ int MgrClient::update_daemon_metadata(
+   const std::string& service,
+   const std::string& name,
+   const std::map<std::string,std::string>& metadata)
+ {
+   std::lock_guard l(lock);
+
+   if (service_daemon) {
+     return -EEXIST;
+   }
+
+   ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl;
+
+   service_name = service;
+   daemon_name = name;
+   daemon_metadata = metadata;
+   daemon_dirty_status = true;
+
+   // need_metadata_update is cleared after the send, so later calls only
+   // refresh the stored values.
+   const bool send_now = need_metadata_update && !daemon_metadata.empty();
+   if (send_now) {
+     _send_update();
+     need_metadata_update = false;
+   }
+
+   return 0;
+ }
+
+ // Register this process as a service daemon with the given service name,
+ // daemon name and metadata. When the messenger type is CLIENT and a session
+ // is already up, an MMgrOpen is sent right away ("late register").
+ //
+ // Returns -EEXIST if already registered, 0 otherwise.
+ int MgrClient::service_daemon_register(
+   const std::string& service,
+   const std::string& name,
+   const std::map<std::string,std::string>& metadata)
+ {
+   std::lock_guard l(lock);
+
+   if (service_daemon) {
+     return -EEXIST;
+   }
+
+   ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl;
+
+   service_daemon = true;
+   service_name = service;
+   daemon_name = name;
+   daemon_metadata = metadata;
+   daemon_dirty_status = true;
+
+   // late register?
+   const bool is_client = (msgr->get_mytype() == CEPH_ENTITY_TYPE_CLIENT);
+   if (is_client && session && session->con) {
+     _send_open();
+   }
+
+   return 0;
+ }
+
+ int MgrClient::service_daemon_update_status( // Replaces the stored daemon status map and flags it dirty; always returns 0
+ std::map<std::string,std::string>&& status)
+ {
+ std::lock_guard l(lock);
+ ldout(cct,10) << status << dendl;
+ daemon_status = std::move(status);
+ daemon_dirty_status = true; // _send_report() attaches daemon_status only while this flag is set
+ return 0;
+ }
+
+ int MgrClient::service_daemon_update_task_status( // Replaces the stored task status map and flags it dirty; always returns 0
+ std::map<std::string,std::string> &&status) {
+ std::lock_guard l(lock);
+ ldout(cct,10) << status << dendl;
+ task_status = std::move(status);
+ task_dirty_status = true; // _send_report() attaches task_status only while this flag is set
+ return 0;
+ }
+
+ void MgrClient::update_daemon_health(std::vector<DaemonHealthMetric>&& metrics) // Replaces the stored health metrics; _send_report() moves them into the next report
+ {
+ std::lock_guard l(lock);
+ daemon_health_metrics = std::move(metrics);
+ }
+
diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h
new file mode 100644
index 000000000..1668d8da0
--- /dev/null
+++ b/src/mgr/MgrClient.h
@@ -0,0 +1,215 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MGR_CLIENT_H_
+#define MGR_CLIENT_H_
+
+#include <boost/variant.hpp>
+
+#include "msg/Connection.h"
+#include "msg/Dispatcher.h"
+#include "mon/MgrMap.h"
+#include "mgr/DaemonHealthMetric.h"
+
+#include "messages/MMgrReport.h"
+#include "mgr/MetricTypes.h"
+
+#include "common/perf_counters.h"
+#include "common/Timer.h"
+#include "common/CommandTable.h"
+
+class MMgrMap;
+class MMgrConfigure;
+class MMgrClose;
+class Messenger;
+class MCommandReply;
+class MPGStats;
+class MonMap;
+
+ class MgrSessionState // Per-connection state; MgrClient::reconnect() creates a fresh instance for every new session
+ {
+ public:
+ // Which performance counters have we already transmitted schema for?
+ std::set<std::string> declared; // keyed by counter path
+
+ // Our connection to the mgr
+ ConnectionRef con;
+ };
+
+ class MgrCommand : public CommandOp // A pending mgr command, with extra fields for "tell" commands
+ {
+ public:
+ std::string name; // target mgr name for a tell; empty matches whichever mgr is active
+ bool tell = false; // true when started via MgrClient::start_tell_command()
+
+ explicit MgrCommand(ceph_tid_t t) : CommandOp(t) {}
+ MgrCommand() : CommandOp() {}
+ };
+
+ class MgrClient : public Dispatcher // Client-side endpoint for talking to the active ceph-mgr: session management, periodic reports, and mgr commands
+ {
+ protected:
+ CephContext *cct;
+ MgrMap map; // most recent map received via handle_mgr_map()
+ Messenger *msgr;
+ MonMap *monmap; // its fsid is used when building tell messages
+
+ std::unique_ptr<MgrSessionState> session; // null while not connected; replaced by reconnect()
+
+ ceph::mutex lock = ceph::make_mutex("MgrClient::lock");
+ ceph::condition_variable shutdown_cond; // notified by handle_mgr_close(); waited on in shutdown()
+
+ uint32_t stats_period = 0; // 0 means periodic reporting is stopped; presumably seconds (passed to timer.add_event_after)
+ uint32_t stats_threshold = 0; // counters whose adjusted priority is below this are not reported
+ SafeTimer timer;
+
+ CommandTable<MgrCommand> command_table; // commands awaiting a reply
+
+ using clock_t = ceph::real_clock;
+ clock_t::time_point last_connect_attempt; // used to rate-limit reconnect()
+
+ uint64_t last_config_bl_version = 0; // passed to and updated by get_config_bl()
+
+ Context *report_callback = nullptr; // pending timer event for the next _send_stats(), if any
+ Context *connect_retry_callback = nullptr; // pending timer event for a delayed reconnect(), if any
+
+ // If provided, use this to compose an MPGStats to send with
+ // our reports (hook for use by OSD)
+ std::function<MPGStats*()> pgstats_cb;
+ std::function<void(const ConfigPayload &)> set_perf_queries_cb; // receives query configuration from MMgrConfigure
+ std::function<MetricPayload()> get_perf_report_cb; // supplies the metric payload attached to each report
+
+ // for service registration and beacon
+ bool service_daemon = false; // set by service_daemon_register(); cleared by handle_mgr_close()
+ bool daemon_dirty_status = false; // daemon_status needs to go out with the next report
+ bool task_dirty_status = false; // task_status needs to go out with the next report
+ bool need_metadata_update = true; // cleared once update_daemon_metadata() has sent an MMgrUpdate
+ std::string service_name, daemon_name;
+ std::map<std::string,std::string> daemon_metadata;
+ std::map<std::string,std::string> daemon_status;
+ std::map<std::string,std::string> task_status;
+ std::vector<DaemonHealthMetric> daemon_health_metrics;
+
+ void reconnect();
+ void _send_open();
+ void _send_update();
+
+ // In pre-luminous clusters, the ceph-mgr service is absent or optional,
+ // so we must not block in start_command waiting for it.
+ bool mgr_optional = false;
+
+ public:
+ MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap);
+
+ void set_messenger(Messenger *msgr_) { msgr = msgr_; } // note: does not take lock
+
+ void init();
+ void shutdown();
+
+ void set_mgr_optional(bool optional_) {mgr_optional = optional_;}
+ // Dispatcher interface
+ bool ms_dispatch2(const ceph::ref_t<Message>& m) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override;
+ // Message handlers; the ones that assert it require lock to be held by the caller.
+ bool handle_mgr_map(ceph::ref_t<MMgrMap> m);
+ bool handle_mgr_configure(ceph::ref_t<MMgrConfigure> m);
+ bool handle_mgr_close(ceph::ref_t<MMgrClose> m);
+ bool handle_command_reply(
+ uint64_t tid,
+ ceph::buffer::list& data,
+ const std::string& rs,
+ int r);
+
+ void set_perf_metric_query_cb( // installs both perf-query callbacks under lock
+ std::function<void(const ConfigPayload &)> cb_set,
+ std::function<MetricPayload()> cb_get)
+ {
+ std::lock_guard l(lock);
+ set_perf_queries_cb = cb_set;
+ get_perf_report_cb = cb_get;
+ }
+
+ void send_pgstats();
+ void set_pgstats_cb(std::function<MPGStats*()>&& cb_)
+ {
+ std::lock_guard l(lock);
+ pgstats_cb = std::move(cb_);
+ }
+ // Command submission: the reply is written to *outbl / *outs (when non-null) and onfinish is completed with the result code.
+ int start_command(
+ const std::vector<std::string>& cmd, const ceph::buffer::list& inbl,
+ ceph::buffer::list *outbl, std::string *outs,
+ Context *onfinish);
+ int start_tell_command(
+ const std::string& name,
+ const std::vector<std::string>& cmd, const ceph::buffer::list& inbl,
+ ceph::buffer::list *outbl, std::string *outs,
+ Context *onfinish);
+
+ int update_daemon_metadata(
+ const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata);
+ int service_daemon_register(
+ const std::string& service,
+ const std::string& name,
+ const std::map<std::string,std::string>& metadata);
+ int service_daemon_update_status(
+ std::map<std::string,std::string>&& status);
+ int service_daemon_update_task_status(
+ std::map<std::string,std::string> &&task_status);
+ void update_daemon_health(std::vector<DaemonHealthMetric>&& metrics);
+
+ bool is_initialized() const { return initialized; } // note: reads the flag without taking lock
+
+ private:
+ void handle_config_payload(const OSDConfigPayload &payload) { // forwards to set_perf_queries_cb when one is installed
+ if (set_perf_queries_cb) {
+ set_perf_queries_cb(payload);
+ }
+ }
+
+ void handle_config_payload(const MDSConfigPayload &payload) { // forwards to set_perf_queries_cb when one is installed
+ if (set_perf_queries_cb) {
+ set_perf_queries_cb(payload);
+ }
+ }
+
+ void handle_config_payload(const UnknownConfigPayload &payload) { // an unknown payload type aborts the process
+ ceph_abort();
+ }
+
+ struct HandlePayloadVisitor : public boost::static_visitor<void> { // dispatches a config payload variant to the matching handle_config_payload() overload
+ MgrClient *mgrc;
+
+ HandlePayloadVisitor(MgrClient *mgrc)
+ : mgrc(mgrc) {
+ }
+
+ template <typename ConfigPayload>
+ inline void operator()(const ConfigPayload &payload) const {
+ mgrc->handle_config_payload(payload);
+ }
+ };
+
+ void _send_stats();
+ void _send_pgstats();
+ void _send_report();
+
+ bool initialized = false; // set by init()
+ };
+
+#endif
diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h
new file mode 100644
index 000000000..bc3350da4
--- /dev/null
+++ b/src/mgr/MgrCommands.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* no guard; may be included multiple times */
+
+// see MonCommands.h
+
+ COMMAND("pg stat", "show placement group status.",
+ "pg", "r")
+ COMMAND("pg getmap", "get binary pg map to -o/stdout", "pg", "r")
+ // pg map dumps; COMMAND arguments appear to be (signature, help text, module, permissions) -- see MonCommands.h for the authoritative definition
+ COMMAND("pg dump " \
+ "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \
+ "show human-readable versions of pg map (only 'all' valid with plain)", "pg", "r")
+ COMMAND("pg dump_json " \
+ "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \
+ "show human-readable version of pg map in json only",\
+ "pg", "r")
+ COMMAND("pg dump_pools_json", "show pg pools info in json only",\
+ "pg", "r")
+ // pg listing, filtered by pool, primary osd, osd or state
+ COMMAND("pg ls-by-pool " \
+ "name=poolstr,type=CephString " \
+ "name=states,type=CephString,n=N,req=false", \
+ "list pg with pool = [poolname]", "pg", "r")
+ COMMAND("pg ls-by-primary " \
+ "name=osd,type=CephOsdName " \
+ "name=pool,type=CephInt,req=false " \
+ "name=states,type=CephString,n=N,req=false", \
+ "list pg with primary = [osd]", "pg", "r")
+ COMMAND("pg ls-by-osd " \
+ "name=osd,type=CephOsdName " \
+ "name=pool,type=CephInt,req=false " \
+ "name=states,type=CephString,n=N,req=false", \
+ "list pg on osd [osd]", "pg", "r")
+ COMMAND("pg ls " \
+ "name=pool,type=CephInt,req=false " \
+ "name=states,type=CephString,n=N,req=false", \
+ "list pg with specific pool, osd, state", "pg", "r")
+ COMMAND("pg dump_stuck " \
+ "name=stuckops,type=CephChoices,strings=inactive|unclean|stale|undersized|degraded,n=N,req=false " \
+ "name=threshold,type=CephInt,req=false",
+ "show information about stuck pgs",\
+ "pg", "r")
+ COMMAND("pg debug " \
+ "name=debugop,type=CephChoices,strings=unfound_objects_exist|degraded_pgs_exist", \
+ "show debug info about pgs", "pg", "r")
+ // per-pg scrub and repair requests ("rw")
+ COMMAND("pg scrub name=pgid,type=CephPgid", "start scrub on <pgid>", \
+ "pg", "rw")
+ COMMAND("pg deep-scrub name=pgid,type=CephPgid", "start deep-scrub on <pgid>", \
+ "pg", "rw")
+ COMMAND("pg repair name=pgid,type=CephPgid", "start repair on <pgid>", \
+ "pg", "rw")
+ // per-pg recovery/backfill priority overrides and their cancellation
+ COMMAND("pg force-recovery name=pgid,type=CephPgid,n=N", "force recovery of <pgid> first", \
+ "pg", "rw")
+ COMMAND("pg force-backfill name=pgid,type=CephPgid,n=N", "force backfill of <pgid> first", \
+ "pg", "rw")
+ COMMAND("pg cancel-force-recovery name=pgid,type=CephPgid,n=N", "restore normal recovery priority of <pgid>", \
+ "pg", "rw")
+ COMMAND("pg cancel-force-backfill name=pgid,type=CephPgid,n=N", "restore normal backfill priority of <pgid>", \
+ "pg", "rw")
+
+// stuff in osd namespace
+ COMMAND("osd perf", \
+ "print dump of OSD perf summary stats", \
+ "osd", \
+ "r")
+ COMMAND("osd df " \
+ "name=output_method,type=CephChoices,strings=plain|tree,req=false " \
+ "name=filter_by,type=CephChoices,strings=class|name,req=false " \
+ "name=filter,type=CephString,req=false", \
+ "show OSD utilization", "osd", "r")
+ COMMAND("osd blocked-by", \
+ "print histogram of which OSDs are blocking their peers", \
+ "osd", "r")
+ COMMAND("osd pool stats " \
+ "name=pool_name,type=CephPoolname,req=false",
+ "obtain stats from all pools, or from specified pool",
+ "osd", "r") // last read-only entry of this group; the pool-wide operations below are all "rw"
+ COMMAND("osd pool scrub " \
+ "name=who,type=CephPoolname,n=N", \
+ "initiate scrub on pool <who>", \
+ "osd", "rw")
+ COMMAND("osd pool deep-scrub " \
+ "name=who,type=CephPoolname,n=N", \
+ "initiate deep-scrub on pool <who>", \
+ "osd", "rw")
+ COMMAND("osd pool repair " \
+ "name=who,type=CephPoolname,n=N", \
+ "initiate repair on pool <who>", \
+ "osd", "rw")
+ COMMAND("osd pool force-recovery " \
+ "name=who,type=CephPoolname,n=N", \
+ "force recovery of specified pool <who> first", \
+ "osd", "rw")
+ COMMAND("osd pool force-backfill " \
+ "name=who,type=CephPoolname,n=N", \
+ "force backfill of specified pool <who> first", \
+ "osd", "rw")
+ COMMAND("osd pool cancel-force-recovery " \
+ "name=who,type=CephPoolname,n=N", \
+ "restore normal recovery priority of specified pool <who>", \
+ "osd", "rw")
+ // Counterpart of "osd pool force-backfill": the help text must refer to the
+ // backfill priority (it previously repeated the cancel-force-recovery text).
+ COMMAND("osd pool cancel-force-backfill " \
+ "name=who,type=CephPoolname,n=N", \
+ "restore normal backfill priority of specified pool <who>", \
+ "osd", "rw")
+ COMMAND("osd reweight-by-utilization " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\
+ "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
+ "osd", "rw")
+ COMMAND("osd test-reweight-by-utilization " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=no_increasing,type=CephBool,req=false",\
+ "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
+ "osd", "r") // NOTE(review): no_increasing is CephBool here but CephChoices in "osd reweight-by-utilization" above -- confirm this is intended
+ COMMAND("osd reweight-by-pg " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=pools,type=CephPoolname,n=N,req=false", \
+ "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
+ "osd", "rw")
+ COMMAND("osd test-reweight-by-pg " \
+ "name=oload,type=CephInt,req=false " \
+ "name=max_change,type=CephFloat,req=false " \
+ "name=max_osds,type=CephInt,req=false " \
+ "name=pools,type=CephPoolname,n=N,req=false", \
+ "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
+ "osd", "r")
+ // osd removal; both commands accept --force and its older synonym
+ COMMAND("osd destroy " \
+ "name=id,type=CephOsdName " \
+ "name=force,type=CephBool,req=false "
+ // backward compat synonym for --force
+ "name=yes_i_really_mean_it,type=CephBool,req=false", \
+ "mark osd as being destroyed. Keeps the ID intact (allowing reuse), " \
+ "but removes cephx keys, config-key data and lockbox keys, "\
+ "rendering data permanently unreadable.", \
+ "osd", "rw")
+ COMMAND("osd purge " \
+ "name=id,type=CephOsdName " \
+ "name=force,type=CephBool,req=false "
+ // backward compat synonym for --force
+ "name=yes_i_really_mean_it,type=CephBool,req=false", \
+ "purge all osd data from the monitors including the OSD id " \
+ "and CRUSH position", \
+ "osd", "rw")
+ // read-only safety checks before destroying or stopping osds
+ COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N",
+ "check whether osd(s) can be safely destroyed without reducing data durability",
+ "osd", "r")
+ COMMAND("osd ok-to-stop name=ids,type=CephString,n=N "\
+ "name=max,type=CephInt,req=false",
+ "check whether osd(s) can be safely stopped without reducing immediate"\
+ " data availability", "osd", "r")
+ // per-osd scrub and repair requests
+ COMMAND("osd scrub " \
+ "name=who,type=CephString", \
+ "initiate scrub on osd <who>, or use <all|any> to scrub all", \
+ "osd", "rw")
+ COMMAND("osd deep-scrub " \
+ "name=who,type=CephString", \
+ "initiate deep scrub on osd <who>, or use <all|any> to deep scrub all", \
+ "osd", "rw")
+ COMMAND("osd repair " \
+ "name=who,type=CephString", \
+ "initiate repair on osd <who>, or use <all|any> to repair all", \
+ "osd", "rw")
+
+ COMMAND("service dump",
+ "dump service map", "service", "r")
+ COMMAND("service status",
+ "dump service state", "service", "r")
+ // running-configuration queries
+ COMMAND("config show " \
+ "name=who,type=CephString name=key,type=CephString,req=False",
+ "Show running configuration",
+ "mgr", "r") // NOTE(review): "req=False" is capitalised here, unlike "req=false" in most other entries -- confirm the descriptor parser accepts both
+ COMMAND("config show-with-defaults " \
+ "name=who,type=CephString",
+ "Show running configuration (including compiled-in defaults)",
+ "mgr", "r")
+ // device inventory and life-expectancy tracking
+ COMMAND("device ls",
+ "Show devices",
+ "mgr", "r")
+ COMMAND("device info name=devid,type=CephString",
+ "Show information about a device",
+ "mgr", "r")
+ COMMAND("device ls-by-daemon name=who,type=CephString",
+ "Show devices associated with a daemon",
+ "mgr", "r")
+ COMMAND("device ls-by-host name=host,type=CephString",
+ "Show devices on a host",
+ "mgr", "r")
+ COMMAND("device set-life-expectancy name=devid,type=CephString "\
+ "name=from,type=CephString "\
+ "name=to,type=CephString,req=False",
+ "Set predicted device life expectancy",
+ "mgr", "rw")
+ COMMAND("device rm-life-expectancy name=devid,type=CephString",
+ "Clear predicted device life expectancy",
+ "mgr", "rw")
diff --git a/src/mgr/MgrContext.h b/src/mgr/MgrContext.h
new file mode 100644
index 000000000..a5490bef3
--- /dev/null
+++ b/src/mgr/MgrContext.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MGR_CONTEXT_H_
+#define MGR_CONTEXT_H_
+
+#include <memory>
+
+#include "common/ceph_json.h"
+#include "common/Cond.h"
+#include "mon/MonClient.h"
+
+/**
+ * Helper for issuing a single monitor command and blocking on its result.
+ *
+ * Call one of the run() overloads to start the command, then wait() to
+ * block until the completion fires; wait() stores the completion's return
+ * value in `r`.
+ */
+class Command
+{
+protected:
+  // Completion handed to MonClient::start_mon_command(); wait() blocks on it.
+  C_SaferCond cond;
+public:
+  ceph::buffer::list outbl;  // passed to start_mon_command() as the output buffer
+  std::string outs;          // passed to start_mon_command() as the output string
+  int r;                     // set by wait(); not initialized before that
+
+  // Start `command` with an empty input payload.
+  void run(MonClient *monc, const std::string &command)
+  {
+    run(monc, command, ceph::buffer::list{});
+  }
+
+  // Start `command`, sending `inbl` as its input payload.
+  void run(MonClient *monc, const std::string &command, const ceph::buffer::list &inbl)
+  {
+    monc->start_mon_command({command}, inbl,
+                            &outbl, &outs, &cond);
+  }
+
+  // Block until the command completes and record its return code in `r`.
+  virtual void wait()
+  {
+    r = cond.wait();
+  }
+
+  virtual ~Command() {}
+};
+
+
+/**
+ * Command whose output buffer is parsed as JSON once the command succeeds.
+ */
+class JSONCommand : public Command
+{
+public:
+  // Parsed form of `outbl`; only filled in when wait() leaves r == 0.
+  json_spirit::mValue json_result;
+
+  // Wait for the command, then parse its output. A command that succeeded
+  // but produced unparseable output is reported as -EINVAL.
+  void wait() override
+  {
+    Command::wait();
+
+    if (r != 0) {
+      // The command itself failed: keep its error code, skip parsing.
+      return;
+    }
+
+    if (!json_spirit::read(outbl.to_str(), json_result)) {
+      r = -EINVAL;
+    }
+  }
+};
+
+#endif
+
diff --git a/src/mgr/MgrSession.h b/src/mgr/MgrSession.h
new file mode 100644
index 000000000..40b50220b
--- /dev/null
+++ b/src/mgr/MgrSession.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_MGRSESSION_H
+#define CEPH_MGR_MGRSESSION_H
+
+#include "common/RefCountedObj.h"
+#include "common/entity_name.h"
+#include "msg/msg_types.h"
+#include "MgrCap.h"
+
+
+/**
+ * Session state associated with the Connection.
+ */
+struct MgrSession : public RefCountedObject {
+ // Identity of the peer. NOTE(review): presumably filled in during
+ // authentication by the code that creates the session -- not visible here.
+ uint64_t global_id = 0;
+ EntityName entity_name;
+ entity_inst_t inst;
+
+ int osd_id = -1; ///< osd id (if an osd)
+
+ // Capabilities attached to this session.
+ MgrCap caps;
+
+ std::set<std::string> declared_types;
+
+ // Address of the peer, taken from the stored entity instance.
+ const entity_addr_t& get_peer_addr() const {
+ return inst.addr;
+ }
+
+private:
+ // Constructor and destructor are private; FRIEND_MAKE_REF grants access to
+ // the ref-counted factory (presumably ceph::make_ref -- confirm).
+ FRIEND_MAKE_REF(MgrSession);
+ explicit MgrSession(CephContext *cct) : RefCountedObject(cct) {}
+ ~MgrSession() override = default;
+};
+
+using MgrSessionRef = ceph::ref_t<MgrSession>;
+
+
+#endif
diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc
new file mode 100644
index 000000000..2821bf4cf
--- /dev/null
+++ b/src/mgr/MgrStandby.cc
@@ -0,0 +1,503 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include <Python.h>
+
+#include "common/errno.h"
+#include "common/signal.h"
+#include "include/compat.h"
+
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/signal_handler.h"
+
+#include "mgr/MgrContext.h"
+#include "mgr/mgr_commands.h"
+#include "mgr/mgr_perf_counters.h"
+
+#include "messages/MMgrBeacon.h"
+#include "messages/MMgrMap.h"
+#include "Mgr.h"
+
+#include "MgrStandby.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+// Construct the standby manager.
+//
+// All client-side components (objecter, client, mgrc, log_client) are built
+// around the single client_messenger and the shared MonClient. The
+// constructor body is empty; startup work happens in init().
+//
+// argc/argv are stored untouched so that respawn() can re-exec the process
+// with its original command line.
+MgrStandby::MgrStandby(int argc, const char **argv) :
+ Dispatcher(g_ceph_context),
+ monc{g_ceph_context, poolctx},
+ client_messenger(Messenger::create(
+ g_ceph_context,
+ cct->_conf.get_val<std::string>("ms_type"),
+ entity_name_t::MGR(),
+ "mgr",
+ Messenger::get_pid_nonce())),
+ objecter{g_ceph_context, client_messenger.get(), &monc, poolctx},
+ client{client_messenger.get(), &monc, &objecter},
+ mgrc(g_ceph_context, client_messenger.get(), &monc.monmap),
+ log_client(g_ceph_context, client_messenger.get(), &monc.monmap, LogClient::NO_FLAGS),
+ clog(log_client.create_channel(CLOG_CHANNEL_CLUSTER)),
+ audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)),
+ finisher(g_ceph_context, "MgrStandby", "mgrsb-fin"),
+ timer(g_ceph_context, lock),
+ py_module_registry(clog),
+ active_mgr(nullptr),
+ orig_argc(argc),
+ orig_argv(argv),
+ available_in_map(false)
+{
+}
+
+MgrStandby::~MgrStandby() = default;
+
+// Config observer hook: returns the null-terminated array of option names
+// this object wants to track. handle_conf_change() reacts to these keys.
+const char** MgrStandby::get_tracked_conf_keys() const
+{
+  static const char* tracked_keys[] = {
+    // clog & admin clog
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_facility",
+    "clog_to_syslog_level",
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "mgr_standby_modules",
+    "host",
+    "fsid",
+    nullptr  // terminator
+  };
+  return tracked_keys;
+}
+
+// Config observer hook, called with the set of option names that changed.
+//
+//  - Any log-related key (or host/fsid) refreshes the cluster and audit
+//    log channel configuration.
+//  - A change of mgr_standby_modules while we are not the active mgr
+//    triggers a respawn when the configured value no longer matches
+//    whether standby modules are currently loaded.
+void MgrStandby::handle_conf_change(
+  const ConfigProxy& conf,
+  const std::set <std::string> &changed)
+{
+  static constexpr const char* log_config_keys[] = {
+    "clog_to_monitors",
+    "clog_to_syslog",
+    "clog_to_syslog_level",
+    "clog_to_syslog_facility",
+    "clog_to_graylog",
+    "clog_to_graylog_host",
+    "clog_to_graylog_port",
+    "host",
+    "fsid",
+  };
+  for (const char* key : log_config_keys) {
+    if (changed.count(key)) {
+      // One refresh is enough no matter how many of the keys changed.
+      _update_log_config();
+      break;
+    }
+  }
+
+  if (active_mgr || !changed.count("mgr_standby_modules")) {
+    return;
+  }
+
+  const bool want_standby_modules =
+    g_conf().get_val<bool>("mgr_standby_modules");
+  const bool have_standby_modules =
+    py_module_registry.have_standby_modules();
+  if (want_standby_modules == have_standby_modules) {
+    return;
+  }
+
+  dout(1) << "mgr_standby_modules now "
+          << static_cast<int>(want_standby_modules)
+          << ", standby modules are "
+          << (have_standby_modules ? "":"not ")
+          << "active, respawning"
+          << dendl;
+  respawn();
+}
+
+// Bring the standby manager up: signal handling, messenger, MonClient
+// (including authentication), objecter, client, timer and the python
+// module registry, then send the first beacon via tick().
+//
+// Returns 0 on success. Returns -1 if the initial monmap cannot be built,
+// or the negative error from monc.init() / monc.authenticate(); in those
+// cases the messenger (and, after monc.init(), the MonClient) is shut down
+// before returning.
+//
+// `lock` is held from shortly after entry until return.
+int MgrStandby::init()
+{
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, sighup_handler);
+
+ cct->_conf.add_observer(this);
+
+ std::lock_guard l(lock);
+
+ // Start finisher
+ finisher.start();
+
+ // Initialize Messenger
+ client_messenger->add_dispatcher_tail(this);
+ client_messenger->add_dispatcher_head(&objecter);
+ client_messenger->add_dispatcher_tail(&client);
+ client_messenger->start();
+
+ poolctx.start(2);
+
+ // Initialize MonClient
+ if (monc.build_initial_monmap() < 0) {
+ client_messenger->shutdown();
+ client_messenger->wait();
+ return -1;
+ }
+
+ monc.sub_want("mgrmap", 0, 0);
+
+ monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD
+ |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR);
+ monc.set_messenger(client_messenger.get());
+
+ // We must register our config callback before calling init(), so
+ // that we see the initial configuration message
+ monc.register_config_callback([this](const std::string &k, const std::string &v){
+ // removing value to hide sensitive data going into mgr logs
+ // leaving this for debugging purposes
+ // dout(10) << "config_callback: " << k << " : " << v << dendl;
+ dout(10) << "config_callback: " << k << " : " << dendl;
+ // Only keys under the "mgr/" prefix are consumed here (returns true);
+ // everything else is reported as not handled.
+ if (k.substr(0, 4) == "mgr/") {
+ py_module_registry.handle_config(k, v);
+ return true;
+ }
+ return false;
+ });
+ monc.register_config_notify_callback([this]() {
+ py_module_registry.handle_config_notify();
+ });
+ dout(4) << "Registered monc callback" << dendl;
+
+ int r = monc.init();
+ if (r < 0) {
+ monc.shutdown();
+ client_messenger->shutdown();
+ client_messenger->wait();
+ return r;
+ }
+ mgrc.init();
+ client_messenger->add_dispatcher_tail(&mgrc);
+
+ r = monc.authenticate();
+ if (r < 0) {
+ derr << "Authentication failed, did you specify a mgr ID with a valid keyring?" << dendl;
+ monc.shutdown();
+ client_messenger->shutdown();
+ client_messenger->wait();
+ return r;
+ }
+ // only forward monmap updates after authentication finishes, otherwise
+ // monc.authenticate() will be waiting for MgrStandby::ms_dispatch2()
+ // to acquire the lock forever, as it is already locked in the beginning of
+ // this method.
+ monc.set_passthrough_monmap();
+
+ // Now that we are authenticated, rename the messenger after our global id.
+ client_t whoami = monc.get_global_id();
+ client_messenger->set_myname(entity_name_t::MGR(whoami.v));
+ monc.set_log_client(&log_client);
+ _update_log_config();
+ objecter.set_client_incarnation(0);
+ objecter.init();
+ objecter.start();
+ client.init();
+ timer.init();
+
+ py_module_registry.init();
+ mgr_perf_start(g_ceph_context);
+
+
+ // First beacon; tick() re-arms itself on the timer from here on.
+ tick();
+
+ dout(4) << "Complete." << dendl;
+ return 0;
+}
+
+// Build an MMgrBeacon describing this daemon and send it to the monitors.
+//
+// The beacon carries the loaded module list, the RADOS clients registered
+// by modules, our addresses/metadata and whether we consider ourselves
+// available. While available but not yet marked available in the map, it
+// also carries the full command set (built-in plus python module commands).
+//
+// Caller must hold `lock` (asserted).
+void MgrStandby::send_beacon()
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(lock));
+ dout(20) << state_str() << dendl;
+
+ auto modules = py_module_registry.get_modules();
+
+ // Construct a list of the info about each loaded module
+ // which we will transmit to the monitor.
+ std::vector<MgrMap::ModuleInfo> module_info;
+ for (const auto &module : modules) {
+ MgrMap::ModuleInfo info;
+ info.name = module->get_name();
+ info.error_string = module->get_error_string();
+ info.can_run = module->get_can_run();
+ info.module_options = module->get_options();
+ module_info.push_back(std::move(info));
+ }
+
+ // Logged here only; the list itself is moved into the beacon below.
+ auto clients = py_module_registry.get_clients();
+ for (const auto& client : clients) {
+ dout(15) << "noting RADOS client for blocklist: " << client << dendl;
+ }
+
+ // Whether I think I am available (request MgrMonitor to set me
+ // as available in the map)
+ bool available = active_mgr != nullptr && active_mgr->is_initialized();
+
+ // Server addresses are only advertised once we are available; otherwise
+ // an empty address vector is sent.
+ auto addrs = available ? active_mgr->get_server_addrs() : entity_addrvec_t();
+ dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl;
+
+ map<string,string> metadata;
+ metadata["addr"] = client_messenger->get_myaddr_legacy().ip_only_to_str();
+ metadata["addrs"] = stringify(client_messenger->get_myaddrs());
+ collect_sys_info(&metadata, g_ceph_context);
+
+ // module_info, metadata and clients are moved from here and must not be
+ // used afterwards.
+ auto m = ceph::make_message<MMgrBeacon>(monc.get_fsid(),
+ monc.get_global_id(),
+ g_conf()->name.get_id(),
+ addrs,
+ available,
+ std::move(module_info),
+ std::move(metadata),
+ std::move(clients),
+ CEPH_FEATURES_ALL);
+
+ if (available) {
+ if (!available_in_map) {
+ // We are informing the mon that we are done initializing: inform
+ // it of our command set. This has to happen after init() because
+ // it needs the python modules to have loaded.
+ std::vector<MonCommand> commands = mgr_commands;
+ std::vector<MonCommand> py_commands = py_module_registry.get_commands();
+ commands.insert(commands.end(), py_commands.begin(), py_commands.end());
+ m->set_command_descs(commands);
+ dout(4) << "going active, including " << m->get_command_descs().size()
+ << " commands in beacon" << dendl;
+ }
+
+ m->set_services(active_mgr->get_services());
+ }
+
+ monc.send_mon_message(std::move(m));
+}
+
+// Periodic heartbeat: send a beacon now and schedule the next tick
+// mgr_tick_period seconds from now on the timer.
+void MgrStandby::tick()
+{
+  dout(10) << __func__ << dendl;
+  send_beacon();
+
+  const auto period_secs =
+    g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count();
+  auto *next_tick = new LambdaContext([this](int) {
+    tick();
+  });
+  timer.add_event_after(period_secs, next_tick);
+}
+
+// Tear everything down in dependency order.
+//
+// The teardown itself runs as a context queued on the finisher (under
+// `lock`); this function then waits for the finisher to drain and stops it,
+// so the by-reference lambda capture does not outlive this call.
+void MgrStandby::shutdown()
+{
+ finisher.queue(new LambdaContext([&](int) {
+ std::lock_guard l(lock);
+
+ dout(4) << "Shutting down" << dendl;
+
+ py_module_registry.shutdown();
+ // stop sending beacon first, I use monc to talk with monitors
+ timer.shutdown();
+ // client uses monc and objecter
+ client.shutdown();
+ mgrc.shutdown();
+ // Stop asio threads, so leftover events won't call into shut down
+ // monclient/objecter.
+ poolctx.finish();
+ // stop monc, so mon won't be able to instruct me to shutdown/activate after
+ // the active_mgr is stopped
+ monc.shutdown();
+ if (active_mgr) {
+ active_mgr->shutdown();
+ }
+ // objecter is used by monc and active_mgr
+ objecter.shutdown();
+ // client_messenger is used by all of them, so stop it in the end
+ client_messenger->shutdown();
+ }));
+
+ // Then stop the finisher to ensure its enqueued contexts aren't going
+ // to touch references to the things we're about to tear down
+ finisher.wait_for_empty();
+ finisher.stop();
+ mgr_perf_stop(g_ceph_context);
+}
+
+// Replace the current process image with a fresh copy of ourselves, using
+// the original command line captured in the constructor.
+//
+// Does not return: either execv() succeeds, or the failure is logged and
+// the process aborts.
+void MgrStandby::respawn()
+{
+ // --- WARNING TO FUTURE COPY/PASTERS ---
+ // You must also add a call like
+ //
+ // ceph_pthread_setname(pthread_self(), "ceph-mgr");
+ //
+ // to main() so that /proc/$pid/stat field 2 contains "(ceph-mgr)"
+ // instead of "(exe)", so that killall (and log rotation) will work.
+
+ // Null-terminated copy of the original argv, as execv() expects.
+ char *new_argv[orig_argc+1];
+ dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
+ for (int i=0; i<orig_argc; i++) {
+ new_argv[i] = (char *)orig_argv[i];
+ dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
+ }
+ new_argv[orig_argc] = NULL;
+
+ /* Determine the path to our executable, test if Linux /proc/self/exe exists.
+ * This allows us to exec the same executable even if it has since been
+ * unlinked.
+ */
+ // exe_path is zero-initialized and at most PATH_MAX-1 bytes are written
+ // into it below, so it stays NUL-terminated even though readlink() does
+ // not terminate its output.
+ char exe_path[PATH_MAX] = "";
+ if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) == -1) {
+ /* Print CWD for the user's interest */
+ char buf[PATH_MAX];
+ char *cwd = getcwd(buf, sizeof(buf));
+ ceph_assert(cwd);
+ dout(1) << " cwd " << cwd << dendl;
+
+ /* Fall back to a best-effort: just running in our CWD */
+ strncpy(exe_path, orig_argv[0], PATH_MAX-1);
+ } else {
+ // The resolved target is only logged; we exec through the
+ // /proc/self/exe link itself.
+ dout(1) << "respawning with exe " << exe_path << dendl;
+ strcpy(exe_path, PROCPREFIX "/proc/self/exe");
+ }
+
+ dout(1) << " exe_path " << exe_path << dendl;
+
+ unblock_all_signals(NULL);
+ execv(exe_path, new_argv);
+
+ // Only reached if execv() failed.
+ // NOTE(review): the message prints orig_argv[0], not the exe_path that
+ // was actually passed to execv().
+ derr << "respawn execv " << orig_argv[0]
+ << " failed with " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+}
+
+// Re-read the log client options from the config and push them to both the
+// cluster and the audit log channels. If the options fail to parse, the
+// channels are left untouched.
+void MgrStandby::_update_log_config()
+{
+  map<string,string> log_to_monitors;
+  map<string,string> log_to_syslog;
+  map<string,string> log_channel;
+  map<string,string> log_prio;
+  map<string,string> log_to_graylog;
+  map<string,string> log_to_graylog_host;
+  map<string,string> log_to_graylog_port;
+  uuid_d fsid;
+  string host;
+
+  const int parse_result = parse_log_client_options(
+    cct, log_to_monitors, log_to_syslog,
+    log_channel, log_prio, log_to_graylog,
+    log_to_graylog_host, log_to_graylog_port,
+    fsid, host);
+  if (parse_result != 0) {
+    return;
+  }
+
+  // Same settings for both channels, cluster channel first.
+  clog->update_config(log_to_monitors, log_to_syslog,
+                      log_channel, log_prio, log_to_graylog,
+                      log_to_graylog_host, log_to_graylog_port,
+                      fsid, host);
+  audit_clog->update_config(log_to_monitors, log_to_syslog,
+                            log_channel, log_prio, log_to_graylog,
+                            log_to_graylog_host, log_to_graylog_port,
+                            fsid, host);
+}
+
+// React to a new MgrMap.
+//
+// Outcomes, in order of evaluation:
+//  - the module registry reports the enabled module set changed: respawn;
+//  - the map names us active and we have no Mgr yet: create one and start
+//    its background init, sending a beacon as soon as that completes;
+//  - the map names us active and we already are: forward the map to the
+//    active Mgr, respawning if it asks for that;
+//  - we were active but the map no longer says so: respawn;
+//  - somebody else is active: start standby modules if enabled by config
+//    and not already running.
+void MgrStandby::handle_mgr_map(ref_t<MMgrMap> mmap)
+{
+ auto &map = mmap->get_map();
+ dout(4) << "received map epoch " << map.get_epoch() << dendl;
+ const bool active_in_map = map.active_gid == monc.get_global_id();
+ dout(4) << "active in map: " << active_in_map
+ << " active is " << map.active_gid << dendl;
+
+ // PyModuleRegistry may ask us to respawn if it sees that
+ // this MgrMap is changing its set of enabled modules
+ bool need_respawn = py_module_registry.handle_mgr_map(map);
+ if (need_respawn) {
+ dout(1) << "respawning because set of enabled modules changed!" << dendl;
+ respawn();
+ }
+
+ if (active_in_map) {
+ if (!active_mgr) {
+ dout(1) << "Activating!" << dendl;
+ active_mgr.reset(new Mgr(&monc, map, &py_module_registry,
+ client_messenger.get(), &objecter,
+ &client, clog, audit_clog));
+ active_mgr->background_init(new LambdaContext(
+ [this](int r){
+ // Advertise our active-ness ASAP instead of waiting for
+ // next tick.
+ std::lock_guard l(lock);
+ send_beacon();
+ }));
+ dout(1) << "I am now activating" << dendl;
+ } else {
+ dout(10) << "I was already active" << dendl;
+ bool need_respawn = active_mgr->got_mgr_map(map);
+ if (need_respawn) {
+ respawn();
+ }
+ }
+
+ // Remember that the mon has accepted us as available; send_beacon()
+ // only includes the command descriptions while this flag is false.
+ if (!available_in_map && map.get_available()) {
+ dout(4) << "Map now says I am available" << dendl;
+ available_in_map = true;
+ }
+ } else if (active_mgr != nullptr) {
+ derr << "I was active but no longer am" << dendl;
+ respawn();
+ } else {
+ if (map.active_gid != 0 && map.active_name != g_conf()->name.get_id()) {
+ // I am the standby and someone else is active, start modules
+ // in standby mode to do redirects if needed
+ if (!py_module_registry.is_standby_running() &&
+ g_conf().get_val<bool>("mgr_standby_modules")) {
+ py_module_registry.standby_start(monc, finisher);
+ }
+ }
+ }
+}
+
+// Messenger dispatch entry point.
+//
+// MgrMap messages are handled locally first. Every message is then offered
+// to the active Mgr (if any), with `lock` released for the duration of that
+// nested dispatch. MgrMap messages are always reported as "not handled" so
+// that later dispatchers (mgrc) still see them.
+//
+// Returns true if the active Mgr consumed a non-MgrMap message.
+bool MgrStandby::ms_dispatch2(const ref_t<Message>& m)
+{
+  // unique_lock rather than lock_guard: the lock is dropped and re-taken
+  // below, and the guard must track ownership so that it never unlocks a
+  // mutex it does not hold (e.g. if the nested dispatch throws).
+  std::unique_lock l(lock);
+  dout(10) << state_str() << " " << *m << dendl;
+
+  const bool is_mgr_map = (m->get_type() == MSG_MGR_MAP);
+  if (is_mgr_map) {
+    handle_mgr_map(ref_cast<MMgrMap>(m));
+  }
+
+  bool handled = false;
+  if (active_mgr) {
+    // Keep our own reference so the Mgr stays alive while unlocked.
+    auto am = active_mgr;
+    l.unlock();
+    handled = am->ms_dispatch2(m);
+    l.lock();
+  }
+
+  if (is_mgr_map) {
+    // let this pass through for mgrc
+    handled = false;
+  }
+  return handled;
+}
+
+
+// Dispatcher hook for a refused connection. Currently a no-op that always
+// returns false.
+bool MgrStandby::ms_handle_refused(Connection *con)
+{
+ // do nothing for now
+ return false;
+}
+
+// Block until the client messenger has finished, then tear down the async
+// signal handling installed by init(). `args` is not used. Always returns 0.
+int MgrStandby::main(vector<const char *> args)
+{
+ client_messenger->wait();
+
+ // Disable signal handlers
+ unregister_async_signal_handler(SIGHUP, sighup_handler);
+ shutdown_async_signal_handler();
+
+ return 0;
+}
+
+
+// Human-readable state for log lines: "standby" when there is no active
+// Mgr object, otherwise "active" or "active (starting)" depending on
+// whether that Mgr has finished initializing.
+std::string MgrStandby::state_str()
+{
+  if (!active_mgr) {
+    return "standby";
+  }
+  return active_mgr->is_initialized() ? "active" : "active (starting)";
+}
diff --git a/src/mgr/MgrStandby.h b/src/mgr/MgrStandby.h
new file mode 100644
index 000000000..cac31a576
--- /dev/null
+++ b/src/mgr/MgrStandby.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#ifndef MGR_STANDBY_H_
+#define MGR_STANDBY_H_
+
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+
+#include "client/Client.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+#include "PyModuleRegistry.h"
+#include "MgrClient.h"
+
+class MMgrMap;
+class Mgr;
+class PyModuleConfig;
+
+// Top-level object of the mgr daemon. It starts out as a standby, sends
+// periodic beacons to the monitors and, when a MgrMap names it active,
+// creates and drives the active Mgr instance.
+class MgrStandby : public Dispatcher,
+ public md_config_obs_t {
+public:
+ // config observer bits
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+
+protected:
+ // NOTE: the declaration order below matters. Members are initialized in
+ // this order, and the constructor passes earlier members (poolctx, monc,
+ // client_messenger, objecter) to the initializers of later ones.
+ ceph::async::io_context_pool poolctx;
+ MonClient monc;
+ std::unique_ptr<Messenger> client_messenger;
+ Objecter objecter;
+ Client client;
+
+ MgrClient mgrc;
+
+ LogClient log_client;
+ LogChannelRef clog, audit_clog;
+
+ ceph::mutex lock = ceph::make_mutex("MgrStandby::lock");
+ Finisher finisher;
+ SafeTimer timer;
+
+ PyModuleRegistry py_module_registry;
+ // Null while we are a standby; set once a MgrMap names us active.
+ std::shared_ptr<Mgr> active_mgr;
+
+ // Original command line, kept for respawn().
+ int orig_argc;
+ const char **orig_argv;
+
+ std::string state_str();
+
+ void handle_mgr_map(ceph::ref_t<MMgrMap> m);
+ void _update_log_config();
+ // Requires `lock` to be held by the caller.
+ void send_beacon();
+
+ // True once a MgrMap that names us active also reports us as available.
+ bool available_in_map;
+
+public:
+ MgrStandby(int argc, const char **argv);
+ ~MgrStandby() override;
+
+ bool ms_dispatch2(const ceph::ref_t<Message>& m) override;
+ bool ms_handle_reset(Connection *con) override { return false; }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override;
+
+ int init();
+ void shutdown();
+ void respawn();
+ int main(vector<const char *> args);
+ void tick();
+};
+
+#endif
+
diff --git a/src/mgr/OSDPerfMetricCollector.cc b/src/mgr/OSDPerfMetricCollector.cc
new file mode 100644
index 000000000..eb548ce70
--- /dev/null
+++ b/src/mgr/OSDPerfMetricCollector.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "messages/MMgrReport.h"
+#include "OSDPerfMetricCollector.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr.osd_perf_metric_collector " << __func__ << " "
+
+// Forward the listener to the generic MetricCollector base, instantiated
+// with the OSD query/limit/key/report types.
+OSDPerfMetricCollector::OSDPerfMetricCollector(MetricListener &listener)
+ : MetricCollector<OSDPerfMetricQuery,
+ OSDPerfMetricLimit,
+ OSDPerfMetricKey,
+ OSDPerfMetricReport>(listener) {
+}
+
+// Merge the OSD reports carried by `payload` into the collected counters.
+// `payload` must hold an OSDMetricPayload. Each incoming counter is added
+// component-wise (first and second) to the stored one, under `lock`.
+void OSDPerfMetricCollector::process_reports(const MetricPayload &payload) {
+  const auto &osd_payload = boost::get<OSDMetricPayload>(payload);
+  const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports =
+    osd_payload.report;
+
+  auto accumulate = [](PerformanceCounter *total,
+                       const PerformanceCounter &delta) {
+    total->first += delta.first;
+    total->second += delta.second;
+  };
+
+  std::lock_guard locker(lock);
+  process_reports_generic(reports, accumulate);
+}
+
+// Fill the counters of `collector` for its query id. `collector` must
+// actually be an OSDPerfCollector. Returns whatever get_counters_generic()
+// returns; the lookup runs under `lock`.
+int OSDPerfMetricCollector::get_counters(PerfCollector *collector) {
+  auto *osd_collector = static_cast<OSDPerfCollector *>(collector);
+
+  std::lock_guard locker(lock);
+  return get_counters_generic(osd_collector->query_id,
+                              &osd_collector->counters);
+}
diff --git a/src/mgr/OSDPerfMetricCollector.h b/src/mgr/OSDPerfMetricCollector.h
new file mode 100644
index 000000000..c531dbf63
--- /dev/null
+++ b/src/mgr/OSDPerfMetricCollector.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef OSD_PERF_METRIC_COLLECTOR_H_
+#define OSD_PERF_METRIC_COLLECTOR_H_
+
+#include "mgr/MetricCollector.h"
+#include "mgr/OSDPerfMetricTypes.h"
+
+/**
+ * OSD performance query class.
+ */
+class OSDPerfMetricCollector
+ : public MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey,
+ OSDPerfMetricReport> {
+public:
+ OSDPerfMetricCollector(MetricListener &listener);
+
+ // Merge the OSD reports contained in `payload` into the stored counters.
+ void process_reports(const MetricPayload &payload) override;
+ // Fill `collector` (expected to be an OSDPerfCollector) for its query id.
+ int get_counters(PerfCollector *collector) override;
+};
+
+#endif // OSD_PERF_METRIC_COLLECTOR_H_
diff --git a/src/mgr/OSDPerfMetricTypes.cc b/src/mgr/OSDPerfMetricTypes.cc
new file mode 100644
index 000000000..bce95e0ae
--- /dev/null
+++ b/src/mgr/OSDPerfMetricTypes.cc
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mgr/OSDPerfMetricTypes.h"
+
+#include <ostream>
+
+using ceph::bufferlist;
+
+// Print a sub-key descriptor as "<type name>~/<regex>/". Types without a
+// known name are printed as "unknown (<numeric value>)".
+std::ostream& operator<<(std::ostream& os,
+                         const OSDPerfMetricSubKeyDescriptor &d) {
+  const char *type_name = nullptr;
+  switch (d.type) {
+  case OSDPerfMetricSubKeyType::CLIENT_ID:      type_name = "client_id"; break;
+  case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: type_name = "client_address"; break;
+  case OSDPerfMetricSubKeyType::POOL_ID:        type_name = "pool_id"; break;
+  case OSDPerfMetricSubKeyType::NAMESPACE:      type_name = "namespace"; break;
+  case OSDPerfMetricSubKeyType::OSD_ID:         type_name = "osd_id"; break;
+  case OSDPerfMetricSubKeyType::PG_ID:          type_name = "pg_id"; break;
+  case OSDPerfMetricSubKeyType::OBJECT_NAME:    type_name = "object_name"; break;
+  case OSDPerfMetricSubKeyType::SNAP_ID:        type_name = "snap_id"; break;
+  default:
+    break;
+  }
+
+  if (type_name != nullptr) {
+    os << type_name;
+  } else {
+    os << "unknown (" << static_cast<int>(d.type) << ")";
+  }
+  os << "~/" << d.regex_str << "/";
+  return os;
+}
+
+// Append counter `c` to `bl`. The first component is always encoded; the
+// second component is only encoded for the latency counter types. An
+// unknown type aborts (after the first component has been encoded).
+void PerformanceCounterDescriptor::pack_counter(const PerformanceCounter &c,
+                                                bufferlist *bl) const {
+  using ceph::encode;
+  encode(c.first, *bl);
+
+  bool has_second_component = false;
+  switch (type) {
+  case PerformanceCounterType::OPS:
+  case PerformanceCounterType::WRITE_OPS:
+  case PerformanceCounterType::READ_OPS:
+  case PerformanceCounterType::BYTES:
+  case PerformanceCounterType::WRITE_BYTES:
+  case PerformanceCounterType::READ_BYTES:
+    break;
+  case PerformanceCounterType::LATENCY:
+  case PerformanceCounterType::WRITE_LATENCY:
+  case PerformanceCounterType::READ_LATENCY:
+    has_second_component = true;
+    break;
+  default:
+    ceph_abort_msg("unknown counter type");
+  }
+
+  if (has_second_component) {
+    encode(c.second, *bl);
+  }
+}
+
+// Inverse of pack_counter(): read a counter from `bl` into `c`. The first
+// component is always decoded; the second one only for the latency counter
+// types. An unknown type aborts (after the first component was decoded).
+void PerformanceCounterDescriptor::unpack_counter(
+    bufferlist::const_iterator& bl, PerformanceCounter *c) const {
+  using ceph::decode;
+  decode(c->first, bl);
+
+  bool has_second_component = false;
+  switch (type) {
+  case PerformanceCounterType::OPS:
+  case PerformanceCounterType::WRITE_OPS:
+  case PerformanceCounterType::READ_OPS:
+  case PerformanceCounterType::BYTES:
+  case PerformanceCounterType::WRITE_BYTES:
+  case PerformanceCounterType::READ_BYTES:
+    break;
+  case PerformanceCounterType::LATENCY:
+  case PerformanceCounterType::WRITE_LATENCY:
+  case PerformanceCounterType::READ_LATENCY:
+    has_second_component = true;
+    break;
+  default:
+    ceph_abort_msg("unknown counter type");
+  }
+
+  if (has_second_component) {
+    decode(c->second, bl);
+  }
+}
+
+// Print the human-readable name of a counter type, or
+// "unknown (<numeric value>)" for a type without a known name.
+std::ostream& operator<<(std::ostream& os,
+                         const PerformanceCounterDescriptor &d) {
+  const char *type_name = nullptr;
+  switch (d.type) {
+  case PerformanceCounterType::OPS:           type_name = "ops"; break;
+  case PerformanceCounterType::WRITE_OPS:     type_name = "write ops"; break;
+  case PerformanceCounterType::READ_OPS:      type_name = "read ops"; break;
+  case PerformanceCounterType::BYTES:         type_name = "bytes"; break;
+  case PerformanceCounterType::WRITE_BYTES:   type_name = "write bytes"; break;
+  case PerformanceCounterType::READ_BYTES:    type_name = "read bytes"; break;
+  case PerformanceCounterType::LATENCY:       type_name = "latency"; break;
+  case PerformanceCounterType::WRITE_LATENCY: type_name = "write latency"; break;
+  case PerformanceCounterType::READ_LATENCY:  type_name = "read latency"; break;
+  default:
+    break;
+  }
+
+  if (type_name == nullptr) {
+    return os << "unknown (" << static_cast<int>(d.type) << ")";
+  }
+  return os << type_name;
+}
+
+// Print a limit as "{order_by=<counter type>, max_count=<n>}".
+std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit) {
+  os << "{order_by=" << limit.order_by;
+  os << ", max_count=" << limit.max_count << "}";
+  return os;
+}
+
+// Encode one counter per descriptor of this query into `bl`, pairing
+// descriptors with `counters` positionally. When `counters` runs out, a
+// default-constructed counter is packed for each remaining descriptor.
+void OSDPerfMetricQuery::pack_counters(const PerformanceCounters &counters,
+                                       bufferlist *bl) const {
+  auto next_counter = counters.begin();
+  const auto no_more_counters = counters.end();
+
+  for (auto &descriptor : performance_counter_descriptors) {
+    if (next_counter != no_more_counters) {
+      descriptor.pack_counter(*next_counter, bl);
+      ++next_counter;
+    } else {
+      descriptor.pack_counter(PerformanceCounter(), bl);
+    }
+  }
+}
+
+// Print a query as "{key=<key descriptor>, counters=<counter descriptors>}".
+std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query) {
+  os << "{key=" << query.key_descriptor;
+  os << ", counters=" << query.performance_counter_descriptors << "}";
+  return os;
+}
diff --git a/src/mgr/OSDPerfMetricTypes.h b/src/mgr/OSDPerfMetricTypes.h
new file mode 100644
index 000000000..1b5904e13
--- /dev/null
+++ b/src/mgr/OSDPerfMetricTypes.h
@@ -0,0 +1,360 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef OSD_PERF_METRIC_H_
+#define OSD_PERF_METRIC_H_
+
+#include "include/denc.h"
+#include "include/stringify.h"
+
+#include "mgr/Types.h"
+
+#include <regex>
+
+typedef std::vector<std::string> OSDPerfMetricSubKey; // array of regex match
+typedef std::vector<OSDPerfMetricSubKey> OSDPerfMetricKey;
+
+// Kinds of sub-key a query can group by. The numeric value is what
+// OSDPerfMetricSubKeyDescriptor's DENC body encodes, so existing values
+// should not be renumbered.
+enum class OSDPerfMetricSubKeyType : uint8_t {
+ CLIENT_ID = 0,
+ CLIENT_ADDRESS = 1,
+ POOL_ID = 2,
+ NAMESPACE = 3,
+ OSD_ID = 4,
+ PG_ID = 5,
+ OBJECT_NAME = 6,
+ SNAP_ID = 7,
+};
+
+// One component of a query key: a sub-key type plus the regex applied to it.
+//
+// Only `type` and `regex_str` are encoded and compared; the compiled
+// `regex` is derived state.
+struct OSDPerfMetricSubKeyDescriptor {
+ // Defaults to an out-of-range value, for which is_supported() is false.
+ OSDPerfMetricSubKeyType type = static_cast<OSDPerfMetricSubKeyType>(-1);
+ std::string regex_str;
+ // Compiled form of regex_str. NOTE(review): neither constructor fills this
+ // in; within this file it is only assigned by the key-descriptor decoder.
+ std::regex regex;
+
+ // True when `type` is one of the enumerators known to this build.
+ bool is_supported() const {
+ switch (type) {
+ case OSDPerfMetricSubKeyType::CLIENT_ID:
+ case OSDPerfMetricSubKeyType::CLIENT_ADDRESS:
+ case OSDPerfMetricSubKeyType::POOL_ID:
+ case OSDPerfMetricSubKeyType::NAMESPACE:
+ case OSDPerfMetricSubKeyType::OSD_ID:
+ case OSDPerfMetricSubKeyType::PG_ID:
+ case OSDPerfMetricSubKeyType::OBJECT_NAME:
+ case OSDPerfMetricSubKeyType::SNAP_ID:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ OSDPerfMetricSubKeyDescriptor() {
+ }
+
+ OSDPerfMetricSubKeyDescriptor(OSDPerfMetricSubKeyType type,
+ const std::string regex)
+ : type(type), regex_str(regex) {
+ }
+
+ // Orders by type first, then by the regex source string.
+ bool operator<(const OSDPerfMetricSubKeyDescriptor &other) const {
+ if (type < other.type) {
+ return true;
+ }
+ if (type > other.type) {
+ return false;
+ }
+ return regex_str < other.regex_str;
+ }
+
+ // Wire format (version 1): type, then regex_str.
+ DENC(OSDPerfMetricSubKeyDescriptor, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.regex_str, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CLASS_DENC(OSDPerfMetricSubKeyDescriptor)
+
+std::ostream& operator<<(std::ostream& os,
+ const OSDPerfMetricSubKeyDescriptor &d);
+
+typedef std::vector<OSDPerfMetricSubKeyDescriptor> OSDPerfMetricKeyDescriptor;
+
+// Custom encoding for a key descriptor (a vector of sub-key descriptors).
+//
+// Encoding is a varint element count followed by each element. Decoding
+// additionally validates every element and compiles its regex; if any
+// element is unsupported, has an invalid regex, or has a regex without a
+// capture group, the whole descriptor is cleared and decoding stops.
+template<>
+struct denc_traits<OSDPerfMetricKeyDescriptor> {
+  static constexpr bool supported = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool need_contiguous = true;
+
+  // Upper bound on the encoded size. Elements contain a variable-length
+  // regex string, so each one has to be measured individually; sizing all
+  // of them after the first element would under-estimate the space needed
+  // whenever a later regex is longer.
+  static void bound_encode(const OSDPerfMetricKeyDescriptor& v, size_t& p) {
+    p += sizeof(uint32_t);
+    for (const auto& i : v) {
+      denc(i, p);
+    }
+  }
+
+  static void encode(const OSDPerfMetricKeyDescriptor& v,
+                     ceph::buffer::list::contiguous_appender& p) {
+    denc_varint(v.size(), p);
+    for (auto& i : v) {
+      denc(i, p);
+    }
+  }
+
+  static void decode(OSDPerfMetricKeyDescriptor& v,
+                     ceph::buffer::ptr::const_iterator& p) {
+    unsigned num;
+    denc_varint(num, p);
+    v.clear();
+    v.reserve(num);
+    for (unsigned i=0; i < num; ++i) {
+      OSDPerfMetricSubKeyDescriptor d;
+      denc(d, p);
+      if (!d.is_supported()) {
+        // Unknown sub-key type: reject the whole descriptor.
+        v.clear();
+        return;
+      }
+      try {
+        d.regex = d.regex_str.c_str();
+      } catch (const std::regex_error& e) {
+        // Regex does not compile: reject the whole descriptor.
+        v.clear();
+        return;
+      }
+      if (d.regex.mark_count() == 0) {
+        // A regex without any capture group is rejected as well.
+        v.clear();
+        return;
+      }
+      v.push_back(std::move(d));
+    }
+  }
+};
+
+// Kinds of performance counter. The numeric value is what
+// PerformanceCounterDescriptor's DENC body encodes, so existing values
+// should not be renumbered.
+enum class PerformanceCounterType : uint8_t {
+ OPS = 0,
+ WRITE_OPS = 1,
+ READ_OPS = 2,
+ BYTES = 3,
+ WRITE_BYTES = 4,
+ READ_BYTES = 5,
+ LATENCY = 6,
+ WRITE_LATENCY = 7,
+ READ_LATENCY = 8,
+};
+
+// Describes one counter of a query by its type, and knows how to pack and
+// unpack a counter value of that type.
+struct PerformanceCounterDescriptor {
+ // Defaults to an out-of-range value, for which is_supported() is false.
+ PerformanceCounterType type = static_cast<PerformanceCounterType>(-1);
+
+ // True when `type` is one of the enumerators known to this build.
+ bool is_supported() const {
+ switch (type) {
+ case PerformanceCounterType::OPS:
+ case PerformanceCounterType::WRITE_OPS:
+ case PerformanceCounterType::READ_OPS:
+ case PerformanceCounterType::BYTES:
+ case PerformanceCounterType::WRITE_BYTES:
+ case PerformanceCounterType::READ_BYTES:
+ case PerformanceCounterType::LATENCY:
+ case PerformanceCounterType::WRITE_LATENCY:
+ case PerformanceCounterType::READ_LATENCY:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ PerformanceCounterDescriptor() {
+ }
+
+ PerformanceCounterDescriptor(PerformanceCounterType type) : type(type) {
+ }
+
+ // Comparisons consider the counter type only.
+ bool operator<(const PerformanceCounterDescriptor &other) const {
+ return type < other.type;
+ }
+
+ bool operator==(const PerformanceCounterDescriptor &other) const {
+ return type == other.type;
+ }
+
+ bool operator!=(const PerformanceCounterDescriptor &other) const {
+ return type != other.type;
+ }
+
+ // Wire format (version 1): the type only.
+ DENC(PerformanceCounterDescriptor, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ DENC_FINISH(p);
+ }
+
+ // Encode / decode a counter value according to `type`.
+ void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const;
+ void unpack_counter(ceph::buffer::list::const_iterator& bl,
+ PerformanceCounter *c) const;
+};
+WRITE_CLASS_DENC(PerformanceCounterDescriptor)
+
+std::ostream& operator<<(std::ostream& os,
+ const PerformanceCounterDescriptor &d);
+
+typedef std::vector<PerformanceCounterDescriptor> PerformanceCounterDescriptors;
+
+// Custom encoding for a list of counter descriptors: a varint element count
+// followed by each element. On decode, descriptors of an unsupported type
+// are silently dropped, so the decoded list may be shorter than the count.
+template<>
+struct denc_traits<PerformanceCounterDescriptors> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ // Sizes every element like the first one.
+ // NOTE(review): this relies on all elements encoding to the same size,
+ // which holds as long as the descriptor only encodes its fixed-size type.
+ static void bound_encode(const PerformanceCounterDescriptors& v, size_t& p) {
+ p += sizeof(uint32_t);
+ const auto size = v.size();
+ if (size) {
+ size_t per = 0;
+ denc(v.front(), per);
+ p += per * size;
+ }
+ }
+ static void encode(const PerformanceCounterDescriptors& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc_varint(v.size(), p);
+ for (auto& i : v) {
+ denc(i, p);
+ }
+ }
+ static void decode(PerformanceCounterDescriptors& v,
+ ceph::buffer::ptr::const_iterator& p) {
+ unsigned num;
+ denc_varint(num, p);
+ v.clear();
+ v.reserve(num);
+ for (unsigned i=0; i < num; ++i) {
+ PerformanceCounterDescriptor d;
+ denc(d, p);
+ // Skip (rather than fail on) counter types this build does not know.
+ if (d.is_supported()) {
+ v.push_back(std::move(d));
+ }
+ }
+ }
+};
+
+// A result limit for a query: which counter to order by and a maximum count.
+struct OSDPerfMetricLimit {
+ PerformanceCounterDescriptor order_by;
+ uint64_t max_count = 0;
+
+ OSDPerfMetricLimit() {
+ }
+
+ OSDPerfMetricLimit(const PerformanceCounterDescriptor &order_by,
+ uint64_t max_count)
+ : order_by(order_by), max_count(max_count) {
+ }
+
+ // Orders by the order_by counter first, then by max_count.
+ bool operator<(const OSDPerfMetricLimit &other) const {
+ if (order_by != other.order_by) {
+ return order_by < other.order_by;
+ }
+ return max_count < other.max_count;
+ }
+
+ // Wire format (version 1): order_by, then max_count.
+ DENC(OSDPerfMetricLimit, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.order_by, p);
+ denc(v.max_count, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CLASS_DENC(OSDPerfMetricLimit)
+
+std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit);
+
+typedef std::set<OSDPerfMetricLimit> OSDPerfMetricLimits;
+
+// A performance query: how to build the grouping key (key_descriptor) and
+// which counters to collect for each key (performance_counter_descriptors).
+struct OSDPerfMetricQuery {
+  // Orders by key descriptor first, then by the counter descriptor list.
+  bool operator<(const OSDPerfMetricQuery &other) const {
+    if (key_descriptor < other.key_descriptor) {
+      return true;
+    }
+    if (key_descriptor > other.key_descriptor) {
+      return false;
+    }
+    return (performance_counter_descriptors <
+            other.performance_counter_descriptors);
+  }
+
+  OSDPerfMetricQuery() {
+  }
+
+  OSDPerfMetricQuery(
+      const OSDPerfMetricKeyDescriptor &key_descriptor,
+      const PerformanceCounterDescriptors &performance_counter_descriptors)
+    : key_descriptor(key_descriptor),
+      performance_counter_descriptors(performance_counter_descriptors) {
+  }
+
+  // Build a key by calling get_sub_key(descriptor, &sub_key) for every
+  // sub-key descriptor, appending each result to `key`.
+  //
+  // Returns false as soon as get_sub_key() returns false; sub-keys appended
+  // before that point remain in `key`. Returns true otherwise.
+  template <typename L>
+  bool get_key(L&& get_sub_key, OSDPerfMetricKey *key) const {
+    for (auto &sub_key_descriptor : key_descriptor) {
+      OSDPerfMetricSubKey sub_key;
+      if (!get_sub_key(sub_key_descriptor, &sub_key)) {
+        return false;
+      }
+      // sub_key is not used again: move it instead of copying the strings.
+      key->push_back(std::move(sub_key));
+    }
+    return true;
+  }
+
+  // Wire format (version 1): key_descriptor, then the counter descriptors.
+  DENC(OSDPerfMetricQuery, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.key_descriptor, p);
+    denc(v.performance_counter_descriptors, p);
+    DENC_FINISH(p);
+  }
+
+  // Copy this query's counter descriptors into `descriptors`.
+  void get_performance_counter_descriptors(
+      PerformanceCounterDescriptors *descriptors) const {
+    *descriptors = performance_counter_descriptors;
+  }
+
+  // Call update_counter(descriptor, &counter) for every counter descriptor,
+  // pairing descriptors with `counters` positionally. `counters` is grown
+  // with default-constructed entries when it has fewer entries than there
+  // are descriptors.
+  template <typename L>
+  void update_counters(L &&update_counter,
+                       PerformanceCounters *counters) const {
+    auto it = counters->begin();
+    for (auto &descriptor : performance_counter_descriptors) {
+      // TODO: optimize
+      if (it == counters->end()) {
+        // push_back may invalidate `it`, so re-acquire it right after.
+        counters->push_back(PerformanceCounter());
+        it = std::prev(counters->end());
+      }
+      update_counter(descriptor, &(*it));
+      ++it;
+    }
+  }
+
+  // Encode one counter per descriptor into `bl` (defined in the .cc file).
+  void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const;
+
+  OSDPerfMetricKeyDescriptor key_descriptor;
+  PerformanceCounterDescriptors performance_counter_descriptors;
+};
+WRITE_CLASS_DENC(OSDPerfMetricQuery)
+
+// PerfCollector for OSD queries: holds the counters per key for one query id.
+struct OSDPerfCollector : PerfCollector {
+ std::map<OSDPerfMetricKey, PerformanceCounters> counters;
+
+ OSDPerfCollector(MetricQueryID query_id)
+ : PerfCollector(query_id) {
+ }
+};
+
+std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query);
+
+// Report for one query: the counter descriptors plus, for each key, a
+// buffer of packed counters.
+// NOTE(review): the buffers are presumably produced by
+// OSDPerfMetricQuery::pack_counters() -- confirm against the sender.
+struct OSDPerfMetricReport {
+ PerformanceCounterDescriptors performance_counter_descriptors;
+ std::map<OSDPerfMetricKey, ceph::buffer::list> group_packed_performance_counters;
+
+ // Wire format (version 1): descriptors, then the per-key packed counters.
+ DENC(OSDPerfMetricReport, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.performance_counter_descriptors, p);
+ denc(v.group_packed_performance_counters, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CLASS_DENC(OSDPerfMetricReport)
+
+#endif // OSD_PERF_METRIC_H_
+
diff --git a/src/mgr/PyFormatter.cc b/src/mgr/PyFormatter.cc
new file mode 100644
index 000000000..8e58f6e9a
--- /dev/null
+++ b/src/mgr/PyFormatter.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat Inc
+ *
+ * Author: John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "PyFormatter.h"
+#include <fstream>
+
+#define LARGE_SIZE 1024
+
+
+// Start a nested list called `name` under the current cursor and make it the
+// new cursor; the previous cursor is kept on the stack for close_section().
+void PyFormatter::open_array_section(std::string_view name)
+{
+  PyObject *child = PyList_New(0);
+  PyObject *parent = cursor;
+  // dump_pyobject() consumes our reference; the parent container keeps the
+  // child alive from here on.
+  dump_pyobject(name, child);
+  stack.push(parent);
+  cursor = child;
+}
+
+// Start a nested dict called `name` under the current cursor and make it the
+// new cursor; the previous cursor is kept on the stack for close_section().
+void PyFormatter::open_object_section(std::string_view name)
+{
+  PyObject *child = PyDict_New();
+  PyObject *parent = cursor;
+  // dump_pyobject() consumes our reference; the parent container keeps the
+  // child alive from here on.
+  dump_pyobject(name, child);
+  stack.push(parent);
+  cursor = child;
+}
+
+// Add the unsigned 64-bit value `u` under `name` as a Python int.
+void PyFormatter::dump_unsigned(std::string_view name, uint64_t u)
+{
+  // Use the "unsigned long long" constructor: `unsigned long` is only 32 bits
+  // wide on some ABIs, which would silently truncate a uint64_t.
+  PyObject *p = PyLong_FromUnsignedLongLong(u);
+  ceph_assert(p);
+  dump_pyobject(name, p);
+}
+
+// Add the signed 64-bit value `u` under `name` as a Python int.
+void PyFormatter::dump_int(std::string_view name, int64_t u)
+{
+  PyObject *value = PyLong_FromLongLong(u);
+  ceph_assert(value);
+  dump_pyobject(name, value);
+}
+
+// Add the double `d` under `name` as a Python float.
+void PyFormatter::dump_float(std::string_view name, double d)
+{
+  PyObject *value = PyFloat_FromDouble(d);
+  dump_pyobject(name, value);
+}
+
+// Add the UTF-8 text `s` under `name` as a Python str.
+void PyFormatter::dump_string(std::string_view name, std::string_view s)
+{
+  // A string_view is not guaranteed to be NUL-terminated, so pass the
+  // explicit length instead of relying on a terminator after s.data().
+  dump_pyobject(name, PyUnicode_FromStringAndSize(s.data(), s.size()));
+}
+
+// Add `b` under `name` as Python True/False.
+void PyFormatter::dump_bool(std::string_view name, bool b)
+{
+  PyObject *flag = b ? Py_True : Py_False;
+  // dump_pyobject() steals a reference, so take one on the singleton first.
+  Py_INCREF(flag);
+  dump_pyobject(name, flag);
+}
+
+// Hand the caller an ostream to write into.  The stream is remembered
+// together with the current cursor and `name`; its contents are converted to
+// a Python string later, in finish_pending_streams().
+std::ostream& PyFormatter::dump_stream(std::string_view name)
+{
+  auto pending = std::make_shared<PendingStream>();
+  pending->name = name;
+  pending->cursor = cursor;
+
+  pending_streams.push_back(pending);
+
+  return pending->stream;
+}
+
+// printf-style dump: format into a fixed LARGE_SIZE buffer (longer output is
+// truncated by vsnprintf) and add the result under `name` as a Python str.
+// `ns` and `quoted` are not used by this formatter.
+void PyFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char formatted[LARGE_SIZE];
+  vsnprintf(formatted, sizeof(formatted), fmt, ap);
+
+  PyObject *text = PyUnicode_FromString(formatted);
+  dump_pyobject(name, text);
+}
+
+/**
+ * Attach `p` to the container the cursor points at: appended when the cursor
+ * is a list, stored under `name` when it is a dict.  Any other cursor type
+ * aborts.
+ *
+ * Steals reference to `p`
+ */
+void PyFormatter::dump_pyobject(std::string_view name, PyObject *p)
+{
+  // Several callers pass the result of a PyXxx_From* constructor straight
+  // through; fail with a diagnosable assertion rather than dereferencing a
+  // null pointer in Py_DECREF below.
+  ceph_assert(p != nullptr);
+  if (PyList_Check(cursor)) {
+    PyList_Append(cursor, p);
+    Py_DECREF(p);
+  } else if (PyDict_Check(cursor)) {
+    PyObject *key = PyUnicode_DecodeUTF8(name.data(), name.size(), nullptr);
+    ceph_assert(key != nullptr);
+    PyDict_SetItem(cursor, key, p);
+    Py_DECREF(key);
+    Py_DECREF(p);
+  } else {
+    ceph_abort();
+  }
+}
+
+// Convert every stream handed out by dump_stream() into a Python string and
+// attach it at the cursor that was current when the stream was requested.
+// The formatter's own cursor is left where it was.
+void PyFormatter::finish_pending_streams()
+{
+  PyObject *const saved_cursor = cursor;
+
+  for (const auto &pending : pending_streams) {
+    cursor = pending->cursor;
+    const std::string text = pending->stream.str();
+    dump_pyobject(pending->name.c_str(), PyUnicode_FromString(text.c_str()));
+    cursor = saved_cursor;
+  }
+
+  pending_streams.clear();
+}
+
+// Close the section that is still open (if any), flush the JSON text and
+// return it as a new Python bytes object.
+PyObject* PyJSONFormatter::get()
+{
+  if (json_formatter::stack_size()) {
+    close_section();
+  }
+  ceph_assert(!json_formatter::stack_size());
+  std::ostringstream ss;
+  flush(ss);
+  const std::string s = ss.str();
+  // PyBytes_FromStringAndSize copies the buffer; moving a pointer is a no-op,
+  // so pass it directly.
+  return PyBytes_FromStringAndSize(s.data(), s.size());
+}
diff --git a/src/mgr/PyFormatter.h b/src/mgr/PyFormatter.h
new file mode 100644
index 000000000..5e4c0a679
--- /dev/null
+++ b/src/mgr/PyFormatter.h
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat Inc
+ *
+ * Author: John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PY_FORMATTER_H_
+#define PY_FORMATTER_H_
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include <Python.h>
+
+#include <stack>
+#include <string>
+#include <string_view>
+#include <sstream>
+#include <memory>
+#include <list>
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+
+/**
+ * Formatter that builds a tree of Python objects (dicts, lists, scalars)
+ * instead of serialising to text.  `root` is the owned top-level container,
+ * `cursor` the container currently being filled, and `stack` the chain of
+ * enclosing containers to return to on close_section().
+ */
+class PyFormatter : public ceph::Formatter
+{
+public:
+  PyFormatter (const PyFormatter&) = delete;
+  PyFormatter& operator= (const PyFormatter&) = delete;
+
+  // `array` selects a list root instead of a dict root; `pretty` is accepted
+  // but not used here.
+  PyFormatter(bool pretty = false, bool array = false)
+  {
+    // It is forbidden to instantiate me outside of the GIL,
+    // because I construct python objects right away
+
+    // Initialise cursor to an empty dict (or list)
+    if (!array) {
+      root = cursor = PyDict_New();
+    } else {
+      root = cursor = PyList_New(0);
+    }
+  }
+
+  ~PyFormatter() override
+  {
+    cursor = nullptr;
+    Py_DECREF(root);
+    root = nullptr;
+  }
+
+  // Obscure, don't care.
+  void open_array_section_in_ns(std::string_view name, const char *ns) override
+  {ceph_abort();}
+  void open_object_section_in_ns(std::string_view name, const char *ns) override
+  {ceph_abort();}
+
+  // Throw away everything built so far and start again with an empty root of
+  // the same kind (list or dict).
+  void reset() override
+  {
+    const bool array = PyList_Check(root);
+    // The section stack and the pending streams hold pointers into the tree
+    // that is about to be released; forget them so a later close_section()
+    // or get() cannot touch freed objects.
+    while (!stack.empty()) {
+      stack.pop();
+    }
+    pending_streams.clear();
+    Py_DECREF(root);
+    if (array) {
+      root = cursor = PyList_New(0);
+    } else {
+      root = cursor = PyDict_New();
+    }
+  }
+
+  void set_status(int status, const char* status_name) override {}
+  void output_header() override {}
+  void output_footer() override {}
+  void enable_line_break() override {}
+
+  void open_array_section(std::string_view name) override;
+  void open_object_section(std::string_view name) override;
+
+  // Return to the container that was current before the matching open_*().
+  void close_section() override
+  {
+    ceph_assert(cursor != root);
+    ceph_assert(!stack.empty());
+    cursor = stack.top();
+    stack.pop();
+  }
+  void dump_bool(std::string_view name, bool b) override;
+  void dump_unsigned(std::string_view name, uint64_t u) override;
+  void dump_int(std::string_view name, int64_t u) override;
+  void dump_float(std::string_view name, double d) override;
+  void dump_string(std::string_view name, std::string_view s) override;
+  std::ostream& dump_stream(std::string_view name) override;
+  void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+
+  void flush(std::ostream& os) override
+  {
+    // This class is not a serializer: this doesn't make sense
+    ceph_abort();
+  }
+
+  int get_len() const override
+  {
+    // This class is not a serializer: this doesn't make sense
+    ceph_abort();
+    return 0;
+  }
+
+  void write_raw_data(const char *data) override
+  {
+    // This class is not a serializer: this doesn't make sense
+    ceph_abort();
+  }
+
+  // Finalise any dump_stream() output and return a new reference to the
+  // root object; the formatter keeps its own reference.
+  PyObject *get()
+  {
+    finish_pending_streams();
+
+    Py_INCREF(root);
+    return root;
+  }
+
+  void finish_pending_streams();
+
+private:
+  PyObject *root;
+  PyObject *cursor;
+  std::stack<PyObject *> stack;
+
+  void dump_pyobject(std::string_view name, PyObject *p);
+
+  // An ostream handed out by dump_stream(), together with where its final
+  // contents have to be attached.
+  class PendingStream {
+    public:
+    PyObject *cursor;
+    std::string name;
+    std::stringstream stream;
+  };
+
+  std::list<std::shared_ptr<PendingStream> > pending_streams;
+
+};
+
+// JSONFormatter variant whose result is fetched as a Python object via get().
+// The constructor immediately opens an unnamed top-level section (array or
+// object, per `is_array`), so callers start dumping fields right away.
+class PyJSONFormatter : public JSONFormatter {
+public:
+ // Defined in PyFormatter.cc: closes the still-open section, flushes the
+ // JSON text and returns it as a Python bytes object.
+ PyObject *get();
+ PyJSONFormatter (const PyJSONFormatter&) = default;
+ PyJSONFormatter(bool pretty=false, bool is_array=false) : JSONFormatter(pretty) {
+ if(is_array) {
+ open_array_section("");
+ } else {
+ open_object_section("");
+ }
+}
+
+private:
+ using json_formatter = JSONFormatter;
+ // NOTE(review): no definitions for these add_value() declarations are
+ // visible here; presumably they exist to hide the base-class overloads --
+ // confirm before relying on them.
+ template <class T> void add_value(std::string_view name, T val);
+ void add_value(std::string_view name, std::string_view val, bool quoted);
+};
+
+#endif
+
diff --git a/src/mgr/PyModule.cc b/src/mgr/PyModule.cc
new file mode 100644
index 000000000..19d02332d
--- /dev/null
+++ b/src/mgr/PyModule.cc
@@ -0,0 +1,729 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "BaseMgrModule.h"
+#include "BaseMgrStandbyModule.h"
+#include "PyOSDMap.h"
+#include "MgrContext.h"
+#include "PyUtil.h"
+
+#include "PyModule.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[py] "
+
+// definition for non-const static member
+std::string PyModule::mgr_store_prefix = "mgr/";
+
+// Courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
+#define BOOST_BIND_GLOBAL_PLACEHOLDERS
+// Boost apparently can't be bothered to fix its own usage of its own
+// deprecated features.
+#include <boost/python/extract.hpp>
+#include <boost/python/import.hpp>
+#include <boost/python/object.hpp>
+#undef BOOST_BIND_GLOBAL_PLACEHOLDERS
+#include <boost/algorithm/string/predicate.hpp>
+#include "include/ceph_assert.h" // boost clobbers this
+// Fallback used by handle_pyerror() when the `traceback` module itself fails
+// to format the exception: returns "<type name>: <str(value)>" followed by a
+// line describing the formatting failure.  Must be called with the
+// formatting failure still pending; it is left pending on return.
+static std::string describe_unformattable_pyerror(PyObject *exc, PyObject *val)
+{
+  // Capture the formatting failure first, before any of the calls below can
+  // disturb the error indicator.
+  const std::string processing_error = peek_pyerror();
+
+  // Park the pending error while we call back into Python.
+  PyObject *ptype, *pvalue, *ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+
+  // str(obj) as UTF-8; never yields a null pointer.
+  auto to_text = [](PyObject *obj) -> std::string {
+    std::string text = "<unknown>";
+    if (obj != nullptr) {
+      if (PyObject *as_str = PyObject_Str(obj)) {
+        if (const char *utf8 = PyUnicode_AsUTF8(as_str)) {
+          text = utf8;
+        }
+        Py_DECREF(as_str);
+      }
+    }
+    PyErr_Clear();
+    return text;
+  };
+
+  PyObject *name_attr = PyObject_GetAttrString(exc, "__name__");
+  std::stringstream ss;
+  ss << to_text(name_attr) << ": " << to_text(val);
+  Py_XDECREF(name_attr);
+
+  // Put the formatting failure back, as the caller found it.
+  PyErr_Restore(ptype, pvalue, ptraceback);
+
+  ss << "\nError processing exception object: " << processing_error;
+  return ss.str();
+}
+
+// decode a Python exception into a string
+//
+// Fetches (and thereby clears) the pending Python error and renders it with
+// traceback.format_exception(), or traceback.format_exception_only() when
+// there is no traceback object.  If that formatting raises, a shorter
+// best-effort description is returned instead.
+std::string handle_pyerror()
+{
+  using namespace boost::python;
+  using namespace boost;
+
+  PyObject *exc, *val, *tb;
+  object formatted_list, formatted;
+  PyErr_Fetch(&exc, &val, &tb);
+  PyErr_NormalizeException(&exc, &val, &tb);
+  handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
+  object traceback(import("traceback"));
+  // Pick the formatter matching what we have; the lookup stays outside the
+  // try block so lookup failures propagate exactly as before.
+  object formatter(
+    traceback.attr(tb ? "format_exception" : "format_exception_only"));
+  try {
+    if (!tb) {
+      formatted_list = formatter(hexc, hval);
+    } else {
+      formatted_list = formatter(hexc, hval, htb);
+    }
+  } catch (error_already_set const &) {
+    // error while processing exception object
+    // returning only the exception string value
+    return describe_unformattable_pyerror(exc, val);
+  }
+  formatted = str("").join(formatted_list);
+  return extract<std::string>(formatted);
+}
+
+/**
+ * Get the single-line exception message, without clearing any
+ * exception state.
+ *
+ * A Python error must be pending (asserted).  Returns an empty string if
+ * the exception value cannot be rendered as UTF-8 text.
+ */
+std::string peek_pyerror()
+{
+  PyObject *ptype, *pvalue, *ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  ceph_assert(ptype);
+  ceph_assert(pvalue);
+  std::string exc_msg;
+  // Both conversions can fail; never build a std::string from a null pointer.
+  if (PyObject *pvalue_str = PyObject_Str(pvalue)) {
+    if (const char *utf8 = PyUnicode_AsUTF8(pvalue_str)) {
+      exc_msg = utf8;
+    }
+    Py_DECREF(pvalue_str);
+  }
+  // Restoring also discards any error raised by the conversions above.
+  PyErr_Restore(ptype, pvalue, ptraceback);
+
+  return exc_msg;
+}
+
+
+// Minimal file-like Python module ("ceph_logger") whose write() forwards
+// text to the mgr log; init_ceph_logger() installs it as sys.stdout and
+// sys.stderr.
+namespace {
+  // write(text): log `text` at debug level 4, minus one trailing newline.
+  PyObject* log_write(PyObject*, PyObject* args) {
+    // The "s" format yields a pointer into storage owned by the Python
+    // string object, so it must be treated as read-only.
+    const char* m = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &m)) {
+      // Propagate the exception raised by the argument parser.
+      return nullptr;
+    }
+    auto len = strlen(m);
+    if (len && m[len-1] == '\n') {
+      --len;
+    }
+    dout(4) << std::string(m, len) << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // flush(): nothing is buffered here, so this is a no-op.
+  PyObject* log_flush(PyObject*, PyObject*){
+    Py_RETURN_NONE;
+  }
+
+  static PyMethodDef log_methods[] = {
+    {"write", log_write, METH_VARARGS, "write stdout and stderr"},
+    {"flush", log_flush, METH_VARARGS, "flush"},
+    {nullptr, nullptr, 0, nullptr}
+  };
+
+  static PyModuleDef ceph_logger_module = {
+    PyModuleDef_HEAD_INIT,
+    "ceph_logger",
+    nullptr,
+    -1,
+    log_methods,
+  };
+}
+
+PyModuleConfig::PyModuleConfig() = default;
+
+// Copy only the key/value map; the new object gets its own mutex.
+// NOTE(review): mconfig.config is read here without taking mconfig.lock --
+// confirm callers guarantee there is no concurrent set_config().
+PyModuleConfig::PyModuleConfig(PyModuleConfig &mconfig)
+ : config(mconfig.config)
+{}
+
+PyModuleConfig::~PyModuleConfig() = default;
+
+
+/**
+ * Set (or, when `val` is empty, remove) the option "mgr/<module_name>/<key>"
+ * by sending `config set` / `config rm` to the monitors and waiting for the
+ * reply.  The local `config` cache is only updated when the command
+ * succeeded; failures are logged and otherwise ignored.
+ */
+void PyModuleConfig::set_config(
+    MonClient *monc,
+    const std::string &module_name,
+    const std::string &key, const boost::optional<std::string>& val)
+{
+  const std::string global_key = "mgr/" + module_name + "/" + key;
+  Command set_cmd;
+  {
+    std::ostringstream cmd_json;
+    JSONFormatter jf;
+    jf.open_object_section("cmd");
+    if (val) {
+      jf.dump_string("prefix", "config set");
+      jf.dump_string("value", *val);
+    } else {
+      jf.dump_string("prefix", "config rm");
+    }
+    jf.dump_string("who", "mgr");
+    jf.dump_string("name", global_key);
+    jf.close_section();
+    jf.flush(cmd_json);
+    set_cmd.run(monc, cmd_json.str());
+  }
+  set_cmd.wait();
+
+  if (set_cmd.r == 0) {
+    std::lock_guard l(lock);
+    if (val) {
+      config[global_key] = *val;
+    } else {
+      config.erase(global_key);
+    }
+  } else {
+    if (val) {
+      // Log the value itself, not the optional wrapper around it.
+      dout(0) << "`config set mgr " << global_key << " " << *val << "` failed: "
+        << cpp_strerror(set_cmd.r) << dendl;
+    } else {
+      dout(0) << "`config rm mgr " << global_key << "` failed: "
+        << cpp_strerror(set_cmd.r) << dendl;
+    }
+    dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+  }
+}
+
+/**
+ * Return the interpreter's site-packages directories as a ':'-separated
+ * string.  Uses site.getsitepackages() when available; otherwise calls
+ * site.addsitepackages() and returns the resulting sys.path.
+ */
+std::string PyModule::get_site_packages()
+{
+  std::stringstream site_packages;
+
+  // Append every string entry of `path_list`, ':'-separated; entries that
+  // cannot be rendered as UTF-8 are skipped.
+  auto append_paths = [&site_packages](PyObject *path_list, bool log_each) {
+    bool first = true;
+    const Py_ssize_t n = PyList_Size(path_list);
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      const char *path = PyUnicode_AsUTF8(PyList_GetItem(path_list, i));
+      if (path == nullptr) {
+        PyErr_Clear();
+        continue;
+      }
+      if (log_each) {
+        dout(1) << " " << path << dendl;
+      }
+      if (first) {
+        first = false;
+      } else {
+        site_packages << ":";
+      }
+      site_packages << path;
+    }
+  };
+
+  // CPython doesn't auto-add site-packages dirs to sys.path for us,
+  // but it does provide a module that we can ask for them.
+  auto site_module = PyImport_ImportModule("site");
+  ceph_assert(site_module);
+
+  auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
+  if (site_packages_fn != nullptr) {
+    auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
+    ceph_assert(site_packages_list);
+
+    append_paths(site_packages_list, false);
+
+    Py_DECREF(site_packages_list);
+    Py_DECREF(site_packages_fn);
+  } else {
+    // The failed lookup left an AttributeError pending; drop it before
+    // making further Python calls.
+    PyErr_Clear();
+
+    // Fall back to generating our own site-packages paths by imitating
+    // what the standard site.py does.  This is annoying but it lets us
+    // run inside virtualenvs :-/
+
+    auto add_site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages");
+    ceph_assert(add_site_packages_fn);
+
+    auto known_paths = PySet_New(nullptr);
+    auto pArgs = PyTuple_Pack(1, known_paths);
+    // Only the side effect on sys.path matters; release the returned object
+    // and do not leave an error pending if the call failed.
+    PyObject *added = PyObject_CallObject(add_site_packages_fn, pArgs);
+    if (added == nullptr) {
+      PyErr_Clear();
+    }
+    Py_XDECREF(added);
+    Py_DECREF(pArgs);
+    Py_DECREF(known_paths);
+    Py_DECREF(add_site_packages_fn);
+
+    auto sys_module = PyImport_ImportModule("sys");
+    ceph_assert(sys_module);
+    auto sys_path = PyObject_GetAttrString(sys_module, "path");
+    ceph_assert(sys_path);
+
+    dout(1) << "sys.path:" << dendl;
+    append_paths(sys_path, true);
+
+    Py_DECREF(sys_path);
+    Py_DECREF(sys_module);
+  }
+
+  Py_DECREF(site_module);
+
+  return site_packages.str();
+}
+
+// Module init function for "ceph_logger": creates the module and installs it
+// as both sys.stderr and sys.stdout, so Python-level output goes through
+// log_write().
+PyObject* PyModule::init_ceph_logger()
+{
+  PyObject *logger = PyModule_Create(&ceph_logger_module);
+  PySys_SetObject("stderr", logger);
+  PySys_SetObject("stdout", logger);
+  return logger;
+}
+
+// Module init function for "ceph_module": a module with no functions of its
+// own that exposes the C++-implemented base types to Python.
+PyObject* PyModule::init_ceph_module()
+{
+  static PyMethodDef no_methods[] = {
+    {nullptr, nullptr, 0, nullptr}
+  };
+  static PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "ceph_module",
+    nullptr,
+    -1,
+    no_methods,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr
+  };
+  PyObject *module = PyModule_Create(&module_def);
+  ceph_assert(module != nullptr);
+
+  std::map<const char*, PyTypeObject*> exported_types{
+    {"BaseMgrModule", &BaseMgrModuleType},
+    {"BaseMgrStandbyModule", &BaseMgrStandbyModuleType},
+    {"BasePyOSDMap", &BasePyOSDMapType},
+    {"BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType},
+    {"BasePyCRUSH", &BasePyCRUSHType}
+  };
+  for (const auto &entry : exported_types) {
+    const char *type_name = entry.first;
+    PyTypeObject *type_obj = entry.second;
+
+    type_obj->tp_new = PyType_GenericNew;
+    if (PyType_Ready(type_obj) < 0) {
+      ceph_abort();
+    }
+    // Take a reference on behalf of the module before adding the type.
+    Py_INCREF(type_obj);
+
+    PyModule_AddObject(module, type_name, (PyObject *)type_obj);
+  }
+  return module;
+}
+
+/**
+ * Create a dedicated Python sub-interpreter for this module, import the
+ * module in it and collect its metadata (commands, options, notify types,
+ * optional standby class, can_run() verdict).
+ *
+ * @param pMainThreadState main interpreter thread state; must not be null
+ * @return 0 when the module was loaded (it may still report that it cannot
+ *         run), a negative errno value otherwise
+ */
+int PyModule::load(PyThreadState *pMainThreadState)
+{
+  ceph_assert(pMainThreadState != nullptr);
+
+  // Configure sub-interpreter
+  {
+    SafeThreadState sts(pMainThreadState);
+    Gil gil(sts);
+
+    auto thread_state = Py_NewInterpreter();
+    if (thread_state == nullptr) {
+      derr << "Failed to create python sub-interpreter for '" << module_name << "'" << dendl;
+      return -EINVAL;
+    } else {
+      pMyThreadState.set(thread_state);
+      // Some python modules do not cope with an unpopulated argv, so lets
+      // fake one.  This step also picks up site-packages into sys.path.
+      const wchar_t *argv[] = {L"ceph-mgr"};
+      PySys_SetArgv(1, (wchar_t**)argv);
+      // Configure sys.path to include mgr_module_path
+      std::string paths = (g_conf().get_val<std::string>("mgr_module_path") + ':' +
+                           get_site_packages() + ':');
+      std::wstring sys_path(std::wstring(begin(paths), end(paths)) + Py_GetPath());
+      PySys_SetPath(const_cast<wchar_t*>(sys_path.c_str()));
+      dout(10) << "Computed sys.path '"
+               << std::string(begin(sys_path), end(sys_path)) << "'" << dendl;
+    }
+  }
+  // Environment is all good, import the external module
+  {
+    Gil gil(pMyThreadState);
+
+    int r;
+    r = load_subclass_of("MgrModule", &pClass);
+    if (r) {
+      derr << "Class not found in module '" << module_name << "'" << dendl;
+      return r;
+    }
+
+    r = load_commands();
+    if (r != 0) {
+      derr << "Missing or invalid COMMANDS attribute in module '"
+          << module_name << "'" << dendl;
+      error_string = "Missing or invalid COMMANDS attribute";
+      return r;
+    }
+
+    register_options(pClass);
+    r = load_options();
+    if (r != 0) {
+      derr << "Missing or invalid MODULE_OPTIONS attribute in module '"
+          << module_name << "'" << dendl;
+      error_string = "Missing or invalid MODULE_OPTIONS attribute";
+      return r;
+    }
+
+    // The result is deliberately not checked: a module without usable
+    // NOTIFY_TYPES still loads.
+    load_notify_types();
+
+    // We've imported the module and found a MgrModule subclass, at this
+    // point the module is considered loaded.  It might still not be
+    // runnable though, can_run populated later...
+    loaded = true;
+
+    r = load_subclass_of("MgrStandbyModule", &pStandbyClass);
+    if (!r) {
+      dout(4) << "Standby mode available in module '" << module_name
+              << "'" << dendl;
+      register_options(pStandbyClass);
+    } else {
+      dout(4) << "Standby mode not provided by module '" << module_name
+              << "'" << dendl;
+    }
+
+    // Populate can_run by interrogating the module's callback that
+    // may check for dependencies etc
+    PyObject *pCanRunTuple = PyObject_CallMethod(pClass,
+      const_cast<char*>("can_run"), const_cast<char*>("()"));
+    if (pCanRunTuple != nullptr) {
+      // Expected shape: (bool can_run, str reason)
+      if (PyTuple_Check(pCanRunTuple) && PyTuple_Size(pCanRunTuple) == 2) {
+        PyObject *pCanRun = PyTuple_GetItem(pCanRunTuple, 0);
+        PyObject *can_run_str = PyTuple_GetItem(pCanRunTuple, 1);
+        if (!PyBool_Check(pCanRun) || !PyUnicode_Check(can_run_str)) {
+          derr << "Module " << get_name()
+               << " returned wrong type in can_run" << dendl;
+          error_string = "wrong type returned from can_run";
+          can_run = false;
+        } else {
+          can_run = (pCanRun == Py_True);
+          if (!can_run) {
+            error_string = PyUnicode_AsUTF8(can_run_str);
+            dout(4) << "Module " << get_name()
+                    << " reported that it cannot run: "
+                    << error_string << dendl;
+          }
+        }
+      } else {
+        derr << "Module " << get_name()
+             << " returned wrong type in can_run" << dendl;
+        error_string = "wrong type returned from can_run";
+        can_run = false;
+      }
+
+      Py_DECREF(pCanRunTuple);
+    } else {
+      derr << "Exception calling can_run on " << get_name() << dendl;
+      derr << handle_pyerror() << dendl;
+      can_run = false;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Look up the class attribute `attr_name` on pClass, which must be a list of
+ * dicts, and call `fn` on each entry in order.
+ *
+ * @return 0 if every call returned 0; the first non-zero value returned by
+ *         `fn` (iteration stops there); or -EINVAL when the attribute is
+ *         missing, is not a list, or contains a non-dict entry
+ */
+int PyModule::walk_dict_list(
+    const std::string &attr_name,
+    std::function<int(PyObject*)> fn)
+{
+  PyObject *command_list = PyObject_GetAttrString(pClass, attr_name.c_str());
+  if (command_list == nullptr) {
+    derr << "Module " << get_name() << " has missing " << attr_name
+         << " member" << dendl;
+    return -EINVAL;
+  }
+  if (!PyObject_TypeCheck(command_list, &PyList_Type)) {
+    // Relatively easy mistake for human to make, e.g. defining COMMANDS
+    // as a {} instead of a []
+    derr << "Module " << get_name() << " has " << attr_name
+         << " member of wrong type (should be a list)" << dendl;
+    Py_DECREF(command_list);
+    return -EINVAL;
+  }
+
+  // Invoke fn on each item in the list.  Every exit path below falls through
+  // to the single Py_DECREF so the list reference is never leaked.
+  int r = 0;
+  const Py_ssize_t list_size = PyList_Size(command_list);
+  for (Py_ssize_t i = 0; i < list_size; ++i) {
+    PyObject *command = PyList_GetItem(command_list, i);
+    ceph_assert(command != nullptr);
+
+    if (!PyDict_Check(command)) {
+      derr << "Module " << get_name() << " has non-dict entry "
+           << "in " << attr_name << " list" << dendl;
+      r = -EINVAL;
+      break;
+    }
+
+    r = fn(command);
+    if (r != 0) {
+      break;
+    }
+  }
+  Py_DECREF(command_list);
+
+  return r;
+}
+
+// Call cls._register_options(module_name).  A Python exception is logged and
+// otherwise ignored; the return value is always 0.
+int PyModule::register_options(PyObject *cls)
+{
+  PyObject *result = PyObject_CallMethod(
+    cls,
+    const_cast<char*>("_register_options"), const_cast<char*>("(s)"),
+    module_name.c_str());
+  if (result == nullptr) {
+    derr << "Exception calling _register_options on " << get_name()
+         << dendl;
+    derr << handle_pyerror() << dendl;
+    return 0;
+  }
+  Py_DECREF(result);
+  return 0;
+}
+
+/**
+ * Read the class attribute NOTIFY_TYPES (a list of strings) from pClass and
+ * add its entries to `notify_types`.
+ *
+ * @return 0 on success, -EINVAL when the attribute is missing, is not a
+ *         list, or contains a non-string entry
+ */
+int PyModule::load_notify_types()
+{
+  PyObject *ls = PyObject_GetAttrString(pClass, "NOTIFY_TYPES");
+  if (ls == nullptr) {
+    derr << "Module " << get_name() << " has missing NOTIFY_TYPES member" << dendl;
+    return -EINVAL;
+  }
+  if (!PyObject_TypeCheck(ls, &PyList_Type)) {
+    // Relatively easy mistake for human to make, e.g. defining NOTIFY_TYPES
+    // as a {} instead of a []
+    derr << "Module " << get_name() << " has NOTIFY_TYPES that is not a list" << dendl;
+    Py_DECREF(ls);
+    return -EINVAL;
+  }
+
+  const Py_ssize_t list_size = PyList_Size(ls);
+  for (Py_ssize_t i = 0; i < list_size; ++i) {
+    PyObject *notify_type = PyList_GetItem(ls, i);
+    ceph_assert(notify_type != nullptr);
+
+    if (!PyObject_TypeCheck(notify_type, &PyUnicode_Type)) {
+      derr << "Module " << get_name() << " has non-string entry in NOTIFY_TYPES list"
+           << dendl;
+      // Release the list on this early exit as well.
+      Py_DECREF(ls);
+      return -EINVAL;
+    }
+
+    notify_types.insert(PyUnicode_AsUTF8(notify_type));
+  }
+  Py_DECREF(ls);
+  dout(10) << "Module " << get_name() << " notify_types " << notify_types << dendl;
+
+  return 0;
+}
+
+// Ask the class to register its commands (_register_commands), then turn
+// every dict of the COMMANDS list into a ModuleCommand appended to
+// `commands`.  Returns the result of walking the COMMANDS list.
+int PyModule::load_commands()
+{
+  PyObject *registered = PyObject_CallMethod(pClass,
+      const_cast<char*>("_register_commands"), const_cast<char*>("(s)"),
+      module_name.c_str());
+  if (registered == nullptr) {
+    derr << "Exception calling _register_commands on " << get_name()
+         << dendl;
+    derr << handle_pyerror() << dendl;
+  } else {
+    Py_DECREF(registered);
+  }
+
+  const int r = walk_dict_list("COMMANDS", [this](PyObject *entry) -> int {
+    ModuleCommand cmd;
+
+    // "cmd", "desc" and "perm" are mandatory keys
+    PyObject *py_cmd = PyDict_GetItemString(entry, "cmd");
+    ceph_assert(py_cmd != nullptr);
+    cmd.cmdstring = PyUnicode_AsUTF8(py_cmd);
+
+    dout(20) << "loaded command " << cmd.cmdstring << dendl;
+
+    PyObject *py_desc = PyDict_GetItemString(entry, "desc");
+    ceph_assert(py_desc != nullptr);
+    cmd.helpstring = PyUnicode_AsUTF8(py_desc);
+
+    PyObject *py_perm = PyDict_GetItemString(entry, "perm");
+    ceph_assert(py_perm != nullptr);
+    cmd.perm = PyUnicode_AsUTF8(py_perm);
+
+    // "poll" is optional; any truthy value enables polling
+    PyObject *py_poll = PyDict_GetItemString(entry, "poll");
+    cmd.polling = (py_poll != nullptr && PyObject_IsTrue(py_poll));
+
+    cmd.module_name = module_name;
+
+    commands.push_back(std::move(cmd));
+
+    return 0;
+  });
+
+  dout(10) << "loaded " << commands.size() << " commands" << dendl;
+
+  return r;
+}
+
+/**
+ * Turn every dict of the MODULE_OPTIONS list into a MgrMap::ModuleOption
+ * stored in `options` under the option's name.
+ *
+ * Recognised keys: "name" (mandatory), "type", "desc", "long_desc",
+ * "default", "min", "max", "enum_allowed", "see_also", "tags", "runtime".
+ * Values that cannot be converted to UTF-8 text are skipped instead of
+ * being dereferenced as null pointers.
+ *
+ * @return the result of walking the MODULE_OPTIONS list
+ */
+int PyModule::load_options()
+{
+  // Store str(obj) as UTF-8 in *out; returns false (leaving *out untouched
+  // and no Python error pending) when the conversion fails.
+  auto to_utf8 = [](PyObject *obj, std::string *out) -> bool {
+    PyObject *as_str = PyObject_Str(obj);
+    if (as_str == nullptr) {
+      PyErr_Clear();
+      return false;
+    }
+    const char *utf8 = PyUnicode_AsUTF8(as_str);
+    const bool ok = (utf8 != nullptr);
+    if (ok) {
+      *out = utf8;
+    } else {
+      PyErr_Clear();
+    }
+    Py_DECREF(as_str);
+    return ok;
+  };
+
+  // Copy dict[key] into *out only if it is present and a Python str.
+  auto read_str = [&to_utf8](PyObject *dict, const char *key, std::string *out) {
+    PyObject *item = PyDict_GetItemString(dict, key);
+    if (item && PyObject_TypeCheck(item, &PyUnicode_Type)) {
+      to_utf8(item, out);
+    }
+  };
+
+  // Copy str(dict[key]) into *out if the key is present, whatever its type.
+  auto read_any = [&to_utf8](PyObject *dict, const char *key, std::string *out) {
+    PyObject *item = PyDict_GetItemString(dict, key);
+    if (item) {
+      to_utf8(item, out);
+    }
+  };
+
+  // Insert the entries of the list dict[key] into `out`.  With
+  // strings_only, non-str entries are ignored; otherwise str() is applied.
+  auto read_list = [&to_utf8](PyObject *dict, const char *key,
+                              bool strings_only, auto &out) {
+    PyObject *list = PyDict_GetItemString(dict, key);
+    if (!list || !PyObject_TypeCheck(list, &PyList_Type)) {
+      return;
+    }
+    const Py_ssize_t n = PyList_Size(list);
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      PyObject *item = PyList_GetItem(list, i);
+      if (!item) {
+        continue;
+      }
+      if (strings_only && !PyObject_TypeCheck(item, &PyUnicode_Type)) {
+        continue;
+      }
+      std::string text;
+      if (to_utf8(item, &text)) {
+        out.insert(text);
+      }
+    }
+  };
+
+  int r = walk_dict_list("MODULE_OPTIONS",
+      [this, &to_utf8, &read_str, &read_any, &read_list](PyObject *pOption) -> int {
+    MgrMap::ModuleOption option;
+
+    PyObject *p = PyDict_GetItemString(pOption, "name");
+    ceph_assert(p != nullptr);
+    const char *name_utf8 = PyUnicode_AsUTF8(p);
+    ceph_assert(name_utf8 != nullptr);
+    option.name = name_utf8;
+
+    // Options are strings unless a recognised "type" says otherwise.
+    option.type = Option::TYPE_STR;
+    std::string type_name;
+    p = PyDict_GetItemString(pOption, "type");
+    if (p && PyObject_TypeCheck(p, &PyUnicode_Type) && to_utf8(p, &type_name)) {
+      int t = Option::str_to_type(type_name);
+      if (t >= 0) {
+        option.type = t;
+      }
+    }
+
+    read_str(pOption, "desc", &option.desc);
+    read_str(pOption, "long_desc", &option.long_desc);
+    read_any(pOption, "default", &option.default_value);
+    read_any(pOption, "min", &option.min);
+    read_any(pOption, "max", &option.max);
+    read_list(pOption, "enum_allowed", false, option.enum_allowed);
+    read_list(pOption, "see_also", true, option.see_also);
+    read_list(pOption, "tags", true, option.tags);
+
+    p = PyDict_GetItemString(pOption, "runtime");
+    if (p && PyObject_TypeCheck(p, &PyBool_Type)) {
+      if (p == Py_True) {
+        option.flags |= Option::FLAG_RUNTIME;
+      }
+      if (p == Py_False) {
+        option.flags &= ~Option::FLAG_RUNTIME;
+      }
+    }
+    dout(20) << "loaded module option " << option.name << dendl;
+    options[option.name] = std::move(option);
+    return 0;
+  });
+
+  dout(10) << "loaded " << options.size() << " options" << dendl;
+
+  return r;
+}
+
+// True if `option_name` is one of the options loaded from MODULE_OPTIONS.
+bool PyModule::is_option(const std::string &option_name)
+{
+  std::lock_guard l(lock);
+  return options.find(option_name) != options.end();
+}
+
+// Convert the textual `value` of option `name` into a Python object of the
+// option's declared type; options that are not registered are returned as a
+// plain Python str.
+PyObject *PyModule::get_typed_option_value(const std::string& name,
+                                           const std::string& value)
+{
+  // we don't need to hold a lock here because these MODULE_OPTIONS
+  // are set up exactly once during startup.
+  const auto found = options.find(name);
+  if (found == options.end()) {
+    return PyUnicode_FromString(value.c_str());
+  }
+  return get_python_typed_option_value((Option::type_t)found->second.type, value);
+}
+
+/**
+ * Import this plugin and find the class in it that derives from
+ * mgr_module.<base_class>.
+ *
+ * On success *py_class receives a new (owned) reference to the class; when
+ * several candidates exist only the first one found is used and the others
+ * are reported and ignored.
+ *
+ * @return 0 on success; -EINVAL if mgr_module or the base class cannot be
+ *         loaded, or no subclass exists; -ENOENT if the plugin itself
+ *         cannot be imported
+ */
+int PyModule::load_subclass_of(const char* base_class, PyObject** py_class)
+{
+  // load the base class
+  PyObject *mgr_module = PyImport_ImportModule("mgr_module");
+  if (!mgr_module) {
+    error_string = peek_pyerror();
+    derr << "Module not found: 'mgr_module'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  }
+  auto mgr_module_type = PyObject_GetAttrString(mgr_module, base_class);
+  Py_DECREF(mgr_module);
+  if (!mgr_module_type) {
+    error_string = peek_pyerror();
+    derr << "Unable to import " << base_class << " from mgr_module" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  }
+
+  // find the sub class
+  PyObject *plugin_module = PyImport_ImportModule(module_name.c_str());
+  if (!plugin_module) {
+    error_string = peek_pyerror();
+    derr << "Module not found: '" << module_name << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    // Do not leak the base class reference on this early exit.
+    Py_DECREF(mgr_module_type);
+    return -ENOENT;
+  }
+  // Borrowed reference to the module's namespace dict.
+  auto locals = PyModule_GetDict(plugin_module);
+  Py_DECREF(plugin_module);
+  PyObject *key, *value;
+  Py_ssize_t pos = 0;
+  *py_class = nullptr;
+  while (PyDict_Next(locals, &pos, &key, &value)) {
+    if (!PyType_Check(value)) {
+      continue;
+    }
+    const int is_subclass = PyObject_IsSubclass(value, mgr_module_type);
+    if (is_subclass != 1) {
+      if (is_subclass < 0) {
+        // The check itself raised; treat the candidate as "not a subclass".
+        PyErr_Clear();
+      }
+      continue;
+    }
+    // Skip the base class itself (e.g. when the plugin imported it by name).
+    if (PyObject_RichCompareBool(value, mgr_module_type, Py_EQ)) {
+      continue;
+    }
+    auto class_name = PyUnicode_AsUTF8(key);
+    if (*py_class) {
+      derr << __func__ << ": ignoring '"
+           << module_name << "." << class_name << "'"
+           << ": only one '" << base_class
+           << "' class is loaded from each plugin" << dendl;
+      continue;
+    }
+    // PyDict_Next hands out borrowed references, while ~PyModule releases
+    // the stored classes; take our own reference to keep the count balanced.
+    Py_INCREF(value);
+    *py_class = value;
+    dout(4) << __func__ << ": found class: '"
+            << module_name << "." << class_name << "'" << dendl;
+  }
+  Py_DECREF(mgr_module_type);
+
+  return *py_class ? 0 : -EINVAL;
+}
+
+// Release the references to the module classes.  This only happens when a
+// thread state was set for this module (see load()); the Gil guard is held
+// while the references are dropped.
+// NOTE(review): the meaning of the second Gil argument (true) is not visible
+// here -- see Gil.h.
+PyModule::~PyModule()
+{
+ if (pMyThreadState.ts != nullptr) {
+ Gil gil(pMyThreadState, true);
+ Py_XDECREF(pClass);
+ Py_XDECREF(pStandbyClass);
+ }
+}
+
diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h
new file mode 100644
index 000000000..fe2e16238
--- /dev/null
+++ b/src/mgr/PyModule.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <boost/optional.hpp>
+#include "common/ceph_mutex.h"
+#include "Python.h"
+#include "Gil.h"
+#include "mon/MgrMap.h"
+
+
+class MonClient;
+
+std::string handle_pyerror();
+
+std::string peek_pyerror();
+
+/**
+ * A Ceph CLI command description provided from a Python module
+ */
+class ModuleCommand {
+public:
+  // Command signature, as given by the "cmd" entry of a COMMANDS dict
+  std::string cmdstring;
+  // Help text, from the "desc" entry
+  std::string helpstring;
+  // Required permission, from the "perm" entry
+  std::string perm;
+  // Set when the COMMANDS dict has a truthy "poll" entry.  Initialised so a
+  // default-constructed command never carries an indeterminate value.
+  bool polling = false;
+
+  // Call the ActivePyModule of this name to handle the command
+  std::string module_name;
+};
+
+// One python mgr module: holds the module's sub-interpreter thread state,
+// the Python classes found in it, and the metadata (commands, options,
+// notify types, status flags) gathered by load().
+class PyModule
+{
+ mutable ceph::mutex lock = ceph::make_mutex("PyModule::lock");
+private:
+ const std::string module_name;
+ std::string get_site_packages();
+ int load_subclass_of(const char* class_name, PyObject** py_class);
+
+ // Did the MgrMap identify this module as one that should run?
+ bool enabled = false;
+
+ // Did the MgrMap flag this module as always on?
+ bool always_on = false;
+
+ // Did we successfully import this python module and look up symbols?
+ // (i.e. is it possible to instantiate a MgrModule subclass instance?)
+ bool loaded = false;
+
+ // Did the module identify itself as being able to run?
+ // (i.e. should we expect instantiating and calling serve() to work?)
+ bool can_run = false;
+
+ // Did the module encounter an unexpected error while running?
+ // (e.g. throwing an exception from serve())
+ bool failed = false;
+
+ // Populated if loaded, can_run or failed indicates a problem
+ std::string error_string;
+
+ // Helper for loading MODULE_OPTIONS and COMMANDS members
+ int walk_dict_list(
+ const std::string &attr_name,
+ std::function<int(PyObject*)> fn);
+
+ int load_commands();
+ std::vector<ModuleCommand> commands;
+
+ int register_options(PyObject *cls);
+ int load_options();
+ std::map<std::string, MgrMap::ModuleOption> options;
+
+ int load_notify_types();
+ std::set<std::string> notify_types;
+
+public:
+ static std::string mgr_store_prefix;
+
+ SafeThreadState pMyThreadState;
+ PyObject *pClass = nullptr;
+ PyObject *pStandbyClass = nullptr;
+
+ explicit PyModule(const std::string &module_name_)
+ : module_name(module_name_)
+ {
+ }
+
+ ~PyModule();
+
+ bool is_option(const std::string &option_name);
+ // Returns a reference to the options map without taking the lock.
+ const std::map<std::string,MgrMap::ModuleOption>& get_options() const {
+ return options;
+ }
+
+ PyObject *get_typed_option_value(
+ const std::string& option,
+ const std::string& value);
+
+ int load(PyThreadState *pMainThreadState);
+ static PyObject* init_ceph_logger();
+ static PyObject* init_ceph_module();
+
+ // NOTE(review): the two setters below write without taking `lock`, while
+ // the matching getters do take it -- confirm this is intentional.
+ void set_enabled(const bool enabled_)
+ {
+ enabled = enabled_;
+ }
+
+ void set_always_on(const bool always_on_) {
+ always_on = always_on_;
+ }
+
+ /**
+ * Extend `out` with the contents of `this->commands`
+ */
+ void get_commands(std::vector<ModuleCommand> *out) const
+ {
+ std::lock_guard l(lock);
+ ceph_assert(out != nullptr);
+ out->insert(out->end(), commands.begin(), commands.end());
+ }
+
+
+ /**
+ * Mark the module as failed, recording the reason in the error
+ * string.
+ */
+ void fail(const std::string &reason)
+ {
+ std::lock_guard l(lock);
+ failed = true;
+ error_string = reason;
+ }
+
+ // A module counts as enabled when it is either enabled or always-on.
+ bool is_enabled() const {
+ std::lock_guard l(lock);
+ return enabled || always_on;
+ }
+
+ bool is_failed() const { std::lock_guard l(lock) ; return failed; }
+ bool is_loaded() const { std::lock_guard l(lock) ; return loaded; }
+ bool is_always_on() const { std::lock_guard l(lock) ; return always_on; }
+
+ // True if `notify_type` was listed in the module's NOTIFY_TYPES.
+ bool should_notify(const std::string& notify_type) const {
+ return notify_types.count(notify_type);
+ }
+
+ const std::string &get_name() const {
+ std::lock_guard l(lock) ; return module_name;
+ }
+ // NOTE(review): this returns a reference to error_string, and the lock is
+ // released when the function returns; a concurrent fail() could modify
+ // the string while the caller still reads it -- confirm callers copy.
+ const std::string &get_error_string() const {
+ std::lock_guard l(lock) ; return error_string;
+ }
+ bool get_can_run() const {
+ std::lock_guard l(lock) ; return can_run;
+ }
+};
+
+typedef std::shared_ptr<PyModule> PyModuleRef;
+
+// Cache of module configuration values, keyed by the full
+// "mgr/<module>/<key>" name (see set_config() in PyModule.cc).  `lock` is
+// public; set_config() takes it when it updates `config`.
+class PyModuleConfig {
+public:
+ mutable ceph::mutex lock = ceph::make_mutex("PyModuleConfig::lock");
+ std::map<std::string, std::string> config;
+
+ PyModuleConfig();
+
+ // Copies only `config`; note that the parameter is a non-const reference.
+ PyModuleConfig(PyModuleConfig &mconfig);
+
+ ~PyModuleConfig();
+
+ // Set the option via the monitors, or remove it when `val` is empty; the
+ // local cache is updated only if the monitor command succeeded.
+ void set_config(
+ MonClient *monc,
+ const std::string &module_name,
+ const std::string &key, const boost::optional<std::string>& val);
+
+};
diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc
new file mode 100644
index 000000000..1ae44143c
--- /dev/null
+++ b/src/mgr/PyModuleRegistry.cc
@@ -0,0 +1,454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "PyModuleRegistry.h"
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#error std::filesystem not available!
+#endif
+
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/split.h"
+
+#include "BaseMgrModule.h"
+#include "PyOSDMap.h"
+#include "BaseMgrStandbyModule.h"
+#include "Gil.h"
+#include "MgrContext.h"
+#include "mgr/mgr_commands.h"
+
+#include "ActivePyModules.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[py] "
+
+// Names of modules that get_health_checks() skips when it looks for
+// always-on modules that are missing from the active set.
+// NOTE(review): this has external linkage and is mutable; if no other
+// translation unit refers to it, it could be made file-local and const.
+std::set<std::string> obsolete_modules = {
+ "orchestrator_cli",
+};
+
+// Start the embedded Python interpreter, then discover and load every
+// module found under the `mgr_module_path` config option. Modules that
+// fail to load are still recorded in `modules` so their error can be
+// reported later; failures are also sent to the cluster log.
+void PyModuleRegistry::init()
+{
+ std::lock_guard locker(lock);
+
+ // Set up global python interpreter
+ // NOTE(review): `#s` stringifies the macro argument exactly as written
+ // (arguments of # and ## are not macro-expanded), so this yields the
+ // literal L"MGR_PYTHON_EXECUTABLE" rather than the macro's value —
+ // confirm that this is the intended program name.
+#define WCHAR(s) L ## #s
+ Py_SetProgramName(const_cast<wchar_t*>(WCHAR(MGR_PYTHON_EXECUTABLE)));
+#undef WCHAR
+ // Add more modules
+ // (the ceph_logger builtin is only registered when daemonized)
+ if (g_conf().get_val<bool>("daemonize")) {
+ PyImport_AppendInittab("ceph_logger", PyModule::init_ceph_logger);
+ }
+ PyImport_AppendInittab("ceph_module", PyModule::init_ceph_module);
+ Py_InitializeEx(0);
+#if PY_VERSION_HEX < 0x03090000
+ // Let CPython know that we will be calling it back from other
+ // threads in future.
+ if (! PyEval_ThreadsInitialized()) {
+ PyEval_InitThreads();
+ }
+#endif
+ // Drop the GIL and remember the main thread state (current
+ // thread state becomes NULL)
+ pMainThreadState = PyEval_SaveThread();
+ ceph_assert(pMainThreadState != nullptr);
+
+ // Names of modules whose load() returned non-zero
+ std::list<std::string> failed_modules;
+
+ const std::string module_path = g_conf().get_val<std::string>("mgr_module_path");
+ std::set<std::string> module_names = probe_modules(module_path);
+ // Load python code
+ for (const auto& module_name : module_names) {
+ dout(1) << "Loading python module '" << module_name << "'" << dendl;
+
+ // Everything starts disabled, set enabled flag on module
+ // when we see first MgrMap
+ auto mod = std::make_shared<PyModule>(module_name);
+ int r = mod->load(pMainThreadState);
+ if (r != 0) {
+ // Don't use handle_pyerror() here; we don't have the GIL
+ // or the right thread state (this is deliberate).
+ derr << "Error loading module '" << module_name << "': "
+ << cpp_strerror(r) << dendl;
+ failed_modules.push_back(module_name);
+ // Don't drop out here, load the other modules
+ }
+
+ // Record the module even if the load failed, so that we can
+ // report its loading error
+ modules[module_name] = std::move(mod);
+ }
+ // Surface "nothing found" and "some failed" in the cluster log
+ if (module_names.empty()) {
+ clog->error() << "No ceph-mgr modules found in " << module_path;
+ }
+ if (!failed_modules.empty()) {
+ clog->error() << "Failed to load ceph-mgr modules: " << joinify(
+ failed_modules.begin(), failed_modules.end(), std::string(", "));
+ }
+}
+
+// Take a new MgrMap. The first map seen (stored epoch still 0) seeds the
+// enabled/always-on flags of every loaded module and returns false.
+// Later maps are forwarded to the standby modules (if any) and the
+// return value says whether the enabled or always-on module sets differ
+// from the previously stored map.
+bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_)
+{
+  std::lock_guard guard(lock);
+
+  const bool first_map = (mgr_map.epoch == 0);
+  if (!first_map) {
+    // Compare against the old map before overwriting it
+    const bool modules_changed =
+      mgr_map_.modules != mgr_map.modules ||
+      mgr_map_.always_on_modules != mgr_map.always_on_modules;
+    mgr_map = mgr_map_;
+
+    if (standby_modules) {
+      standby_modules->handle_mgr_map(mgr_map_);
+    }
+
+    return modules_changed;
+  }
+
+  mgr_map = mgr_map_;
+
+  // First time we see MgrMap, set the enabled flags on modules.
+  // This should always happen before someone calls standby_start
+  // or active_start
+  for (const auto &entry : modules) {
+    const std::string &name = entry.first;
+    const PyModuleRef &mod = entry.second;
+    mod->set_enabled(mgr_map.modules.count(name) > 0);
+    mod->set_always_on(mgr_map.get_always_on_modules().count(name) > 0);
+  }
+
+  return false;
+}
+
+
+
+// Create the StandbyPyModules container and start every enabled, runnable
+// module that provides a standby class. Always-on modules that have a
+// standby class but cannot run are reported to the cluster log.
+void PyModuleRegistry::standby_start(MonClient &mc, Finisher &f)
+{
+  std::lock_guard guard(lock);
+  ceph_assert(active_modules == nullptr);
+  ceph_assert(standby_modules == nullptr);
+
+  // Must have seen a MgrMap by this point, in order to know
+  // which modules should be enabled
+  ceph_assert(mgr_map.epoch > 0);
+
+  dout(4) << "Starting modules in standby mode" << dendl;
+
+  standby_modules.reset(new StandbyPyModules(
+        mgr_map, module_config, clog, mc, f));
+
+  std::set<std::string> failed_modules;
+  for (const auto &entry : modules) {
+    const PyModuleRef &mod = entry.second;
+
+    if (mod->is_enabled() && mod->get_can_run()) {
+      if (mod->pStandbyClass) {
+        dout(4) << "starting module " << mod->get_name() << dendl;
+        standby_modules->start_one(mod);
+      } else {
+        dout(4) << "skipping module '" << mod->get_name() << "' because "
+                   "it does not implement a standby mode" << dendl;
+      }
+      continue;
+    }
+
+    // report always_on modules with a standby mode that won't run
+    if (mod->is_always_on() && mod->pStandbyClass) {
+      failed_modules.insert(mod->get_name());
+    }
+  }
+
+  if (failed_modules.empty()) {
+    return;
+  }
+  clog->error() << "Failed to execute ceph-mgr module(s) in standby mode: "
+                << joinify(failed_modules.begin(), failed_modules.end(),
+                           std::string(", "));
+}
+
+// Switch to active mode: stop any standby modules, build the
+// ActivePyModules container from the supplied services, and start every
+// module that is both enabled and loaded.
+void PyModuleRegistry::active_start(
+  DaemonStateIndex &ds, ClusterState &cs,
+  const std::map<std::string, std::string> &kv_store,
+  bool mon_provides_kv_sub,
+  MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_,
+  Objecter &objecter_, Client &client_, Finisher &f,
+  DaemonServer &server)
+{
+  std::lock_guard guard(lock);
+
+  dout(4) << "Starting modules in active mode" << dendl;
+
+  ceph_assert(active_modules == nullptr);
+
+  // Must have seen a MgrMap by this point, in order to know
+  // which modules should be enabled
+  ceph_assert(mgr_map.epoch > 0);
+
+  if (standby_modules) {
+    standby_modules->shutdown();
+    standby_modules.reset();
+  }
+
+  active_modules.reset(
+    new ActivePyModules(
+      module_config,
+      kv_store, mon_provides_kv_sub,
+      ds, cs, mc,
+      clog_, audit_clog_, objecter_, client_, f, server,
+      *this));
+
+  for (const auto &[name, mod] : modules) {
+    // Anything we're skipping because of !can_run will be flagged
+    // to the user separately via get_health_checks
+    if (mod->is_enabled() && mod->is_loaded()) {
+      dout(4) << "Starting " << name << dendl;
+      active_modules->start_one(mod);
+    }
+  }
+}
+
+// Shut down and discard the active module container, if there is one.
+void PyModuleRegistry::active_shutdown()
+{
+  std::lock_guard guard(lock);
+
+  if (!active_modules) {
+    return;
+  }
+  active_modules->shutdown();
+  active_modules.reset();
+}
+
+// Stop the standby modules (if any), drop every module reference and
+// finalize the Python interpreter.
+void PyModuleRegistry::shutdown()
+{
+  std::lock_guard guard(lock);
+
+  if (standby_modules) {
+    standby_modules->shutdown();
+    standby_modules.reset();
+  }
+
+  // Ideally we would now do this for every module:
+  //
+  //    Py_EndInterpreter(pMyThreadState);
+  //    PyThreadState_Swap(pMainThreadState);
+  //
+  // Unfortunately, if a module still has other *python* threads active
+  // at this point, Py_EndInterpreter() aborts with:
+  //
+  //    Fatal Python error: Py_EndInterpreter: not the last thread
+  //
+  // This can happen when a module uses CherryPy, because CherryPy runs
+  // an extra thread as a timeout monitor which spends most of its life
+  // inside a time.sleep(60). Unless the timing is very lucky, that
+  // thread is still sleeping when we get here and Py_EndInterpreter()
+  // aborts.
+  //
+  // The same can of course happen with a poorly written module that
+  // makes no attempt to clean up the threads it created.
+  //
+  // The safest thing to do is to not call Py_EndInterpreter() at all and
+  // let Py_Finalize() kill everything once all modules are shut down.
+
+  modules.clear();
+
+  PyEval_RestoreThread(pMainThreadState);
+  Py_Finalize();
+}
+
+/**
+ * Discover python modules on local disk.
+ *
+ * A module is any directory directly under `path` that contains a
+ * `module.py` file and whose name is not listed in the
+ * `mgr_disabled_modules` config option.
+ *
+ * @param path directory to scan
+ * @return the set of module (directory) names found; empty if `path`
+ *         cannot be opened, in which case the error is logged instead
+ *         of an exception escaping to the caller
+ */
+std::set<std::string> PyModuleRegistry::probe_modules(const std::string &path) const
+{
+  const auto opt = g_conf().get_val<std::string>("mgr_disabled_modules");
+  const auto disabled_modules = ceph::split(opt);
+
+  std::set<std::string> modules;
+
+  // Use the non-throwing overloads: a missing or unreadable module path
+  // must not take the daemon down with an uncaught filesystem_error.
+  std::error_code ec;
+  fs::directory_iterator dir(path, ec);
+  if (ec) {
+    derr << "Unable to list module path '" << path << "': "
+         << ec.message() << dendl;
+    return modules;
+  }
+
+  for (const auto& entry: dir) {
+    if (!fs::is_directory(entry.path(), ec)) {
+      continue;
+    }
+    const std::string name = entry.path().filename().string();
+    if (std::count(disabled_modules.begin(), disabled_modules.end(), name)) {
+      dout(10) << "ignoring disabled module " << name << dendl;
+      continue;
+    }
+    const auto module_path = entry.path() / "module.py";
+    if (fs::exists(module_path, ec)) {
+      modules.emplace(name);
+    }
+  }
+  return modules;
+}
+
+// Forward a module command to the active modules. Returns -EAGAIN when
+// the active module container has not been created yet, otherwise
+// whatever ActivePyModules::handle_command returns.
+int PyModuleRegistry::handle_command(
+  const ModuleCommand& module_command,
+  const MgrSession& session,
+  const cmdmap_t &cmdmap,
+  const bufferlist &inbuf,
+  std::stringstream *ds,
+  std::stringstream *ss)
+{
+  if (!active_modules) {
+    // We do not expect to be called before the active modules are up,
+    // but it's straightforward to handle this case so let's do it.
+    return -EAGAIN;
+  }
+
+  return active_modules->handle_command(module_command, session, cmdmap,
+                                        inbuf, ds, ss);
+}
+
+// Collect the commands of every known module into a single vector.
+std::vector<ModuleCommand> PyModuleRegistry::get_py_commands() const
+{
+  std::lock_guard guard(lock);
+
+  std::vector<ModuleCommand> all_commands;
+  for (const auto &entry : modules) {
+    const PyModuleRef &mod = entry.second;
+    mod->get_commands(&all_commands);
+  }
+
+  return all_commands;
+}
+
+// Convert every module command into a MonCommand in the "mgr" module
+// namespace, flagged FLAG_MGR and additionally FLAG_POLL for polling
+// commands.
+std::vector<MonCommand> PyModuleRegistry::get_commands() const
+{
+  const std::vector<ModuleCommand> py_commands = get_py_commands();
+
+  std::vector<MonCommand> result;
+  for (const auto &pyc : py_commands) {
+    uint64_t flags = MonCommand::FLAG_MGR;
+    if (pyc.polling) {
+      flags |= MonCommand::FLAG_POLL;
+    }
+    result.push_back({pyc.cmdstring, pyc.helpstring, "mgr",
+                      pyc.perm, flags});
+  }
+  return result;
+}
+
+// Add module-related health checks to `checks`: first whatever the
+// active modules report themselves, then MGR_MODULE_DEPENDENCY
+// (HEALTH_WARN) for enabled modules that say they cannot run and
+// MGR_MODULE_ERROR (HEALTH_ERR) for modules that failed or are missing.
+// Does nothing unless the active module container exists.
+void PyModuleRegistry::get_health_checks(health_check_map_t *checks)
+{
+  std::lock_guard guard(lock);
+
+  // Only the active mgr reports module issues
+  if (!active_modules) {
+    return;
+  }
+
+  active_modules->get_health_checks(checks);
+
+  // Both maps: module name -> error string
+  std::map<std::string, std::string> dependency_modules;
+  std::map<std::string, std::string> failed_modules;
+
+  /*
+   * Broken modules fall into two categories:
+   * - can_run=false: the module works but is explicitly telling us that
+   *   a dependency is missing. The user should read the module's message
+   *   and install what is missing.
+   * - failed=true or loaded=false: something unexpected broke, either at
+   *   runtime (from serve()) or at load time. This indicates a bug; the
+   *   user should inspect the mgr log to investigate and gather evidence.
+   */
+  for (const auto &entry : modules) {
+    const PyModuleRef &mod = entry.second;
+
+    if (mod->is_enabled() && !mod->get_can_run()) {
+      dependency_modules[mod->get_name()] = mod->get_error_string();
+      continue;
+    }
+
+    // - Unloadable modules are only reported if they're enabled, to
+    //   avoid spamming users about modules whose dependencies they have
+    //   not installed because they don't use them.
+    // - Failed modules are only reported if they passed the can_run
+    //   checks (to avoid two health messages about a module that said
+    //   can_run=false but that we tried running anyway)
+    if ((mod->is_enabled() && !mod->is_loaded())
+        || (mod->is_failed() && mod->get_can_run())) {
+      failed_modules[mod->get_name()] = mod->get_error_string();
+    }
+  }
+
+  // report failed always_on modules as health errors
+  for (const auto& name : mgr_map.get_always_on_modules()) {
+    if (obsolete_modules.count(name)
+        || active_modules->is_pending(name)
+        || active_modules->module_exists(name)) {
+      continue;
+    }
+    const bool already_reported =
+      failed_modules.count(name) > 0 || dependency_modules.count(name) > 0;
+    if (!already_reported) {
+      failed_modules[name] = "Not found or unloadable";
+    }
+  }
+
+  // Emit one health check for a non-empty name->error map. With a single
+  // entry the summary names the module; otherwise it gives the count.
+  // Every entry always gets its own detail line.
+  auto report = [checks](const char *code, auto severity,
+                         const char *single_infix,
+                         const char *plural_suffix,
+                         const std::map<std::string, std::string> &broken) {
+    if (broken.empty()) {
+      return;
+    }
+    std::ostringstream summary;
+    if (broken.size() == 1) {
+      const auto &only = *broken.begin();
+      summary << "Module '" << only.first << single_infix << only.second;
+    } else {
+      summary << broken.size() << plural_suffix;
+    }
+    auto& d = checks->add(code, severity, summary.str(), broken.size());
+    for (const auto &item : broken) {
+      std::ostringstream line;
+      line << "Module '" << item.first << single_infix << item.second;
+      d.detail.push_back(line.str());
+    }
+  };
+
+  report("MGR_MODULE_DEPENDENCY", HEALTH_WARN,
+         "' has failed dependency: ",
+         " mgr modules have failed dependencies",
+         dependency_modules);
+  report("MGR_MODULE_ERROR", HEALTH_ERR,
+         "' has failed: ",
+         " mgr modules have failed",
+         failed_modules);
+}
+
+// Apply one config entry to `module_config`: a non-empty value is
+// stored under `k`, an empty value removes `k`.
+void PyModuleRegistry::handle_config(const std::string &k, const std::string &v)
+{
+  std::lock_guard guard(module_config.lock);
+
+  if (v.empty()) {
+    module_config.config.erase(k);
+    return;
+  }
+
+  // The value is deliberately left out of the log line so that
+  // sensitive data does not end up in the mgr logs. For debugging:
+  // dout(10) << "Loaded module_config entry " << k << ":" << v << dendl;
+  dout(10) << "Loaded module_config entry " << k << ":" << dendl;
+  module_config.config[k] = v;
+}
+
+// Tell the active modules, if any, that configuration has changed.
+void PyModuleRegistry::handle_config_notify()
+{
+  std::lock_guard guard(lock);
+  if (!active_modules) {
+    return;
+  }
+  active_modules->config_notify();
+}
diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h
new file mode 100644
index 000000000..6c72af893
--- /dev/null
+++ b/src/mgr/PyModuleRegistry.h
@@ -0,0 +1,231 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+// First because it includes Python.h
+#include "PyModule.h"
+
+#include <string>
+#include <map>
+#include <set>
+#include <memory>
+
+#include "common/LogClient.h"
+
+#include "ActivePyModules.h"
+#include "StandbyPyModules.h"
+
+class MgrSession;
+
+/**
+ * This class is responsible for setting up the python runtime environment
+ * and importing the python modules.
+ *
+ * It is *not* responsible for constructing instances of their BaseMgrModule
+ * subclasses: that is the job of ActivePyModules (or StandbyPyModules while
+ * in standby), which consume the module references that we load here.
+ */
+class PyModuleRegistry
+{
+private:
+  // Taken by most methods of this class before they touch the members
+  // below; a few pass-through methods deliberately do not take it.
+  mutable ceph::mutex lock = ceph::make_mutex("PyModuleRegistry::lock");
+  LogChannelRef clog;
+
+  // Every module discovered by init(), keyed by module name. Modules
+  // whose load failed are kept too so that their error can be reported.
+  std::map<std::string, PyModuleRef> modules;
+  // Client address vectors recorded through register_client(), keyed by
+  // the name they were registered under.
+  std::multimap<std::string, entity_addrvec_t> clients;
+
+  // Set between active_start() and active_shutdown()
+  std::unique_ptr<ActivePyModules> active_modules;
+  // Set between standby_start() and active_start()/shutdown()
+  std::unique_ptr<StandbyPyModules> standby_modules;
+
+  // Main interpreter thread state saved by init(). Initialised to null
+  // so that shutdown() never hands an indeterminate pointer to
+  // PyEval_RestoreThread() if init() was not run.
+  PyThreadState *pMainThreadState = nullptr;
+
+  // We have our own copy of MgrMap, because we are constructed
+  // before ClusterState exists.
+  MgrMap mgr_map;
+
+  /**
+   * Discover python modules from local disk
+   */
+  std::set<std::string> probe_modules(const std::string &path) const;
+
+  PyModuleConfig module_config;
+
+public:
+  // Store (non-empty `v`) or erase (empty `v`) the config entry `k`.
+  void handle_config(const std::string &k, const std::string &v);
+  // Forward a config-changed notification to the active modules, if any.
+  void handle_config_notify();
+
+  // Pass key/value store updates through to the active modules.
+  // Asserts that the active modules exist.
+  void update_kv_data(
+    const std::string prefix,
+    bool incremental,
+    const map<std::string, boost::optional<bufferlist>, std::less<>>& data) {
+    ceph_assert(active_modules);
+    active_modules->update_kv_data(prefix, incremental, data);
+  }
+
+  /**
+   * Get references to all modules (whether they have loaded and/or
+   * errored) or not.
+   */
+  auto get_modules() const
+  {
+    std::vector<PyModuleRef> modules_out;
+    std::lock_guard l(lock);
+    for (const auto &i : modules) {
+      modules_out.push_back(i.second);
+    }
+
+    return modules_out;
+  }
+
+  explicit PyModuleRegistry(LogChannelRef clog_)
+    : clog(clog_)
+  {}
+
+  /**
+   * @return true if the mgrmap has changed such that the service needs restart
+   */
+  bool handle_mgr_map(const MgrMap &mgr_map_);
+
+  // True while the standby module container exists (read without `lock`).
+  bool have_standby_modules() const {
+    return !!standby_modules;
+  }
+
+  // Initialise the interpreter and load all modules found on disk.
+  void init();
+
+  // Defined out of line; presumably migrates `old_config` entries into
+  // the current config store via `monc` — confirm against the
+  // implementation.
+  void upgrade_config(
+    MonClient *monc,
+    const std::map<std::string, std::string> &old_config);
+
+  // Start the enabled+loaded modules in active mode. Requires that a
+  // MgrMap has been seen and that active mode is not already running.
+  void active_start(
+    DaemonStateIndex &ds, ClusterState &cs,
+    const std::map<std::string, std::string> &kv_store,
+    bool mon_provides_kv_sub,
+    MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_,
+    Objecter &objecter_, Client &client_, Finisher &f,
+    DaemonServer &server);
+  // Start the modules that implement a standby mode. Requires that a
+  // MgrMap has been seen and that neither mode is already running.
+  void standby_start(MonClient &mc, Finisher &f);
+
+  // Same test as have_standby_modules(); also read without `lock`.
+  bool is_standby_running() const
+  {
+    return standby_modules != nullptr;
+  }
+
+  // Shut down and discard the active modules, if any.
+  void active_shutdown();
+  // Shut down standby modules, drop all modules, finalize the interpreter.
+  void shutdown();
+
+  // Module commands converted to MonCommand form.
+  std::vector<MonCommand> get_commands() const;
+  // Raw commands of every known module.
+  std::vector<ModuleCommand> get_py_commands() const;
+
+  /**
+   * Get the specified module.  The module does not have to be
+   * loaded or runnable.
+   *
+   * Returns an empty reference if it does not exist.
+   */
+  PyModuleRef get_module(const std::string &module_name)
+  {
+    std::lock_guard l(lock);
+    auto module_iter = modules.find(module_name);
+    if (module_iter == modules.end()) {
+      return {};
+    }
+    return module_iter->second;
+  }
+
+  /**
+   * Pass through command to the named module for execution.
+   *
+   * The command must exist in the COMMANDS reported by the module.  If it
+   * doesn't then this will abort.
+   *
+   * If ActivePyModules has not been instantiated yet then this will
+   * return EAGAIN.
+   */
+  int handle_command(
+    const ModuleCommand& module_command,
+    const MgrSession& session,
+    const cmdmap_t &cmdmap,
+    const bufferlist &inbuf,
+    std::stringstream *ds,
+    std::stringstream *ss);
+
+  /**
+   * Pass through health checks reported by modules, and report any
+   * modules that have failed (i.e. unhandled exceptions in serve())
+   */
+  void get_health_checks(health_check_map_t *checks);
+
+  // Fetch progress events from the active modules; no-op without them.
+  void get_progress_events(map<std::string,ProgressEvent> *events) {
+    if (active_modules) {
+      active_modules->get_progress_events(events);
+    }
+  }
+
+  // FIXME: breaking interface so that I don't have to go rewrite all
+  // the places that call into these (for now)
+  // >>>
+  void notify_all(const std::string &notify_type,
+                  const std::string &notify_id)
+  {
+    if (active_modules) {
+      active_modules->notify_all(notify_type, notify_id);
+    }
+  }
+
+  void notify_all(const LogEntry &log_entry)
+  {
+    if (active_modules) {
+      active_modules->notify_all(log_entry);
+    }
+  }
+
+  // Whether module `name` wants notifications of `notify_type`.
+  // `name` must be a known module: std::map::at() throws otherwise.
+  // NOTE(review): reads `modules` without taking `lock` — confirm that
+  // callers cannot race with init()/shutdown().
+  bool should_notify(const std::string& name,
+                     const std::string& notify_type) {
+    return modules.at(name)->should_notify(notify_type);
+  }
+
+  // Services published by the active modules. Asserts that they exist.
+  std::map<std::string, std::string> get_services() const
+  {
+    ceph_assert(active_modules);
+    return active_modules->get_services();
+  }
+
+  // Record a client address vector under `name`.
+  // NOTE(review): unlike get_clients(), this does not take `lock`.
+  // Whether it safely can depends on which locks callers already hold,
+  // which is not visible here — verify before changing.
+  void register_client(std::string_view name, entity_addrvec_t addrs)
+  {
+    clients.emplace(std::string(name), std::move(addrs));
+  }
+  // Remove the first entry registered under `name` whose address vector
+  // equals `addrs`; does nothing if there is no such entry.
+  // NOTE(review): same locking caveat as register_client().
+  void unregister_client(std::string_view name, const entity_addrvec_t& addrs)
+  {
+    auto itp = clients.equal_range(std::string(name));
+    for (auto it = itp.first; it != itp.second; ++it) {
+      if (it->second == addrs) {
+        clients.erase(it);
+        return;
+      }
+    }
+  }
+
+  // Snapshot of every registered client address vector.
+  auto get_clients() const
+  {
+    std::scoped_lock l(lock);
+    std::vector<entity_addrvec_t> v;
+    for (const auto& p : clients) {
+      v.push_back(p.second);
+    }
+    return v;
+  }
+
+  // <<< (end of ActivePyModules cheeky call-throughs)
+};
diff --git a/src/mgr/PyModuleRunner.cc b/src/mgr/PyModuleRunner.cc
new file mode 100644
index 000000000..e27f7f405
--- /dev/null
+++ b/src/mgr/PyModuleRunner.cc
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include <Python.h>
+
+#include "PyModule.h"
+
+#include "common/debug.h"
+#include "mgr/Gil.h"
+
+#include "PyModuleRunner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+// Release our reference to the Python class instance, if we hold one.
+PyModuleRunner::~PyModuleRunner()
+{
+  // Hold the GIL (via the module's thread state) while touching the
+  // Python object.
+  Gil gil(py_module->pMyThreadState, true);
+
+  // Py_XDECREF is a no-op for a null pointer
+  Py_XDECREF(pClassInstance);
+  pClassInstance = nullptr;
+}
+
+// Call the Python instance's serve() method and wait for it to return.
+// Returns 0 when serve() returns normally. If it raises, the exception
+// is reported to the cluster log and the local log, the module is marked
+// failed, and -EINVAL is returned.
+int PyModuleRunner::serve()
+{
+  ceph_assert(pClassInstance != nullptr);
+
+  // This method is called from a separate OS thread (i.e. a thread not
+  // created by Python), so tell Gil to wrap this in a new thread state.
+  Gil gil(py_module->pMyThreadState, true);
+
+  PyObject *result = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("serve"), nullptr);
+  if (result != nullptr) {
+    Py_DECREF(result);
+    return 0;
+  }
+
+  // The log message cannot be very informative: this is an
+  // unknown/unexpected exception that we can't say much about.
+
+  // Grab the short exception message for the cluster log first, before
+  // the full backtrace is dumped to the local log.
+  const std::string exc_msg = peek_pyerror();
+
+  clog->error() << "Unhandled exception from module '" << get_name()
+                << "' while running on mgr." << g_conf()->name.get_id()
+                << ": " << exc_msg;
+  derr << get_name() << ".serve:" << dendl;
+  derr << handle_pyerror() << dendl;
+
+  py_module->fail(exc_msg);
+
+  return -EINVAL;
+}
+
+// Call the Python instance's shutdown() method, logging any exception it
+// raises, and mark this runner as dead either way.
+void PyModuleRunner::shutdown()
+{
+  ceph_assert(pClassInstance != nullptr);
+
+  Gil gil(py_module->pMyThreadState, true);
+
+  PyObject *result = PyObject_CallMethod(pClassInstance,
+      const_cast<char*>("shutdown"), nullptr);
+
+  if (result == nullptr) {
+    derr << "Failed to invoke shutdown() on " << get_name() << dendl;
+    derr << handle_pyerror() << dendl;
+  } else {
+    Py_DECREF(result);
+  }
+
+  dead = true;
+}
+
+// Write `record` to the local log at debug level 0, with no prefix
+// prepended to the record.
+void PyModuleRunner::log(const std::string &record)
+{
+ // Switch to a bare prefix for the single dout() below
+#undef dout_prefix
+#define dout_prefix *_dout
+ dout(0) << record << dendl;
+ // Macros are not scoped to the function: the prefix defined here stays
+ // in effect for the rest of this translation unit.
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+}
+
+// Thread body: run the owning runner's serve() to completion. The
+// result of serve() is not propagated; the thread always returns null.
+void* PyModuleRunner::PyModuleRunnerThread::entry()
+{
+  dout(4) << "Entering thread for " << mod->get_name() << dendl;
+
+  // No need to acquire the GIL here; the module does it.
+  mod->serve();
+
+  return nullptr;
+}
diff --git a/src/mgr/PyModuleRunner.h b/src/mgr/PyModuleRunner.h
new file mode 100644
index 000000000..88d9f755a
--- /dev/null
+++ b/src/mgr/PyModuleRunner.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#pragma once
+
+#include "common/Thread.h"
+#include "common/LogClient.h"
+#include "mgr/Gil.h"
+
+#include "PyModule.h"
+
+/**
+ * Implement the pattern of calling serve() on a module in a thread,
+ * until shutdown() is called.
+ */
+class PyModuleRunner
+{
+public:
+  // Info about the module we're going to run
+  PyModuleRef py_module;
+
+protected:
+  // Populated by descendent class
+  PyObject *pClassInstance = nullptr;
+
+  // Cluster log channel; serve() reports unhandled exceptions to it
+  LogChannelRef clog;
+
+  // Thread whose body simply runs the owning runner's serve()
+  class PyModuleRunnerThread : public Thread
+  {
+    PyModuleRunner *mod;
+
+  public:
+    explicit PyModuleRunnerThread(PyModuleRunner *mod_)
+      : mod(mod_) {}
+
+    void *entry() override;
+  };
+
+  // True once shutdown() has completed
+  bool is_dead() const { return dead; }
+
+  // Module name truncated to at most 15 characters (see constructor)
+  std::string thread_name;
+
+public:
+  // Run the Python serve() method; 0 on normal return, -EINVAL if it
+  // raised (the module is then marked failed).
+  int serve();
+  // Run the Python shutdown() method and mark the runner dead.
+  void shutdown();
+  // Write `record` to the local log at level 0.
+  void log(const std::string &record);
+
+  const char *get_thread_name() const
+  {
+    return thread_name.c_str();
+  }
+
+  /**
+   * @param py_module_ module to run; must not be null
+   * @param clog_ cluster log channel used for error reports
+   */
+  PyModuleRunner(
+      const PyModuleRef &py_module_,
+      LogChannelRef clog_)
+    :
+    py_module(py_module_),
+    clog(clog_),
+    thread(this)
+  {
+    // Check for null *before* the first dereference below; asserting
+    // afterwards would never fire because the dereference crashes first.
+    ceph_assert(py_module != nullptr);
+
+    // Shortened name for use as thread name, because thread names
+    // required to be <16 chars
+    thread_name = py_module->get_name().substr(0, 15);
+  }
+
+  ~PyModuleRunner();
+
+  PyModuleRunnerThread thread;
+
+  std::string const &get_name() const { return py_module->get_name(); }
+
+private:
+  bool dead = false;
+};
+
+
diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc
new file mode 100644
index 000000000..70813ca52
--- /dev/null
+++ b/src/mgr/PyOSDMap.cc
@@ -0,0 +1,682 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Mgr.h"
+
+#include "osd/OSDMap.h"
+#include "common/errno.h"
+#include "common/version.h"
+#include "include/stringify.h"
+
+#include "PyOSDMap.h"
+#include "PyFormatter.h"
+#include "Gil.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+
+// Python object layout wrapping an OSDMap. The pointer is taken from a
+// capsule in BasePyOSDMap_init and deleted in BasePyOSDMap_dealloc, so
+// the Python object owns the map.
+typedef struct {
+ PyObject_HEAD
+ OSDMap *osdmap;
+} BasePyOSDMap;
+
+// Python object layout holding a pointer to an OSDMap::Incremental.
+// NOTE(review): the code that fills in and frees `inc` is not visible
+// from here — confirm ownership against this type's init/dealloc.
+typedef struct {
+ PyObject_HEAD
+ OSDMap::Incremental *inc;
+} BasePyOSDMapIncremental;
+
+// Python object layout holding a shared reference to a CrushWrapper.
+// NOTE(review): `crush` is a non-trivial C++ member inside a Python
+// object struct; presumably this type's init/dealloc construct and
+// destroy it explicitly — confirm.
+typedef struct {
+ PyObject_HEAD
+ std::shared_ptr<CrushWrapper> crush;
+} BasePyCRUSH;
+
+// ----------
+
+// Return the wrapped OSDMap's epoch as a Python int. `obj` is unused.
+static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj)
+{
+  const auto epoch = self->osdmap->get_epoch();
+  return PyLong_FromLong(epoch);
+}
+
+// Return the wrapped OSDMap's CRUSH version as a Python int. `obj` is
+// unused.
+static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj)
+{
+  const auto crush_version = self->osdmap->get_crush_version();
+  return PyLong_FromLong(crush_version);
+}
+
+// Dump the wrapped OSDMap through a PyFormatter and return the Python
+// object the formatter built. `obj` is unused.
+static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj)
+{
+  PyFormatter formatter;
+  self->osdmap->dump(&formatter);
+  return formatter.get();
+}
+
+// Allocate an OSDMap::Incremental for the epoch following the wrapped
+// map's (same fsid, current CRUSH map encoded into it) and return it
+// wrapped as a mgr_module.OSDMapIncremental. `obj` is unused.
+static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj)
+{
+  OSDMap *const map = self->osdmap;
+  OSDMap::Incremental *inc = new OSDMap::Incremental;
+
+  inc->fsid = map->get_fsid();
+  inc->epoch = map->get_epoch() + 1;
+  // Always include the latest crush map here... this is okay since we
+  // never actually use this map in the real world (and even if we did
+  // it would be a no-op).
+  map->crush->encode(inc->crush, CEPH_FEATURES_ALL);
+  dout(10) << __func__ << " " << inc << dendl;
+
+  void *payload = (void*)(inc);
+  return construct_with_capsule("mgr_module", "OSDMapIncremental", payload);
+}
+
+/**
+ * Apply an incremental to a copy of the wrapped OSDMap.
+ *
+ * The wrapped map itself is not modified: it is encoded, decoded into a
+ * freshly allocated OSDMap, and the incremental is applied to that copy.
+ *
+ * @param self the OSDMap wrapper
+ * @param incobj must be a BasePyOSDMapIncremental (or subclass)
+ * @return a new mgr_module.OSDMap wrapping the resulting map, or NULL
+ *         with TypeError set if `incobj` has the wrong type
+ */
+static PyObject *osdmap_apply_incremental(BasePyOSDMap *self,
+  BasePyOSDMapIncremental *incobj)
+{
+  if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) {
+    derr << "Wrong type in osdmap_apply_incremental!" << dendl;
+    // Returning NULL without an exception set makes CPython raise an
+    // opaque SystemError; tell the caller what was actually wrong.
+    PyErr_SetString(PyExc_TypeError,
+                    "argument must be a BasePyOSDMapIncremental");
+    return nullptr;
+  }
+
+  bufferlist bl;
+  self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+  OSDMap *next = new OSDMap;
+  next->decode(bl);
+  next->apply_incremental(*(incobj->inc));
+  dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc
+           << " next " << next << dendl;
+
+  return construct_with_capsule("mgr_module", "OSDMap", (void*)next);
+}
+
+// Return a mgr_module.CRUSHMap built from a capsule that carries the
+// address of the wrapped map's `crush` member. `obj` is unused.
+static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj)
+{
+  auto *crush_member = &(self->osdmap->crush);
+  return construct_with_capsule("mgr_module", "CRUSHMap",
+                                (void*)(crush_member));
+}
+
+// Python args: (take: int). Returns the ids of all pools whose CRUSH
+// rule has a TAKE step for `take`, as built by PyFormatter.
+static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args)
+{
+  int take;
+  if (!PyArg_ParseTuple(args, "i:get_pools_by_take", &take)) {
+    return nullptr;
+  }
+
+  PyFormatter formatter;
+  formatter.open_array_section("pools");
+  for (auto& [pool_id, pool] : self->osdmap->get_pools()) {
+    const bool takes_root =
+      self->osdmap->crush->rule_has_take(pool.crush_rule, take);
+    if (takes_root) {
+      formatter.dump_int("pool", pool_id);
+    }
+  }
+  formatter.close_section();
+  return formatter.get();
+}
+
+/**
+ * Python args: (inc, max_deviation: int, max_iterations: int,
+ *               pool_list: list[str]).
+ *
+ * Resolves each pool name to its id, then runs OSDMap::calc_pg_upmaps
+ * with the GIL released, writing its results into `inc`.
+ *
+ * @return the integer result of calc_pg_upmaps, or NULL with a Python
+ *         exception set when `inc` is not a BasePyOSDMapIncremental,
+ *         `pool_list` is not a list of strings, or a pool name is unknown
+ */
+static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args)
+{
+  PyObject *pool_list = nullptr;
+  BasePyOSDMapIncremental *incobj = nullptr;
+  int max_deviation = 0;
+  int max_iterations = 0;
+  if (!PyArg_ParseTuple(args, "OiiO:calc_pg_upmaps",
+                        &incobj, &max_deviation,
+                        &max_iterations, &pool_list)) {
+    return nullptr;
+  }
+  // "O" accepts any object; verify the type before `incobj->inc` is
+  // read, the same way osdmap_apply_incremental does.
+  if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) {
+    derr << __func__ << " inc is not an OSDMapIncremental" << dendl;
+    PyErr_SetString(PyExc_TypeError,
+                    "inc must be a BasePyOSDMapIncremental");
+    return nullptr;
+  }
+  if (!PyList_CheckExact(pool_list)) {
+    derr << __func__ << " pool_list not a list" << dendl;
+    PyErr_SetString(PyExc_TypeError, "pool_list must be a list");
+    return nullptr;
+  }
+  set<int64_t> pools;
+  const Py_ssize_t num_pools = PyList_Size(pool_list);
+  for (Py_ssize_t i = 0; i < num_pools; ++i) {
+    PyObject *pool_name = PyList_GET_ITEM(pool_list, i);
+    if (!PyUnicode_Check(pool_name)) {
+      derr << __func__ << " " << pool_name << " not a string" << dendl;
+      PyErr_SetString(PyExc_TypeError, "pool_list entries must be strings");
+      return nullptr;
+    }
+    const char *name = PyUnicode_AsUTF8(pool_name);
+    if (name == nullptr) {
+      // PyUnicode_AsUTF8 has already set an exception
+      return nullptr;
+    }
+    auto pool_id = self->osdmap->lookup_pg_pool_name(name);
+    if (pool_id < 0) {
+      derr << __func__ << " pool '" << name
+           << "' does not exist" << dendl;
+      PyErr_Format(PyExc_ValueError, "pool '%s' does not exist", name);
+      return nullptr;
+    }
+    pools.insert(pool_id);
+  }
+
+  dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc
+           << " max_deviation " << max_deviation
+           << " max_iterations " << max_iterations
+           << " pools " << pools
+           << dendl;
+  // The calculation makes no Python calls; release the GIL around it so
+  // other Python threads can run meanwhile.
+  PyThreadState *tstate = PyEval_SaveThread();
+  int r = self->osdmap->calc_pg_upmaps(g_ceph_context,
+                                       max_deviation,
+                                       max_iterations,
+                                       pools,
+                                       incobj->inc);
+  PyEval_RestoreThread(tstate);
+  dout(10) << __func__ << " r = " << r << dendl;
+  return PyLong_FromLong(r);
+}
+
+/**
+ * Python args: (poolid: int).
+ *
+ * Computes the up set of every PG in the pool and returns them through
+ * PyFormatter, one array section per PG (named by the PG id) containing
+ * the OSD ids.
+ *
+ * @return the formatted object, or NULL with ValueError set if the pool
+ *         does not exist
+ */
+static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args)
+{
+  int poolid;
+  if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up",
+                        &poolid)) {
+    return nullptr;
+  }
+  auto pi = self->osdmap->get_pg_pool(poolid);
+  if (!pi) {
+    // A NULL return must be accompanied by an exception, otherwise
+    // CPython raises an uninformative SystemError.
+    PyErr_Format(PyExc_ValueError, "pool %d does not exist", poolid);
+    return nullptr;
+  }
+  map<pg_t,vector<int>> pm;
+  for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+    pg_t pgid(ps, poolid);
+    self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr);
+  }
+  PyFormatter f;
+  // Iterate by reference: each element owns a vector we need not copy
+  for (const auto &p : pm) {
+    string pg = stringify(p.first);
+    f.open_array_section(pg.c_str());
+    for (auto o : p.second) {
+      f.dump_int("osd", o);
+    }
+    f.close_section();
+  }
+  return f.get();
+}
+
+// tp_init for BasePyOSDMap: takes a single `osdmap_capsule` argument and
+// stores the OSDMap pointer it carries. Aborts if the arguments cannot
+// be parsed; asserts that the argument is a capsule holding a non-null
+// pointer.
+static int
+BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds)
+{
+  static const char *kwlist[] = {"osdmap_capsule", NULL};
+  PyObject *osdmap_capsule = nullptr;
+
+  const bool parsed = PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                                  const_cast<char**>(kwlist),
+                                                  &osdmap_capsule);
+  if (!parsed) {
+    ceph_abort();
+    return -1;
+  }
+  ceph_assert(PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type));
+
+  void *raw = PyCapsule_GetPointer(osdmap_capsule, nullptr);
+  self->osdmap = (OSDMap*)raw;
+  ceph_assert(self->osdmap);
+
+  return 0;
+}
+
+
+// tp_dealloc for BasePyOSDMap: delete the wrapped OSDMap (logging if it
+// was never set), then free the Python object itself.
+static void
+BasePyOSDMap_dealloc(BasePyOSDMap *self)
+{
+  if (self->osdmap == nullptr) {
+    derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl;
+  } else {
+    delete self->osdmap;
+    self->osdmap = nullptr;
+  }
+  Py_TYPE(self)->tp_free(self);
+}
+
+// Python args: (pool_id: int, ps: int). Returns the up/acting primaries
+// and the "up" and "acting" OSD lists for that PG, as built by
+// PyFormatter.
+static PyObject *osdmap_pg_to_up_acting_osds(BasePyOSDMap *self, PyObject *args)
+{
+  int pool_id = 0;
+  int ps = 0;
+  if (!PyArg_ParseTuple(args, "ii:pg_to_up_acting_osds",
+                        &pool_id, &ps)) {
+    return nullptr;
+  }
+
+  std::vector<int> up;
+  std::vector<int> acting;
+  int up_primary;
+  int acting_primary;
+  const pg_t pg_id(ps, pool_id);
+  self->osdmap->pg_to_up_acting_osds(pg_id,
+                                     &up, &up_primary,
+                                     &acting, &acting_primary);
+
+  // (Ab)use PyFormatter as a convenient way to generate a dict
+  PyFormatter f;
+  f.dump_int("up_primary", up_primary);
+  f.dump_int("acting_primary", acting_primary);
+
+  // Emit one array section listing the given OSD ids
+  auto dump_osds = [&f](const char *section, const std::vector<int> &osds) {
+    f.open_array_section(section);
+    for (const auto &osd : osds) {
+      f.dump_int("osd", osd);
+    }
+    f.close_section();
+  };
+  dump_osds("up", up);
+  dump_osds("acting", acting);
+
+  return f.get();
+}
+
+/**
+ * Python args: (pool_id: int).
+ *
+ * @return the pool's raw-used rate as a Python float, or NULL with
+ *         ValueError set if the pool does not exist
+ */
+static PyObject *osdmap_pool_raw_used_rate(BasePyOSDMap *self, PyObject *args)
+{
+  int pool_id = 0;
+  if (!PyArg_ParseTuple(args, "i:pool_raw_used_rate",
+                        &pool_id)) {
+    return nullptr;
+  }
+
+  if (!self->osdmap->have_pg_pool(pool_id)) {
+    // A NULL return must be accompanied by an exception, otherwise
+    // CPython raises an uninformative SystemError.
+    PyErr_Format(PyExc_ValueError, "pool %d does not exist", pool_id);
+    return nullptr;
+  }
+
+  float rate = self->osdmap->pool_raw_used_rate(pool_id);
+
+  return PyFloat_FromDouble(rate);
+}
+
+
+// Method table for ceph_module.BasePyOSDMap. Each entry maps a Python
+// method name to its C implementation above, with its calling convention
+// and docstring; the all-NULL entry terminates the table.
+PyMethodDef BasePyOSDMap_methods[] = {
+  {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"},
+  {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS,
+    "Get CRUSH version"},
+  // osdmap_dump dumps the OSDMap itself, not an incremental
+  {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap"},
+  {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS,
+    "Create OSDMap::Incremental"},
+  {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O,
+    "Apply OSDMap::Incremental and return the resulting OSDMap"},
+  {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"},
+  {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS,
+    "Get pools that have CRUSH rules that TAKE the given root"},
+  {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS,
+    "Calculate new pg-upmap values"},
+  {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS,
+    "Calculate up set mappings for all PGs in a pool"},
+  {"_pg_to_up_acting_osds", (PyCFunction)osdmap_pg_to_up_acting_osds, METH_VARARGS,
+    "Calculate up+acting OSDs for a PG ID"},
+  {"_pool_raw_used_rate", (PyCFunction)osdmap_pool_raw_used_rate, METH_VARARGS,
+    "Get raw space to logical space ratio"},
+  {NULL, NULL, 0, NULL}
+};
+
+PyTypeObject BasePyOSDMapType = { // Static type object for ceph_module.BasePyOSDMap; initializers are positional, so their order must not change.
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BasePyOSDMap", /* tp_name */
+ sizeof(BasePyOSDMap), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)BasePyOSDMap_dealloc, /* tp_dealloc */
+ 0, /* tp_print (Python 2 era name; this slot is tp_vectorcall_offset on Python >= 3.8) */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare (Python 2 era name; this slot is tp_as_async on Python 3) */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags: BASETYPE permits subclassing from Python */
+ "Ceph OSDMap", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BasePyOSDMap_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BasePyOSDMap_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new: null here -- presumably assigned by the module setup code before the type is readied; TODO confirm */
+};
+
+// ----------
+
+
+/**
+ * tp_init slot for BasePyOSDMapIncremental.
+ *
+ * Expects a single argument ("inc_capsule"): a PyCapsule whose pointer is an
+ * OSDMap::Incremental. The pointer is stored in self->inc. Any violation of
+ * that contract aborts the process rather than raising a Python error.
+ *
+ * @return 0 on success
+ */
+static int
+BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self,
+                             PyObject *args, PyObject *kwds)
+{
+  static const char *kwlist[] = {"inc_capsule", NULL};
+  PyObject *inc_capsule = nullptr;
+
+  const bool parsed = PyArg_ParseTupleAndKeywords(
+    args, kwds, "O", const_cast<char**>(kwlist), &inc_capsule);
+  if (!parsed) {
+    ceph_abort();
+    return -1;
+  }
+  ceph_assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type));
+
+  // The capsule carries the Incremental as an untyped, unnamed pointer.
+  void *wrapped = PyCapsule_GetPointer(inc_capsule, nullptr);
+  self->inc = static_cast<OSDMap::Incremental*>(wrapped);
+  ceph_assert(self->inc);
+
+  return 0;
+}
+
+/**
+ * tp_dealloc slot for BasePyOSDMapIncremental.
+ *
+ * Deletes the OSDMap::Incremental held in self->inc (logging an error if
+ * none was ever stored) and then returns the object's memory via tp_free.
+ */
+static void
+BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self)
+{
+  if (self->inc) {
+    delete self->inc;
+    self->inc = nullptr;
+  } else {
+    // Name the correct type here: the message previously said BasePyOSDMap,
+    // which pointed whoever read the log at the wrong wrapper class.
+    derr << "Destroying improperly initialized BasePyOSDMapIncremental "
+         << self << dendl;
+  }
+  Py_TYPE(self)->tp_free(self);
+}
+
+/**
+ * Python method: return the epoch stored in the wrapped incremental.
+ *
+ * @param obj  unused
+ * @return new Python int
+ */
+static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self,
+                                      PyObject *obj)
+{
+  const long epoch = self->inc->epoch;
+  return PyLong_FromLong(epoch);
+}
+
+/**
+ * Python method: dump the wrapped incremental through a PyFormatter and
+ * return the Python object the formatter built.
+ *
+ * @param obj  unused
+ */
+static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self,
+                                 PyObject *obj)
+{
+  PyFormatter formatter;
+  self->inc->dump(&formatter);
+  return formatter.get();
+}
+
+/**
+ * Convert a Python dict of {int: float} into a C++ map.
+ *
+ * @param obj  object supplied by the Python caller; must be a dict
+ * @param out  destination map; entries are inserted or overwritten. On
+ *             failure it may already contain the entries parsed so far.
+ * @return 0 on success, -1 on failure with a Python exception set
+ */
+static int get_int_float_map(PyObject *obj, map<int,double> *out)
+{
+  // The callers are METH_O methods, so obj can be any Python object.
+  // PyDict_Items() on a non-dict returns NULL, which the loop below would
+  // then dereference.
+  if (!PyDict_Check(obj)) {
+    derr << __func__ << " argument is not a dict" << dendl;
+    PyErr_SetString(PyExc_TypeError, "expected a dict mapping int to float");
+    return -1;
+  }
+
+  PyObject *ls = PyDict_Items(obj);
+  if (ls == nullptr) {
+    return -1;  // exception already set by PyDict_Items
+  }
+
+  const Py_ssize_t count = PyList_Size(ls);
+  for (Py_ssize_t j = 0; j < count; ++j) {
+    PyObject *pair = PyList_GET_ITEM(ls, j);  // borrowed reference
+    if (!PyTuple_Check(pair)) {
+      derr << __func__ << " item " << j << " not a tuple" << dendl;
+      // Callers return NULL on failure, so an exception must be pending.
+      PyErr_SetString(PyExc_TypeError, "dict item is not a tuple");
+      Py_DECREF(ls);
+      return -1;
+    }
+    int k;
+    double v;
+    if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) {
+      // PyArg_ParseTuple has already set the exception.
+      derr << __func__ << " item " << j << " not a size 2 tuple" << dendl;
+      Py_DECREF(ls);
+      return -1;
+    }
+    (*out)[k] = v;
+  }
+
+  Py_DECREF(ls);
+  return 0;
+}
+
+/**
+ * Python method: record new reweight values in the incremental.
+ *
+ * @param weightobj  dict of {osd id: weight}; each weight is limited to
+ *                   the range [0.0, 1.0] and scaled by 0x10000 before
+ *                   being stored in new_weight
+ * @return None, or nullptr when the dict cannot be converted
+ */
+static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self,
+                                              PyObject *weightobj)
+{
+  map<int,double> wm;
+  if (get_int_float_map(weightobj, &wm) < 0) {
+    return nullptr;
+  }
+
+  for (const auto &[osd, weight] : wm) {
+    const double bounded = std::max(0.0, std::min(1.0, weight));
+    self->inc->new_weight[osd] = bounded * 0x10000;
+  }
+  Py_RETURN_NONE;
+}
+
+/**
+ * Python method: adjust item weights in the default choose_args of the CRUSH
+ * map carried by the incremental.
+ *
+ * The encoded CRUSH map is decoded, each {item id: weight} entry from
+ * weightobj is applied, and the result is re-encoded into the incremental.
+ *
+ * @param weightobj  dict of {item id: weight}
+ * @return None, or nullptr when the dict cannot be converted
+ */
+static PyObject *osdmap_inc_set_compat_weight_set_weights(
+  BasePyOSDMapIncremental *self, PyObject *weightobj)
+{
+  map<int,double> wm;
+  if (get_int_float_map(weightobj, &wm) < 0) {
+    return nullptr;
+  }
+
+  CrushWrapper crush;
+  ceph_assert(self->inc->crush.length()); // see new_incremental
+  auto it = self->inc->crush.cbegin();
+  decode(crush, it);
+
+  crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1);
+  for (const auto &[item, weight] : wm) {
+    crush.choose_args_adjust_item_weightf(
+      g_ceph_context,
+      crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS),
+      item,
+      { weight },
+      nullptr);
+  }
+
+  // Replace the encoded map in the incremental with the adjusted one.
+  self->inc->crush.clear();
+  crush.encode(self->inc->crush, CEPH_FEATURES_ALL);
+  Py_RETURN_NONE;
+}
+
+PyMethodDef BasePyOSDMapIncremental_methods[] = { // Method table referenced by BasePyOSDMapIncrementalType.tp_methods; each row is {python name, C function, calling convention, docstring}.
+ {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS,
+ "Get OSDMap::Incremental epoch"},
+ {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS,
+ "Dump OSDMap::Incremental"},
+ {"_set_osd_reweights", (PyCFunction)osdmap_inc_set_osd_reweights,
+ METH_O, "Set osd reweight values"}, // METH_O: the single argument is the {osd: weight} dict
+ {"_set_crush_compat_weight_set_weights",
+ (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O,
+ "Set weight values in the pending CRUSH compat weight-set"},
+ {NULL, NULL, 0, NULL} // sentinel row terminating the table
+};
+
+PyTypeObject BasePyOSDMapIncrementalType = { // Static type object for ceph_module.BasePyOSDMapIncremental; initializers are positional, so their order must not change.
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "ceph_module.BasePyOSDMapIncremental", /* tp_name */
+ sizeof(BasePyOSDMapIncremental), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)BasePyOSDMapIncremental_dealloc, /* tp_dealloc */
+ 0, /* tp_print (Python 2 era name; this slot is tp_vectorcall_offset on Python >= 3.8) */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare (Python 2 era name; this slot is tp_as_async on Python 3) */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags: BASETYPE permits subclassing from Python */
+ "Ceph OSDMapIncremental", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ BasePyOSDMapIncremental_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)BasePyOSDMapIncremental_init, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new: null here -- presumably assigned by the module setup code before the type is readied; TODO confirm */
+};
+
+
+// ----------
+
+/**
+ * tp_init slot for BasePyCRUSH.
+ *
+ * Expects a single argument ("crush_capsule"): a PyCapsule whose pointer is
+ * a std::shared_ptr<CrushWrapper>*. The shared pointer is copied into
+ * self->crush. Any violation of that contract aborts the process rather
+ * than raising a Python error.
+ *
+ * @return 0 on success
+ */
+static int
+BasePyCRUSH_init(BasePyCRUSH *self,
+                 PyObject *args, PyObject *kwds)
+{
+  PyObject *crush_capsule = nullptr;
+  static const char *kwlist[] = {"crush_capsule", NULL};
+
+  if (! PyArg_ParseTupleAndKeywords(args, kwds, "O",
+                                    const_cast<char**>(kwlist),
+                                    &crush_capsule)) {
+    ceph_abort();
+    return -1;
+  }
+  ceph_assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type));
+
+  auto ptr_ref = static_cast<std::shared_ptr<CrushWrapper>*>(
+    PyCapsule_GetPointer(crush_capsule, nullptr));
+  // PyCapsule_GetPointer returns NULL on failure; fail with a clean assertion
+  // (as the sibling init functions do) instead of dereferencing it below.
+  ceph_assert(ptr_ref);
+
+  // We passed a pointer to a shared pointer, which is weird, but
+  // just enough to get it into the constructor: this is a real shared
+  // pointer construction now, and then we throw away that pointer to
+  // the shared pointer.
+  self->crush = *ptr_ref;
+  ceph_assert(self->crush);
+
+  return 0;
+}
+
+/**
+ * tp_dealloc slot for BasePyCRUSH: drops this object's reference to the
+ * CrushWrapper and returns the object's memory through tp_free.
+ */
+static void
+BasePyCRUSH_dealloc(BasePyCRUSH *self)
+{
+  auto *type = Py_TYPE(self);
+  self->crush.reset();
+  type->tp_free(self);
+}
+
+/**
+ * Python method: dump the CRUSH map through a PyFormatter and return the
+ * Python object the formatter built.
+ *
+ * @param obj  unused
+ */
+static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj)
+{
+  PyFormatter formatter;
+  self->crush->dump(&formatter);
+  return formatter.get();
+}
+
+/**
+ * Python method: look up the name of a CRUSH item.
+ *
+ * @param args  tuple of one int: (item,)
+ * @return new Python str, None when the item does not exist, or nullptr
+ *         when the arguments do not parse
+ */
+static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args)
+{
+  int item;
+  if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) {
+    return nullptr;
+  }
+
+  if (self->crush->item_exists(item)) {
+    return PyUnicode_FromString(self->crush->get_item_name(item));
+  }
+  Py_RETURN_NONE;
+}
+
+/**
+ * Python method: look up the floating-point weight of a CRUSH item.
+ *
+ * @param args  tuple of one int: (item,)
+ * @return new Python float, None when the item does not exist, or nullptr
+ *         when the arguments do not parse
+ */
+static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args)
+{
+  int item;
+  if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) {
+    return nullptr;
+  }
+
+  if (self->crush->item_exists(item)) {
+    return PyFloat_FromDouble(self->crush->get_item_weightf(item));
+  }
+  Py_RETURN_NONE;
+}
+
+/**
+ * Python method: return the ids of all roots found in the CRUSH map.
+ *
+ * This is registered as METH_NOARGS, and the interpreter still calls such
+ * functions with two arguments (self and an unused second argument). The
+ * signature therefore carries the second parameter, like the other
+ * METH_NOARGS handlers in this file, so the call made through the
+ * PyCFunction cast matches the function's real type.
+ *
+ * @param obj  unused
+ * @return new Python object built from an array of root ids
+ */
+static PyObject *crush_find_roots(BasePyCRUSH *self, PyObject *obj)
+{
+  set<int> roots;
+  self->crush->find_roots(&roots);
+  PyFormatter f;
+  f.open_array_section("roots");
+  for (const int root : roots) {
+    f.dump_int("root", root);
+  }
+  f.close_section();
+  return f.get();
+}
+
+/**
+ * Python method: return the ids collected by CrushWrapper::find_takes().
+ *
+ * @param obj  unused
+ * @return new Python object built from an array of ids
+ */
+static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj)
+{
+  set<int> take_ids;
+  self->crush->find_takes(&take_ids);
+
+  PyFormatter formatter;
+  formatter.open_array_section("takes");
+  for (const int id : take_ids) {
+    formatter.dump_int("root", id);
+  }
+  formatter.close_section();
+  return formatter.get();
+}
+
+/**
+ * Python method: return the per-OSD weight map beneath a CRUSH item.
+ *
+ * @param args  tuple of one int: (root,)
+ * @return new dict keyed by the stringified OSD id with float weights, or
+ *         nullptr with an exception set when the arguments do not parse or
+ *         the item does not exist
+ */
+static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args)
+{
+  int root;
+  if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map",
+                        &root)) {
+    return nullptr;
+  }
+
+  if (!self->crush->item_exists(root)) {
+    // Returning NULL without a pending exception makes the interpreter raise
+    // an opaque SystemError; report the actual problem instead.
+    PyErr_Format(PyExc_ValueError, "CRUSH item %d does not exist", root);
+    return nullptr;
+  }
+
+  map<int,float> wmap;
+  self->crush->get_take_weight_osd_map(root, &wmap);
+
+  PyFormatter f;
+  f.open_object_section("weights");
+  for (const auto& [osd, weight] : wmap) {
+    string n = stringify(osd); // ick
+    f.dump_float(n.c_str(), weight);
+  }
+  f.close_section();
+  return f.get();
+}
+
+PyMethodDef BasePyCRUSH_methods[] = { // Method table referenced by BasePyCRUSHType.tp_methods; each row is {python name, C function, calling convention, docstring}.
+ {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"},
+ {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS,
+ "Get item name"},
+ {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS,
+ "Get item weight"},
+ {"_find_roots", (PyCFunction)crush_find_roots, METH_NOARGS,
+ "Find all tree roots"},
+ {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS,
+ "Find distinct TAKE roots"},
+ {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map,
+ METH_VARARGS, "Get OSD weight map for a given TAKE root node"},
+ {NULL, NULL, 0, NULL} // sentinel row terminating the table
+};
+
+// Static type object for ceph_module.BasePyCRUSH. The initializers are
+// positional, so their order must not change. tp_doc previously read
+// "Ceph OSDMapIncremental" (copied from the type above), which made the
+// Python-visible __doc__ of the CRUSH wrapper describe the wrong class.
+PyTypeObject BasePyCRUSHType = {
+  PyVarObject_HEAD_INIT(NULL, 0)
+  "ceph_module.BasePyCRUSH", /* tp_name */
+  sizeof(BasePyCRUSH), /* tp_basicsize */
+  0, /* tp_itemsize */
+  (destructor)BasePyCRUSH_dealloc, /* tp_dealloc */
+  0, /* tp_print (Python 2 era name; tp_vectorcall_offset on Python >= 3.8) */
+  0, /* tp_getattr */
+  0, /* tp_setattr */
+  0, /* tp_compare (Python 2 era name; tp_as_async on Python 3) */
+  0, /* tp_repr */
+  0, /* tp_as_number */
+  0, /* tp_as_sequence */
+  0, /* tp_as_mapping */
+  0, /* tp_hash */
+  0, /* tp_call */
+  0, /* tp_str */
+  0, /* tp_getattro */
+  0, /* tp_setattro */
+  0, /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+  "Ceph CRUSH map", /* tp_doc */
+  0, /* tp_traverse */
+  0, /* tp_clear */
+  0, /* tp_richcompare */
+  0, /* tp_weaklistoffset */
+  0, /* tp_iter */
+  0, /* tp_iternext */
+  BasePyCRUSH_methods, /* tp_methods */
+  0, /* tp_members */
+  0, /* tp_getset */
+  0, /* tp_base */
+  0, /* tp_dict */
+  0, /* tp_descr_get */
+  0, /* tp_descr_set */
+  0, /* tp_dictoffset */
+  (initproc)BasePyCRUSH_init, /* tp_init */
+  0, /* tp_alloc */
+  0, /* tp_new */
+};
diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h
new file mode 100644
index 000000000..2cc30dfe2
--- /dev/null
+++ b/src/mgr/PyOSDMap.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <Python.h>
+
+#include <string>
+
+extern PyTypeObject BasePyOSDMapType; // Python type object named "ceph_module.BasePyOSDMap"
+extern PyTypeObject BasePyOSDMapIncrementalType; // Python type object named "ceph_module.BasePyOSDMapIncremental"
+extern PyTypeObject BasePyCRUSHType; // Python type object named "ceph_module.BasePyCRUSH"
+
+PyObject *construct_with_capsule( // NOTE(review): definition not visible from this header; presumably instantiates Python class `clsname` from `module`, handing it `wrapped` inside a PyCapsule (the BasePy*_init functions unwrap such a capsule) -- confirm.
+ const std::string &module, // presumably the Python module name to look the class up in -- confirm
+ const std::string &clsname, // presumably the class name within `module` -- confirm
+ void *wrapped); // untyped pointer to the C++ object being wrapped; ownership rules are not visible here
+
diff --git a/src/mgr/PyUtil.cc b/src/mgr/PyUtil.cc
new file mode 100644
index 000000000..a8efc2f28
--- /dev/null
+++ b/src/mgr/PyUtil.cc
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <Python.h>
+
+#include "PyUtil.h"
+
+/**
+ * Convert the string form of an option value into a Python object whose
+ * type follows the option's declared type.
+ *
+ *   INT / UINT / SIZE          -> int   (base auto-detected from the string)
+ *   SECS / MILLISECS / FLOAT   -> float
+ *   BOOL                       -> True for "1", "true", "True", "on", "yes";
+ *                                 False for anything else
+ *   STR / ADDR / ADDRVEC / UUID (and any other type) -> str
+ *
+ * @param type   declared type of the option
+ * @param value  option value as a string
+ * @return new reference, or nullptr with a Python exception set when the
+ *         string cannot be converted
+ */
+PyObject *get_python_typed_option_value(
+  Option::type_t type,
+  const std::string& value)
+{
+  switch (type) {
+  case Option::TYPE_INT:
+  case Option::TYPE_UINT:
+  case Option::TYPE_SIZE:
+    return PyLong_FromString(value.c_str(), nullptr, 0);
+  case Option::TYPE_SECS:
+  case Option::TYPE_MILLISECS:
+  case Option::TYPE_FLOAT:
+    {
+      PyObject *s = PyUnicode_FromString(value.c_str());
+      if (s == nullptr) {
+        // Creating the str failed (e.g. invalid UTF-8); the exception is
+        // already set. Passing NULL on to PyFloat_FromString / Py_DECREF
+        // would crash.
+        return nullptr;
+      }
+      PyObject *f = PyFloat_FromString(s);
+      Py_DECREF(s);
+      return f;
+    }
+  case Option::TYPE_BOOL:
+    if (value == "1" || value == "true" || value == "True" ||
+        value == "on" || value == "yes") {
+      Py_RETURN_TRUE;
+    } else {
+      Py_RETURN_FALSE;
+    }
+  case Option::TYPE_STR:
+  case Option::TYPE_ADDR:
+  case Option::TYPE_ADDRVEC:
+  case Option::TYPE_UUID:
+    break;
+  }
+  return PyUnicode_FromString(value.c_str());
+}
diff --git a/src/mgr/PyUtil.h b/src/mgr/PyUtil.h
new file mode 100644
index 000000000..188b3d28f
--- /dev/null
+++ b/src/mgr/PyUtil.h
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+#include <Python.h>
+
+#include "common/options.h"
+
+PyObject *get_python_typed_option_value( // Build a Python object from an option's string value, choosing the Python type from the option type.
+ Option::type_t type, // declared type of the option
+ const std::string& value); // option value in string form
diff --git a/src/mgr/ServiceMap.cc b/src/mgr/ServiceMap.cc
new file mode 100644
index 000000000..b6f8ad97c
--- /dev/null
+++ b/src/mgr/ServiceMap.cc
@@ -0,0 +1,244 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mgr/ServiceMap.h"
+
+#include <experimental/iterator>
+#include <fmt/format.h>
+#include <regex>
+
+#include "common/Formatter.h"
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// Daemon
+
+void ServiceMap::Daemon::encode(bufferlist& bl, uint64_t features) const // Append this Daemon's fields to bl; field order must mirror Daemon::decode.
+{
+ ENCODE_START(2, 1, bl); // presumably (struct version 2, compat version 1) -- macro defined elsewhere, confirm
+ encode(gid, bl);
+ encode(addr, bl, features); // addr is the only field encoded with the feature bits
+ encode(start_epoch, bl);
+ encode(start_stamp, bl);
+ encode(metadata, bl);
+ encode(task_status, bl); // decode() reads this field only when struct_v >= 2
+ ENCODE_FINISH(bl);
+}
+
+void ServiceMap::Daemon::decode(bufferlist::const_iterator& p) // Read the fields written by Daemon::encode, in the same order.
+{
+ DECODE_START(2, p); // struct_v, tested below, is presumably introduced by this macro -- confirm
+ decode(gid, p);
+ decode(addr, p);
+ decode(start_epoch, p);
+ decode(start_stamp, p);
+ decode(metadata, p);
+ if (struct_v >= 2) { // encodings older than version 2 carry no task_status
+ decode(task_status, p);
+ }
+ DECODE_FINISH(p);
+}
+
+// Emit this daemon's fields to the formatter: the scalar fields first, then
+// the "metadata" and "task_status" maps as one object section each.
+void ServiceMap::Daemon::dump(Formatter *f) const
+{
+  f->dump_unsigned("start_epoch", start_epoch);
+  f->dump_stream("start_stamp") << start_stamp;
+  f->dump_unsigned("gid", gid);
+  f->dump_string("addr", addr.get_legacy_str());
+
+  // Both string maps are dumped identically: one string entry per key.
+  const auto dump_string_map =
+    [f](const char *section, const std::map<std::string,std::string>& kv) {
+      f->open_object_section(section);
+      for (const auto& [key, val] : kv) {
+        f->dump_string(key.c_str(), val);
+      }
+      f->close_section();
+    };
+  dump_string_map("metadata", metadata);
+  dump_string_map("task_status", task_status);
+}
+
+// Append two heap-allocated sample Daemons to ls: a default-constructed one
+// followed by one with a gid, a metadata entry and a task status set.
+void ServiceMap::Daemon::generate_test_instances(std::list<Daemon*>& ls)
+{
+  ls.push_back(new Daemon);
+
+  auto *populated = new Daemon;
+  populated->gid = 222;
+  populated->metadata["this"] = "that";
+  populated->task_status["task1"] = "running";
+  ls.push_back(populated);
+}
+
+// Service
+
+/**
+ * Build the one-line status summary for this service.
+ *
+ * @return the explicit `summary` string when one is set; otherwise
+ *         "no daemons active" when there are no daemons; otherwise a
+ *         generated "<N> <type>[s] active (<groupings>)" string.
+ */
+std::string ServiceMap::Service::get_summary() const
+{
+  if (!summary.empty()) {
+    return summary;
+  }
+  if (daemons.empty()) {
+    return "no daemons active";
+  }
+
+  // If "daemon_type" is present, it is used in place of "daemon" when
+  // reporting the count (e.g., "${N} daemons").
+  //
+  // We additionally break down the count by various groupings, based
+  // on the following keys:
+  //
+  //   "hostname" -> host(s)
+  //   "zone_id"  -> zone(s)
+  //
+  // The `ceph -s` output will look something like:
+  //   iscsi: 3 portals active (3 hosts)
+  //   rgw: 3 gateways active (3 hosts, 1 zone)
+
+  std::map<std::string, std::set<std::string>> groupings;
+  std::string type("daemon");
+  int num = 0;
+  for (const auto& d : daemons) {
+    const auto& metadata = d.second.metadata;
+    ++num;
+    if (auto p = metadata.find("daemon_type"); p != metadata.end()) {
+      type = p->second;
+    }
+    for (const auto& k : {std::make_pair("zone", "zone_id"),
+                          std::make_pair("host", "hostname")}) {
+      auto p = metadata.find(k.second);
+      if (p != metadata.end()) {
+        groupings[k.first].insert(p->second);
+      }
+    }
+  }
+
+  std::ostringstream ss;
+  ss << num << " " << type << (num > 1 ? "s" : "") << " active";
+  if (!groupings.empty()) {
+    ss << " (";
+    for (auto i = groupings.begin(); i != groupings.end(); ++i) {
+      if (i != groupings.begin()) {
+        ss << ", ";
+      }
+      // Pluralize only for more than one distinct value. The previous test
+      // (size() != 0) was always true here and produced "1 zones".
+      const auto distinct = i->second.size();
+      ss << distinct << " " << i->first << (distinct > 1 ? "s" : "");
+    }
+    ss << ")";
+  }
+
+  return ss.str();
+}
+
+// True when at least one daemon of this service has a non-empty task_status.
+bool ServiceMap::Service::has_running_tasks() const
+{
+  for (const auto& entry : daemons) {
+    if (!entry.second.task_status.empty()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Render the task status of every daemon, grouped by task name. Each daemon
+// is labelled "<task_prefix>.<daemon id>" in the output.
+std::string ServiceMap::Service::get_task_summary(const std::string_view task_prefix) const
+{
+  // Construct a map similar to:
+  //   {"service1 status" -> {"service1.0" -> "running"}}
+  //   {"service2 status" -> {"service2.0" -> "idle"},
+  //                         {"service2.1" -> "running"}}
+  std::map<std::string, std::map<std::string, std::string>> by_task;
+  for (const auto& [service_id, daemon] : daemons) {
+    const std::string label = fmt::format("{}.{}", task_prefix, service_id);
+    for (const auto& [task_name, status] : daemon.task_status) {
+      by_task[task_name].emplace(label, status);
+    }
+  }
+
+  std::stringstream out;
+  for (const auto& [task_name, status_by_service] : by_task) {
+    out << "\n " << task_name << ":";
+    for (const auto& [service, status] : status_by_service) {
+      out << "\n " << service << ": " << status;
+    }
+  }
+  return out.str();
+}
+
+// Tally the daemons of this service by the value of one metadata field.
+// Daemons lacking the field are counted under "unknown". Counts are added to
+// whatever *out already holds.
+void ServiceMap::Service::count_metadata(const std::string& field,
+                                         std::map<std::string,int> *out) const
+{
+  for (const auto& entry : daemons) {
+    const auto& md = entry.second.metadata;
+    const auto found = md.find(field);
+    const std::string bucket = (found == md.end()) ? "unknown" : found->second;
+    ++(*out)[bucket];
+  }
+}
+
+void ServiceMap::Service::encode(bufferlist& bl, uint64_t features) const // Append this Service to bl; field order must mirror Service::decode.
+{
+ ENCODE_START(1, 1, bl); // presumably (struct version 1, compat version 1) -- macro defined elsewhere, confirm
+ encode(daemons, bl, features); // feature bits are passed through to the nested Daemon encoding
+ encode(summary, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ServiceMap::Service::decode(bufferlist::const_iterator& p) // Read the fields written by Service::encode, in the same order.
+{
+ DECODE_START(1, p);
+ decode(daemons, p);
+ decode(summary, p);
+ DECODE_FINISH(p);
+}
+
+// Emit a single "daemons" object section holding the summary string followed
+// by one nested object per daemon, keyed by the daemon's map key.
+void ServiceMap::Service::dump(Formatter *f) const
+{
+  f->open_object_section("daemons");
+  f->dump_string("summary", summary);
+  for (const auto& [name, daemon] : daemons) {
+    f->dump_object(name.c_str(), daemon);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated sample Services to ls: an empty one followed by
+// one holding daemons "one" and "two".
+void ServiceMap::Service::generate_test_instances(std::list<Service*>& ls)
+{
+  ls.push_back(new Service);
+
+  auto *populated = new Service;
+  populated->daemons["one"].gid = 1;
+  populated->daemons["two"].gid = 2;
+  ls.push_back(populated);
+}
+
+// ServiceMap
+
+void ServiceMap::encode(bufferlist& bl, uint64_t features) const // Append the whole map to bl; field order must mirror ServiceMap::decode.
+{
+ ENCODE_START(1, 1, bl); // presumably (struct version 1, compat version 1) -- macro defined elsewhere, confirm
+ encode(epoch, bl);
+ encode(modified, bl);
+ encode(services, bl, features); // feature bits are passed through to the nested Service encoding
+ ENCODE_FINISH(bl);
+}
+
+void ServiceMap::decode(bufferlist::const_iterator& p) // Read the fields written by ServiceMap::encode, in the same order.
+{
+ DECODE_START(1, p);
+ decode(epoch, p);
+ decode(modified, p);
+ decode(services, p);
+ DECODE_FINISH(p);
+}
+
+// Emit the epoch and modification stamp, then a "services" object section
+// with one nested object per service, keyed by service name.
+void ServiceMap::dump(Formatter *f) const
+{
+  f->dump_unsigned("epoch", epoch);
+  f->dump_stream("modified") << modified;
+
+  f->open_object_section("services");
+  for (const auto& [name, service] : services) {
+    f->dump_object(name.c_str(), service);
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated sample ServiceMaps to ls: an empty one followed
+// by one at epoch 123 with an "rgw" and an "iscsi" service.
+void ServiceMap::generate_test_instances(std::list<ServiceMap*>& ls)
+{
+  ls.push_back(new ServiceMap);
+
+  auto *populated = new ServiceMap;
+  populated->epoch = 123;
+  auto& rgw = populated->services["rgw"];
+  rgw.daemons["one"].gid = 123;
+  rgw.daemons["two"].gid = 344;
+  populated->services["iscsi"].daemons["foo"].gid = 3222;
+  ls.push_back(populated);
+}
diff --git a/src/mgr/ServiceMap.h b/src/mgr/ServiceMap.h
new file mode 100644
index 000000000..ed027907c
--- /dev/null
+++ b/src/mgr/ServiceMap.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <list>
+#include <sstream>
+
+#include "include/utime.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+/// Map of services, each holding the daemons registered under it, together
+/// with an epoch and a modification stamp.
+struct ServiceMap {
+  /// One registered daemon instance.
+  struct Daemon {
+    uint64_t gid = 0;
+    entity_addr_t addr;
+    epoch_t start_epoch = 0;   ///< epoch first registered
+    utime_t start_stamp;       ///< timestamp daemon started/registered
+    std::map<std::string,std::string> metadata;     ///< static metadata
+    std::map<std::string,std::string> task_status;  ///< running task status
+
+    void encode(ceph::buffer::list& bl, uint64_t features) const;
+    void decode(ceph::buffer::list::const_iterator& p);
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<Daemon*>& ls);
+  };
+
+  /// All daemons of one service, keyed by daemon name.
+  struct Service {
+    std::map<std::string,Daemon> daemons;
+    std::string summary;   ///< summary status std::string for 'ceph -s'
+
+    void encode(ceph::buffer::list& bl, uint64_t features) const;
+    void decode(ceph::buffer::list::const_iterator& p);
+    void dump(ceph::Formatter *f) const;
+    static void generate_test_instances(std::list<Service*>& ls);
+
+    std::string get_summary() const;
+    bool has_running_tasks() const;
+    std::string get_task_summary(const std::string_view task_prefix) const;
+    void count_metadata(const std::string& field,
+                        std::map<std::string,int> *out) const;
+  };
+
+  epoch_t epoch = 0;
+  utime_t modified;
+  std::map<std::string,Service> services;
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ServiceMap*>& ls);
+
+  /// Find or create the daemon entry `daemon` under `service` (the service
+  /// entry is created too if missing).
+  /// @return pointer to the entry, and true when it was newly created
+  std::pair<Daemon*,bool> get_daemon(const std::string& service,
+                                     const std::string& daemon) {
+    auto inserted = services[service].daemons.try_emplace(daemon);
+    return {&inserted.first->second, inserted.second};
+  }
+
+  /// Remove daemon `daemon` from `service`, dropping the service entry once
+  /// its last daemon is gone.
+  /// @return true when a daemon was removed, false when either the service
+  ///         or the daemon was not present
+  bool rm_daemon(const std::string& service,
+                 const std::string& daemon) {
+    const auto svc = services.find(service);
+    if (svc == services.end()) {
+      return false;
+    }
+    auto& members = svc->second.daemons;
+    if (members.erase(daemon) == 0) {
+      return false;
+    }
+    if (members.empty()) {
+      services.erase(svc);
+    }
+    return true;
+  }
+
+  /// True for the entity type names "osd", "client", "mon", "mds" and "mgr".
+  static inline bool is_normal_ceph_entity(std::string_view type) {
+    return type == "osd" ||
+           type == "client" ||
+           type == "mon" ||
+           type == "mds" ||
+           type == "mgr";
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(ServiceMap) // NOTE(review): macro is defined outside this header; presumably emits free encode()/decode() wrappers taking a features argument for each listed type -- confirm.
+WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Service)
+WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Daemon)
diff --git a/src/mgr/StandbyPyModules.cc b/src/mgr/StandbyPyModules.cc
new file mode 100644
index 000000000..86ee8550c
--- /dev/null
+++ b/src/mgr/StandbyPyModules.cc
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "StandbyPyModules.h"
+
+#include "common/Finisher.h"
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "mgr/MgrContext.h"
+#include "mgr/Gil.h"
+
+// For ::mgr_store_prefix
+#include "PyModuleRegistry.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr " << __func__ << " "
+
+
+StandbyPyModules::StandbyPyModules( // Build the shared standby state and seed it with the initial MgrMap.
+ const MgrMap &mgr_map_, // initial map; copied into `state` by set_mgr_map() below
+ PyModuleConfig &module_config, // held by reference inside `state`
+ LogChannelRef clog_,
+ MonClient &monc_, // held by reference inside `state`
+ Finisher &f) // held by reference; start_one() queues module loading on it
+ : state(module_config, monc_),
+ clog(clog_),
+ finisher(f)
+{
+ state.set_mgr_map(mgr_map_);
+}
+
+// FIXME: completely identical to ActivePyModules
+void StandbyPyModules::shutdown() // Shut down every loaded standby module, join each module's thread, then drop all modules.
+{
+ std::lock_guard locker(lock);
+
+ // Signal modules to drop out of serve() and/or tear down resources
+ for (auto &i : modules) { // NOTE(review): `modules` is iterated while `lock` is temporarily released below; start_one()'s finisher callback inserts into it under the same lock -- confirm the two cannot overlap.
+ auto module = i.second.get();
+ const auto& name = i.first;
+ dout(10) << "waiting for module " << name << " to shutdown" << dendl;
+ lock.unlock(); // drop the mutex owned by `locker` for the duration of the module call
+ module->shutdown();
+ lock.lock(); // re-acquire so `locker` still owns the mutex when it releases at scope exit
+ dout(10) << "module " << name << " shutdown" << dendl;
+ }
+
+ // For modules implementing serve(), finish the threads where we
+ // were running that.
+ for (auto &i : modules) {
+ lock.unlock(); // join without holding the mutex
+ dout(10) << "joining thread for module " << i.first << dendl;
+ i.second->thread.join();
+ dout(10) << "joined thread for module " << i.first << dendl;
+ lock.lock();
+ }
+
+ modules.clear(); // destroys the StandbyPyModule objects owned by the map's unique_ptrs
+}
+
+void StandbyPyModules::start_one(PyModuleRef py_module) // Create a standby runner for py_module and queue its loading and thread start on the finisher.
+{
+ std::lock_guard l(lock);
+ const auto name = py_module->get_name(); // taken by value so the queued lambda captures its own copy
+ auto standby_module = new StandbyPyModule(state, py_module, clog); // raw pointer: deleted below on load failure, otherwise handed to `modules`
+
+ // Send all python calls down a Finisher to avoid blocking
+ // C++ code, and avoid any potential lock cycles.
+ finisher.queue(new LambdaContext([this, standby_module, name](int) { // captures `this`: assumes this object outlives the queued context -- TODO confirm
+ int r = standby_module->load();
+ if (r != 0) {
+ derr << "Failed to run module in standby mode ('" << name << "')"
+ << dendl;
+ delete standby_module;
+ } else {
+ std::lock_guard l(lock); // the callback takes the lock itself before touching `modules`
+ auto em = modules.emplace(name, standby_module); // mapped type is std::unique_ptr<StandbyPyModule>, which takes over the pointer
+ ceph_assert(em.second); // actually inserted
+
+ dout(4) << "Starting thread for " << name << dendl;
+ standby_module->thread.create(standby_module->get_thread_name());
+ }
+ }));
+}
+
+/**
+ * Instantiate the module's standby class.
+ *
+ * The class stored in py_module->pStandbyClass is called with two
+ * arguments: the module name and a capsule wrapping `this`. The resulting
+ * instance is kept in pClassInstance.
+ *
+ * @return 0 on success, -EINVAL when the Python constructor fails
+ */
+int StandbyPyModule::load()
+{
+  Gil gil(py_module->pMyThreadState, true);
+
+  // We tell the module how we name it, so that it can be consistent
+  // with us in logging etc.
+  auto pThisPtr = PyCapsule_New(this, nullptr, nullptr);
+  ceph_assert(pThisPtr != nullptr);
+  auto pModuleName = PyUnicode_FromString(get_name().c_str());
+  ceph_assert(pModuleName != nullptr);
+  auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr);
+  // PyTuple_Pack can fail like the two allocations above; check it the same
+  // way rather than passing NULL to Py_DECREF further down.
+  ceph_assert(pArgs != nullptr);
+  // The tuple holds its own references to both items.
+  Py_DECREF(pThisPtr);
+  Py_DECREF(pModuleName);
+
+  pClassInstance = PyObject_CallObject(py_module->pStandbyClass, pArgs);
+  Py_DECREF(pArgs);
+  if (pClassInstance == nullptr) {
+    derr << "Failed to construct class in '" << get_name() << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  } else {
+    dout(1) << "Constructed class from module: " << get_name() << dendl;
+    return 0;
+  }
+}
+
+// Look up this module's config option `key` (stored under
+// "mgr/<module name>/<key>") in the shared module config.
+// Returns true and fills *value when the key is present, false otherwise.
+bool StandbyPyModule::get_config(const std::string &key,
+                                 std::string *value) const
+{
+  const std::string global_key = "mgr/" + get_name() + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+  // with_config() invokes the callback before it returns, so capturing the
+  // local key by reference is safe.
+  return state.with_config([&global_key, value](const PyModuleConfig &config) {
+    const auto found = config.config.find(global_key);
+    if (found == config.config.end()) {
+      return false;
+    }
+    *value = found->second;
+    return true;
+  });
+}
+
+// Fetch this module's store value for `key` from the monitors with a
+// synchronous "config-key get" command.
+// Returns true and fills *value on success; false when the key does not
+// exist or the command fails (failures other than ENOENT are logged).
+bool StandbyPyModule::get_store(const std::string &key,
+                                std::string *value) const
+{
+  const std::string global_key = PyModule::mgr_store_prefix
+    + get_name() + "/" + key;
+
+  dout(4) << __func__ << " key: " << global_key << dendl;
+
+  // Active modules use a cache of store values (kept up to date
+  // as writes pass through the active mgr), but standbys
+  // fetch values synchronously to get an up to date value.
+  // It's an acceptable cost because standby modules should not be
+  // doing a lot.
+
+  // NOTE(review): the key is spliced into the JSON text without escaping, so
+  // a key containing a double quote would produce malformed JSON -- confirm
+  // whether callers can pass such keys.
+  const std::string cmd_json =
+    "{\"prefix\": \"config-key get\", \"key\": \"" + global_key + "\"}";
+
+  MonClient &monc = state.get_monc();
+  bufferlist outbl;
+  std::string outs;
+  C_SaferCond c;
+  monc.start_mon_command(
+    {cmd_json},
+    {},
+    &outbl,
+    &outs,
+    &c);
+
+  const int r = c.wait();
+  if (r == 0) {
+    *value = outbl.to_str();
+    return true;
+  }
+  if (r != -ENOENT) {
+    // This is some internal error, not meaningful to python modules,
+    // so let them just see no value.
+    derr << __func__ << " error fetching store key '" << global_key << "': "
+         << cpp_strerror(r) << " " << outs << dendl;
+  }
+  return false;
+}
+
+// Return the entry registered under this module's name in the MgrMap's
+// `services` table, or an empty string when there is none.
+std::string StandbyPyModule::get_active_uri() const
+{
+  std::string uri;
+  state.with_mgr_map([this, &uri](const MgrMap &mgr_map) {
+    const auto found = mgr_map.services.find(get_name());
+    if (found == mgr_map.services.end()) {
+      return;
+    }
+    uri = found->second;
+  });
+  return uri;
+}
+
diff --git a/src/mgr/StandbyPyModules.h b/src/mgr/StandbyPyModules.h
new file mode 100644
index 000000000..501dfc8c7
--- /dev/null
+++ b/src/mgr/StandbyPyModules.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <string>
+#include <map>
+
+#include <Python.h>
+
+#include "common/Thread.h"
+#include "common/ceph_mutex.h"
+
+#include "mgr/Gil.h"
+#include "mon/MonClient.h"
+#include "mon/MgrMap.h"
+#include "mgr/PyModuleRunner.h"
+
+class Finisher;
+
+/**
+ * State that is read by all modules running in standby mode
+ */
+class StandbyPyModuleState
+{
+ mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModuleState::lock"); // taken by set_mgr_map(), with_mgr_map() and with_config(); mutable so the const accessors can lock it
+
+ MgrMap mgr_map; // private copy, replaced by set_mgr_map()
+ PyModuleConfig &module_config; // not owned
+ MonClient &monc; // not owned
+
+public:
+
+
+ StandbyPyModuleState(PyModuleConfig &module_config_, MonClient &monc_)
+ : module_config(module_config_), monc(monc_)
+ {}
+
+ void set_mgr_map(const MgrMap &mgr_map_) // replace the stored MgrMap copy under the lock
+ {
+ std::lock_guard l(lock);
+
+ mgr_map = mgr_map_;
+ }
+
+ // MonClient does all its own locking so we're happy to hand out
+ // references.
+ MonClient &get_monc() {return monc;};
+
+ template<typename Callback, typename...Args>
+ void with_mgr_map(Callback&& cb, Args&&...args) const // run cb(mgr_map, args...) while holding the lock; any return value of cb is discarded
+ {
+ std::lock_guard l(lock);
+ std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...);
+ }
+
+ template<typename Callback, typename...Args>
+ auto with_config(Callback&& cb, Args&&... args) const -> // run cb(module_config, args...) while holding the lock and return its result
+ decltype(cb(module_config, std::forward<Args>(args)...)) {
+ std::lock_guard l(lock);
+
+ return std::forward<Callback>(cb)(module_config, std::forward<Args>(args)...);
+ }
+};
+
+
+class StandbyPyModule : public PyModuleRunner // One Python module running in standby mode, reading from the shared StandbyPyModuleState.
+{
+ StandbyPyModuleState &state; // shared state, not owned
+
+ public:
+
+ StandbyPyModule(
+ StandbyPyModuleState &state_,
+ const PyModuleRef &py_module_,
+ LogChannelRef clog_)
+ :
+ PyModuleRunner(py_module_, clog_),
+ state(state_)
+ {
+ }
+
+ bool get_config(const std::string &key, std::string *value) const; // true and *value filled when the option is present
+ bool get_store(const std::string &key, std::string *value) const; // true and *value filled when the store key could be fetched
+ std::string get_active_uri() const; // empty string when nothing is registered under this module's name
+ entity_addrvec_t get_myaddrs() const { // addresses as reported by the MonClient
+ return state.get_monc().get_myaddrs();
+ }
+
+ int load(); // construct the module's standby class; 0 on success
+};
+
+class StandbyPyModules // Owns the set of Python modules running in standby mode together with their shared state.
+{
+private:
+ mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModules::lock"); // taken by start_one(), its finisher callback and shutdown()
+ std::map<std::string, std::unique_ptr<StandbyPyModule>> modules; // keyed by module name
+
+ StandbyPyModuleState state; // handed by reference to every StandbyPyModule
+
+ LogChannelRef clog;
+
+ Finisher &finisher; // not owned; module loading is queued on it
+
+public:
+
+ StandbyPyModules(
+ const MgrMap &mgr_map_,
+ PyModuleConfig &module_config,
+ LogChannelRef clog_,
+ MonClient &monc,
+ Finisher &f);
+
+ void start_one(PyModuleRef py_module); // queue loading and thread start of one module
+
+ void shutdown(); // shut down and join every module, then drop them
+
+ void handle_mgr_map(const MgrMap &mgr_map) // store a copy of the new map in the shared state
+ {
+ state.set_mgr_map(mgr_map);
+ }
+
+};
diff --git a/src/mgr/TTLCache.cc b/src/mgr/TTLCache.cc
new file mode 100644
index 000000000..05fe95987
--- /dev/null
+++ b/src/mgr/TTLCache.cc
@@ -0,0 +1,100 @@
+#include "TTLCache.h"
+
+#include <chrono>
+#include <functional>
+#include <string>
+
+#include "PyUtil.h"
+
+ /// Store `value` under `key` with a randomized expiration time.
+ ///
+ /// Nothing is stored when the ttl is zero.  Otherwise the entry expires
+ /// `ttl` seconds from now, shifted by a random offset of at most
+ /// `ttl * ttl_spread_ratio` seconds in either direction, so that entries
+ /// inserted together do not all expire (and miss) at the same moment.
+ ///
+ /// The arithmetic is done in 32-bit signed integers: `ttl` is a uint16_t,
+ /// so 16-bit signed intermediates would overflow for large ttl values and
+ /// yield an expiration date in the past.
+ template <class Key, class Value>
+ void TTLCacheBase<Key, Value>::insert(Key key, Value value) {
+   if (!ttl) return;
+
+   const auto now = std::chrono::steady_clock::now();
+
+   // Uniformly distributed factor in [-1, 1], computed in floating point
+   // so the doubling cannot overflow an integer type.
+   const double jitter = 2.0 * rand() / RAND_MAX - 1.0;
+   const auto random_ttl_offset =
+       static_cast<int32_t>(ttl * ttl_spread_ratio * jitter);
+   const int32_t spreaded_ttl = static_cast<int32_t>(ttl) + random_ttl_offset;
+
+   const auto expiration_date = now + std::chrono::seconds(spreaded_ttl);
+   cache::insert(key, {value, expiration_date});
+ }
+
+ /// Return a copy of the value cached under `key`.
+ ///
+ /// Reports the key as not found (throw_key_not_found) when it is absent
+ /// or when its entry has expired; an expired entry is erased first.
+ template <class Key, class Value>
+ Value TTLCacheBase<Key, Value>::get(Key key) {
+   const bool present = exists(key);
+   if (!present) {
+     throw_key_not_found(key);
+   }
+
+   const bool stale = expired(key);
+   if (stale) {
+     erase(key);
+     throw_key_not_found(key);
+   }
+
+   Value value = {get_value(key)};
+   return value;
+ }
+
+ /// PyObject* flavour of get(): same lookup and expiry handling as the
+ /// generic version, but the returned object has its reference count
+ /// incremented before it is handed to the caller.
+ template <class Key>
+ PyObject* TTLCache<Key, PyObject*>::get(Key key) {
+   const bool present = this->exists(key);
+   if (!present) {
+     this->throw_key_not_found(key);
+   }
+
+   const bool stale = this->expired(key);
+   if (stale) {
+     this->erase(key);
+     this->throw_key_not_found(key);
+   }
+
+   PyObject* result = this->get_value(key);
+   Py_INCREF(result);
+   return result;
+ }
+
+ /// Remove the entry for `key` by delegating to the underlying Cache.
+ template <class Key, class Value>
+ void TTLCacheBase<Key, Value>::erase(Key key)
+ {
+   cache::erase(key);
+ }
+
+ /// Drop the cache's reference to the object stored under `key` and
+ /// remove the entry.
+ ///
+ /// Erasing a key that is not present is a no-op.  Without this guard the
+ /// lookup would produce a null pointer for a missing key, and Py_DECREF
+ /// must not be called on NULL.
+ template <class Key> void TTLCache<Key, PyObject*>::erase(Key key) {
+   if (!this->exists(key)) {
+     return;
+   }
+   // Passing false avoids counting this internal lookup as a cache hit.
+   Py_DECREF(this->get_value(key, false));
+   ttl_base::erase(key);
+ }
+
+ /// True when the expiration time stored for `key` is at or before the
+ /// current steady-clock time.
+ template <class Key, class Value>
+ bool TTLCacheBase<Key, Value>::expired(Key key) {
+   const ttl_time_point deadline = get_value_time_point(key);
+   return std::chrono::steady_clock::now() >= deadline;
+ }
+
+ /// Remove every entry by delegating to the underlying Cache.
+ template <class Key, class Value>
+ void TTLCacheBase<Key, Value>::clear()
+ {
+   cache::clear();
+ }
+
+ /// Return the value half of the (value, expiration) pair stored for
+ /// `key`.  `count_hit` is forwarded to the underlying Cache and controls
+ /// whether the lookup is recorded as a hit.
+ template <class Key, class Value>
+ Value TTLCacheBase<Key, Value>::get_value(Key key, bool count_hit) {
+   const value_type entry = cache::get(key, count_hit);
+   return entry.first;
+ }
+
+ /// Return the expiration half of the (value, expiration) pair stored for
+ /// `key`.  The lookup is never recorded as a cache hit.
+ template <class Key, class Value>
+ ttl_time_point TTLCacheBase<Key, Value>::get_value_time_point(Key key) {
+   const value_type entry = cache::get(key, false);
+   return entry.second;
+ }
+
+ /// Set the ttl used for entries inserted from now on; this function does
+ /// not touch entries that are already stored.
+ template <class Key, class Value>
+ void TTLCacheBase<Key, Value>::set_ttl(uint16_t ttl)
+ {
+   this->ttl = ttl;
+ }
+
+ /// True when the underlying Cache holds an entry for `key`, whether or
+ /// not that entry has expired.
+ template <class Key, class Value>
+ bool TTLCacheBase<Key, Value>::exists(Key key)
+ {
+   return cache::exists(key);
+ }
+
+ /// Report `key` as missing by delegating to the underlying Cache.
+ template <class Key, class Value>
+ void TTLCacheBase<Key, Value>::throw_key_not_found(Key key)
+ {
+   cache::throw_key_not_found(key);
+ }
diff --git a/src/mgr/TTLCache.h b/src/mgr/TTLCache.h
new file mode 100644
index 000000000..a6d5ddf2e
--- /dev/null
+++ b/src/mgr/TTLCache.h
@@ -0,0 +1,124 @@
+#pragma once
+
+ #include <atomic>
+ #include <chrono>
+ #include <cstdint>
+ #include <functional>
+ #include <map>
+ #include <memory>
+ #include <sstream>
+ #include <stdexcept>
+ #include <string>
+ #include <utility>
+ #include <vector>
+
+#include "PyUtil.h"
+
+using namespace std;
+
+ /// Bounded key/value store that also keeps hit and miss counters.
+ ///
+ /// Only derived classes can construct it (the constructor is protected).
+ /// The class takes no lock of its own; only the two counters are atomic.
+ template <class Key, class Value> class Cache {
+  private:
+   std::atomic<uint64_t> hits, misses;
+
+  protected:
+   // Maximum number of entries insert() will store.
+   unsigned int capacity;
+   Cache(unsigned int size = UINT16_MAX) : hits{0}, misses{0}, capacity{size} {}
+   std::map<Key, Value> content;
+   // Keys for which is_cacheable() answers true.
+   std::vector<std::string> allowed_keys = {"osd_map", "pg_dump", "pg_stats"};
+
+   void mark_miss() {
+     misses++;
+   }
+
+   void mark_hit() {
+     hits++;
+   }
+
+   // Return the full 64-bit counters instead of truncating them.
+   uint64_t get_misses() { return misses; }
+   uint64_t get_hits() { return hits; }
+
+   /// Throws std::out_of_range with a message naming `key`.
+   void throw_key_not_found(Key key) {
+     std::stringstream ss;
+     ss << "Key " << key << " couldn't be found\n";
+     throw std::out_of_range(ss.str());
+   }
+
+  public:
+   /// Record a miss and store `value` under `key` if there is room.
+   /// An existing entry for `key` is left untouched (std::map::insert
+   /// semantics), and nothing is stored once `capacity` is reached.
+   void insert(Key key, Value value) {
+     mark_miss();
+     if (content.size() < capacity) {
+       content.insert({key, value});
+     }
+   }
+
+   /// Return a copy of the value stored under `key`.
+   /// A missing key yields a value-initialized Value without creating an
+   /// entry and without counting a hit; callers are expected to check
+   /// exists() first.
+   Value get(Key key, bool count_hit = true) {
+     const auto it = content.find(key);
+     if (it == content.end()) {
+       return Value{};
+     }
+     if (count_hit) {
+       mark_hit();
+     }
+     return it->second;
+   }
+
+   /// Remove `key`; erasing a key that is not present is a no-op.
+   void erase(Key key) { content.erase(key); }
+   void clear() { content.clear(); }
+   bool exists(Key key) { return content.find(key) != content.end(); }
+
+   /// Returns the raw counters as (hits, misses); despite the name no
+   /// ratio is computed.
+   std::pair<uint64_t, uint64_t> get_hit_miss_ratio() {
+     return std::make_pair(hits.load(), misses.load());
+   }
+
+   /// True when `key` compares equal to one of `allowed_keys`.
+   bool is_cacheable(Key key) {
+     for (const auto& k : allowed_keys) {
+       if (key == k) return true;
+     }
+     return false;
+   }
+   int size() { return static_cast<int>(content.size()); }
+
+   ~Cache() {}
+ };
+
+using ttl_time_point = std::chrono::time_point<std::chrono::steady_clock>;
+ /// Cache whose entries are stored together with an expiration time point.
+ template <class Key, class Value>
+ class TTLCacheBase : public Cache<Key, std::pair<Value, ttl_time_point>> {
+  private:
+   // Entry lifetime in seconds; insert() stores nothing while this is 0.
+   uint16_t ttl;
+   // Fraction of the ttl by which insert() randomly shifts the expiration.
+   float ttl_spread_ratio;
+   using value_type = std::pair<Value, ttl_time_point>;
+   using cache = Cache<Key, value_type>;
+
+  protected:
+   Value get_value(Key key, bool count_hit = true);
+   ttl_time_point get_value_time_point(Key key);
+   bool exists(Key key);
+   bool expired(Key key);
+   // NOTE(review): TTLCache.cc provides no definition for the next two
+   // declarations; they look like leftovers -- confirm before relying on
+   // them.
+   void finish_get(Key key);
+   void finish_erase(Key key);
+   void throw_key_not_found(Key key);
+
+  public:
+   TTLCacheBase(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX,
+                float spread = 0.25)
+       : cache(size), ttl{ttl_}, ttl_spread_ratio{spread} {}
+   ~TTLCacheBase() {}
+
+   void insert(Key key, Value value);
+   Value get(Key key);
+   void erase(Key key);
+   void clear();
+
+   uint16_t get_ttl() { return ttl; }
+   void set_ttl(uint16_t ttl);
+ };
+
+ /// Generic TTL cache: TTLCacheBase with nothing added or overridden.
+ template <class Key, class Value>
+ class TTLCache : public TTLCacheBase<Key, Value> {
+  public:
+   TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25)
+       : TTLCacheBase<Key, Value>{ttl_, size, spread} {}
+   ~TTLCache() {}
+ };
+
+ /// Specialization for cached Python objects.  get() increments the
+ /// object's reference count before returning it, and erase() decrements
+ /// the count of the stored object before removing the entry.
+ ///
+ /// NOTE(review): insert() and clear() are inherited unchanged, so clear()
+ /// drops the stored pointers without a matching Py_DECREF -- confirm
+ /// whether callers compensate for that.
+ template <class Key>
+ class TTLCache<Key, PyObject*> : public TTLCacheBase<Key, PyObject*> {
+  public:
+   TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25)
+       : TTLCacheBase<Key, PyObject*>{ttl_, size, spread} {}
+   ~TTLCache() {}
+
+   PyObject* get(Key key);
+   void erase(Key key);
+
+  private:
+   using ttl_base = TTLCacheBase<Key, PyObject*>;
+ };
+
+#include "TTLCache.cc"
+
diff --git a/src/mgr/Types.h b/src/mgr/Types.h
new file mode 100644
index 000000000..ab90bbbe9
--- /dev/null
+++ b/src/mgr/Types.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_MGR_TYPES_H
+#define CEPH_MGR_TYPES_H
+
+ // Integer handle for a metric query.
+ using MetricQueryID = int;
+
+ // A performance counter is a pair of 64-bit values; this header does not
+ // say what each half means.
+ using PerformanceCounter = std::pair<uint64_t, uint64_t>;
+ using PerformanceCounters = std::vector<PerformanceCounter>;
+
+ /// Interface for objects that receive handle_query_updated() callbacks.
+ struct MetricListener {
+   virtual ~MetricListener() = default;
+
+   virtual void handle_query_updated() = 0;
+ };
+
+ /// Holds the MetricQueryID it was constructed with.
+ struct PerfCollector {
+   MetricQueryID query_id;
+
+   PerfCollector(MetricQueryID query_id) : query_id{query_id} {}
+ };
+
+#endif // CEPH_MGR_TYPES_H
diff --git a/src/mgr/mgr_commands.cc b/src/mgr/mgr_commands.cc
new file mode 100644
index 000000000..206d1126a
--- /dev/null
+++ b/src/mgr/mgr_commands.cc
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "mgr_commands.h"
+
+ /* The set of statically defined (C++-handled) commands. This
+ * does not include the Python-defined commands, which are loaded
+ * in PyModules.
+ *
+ * The table is built by defining COMMAND() and then including
+ * MgrCommands.h: each COMMAND(parsesig, helptext, module, perm) entry
+ * in that header expands to one MonCommand initializer whose fifth
+ * field is 0 (NOTE(review): presumably the flags field -- confirm
+ * against MonCommand).  COMMAND is undefined again afterwards so the
+ * macro does not leak into the rest of the translation unit. */
+ const std::vector<MonCommand> mgr_commands = {
+ #define COMMAND(parsesig, helptext, module, perm) \
+ {parsesig, helptext, module, perm, 0},
+ #include "MgrCommands.h"
+ #undef COMMAND
+ };
diff --git a/src/mgr/mgr_commands.h b/src/mgr/mgr_commands.h
new file mode 100644
index 000000000..c6ed6c68d
--- /dev/null
+++ b/src/mgr/mgr_commands.h
@@ -0,0 +1,9 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "mon/MonCommand.h"
+#include <vector>
+
+ // Table of MonCommand entries; the definition lives in mgr_commands.cc.
+ extern const std::vector<MonCommand> mgr_commands;
diff --git a/src/mgr/mgr_perf_counters.cc b/src/mgr/mgr_perf_counters.cc
new file mode 100644
index 000000000..1b5585f9e
--- /dev/null
+++ b/src/mgr/mgr_perf_counters.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "mgr_perf_counters.h"
+#include "common/perf_counters.h"
+#include "common/ceph_context.h"
+
+ // Global "mgr" perf counter set; null until mgr_perf_start() creates it.
+ PerfCounters *perfcounter = nullptr;
+
+ /// Build the "mgr" perf counter set (cache_hit and cache_miss), store it
+ /// in the global `perfcounter` and add it to the context's perf counter
+ /// collection.  Always returns 0.
+ int mgr_perf_start(CephContext *cct)
+ {
+   PerfCountersBuilder builder(cct, "mgr", l_mgr_first, l_mgr_last);
+   builder.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+   builder.add_u64_counter(l_mgr_cache_hit, "cache_hit", "Cache hits");
+   builder.add_u64_counter(l_mgr_cache_miss, "cache_miss", "Cache miss");
+
+   perfcounter = builder.create_perf_counters();
+   cct->get_perfcounters_collection()->add(perfcounter);
+
+   return 0;
+ }
+
+ /// Remove the "mgr" perf counter set from the context's collection and
+ /// destroy it.  Asserts that the global `perfcounter` is non-null.
+ void mgr_perf_stop(CephContext *cct)
+ {
+   ceph_assert(perfcounter);
+   cct->get_perfcounters_collection()->remove(perfcounter);
+   delete perfcounter;
+   // Clear the global so it does not dangle: a repeated stop now trips
+   // the assert above instead of deleting the same object twice.
+   perfcounter = nullptr;
+ }
diff --git a/src/mgr/mgr_perf_counters.h b/src/mgr/mgr_perf_counters.h
new file mode 100644
index 000000000..d695d905f
--- /dev/null
+++ b/src/mgr/mgr_perf_counters.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+#include "include/common_fwd.h"
+
+ // Global "mgr" perf counter set; created by mgr_perf_start() and deleted
+ // by mgr_perf_stop() (both defined in mgr_perf_counters.cc).
+ extern PerfCounters* perfcounter;
+
+ // Create and register / unregister and destroy the counter set above.
+ extern int mgr_perf_start(CephContext* cct);
+ extern void mgr_perf_stop(CephContext* cct);
+
+ // Indices of the "mgr" perf counters.  l_mgr_first and l_mgr_last are the
+ // bounds handed to PerfCountersBuilder in mgr_perf_start(); the actual
+ // counters are the enumerators between them.
+ enum {
+ l_mgr_first,
+
+ l_mgr_cache_hit,
+ l_mgr_cache_miss,
+
+ l_mgr_last,
+ };
+