From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 20:24:20 +0200 Subject: Adding upstream version 14.2.21. Signed-off-by: Daniel Baumann --- src/mgr/ActivePyModule.cc | 275 +++ src/mgr/ActivePyModule.h | 103 ++ src/mgr/ActivePyModules.cc | 1102 ++++++++++++ src/mgr/ActivePyModules.h | 194 +++ src/mgr/BaseMgrModule.cc | 1228 ++++++++++++++ src/mgr/BaseMgrModule.h | 7 + src/mgr/BaseMgrStandbyModule.cc | 230 +++ src/mgr/BaseMgrStandbyModule.h | 7 + src/mgr/CMakeLists.txt | 37 + src/mgr/ClusterState.cc | 383 +++++ src/mgr/ClusterState.h | 161 ++ src/mgr/DaemonHealthMetric.h | 76 + src/mgr/DaemonHealthMetricCollector.cc | 125 ++ src/mgr/DaemonHealthMetricCollector.h | 32 + src/mgr/DaemonServer.cc | 2921 ++++++++++++++++++++++++++++++++ src/mgr/DaemonServer.h | 181 ++ src/mgr/DaemonState.cc | 347 ++++ src/mgr/DaemonState.h | 400 +++++ src/mgr/Gil.cc | 79 + src/mgr/Gil.h | 72 + src/mgr/Mgr.cc | 692 ++++++++ src/mgr/Mgr.h | 132 ++ src/mgr/MgrCap.cc | 580 +++++++ src/mgr/MgrCap.h | 202 +++ src/mgr/MgrClient.cc | 531 ++++++ src/mgr/MgrClient.h | 166 ++ src/mgr/MgrCommands.h | 210 +++ src/mgr/MgrContext.h | 66 + src/mgr/MgrSession.h | 38 + src/mgr/MgrStandby.cc | 515 ++++++ src/mgr/MgrStandby.h | 89 + src/mgr/OSDPerfMetricCollector.cc | 207 +++ src/mgr/OSDPerfMetricCollector.h | 55 + src/mgr/OSDPerfMetricTypes.cc | 132 ++ src/mgr/OSDPerfMetricTypes.h | 355 ++++ src/mgr/PyFormatter.cc | 127 ++ src/mgr/PyFormatter.h | 142 ++ src/mgr/PyModule.cc | 726 ++++++++ src/mgr/PyModule.h | 188 ++ src/mgr/PyModuleRegistry.cc | 526 ++++++ src/mgr/PyModuleRegistry.h | 186 ++ src/mgr/PyModuleRunner.cc | 110 ++ src/mgr/PyModuleRunner.h | 89 + src/mgr/PyOSDMap.cc | 667 ++++++++ src/mgr/PyOSDMap.h | 20 + src/mgr/PythonCompat.h | 38 + src/mgr/ServiceMap.cc | 138 ++ src/mgr/ServiceMap.h | 156 ++ src/mgr/StandbyPyModules.cc | 205 +++ src/mgr/StandbyPyModules.h | 130 ++ src/mgr/mgr_commands.cc | 14 + src/mgr/mgr_commands.h | 9 + 52 
files changed, 15401 insertions(+) create mode 100644 src/mgr/ActivePyModule.cc create mode 100644 src/mgr/ActivePyModule.h create mode 100644 src/mgr/ActivePyModules.cc create mode 100644 src/mgr/ActivePyModules.h create mode 100644 src/mgr/BaseMgrModule.cc create mode 100644 src/mgr/BaseMgrModule.h create mode 100644 src/mgr/BaseMgrStandbyModule.cc create mode 100644 src/mgr/BaseMgrStandbyModule.h create mode 100644 src/mgr/CMakeLists.txt create mode 100644 src/mgr/ClusterState.cc create mode 100644 src/mgr/ClusterState.h create mode 100644 src/mgr/DaemonHealthMetric.h create mode 100644 src/mgr/DaemonHealthMetricCollector.cc create mode 100644 src/mgr/DaemonHealthMetricCollector.h create mode 100644 src/mgr/DaemonServer.cc create mode 100644 src/mgr/DaemonServer.h create mode 100644 src/mgr/DaemonState.cc create mode 100644 src/mgr/DaemonState.h create mode 100644 src/mgr/Gil.cc create mode 100644 src/mgr/Gil.h create mode 100644 src/mgr/Mgr.cc create mode 100644 src/mgr/Mgr.h create mode 100644 src/mgr/MgrCap.cc create mode 100644 src/mgr/MgrCap.h create mode 100644 src/mgr/MgrClient.cc create mode 100644 src/mgr/MgrClient.h create mode 100644 src/mgr/MgrCommands.h create mode 100644 src/mgr/MgrContext.h create mode 100644 src/mgr/MgrSession.h create mode 100644 src/mgr/MgrStandby.cc create mode 100644 src/mgr/MgrStandby.h create mode 100644 src/mgr/OSDPerfMetricCollector.cc create mode 100644 src/mgr/OSDPerfMetricCollector.h create mode 100644 src/mgr/OSDPerfMetricTypes.cc create mode 100644 src/mgr/OSDPerfMetricTypes.h create mode 100644 src/mgr/PyFormatter.cc create mode 100644 src/mgr/PyFormatter.h create mode 100644 src/mgr/PyModule.cc create mode 100644 src/mgr/PyModule.h create mode 100644 src/mgr/PyModuleRegistry.cc create mode 100644 src/mgr/PyModuleRegistry.h create mode 100644 src/mgr/PyModuleRunner.cc create mode 100644 src/mgr/PyModuleRunner.h create mode 100644 src/mgr/PyOSDMap.cc create mode 100644 src/mgr/PyOSDMap.h create mode 100644 
src/mgr/PythonCompat.h create mode 100644 src/mgr/ServiceMap.cc create mode 100644 src/mgr/ServiceMap.h create mode 100644 src/mgr/StandbyPyModules.cc create mode 100644 src/mgr/StandbyPyModules.h create mode 100644 src/mgr/mgr_commands.cc create mode 100644 src/mgr/mgr_commands.h (limited to 'src/mgr') diff --git a/src/mgr/ActivePyModule.cc b/src/mgr/ActivePyModule.cc new file mode 100644 index 00000000..852c17be --- /dev/null +++ b/src/mgr/ActivePyModule.cc @@ -0,0 +1,275 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "PyFormatter.h" + +#include "common/debug.h" +#include "mon/MonCommand.h" + +#include "ActivePyModule.h" +#include "MgrSession.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +int ActivePyModule::load(ActivePyModules *py_modules) +{ + ceph_assert(py_modules); + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. 
+ auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr); + auto pModuleName = PyString_FromString(get_name().c_str()); + auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr); + + pClassInstance = PyObject_CallObject(py_module->pClass, pArgs); + Py_DECREF(pModuleName); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + } + + return 0; +} + +void ActivePyModule::notify(const std::string ¬ify_type, const std::string ¬ify_id) +{ + if (is_dead()) { + dout(5) << "cancelling notify " << notify_type << " " << notify_id << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast("notify"), const_cast("(ss)"), + notify_type.c_str(), notify_id.c_str()); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << get_name() << ".notify:" << dendl; + derr << handle_pyerror() << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +void ActivePyModule::notify_clog(const LogEntry &log_entry) +{ + if (is_dead()) { + dout(5) << "cancelling notify_clog" << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Construct python-ized LogEntry + PyFormatter f; + log_entry.dump(&f); + auto py_log_entry = f.get(); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast("notify"), const_cast("(sN)"), + "clog", py_log_entry); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << 
get_name() << ".notify_clog:" << dendl; + derr << handle_pyerror() << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +bool ActivePyModule::method_exists(const std::string &method) const +{ + Gil gil(py_module->pMyThreadState, true); + + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + if (boundMethod == nullptr) { + return false; + } else { + Py_DECREF(boundMethod); + return true; + } +} + +PyObject *ActivePyModule::dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + ceph_assert(err != nullptr); + + // Rather than serializing arguments, pass the CPython objects. + // Works because we happen to know that the subinterpreter + // implementation shares a GIL, allocator, deallocator and GC state, so + // it's okay to pass the objects between subinterpreters. + // But in future this might involve serialization to support a CSP-aware + // future Python interpreter a la PEP554 + + Gil gil(py_module->pMyThreadState, true); + + // Fire the receiving method + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + + // Caller should have done method_exists check first! + ceph_assert(boundMethod != nullptr); + + dout(20) << "Calling " << py_module->get_name() + << "." << method << "..." << dendl; + + auto remoteResult = PyObject_Call(boundMethod, + args, kwargs); + Py_DECREF(boundMethod); + + if (remoteResult == nullptr) { + // Because the caller is in a different context, we can't let this + // exception bubble up, need to re-raise it from the caller's + // context later. 
+ *err = handle_pyerror(); + } else { + dout(20) << "Success calling '" << method << "'" << dendl; + } + + return remoteResult; +} + +void ActivePyModule::config_notify() +{ + if (is_dead()) { + dout(5) << "cancelling config_notify" << dendl; + return; + } + + Gil gil(py_module->pMyThreadState, true); + dout(20) << "Calling " << py_module->get_name() << ".config_notify..." + << dendl; + auto remoteResult = PyObject_CallMethod(pClassInstance, + const_cast("config_notify"), + (char*)NULL); + if (remoteResult != nullptr) { + Py_DECREF(remoteResult); + } +} + +int ActivePyModule::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + ceph_assert(ss != nullptr); + ceph_assert(ds != nullptr); + + if (pClassInstance == nullptr) { + // Not the friendliest error string, but we could only + // hit this in quite niche cases, if at all. + *ss << "Module not instantiated"; + return -EINVAL; + } + + Gil gil(py_module->pMyThreadState, true); + + PyFormatter f; + cmdmap_dump(cmdmap, &f); + PyObject *py_cmd = f.get(); + string instr; + inbuf.copy(0, inbuf.length(), instr); + + ceph_assert(m_session == nullptr); + m_command_perms = module_command.perm; + m_session = &session; + + auto pResult = PyObject_CallMethod(pClassInstance, + const_cast("_handle_command"), const_cast("s#O"), + instr.c_str(), instr.length(), py_cmd); + + m_command_perms.clear(); + m_session = nullptr; + Py_DECREF(py_cmd); + + int r = 0; + if (pResult != NULL) { + if (PyTuple_Size(pResult) != 3) { + derr << "module '" << py_module->get_name() << "' command handler " + "returned wrong type!" 
<< dendl; + r = -EINVAL; + } else { + r = PyInt_AsLong(PyTuple_GetItem(pResult, 0)); + *ds << PyString_AsString(PyTuple_GetItem(pResult, 1)); + *ss << PyString_AsString(PyTuple_GetItem(pResult, 2)); + } + + Py_DECREF(pResult); + } else { + derr << "module '" << py_module->get_name() << "' command handler " + "threw exception: " << peek_pyerror() << dendl; + *ds << ""; + *ss << handle_pyerror(); + r = -EINVAL; + } + + return r; +} + +void ActivePyModule::get_health_checks(health_check_map_t *checks) +{ + if (is_dead()) { + dout(5) << "cancelling get_health_checks" << dendl; + return; + } + checks->merge(health_checks); +} + +bool ActivePyModule::is_authorized( + const std::map& arguments) const { + if (m_session == nullptr) { + return false; + } + + // No need to pass command prefix here since that would have already been + // tested before command invokation. Instead, only test for service/module + // arguments as defined by the module itself. + MonCommand mon_command {"", "", "", m_command_perms}; + return m_session->caps.is_capable(nullptr, m_session->entity_name, "py", + py_module->get_name(), "", arguments, + mon_command.requires_perm('r'), + mon_command.requires_perm('w'), + mon_command.requires_perm('x'), + m_session->get_peer_addr()); +} diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h new file mode 100644 index 00000000..410b8d81 --- /dev/null +++ b/src/mgr/ActivePyModule.h @@ -0,0 +1,103 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#pragma once + +// Python.h comes first because otherwise it clobbers ceph's assert +#include "Python.h" + +#include "common/cmdparse.h" +#include "common/LogEntry.h" +#include "common/Mutex.h" +#include "common/Thread.h" +#include "mon/health_check.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" + +#include +#include + + +class ActivePyModule; +class ActivePyModules; +class MgrSession; +class ModuleCommand; + +class ActivePyModule : public PyModuleRunner +{ +private: + health_check_map_t health_checks; + + // Optional, URI exposed by plugins that implement serve() + std::string uri; + + std::string m_command_perms; + const MgrSession* m_session = nullptr; + +public: + ActivePyModule(const PyModuleRef &py_module_, + LogChannelRef clog_) + : PyModuleRunner(py_module_, clog_) + {} + + int load(ActivePyModules *py_modules); + void notify(const std::string ¬ify_type, const std::string ¬ify_id); + void notify_clog(const LogEntry &le); + + bool method_exists(const std::string &method) const; + + PyObject *dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + + bool set_health_checks(health_check_map_t&& c) { + // when health checks change a report is immediately sent to the monitors. + // currently modules have static health check details, but this equality + // test could be made smarter if too much noise shows up in the future. 
+ bool changed = health_checks != c; + health_checks = std::move(c); + return changed; + } + void get_health_checks(health_check_map_t *checks); + void config_notify(); + + void set_uri(const std::string &str) + { + uri = str; + } + + std::string get_uri() const + { + return uri; + } + + bool is_authorized(const std::map& arguments) const; + +}; + +std::string handle_pyerror(); + diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc new file mode 100644 index 00000000..e3f6e471 --- /dev/null +++ b/src/mgr/ActivePyModules.cc @@ -0,0 +1,1102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +// Include this first to get python headers earlier +#include "Gil.h" + +#include "common/errno.h" +#include "include/stringify.h" + +#include "PyFormatter.h" + +#include "osd/OSDMap.h" +#include "mon/MonMap.h" + +#include "mgr/MgrContext.h" + +// For ::config_prefix +#include "PyModule.h" +#include "PyModuleRegistry.h" + +#include "ActivePyModules.h" +#include "DaemonServer.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +ActivePyModules::ActivePyModules(PyModuleConfig &module_config_, + std::map store_data, + DaemonStateIndex &ds, ClusterState &cs, + MonClient &mc, LogChannelRef clog_, + LogChannelRef audit_clog_, Objecter &objecter_, + Client &client_, Finisher &f, DaemonServer &server, + PyModuleRegistry &pmr) + : module_config(module_config_), daemon_state(ds), cluster_state(cs), + monc(mc), clog(clog_), audit_clog(audit_clog_), objecter(objecter_), + client(client_), finisher(f), + 
cmd_finisher(g_ceph_context, "cmd_finisher", "cmdfin"), + server(server), py_module_registry(pmr), lock("ActivePyModules") +{ + store_cache = std::move(store_data); + cmd_finisher.start(); +} + +ActivePyModules::~ActivePyModules() = default; + +void ActivePyModules::dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f) +{ + f->dump_string("hostname", hostname); + f->open_array_section("services"); + std::string ceph_version; + + for (const auto &i : dmc) { + std::lock_guard l(i.second->lock); + const auto &key = i.first; + const std::string &str_type = key.first; + const std::string &svc_name = key.second; + + // TODO: pick the highest version, and make sure that + // somewhere else (during health reporting?) we are + // indicating to the user if we see mixed versions + auto ver_iter = i.second->metadata.find("ceph_version"); + if (ver_iter != i.second->metadata.end()) { + ceph_version = i.second->metadata.at("ceph_version"); + } + + f->open_object_section("service"); + f->dump_string("type", str_type); + f->dump_string("id", svc_name); + f->close_section(); + } + f->close_section(); + + f->dump_string("ceph_version", ceph_version); +} + + + +PyObject *ActivePyModules::get_server_python(const std::string &hostname) +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + PyEval_RestoreThread(tstate); + dout(10) << " (" << hostname << ")" << dendl; + + auto dmc = daemon_state.get_by_server(hostname); + + PyFormatter f; + dump_server(hostname, dmc, &f); + return f.get(); +} + + +PyObject *ActivePyModules::list_servers_python() +{ + PyFormatter f(false, true); + PyThreadState *tstate = PyEval_SaveThread(); + dout(10) << " >" << dendl; + + daemon_state.with_daemons_by_server([this, &f, &tstate] + (const std::map &all) { + PyEval_RestoreThread(tstate); + + for (const auto &i : all) { + const auto &hostname = i.first; + + f.open_object_section("server"); + dump_server(hostname, i.second, &f); + 
f.close_section(); + } + }); + + return f.get(); +} + +PyObject *ActivePyModules::get_metadata_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id)); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + + std::lock_guard l(metadata->lock); + PyFormatter f; + f.dump_string("hostname", metadata->hostname); + for (const auto &i : metadata->metadata) { + f.dump_string(i.first.c_str(), i.second); + } + + return f.get(); +} + +PyObject *ActivePyModules::get_daemon_status_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey(svc_type, svc_id)); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + + std::lock_guard l(metadata->lock); + PyFormatter f; + for (const auto &i : metadata->service_status) { + f.dump_string(i.first.c_str(), i.second); + } + return f.get(); +} + +PyObject *ActivePyModules::get_python(const std::string &what) +{ + PyFormatter f; + + // Drop the GIL, as most of the following blocks will block on + // a mutex -- they are all responsible for re-taking the GIL before + // touching the PyFormatter instance or returning from the function. 
+ PyThreadState *tstate = PyEval_SaveThread(); + + if (what == "fs_map") { + cluster_state.with_fsmap([&f, &tstate](const FSMap &fsmap) { + PyEval_RestoreThread(tstate); + fsmap.dump(&f); + }); + return f.get(); + } else if (what == "osdmap_crush_map_text") { + bufferlist rdata; + cluster_state.with_osdmap([&rdata, &tstate](const OSDMap &osd_map){ + PyEval_RestoreThread(tstate); + osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT); + }); + std::string crush_text = rdata.to_str(); + return PyString_FromString(crush_text.c_str()); + } else if (what.substr(0, 7) == "osd_map") { + cluster_state.with_osdmap([&f, &what, &tstate](const OSDMap &osd_map){ + PyEval_RestoreThread(tstate); + if (what == "osd_map") { + osd_map.dump(&f); + } else if (what == "osd_map_tree") { + osd_map.print_tree(&f, nullptr); + } else if (what == "osd_map_crush") { + osd_map.crush->dump(&f); + } + }); + return f.get(); + } else if (what == "modified_config_options") { + PyEval_RestoreThread(tstate); + auto all_daemons = daemon_state.get_all(); + set names; + for (auto& [key, daemon] : all_daemons) { + std::lock_guard l(daemon->lock); + for (auto& [name, valmap] : daemon->config) { + names.insert(name); + } + } + f.open_array_section("options"); + for (auto& name : names) { + f.dump_string("name", name); + } + f.close_section(); + return f.get(); + } else if (what.substr(0, 6) == "config") { + PyEval_RestoreThread(tstate); + if (what == "config_options") { + g_conf().config_options(&f); + } else if (what == "config") { + g_conf().show_config(&f); + } + return f.get(); + } else if (what == "mon_map") { + cluster_state.with_monmap( + [&f, &tstate](const MonMap &monmap) { + PyEval_RestoreThread(tstate); + monmap.dump(&f); + } + ); + return f.get(); + } else if (what == "service_map") { + cluster_state.with_servicemap( + [&f, &tstate](const ServiceMap &service_map) { + PyEval_RestoreThread(tstate); + service_map.dump(&f); + } + ); + return f.get(); + } else if (what == "osd_metadata") { + 
auto dmc = daemon_state.get_by_service("osd"); + PyEval_RestoreThread(tstate); + + for (const auto &i : dmc) { + std::lock_guard l(i.second->lock); + f.open_object_section(i.first.second.c_str()); + f.dump_string("hostname", i.second->hostname); + for (const auto &j : i.second->metadata) { + f.dump_string(j.first.c_str(), j.second); + } + f.close_section(); + } + return f.get(); + } else if (what == "pg_summary") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + + std::map > osds; + std::map > pools; + std::map all; + for (const auto &i : pg_map.pg_stat) { + const auto pool = i.first.m_pool; + const std::string state = pg_state_string(i.second.state); + // Insert to per-pool map + pools[stringify(pool)][state]++; + for (const auto &osd_id : i.second.acting) { + osds[stringify(osd_id)][state]++; + } + all[state]++; + } + f.open_object_section("by_osd"); + for (const auto &i : osds) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("by_pool"); + for (const auto &i : pools) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("all"); + for (const auto &i : all) { + f.dump_int(i.first.c_str(), i.second); + } + f.close_section(); + f.open_object_section("pg_stats_sum"); + pg_map.pg_sum.dump(&f); + f.close_section(); + } + ); + return f.get(); + } else if (what == "pg_status") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.print_summary(&f, nullptr); + } + ); + return f.get(); + } else if (what == "pg_dump") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump(&f, false); + } + ); + return f.get(); + } else if (what 
== "devices") { + daemon_state.with_devices2( + [&tstate, &f]() { + PyEval_RestoreThread(tstate); + f.open_array_section("devices"); + }, + [&f] (const DeviceState& dev) { + f.dump_object("device", dev); + }); + f.close_section(); + return f.get(); + } else if (what.size() > 7 && + what.substr(0, 7) == "device ") { + string devid = what.substr(7); + if (!daemon_state.with_device( + devid, + [&f, &tstate] (const DeviceState& dev) { + PyEval_RestoreThread(tstate); + f.dump_object("device", dev); + })) { + // device not found + PyEval_RestoreThread(tstate); + } + return f.get(); + } else if (what == "io_rate") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_delta(&f); + } + ); + return f.get(); + } else if (what == "df") { + cluster_state.with_osdmap_and_pgmap( + [this, &f, &tstate]( + const OSDMap& osd_map, + const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_cluster_stats(nullptr, &f, true); + pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true); + }); + return f.get(); + } else if (what == "pg_stats") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_pg_stats(&f, false); + }); + return f.get(); + } else if (what == "pool_stats") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_pool_stats(&f); + }); + return f.get(); + } else if (what == "pg_ready") { + PyEval_RestoreThread(tstate); + server.dump_pg_ready(&f); + return f.get(); + } else if (what == "osd_stats") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_osd_stats(&f, false); + }); + return f.get(); + } else if (what == "osd_ping_times") { + cluster_state.with_pgmap( + [&f, &tstate](const PGMap &pg_map) { + PyEval_RestoreThread(tstate); + pg_map.dump_osd_ping_times(&f); + }); + return f.get(); + } else if (what == "osd_pool_stats") 
{ + int64_t poolid = -ENOENT; + string pool_name; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, + const PGMap& pg_map) { + PyEval_RestoreThread(tstate); + f.open_array_section("pool_stats"); + for (auto &p : osdmap.get_pools()) { + poolid = p.first; + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, &f, nullptr); + } + f.close_section(); + }); + return f.get(); + } else if (what == "health") { + cluster_state.with_health( + [&f, &tstate](const ceph::bufferlist &health_json) { + PyEval_RestoreThread(tstate); + f.dump_string("json", health_json.to_str()); + }); + return f.get(); + } else if (what == "mon_status") { + cluster_state.with_mon_status( + [&f, &tstate](const ceph::bufferlist &mon_status_json) { + PyEval_RestoreThread(tstate); + f.dump_string("json", mon_status_json.to_str()); + }); + return f.get(); + } else if (what == "mgr_map") { + cluster_state.with_mgrmap([&f, &tstate](const MgrMap &mgr_map) { + PyEval_RestoreThread(tstate); + mgr_map.dump(&f); + }); + return f.get(); + } else { + derr << "Python module requested unknown data '" << what << "'" << dendl; + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; + } +} + +void ActivePyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + + const auto name = py_module->get_name(); + auto active_module = std::make_shared(py_module, clog); + + pending_modules.insert(name); + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. 
+ finisher.queue(new FunctionContext([this, active_module, name](int) { + int r = active_module->load(this); + std::lock_guard l(lock); + pending_modules.erase(name); + if (r != 0) { + derr << "Failed to run module in active mode ('" << name << "')" + << dendl; + } else { + auto em = modules.emplace(name, active_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + active_module->thread.create(active_module->get_thread_name()); + } + })); +} + +void ActivePyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Signal modules to drop out of serve() and/or tear down resources + for (auto& [name, module] : modules) { + lock.Unlock(); + dout(10) << "calling module " << name << " shutdown()" << dendl; + module->shutdown(); + dout(10) << "module " << name << " shutdown() returned" << dendl; + lock.Lock(); + } + + // For modules implementing serve(), finish the threads where we + // were running that. + for (auto& [name, module] : modules) { + lock.Unlock(); + dout(10) << "joining module " << name << dendl; + module->thread.join(); + dout(10) << "joined module " << name << dendl; + lock.Lock(); + } + + cmd_finisher.wait_for_empty(); + cmd_finisher.stop(); + + modules.clear(); +} + +void ActivePyModules::notify_all(const std::string ¬ify_type, + const std::string ¬ify_id) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all " << notify_type << dendl; + for (auto& [name, module] : modules) { + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. 
+ dout(15) << "queuing notify to " << name << dendl; + finisher.queue(new FunctionContext([module, notify_type, notify_id](int r){ + module->notify(notify_type, notify_id); + })); + } +} + +void ActivePyModules::notify_all(const LogEntry &log_entry) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all (clog)" << dendl; + for (auto& [name, module] : modules) { + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + // + // Note intentional use of non-reference lambda binding on + // log_entry: we take a copy because caller's instance is + // probably ephemeral. + dout(15) << "queuing notify (clog) to " << name << dendl; + finisher.queue(new FunctionContext([module, log_entry](int r){ + module->notify_clog(log_entry); + })); + } +} + +bool ActivePyModules::get_store(const std::string &module_name, + const std::string &key, std::string *val) const +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + PyEval_RestoreThread(tstate); + + const std::string global_key = PyModule::config_prefix + + module_name + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + auto i = store_cache.find(global_key); + if (i != store_cache.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + auto mod_iter = modules.find(other_module); + ceph_assert(mod_iter != modules.end()); + + return mod_iter->second->dispatch_remote(method, args, kwargs, err); +} + +bool ActivePyModules::get_config(const std::string &module_name, + const std::string &key, std::string *val) const +{ + const std::string global_key = PyModule::config_prefix + + module_name + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + std::lock_guard lock(module_config.lock); + + auto i = 
module_config.config.find(global_key); + if (i != module_config.config.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::get_typed_config( + const std::string &module_name, + const std::string &key, + const std::string &prefix) const +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::string value; + std::string final_key; + bool found = false; + if (prefix.size()) { + final_key = prefix + "/" + key; + found = get_config(module_name, final_key, &value); + } + if (!found) { + final_key = key; + found = get_config(module_name, final_key, &value); + } + if (found) { + PyModuleRef module = py_module_registry.get_module(module_name); + PyEval_RestoreThread(tstate); + if (!module) { + derr << "Module '" << module_name << "' is not available" << dendl; + Py_RETURN_NONE; + } + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << __func__ << " " << final_key << " found: " << value << dendl; + dout(10) << __func__ << " " << final_key << " found" << dendl; + return module->get_typed_option_value(key, value); + } + PyEval_RestoreThread(tstate); + if (prefix.size()) { + dout(4) << __func__ << " [" << prefix << "/]" << key << " not found " + << dendl; + } else { + dout(4) << __func__ << " " << key << " not found " << dendl; + } + Py_RETURN_NONE; +} + +PyObject *ActivePyModules::get_store_prefix(const std::string &module_name, + const std::string &prefix) const +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + std::lock_guard lock(module_config.lock); + PyEval_RestoreThread(tstate); + + const std::string base_prefix = PyModule::config_prefix + + module_name + "/"; + const std::string global_prefix = base_prefix + prefix; + dout(4) << __func__ << " prefix: " << global_prefix << dendl; + + PyFormatter f; + + for (auto p = store_cache.lower_bound(global_prefix); + p != store_cache.end() && p->first.find(global_prefix) == 0; + 
++p) { + f.dump_string(p->first.c_str() + base_prefix.size(), p->second); + } + return f.get(); +} + +void ActivePyModules::set_store(const std::string &module_name, + const std::string &key, const boost::optional& val) +{ + const std::string global_key = PyModule::config_prefix + + module_name + "/" + key; + + Command set_cmd; + { + std::lock_guard l(lock); + if (val) { + store_cache[global_key] = *val; + } else { + store_cache.erase(global_key); + } + + std::ostringstream cmd_json; + JSONFormatter jf; + jf.open_object_section("cmd"); + if (val) { + jf.dump_string("prefix", "config-key set"); + jf.dump_string("key", global_key); + jf.dump_string("val", *val); + } else { + jf.dump_string("prefix", "config-key del"); + jf.dump_string("key", global_key); + } + jf.close_section(); + jf.flush(cmd_json); + set_cmd.run(&monc, cmd_json.str()); + } + set_cmd.wait(); + + if (set_cmd.r != 0) { + // config-key set will fail if mgr's auth key has insufficient + // permission to set config keys + // FIXME: should this somehow raise an exception back into Python land? 
+ dout(0) << "`config-key set " << global_key << " " << val << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl; + } +} + +void ActivePyModules::set_config(const std::string &module_name, + const std::string &key, const boost::optional& val) +{ + module_config.set_config(&monc, module_name, key, val); +} + +std::map ActivePyModules::get_services() const +{ + std::map result; + std::lock_guard l(lock); + for (const auto& [name, module] : modules) { + std::string svc_str = module->get_uri(); + if (!svc_str.empty()) { + result[name] = svc_str; + } + } + + return result; +} + +PyObject* ActivePyModules::with_perf_counters( + std::function fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + PyEval_RestoreThread(tstate); + + PyFormatter f; + f.open_array_section(path.c_str()); + + auto metadata = daemon_state.get(DaemonKey(svc_name, svc_id)); + if (metadata) { + std::lock_guard l2(metadata->lock); + if (metadata->perf_counters.instances.count(path)) { + auto counter_instance = metadata->perf_counters.instances.at(path); + auto counter_type = metadata->perf_counters.types.at(path); + fct(counter_instance, counter_type, f); + } else { + dout(4) << "Missing counter: '" << path << "' (" + << svc_name << "." << svc_id << ")" << dendl; + dout(20) << "Paths are:" << dendl; + for (const auto &i : metadata->perf_counters.instances) { + dout(20) << i.first << dendl; + } + } + } else { + dout(4) << "No daemon state for " + << svc_name << "." 
<< svc_id << ")" << dendl; + } + f.close_section(); + return f.get(); +} + +PyObject* ActivePyModules::get_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &avg_data = counter_instance.get_data_avg(); + for (const auto &datapoint : avg_data) { + f.open_array_section("datapoint"); + f.dump_unsigned("t", datapoint.t.sec()); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + f.close_section(); + } + } else { + const auto &data = counter_instance.get_data(); + for (const auto &datapoint : data) { + f.open_array_section("datapoint"); + f.dump_unsigned("t", datapoint.t.sec()); + f.dump_unsigned("v", datapoint.v); + f.close_section(); + } + } + }; + return with_perf_counters(extract_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_latest_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_latest_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &datapoint = counter_instance.get_latest_data_avg(); + f.dump_unsigned("t", datapoint.t.sec()); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + } else { + const auto &datapoint = counter_instance.get_latest_data(); + f.dump_unsigned("t", datapoint.t.sec()); + f.dump_unsigned("v", datapoint.v); + } + }; + return with_perf_counters(extract_latest_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id) +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + PyEval_RestoreThread(tstate); + + 
DaemonStateCollection daemons; + + if (svc_type == "") { + daemons = daemon_state.get_all(); + } else if (svc_id.empty()) { + daemons = daemon_state.get_by_service(svc_type); + } else { + auto key = DaemonKey(svc_type, svc_id); + // so that the below can be a loop in all cases + auto got = daemon_state.get(key); + if (got != nullptr) { + daemons[key] = got; + } + } + + PyFormatter f; + if (!daemons.empty()) { + for (auto statepair : daemons) { + auto key = statepair.first; + auto state = statepair.second; + + std::ostringstream daemon_name; + daemon_name << key.first << "." << key.second; + f.open_object_section(daemon_name.str().c_str()); + + std::lock_guard l(state->lock); + for (auto ctr_inst_iter : state->perf_counters.instances) { + const auto &counter_name = ctr_inst_iter.first; + f.open_object_section(counter_name.c_str()); + auto type = state->perf_counters.types[counter_name]; + f.dump_string("description", type.description); + if (!type.nick.empty()) { + f.dump_string("nick", type.nick); + } + f.dump_unsigned("type", type.type); + f.dump_unsigned("priority", type.priority); + f.dump_unsigned("units", type.unit); + f.close_section(); + } + f.close_section(); + } + } else { + dout(4) << __func__ << ": No daemon state found for " + << svc_type << "." << svc_id << ")" << dendl; + } + return f.get(); +} + +PyObject *ActivePyModules::get_context() +{ + PyThreadState *tstate = PyEval_SaveThread(); + std::lock_guard l(lock); + PyEval_RestoreThread(tstate); + + // Construct a capsule containing ceph context. + // Not incrementing/decrementing ref count on the context because + // it's the global one and it has process lifetime. + auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr); + return capsule; +} + +/** + * Helper for our wrapped types that take a capsule in their constructor. 
+ */ +PyObject *construct_with_capsule( + const std::string &module_name, + const std::string &clsname, + void *wrapped) +{ + // Look up the OSDMap type which we will construct + PyObject *module = PyImport_ImportModule(module_name.c_str()); + if (!module) { + derr << "Failed to import python module:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(module); + + PyObject *wrapper_type = PyObject_GetAttrString( + module, (const char*)clsname.c_str()); + if (!wrapper_type) { + derr << "Failed to get python type:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(wrapper_type); + + // Construct a capsule containing an OSDMap. + auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr); + ceph_assert(wrapped_capsule); + + // Construct the python OSDMap + auto pArgs = PyTuple_Pack(1, wrapped_capsule); + auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs); + if (wrapper_instance == nullptr) { + derr << "Failed to construct python OSDMap:" << dendl; + derr << handle_pyerror() << dendl; + } + ceph_assert(wrapper_instance != nullptr); + Py_DECREF(pArgs); + Py_DECREF(wrapped_capsule); + + Py_DECREF(wrapper_type); + Py_DECREF(module); + + return wrapper_instance; +} + +PyObject *ActivePyModules::get_osdmap() +{ + OSDMap *newmap = new OSDMap; + + PyThreadState *tstate = PyEval_SaveThread(); + { + cluster_state.with_osdmap([&](const OSDMap& o) { + newmap->deepish_copy_from(o); + }); + } + PyEval_RestoreThread(tstate); + + return construct_with_capsule("mgr_module", "OSDMap", (void*)newmap); +} + +void ActivePyModules::set_health_checks(const std::string& module_name, + health_check_map_t&& checks) +{ + bool changed = false; + + lock.Lock(); + auto p = modules.find(module_name); + if (p != modules.end()) { + changed = p->second->set_health_checks(std::move(checks)); + } + lock.Unlock(); + + // immediately schedule a report to be sent to the monitors with the new + // health checks that have changed. 
This is done asynchronously to avoid + // blocking python land. ActivePyModules::lock needs to be dropped to make + // lockdep happy: + // + // send_report callers: DaemonServer::lock -> PyModuleRegistry::lock + // active_start: PyModuleRegistry::lock -> ActivePyModules::lock + // + // if we don't release this->lock before calling schedule_tick a cycle is + // formed with the addition of ActivePyModules::lock -> DaemonServer::lock. + // This is still correct as send_report is run asynchronously under + // DaemonServer::lock. + if (changed) + server.schedule_tick(0); +} + +int ActivePyModules::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + lock.lock(); + auto mod_iter = modules.find(module_command.module_name); + if (mod_iter == modules.end()) { + *ss << "Module '" << module_command.module_name << "' is not available"; + lock.unlock(); + return -ENOENT; + } + + lock.unlock(); + return mod_iter->second->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); +} + +void ActivePyModules::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + dout(15) << "getting health checks for" << name << dendl; + module->get_health_checks(checks); + } +} + +void ActivePyModules::update_progress_event( + const std::string& evid, + const std::string& desc, + float progress) +{ + std::lock_guard l(lock); + auto& pe = progress_events[evid]; + pe.message = desc; + pe.progress = progress; +} + +void ActivePyModules::complete_progress_event(const std::string& evid) +{ + std::lock_guard l(lock); + progress_events.erase(evid); +} + +void ActivePyModules::clear_all_progress_events() +{ + std::lock_guard l(lock); + progress_events.clear(); +} + +void ActivePyModules::get_progress_events(std::map *events) +{ + std::lock_guard l(lock); + *events = progress_events; +} + +void 
ActivePyModules::config_notify() +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + dout(15) << "notify (config) " << name << dendl; + finisher.queue(new FunctionContext([module](int r){ + module->config_notify(); + })); + } +} + +void ActivePyModules::set_uri(const std::string& module_name, + const std::string &uri) +{ + std::lock_guard l(lock); + + dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl; + + modules.at(module_name)->set_uri(uri); +} + +OSDPerfMetricQueryID ActivePyModules::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional &limit) +{ + return server.add_osd_perf_query(query, limit); +} + +void ActivePyModules::remove_osd_perf_query(OSDPerfMetricQueryID query_id) +{ + int r = server.remove_osd_perf_query(query_id); + if (r < 0) { + dout(0) << "remove_osd_perf_query for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + } +} + +PyObject *ActivePyModules::get_osd_perf_counters(OSDPerfMetricQueryID query_id) +{ + std::map counters; + + int r = server.get_osd_perf_counters(query_id, &counters); + if (r < 0) { + dout(0) << "get_osd_perf_counters for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + Py_RETURN_NONE; + } + + PyFormatter f; + + f.open_array_section("counters"); + for (auto &it : counters) { + auto &key = it.first; + auto &instance_counters = it.second; + f.open_object_section("i"); + f.open_array_section("k"); + for (auto &sub_key : key) { + f.open_array_section("s"); + for (size_t i = 0; i < sub_key.size(); i++) { + f.dump_string(stringify(i).c_str(), sub_key[i]); + } + f.close_section(); // s + } + f.close_section(); // k + f.open_array_section("c"); + for (auto &c : instance_counters) { + f.open_array_section("p"); + f.dump_unsigned("0", c.first); + f.dump_unsigned("1", c.second); + f.close_section(); // p + } + 
f.close_section(); // c + f.close_section(); // i + } + f.close_section(); // counters + + return f.get(); +} + +void ActivePyModules::cluster_log(const std::string &channel, clog_type prio, + const std::string &message) +{ + std::lock_guard l(lock); + + if (channel == "audit") { + audit_clog->do_log(prio, message); + } else { + clog->do_log(prio, message); + } +} diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h new file mode 100644 index 00000000..8a14a467 --- /dev/null +++ b/src/mgr/ActivePyModules.h @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include "ActivePyModule.h" + +#include "common/Finisher.h" +#include "common/Mutex.h" + +#include "PyFormatter.h" + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/LogClient.h" +#include "mon/MgrMap.h" +#include "mon/MonCommand.h" +#include "mon/mon_types.h" + +#include "DaemonState.h" +#include "ClusterState.h" +#include "OSDPerfMetricTypes.h" + +class health_check_map_t; +class DaemonServer; +class MgrSession; +class ModuleCommand; +class PyModuleRegistry; + +class ActivePyModules +{ + // module class instances not yet created + std::set> pending_modules; + // module class instances already created + std::map> modules; + PyModuleConfig &module_config; + std::map store_cache; + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + MonClient &monc; + LogChannelRef clog, audit_clog; + Objecter &objecter; + Client &client; + Finisher &finisher; +public: + Finisher cmd_finisher; +private: + DaemonServer &server; + PyModuleRegistry &py_module_registry; + + map progress_events; + 
+ mutable Mutex lock{"ActivePyModules::lock"}; + +public: + ActivePyModules(PyModuleConfig &module_config, + std::map store_data, + DaemonStateIndex &ds, ClusterState &cs, MonClient &mc, + LogChannelRef clog_, LogChannelRef audit_clog_, Objecter &objecter_, Client &client_, + Finisher &f, DaemonServer &server, PyModuleRegistry &pmr); + + ~ActivePyModules(); + + // FIXME: wrap for send_command? + MonClient &get_monc() {return monc;} + Objecter &get_objecter() {return objecter;} + Client &get_client() {return client;} + PyObject *get_python(const std::string &what); + PyObject *get_server_python(const std::string &hostname); + PyObject *list_servers_python(); + PyObject *get_metadata_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_daemon_status_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_latest_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id); + PyObject *get_context(); + PyObject *get_osdmap(); + PyObject *with_perf_counters( + std::function fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const; + + OSDPerfMetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional &limit); + void remove_osd_perf_query(OSDPerfMetricQueryID query_id); + PyObject *get_osd_perf_counters(OSDPerfMetricQueryID query_id); + + bool get_store(const std::string &module_name, + const std::string &key, std::string *val) const; + PyObject *get_store_prefix(const std::string &module_name, + const std::string &prefix) const; + void set_store(const std::string &module_name, + const std::string &key, const boost::optional &val); + + bool get_config(const std::string &module_name, + const 
std::string &key, std::string *val) const; + void set_config(const std::string &module_name, + const std::string &key, const boost::optional &val); + + PyObject *get_typed_config(const std::string &module_name, + const std::string &key, + const std::string &prefix = "") const; + + void set_health_checks(const std::string& module_name, + health_check_map_t&& checks); + void get_health_checks(health_check_map_t *checks); + + void update_progress_event(const std::string& evid, + const std::string& desc, + float progress); + void complete_progress_event(const std::string& evid); + void clear_all_progress_events(); + void get_progress_events(std::map* events); + + void config_notify(); + + void set_uri(const std::string& module_name, const std::string &uri); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + std::map get_services() const; + + // Public so that MonCommandCompletion can use it + // FIXME: for send_command completion notifications, + // send it to only the module that sent the command, not everyone + void notify_all(const std::string &notify_type, + const std::string &notify_id); + void notify_all(const LogEntry &log_entry); + + bool is_pending(std::string_view name) const { + return pending_modules.count(name) > 0; + } + bool module_exists(const std::string &name) const + { + return modules.count(name) > 0; + } + + bool method_exists( + const std::string &module_name, + const std::string &method_name) const + { + return modules.at(module_name)->method_exists(method_name); + } + + PyObject *dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int init(); + void shutdown(); + + void start_one(PyModuleRef py_module); + + void dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f); + + void 
cluster_log(const std::string &channel, clog_type prio, + const std::string &message); +}; diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc new file mode 100644 index 00000000..0f8f0b6d --- /dev/null +++ b/src/mgr/BaseMgrModule.cc @@ -0,0 +1,1228 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +/** + * The interface we present to python code that runs within + * ceph-mgr. This is implemented as a Python class from which + * all modules must inherit -- access to the Ceph state is then + * available as methods on that object. + */ + +#include "Python.h" + +#include "Mgr.h" + +#include "mon/MonClient.h" +#include "common/errno.h" +#include "common/version.h" + +#include "BaseMgrModule.h" +#include "Gil.h" + +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#define PLACEHOLDER "" + + +typedef struct { + PyObject_HEAD + ActivePyModules *py_modules; + ActivePyModule *this_module; +} BaseMgrModule; + +class MonCommandCompletion : public Context +{ + ActivePyModules *py_modules; + PyObject *python_completion; + const std::string tag; + SafeThreadState pThreadState; + +public: + std::string outs; + bufferlist outbl; + + MonCommandCompletion( + ActivePyModules *py_modules_, PyObject* ev, + const std::string &tag_, PyThreadState *ts_) + : py_modules(py_modules_), python_completion(ev), + tag(tag_), pThreadState(ts_) + { + ceph_assert(python_completion != nullptr); + Py_INCREF(python_completion); + } + + ~MonCommandCompletion() override + { + if (python_completion) { + // Usually do this in finish(): this path is only for if we're + // being destroyed without 
completing. + Gil gil(pThreadState, true); + Py_DECREF(python_completion); + python_completion = nullptr; + } + } + + void finish(int r) override + { + ceph_assert(python_completion != nullptr); + + dout(10) << "MonCommandCompletion::finish()" << dendl; + { + // Scoped so the Gil is released before calling notify_all() + // Create new thread state because this is called via the MonClient + // Finisher, not the PyModules finisher. + Gil gil(pThreadState, true); + + auto set_fn = PyObject_GetAttrString(python_completion, "complete"); + ceph_assert(set_fn != nullptr); + + auto pyR = PyInt_FromLong(r); + auto pyOutBl = PyString_FromString(outbl.to_str().c_str()); + auto pyOutS = PyString_FromString(outs.c_str()); + auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS); + Py_DECREF(pyR); + Py_DECREF(pyOutBl); + Py_DECREF(pyOutS); + + auto rtn = PyObject_CallObject(set_fn, args); + if (rtn != nullptr) { + Py_DECREF(rtn); + } + Py_DECREF(args); + Py_DECREF(set_fn); + + Py_DECREF(python_completion); + python_completion = nullptr; + } + py_modules->notify_all("command", tag); + } +}; + + +static PyObject* +ceph_send_command(BaseMgrModule *self, PyObject *args) +{ + // Like mon, osd, mds + char *type = nullptr; + + // Like "23" for an OSD or "myid" for an MDS + char *name = nullptr; + + char *cmd_json = nullptr; + char *tag = nullptr; + char *inbuf_ptr = nullptr; + Py_ssize_t inbuf_len = 0; + bufferlist inbuf = {}; + + PyObject *completion = nullptr; + if (!PyArg_ParseTuple(args, "Ossssz#:ceph_send_command", + &completion, &type, &name, &cmd_json, &tag, &inbuf_ptr, &inbuf_len)) { + return nullptr; + } + + if (inbuf_ptr) { + inbuf.append(inbuf_ptr, (unsigned)inbuf_len); + } + + auto set_fn = PyObject_GetAttrString(completion, "complete"); + if (set_fn == nullptr) { + ceph_abort(); // TODO raise python exception instead + } else { + ceph_assert(PyCallable_Check(set_fn)); + } + Py_DECREF(set_fn); + + MonCommandCompletion *command_c = new MonCommandCompletion(self->py_modules, + 
completion, tag, PyThreadState_Get()); + + PyThreadState *tstate = PyEval_SaveThread(); + + if (std::string(type) == "mon") { + + // Wait for the latest OSDMap after each command we send to + // the mons. This is a heavy-handed hack to make life simpler + // for python module authors, so that they know whenever they + // run a command they've got a fresh OSDMap afterwards. + // TODO: enhance MCommand interface so that it returns + // latest cluster map versions on completion, and callers + // can wait for those. + auto c = new FunctionContext([command_c, self](int command_r){ + self->py_modules->get_objecter().wait_for_latest_osdmap( + new FunctionContext([command_c, command_r](int wait_r){ + command_c->complete(command_r); + }) + ); + }); + + self->py_modules->get_monc().start_mon_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(c, &self->py_modules->cmd_finisher)); + } else if (std::string(type) == "osd") { + std::string err; + uint64_t osd_id = strict_strtoll(name, 10, &err); + if (!err.empty()) { + delete command_c; + string msg("invalid osd_id: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().osd_command( + osd_id, + {cmd_json}, + inbuf, + &tid, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(command_c, &self->py_modules->cmd_finisher)); + } else if (std::string(type) == "mds") { + int r = self->py_modules->get_client().mds_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(command_c, &self->py_modules->cmd_finisher)); + if (r != 0) { + string msg("failed to send command to mds: "); + msg.append(cpp_strerror(r)); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_RuntimeError, msg.c_str()); + return nullptr; + } + } else if (std::string(type) == "pg") { + pg_t pgid; + if 
(!pgid.parse(name)) { + delete command_c; + string msg("invalid pgid: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().pg_command( + pgid, + {cmd_json}, + inbuf, + &tid, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(command_c, &self->py_modules->cmd_finisher)); + PyEval_RestoreThread(tstate); + return nullptr; + } else { + delete command_c; + string msg("unknown service type: "); + msg.append(type); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; +} + +static PyObject* +ceph_set_health_checks(BaseMgrModule *self, PyObject *args) +{ + PyObject *checks = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) { + return NULL; + } + if (!PyDict_Check(checks)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + PyObject *checksls = PyDict_Items(checks); + health_check_map_t out_checks; + for (int i = 0; i < PyList_Size(checksls); ++i) { + PyObject *kv = PyList_GET_ITEM(checksls, i); + char *check_name = nullptr; + PyObject *check_info = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) { + derr << __func__ << " dict item " << i + << " not a size 2 tuple" << dendl; + continue; + } + if (!PyDict_Check(check_info)) { + derr << __func__ << " item " << i << " " << check_name + << " value not a dict" << dendl; + continue; + } + health_status_t severity = HEALTH_OK; + string summary; + list detail; + PyObject *infols = PyDict_Items(check_info); + for (int j = 0; j < PyList_Size(infols); ++j) { + PyObject *pair = PyList_GET_ITEM(infols, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << i << " pair " << j + << " not a tuple" << dendl; + continue; + } + char *k = nullptr; + PyObject *v = nullptr; + if 
(!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) { + derr << __func__ << " item " << i << " pair " << j + << " not a size 2 tuple" << dendl; + continue; + } + string ks(k); + if (ks == "severity") { + if (!PyString_Check(v)) { + derr << __func__ << " check " << check_name + << " severity value not string" << dendl; + continue; + } + string vs(PyString_AsString(v)); + if (vs == "warning") { + severity = HEALTH_WARN; + } else if (vs == "error") { + severity = HEALTH_ERR; + } + } else if (ks == "summary") { + if (!PyString_Check(v) && !PyUnicode_Check(v)) { + derr << __func__ << " check " << check_name + << " summary value not [unicode] string" << dendl; + continue; + } + summary = PyString_AsString(v); + } else if (ks == "detail") { + if (!PyList_Check(v)) { + derr << __func__ << " check " << check_name + << " detail value not list" << dendl; + continue; + } + for (int k = 0; k < PyList_Size(v); ++k) { + PyObject *di = PyList_GET_ITEM(v, k); + if (!PyString_Check(di) && !PyUnicode_Check(di)) { + derr << __func__ << " check " << check_name + << " detail item " << k << " not a [unicode] string" << dendl; + continue; + } + detail.push_back(PyString_AsString(di)); + } + } else { + derr << __func__ << " check " << check_name + << " unexpected key " << k << dendl; + } + } + auto& d = out_checks.add(check_name, severity, summary); + d.detail.swap(detail); + } + + JSONFormatter jf(true); + dout(10) << "module " << self->this_module->get_name() + << " health checks:\n"; + out_checks.dump(&jf); + jf.flush(*_dout); + *_dout << dendl; + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_health_checks(self->this_module->get_name(), + std::move(out_checks)); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + + +static PyObject* +ceph_state_get(BaseMgrModule *self, PyObject *args) +{ + char *what = NULL; + if (!PyArg_ParseTuple(args, "s:ceph_state_get", &what)) { + return NULL; + } + + return self->py_modules->get_python(what); +} + + +static PyObject* 
+ceph_get_server(BaseMgrModule *self, PyObject *args) +{ + char *hostname = NULL; + if (!PyArg_ParseTuple(args, "z:ceph_get_server", &hostname)) { + return NULL; + } + + if (hostname) { + return self->py_modules->get_server_python(hostname); + } else { + return self->py_modules->list_servers_python(); + } +} + +static PyObject* +ceph_get_mgr_id(BaseMgrModule *self, PyObject *args) +{ + return PyString_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_option_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + std::string value; + int r = g_conf().get_val(string(what), &value); + if (r >= 0) { + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return PyString_FromString(value.c_str()); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_get_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "ss|s:ceph_get_module_option", &module, &key, + &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + std::string str_prefix; + if (prefix) { + str_prefix = prefix; + } + assert(self->this_module->py_module); + auto pResult = self->py_modules->get_typed_config(module, key, str_prefix); + return pResult; +} + +static PyObject* +ceph_store_get_prefix(BaseMgrModule *self, PyObject *args) +{ + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get_prefix", &prefix)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_store_prefix(self->this_module->get_name(), + prefix); +} + +static PyObject* +ceph_set_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "ssz:ceph_set_module_option", + &module, &key, &value)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + boost::optional val; + if (value) { + val = value; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_config(module, key, val); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_store_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + std::string value; + bool found = self->py_modules->get_store(self->this_module->get_name(), + what, &value); + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyString_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_set(BaseMgrModule *self, PyObject *args) +{ + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "sz:ceph_store_set", &key, &value)) { + return nullptr; + } + boost::optional val; + if (value) { + val = value; + } + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_store(self->this_module->get_name(), key, val); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +get_metadata(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_metadata", &svc_name, &svc_id)) { + return nullptr; + } + return self->py_modules->get_metadata_python(svc_name, svc_id); +} + +static PyObject* +get_daemon_status(BaseMgrModule 
*self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_daemon_status", &svc_name, + &svc_id)) { + return nullptr; + } + return self->py_modules->get_daemon_status_python(svc_name, svc_id); +} + +static PyObject* +ceph_log(BaseMgrModule *self, PyObject *args) +{ + int level = 0; + char *record = nullptr; + if (!PyArg_ParseTuple(args, "is:log", &level, &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(level, record); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_cluster_log(BaseMgrModule *self, PyObject *args) +{ + int prio = 0; + char *channel = nullptr; + char *message = nullptr; + std::vector channels = { "audit", "cluster" }; + + if (!PyArg_ParseTuple(args, "sis:ceph_cluster_log", &channel, &prio, &message)) { + return nullptr; + } + + if (std::find(channels.begin(), channels.end(), std::string(channel)) == channels.end()) { + std::string msg("Unknown channel: "); + msg.append(channel); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->cluster_log(channel, (clog_type)prio, message); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject * +ceph_get_version(BaseMgrModule *self, PyObject *args) +{ + return PyString_FromString(pretty_version_to_str().c_str()); +} + +static PyObject * +ceph_get_release_name(BaseMgrModule *self, PyObject *args) +{ + return PyString_FromString(ceph_release_to_str()); +} + +static PyObject * +ceph_get_context(BaseMgrModule *self) +{ + return self->py_modules->get_context(); +} + +static PyObject* +get_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return nullptr; + } + return self->py_modules->get_counter_python( + svc_name, svc_id, counter_path); 
+} + +static PyObject* +get_latest_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return nullptr; + } + return self->py_modules->get_latest_counter_python( + svc_name, svc_id, counter_path); +} + +static PyObject* +get_perf_schema(BaseMgrModule *self, PyObject *args) +{ + char *type_str = nullptr; + char *svc_id = nullptr; + if (!PyArg_ParseTuple(args, "ss:get_perf_schema", &type_str, + &svc_id)) { + return nullptr; + } + + return self->py_modules->get_perf_schema_python(type_str, svc_id); +} + +static PyObject * +ceph_get_osdmap(BaseMgrModule *self, PyObject *args) +{ + return self->py_modules->get_osdmap(); +} + +static PyObject* +ceph_set_uri(BaseMgrModule *self, PyObject *args) +{ + char *svc_str = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_advertize_service", + &svc_str)) { + return nullptr; + } + + // We call down into PyModules even though we have a MgrPyModule + // reference here, because MgrPyModule's fields are protected + // by PyModules' lock. 
+ PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->set_uri(self->this_module->get_name(), svc_str); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_have_mon_connection(BaseMgrModule *self, PyObject *args) +{ + if (self->py_modules->get_monc().is_connected()) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyObject* +ceph_update_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + char *desc = nullptr; + float progress = 0.0; + if (!PyArg_ParseTuple(args, "ssf:ceph_update_progress_event", + &evid, &desc, &progress)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->update_progress_event(evid, desc, progress); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_complete_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_complete_progress_event", + &evid)) { + return nullptr; + } + + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->complete_progress_event(evid); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_clear_all_progress_events(BaseMgrModule *self, PyObject *args) +{ + PyThreadState *tstate = PyEval_SaveThread(); + self->py_modules->clear_all_progress_events(); + PyEval_RestoreThread(tstate); + + Py_RETURN_NONE; +} + + + +static PyObject * +ceph_dispatch_remote(BaseMgrModule *self, PyObject *args) +{ + char *other_module = nullptr; + char *method = nullptr; + PyObject *remote_args = nullptr; + PyObject *remote_kwargs = nullptr; + if (!PyArg_ParseTuple(args, "ssOO:ceph_dispatch_remote", + &other_module, &method, &remote_args, &remote_kwargs)) { + return nullptr; + } + + // Early error handling, because if the module doesn't exist then we + // won't be able to use its thread state to set python error state + // inside dispatch_remote(). 
+ if (!self->py_modules->module_exists(other_module)) { + derr << "no module '" << other_module << "'" << dendl; + PyErr_SetString(PyExc_ImportError, "Module not found"); + return nullptr; + } + + // Drop GIL from calling python thread state, it will be taken + // both for checking for method existence and for executing method. + PyThreadState *tstate = PyEval_SaveThread(); + + if (!self->py_modules->method_exists(other_module, method)) { + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_NameError, "Method not found"); + return nullptr; + } + + std::string err; + auto result = self->py_modules->dispatch_remote(other_module, method, + remote_args, remote_kwargs, &err); + + PyEval_RestoreThread(tstate); + + if (result == nullptr) { + std::stringstream ss; + ss << "Remote method threw exception: " << err; + PyErr_SetString(PyExc_RuntimeError, ss.str().c_str()); + derr << ss.str() << dendl; + } + + return result; +} + +static PyObject* +ceph_add_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor"; + static const std::string NAME_COUNTERS_DESCRIPTORS = + "performance_counter_descriptors"; + static const std::string NAME_LIMIT = "limit"; + static const std::string NAME_SUB_KEY_TYPE = "type"; + static const std::string NAME_SUB_KEY_REGEX = "regex"; + static const std::string NAME_LIMIT_ORDER_BY = "order_by"; + static const std::string NAME_LIMIT_MAX_COUNT = "max_count"; + static const std::map sub_key_types = { + {"client_id", OSDPerfMetricSubKeyType::CLIENT_ID}, + {"client_address", OSDPerfMetricSubKeyType::CLIENT_ADDRESS}, + {"pool_id", OSDPerfMetricSubKeyType::POOL_ID}, + {"namespace", OSDPerfMetricSubKeyType::NAMESPACE}, + {"osd_id", OSDPerfMetricSubKeyType::OSD_ID}, + {"pg_id", OSDPerfMetricSubKeyType::PG_ID}, + {"object_name", OSDPerfMetricSubKeyType::OBJECT_NAME}, + {"snap_id", OSDPerfMetricSubKeyType::SNAP_ID}, + }; + static const std::map counter_types = { + {"ops", 
PerformanceCounterType::OPS}, + {"write_ops", PerformanceCounterType::WRITE_OPS}, + {"read_ops", PerformanceCounterType::READ_OPS}, + {"bytes", PerformanceCounterType::BYTES}, + {"write_bytes", PerformanceCounterType::WRITE_BYTES}, + {"read_bytes", PerformanceCounterType::READ_BYTES}, + {"latency", PerformanceCounterType::LATENCY}, + {"write_latency", PerformanceCounterType::WRITE_LATENCY}, + {"read_latency", PerformanceCounterType::READ_LATENCY}, + }; + + PyObject *py_query = nullptr; + if (!PyArg_ParseTuple(args, "O:ceph_add_osd_perf_query", &py_query)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + if (!PyDict_Check(py_query)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + + PyObject *query_params = PyDict_Items(py_query); + OSDPerfMetricQuery query; + std::optional limit; + + // { + // 'key_descriptor': [ + // {'type': subkey_type, 'regex': regex_pattern}, + // ... + // ], + // 'performance_counter_descriptors': [ + // list, of, descriptor, types + // ], + // 'limit': {'order_by': performance_counter_type, 'max_count': n}, + // } + + for (int i = 0; i < PyList_Size(query_params); ++i) { + PyObject *kv = PyList_GET_ITEM(query_params, i); + char *query_param_name = nullptr; + PyObject *query_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (query_param_name == NAME_KEY_DESCRIPTOR) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *sub_key = PyList_GET_ITEM(query_param_val, j); + if (!PyDict_Check(sub_key)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a dict" << dendl; + Py_RETURN_NONE; + } + OSDPerfMetricSubKeyDescriptor d; + PyObject *sub_key_params = PyDict_Items(sub_key); + 
for (int k = 0; k < PyList_Size(sub_key_params); ++k) { + PyObject *pair = PyList_GET_ITEM(sub_key_params, k); + if (!PyTuple_Check(pair)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a tuple" << dendl; + Py_RETURN_NONE; + } + char *param_name = nullptr; + PyObject *param_value = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", ¶m_name, ¶m_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (param_name == NAME_SUB_KEY_TYPE) { + if (!PyString_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + auto type = PyString_AsString(param_value); + auto it = sub_key_types.find(type); + if (it == sub_key_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid type " << dendl; + Py_RETURN_NONE; + } + d.type = it->second; + } else if (param_name == NAME_SUB_KEY_REGEX) { + if (!PyString_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + d.regex_str = PyString_AsString(param_value); + try { + d.regex = {d.regex_str.c_str()}; + } catch (const std::regex_error& e) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid regex " << d.regex_str << dendl; + Py_RETURN_NONE; + } + if (d.regex.mark_count() == 0) { + derr << __func__ << " query " << query_param_name << " item " << j + << " regex " << d.regex_str << ": no capturing groups" + << dendl; + Py_RETURN_NONE; + } + } else { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + } + if (d.type == static_cast(-1) || + d.regex_str.empty()) { + derr << 
__func__ << " query " << query_param_name << " item " << i + << " invalid" << dendl; + Py_RETURN_NONE; + } + query.key_descriptor.push_back(d); + } + } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *py_type = PyList_GET_ITEM(query_param_val, j); + if (!PyString_Check(py_type)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a string" << dendl; + Py_RETURN_NONE; + } + auto type = PyString_AsString(py_type); + auto it = counter_types.find(type); + if (it == counter_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << type + << " is not valid type" << dendl; + Py_RETURN_NONE; + } + query.performance_counter_descriptors.push_back(it->second); + } + } else if (query_param_name == NAME_LIMIT) { + if (!PyDict_Check(query_param_val)) { + derr << __func__ << " query " << query_param_name << " not a dict" + << dendl; + Py_RETURN_NONE; + } + + limit = OSDPerfMetricLimit(); + PyObject *limit_params = PyDict_Items(query_param_val); + + for (int j = 0; j < PyList_Size(limit_params); ++j) { + PyObject *kv = PyList_GET_ITEM(limit_params, j); + char *limit_param_name = nullptr; + PyObject *limit_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &limit_param_name, + &limit_param_val)) { + derr << __func__ << " limit item " << j << " not a size 2 tuple" + << dendl; + Py_RETURN_NONE; + } + + if (limit_param_name == NAME_LIMIT_ORDER_BY) { + if (!PyString_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not a string" + << dendl; + Py_RETURN_NONE; + } + auto order_by = PyString_AsString(limit_param_val); + auto it = counter_types.find(order_by); + if (it == counter_types.end()) { + derr << __func__ << " limit " << limit_param_name + << " not a valid counter type" << dendl; + 
Py_RETURN_NONE; + } + limit->order_by = it->second; + } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) { +#if PY_MAJOR_VERSION <= 2 + if (!PyInt_Check(limit_param_val) && !PyLong_Check(limit_param_val)) { +#else + if (!PyLong_Check(limit_param_val)) { +#endif + derr << __func__ << " " << limit_param_name << " not an int" + << dendl; + Py_RETURN_NONE; + } + limit->max_count = PyLong_AsLong(limit_param_val); + } else { + derr << __func__ << " unknown limit param: " << limit_param_name + << dendl; + Py_RETURN_NONE; + } + } + } else { + derr << __func__ << " unknown query param: " << query_param_name << dendl; + Py_RETURN_NONE; + } + } + + if (query.key_descriptor.empty() || + query.performance_counter_descriptors.empty()) { + derr << __func__ << " invalid query" << dendl; + Py_RETURN_NONE; + } + + if (limit) { + auto &ds = query.performance_counter_descriptors; + if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) { + derr << __func__ << " limit order_by " << limit->order_by + << " not in performance_counter_descriptors" << dendl; + Py_RETURN_NONE; + } + } + + auto query_id = self->py_modules->add_osd_perf_query(query, limit); + return PyLong_FromLong(query_id); +} + +static PyObject* +ceph_remove_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + OSDPerfMetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_remove_osd_perf_query", &query_id)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + self->py_modules->remove_osd_perf_query(query_id); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_osd_perf_counters(BaseMgrModule *self, PyObject *args) +{ + OSDPerfMetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_get_osd_perf_counters", &query_id)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_osd_perf_counters(query_id); +} + +static PyObject* +ceph_is_authorized(BaseMgrModule *self, PyObject *args) +{ + PyObject *args_dict = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_is_authorized", &args_dict)) { + return nullptr; + } + + if (!PyDict_Check(args_dict)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_FALSE; + } + + std::map arguments; + + PyObject *args_list = PyDict_Items(args_dict); + for (int i = 0; i < PyList_Size(args_list); ++i) { + PyObject *kv = PyList_GET_ITEM(args_list, i); + + char *arg_key = nullptr; + char *arg_value = nullptr; + if (!PyArg_ParseTuple(kv, "ss:pair", &arg_key, &arg_value)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + continue; + } + + arguments[arg_key] = arg_value; + } + + if (self->this_module->is_authorized(arguments)) { + Py_RETURN_TRUE; + } + + Py_RETURN_FALSE; +} + +PyMethodDef BaseMgrModule_methods[] = { + {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS, + "Get a cluster object"}, + + {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS, + "Get a server object"}, + + {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS, + "Get a service's metadata"}, + + {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS, + "Get a service's status"}, + + {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS, + "Send a mon command"}, + + {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS, + "Set health checks for this module"}, + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_store_prefix", 
(PyCFunction)ceph_store_get_prefix, METH_VARARGS, + "Get all KV store values with a given prefix"}, + + {"_ceph_set_module_option", (PyCFunction)ceph_set_module_option, METH_VARARGS, + "Set a module configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a stored field"}, + + {"_ceph_set_store", (PyCFunction)ceph_store_set, METH_VARARGS, + "Set a stored field"}, + + {"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS, + "Get a performance counter"}, + + {"_ceph_get_latest_counter", (PyCFunction)get_latest_counter, METH_VARARGS, + "Get the latest performance counter"}, + + {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS, + "Get the performance counter schema"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a (local) log message"}, + + {"_ceph_cluster_log", (PyCFunction)ceph_cluster_log, METH_VARARGS, + "Emit a cluster log message"}, + + {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_NOARGS, + "Get the ceph version of this process"}, + + {"_ceph_get_release_name", (PyCFunction)ceph_get_release_name, METH_NOARGS, + "Get the ceph release name of this process"}, + + {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS, + "Get a CephContext* in a python capsule"}, + + {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS, + "Get an OSDMap* in a python capsule"}, + + {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS, + "Advertize a service URI served by this module"}, + + {"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection, + METH_NOARGS, "Find out whether this mgr daemon currently has " + "a connection to a monitor"}, + + {"_ceph_update_progress_event", (PyCFunction)ceph_update_progress_event, + METH_VARARGS, "Update status of a progress event"}, + {"_ceph_complete_progress_event", (PyCFunction)ceph_complete_progress_event, + METH_VARARGS, "Complete a progress event"}, + {"_ceph_clear_all_progress_events", 
(PyCFunction)ceph_clear_all_progress_events, + METH_NOARGS, "Clear all progress events"}, + + {"_ceph_dispatch_remote", (PyCFunction)ceph_dispatch_remote, + METH_VARARGS, "Dispatch a call to another module"}, + + {"_ceph_add_osd_perf_query", (PyCFunction)ceph_add_osd_perf_query, + METH_VARARGS, "Add an osd perf query"}, + + {"_ceph_remove_osd_perf_query", (PyCFunction)ceph_remove_osd_perf_query, + METH_VARARGS, "Remove an osd perf query"}, + + {"_ceph_get_osd_perf_counters", (PyCFunction)ceph_get_osd_perf_counters, + METH_VARARGS, "Get osd perf counters"}, + + {"_ceph_is_authorized", (PyCFunction)ceph_is_authorized, + METH_VARARGS, "Verify the current session caps are valid"}, + + {NULL, NULL, 0, NULL} +}; + + +static PyObject * +BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrModule *self; + + self = (BaseMgrModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int +BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *py_modules_capsule = nullptr; + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"py_modules", "this_module", NULL}; + + if (! 
PyArg_ParseTupleAndKeywords(args, kwds, "OO", + const_cast(kwlist), + &py_modules_capsule, + &this_module_capsule)) { + return -1; + } + + self->py_modules = static_cast(PyCapsule_GetPointer( + py_modules_capsule, nullptr)); + ceph_assert(self->py_modules); + self->this_module = static_cast(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +PyTypeObject BaseMgrModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrModule", /* tp_name */ + sizeof(BaseMgrModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BaseMgrModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrModule_new, /* tp_new */ +}; + diff --git a/src/mgr/BaseMgrModule.h b/src/mgr/BaseMgrModule.h new file mode 100644 index 00000000..2c2e5deb --- /dev/null +++ b/src/mgr/BaseMgrModule.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "Python.h" + +extern PyTypeObject BaseMgrModuleType; + diff --git a/src/mgr/BaseMgrStandbyModule.cc b/src/mgr/BaseMgrStandbyModule.cc new file mode 100644 index 00000000..f3fbff66 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.cc @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab 
+/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "BaseMgrStandbyModule.h" + +#include "StandbyPyModules.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +typedef struct { + PyObject_HEAD + StandbyPyModule *this_module; +} BaseMgrStandbyModule; + +static PyObject * +BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrStandbyModule *self; + + self = (BaseMgrStandbyModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int +BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"this_module", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast(kwlist), + &this_module_capsule)) { + return -1; + } + + self->this_module = static_cast(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +static PyObject* +ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyString_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_get_module_option(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s|s:ceph_get_module_option", &what, &prefix)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + PyThreadState *tstate = PyEval_SaveThread(); + std::string final_key; + std::string value; + bool found = false; + if (prefix) { + final_key = std::string(prefix) + "/" + what; + found = self->this_module->get_config(final_key, &value); + } + if (!found) { + final_key = what; + found = self->this_module->get_config(final_key, &value); + } + PyEval_RestoreThread(tstate); + if (found) { + dout(10) << __func__ << " " << final_key << " found: " << value + << dendl; + return self->this_module->py_module->get_typed_option_value(what, value); + } else { + if (prefix) { + dout(4) << __func__ << " [" << prefix << "/]" << what << " not found " + << dendl; + } else { + dout(4) << __func__ << " " << what << " not found " << dendl; + } + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_option_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + std::string value; + int r = g_conf().get_val(string(what), &value); + if (r >= 0) { + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return PyString_FromString(value.c_str()); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + // Drop GIL for blocking mon command execution + PyThreadState *tstate = PyEval_SaveThread(); + + std::string value; + bool found = self->this_module->get_store(what, &value); + + PyEval_RestoreThread(tstate); + + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyString_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyString_FromString(self->this_module->get_active_uri().c_str()); +} + +static PyObject* +ceph_log(BaseMgrStandbyModule *self, PyObject *args) +{ + int level = 0; + char *record = nullptr; + if (!PyArg_ParseTuple(args, "is:log", &level, &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(level, record); + + Py_RETURN_NONE; +} + +PyMethodDef BaseMgrStandbyModule_methods[] = { + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a KV store value"}, + + {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS, + "Get the URI of the active instance of this module, if any"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a log message"}, + + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BaseMgrStandbyModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrStandbyModule", /* tp_name */ + sizeof(BaseMgrStandbyModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, 
/* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Standby Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BaseMgrStandbyModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrStandbyModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrStandbyModule_new, /* tp_new */ +}; diff --git a/src/mgr/BaseMgrStandbyModule.h b/src/mgr/BaseMgrStandbyModule.h new file mode 100644 index 00000000..c4cf2673 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "PythonCompat.h" + +extern PyTypeObject BaseMgrStandbyModuleType; + diff --git a/src/mgr/CMakeLists.txt b/src/mgr/CMakeLists.txt new file mode 100644 index 00000000..180e39bc --- /dev/null +++ b/src/mgr/CMakeLists.txt @@ -0,0 +1,37 @@ +add_library(mgr_cap_obj OBJECT + MgrCap.cc) + +set(mgr_srcs + ${CMAKE_SOURCE_DIR}/src/ceph_mgr.cc + ${CMAKE_SOURCE_DIR}/src/mon/PGMap.cc + ActivePyModule.cc + ActivePyModules.cc + BaseMgrModule.cc + BaseMgrStandbyModule.cc + ClusterState.cc + DaemonHealthMetricCollector.cc + DaemonServer.cc + DaemonState.cc + Gil.cc + Mgr.cc + MgrStandby.cc + OSDPerfMetricTypes.cc + OSDPerfMetricCollector.cc + PyFormatter.cc + PyModule.cc + PyModuleRegistry.cc + PyModuleRunner.cc + PyOSDMap.cc + StandbyPyModules.cc + mgr_commands.cc + $) +add_executable(ceph-mgr ${mgr_srcs}) +target_include_directories(ceph-mgr SYSTEM PRIVATE "${PYTHON_INCLUDE_DIRS}") +target_link_libraries(ceph-mgr + osdc client heap_profiler + 
global-static ceph-common + Boost::python${MGR_PYTHON_VERSION_MAJOR}${MGR_PYTHON_VERSION_MINOR} + ${MGR_PYTHON_LIBRARIES} ${CMAKE_DL_LIBS} ${GSSAPI_LIBRARIES}) +set_target_properties(ceph-mgr PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) +install(TARGETS ceph-mgr DESTINATION bin) diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc new file mode 100644 index 00000000..48cabca1 --- /dev/null +++ b/src/mgr/ClusterState.cc @@ -0,0 +1,383 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "messages/MMgrDigest.h" +#include "messages/MMonMgrReport.h" +#include "messages/MPGStats.h" + +#include "mgr/ClusterState.h" +#include +#include + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +ClusterState::ClusterState( + MonClient *monc_, + Objecter *objecter_, + const MgrMap& mgrmap) + : monc(monc_), + objecter(objecter_), + lock("ClusterState"), + mgr_map(mgrmap), + asok_hook(NULL) +{} + +void ClusterState::set_objecter(Objecter *objecter_) +{ + std::lock_guard l(lock); + + objecter = objecter_; +} + +void ClusterState::set_fsmap(FSMap const &new_fsmap) +{ + std::lock_guard l(lock); + + fsmap = new_fsmap; +} + +void ClusterState::set_mgr_map(MgrMap const &new_mgrmap) +{ + std::lock_guard l(lock); + mgr_map = new_mgrmap; +} + +void ClusterState::set_service_map(ServiceMap const &new_service_map) +{ + std::lock_guard l(lock); + servicemap = new_service_map; +} + +void ClusterState::load_digest(MMgrDigest *m) +{ + std::lock_guard l(lock); + health_json = std::move(m->health_json); + 
mon_status_json = std::move(m->mon_status_json); +} + +void ClusterState::ingest_pgstats(MPGStats *stats) +{ + std::lock_guard l(lock); + + const int from = stats->get_orig_source().num(); + bool is_in = with_osdmap([from](const OSDMap& osdmap) { + return osdmap.is_in(from); + }); + + if (is_in) { + pending_inc.update_stat(from, std::move(stats->osd_stat)); + } else { + osd_stat_t empty_stat; + empty_stat.seq = stats->osd_stat.seq; + pending_inc.update_stat(from, std::move(empty_stat)); + } + + for (auto p : stats->pg_stat) { + pg_t pgid = p.first; + const auto &pg_stats = p.second; + + // In case we're hearing about a PG that according to last + // OSDMap update should not exist + auto r = existing_pools.find(pgid.pool()); + if (r == existing_pools.end()) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but pool not in " << existing_pools + << dendl; + continue; + } + if (pgid.ps() >= r->second) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but > pg_num " << r->second + << dendl; + continue; + } + // In case we already heard about more recent stats from this PG + // from another OSD + const auto q = pg_map.pg_stat.find(pgid); + if (q != pg_map.pg_stat.end() && + q->second.get_version_pair() > pg_stats.get_version_pair()) { + dout(15) << " had " << pgid << " from " + << q->second.reported_epoch << ":" + << q->second.reported_seq << dendl; + continue; + } + + pending_inc.pg_stat_updates[pgid] = pg_stats; + } + for (auto p : stats->pool_stat) { + pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second; + } +} + +void ClusterState::update_delta_stats() +{ + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << 
pending_inc.version << dendl; + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); +} + +void ClusterState::notify_osdmap(const OSDMap &osd_map) +{ + assert(ceph_mutex_is_locked(lock)); + + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << pending_inc.version << dendl; + + PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc); + + // update our list of pools that exist, so that we can filter pg_map updates + // in synchrony with this OSDMap. + existing_pools.clear(); + for (auto& p : osd_map.get_pools()) { + existing_pools[p.first] = p.second.get_pg_num(); + } + + // brute force this for now (don't bother being clever by only + // checking osds that went up/down) + set need_check_down_pg_osds; + PGMapUpdater::check_down_pgs(osd_map, pg_map, true, + need_check_down_pg_osds, &pending_inc); + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + + pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); + // TODO: Complete the separation of PG state handling so + // that a cut-down set of functionality remains in PGMonitor + // while the full-blown PGMap lives only here. 
+} + +class ClusterSocketHook : public AdminSocketHook { + ClusterState *cluster_state; +public: + explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {} + bool call(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override { + stringstream ss; + bool r = true; + try { + r = cluster_state->asok_command(admin_command, cmdmap, format, ss); + } catch (const bad_cmd_get& e) { + ss << e.what(); + r = true; + } + out.append(ss); + return r; + } +}; + +void ClusterState::final_init() +{ + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + asok_hook = new ClusterSocketHook(this); + int r = admin_socket->register_command("dump_osd_network", + "dump_osd_network name=value,type=CephInt,req=false", asok_hook, + "Dump osd heartbeat network ping times"); + ceph_assert(r == 0); +} + +void ClusterState::shutdown() +{ + // unregister commands + g_ceph_context->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; +} + +bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, ostream& ss) +{ + std::lock_guard l(lock); + Formatter *f = Formatter::create(format, "json-pretty", "json-pretty"); + if (admin_command == "dump_osd_network") { + int64_t value = 0; + // Default to health warning level if nothing specified + if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) { + // Convert milliseconds to microseconds + value = static_cast(g_ceph_context->_conf.get_val("mon_warn_on_slow_ping_time")) * 1000; + if (value == 0) { + double ratio = g_conf().get_val("mon_warn_on_slow_ping_ratio"); + value = g_conf().get_val("osd_heartbeat_grace"); + value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + } else { + // Convert user input to microseconds + value *= 1000; + } + if (value < 0) + value = 0; + + struct mgr_ping_time_t { + uint32_t pingtime; + int from; + int to; + bool back; + std::array 
times; + std::array min; + std::array max; + uint32_t last; + uint32_t last_update; + + bool operator<(const mgr_ping_time_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (from < rhs.from) + return true; + if (from > rhs.from) + return false; + if (to < rhs.to) + return true; + if (to > rhs.to) + return false; + return back; + } + }; + + set sorted; + utime_t now = ceph_clock_now(); + for (auto i : pg_map.osd_stat) { + for (auto j : i.second.hb_pingtime) { + + if (j.second.last_update == 0) + continue; + auto stale_time = g_ceph_context->_conf.get_val("osd_mon_heartbeat_stat_stale"); + if (now.sec() - j.second.last_update > stale_time) { + dout(20) << __func__ << " time out heartbeat for osd " << i.first + << " last_update " << j.second.last_update << dendl; + continue; + } + mgr_ping_time_t item; + item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.back_pingtime[0]; + item.times[1] = j.second.back_pingtime[1]; + item.times[2] = j.second.back_pingtime[2]; + item.min[0] = j.second.back_min[0]; + item.min[1] = j.second.back_min[1]; + item.min[2] = j.second.back_min[2]; + item.max[0] = j.second.back_max[0]; + item.max[1] = j.second.back_max[1]; + item.max[2] = j.second.back_max[2]; + item.last = j.second.back_last; + item.back = true; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + + if (j.second.front_last == 0) + continue; + item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.front_pingtime[0]; + item.times[1] = j.second.front_pingtime[1]; + item.times[2] = 
j.second.front_pingtime[2]; + item.min[0] = j.second.front_min[0]; + item.min[1] = j.second.front_min[1]; + item.min[2] = j.second.front_min[2]; + item.max[0] = j.second.front_max[0]; + item.max[1] = j.second.front_max[1]; + item.max[2] = j.second.front_max[2]; + item.last = j.second.front_last; + item.back = false; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + } + } + + // Network ping times (1min 5min 15min) + f->open_object_section("network_ping_times"); + f->dump_int("threshold", value / 1000); + f->open_array_section("entries"); + for (auto &sitem : boost::adaptors::reverse(sorted)) { + ceph_assert(!value || sitem.pingtime >= value); + + f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); + f->dump_int("from osd", sitem.from); + f->dump_int("to osd", sitem.to); + f->dump_string("interface", (sitem.back ? 
"back" : "front")); + f->open_object_section("average"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); + f->close_section(); // average + f->open_object_section("min"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str()); + f->close_section(); // min + f->open_object_section("max"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // max + f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); + f->close_section(); // entry + } + f->close_section(); // entries + f->close_section(); // network_ping_times + } else { + ceph_abort_msg("broken asok registration"); + } + f->flush(ss); + delete f; + return true; +} diff --git a/src/mgr/ClusterState.h b/src/mgr/ClusterState.h new file mode 100644 index 00000000..a78d0687 --- /dev/null +++ b/src/mgr/ClusterState.h @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CLUSTER_STATE_H_ +#define CLUSTER_STATE_H_ + +#include "mds/FSMap.h" +#include "mon/MgrMap.h" +#include "common/Mutex.h" + +#include "osdc/Objecter.h" +#include "mon/MonClient.h" +#include "mon/PGMap.h" +#include "mgr/ServiceMap.h" + +class MMgrDigest; +class MMonMgrReport; +class MPGStats; + + +/** + * Cluster-scope state (things like cluster maps) as opposed + * to daemon-level state (things like perf counters and smart) + */ +class ClusterState +{ +protected: + MonClient *monc; + Objecter *objecter; + FSMap fsmap; + ServiceMap servicemap; + mutable Mutex lock; + + MgrMap mgr_map; + + map existing_pools; ///< pools that exist, and pg_num, as of PGMap epoch + PGMap pg_map; + PGMap::Incremental pending_inc; + + bufferlist health_json; + bufferlist mon_status_json; + + class ClusterSocketHook *asok_hook; + +public: + + void load_digest(MMgrDigest *m); + void ingest_pgstats(MPGStats *stats); + + void update_delta_stats(); + + ClusterState(MonClient *monc_, Objecter *objecter_, const MgrMap& mgrmap); + + void set_objecter(Objecter *objecter_); + void set_fsmap(FSMap const &new_fsmap); + void set_mgr_map(MgrMap const &new_mgrmap); + void set_service_map(ServiceMap const &new_service_map); + + void notify_osdmap(const OSDMap &osd_map); + + bool have_fsmap() const { + std::lock_guard l(lock); + return fsmap.get_epoch() > 0; + } + + template + void with_servicemap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(servicemap, std::forward(args)...); + } + + template + void with_fsmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(fsmap, std::forward(args)...); + } + + template + void with_mgrmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(mgr_map, std::forward(args)...); + } + + template + auto with_pgmap(Callback&& cb, Args&&...args) const -> + decltype(cb(pg_map, std::forward(args)...)) + { + std::lock_guard l(lock); + return 
std::forward(cb)(pg_map, std::forward(args)...); + } + + template + auto with_mutable_pgmap(Callback&& cb, Args&&...args) -> + decltype(cb(pg_map, std::forward(args)...)) + { + std::lock_guard l(lock); + return std::forward(cb)(pg_map, std::forward(args)...); + } + + template + void with_monmap(Args &&... args) const + { + std::lock_guard l(lock); + ceph_assert(monc != nullptr); + monc->with_monmap(std::forward(args)...); + } + + template + auto with_osdmap(Args &&... args) const -> + decltype(objecter->with_osdmap(std::forward(args)...)) + { + ceph_assert(objecter != nullptr); + return objecter->with_osdmap(std::forward(args)...); + } + + // call cb(osdmap, pg_map, ...args) with the appropriate locks + template + auto with_osdmap_and_pgmap(Callback&& cb, Args&& ...args) const { + ceph_assert(objecter != nullptr); + std::lock_guard l(lock); + return objecter->with_osdmap( + std::forward(cb), + pg_map, + std::forward(args)...); + } + + template + void with_health(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(health_json, std::forward(args)...); + } + + template + void with_mon_status(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(mon_status_json, std::forward(args)...); + } + + void final_init(); + void shutdown(); + bool asok_command(std::string_view admin_command, const cmdmap_t& cmdmap, + std::string_view format, ostream& ss); +}; + +#endif + diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h new file mode 100644 index 00000000..4719fa18 --- /dev/null +++ b/src/mgr/DaemonHealthMetric.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include "include/denc.h" + +enum class daemon_metric : uint8_t { + SLOW_OPS, + PENDING_CREATING_PGS, + NONE, +}; + +static inline const char *daemon_metric_name(daemon_metric t) { + switch (t) { + case daemon_metric::SLOW_OPS: 
return "SLOW_OPS"; + case daemon_metric::PENDING_CREATING_PGS: return "PENDING_CREATING_PGS"; + case daemon_metric::NONE: return "NONE"; + default: return "???"; + } +} + +union daemon_metric_t { + struct { + uint32_t n1; + uint32_t n2; + }; + uint64_t n; + daemon_metric_t(uint32_t x, uint32_t y) + : n1(x), n2(y) + {} + daemon_metric_t(uint64_t x = 0) + : n(x) + {} +}; + +class DaemonHealthMetric +{ +public: + DaemonHealthMetric() = default; + DaemonHealthMetric(daemon_metric type_, uint64_t n) + : type(type_), value(n) + {} + DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2) + : type(type_), value(n1, n2) + {} + daemon_metric get_type() const { + return type; + } + uint64_t get_n() const { + return value.n; + } + uint32_t get_n1() const { + return value.n1; + } + uint32_t get_n2() const { + return value.n2; + } + DENC(DaemonHealthMetric, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.value.n, p); + DENC_FINISH(p); + } + + friend ostream& operator<<(ostream& out, const DaemonHealthMetric& m) { + return out << daemon_metric_name(m.get_type()) << "(" + << m.get_n() << "|(" << m.get_n1() << "," << m.get_n2() << "))"; + } +private: + daemon_metric type = daemon_metric::NONE; + daemon_metric_t value; +}; +WRITE_CLASS_DENC(DaemonHealthMetric) diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc new file mode 100644 index 00000000..1c3dc431 --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.cc @@ -0,0 +1,125 @@ +#include + +#include "include/health.h" +#include "include/types.h" +#include "DaemonHealthMetricCollector.h" + + + +ostream& operator<<(ostream& os, + const DaemonHealthMetricCollector::DaemonKey& daemon) { + return os << daemon.first << "." 
<< daemon.second; +} + +// define operator<<(ostream&, const vector&) after +// ostream& operator<<(ostream&, const DaemonKey&), so that C++'s +// ADL can use the former instead of using the generic one: +// operator<<(ostream&, const std::pair&) +ostream& operator<<( + ostream& os, + const vector& daemons) +{ + os << "["; + for (auto d = daemons.begin(); d != daemons.end(); ++d) { + if (d != daemons.begin()) os << ","; + os << *d; + } + os << "]"; + return os; +} + +namespace { + +class SlowOps final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::SLOW_OPS; + } + health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("SLOW_OPS", HEALTH_WARN, ""); + } + bool _update(const DaemonKey& daemon, + const DaemonHealthMetric& metric) override { + auto num_slow = metric.get_n1(); + auto blocked_time = metric.get_n2(); + value.n1 += num_slow; + value.n2 = std::max(value.n2, blocked_time); + if (num_slow || blocked_time) { + daemons.push_back(daemon); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (daemons.empty()) { + return; + } + static const char* fmt = "%1% slow ops, oldest one blocked for %2% sec, %3%"; + // Note this message format is used in mgr/prometheus, so any change in format + // requires a corresponding change in the mgr/prometheus module. + ostringstream ss; + if (daemons.size() > 1) { + if (daemons.size() > 10) { + ss << "daemons " << vector(daemons.begin(), daemons.begin()+10) + << "..." 
<< " have slow ops."; + } else { + ss << "daemons " << daemons << " have slow ops."; + } + } else { + ss << daemons.front() << " has slow ops"; + } + check.summary = boost::str(boost::format(fmt) % value.n1 % value.n2 % ss.str()); + // No detail + } + vector daemons; +}; + + +class PendingPGs final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::PENDING_CREATING_PGS; + } + health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, ""); + } + bool _update(const DaemonKey& osd, + const DaemonHealthMetric& metric) override { + value.n += metric.get_n(); + if (metric.get_n()) { + osds.push_back(osd); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (osds.empty()) { + return; + } + static const char* fmt = "%1% PGs pending on creation"; + check.summary = boost::str(boost::format(fmt) % value.n); + ostringstream ss; + if (osds.size() > 1) { + ss << "osds " << osds << " have pending PGs."; + } else { + ss << osds.front() << " has pending PGs"; + } + check.detail.push_back(ss.str()); + } + vector osds; +}; + +} // anonymous namespace + +unique_ptr +DaemonHealthMetricCollector::create(daemon_metric m) +{ + switch (m) { + case daemon_metric::SLOW_OPS: + return unique_ptr{new SlowOps}; + case daemon_metric::PENDING_CREATING_PGS: + return unique_ptr{new PendingPGs}; + default: + return unique_ptr{}; + } +} diff --git a/src/mgr/DaemonHealthMetricCollector.h b/src/mgr/DaemonHealthMetricCollector.h new file mode 100644 index 00000000..42bf905f --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +#include "DaemonHealthMetric.h" +#include "mon/health_check.h" + +class DaemonHealthMetricCollector { +public: + using DaemonKey = std::pair; + static std::unique_ptr create(daemon_metric m); + void update(const 
DaemonKey& daemon, const DaemonHealthMetric& metric) { + if (_is_relevant(metric.get_type())) { + reported |= _update(daemon, metric); + } + } + void summarize(health_check_map_t& cm) { + if (reported) { + _summarize(_get_check(cm)); + } + } + virtual ~DaemonHealthMetricCollector() {} +private: + virtual bool _is_relevant(daemon_metric type) const = 0; + virtual health_check_t& _get_check(health_check_map_t& cm) const = 0; + virtual bool _update(const DaemonKey& daemon, const DaemonHealthMetric& metric) = 0; + virtual void _summarize(health_check_t& check) const = 0; +protected: + daemon_metric_t value; + bool reported = false; +}; diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc new file mode 100644 index 00000000..d98901f0 --- /dev/null +++ b/src/mgr/DaemonServer.cc @@ -0,0 +1,2921 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "DaemonServer.h" +#include "mgr/Mgr.h" + +#include "include/stringify.h" +#include "include/str_list.h" +#include "auth/RotatingKeyRing.h" +#include "json_spirit/json_spirit_writer.h" + +#include "mgr/mgr_commands.h" +#include "mgr/DaemonHealthMetricCollector.h" +#include "mgr/OSDPerfMetricCollector.h" +#include "mon/MonCommand.h" + +#include "messages/MMgrOpen.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMonMgrReport.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MPGStats.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDForceRecovery.h" +#include "common/errno.h" +#include "common/pick_address.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.server " << __func__ << " " + +namespace { + template + bool map_compare(Map const &lhs, Map const &rhs) { + return lhs.size() == rhs.size() + && std::equal(lhs.begin(), lhs.end(), rhs.begin(), + [] (auto a, auto b) { return a.first == b.first && a.second == b.second; }); + } +} + +DaemonServer::DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef clog_, + LogChannelRef audit_clog_) + : Dispatcher(g_ceph_context), + client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes", + g_conf().get_val("mgr_client_bytes"))), + client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages", + g_conf().get_val("mgr_client_messages"))), + osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes", + g_conf().get_val("mgr_osd_bytes"))), + osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages", + g_conf().get_val("mgr_osd_messages"))), + mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes", + g_conf().get_val("mgr_mds_bytes"))), + 
mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages", + g_conf().get_val("mgr_mds_messages"))), + mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes", + g_conf().get_val("mgr_mon_bytes"))), + mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages", + g_conf().get_val("mgr_mon_messages"))), + msgr(nullptr), + monc(monc_), + finisher(finisher_), + daemon_state(daemon_state_), + cluster_state(cluster_state_), + py_modules(py_modules_), + clog(clog_), + audit_clog(audit_clog_), + lock("DaemonServer"), + pgmap_ready(false), + timer(g_ceph_context, lock), + shutting_down(false), + tick_event(nullptr), + osd_perf_metric_collector_listener(this), + osd_perf_metric_collector(osd_perf_metric_collector_listener) +{ + g_conf().add_observer(this); +} + +DaemonServer::~DaemonServer() { + delete msgr; + g_conf().remove_observer(this); +} + +int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs) +{ + // Initialize Messenger + std::string public_msgr_type = g_conf()->ms_public_type.empty() ? 
+ g_conf().get_val("ms_type") : g_conf()->ms_public_type; + msgr = Messenger::create(g_ceph_context, public_msgr_type, + entity_name_t::MGR(gid), + "mgr", + getpid(), 0); + msgr->set_default_policy(Messenger::Policy::stateless_server(0)); + + msgr->set_auth_client(monc); + + // throttle clients + msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT, + client_byte_throttler.get(), + client_msg_throttler.get()); + + // servers + msgr->set_policy_throttlers(entity_name_t::TYPE_OSD, + osd_byte_throttler.get(), + osd_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MDS, + mds_byte_throttler.get(), + mds_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MON, + mon_byte_throttler.get(), + mon_msg_throttler.get()); + + entity_addrvec_t addrs; + int r = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &addrs); + if (r < 0) { + return r; + } + dout(20) << __func__ << " will bind to " << addrs << dendl; + r = msgr->bindv(addrs); + if (r < 0) { + derr << "unable to bind mgr to " << addrs << dendl; + return r; + } + + msgr->set_myname(entity_name_t::MGR(gid)); + msgr->set_addr_unknowns(client_addrs); + + msgr->start(); + msgr->add_dispatcher_tail(this); + + msgr->set_auth_server(monc); + monc->set_handle_authentication_dispatcher(this); + + started_at = ceph_clock_now(); + + std::lock_guard l(lock); + timer.init(); + + schedule_tick_locked( + g_conf().get_val("mgr_tick_period").count()); + + return 0; +} + +entity_addrvec_t DaemonServer::get_myaddrs() const +{ + return msgr->get_myaddrs(); +} + +KeyStore *DaemonServer::ms_get_auth1_authorizer_keystore() +{ + return monc->rotating_secrets.get(); +} + +int DaemonServer::ms_handle_authentication(Connection *con) +{ + int ret = 0; + MgrSession *s = new MgrSession(cct); + con->set_priv(s); + s->inst.addr = con->get_peer_addr(); + s->entity_name = con->peer_name; + dout(10) << __func__ << " new session " << s << " con " << con + << " entity " << con->peer_name + << " addr " << 
con->get_peer_addrs() + << dendl; + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) { + dout(10) << " session " << s << " " << s->entity_name + << " allow_all" << dendl; + s->caps.set_allow_all(); + } + + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (buffer::error& e) { + ret = -EPERM; + } + bool success = s->caps.parse(str); + if (success) { + dout(10) << " session " << s << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + ret = 1; + } else { + dout(10) << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + ret = -EPERM; + } + } + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + std::lock_guard l(lock); + s->osd_id = atoi(s->entity_name.get_id().c_str()); + dout(10) << "registering osd." << s->osd_id << " session " + << s << " con " << con << dendl; + osd_cons[s->osd_id].insert(con); + } + + return ret; +} + +bool DaemonServer::ms_get_authorizer( + int dest_type, + AuthAuthorizer **authorizer) +{ + dout(10) << "type=" << ceph_entity_type_name(dest_type) << dendl; + + if (dest_type == CEPH_ENTITY_TYPE_MON) { + return true; + } + + *authorizer = monc->build_authorizer(dest_type); + dout(20) << "got authorizer " << *authorizer << dendl; + return *authorizer != NULL; +} + +bool DaemonServer::ms_handle_reset(Connection *con) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + auto priv = con->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return false; + } + std::lock_guard l(lock); + dout(10) << "unregistering osd." 
<< session->osd_id + << " session " << session << " con " << con << dendl; + osd_cons[session->osd_id].erase(con); + + auto iter = daemon_connections.find(con); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + } + return false; +} + +bool DaemonServer::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +bool DaemonServer::ms_dispatch(Message *m) +{ + // Note that we do *not* take ::lock here, in order to avoid + // serializing all message handling. It's up to each handler + // to take whatever locks it needs. + switch (m->get_type()) { + case MSG_PGSTATS: + cluster_state.ingest_pgstats(static_cast(m)); + maybe_ready(m->get_source().num()); + m->put(); + return true; + case MSG_MGR_REPORT: + return handle_report(static_cast(m)); + case MSG_MGR_OPEN: + return handle_open(static_cast(m)); + case MSG_MGR_CLOSE: + return handle_close(static_cast(m)); + case MSG_COMMAND: + return handle_command(static_cast(m)); + default: + dout(1) << "Unhandled message type " << m->get_type() << dendl; + return false; + }; +} + +void DaemonServer::dump_pg_ready(ceph::Formatter *f) +{ + f->dump_bool("pg_ready", pgmap_ready.load()); +} + +void DaemonServer::maybe_ready(int32_t osd_id) +{ + if (pgmap_ready.load()) { + // Fast path: we don't need to take lock because pgmap_ready + // is already set + } else { + std::lock_guard l(lock); + + if (reported_osds.find(osd_id) == reported_osds.end()) { + dout(4) << "initial report from osd " << osd_id << dendl; + reported_osds.insert(osd_id); + std::set up_osds; + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + osdmap.get_up_osds(up_osds); + }); + + std::set unreported_osds; + std::set_difference(up_osds.begin(), up_osds.end(), + reported_osds.begin(), reported_osds.end(), + std::inserter(unreported_osds, unreported_osds.begin())); + + if (unreported_osds.size() == 0) { + dout(4) << "all osds have reported, sending PG state to mon" << dendl; + pgmap_ready = true; + 
reported_osds.clear(); + // Avoid waiting for next tick + send_report(); + } else { + dout(4) << "still waiting for " << unreported_osds.size() << " osds" + " to report in before PGMap is ready" << dendl; + } + } + } +} + +void DaemonServer::tick() +{ + dout(10) << dendl; + send_report(); + adjust_pgs(); + + schedule_tick_locked( + g_conf().get_val("mgr_tick_period").count()); +} + +// Currently modules do not set health checks in response to events delivered to +// all modules (e.g. notify) so we do not risk a thundering herd situation here. +// if this pattern emerges in the future, this scheduler could be modified to +// fire after all modules have had a chance to set their health checks. +void DaemonServer::schedule_tick_locked(double delay_sec) +{ + ceph_assert(lock.is_locked_by_me()); + + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = nullptr; + } + + // on shutdown start rejecting explicit requests to send reports that may + // originate from python land which may still be running. 
+ if (shutting_down) + return; + + tick_event = timer.add_event_after(delay_sec, + new FunctionContext([this](int r) { + tick(); + })); +} + +void DaemonServer::schedule_tick(double delay_sec) +{ + std::lock_guard l(lock); + schedule_tick_locked(delay_sec); +} + +void DaemonServer::handle_osd_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new FunctionContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_osd()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::shutdown() +{ + dout(10) << "begin" << dendl; + msgr->shutdown(); + msgr->wait(); + cluster_state.shutdown(); + dout(10) << "done" << dendl; + + std::lock_guard l(lock); + shutting_down = true; + timer.shutdown(); +} + +static DaemonKey key_from_service( + const std::string& service_name, + int peer_type, + const std::string& daemon_name) +{ + if (!service_name.empty()) { + return DaemonKey(service_name, daemon_name); + } else { + return DaemonKey(ceph_entity_type_name(peer_type), daemon_name); + } +} + +static bool key_from_string( + const std::string& name, + DaemonKey *out) +{ + auto p = name.find('.'); + if (p == std::string::npos) { + return false; + } + out->first = name.substr(0, p); + out->second = name.substr(p + 1); + return true; +} + +bool DaemonServer::handle_open(MMgrOpen *m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + + auto con = m->get_connection(); + dout(4) << "from " << key << " " << con << dendl; + + _send_configure(con); + + DaemonStatePtr daemon; + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } + if (!daemon) { + if (m->service_daemon) { + dout(4) << "constructing new DaemonState for " << key << 
dendl; + daemon = std::make_shared(daemon_state.types); + daemon->key = key; + daemon->service_daemon = true; + daemon_state.insert(daemon); + } else { + /* A normal Ceph daemon has connected but we are or should be waiting on + * metadata for it. Close the session so that it tries to reconnect. + */ + dout(2) << "ignoring open from " << key << " " << con->get_peer_addr() + << "; not ready for session (expect reconnect)" << dendl; + con->mark_down(); + m->put(); + return true; + } + } + if (daemon) { + if (m->service_daemon) { + // update the metadata through the daemon state index to + // ensure it's kept up-to-date + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + + std::lock_guard l(daemon->lock); + daemon->perf_counters.clear(); + + daemon->service_daemon = m->service_daemon; + if (m->service_daemon) { + daemon->service_status = m->daemon_status; + + utime_t now = ceph_clock_now(); + auto [d, added] = pending_service_map.get_daemon(m->service_name, + m->daemon_name); + if (added || d->gid != (uint64_t)m->get_source().num()) { + dout(10) << "registering " << key << " in pending_service_map" << dendl; + d->gid = m->get_source().num(); + d->addr = m->get_source_addr(); + d->start_epoch = pending_service_map.epoch; + d->start_stamp = now; + d->metadata = m->daemon_metadata; + pending_service_map_dirty = pending_service_map.epoch; + } + } + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + daemon->config_defaults_bl = m->config_defaults_bl; + daemon->config_defaults.clear(); + dout(20) << " got config_defaults_bl " << daemon->config_defaults_bl.length() + << " bytes" << dendl; + } + + if (con->get_peer_type() != entity_name_t::TYPE_CLIENT && + m->service_name.empty()) + { + // Store in set of the daemon/service connections, i.e. 
those + // connections that require an update in the event of stats + // configuration changes. + daemon_connections.insert(con); + } + + m->put(); + return true; +} + +bool DaemonServer::handle_close(MMgrClose *m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + dout(4) << "from " << m->get_connection() << " " << key << dendl; + + if (daemon_state.exists(key)) { + DaemonStatePtr daemon = daemon_state.get(key); + daemon_state.rm(key); + { + std::lock_guard l(daemon->lock); + if (daemon->service_daemon) { + pending_service_map.rm_daemon(m->service_name, m->daemon_name); + pending_service_map_dirty = pending_service_map.epoch; + } + } + } + + // send same message back as a reply + m->get_connection()->send_message(m); + return true; +} + +void DaemonServer::update_task_status(DaemonKey key, MMgrReport *m) { + dout(10) << "got task status from " << key << dendl; + + auto p = pending_service_map.get_daemon(key.first, key.second); + if (!map_compare(p.first->task_status, *m->task_status)) { + p.first->task_status = *m->task_status; + pending_service_map_dirty = pending_service_map.epoch; + } +} + +bool DaemonServer::handle_report(MMgrReport *m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.first = m->service_name; + } else { + key.first = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.second = m->daemon_name; + + dout(4) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us stats unless they are declaring + // themselves to be a daemon for some service. 
+ dout(4) << "rejecting report from non-daemon client " << m->daemon_name + << dendl; + m->get_connection()->mark_down(); + m->put(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } else { + locker.unlock(); + + // we don't know the hostname at this stage, reject MMgrReport here. + dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." + << dendl; + // issue metadata request in background + if (!daemon_state.is_updating(key) && + (key.first == "osd" || key.first == "mds" || key.first == "mon")) { + + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + if (key.first == "osd") { + oss << "{\"prefix\": \"osd metadata\", \"id\": " + << key.second<< "}"; + + } else if (key.first == "mds") { + c->set_default("addr", stringify(m->get_source_addr())); + oss << "{\"prefix\": \"mds metadata\", \"who\": \"" + << key.second << "\"}"; + + } else if (key.first == "mon") { + oss << "{\"prefix\": \"mon metadata\", \"id\": \"" + << key.second << "\"}"; + } else { + ceph_abort(); + } + + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + } + + locker.lock(); + + // kill session + auto priv = m->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return false; + } + m->get_connection()->mark_down(); + + dout(10) << "unregistering osd." 
<< session->osd_id + << " session " << session << " con " << m->get_connection() << dendl; + + if (osd_cons.find(session->osd_id) != osd_cons.end()) { + osd_cons[session->osd_id].erase(m->get_connection()); + } + + auto iter = daemon_connections.find(m->get_connection()); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + + return false; + } + + // Update the DaemonState + ceph_assert(daemon != nullptr); + { + std::lock_guard l(daemon->lock); + auto &daemon_counters = daemon->perf_counters; + daemon_counters.update(m); + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + + utime_t now = ceph_clock_now(); + if (daemon->service_daemon) { + if (m->daemon_status) { + daemon->service_status_stamp = now; + daemon->service_status = *m->daemon_status; + } + daemon->last_service_beacon = now; + } else if (m->daemon_status) { + derr << "got status from non-daemon " << key << dendl; + } + // update task status + if (m->task_status) { + update_task_status(key, m); + daemon->last_service_beacon = now; + } + if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { + // only OSD and MON send health_checks to me now + daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); + dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics + << dendl; + } + } + } + + // if there are any schema updates, notify the python modules + if (!m->declare_types.empty() || !m->undeclare_types.empty()) { + ostringstream oss; + oss << key.first << '.' 
<< key.second; + py_modules.notify_all("perf_schema_update", oss.str()); + } + + if (m->get_connection()->peer_is_osd()) { + osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports); + } + + m->put(); + return true; +} + + +void DaemonServer::_generate_command_map( + cmdmap_t& cmdmap, + map ¶m_str_map) +{ + for (auto p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector cv; + if (cmd_getval(g_ceph_context, cmdmap, "caps", cv) && + cv.size() % 2 == 0) { + for (unsigned i = 0; i < cv.size(); i += 2) { + string k = string("caps_") + cv[i]; + param_str_map[k] = cv[i + 1]; + } + continue; + } + } + param_str_map[p->first] = cmd_vartype_stringify(p->second); + } +} + +const MonCommand *DaemonServer::_get_mgrcommand( + const string &cmd_prefix, + const std::vector &cmds) +{ + const MonCommand *this_cmd = nullptr; + for (const auto &cmd : cmds) { + if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) { + this_cmd = &cmd; + break; + } + } + return this_cmd; +} + +bool DaemonServer::_allowed_command( + MgrSession *s, + const string &service, + const string &module, + const string &prefix, + const cmdmap_t& cmdmap, + const map& param_str_map, + const MonCommand *this_cmd) { + + if (s->entity_name.is_mon()) { + // mon is all-powerful. even when it is forwarding commands on behalf of + // old clients; we expect the mon is validating commands before proxying! + return true; + } + + bool cmd_r = this_cmd->requires_perm('r'); + bool cmd_w = this_cmd->requires_perm('w'); + bool cmd_x = this_cmd->requires_perm('x'); + + bool capable = s->caps.is_capable( + g_ceph_context, + s->entity_name, + service, module, prefix, param_str_map, + cmd_r, cmd_w, cmd_x, + s->get_peer_addr()); + + dout(10) << " " << s->entity_name << " " + << (capable ? "" : "not ") << "capable" << dendl; + return capable; +} + +/** + * The working data for processing an MCommand. 
This lives in + * a class to enable passing it into other threads for processing + * outside of the thread/locks that called handle_command. + */ +class CommandContext { +public: + MCommand *m; + bufferlist odata; + cmdmap_t cmdmap; + + explicit CommandContext(MCommand *m_) + : m(m_) { + } + + ~CommandContext() { + m->put(); + } + + void reply(int r, const std::stringstream &ss) { + reply(r, ss.str()); + } + + void reply(int r, const std::string &rs) { + // Let the connection drop as soon as we've sent our response + ConnectionRef con = m->get_connection(); + if (con) { + con->mark_disposable(); + } + + if (r == 0) { + dout(4) << __func__ << " success" << dendl; + } else { + derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl; + } + if (con) { + MCommandReply *reply = new MCommandReply(r, rs); + reply->set_tid(m->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } + } +}; + +/** + * A context for receiving a bufferlist/error string from a background + * function and then calling back to a CommandContext when it's done + */ +class ReplyOnFinish : public Context { + std::shared_ptr cmdctx; + +public: + bufferlist from_mon; + string outs; + + explicit ReplyOnFinish(const std::shared_ptr &cmdctx_) + : cmdctx(cmdctx_) + {} + void finish(int r) override { + cmdctx->odata.claim_append(from_mon); + cmdctx->reply(r, outs); + } +}; + +bool DaemonServer::handle_command(MCommand *m) +{ + std::lock_guard l(lock); + std::shared_ptr cmdctx = std::make_shared(m); + try { + return _handle_command(m, cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +void DaemonServer::log_access_denied( + std::shared_ptr& cmdctx, + MgrSession* session, std::stringstream& ss) { + dout(1) << " access denied" << dendl; + audit_clog->info() << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << cmdctx->cmdmap << ": access denied"; + ss << "access denied: does your client key 
have mgr caps? " + "See http://docs.ceph.com/docs/master/mgr/administrator/" + "#client-authentication"; +} + +bool DaemonServer::_handle_command( + MCommand *m, + std::shared_ptr& cmdctx) +{ + auto priv = m->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + if (!session) { + return true; + } + if (session->inst.name == entity_name_t()) + session->inst.name = m->get_source(); + + std::string format; + boost::scoped_ptr f; + map param_str_map; + std::stringstream ss; + int r = 0; + + if (!cmdmap_from_json(m->cmd, &(cmdctx->cmdmap), ss)) { + cmdctx->reply(-EINVAL, ss); + return true; + } + + { + cmd_getval(g_ceph_context, cmdctx->cmdmap, "format", format, string("plain")); + f.reset(Formatter::create(format)); + } + + string prefix; + cmd_getval(cct, cmdctx->cmdmap, "prefix", prefix); + + dout(4) << "decoded " << cmdctx->cmdmap.size() << dendl; + dout(4) << "prefix=" << prefix << dendl; + + if (prefix == "get_command_descriptions") { + dout(10) << "reading commands from python modules" << dendl; + const auto py_commands = py_modules.get_commands(); + + int cmdnum = 0; + JSONFormatter f; + f.open_object_section("command_descriptions"); + + auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){ + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(&f, m->get_connection()->get_features(), + secname.str(), mc.cmdstring, mc.helpstring, + mc.module, mc.req_perms, 0); + cmdnum++; + }; + + for (const auto &pyc : py_commands) { + dump_cmd(pyc); + } + + for (const auto &mgr_cmd : mgr_commands) { + dump_cmd(mgr_cmd); + } + + f.close_section(); // command_descriptions + f.flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + // lookup command + const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands); + _generate_command_map(cmdctx->cmdmap, param_str_map); + + bool is_allowed = false; + ModuleCommand py_command; + if (!mgr_cmd) { + // Resolve the command to the name of the 
module that will + // handle it (if the command exists) + auto py_commands = py_modules.get_py_commands(); + for (const auto &pyc : py_commands) { + auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring); + if (pyc_prefix == prefix) { + py_command = pyc; + break; + } + } + + MonCommand pyc = {"", "", "py", py_command.perm}; + is_allowed = _allowed_command(session, "py", py_command.module_name, + prefix, cmdctx->cmdmap, param_str_map, + &pyc); + } else { + // validate user's permissions for requested command + is_allowed = _allowed_command(session, mgr_cmd->module, "", + prefix, cmdctx->cmdmap, param_str_map, mgr_cmd); + } + + if (!is_allowed) { + log_access_denied(cmdctx, session, ss); + cmdctx->reply(-EACCES, ss); + return true; + } + + audit_clog->debug() + << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << m->cmd << ": dispatch"; + + // ---------------- + // service map commands + if (prefix == "service dump") { + if (!f) + f.reset(Formatter::create("json-pretty")); + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + f->dump_object("service_map", service_map); + }); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + if (prefix == "service status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + // only include state from services that are in the persisted service map + f->open_object_section("service_status"); + for (auto& p : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(p.first)) { + continue; + } + + f->open_object_section(p.first.c_str()); + for (auto& q : p.second.daemons) { + f->open_object_section(q.first.c_str()); + DaemonKey key(p.first, q.first); + ceph_assert(daemon_state.exists(key)); + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + f->dump_stream("status_stamp") << daemon->service_status_stamp; + f->dump_stream("last_beacon") << daemon->last_service_beacon; + f->open_object_section("status"); + for 
(auto& r : daemon->service_status) { + f->dump_string(r.first.c_str(), r.second); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + if (prefix == "config set") { + std::string key; + std::string val; + cmd_getval(cct, cmdctx->cmdmap, "key", key); + cmd_getval(cct, cmdctx->cmdmap, "value", val); + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + cmdctx->reply(0, ss); + return true; + } + + // ----------- + // PG commands + + if (prefix == "pg scrub" || + prefix == "pg repair" || + prefix == "pg deep-scrub") { + string scrubop = prefix.substr(3, string::npos); + pg_t pgid; + spg_t spgid; + string pgidstr; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool pg_exists = false; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + pg_exists = osdmap.pg_exists(pgid); + }); + if (!pg_exists) { + ss << "pg " << pgid << " does not exist"; + cmdctx->reply(-ENOENT, ss); + return true; + } + int acting_primary = -1; + epoch_t epoch; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + osdmap.get_primary_shard(pgid, &acting_primary, &spgid); + }); + if (acting_primary == -1) { + ss << "pg " << pgid << " has no primary osd"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + auto p = osd_cons.find(acting_primary); + if (p == osd_cons.end()) { + ss << "pg " << pgid << " primary osd." 
<< acting_primary + << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + vector pgs = { spgid }; + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } else { + vector pgs = { pgid }; + con->send_message(new MOSDScrub(monc->get_fsid(), + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } + } + ss << "instructing pg " << spgid << " on osd." << acting_primary + << " to " << scrubop; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd scrub" || + prefix == "osd deep-scrub" || + prefix == "osd repair") { + string whostr; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", whostr); + vector pvec; + get_str_vec(prefix, pvec); + + set osds; + if (whostr == "*" || whostr == "all" || whostr == "any") { + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (int i = 0; i < osdmap.get_max_osd(); i++) + if (osdmap.is_up(i)) { + osds.insert(i); + } + }); + } else { + long osd = parse_osd_id(whostr.c_str(), &ss); + if (osd < 0) { + ss << "invalid osd '" << whostr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + if (osdmap.is_up(osd)) { + osds.insert(osd); + } + }); + if (osds.empty()) { + ss << "osd." 
<< osd << " is not up"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + } + set sent_osds, failed_osds; + for (auto osd : osds) { + vector spgs; + epoch_t epoch; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + epoch = osdmap.get_epoch(); + auto p = pgmap.pg_by_osd.find(osd); + if (p != pgmap.pg_by_osd.end()) { + for (auto pgid : p->second) { + int primary; + spg_t spg; + osdmap.get_primary_shard(pgid, &primary, &spg); + if (primary == osd) { + spgs.push_back(spg); + } + } + } + }); + auto p = osd_cons.find(osd); + if (p == osd_cons.end()) { + failed_osds.insert(osd); + } else { + sent_osds.insert(osd); + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + spgs, + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } else { + con->send_message(new MOSDScrub(monc->get_fsid(), + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } + } + } + } + if (failed_osds.size() == osds.size()) { + ss << "failed to instruct osd(s) " << osds << " to " << pvec.back() + << " (not connected)"; + r = -EAGAIN; + } else { + ss << "instructed osd(s) " << sent_osds << " to " << pvec.back(); + if (!failed_osds.empty()) { + ss << "; osd(s) " << failed_osds << " were not connected"; + } + r = 0; + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd pool scrub" || + prefix == "osd pool deep-scrub" || + prefix == "osd pool repair") { + vector pool_names; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + epoch_t epoch; + map> pgs_by_primary; // legacy + map> spgs_by_primary; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << 
"unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) { + pg_t pg(i, pool_id); + int primary; + spg_t spg; + auto got = osdmap.get_primary_shard(pg, &primary, &spg); + if (!got) + continue; + pgs_by_primary[primary].push_back(pg); + spgs_by_primary[primary].push_back(spg); + } + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + for (auto& it : spgs_by_primary) { + auto primary = it.first; + auto p = osd_cons.find(primary); + if (p == osd_cons.end()) { + ss << "osd." << primary << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + if (HAVE_FEATURE(con->get_features(), SERVER_MIMIC)) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + it.second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } else { + // legacy + auto q = pgs_by_primary.find(primary); + ceph_assert(q != pgs_by_primary.end()); + con->send_message(new MOSDScrub(monc->get_fsid(), + q->second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } + } + } + cmdctx->reply(0, ""); + return true; + } else if (prefix == "osd reweight-by-pg" || + prefix == "osd reweight-by-utilization" || + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization") { + bool by_pg = + prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg"; + bool dry_run = + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization"; + int64_t oload; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "oload", oload, int64_t(120)); + set pools; + vector poolnames; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "pools", poolnames); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (const auto& poolname : poolnames) { + int64_t pool = osdmap.lookup_pg_pool_name(poolname); + if (pool < 0) { + ss << "pool '" << poolname << "' does not 
exist"; + r = -ENOENT; + } + pools.insert(pool); + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + + double max_change = g_conf().get_val("mon_reweight_max_change"); + cmd_getval(g_ceph_context, cmdctx->cmdmap, "max_change", max_change); + if (max_change <= 0.0) { + ss << "max_change " << max_change << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + int64_t max_osds = g_conf().get_val("mon_reweight_max_osds"); + cmd_getval(g_ceph_context, cmdctx->cmdmap, "max_osds", max_osds); + if (max_osds <= 0) { + ss << "max_osds " << max_osds << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool no_increasing = false; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "no_increasing", no_increasing); + string out_str; + mempool::osdmap::map new_weights; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) { + return reweight::by_utilization(osdmap, pgmap, + oload, + max_change, + max_osds, + by_pg, + pools.empty() ? 
NULL : &pools, + no_increasing, + &new_weights, + &ss, &out_str, f.get()); + }); + if (r >= 0) { + dout(10) << "reweight::by_utilization: finished with " << out_str << dendl; + } + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(out_str); + } + if (r < 0) { + ss << "FAILED reweight-by-pg"; + cmdctx->reply(r, ss); + return true; + } else if (r == 0 || dry_run) { + ss << "no change"; + cmdctx->reply(r, ss); + return true; + } else { + json_spirit::Object json_object; + for (const auto& osd_weight : new_weights) { + json_spirit::Config::add(json_object, + std::to_string(osd_weight.first), + std::to_string(osd_weight.second)); + } + string s = json_spirit::write(json_object); + std::replace(begin(s), end(s), '\"', '\''); + const string cmd = + "{" + "\"prefix\": \"osd reweightn\", " + "\"weights\": \"" + s + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, + &on_finish->from_mon, &on_finish->outs, on_finish); + return true; + } + } else if (prefix == "osd df") { + string method; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "output_method", method); + string filter_by; + string filter; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "filter_by", filter_by); + cmd_getval(g_ceph_context, cmdctx->cmdmap, "filter", filter); + if (filter_by.empty() != filter.empty()) { + cmdctx->reply(-EINVAL, "you must specify both 'filter_by' and 'filter'"); + return true; + } + stringstream rs; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + string class_name; + string item_name; + // sanity check filter(s) + if (filter_by == "class") { + if (!osdmap.crush->class_exists(filter)) { + rs << "specified class '" << filter << "' does not exist"; + return -EINVAL; + } + class_name = filter; + } + if (filter_by == "name") { + if (!osdmap.crush->name_exists(filter)) { + rs << "specified name '" << filter << "' does not exist"; + return -EINVAL; + } + item_name = filter; + } + 
print_osd_utilization(osdmap, pgmap, ss, + f.get(), method == "tree", + class_name, item_name); + + cmdctx->odata.append(ss); + return 0; + }); + cmdctx->reply(r, rs); + return true; + } else if (prefix == "osd pool stats") { + string pool_name; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "pool_name", pool_name); + int64_t poolid = -ENOENT; + bool one_pool = false; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (!pool_name.empty()) { + poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + ceph_assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + one_pool = true; + } + stringstream rs; + if (f) + f->open_array_section("pool_stats"); + else { + if (osdmap.get_pools().empty()) { + ss << "there are no pools!"; + goto stats_out; + } + } + for (auto &p : osdmap.get_pools()) { + if (!one_pool) { + poolid = p.first; + } + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs); + if (one_pool) { + break; + } + } + stats_out: + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + return 0; + }); + if (r != -EOPNOTSUPP) { + cmdctx->reply(r, ss); + return true; + } + } else if (prefix == "osd safe-to-destroy" || + prefix == "osd destroy" || + prefix == "osd purge") { + set osds; + int r = 0; + if (prefix == "osd safe-to-destroy") { + vector ids; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + } else { + int64_t id; + if (!cmd_getval(g_ceph_context, cmdctx->cmdmap, "id", id)) { + r = -EINVAL; + ss << "must specify OSD id"; + } else { + osds.insert(id); + } + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + set active_osds, missing_stats, stored_pgs, safe_to_destroy; + int 
affected_pgs = 0; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw" + << " any conclusions"; + r = -EAGAIN; + return; + } + int num_active_clean = 0; + for (auto& p : pg_map.num_pg_by_state) { + unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN; + if ((p.first & want) == want) { + num_active_clean += p.second; + } + } + for (auto osd : osds) { + if (!osdmap.exists(osd)) { + safe_to_destroy.insert(osd); + continue; // clearly safe to destroy + } + auto q = pg_map.num_pg_by_osd.find(osd); + if (q != pg_map.num_pg_by_osd.end()) { + if (q->second.acting > 0 || q->second.up_not_acting > 0) { + active_osds.insert(osd); + // XXX: For overlapping PGs, this counts them again + affected_pgs += q->second.acting + q->second.up_not_acting; + continue; + } + } + if (num_active_clean < pg_map.num_pg) { + // all pgs aren't active+clean; we need to be careful. + auto p = pg_map.osd_stat.find(osd); + if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) { + missing_stats.insert(osd); + continue; + } else if (p->second.num_pgs > 0) { + stored_pgs.insert(osd); + continue; + } + } + safe_to_destroy.insert(osd); + } + }); + if (r && prefix == "osd safe-to-destroy") { + cmdctx->reply(r, ss); // regardless of formatter + return true; + } + if (!r && (!active_osds.empty() || + !missing_stats.empty() || !stored_pgs.empty())) { + if (!safe_to_destroy.empty()) { + ss << "OSD(s) " << safe_to_destroy + << " are safe to destroy without reducing data durability. "; + } + if (!active_osds.empty()) { + ss << "OSD(s) " << active_osds << " have " << affected_pgs + << " pgs currently mapped to them. "; + } + if (!missing_stats.empty()) { + ss << "OSD(s) " << missing_stats << " have no reported stats, and not all" + << " PGs are active+clean; we cannot draw any conclusions. 
"; + } + if (!stored_pgs.empty()) { + ss << "OSD(s) " << stored_pgs << " last reported they still store some PG" + << " data, and not all PGs are active+clean; we cannot be sure they" + << " aren't still needed."; + } + if (!active_osds.empty() || !stored_pgs.empty()) { + r = -EBUSY; + } else { + r = -EAGAIN; + } + } + + if (prefix == "osd safe-to-destroy") { + if (!r) { + ss << "OSD(s) " << osds << " are safe to destroy without reducing data" + << " durability."; + } + if (f) { + f->open_object_section("osd_status"); + f->open_array_section("safe_to_destroy"); + for (auto i : safe_to_destroy) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("active"); + for (auto i : active_osds) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("missing_stats"); + for (auto i : missing_stats) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("stored_pgs"); + for (auto i : stored_pgs) + f->dump_int("osd", i); + f->close_section(); + f->close_section(); // osd_status + f->flush(cmdctx->odata); + r = 0; + std::stringstream().swap(ss); + } + cmdctx->reply(r, ss); + return true; + } + + if (r) { + bool force = false; + cmd_getval(cct, cmdctx->cmdmap, "force", force); + if (!force) { + // Backward compat + cmd_getval(cct, cmdctx->cmdmap, "yes_i_really_mean_it", force); + } + if (!force) { + ss << "\nYou can proceed by passing --force, but be warned that" + " this will likely mean real, permanent data loss."; + } else { + r = 0; + } + } + if (r) { + cmdctx->reply(r, ss); + return true; + } + const string cmd = + "{" + "\"prefix\": \"" + prefix + "-actual\", " + "\"id\": " + stringify(osds) + ", " + "\"yes_i_really_mean_it\": true" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish); + return true; + } else if (prefix == "osd ok-to-stop") { + vector ids; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids); + set osds; + int r; + 
cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + int touched_pgs = 0; + int dangerous_pgs = 0; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << pg_map.num_pg_unknown << " pgs have unknown state; " + << "cannot draw any conclusions"; + r = -EAGAIN; + return; + } + for (const auto& q : pg_map.pg_stat) { + set pg_acting; // net acting sets (with no missing if degraded) + bool found = false; + if (q.second.state & PG_STATE_DEGRADED) { + for (auto& anm : q.second.avail_no_missing) { + if (osds.count(anm.osd)) { + found = true; + continue; + } + if (anm.osd != CRUSH_ITEM_NONE) { + pg_acting.insert(anm.osd); + } + } + } else { + for (auto& a : q.second.acting) { + if (osds.count(a)) { + found = true; + continue; + } + if (a != CRUSH_ITEM_NONE) { + pg_acting.insert(a); + } + } + } + if (!found) { + continue; + } + touched_pgs++; + if (!(q.second.state & PG_STATE_ACTIVE) || + (q.second.state & PG_STATE_DEGRADED)) { + ++dangerous_pgs; + continue; + } + const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool()); + if (!pi) { + ++dangerous_pgs; // pool is creating or deleting + } else { + if (pg_acting.size() < pi->min_size) { + ++dangerous_pgs; + } + } + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + if (dangerous_pgs) { + ss << dangerous_pgs << " PGs are already too degraded, would become" + << " too degraded or might become unavailable"; + cmdctx->reply(-EBUSY, ss); + return true; + } + ss << "OSD(s) " << osds << " are ok to stop without reducing" + << " availability or risking data, provided there are no other concurrent failures" + << " or interventions." 
<< std::endl; + ss << touched_pgs << " PGs are likely to be" + << " degraded (but remain available) as a result."; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "pg force-recovery" || + prefix == "pg force-backfill" || + prefix == "pg cancel-force-recovery" || + prefix == "pg cancel-force-backfill" || + prefix == "osd pool force-recovery" || + prefix == "osd pool force-backfill" || + prefix == "osd pool cancel-force-recovery" || + prefix == "osd pool cancel-force-backfill") { + vector vs; + get_str_vec(prefix, vs); + auto& granularity = vs.front(); + auto& forceop = vs.back(); + vector pgs; + + // figure out actual op just once + int actual_op = 0; + if (forceop == "force-recovery") { + actual_op = OFR_RECOVERY; + } else if (forceop == "force-backfill") { + actual_op = OFR_BACKFILL; + } else if (forceop == "cancel-force-backfill") { + actual_op = OFR_BACKFILL | OFR_CANCEL; + } else if (forceop == "cancel-force-recovery") { + actual_op = OFR_RECOVERY | OFR_CANCEL; + } + + set candidates; // deduped + if (granularity == "pg") { + // covnert pg names to pgs, discard any invalid ones while at it + vector pgids; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgids); + for (auto& i : pgids) { + pg_t pgid; + if (!pgid.parse(i.c_str())) { + ss << "invlaid pgid '" << i << "'; "; + r = -EINVAL; + continue; + } + candidates.insert(pgid); + } + } else { + // per pool + vector pool_names; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) + candidates.insert({(unsigned int)i, 
(uint64_t)pool_id}); + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + } + + cluster_state.with_pgmap([&](const PGMap& pg_map) { + for (auto& i : candidates) { + auto it = pg_map.pg_stat.find(i); + if (it == pg_map.pg_stat.end()) { + ss << "pg " << i << " does not exist; "; + r = -ENOENT; + continue; + } + auto state = it->second.state; + // discard pgs for which user requests are pointless + switch (actual_op) { + case OFR_RECOVERY: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING)) == 0) { + // don't return error, user script may be racing with cluster. + // not fatal. + ss << "pg " << i << " doesn't require recovery; "; + continue; + } else if (state & PG_STATE_FORCED_RECOVERY) { + ss << "pg " << i << " recovery already forced; "; + // return error, as it may be a bug in user script + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING)) == 0) { + ss << "pg " << i << " doesn't require backfilling; "; + continue; + } else if (state & PG_STATE_FORCED_BACKFILL) { + ss << "pg " << i << " backfill already forced; "; + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL | OFR_CANCEL: + if ((state & PG_STATE_FORCED_BACKFILL) == 0) { + ss << "pg " << i << " backfill not forced; "; + continue; + } + break; + case OFR_RECOVERY | OFR_CANCEL: + if ((state & PG_STATE_FORCED_RECOVERY) == 0) { + ss << "pg " << i << " recovery not forced; "; + continue; + } + break; + default: + ceph_abort_msg("actual_op value is not supported"); + } + pgs.push_back(i); + } // for + }); + + // respond with error only when no pgs are correct + // yes, in case of mixed errors, only the last one will be emitted, + // but the message presented will be fine + if (pgs.size() != 0) { + // clear error to not confuse users/scripts + r = 0; + } + + // optimize the command -> messages conversion, use only one + // message per distinct OSD + 
cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // group pgs to process by osd + map> osdpgs; + for (auto& pgid : pgs) { + int primary; + spg_t spg; + if (osdmap.get_primary_shard(pgid, &primary, &spg)) { + osdpgs[primary].push_back(spg); + } + } + for (auto& i : osdpgs) { + if (osdmap.is_up(i.first)) { + auto p = osd_cons.find(i.first); + if (p == osd_cons.end()) { + ss << "osd." << i.first << " is not currently connected"; + r = -EAGAIN; + continue; + } + for (auto& con : p->second) { + con->send_message( + new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op)); + } + ss << "instructing pg(s) " << i.second << " on osd." << i.first + << " to " << forceop << "; "; + } + } + }); + ss << std::endl; + cmdctx->reply(r, ss); + return true; + } else if (prefix == "config show" || + prefix == "config show-with-defaults") { + string who; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", who); + int r = 0; + auto dot = who.find('.'); + DaemonKey key; + key.first = who.substr(0, dot); + key.second = who.substr(dot + 1); + DaemonStatePtr daemon = daemon_state.get(key); + string name; + if (!daemon) { + ss << "no config state for daemon " << who; + cmdctx->reply(-ENOENT, ss); + return true; + } + + std::lock_guard l(daemon->lock); + + if (cmd_getval(g_ceph_context, cmdctx->cmdmap, "key", name)) { + // handle special options + if (name == "fsid") { + cmdctx->odata.append(stringify(monc->get_fsid()) + "\n"); + cmdctx->reply(r, ss); + return true; + } + auto p = daemon->config.find(name); + if (p != daemon->config.end() && + !p->second.empty()) { + cmdctx->odata.append(p->second.rbegin()->second + "\n"); + } else { + auto& defaults = daemon->_get_config_defaults(); + auto q = defaults.find(name); + if (q != defaults.end()) { + cmdctx->odata.append(q->second + "\n"); + } else { + r = -ENOENT; + } + } + } else if (daemon->config_defaults_bl.length() > 0) { + TextTable tbl; + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("NAME", 
TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OVERRIDES", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT); + } + if (prefix == "config show") { + // show + for (auto& i : daemon->config) { + dout(20) << " " << i.first << " -> " << i.second << dendl; + if (i.second.empty()) { + continue; + } + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + f->dump_string("value", i.second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + i.second.rbegin()->first)); + if (i.second.size() > 1) { + f->open_array_section("overrides"); + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(j->first)); + f->dump_string("value", j->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << i.first; + tbl << i.second.rbegin()->second; + tbl << ceph_conf_level_name(i.second.rbegin()->first); + if (i.second.size() > 1) { + list ov; + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + if (j->second == i.second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(j->first) + + string("[") + j->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(j->first) + + string("[") + j->second + string("]")); + + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } + } else { + // show-with-defaults + auto& defaults = daemon->_get_config_defaults(); + for (auto& i : defaults) { + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + } else { + tbl << i.first; + } + auto j = daemon->config.find(i.first); + if (j != daemon->config.end() && !j->second.empty()) { + // have config + if (f) { + f->dump_string("value", j->second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + j->second.rbegin()->first)); + if (j->second.size() > 1) { + f->open_array_section("overrides"); + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(k->first)); + f->dump_string("value", k->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << j->second.rbegin()->second; + tbl << ceph_conf_level_name(j->second.rbegin()->first); + if (j->second.size() > 1) { + list ov; + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + if (k->second == j->second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(k->first) + + string("[") + k->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(k->first) + + string("[") + k->second + string("]")); + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } else { + // only have default + if (f) { + f->dump_string("value", i.second); + f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT)); + f->close_section(); + } else { + tbl << i.second; + tbl << ceph_conf_level_name(CONF_DEFAULT); + tbl << ""; + tbl << ""; + tbl << TextTable::endrow; + } + } + } + } + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(stringify(tbl)); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device ls") { + set devids; + TextTable tbl; + if (f) { + f->open_array_section("devices"); + daemon_state.with_devices([&f](const DeviceState& dev) { + f->dump_object("device", dev); + }); + f->close_section(); + f->flush(cmdctx->odata); + } else { + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + daemon_state.with_devices([&tbl, now](const DeviceState& dev) { + string h; + for (auto& i : dev.devnames) { + if (h.size()) { + h += " "; + } + h += i.first + ":" + i.second; + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << h + << d + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device ls-by-daemon") { + string who; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", who); + DaemonKey k; + if (!key_from_string(who, &k)) { + ss << who << " is not a valid daemon name"; + r = -EINVAL; + } else { + auto dm = daemon_state.get(k); + if (dm) { + if (f) { + f->open_array_section("devices"); + for (auto& i : dm->devices) { + daemon_state.with_device(i.first, [&f] (const DeviceState& dev) { + 
f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, + TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& i : dm->devices) { + daemon_state.with_device( + i.first, [&tbl, now] (const DeviceState& dev) { + string h; + for (auto& i : dev.devnames) { + if (h.size()) { + h += " "; + } + h += i.first + ":" + i.second; + } + tbl << dev.devid + << h + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + } else { + r = -ENOENT; + ss << "daemon " << who << " not found"; + } + cmdctx->reply(r, ss); + } + } else if (prefix == "device ls-by-host") { + string host; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "host", host); + set devids; + daemon_state.list_devids_by_server(host, &devids); + if (f) { + f->open_array_section("devices"); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&tbl, &host, now] (const DeviceState& dev) { + string n; + for (auto& j : dev.devnames) { + if (j.first == host) { + if (n.size()) { + n += " "; + } + n += j.second; + } + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << n + << d + << dev.get_life_expectancy_str(now) + 
<< TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device info") { + string devid; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "devid", devid); + int r = 0; + ostringstream rs; + if (!daemon_state.with_device(devid, + [&f, &rs] (const DeviceState& dev) { + if (f) { + f->dump_object("device", dev); + } else { + dev.print(rs); + } + })) { + ss << "device " << devid << " not found"; + r = -ENOENT; + } else { + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device set-life-expectancy") { + string devid; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "devid", devid); + string from_str, to_str; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "from", from_str); + cmd_getval(g_ceph_context, cmdctx->cmdmap, "to", to_str); + utime_t from, to; + if (!from.parse(from_str)) { + ss << "unable to parse datetime '" << from_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else if (to_str.size() && !to.parse(to_str)) { + ss << "unable to parse datetime '" << to_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else { + map meta; + daemon_state.with_device_create( + devid, + [from, to, &meta] (DeviceState& dev) { + dev.set_life_expectancy(from, to, ceph_clock_now()); + meta = dev.metadata; + }); + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } + return true; + } else if (prefix == "device rm-life-expectancy") { + string devid; + cmd_getval(g_ceph_context, cmdctx->cmdmap, "devid", devid); + map meta; + if 
(daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) { + dev.rm_life_expectancy(); + meta = dev.metadata; + })) { + string cmd; + bufferlist json; + if (meta.empty()) { + cmd = + "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } else { + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + json.append(json_spirit::write(json_object)); + cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } else { + cmdctx->reply(0, ss); + } + return true; + } else { + if (!pgmap_ready) { + ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n"; + } + if (f) { + f->open_object_section("pg_info"); + f->dump_bool("pg_ready", pgmap_ready); + } + + // fall back to feeding command to PGMap + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap, + f.get(), &ss, &cmdctx->odata); + }); + + if (f) { + f->close_section(); + } + if (r != -EOPNOTSUPP) { + if (f) { + f->flush(cmdctx->odata); + } + cmdctx->reply(r, ss); + return true; + } + } + + // Was the command unfound? 
+ if (py_command.cmdstring.empty()) { + ss << "No handler found for '" << prefix << "'"; + dout(4) << "No handler found for '" << prefix << "'" << dendl; + cmdctx->reply(-EINVAL, ss); + return true; + } + + dout(10) << "passing through " << cmdctx->cmdmap.size() << dendl; + finisher.queue(new FunctionContext([this, cmdctx, session, py_command, prefix] + (int r_) mutable { + std::stringstream ss; + + // Validate that the module is enabled + auto& py_handler_name = py_command.module_name; + PyModuleRef module = py_modules.get_module(py_handler_name); + ceph_assert(module); + if (!module->is_enabled()) { + ss << "Module '" << py_handler_name << "' is not enabled (required by " + "command '" << prefix << "'): use `ceph mgr module enable " + << py_handler_name << "` to enable it"; + dout(4) << ss.str() << dendl; + cmdctx->reply(-EOPNOTSUPP, ss); + return; + } + + // Hack: allow the self-test method to run on unhealthy modules. + // Fix this in future by creating a special path for self test rather + // than having the hook be a normal module command. 
+ std::string self_test_prefix = py_handler_name + " " + "self-test"; + + // Validate that the module is healthy + bool accept_command; + if (module->is_loaded()) { + if (module->get_can_run() && !module->is_failed()) { + // Healthy module + accept_command = true; + } else if (self_test_prefix == prefix) { + // Unhealthy, but allow because it's a self test command + accept_command = true; + } else { + accept_command = false; + ss << "Module '" << py_handler_name << "' has experienced an error and " + "cannot handle commands: " << module->get_error_string(); + } + } else { + // Module not loaded + accept_command = false; + ss << "Module '" << py_handler_name << "' failed to load and " + "cannot handle commands: " << module->get_error_string(); + } + + if (!accept_command) { + dout(4) << ss.str() << dendl; + cmdctx->reply(-EIO, ss); + return; + } + + std::stringstream ds; + bufferlist inbl = cmdctx->m->get_data(); + int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap, + inbl, &ds, &ss); + if (r == -EACCES) { + log_access_denied(cmdctx, session, ss); + } + + cmdctx->odata.append(ds); + cmdctx->reply(r, ss); + })); + return true; +} + +void DaemonServer::_prune_pending_service_map() +{ + utime_t cutoff = ceph_clock_now(); + cutoff -= g_conf().get_val("mgr_service_beacon_grace"); + auto p = pending_service_map.services.begin(); + while (p != pending_service_map.services.end()) { + auto q = p->second.daemons.begin(); + while (q != p->second.daemons.end()) { + DaemonKey key(p->first, q->first); + if (!daemon_state.exists(key)) { + if (ServiceMap::is_normal_ceph_entity(p->first)) { + dout(10) << "daemon " << key << " in service map but not in daemon state " + << "index -- force pruning" << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + derr << "missing key " << key << dendl; + ++q; + } + + continue; + } + + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + if 
(daemon->last_service_beacon == utime_t()) { + // we must have just restarted; assume they are alive now. + daemon->last_service_beacon = ceph_clock_now(); + ++q; + continue; + } + if (daemon->last_service_beacon < cutoff) { + dout(10) << "pruning stale " << p->first << "." << q->first + << " last_beacon " << daemon->last_service_beacon << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++q; + } + } + if (p->second.daemons.empty()) { + p = pending_service_map.services.erase(p); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++p; + } + } +} + +void DaemonServer::send_report() +{ + if (!pgmap_ready) { + if (ceph_clock_now() - started_at > g_conf().get_val("mgr_stats_period") * 4.0) { + pgmap_ready = true; + reported_osds.clear(); + dout(1) << "Giving up on OSDs that haven't reported yet, sending " + << "potentially incomplete PG state to mon" << dendl; + } else { + dout(1) << "Not sending PG status to monitor yet, waiting for OSDs" + << dendl; + return; + } + } + + auto m = new MMonMgrReport(); + py_modules.get_health_checks(&m->health_checks); + py_modules.get_progress_events(&m->progress_events); + + cluster_state.with_mutable_pgmap([&](PGMap& pg_map) { + cluster_state.update_delta_stats(); + + if (pending_service_map.epoch) { + _prune_pending_service_map(); + if (pending_service_map_dirty >= pending_service_map.epoch) { + pending_service_map.modified = ceph_clock_now(); + encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL); + dout(10) << "sending service_map e" << pending_service_map.epoch + << dendl; + pending_service_map.epoch++; + } + } + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // FIXME: no easy way to get mon features here. this will do for + // now, though, as long as we don't make a backward-incompat change. 
+ pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL); + dout(10) << pg_map << dendl; + + pg_map.get_health_checks(g_ceph_context, osdmap, + &m->health_checks); + + dout(10) << m->health_checks.checks.size() << " health checks" + << dendl; + dout(20) << "health checks:\n"; + JSONFormatter jf(true); + jf.dump_object("health_checks", m->health_checks); + jf.flush(*_dout); + *_dout << dendl; + if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { + clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map; + } + }); + }); + + map> accumulated; + for (auto service : {"osd", "mon"} ) { + auto daemons = daemon_state.get_by_service(service); + for (const auto& [key,state] : daemons) { + std::lock_guard l{state->lock}; + for (const auto& metric : state->daemon_health_metrics) { + auto acc = accumulated.find(metric.get_type()); + if (acc == accumulated.end()) { + auto collector = DaemonHealthMetricCollector::create(metric.get_type()); + if (!collector) { + derr << __func__ << " " << key.first << "." << key.second + << " sent me an unknown health metric: " + << std::hex << static_cast(metric.get_type()) + << std::dec << dendl; + continue; + } + dout(20) << " + " << state->key << " " + << metric << dendl; + tie(acc, std::ignore) = accumulated.emplace(metric.get_type(), + std::move(collector)); + } + acc->second->update(key, metric); + } + } + } + for (const auto& acc : accumulated) { + acc.second->summarize(m->health_checks); + } + // TODO? We currently do not notify the PyModules + // TODO: respect needs_send, so we send the report only if we are asked to do + // so, or the state is updated. 
+ monc->send_mon_message(m); +} + +void DaemonServer::adjust_pgs() +{ + dout(20) << dendl; + unsigned max = std::max(1, g_conf()->mon_osd_max_creating_pgs); + double max_misplaced = g_conf().get_val("target_max_misplaced_ratio"); + bool aggro = g_conf().get_val("mgr_debug_aggressive_pg_num_changes"); + + map pg_num_to_set; + map pgp_num_to_set; + set upmaps_to_clear; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + unsigned creating_or_unknown = 0; + for (auto& i : pg_map.num_pg_by_state) { + if ((i.first & (PG_STATE_CREATING)) || + i.first == 0) { + creating_or_unknown += i.second; + } + } + unsigned left = max; + if (creating_or_unknown >= max) { + return; + } + left -= creating_or_unknown; + dout(10) << "creating_or_unknown " << creating_or_unknown + << " max_creating " << max + << " left " << left + << dendl; + + // FIXME: These checks are fundamentally racy given that adjust_pgs() + // can run more frequently than we get updated pg stats from OSDs. We + // may make multiple adjustments with stale informaiton. + double misplaced_ratio, degraded_ratio; + double inactive_pgs_ratio, unknown_pgs_ratio; + pg_map.get_recovery_stats(&misplaced_ratio, °raded_ratio, + &inactive_pgs_ratio, &unknown_pgs_ratio); + dout(20) << "misplaced_ratio " << misplaced_ratio + << " degraded_ratio " << degraded_ratio + << " inactive_pgs_ratio " << inactive_pgs_ratio + << " unknown_pgs_ratio " << unknown_pgs_ratio + << "; target_max_misplaced_ratio " << max_misplaced + << dendl; + + for (auto& i : osdmap.get_pools()) { + const pg_pool_t& p = i.second; + + // adjust pg_num? 
+ if (p.get_pg_num_target() != p.get_pg_num()) { + dout(20) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " target " << p.get_pg_num_target() + << dendl; + if (p.has_flag(pg_pool_t::FLAG_CREATING)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - still creating initial pgs" + << dendl; + } else if (p.get_pg_num_target() < p.get_pg_num()) { + // pg_num decrease (merge) + pg_t merge_source(p.get_pg_num() - 1, i.first); + pg_t merge_target = merge_source.get_parent(); + bool ok = true; + + if (p.get_pg_num() != p.get_pg_num_pending()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease and pg_num_pending != pg_num, waiting" + << dendl; + ok = false; + } else if (p.get_pg_num() == p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease blocked by pgp_num " + << p.get_pgp_num() + << dendl; + ok = false; + } + vector source_acting; + for (auto &merge_participant : {merge_source, merge_target}) { + bool is_merge_source = merge_participant == merge_source; + if (osdmap.have_pg_upmaps(merge_participant)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " has upmap" << dendl; + upmaps_to_clear.insert(merge_participant); + ok = false; + } + auto q = pg_map.pg_stat.find(merge_participant); + if (q == pg_map.pg_stat.end()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - no state for " << merge_participant + << (is_merge_source ? 
" (merge source)" : " (merge target)") + << dendl; + ok = false; + } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) != + (PG_STATE_ACTIVE | PG_STATE_CLEAN)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " not clean (" << pg_state_string(q->second.state) + << ")" << dendl; + ok = false; + } + if (is_merge_source) { + source_acting = q->second.acting; + } else if (ok && q->second.acting != source_acting) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " acting does not match (source " << source_acting + << " != target " << q->second.acting + << ")" << dendl; + ok = false; + } + } + + if (ok) { + unsigned target = p.get_pg_num() - 1; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target + << " (merging " << merge_source + << " and " << merge_target + << ")" << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + } + } else if (p.get_pg_num_target() > p.get_pg_num()) { + // pg_num increase (split) + bool active = true; + auto q = pg_map.num_pg_by_pool_state.find(i.first); + if (q != pg_map.num_pg_by_pool_state.end()) { + for (auto& j : q->second) { + if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) { + dout(20) << "pool " << i.first << " has " << j.second + << " pgs in " << pg_state_string(j.first) + << dendl; + active = false; + break; + } + } + } else { + active = false; + } + if (!active) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - not all pgs active" + << dendl; + } else { + unsigned add = std::min( + left, + p.get_pg_num_target() - 
p.get_pg_num()); + unsigned target = p.get_pg_num() + add; + left -= add; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + } + } + } + + // adjust pgp_num? + unsigned target = std::min(p.get_pg_num_pending(), + p.get_pgp_num_target()); + if (target != p.get_pgp_num()) { + dout(20) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << target << dendl; + if (target > p.get_pgp_num() && + p.get_pgp_num() == p.get_pg_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - increase blocked by pg_num " << p.get_pg_num() + << dendl; + } else if (!aggro && (inactive_pgs_ratio > 0 || + degraded_ratio > 0 || + unknown_pgs_ratio > 0)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - inactive|degraded|unknown pgs, deferring pgp_num" + << " update" << dendl; + } else if (!aggro && (misplaced_ratio > max_misplaced)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - misplaced_ratio " << misplaced_ratio + << " > max " << max_misplaced + << ", deferring pgp_num update" << dendl; + } else { + // NOTE: this calculation assumes objects are + // basically uniformly distributed across all PGs + // (regardless of pool), which is probably not + // perfectly correct, but it's a start. make no + // single adjustment that's more than half of the + // max_misplaced, to somewhat limit the magnitude of + // our potential error here. + int next; + + pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first); + if (aggro || + // pool is (virtually) empty; just jump to final pgp_num? 
+ (p.get_pgp_num_target() > p.get_pgp_num() && + s.stats.sum.num_objects <= p.get_pgp_num_target())) { + next = target; + } else { + double room = + std::min(max_misplaced - misplaced_ratio, + max_misplaced / 2.0); + unsigned estmax = std::max( + (double)p.get_pg_num() * room, 1u); + int delta = target - p.get_pgp_num(); + next = p.get_pgp_num(); + if (delta < 0) { + next += std::max(-estmax, delta); + } else { + next += std::min(estmax, delta); + } + dout(20) << " room " << room << " estmax " << estmax + << " delta " << delta << " next " << next << dendl; + if (p.get_pgp_num_target() == p.get_pg_num_target() && + p.get_pgp_num_target() < p.get_pg_num()) { + // since pgp_num is tracking pg_num, ceph is handling + // pgp_num. so, be responsible: don't let pgp_num get + // too far out ahead of merges (if we are merging). + // this avoids moving lots of unmerged pgs onto a + // small number of OSDs where we might blow out the + // per-osd pg max. + unsigned max_outpace_merges = + std::max(8, p.get_pg_num() * max_misplaced); + if (next + max_outpace_merges < p.get_pg_num()) { + next = p.get_pg_num() - max_outpace_merges; + dout(10) << " using next " << next + << " to avoid outpacing merges (max_outpace_merges " + << max_outpace_merges << ")" << dendl; + } + } + } + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << next << dendl; + pgp_num_to_set[osdmap.get_pool_name(i.first)] = next; + } + } + if (left == 0) { + return; + } + } + }); + for (auto i : pg_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pg_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto i : pgp_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pgp_num_actual\", " + "\"val\": 
\"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto pg : upmaps_to_clear) { + const string cmd = + "{" + "\"prefix\": \"osd rm-pg-upmap\", " + "\"pgid\": \"" + stringify(pg) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + const string cmd2 = + "{" + "\"prefix\": \"osd rm-pg-upmap-items\", " + "\"pgid\": \"" + stringify(pg) + "\"" + + "}"; + monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr); + } +} + +void DaemonServer::got_service_map() +{ + std::lock_guard l(lock); + + cluster_state.with_servicemap([&](const ServiceMap& service_map) { + if (pending_service_map.epoch == 0) { + // we just started up + dout(10) << "got initial map e" << service_map.epoch << dendl; + pending_service_map = service_map; + pending_service_map.epoch = service_map.epoch + 1; + } else { + // we we already active and therefore must have persisted it, + // which means ours is the same or newer. + dout(10) << "got updated map e" << service_map.epoch << dendl; + ceph_assert(pending_service_map.epoch > service_map.epoch); + } + }); + + // cull missing daemons, populate new ones + std::set types; + for (auto& p : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(p.first)) { + continue; + } + + types.insert(p.first); + + std::set names; + for (auto& q : p.second.daemons) { + names.insert(q.first); + DaemonKey key(p.first, q.first); + if (!daemon_state.exists(key)) { + auto daemon = std::make_shared(daemon_state.types); + daemon->key = key; + daemon->set_metadata(q.second.metadata); + daemon->service_daemon = true; + daemon_state.insert(daemon); + dout(10) << "added missing " << key << dendl; + } + } + daemon_state.cull(p.first, names); + } + daemon_state.cull_services(types); +} + +void DaemonServer::got_mgr_map() +{ + std::lock_guard l(lock); + set have; + cluster_state.with_mgrmap([&](const MgrMap& mgrmap) { + auto md_update = [&] (DaemonKey key) { + 
std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + // FIXME remove post-nautilus: include 'id' for luminous mons + oss << "{\"prefix\": \"mgr metadata\", \"who\": \"" + << key.second << "\", \"id\": \"" << key.second << "\"}"; + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + }; + if (mgrmap.active_name.size()) { + DaemonKey key("mgr", mgrmap.active_name); + have.insert(mgrmap.active_name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + for (auto& i : mgrmap.standbys) { + DaemonKey key("mgr", i.second.name); + have.insert(i.second.name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + }); + daemon_state.cull("mgr", have); +} + +const char** DaemonServer::get_tracked_conf_keys() const +{ + static const char *KEYS[] = { + "mgr_stats_threshold", + "mgr_stats_period", + nullptr + }; + + return KEYS; +} + +void DaemonServer::handle_conf_change(const ConfigProxy& conf, + const std::set &changed) +{ + + if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) { + dout(4) << "Updating stats threshold/period on " + << daemon_connections.size() << " clients" << dendl; + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new FunctionContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + _send_configure(c); + } + })); + } +} + +void DaemonServer::_send_configure(ConnectionRef c) +{ + ceph_assert(lock.is_locked_by_me()); + + auto configure = new MMgrConfigure(); + configure->stats_period = g_conf().get_val("mgr_stats_period"); + configure->stats_threshold = g_conf().get_val("mgr_stats_threshold"); + + if (c->peer_is_osd()) { + 
configure->osd_perf_metric_queries = + osd_perf_metric_collector.get_queries(); + } + + c->send_message(configure); +} + +OSDPerfMetricQueryID DaemonServer::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional &limit) +{ + return osd_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_osd_perf_query(OSDPerfMetricQueryID query_id) +{ + return osd_perf_metric_collector.remove_query(query_id); +} + +int DaemonServer::get_osd_perf_counters( + OSDPerfMetricQueryID query_id, + std::map *counters) +{ + return osd_perf_metric_collector.get_counters(query_id, counters); +} diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h new file mode 100644 index 00000000..3c73d84c --- /dev/null +++ b/src/mgr/DaemonServer.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_SERVER_H_ +#define DAEMON_SERVER_H_ + +#include "PyModuleRegistry.h" + +#include +#include + +#include "common/Mutex.h" +#include "common/LogClient.h" +#include "common/Timer.h" + +#include +#include + +#include "ServiceMap.h" +#include "MgrSession.h" +#include "DaemonState.h" +#include "OSDPerfMetricCollector.h" + +class MMgrReport; +class MMgrOpen; +class MMgrClose; +class MMonMgrReport; +class MCommand; +struct MonCommand; +class CommandContext; +struct OSDPerfMetricQuery; + + +/** + * Server used in ceph-mgr to communicate with Ceph daemons like + * MDSs and OSDs. 
+ */ +class DaemonServer : public Dispatcher, public md_config_obs_t +{ +protected: + boost::scoped_ptr client_byte_throttler; + boost::scoped_ptr client_msg_throttler; + boost::scoped_ptr osd_byte_throttler; + boost::scoped_ptr osd_msg_throttler; + boost::scoped_ptr mds_byte_throttler; + boost::scoped_ptr mds_msg_throttler; + boost::scoped_ptr mon_byte_throttler; + boost::scoped_ptr mon_msg_throttler; + + Messenger *msgr; + MonClient *monc; + Finisher &finisher; + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + PyModuleRegistry &py_modules; + LogChannelRef clog, audit_clog; + + // Connections for daemons, and clients with service names set + // (i.e. those MgrClients that are allowed to send MMgrReports) + std::set daemon_connections; + + /// connections for osds + ceph::unordered_map> osd_cons; + + ServiceMap pending_service_map; // uncommitted + + epoch_t pending_service_map_dirty = 0; + + Mutex lock; + + static void _generate_command_map(cmdmap_t& cmdmap, + map ¶m_str_map); + static const MonCommand *_get_mgrcommand(const string &cmd_prefix, + const std::vector &commands); + bool _allowed_command( + MgrSession *s, const string &service, const string &module, + const string &prefix, const cmdmap_t& cmdmap, + const map& param_str_map, + const MonCommand *this_cmd); + +private: + friend class ReplyOnFinish; + bool _reply(MCommand* m, + int ret, const std::string& s, const bufferlist& payload); + + void _prune_pending_service_map(); + + utime_t started_at; + std::atomic pgmap_ready; + std::set reported_osds; + void maybe_ready(int32_t osd_id); + + SafeTimer timer; + bool shutting_down; + Context *tick_event; + void tick(); + void schedule_tick_locked(double delay_sec); + + class OSDPerfMetricCollectorListener : + public OSDPerfMetricCollector::Listener { + public: + OSDPerfMetricCollectorListener(DaemonServer *server) + : server(server) { + } + void handle_query_updated() override { + server->handle_osd_perf_metric_query_updated(); + } + private: + 
DaemonServer *server; + }; + OSDPerfMetricCollectorListener osd_perf_metric_collector_listener; + OSDPerfMetricCollector osd_perf_metric_collector; + void handle_osd_perf_metric_query_updated(); + + void update_task_status(DaemonKey key, MMgrReport *m); + +public: + int init(uint64_t gid, entity_addrvec_t client_addrs); + void shutdown(); + + entity_addrvec_t get_myaddrs() const; + + DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef cl, + LogChannelRef auditcl); + ~DaemonServer() override; + + bool ms_dispatch(Message *m) override; + int ms_handle_authentication(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override; + KeyStore *ms_get_auth1_authorizer_keystore() override; + + bool handle_open(MMgrOpen *m); + bool handle_close(MMgrClose *m); + bool handle_report(MMgrReport *m); + bool handle_command(MCommand *m); + bool _handle_command(MCommand *m, std::shared_ptr& cmdctx); + void send_report(); + void got_service_map(); + void got_mgr_map(); + void adjust_pgs(); + + void _send_configure(ConnectionRef c); + + OSDPerfMetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional &limit); + int remove_osd_perf_query(OSDPerfMetricQueryID query_id); + int get_osd_perf_counters(OSDPerfMetricQueryID query_id, + std::map *c); + + virtual const char** get_tracked_conf_keys() const override; + virtual void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override; + + void schedule_tick(double delay_sec); + + void log_access_denied(std::shared_ptr& cmdctx, + MgrSession* session, std::stringstream& ss); + void dump_pg_ready(ceph::Formatter *f); +}; + +#endif + diff --git a/src/mgr/DaemonState.cc 
b/src/mgr/DaemonState.cc new file mode 100644 index 00000000..a276b395 --- /dev/null +++ b/src/mgr/DaemonState.cc @@ -0,0 +1,347 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "DaemonState.h" + +#include "MgrSession.h" +#include "include/stringify.h" +#include "common/Formatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +void DeviceState::set_metadata(map&& m) +{ + metadata = std::move(m); + auto p = metadata.find("life_expectancy_min"); + if (p != metadata.end()) { + life_expectancy.first.parse(p->second); + } + p = metadata.find("life_expectancy_max"); + if (p != metadata.end()) { + life_expectancy.second.parse(p->second); + } + p = metadata.find("life_expectancy_stamp"); + if (p != metadata.end()) { + life_expectancy_stamp.parse(p->second); + } +} + +void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now) +{ + life_expectancy = make_pair(from, to); + life_expectancy_stamp = now; + if (from != utime_t()) { + metadata["life_expectancy_min"] = from; + } else { + metadata["life_expectancy_min"] = ""; + } + if (to != utime_t()) { + metadata["life_expectancy_max"] = to; + } else { + metadata["life_expectancy_max"] = ""; + } + if (now != utime_t()) { + metadata["life_expectancy_stamp"] = stringify(now); + } else { + metadata["life_expectancy_stamp"] = ""; + } +} + +void DeviceState::rm_life_expectancy() +{ + life_expectancy = make_pair(utime_t(), utime_t()); + life_expectancy_stamp = utime_t(); + metadata.erase("life_expectancy_min"); + 
metadata.erase("life_expectancy_max"); + metadata.erase("life_expectancy_stamp"); +} + +string DeviceState::get_life_expectancy_str(utime_t now) const +{ + if (life_expectancy.first == utime_t()) { + return string(); + } + if (now >= life_expectancy.first) { + return "now"; + } + utime_t min = life_expectancy.first - now; + utime_t max = life_expectancy.second - now; + if (life_expectancy.second == utime_t()) { + return string(">") + timespan_str(make_timespan(min)); + } + string a = timespan_str(make_timespan(min)); + string b = timespan_str(make_timespan(max)); + if (a == b) { + return a; + } + return a + " to " + b; +} + +void DeviceState::dump(Formatter *f) const +{ + f->dump_string("devid", devid); + f->open_array_section("location"); + for (auto& i : devnames) { + f->open_object_section("attachment"); + f->dump_string("host", i.first); + f->dump_string("dev", i.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("daemons"); + for (auto& i : daemons) { + f->dump_string("daemon", to_string(i)); + } + f->close_section(); + if (life_expectancy.first != utime_t()) { + f->dump_stream("life_expectancy_min") << life_expectancy.first; + f->dump_stream("life_expectancy_max") << life_expectancy.second; + f->dump_stream("life_expectancy_stamp") + << life_expectancy_stamp; + } +} + +void DeviceState::print(ostream& out) const +{ + out << "device " << devid << "\n"; + for (auto& i : devnames) { + out << "attachment " << i.first << ":" << i.second << "\n"; + } + set d; + for (auto& j : daemons) { + d.insert(to_string(j)); + } + out << "daemons " << d << "\n"; + if (life_expectancy.first != utime_t()) { + out << "life_expectancy " << life_expectancy.first << " to " + << life_expectancy.second + << " (as of " << life_expectancy_stamp << ")\n"; + } +} + +void DaemonStateIndex::insert(DaemonStatePtr dm) +{ + RWLock::WLocker l(lock); + _insert(dm); +} + +void DaemonStateIndex::_insert(DaemonStatePtr dm) +{ + if (all.count(dm->key)) { + 
_erase(dm->key); + } + + by_server[dm->hostname][dm->key] = dm; + all[dm->key] = dm; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + d->daemons.insert(dm->key); + d->devnames.insert(make_pair(dm->hostname, i.second)); + } +} + +void DaemonStateIndex::_erase(const DaemonKey& dmk) +{ + ceph_assert(lock.is_wlocked()); + + const auto to_erase = all.find(dmk); + ceph_assert(to_erase != all.end()); + const auto dm = to_erase->second; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + ceph_assert(d->daemons.count(dmk)); + d->daemons.erase(dmk); + d->devnames.erase(make_pair(dm->hostname, i.second)); + if (d->empty()) { + _erase_device(d); + } + } + + auto &server_collection = by_server[dm->hostname]; + server_collection.erase(dm->key); + if (server_collection.empty()) { + by_server.erase(dm->hostname); + } + + all.erase(to_erase); +} + +DaemonStateCollection DaemonStateIndex::get_by_service( + const std::string& svc) const +{ + RWLock::RLocker l(lock); + + DaemonStateCollection result; + + for (const auto &i : all) { + if (i.first.first == svc) { + result[i.first] = i.second; + } + } + + return result; +} + +DaemonStateCollection DaemonStateIndex::get_by_server( + const std::string &hostname) const +{ + RWLock::RLocker l(lock); + + if (by_server.count(hostname)) { + return by_server.at(hostname); + } else { + return {}; + } +} + +bool DaemonStateIndex::exists(const DaemonKey &key) const +{ + RWLock::RLocker l(lock); + + return all.count(key) > 0; +} + +DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key) +{ + RWLock::RLocker l(lock); + + auto iter = all.find(key); + if (iter != all.end()) { + return iter->second; + } else { + return nullptr; + } +} + +void DaemonStateIndex::rm(const DaemonKey &key) +{ + RWLock::WLocker l(lock); + _rm(key); +} + +void DaemonStateIndex::_rm(const DaemonKey &key) +{ + if (all.count(key)) { + _erase(key); + } +} + +void DaemonStateIndex::cull(const std::string& svc_name, + const 
std::set& names_exist) +{ + std::vector victims; + + RWLock::WLocker l(lock); + auto begin = all.lower_bound({svc_name, ""}); + auto end = all.end(); + for (auto &i = begin; i != end; ++i) { + const auto& daemon_key = i->first; + if (daemon_key.first != svc_name) + break; + if (names_exist.count(daemon_key.second) == 0) { + victims.push_back(daemon_key.second); + } + } + + for (auto &i : victims) { + DaemonKey daemon_key{svc_name, i}; + dout(4) << "Removing data for " << daemon_key << dendl; + _erase(daemon_key); + } +} + +void DaemonStateIndex::cull_services(const std::set& types_exist) +{ + std::set victims; + + RWLock::WLocker l(lock); + for (auto it = all.begin(); it != all.end(); ++it) { + const auto& daemon_key = it->first; + if (it->second->service_daemon && + types_exist.count(daemon_key.first) == 0) { + victims.insert(daemon_key); + } + } + + for (auto &i : victims) { + dout(4) << "Removing data for " << i << dendl; + _erase(i); + } +} + +void DaemonPerfCounters::update(MMgrReport *report) +{ + dout(20) << "loading " << report->declare_types.size() << " new types, " + << report->undeclare_types.size() << " old types, had " + << types.size() << " types, got " + << report->packed.length() << " bytes of data" << dendl; + + // Retrieve session state + auto priv = report->get_connection()->get_priv(); + auto session = static_cast(priv.get()); + + // Load any newly declared types + for (const auto &t : report->declare_types) { + types.insert(std::make_pair(t.path, t)); + session->declared_types.insert(t.path); + } + // Remove any old types + for (const auto &t : report->undeclare_types) { + session->declared_types.erase(t); + } + + const auto now = ceph_clock_now(); + + // Parse packed data according to declared set of types + auto p = report->packed.cbegin(); + DECODE_START(1, p); + for (const auto &t_path : session->declared_types) { + const auto &t = types.at(t_path); + auto instances_it = instances.find(t_path); + // Always check the instance exists, as we 
don't prevent yet + // multiple sessions from daemons with the same name, and one + // session clearing stats created by another on open. + if (instances_it == instances.end()) { + instances_it = instances.insert({t_path, t.type}).first; + } + uint64_t val = 0; + uint64_t avgcount = 0; + uint64_t avgcount2 = 0; + + decode(val, p); + if (t.type & PERFCOUNTER_LONGRUNAVG) { + decode(avgcount, p); + decode(avgcount2, p); + instances_it->second.push_avg(now, val, avgcount); + } else { + instances_it->second.push(now, val); + } + } + DECODE_FINISH(p); +} + +void PerfCounterInstance::push(utime_t t, uint64_t const &v) +{ + buffer.push_back({t, v}); +} + +void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s, + uint64_t const &c) +{ + avg_buffer.push_back({t, s, c}); +} diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h new file mode 100644 index 00000000..0661f61a --- /dev/null +++ b/src/mgr/DaemonState.h @@ -0,0 +1,400 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_STATE_H_ +#define DAEMON_STATE_H_ + +#include +#include +#include +#include +#include + +#include "common/RWLock.h" +#include "include/str_map.h" + +#include "msg/msg_types.h" + +// For PerfCounterType +#include "messages/MMgrReport.h" + +namespace ceph { + class Formatter; +} + +// Unique reference to a daemon within a cluster +typedef std::pair DaemonKey; + +static inline std::string to_string(const DaemonKey& dk) { + return dk.first + "." + dk.second; +} + +// An instance of a performance counter type, within +// a particular daemon. 
+class PerfCounterInstance +{ + class DataPoint + { + public: + utime_t t; + uint64_t v; + DataPoint(utime_t t_, uint64_t v_) + : t(t_), v(v_) + {} + }; + + class AvgDataPoint + { + public: + utime_t t; + uint64_t s; + uint64_t c; + AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_) + : t(t_), s(s_), c(c_) + {} + }; + + boost::circular_buffer buffer; + boost::circular_buffer avg_buffer; + + uint64_t get_current() const; + + public: + const boost::circular_buffer & get_data() const + { + return buffer; + } + const DataPoint& get_latest_data() const + { + return buffer.back(); + } + const boost::circular_buffer & get_data_avg() const + { + return avg_buffer; + } + const AvgDataPoint& get_latest_data_avg() const + { + return avg_buffer.back(); + } + void push(utime_t t, uint64_t const &v); + void push_avg(utime_t t, uint64_t const &s, uint64_t const &c); + + PerfCounterInstance(enum perfcounter_type_d type) + { + if (type & PERFCOUNTER_LONGRUNAVG) + avg_buffer = boost::circular_buffer(20); + else + buffer = boost::circular_buffer(20); + }; +}; + + +typedef std::map PerfCounterTypes; + +// Performance counters for one daemon +class DaemonPerfCounters +{ + public: + // The record of perf stat types, shared between daemons + PerfCounterTypes &types; + + explicit DaemonPerfCounters(PerfCounterTypes &types_) + : types(types_) + {} + + std::map instances; + + void update(MMgrReport *report); + + void clear() + { + instances.clear(); + } +}; + +// The state that we store about one daemon +class DaemonState +{ + public: + Mutex lock = {"DaemonState::lock"}; + + DaemonKey key; + + // The hostname where daemon was last seen running (extracted + // from the metadata) + std::string hostname; + + // The metadata (hostname, version, etc) sent from the daemon + std::map metadata; + + /// device ids -> devname, derived from metadata[device_ids] + std::map devices; + + // TODO: this can be generalized to other daemons + std::vector daemon_health_metrics; + + // Ephemeral state + bool 
service_daemon = false; + utime_t service_status_stamp; + std::map service_status; + utime_t last_service_beacon; + + // running config + std::map> config; + + // mon config values we failed to set + std::map ignored_mon_config; + + // compiled-in config defaults (rarely used, so we leave them encoded!) + bufferlist config_defaults_bl; + std::map config_defaults; + + // The perf counters received in MMgrReport messages + DaemonPerfCounters perf_counters; + + explicit DaemonState(PerfCounterTypes &types_) + : perf_counters(types_) + { + } + + void set_metadata(const std::map& m) { + devices.clear(); + metadata = m; + auto p = m.find("device_ids"); + if (p != m.end()) { + map devs; + get_str_map(p->second, &devs, ",; "); + for (auto& i : devs) { + if (i.second.size()) { // skip blank ids + devices[i.second] = i.first; + } + } + } + p = m.find("hostname"); + if (p != m.end()) { + hostname = p->second; + } + } + + const std::map& _get_config_defaults() { + if (config_defaults.empty() && + config_defaults_bl.length()) { + auto p = config_defaults_bl.cbegin(); + try { + decode(config_defaults, p); + } catch (buffer::error& e) { + } + } + return config_defaults; + } +}; + +typedef std::shared_ptr DaemonStatePtr; +typedef std::map DaemonStateCollection; + + +struct DeviceState : public RefCountedObject +{ + std::string devid; + std::set> devnames; ///< (server,devname) + std::set daemons; + + std::map metadata; ///< persistent metadata + + pair life_expectancy; ///< when device failure is expected + utime_t life_expectancy_stamp; ///< when life expectency was recorded + + DeviceState(const std::string& n) + : RefCountedObject(nullptr, 0), + devid(n) {} + + void set_metadata(map&& m); + + void set_life_expectancy(utime_t from, utime_t to, utime_t now); + void rm_life_expectancy(); + + string get_life_expectancy_str(utime_t now) const; + + /// true of we can be safely forgotten/removed from memory + bool empty() const { + return daemons.empty() && metadata.empty(); + } + + 
void dump(Formatter *f) const; + void print(ostream& out) const; +}; + +typedef boost::intrusive_ptr DeviceStateRef; + +/** + * Fuse the collection of per-daemon metadata from Ceph into + * a view that can be queried by service type, ID or also + * by server (aka fqdn). + */ +class DaemonStateIndex +{ +private: + mutable RWLock lock = {"DaemonStateIndex", true, true, true}; + + std::map by_server; + DaemonStateCollection all; + std::set updating; + + std::map devices; + + void _erase(const DaemonKey& dmk); + + DeviceStateRef _get_or_create_device(const std::string& dev) { + auto p = devices.find(dev); + if (p != devices.end()) { + return p->second; + } + devices[dev] = new DeviceState(dev); + return devices[dev]; + } + void _erase_device(DeviceStateRef d) { + devices.erase(d->devid); + } + +public: + DaemonStateIndex() {} + + // FIXME: shouldn't really be public, maybe construct DaemonState + // objects internally to avoid this. + PerfCounterTypes types; + + void insert(DaemonStatePtr dm); + void _insert(DaemonStatePtr dm); + bool exists(const DaemonKey &key) const; + DaemonStatePtr get(const DaemonKey &key); + void rm(const DaemonKey &key); + void _rm(const DaemonKey &key); + + // Note that these return by value rather than reference to avoid + // callers needing to stay in lock while using result. Callers must + // still take the individual DaemonState::lock on each entry though. + DaemonStateCollection get_by_server(const std::string &hostname) const; + DaemonStateCollection get_by_service(const std::string &svc_name) const; + DaemonStateCollection get_all() const {return all;} + + template + auto with_daemons_by_server(Callback&& cb, Args&&... args) const -> + decltype(cb(by_server, std::forward(args)...)) { + RWLock::RLocker l(lock); + + return std::forward(cb)(by_server, std::forward(args)...); + } + + template + bool with_device(const std::string& dev, + Callback&& cb, Args&&... 
args) const { + RWLock::RLocker l(lock); + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward(cb)(*p->second, std::forward(args)...); + return true; + } + + template + bool with_device_write(const std::string& dev, + Callback&& cb, Args&&... args) { + RWLock::WLocker l(lock); + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward(cb)(*p->second, std::forward(args)...); + if (p->second->empty()) { + _erase_device(p->second); + } + return true; + } + + template + void with_device_create(const std::string& dev, + Callback&& cb, Args&&... args) { + RWLock::WLocker l(lock); + auto d = _get_or_create_device(dev); + std::forward(cb)(*d, std::forward(args)...); + } + + template + void with_devices(Callback&& cb, Args&&... args) const { + RWLock::RLocker l(lock); + for (auto& i : devices) { + std::forward(cb)(*i.second, std::forward(args)...); + } + } + + template + void with_devices2(CallbackInitial&& cbi, // with lock taken + Callback&& cb, // for each device + Args&&... 
args) const { + RWLock::RLocker l(lock); + cbi(); + for (auto& i : devices) { + std::forward(cb)(*i.second, std::forward(args)...); + } + } + + void list_devids_by_server(const std::string& server, + std::set *ls) { + auto m = get_by_server(server); + for (auto& i : m) { + std::lock_guard l(i.second->lock); + for (auto& j : i.second->devices) { + ls->insert(j.first); + } + } + } + + void notify_updating(const DaemonKey &k) { + RWLock::WLocker l(lock); + updating.insert(k); + } + void clear_updating(const DaemonKey &k) { + RWLock::WLocker l(lock); + updating.erase(k); + } + bool is_updating(const DaemonKey &k) { + RWLock::RLocker l(lock); + return updating.count(k) > 0; + } + + void update_metadata(DaemonStatePtr state, + const map& meta) { + // remove and re-insert in case the device metadata changed + RWLock::WLocker l(lock); + _rm(state->key); + { + Mutex::Locker l2(state->lock); + state->set_metadata(meta); + } + _insert(state); + } + + /** + * Remove state for all daemons of this type whose names are + * not present in `names_exist`. Use this function when you have + * a cluster map and want to ensure that anything absent in the map + * is also absent in this class. + */ + void cull(const std::string& svc_name, + const std::set& names_exist); + void cull_services(const std::set& types_exist); +}; + +#endif + diff --git a/src/mgr/Gil.cc b/src/mgr/Gil.cc new file mode 100644 index 00000000..d476c717 --- /dev/null +++ b/src/mgr/Gil.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "Python.h" + +#include "common/debug.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +#include "Gil.h" + +SafeThreadState::SafeThreadState(PyThreadState *ts_) + : ts(ts_) +{ + ceph_assert(ts != nullptr); + thread = pthread_self(); +} + +Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts) +{ + // Acquire the GIL, set the current thread state + PyEval_RestoreThread(pThreadState.ts); + dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl; + + // + // If called from a separate OS thread (i.e. a thread not created + // by Python, that does't already have a python thread state that + // was created when that thread was active), we need to manually + // create and switch to a python thread state specifically for this + // OS thread. + // + // Note that instead of requring the caller to set new_thread == true + // when calling this from a separate OS thread, we could figure out + // if this was necessary automatically, as follows: + // + // if (pThreadState->thread_id != PyThread_get_thread_ident()) { + // + // However, this means we're accessing pThreadState->thread_id, but + // the Python C API docs say that "The only public data member is + // PyInterpreterState *interp", i.e. doing this would violate + // something that's meant to be a black box. 
+ // + if (new_thread) { + pNewThreadState = PyThreadState_New(pThreadState.ts->interp); + PyThreadState_Swap(pNewThreadState); + dout(20) << "Switched to new thread state " << pNewThreadState << dendl; + } else { + ceph_assert(pthread_self() == pThreadState.thread); + } +} + +Gil::~Gil() +{ + if (pNewThreadState != nullptr) { + dout(20) << "Destroying new thread state " << pNewThreadState << dendl; + PyThreadState_Swap(pThreadState.ts); + PyThreadState_Clear(pNewThreadState); + PyThreadState_Delete(pNewThreadState); + } + // Release the GIL, reset the thread state to NULL + PyEval_SaveThread(); + dout(25) << "GIL released for thread state " << pThreadState.ts << dendl; +} + diff --git a/src/mgr/Gil.h b/src/mgr/Gil.h new file mode 100644 index 00000000..bff2d233 --- /dev/null +++ b/src/mgr/Gil.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +struct _ts; +typedef struct _ts PyThreadState; + +#include + + +/** + * Wrap PyThreadState to carry a record of which POSIX thread + * the thread state relates to. This allows the Gil class to + * validate that we're being used from the right thread. + */ +class SafeThreadState +{ + public: + explicit SafeThreadState(PyThreadState *ts_); + + SafeThreadState() + : ts(nullptr), thread(0) + { + } + + PyThreadState *ts; + pthread_t thread; + + void set(PyThreadState *ts_) + { + ts = ts_; + thread = pthread_self(); + } +}; + +// +// Use one of these in any scope in which you need to hold Python's +// Global Interpreter Lock. 
+// +// Do *not* nest these, as a second GIL acquire will deadlock (see +// https://docs.python.org/2/c-api/init.html#c.PyEval_RestoreThread) +// +// If in doubt, explicitly put a scope around the block of code you +// know you need the GIL in. +// +// See the comment in Gil::Gil for when to set new_thread == true +// +class Gil { +public: + Gil(const Gil&) = delete; + Gil& operator=(const Gil&) = delete; + + Gil(SafeThreadState &ts, bool new_thread = false); + ~Gil(); + +private: + SafeThreadState &pThreadState; + PyThreadState *pNewThreadState = nullptr; +}; + diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc new file mode 100644 index 00000000..5dee6326 --- /dev/null +++ b/src/mgr/Mgr.cc @@ -0,0 +1,692 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/errno.h" +#include "mon/MonClient.h" +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#include "mgr/MgrContext.h" + +#include "DaemonServer.h" +#include "messages/MMgrDigest.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MLog.h" +#include "messages/MServiceMap.h" +#include "PyModule.h" +#include "Mgr.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) : + monc(monc_), + objecter(objecter_), + client(client_), + client_messenger(clientm_), + lock("Mgr::lock"), + finisher(g_ceph_context, "Mgr", "mgr-fin"), + digest_received(false), + py_module_registry(py_module_registry_), + cluster_state(monc, nullptr, mgrmap), + server(monc, finisher, daemon_state, cluster_state, *py_module_registry, + clog_, audit_clog_), + clog(clog_), + audit_clog(audit_clog_), + initialized(false), + initializing(false) +{ + cluster_state.set_objecter(objecter); +} + + +Mgr::~Mgr() +{ +} + +void MetadataUpdate::finish(int r) +{ + daemon_state.clear_updating(key); + if (r == 0) { + if (key.first == "mds" || key.first == "osd" || + key.first == "mgr" || key.first == "mon") { + json_spirit::mValue json_result; + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + dout(1) << "mon returned invalid JSON for " + << key.first << "." << key.second << dendl; + return; + } + if (json_result.type() != json_spirit::obj_type) { + dout(1) << "mon returned valid JSON " + << key.first << "." 
<< key.second + << " but not an object: '" << outbl.to_str() << "'" << dendl; + return; + } + dout(4) << "mon returned valid metadata JSON for " + << key.first << "." << key.second << dendl; + + json_spirit::mObject daemon_meta = json_result.get_obj(); + + // Skip daemon who doesn't have hostname yet + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry for " + << key.first << "." << key.second << dendl; + return; + } + + // Apply any defaults + for (const auto &i : defaults) { + if (daemon_meta.find(i.first) == daemon_meta.end()) { + daemon_meta[i.first] = i.second; + } + } + + DaemonStatePtr state; + if (daemon_state.exists(key)) { + state = daemon_state.get(key); + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.first == "mds" || key.first == "mgr" || key.first == "mon") { + daemon_meta.erase("name"); + } else if (key.first == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + map m; + for (const auto &i : daemon_meta) { + m[i.first] = i.second.get_str(); + } + + daemon_state.update_metadata(state, m); + } else { + state = std::make_shared(daemon_state.types); + state->key = key; + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.first == "mds" || key.first == "mgr" || key.first == "mon") { + daemon_meta.erase("name"); + } else if (key.first == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + + map m; + for (const auto &i : daemon_meta) { + m[i.first] = i.second.get_str(); + } + state->set_metadata(m); + + daemon_state.insert(state); + } + } else { + ceph_abort(); + } + } else { + dout(1) << "mon failed to return metadata for " + << key.first << "." 
<< key.second << ": " + << cpp_strerror(r) << dendl; + } +} + +void Mgr::background_init(Context *completion) +{ + std::lock_guard l(lock); + ceph_assert(!initializing); + ceph_assert(!initialized); + initializing = true; + + finisher.start(); + + finisher.queue(new FunctionContext([this, completion](int r){ + init(); + completion->complete(0); + })); +} + +std::map Mgr::load_store() +{ + ceph_assert(lock.is_locked_by_me()); + + dout(10) << "listing keys" << dendl; + JSONCommand cmd; + cmd.run(monc, "{\"prefix\": \"config-key ls\"}"); + lock.Unlock(); + cmd.wait(); + lock.Lock(); + ceph_assert(cmd.r == 0); + + std::map loaded; + + for (auto &key_str : cmd.json_result.get_array()) { + std::string const key = key_str.get_str(); + + dout(20) << "saw key '" << key << "'" << dendl; + + const std::string config_prefix = PyModule::config_prefix; + const std::string device_prefix = "device/"; + + if (key.substr(0, config_prefix.size()) == config_prefix || + key.substr(0, device_prefix.size()) == device_prefix) { + dout(20) << "fetching '" << key << "'" << dendl; + Command get_cmd; + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}"; + get_cmd.run(monc, cmd_json.str()); + lock.Unlock(); + get_cmd.wait(); + lock.Lock(); + if (get_cmd.r == 0) { // tolerate racing config-key change + if (key.substr(0, device_prefix.size()) == device_prefix) { + // device/ + string devid = key.substr(device_prefix.size()); + map meta; + ostringstream ss; + string val = get_cmd.outbl.to_str(); + int r = get_json_str_map(val, ss, &meta, false); + if (r < 0) { + derr << __func__ << " failed to parse " << val << ": " << ss.str() + << dendl; + } else { + daemon_state.with_device_create( + devid, [&meta] (DeviceState& dev) { + dev.set_metadata(std::move(meta)); + }); + } + } else { + // config/ + loaded[key] = get_cmd.outbl.to_str(); + } + } + } + } + + return loaded; +} + +void Mgr::init() +{ + std::lock_guard l(lock); + 
ceph_assert(initializing); + ceph_assert(!initialized); + + // Start communicating with daemons to learn statistics etc + int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs()); + if (r < 0) { + derr << "Initialize server fail: " << cpp_strerror(r) << dendl; + // This is typically due to a bind() failure, so let's let + // systemd restart us. + exit(1); + } + dout(4) << "Initialized server at " << server.get_myaddrs() << dendl; + + // Preload all daemon metadata (will subsequently keep this + // up to date by watching maps, so do the initial load before + // we subscribe to any maps) + dout(4) << "Loading daemon metadata..." << dendl; + load_all_metadata(); + + // subscribe to all the maps + monc->sub_want("log-info", 0, 0); + monc->sub_want("mgrdigest", 0, 0); + monc->sub_want("fsmap", 0, 0); + monc->sub_want("servicemap", 0, 0); + + dout(4) << "waiting for OSDMap..." << dendl; + // Subscribe to OSDMap update to pass on to ClusterState + objecter->maybe_request_map(); + + // reset the mon session. we get these maps through subscriptions which + // are stateful with the connection, so even if *we* don't have them a + // previous incarnation sharing the same MonClient may have. + monc->reopen_session(); + + // Start Objecter and wait for OSD map + lock.Unlock(); // Drop lock because OSDMap dispatch calls into my ms_dispatch + objecter->wait_for_osd_map(); + lock.Lock(); + + // Populate PGs in ClusterState + cluster_state.with_osdmap_and_pgmap([this](const OSDMap &osd_map, + const PGMap& pg_map) { + cluster_state.notify_osdmap(osd_map); + }); + + // Wait for FSMap + dout(4) << "waiting for FSMap..." << dendl; + while (!cluster_state.have_fsmap()) { + fs_map_cond.Wait(lock); + } + + dout(4) << "waiting for config-keys..." << dendl; + + // Wait for MgrDigest... + dout(4) << "waiting for MgrDigest..." 
<< dendl; + while (!digest_received) { + digest_cond.Wait(lock); + } + + // Load module KV store + auto kv_store = load_store(); + + // Migrate config from KV store on luminous->mimic + // drop lock because we do blocking config sets to mon + lock.Unlock(); + py_module_registry->upgrade_config(monc, kv_store); + lock.Lock(); + + // assume finisher already initialized in background_init + dout(4) << "starting python modules..." << dendl; + py_module_registry->active_start(daemon_state, cluster_state, + kv_store, *monc, clog, audit_clog, *objecter, *client, + finisher, server); + + cluster_state.final_init(); + + dout(4) << "Complete." << dendl; + initializing = false; + initialized = true; +} + +void Mgr::load_all_metadata() +{ + ceph_assert(lock.is_locked_by_me()); + + JSONCommand mds_cmd; + mds_cmd.run(monc, "{\"prefix\": \"mds metadata\"}"); + JSONCommand osd_cmd; + osd_cmd.run(monc, "{\"prefix\": \"osd metadata\"}"); + JSONCommand mon_cmd; + mon_cmd.run(monc, "{\"prefix\": \"mon metadata\"}"); + + lock.Unlock(); + mds_cmd.wait(); + osd_cmd.wait(); + mon_cmd.wait(); + lock.Lock(); + + ceph_assert(mds_cmd.r == 0); + ceph_assert(mon_cmd.r == 0); + ceph_assert(osd_cmd.r == 0); + + for (auto &metadata_val : mds_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared(daemon_state.types); + dm->key = DaemonKey("mds", + daemon_meta.at("name").get_str()); + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + for (const auto &i : daemon_meta) { + dm->metadata[i.first] = i.second.get_str(); + } + + daemon_state.insert(dm); + } + + for (auto &metadata_val : mon_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << 
"Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared(daemon_state.types); + dm->key = DaemonKey("mon", + daemon_meta.at("name").get_str()); + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + map m; + for (const auto &i : daemon_meta) { + m[i.first] = i.second.get_str(); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } + + for (auto &osd_metadata_val : osd_cmd.json_result.get_array()) { + json_spirit::mObject osd_metadata = osd_metadata_val.get_obj(); + if (osd_metadata.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + dout(4) << osd_metadata.at("hostname").get_str() << dendl; + + DaemonStatePtr dm = std::make_shared(daemon_state.types); + dm->key = DaemonKey("osd", + stringify(osd_metadata.at("id").get_int())); + dm->hostname = osd_metadata.at("hostname").get_str(); + + osd_metadata.erase("id"); + osd_metadata.erase("hostname"); + + map m; + for (const auto &i : osd_metadata) { + m[i.first] = i.second.get_str(); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } +} + + +void Mgr::shutdown() +{ + finisher.queue(new FunctionContext([&](int) { + { + std::lock_guard l(lock); + monc->sub_unwant("log-info"); + monc->sub_unwant("mgrdigest"); + monc->sub_unwant("fsmap"); + // First stop the server so that we're not taking any more incoming + // requests + server.shutdown(); + } + // after the messenger is stopped, signal modules to shutdown via finisher + py_module_registry->active_shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); +} + +void Mgr::handle_osd_map() +{ + ceph_assert(lock.is_locked_by_me()); + + std::set names_exist; + + /** + * When we see a new OSD map, inspect the entity addrs to + * see if they have changed 
(service restart), and if so + * reload the metadata. + */ + cluster_state.with_osdmap_and_pgmap([this, &names_exist](const OSDMap &osd_map, + const PGMap &pg_map) { + for (int osd_id = 0; osd_id < osd_map.get_max_osd(); ++osd_id) { + if (!osd_map.exists(osd_id)) { + continue; + } + + // Remember which OSDs exist so that we can cull any that don't + names_exist.insert(stringify(osd_id)); + + // Consider whether to update the daemon metadata (new/restarted daemon) + const auto k = DaemonKey("osd", stringify(osd_id)); + if (daemon_state.is_updating(k)) { + continue; + } + + bool update_meta = false; + if (daemon_state.exists(k)) { + if (osd_map.get_up_from(osd_id) == osd_map.get_epoch()) { + dout(4) << "Mgr::handle_osd_map: osd." << osd_id + << " joined cluster at " << "e" << osd_map.get_epoch() + << dendl; + update_meta = true; + } + } else { + update_meta = true; + } + if (update_meta) { + auto c = new MetadataUpdate(daemon_state, k); + std::ostringstream cmd; + cmd << "{\"prefix\": \"osd metadata\", \"id\": " + << osd_id << "}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + + cluster_state.notify_osdmap(osd_map); + }); + + // TODO: same culling for MonMap + daemon_state.cull("osd", names_exist); +} + +void Mgr::handle_log(MLog *m) +{ + for (const auto &e : m->entries) { + py_module_registry->notify_all(e); + } + + m->put(); +} + +void Mgr::handle_service_map(MServiceMap *m) +{ + dout(10) << "e" << m->service_map.epoch << dendl; + cluster_state.set_service_map(m->service_map); + server.got_service_map(); +} + +void Mgr::handle_mon_map() +{ + dout(20) << __func__ << dendl; + assert(lock.is_locked_by_me()); + std::set names_exist; + cluster_state.with_monmap([&] (auto &monmap) { + for (unsigned int i = 0; i < monmap.size(); i++) { + names_exist.insert(monmap.get_name(i)); + } + }); + for (const auto& name : names_exist) { + const auto k = DaemonKey{"osd", name}; + if (daemon_state.is_updating(k)) { + continue; + } + auto c = new 
MetadataUpdate(daemon_state, k); + std::ostringstream cmd; + cmd << "{\"prefix\": \"mon metadata\", \"id\": \"" + << name << "\"}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + daemon_state.cull("mon", names_exist); +} + +bool Mgr::ms_dispatch(Message *m) +{ + dout(4) << *m << dendl; + std::lock_guard l(lock); + + switch (m->get_type()) { + case MSG_MGR_DIGEST: + handle_mgr_digest(static_cast(m)); + break; + case CEPH_MSG_MON_MAP: + py_module_registry->notify_all("mon_map", ""); + handle_mon_map(); + m->put(); + break; + case CEPH_MSG_FS_MAP: + py_module_registry->notify_all("fs_map", ""); + handle_fs_map((MFSMap*)m); + return false; // I shall let this pass through for Client + break; + case CEPH_MSG_OSD_MAP: + handle_osd_map(); + + py_module_registry->notify_all("osd_map", ""); + + // Continuous subscribe, so that we can generate notifications + // for our MgrPyModules + objecter->maybe_request_map(); + m->put(); + break; + case MSG_SERVICE_MAP: + handle_service_map(static_cast(m)); + py_module_registry->notify_all("service_map", ""); + m->put(); + break; + case MSG_LOG: + handle_log(static_cast(m)); + break; + + default: + return false; + } + return true; +} + + +void Mgr::handle_fs_map(MFSMap* m) +{ + ceph_assert(lock.is_locked_by_me()); + + std::set names_exist; + + const FSMap &new_fsmap = m->get_fsmap(); + + fs_map_cond.Signal(); + + // TODO: callers (e.g. from python land) are potentially going to see + // the new fsmap before we've bothered populating all the resulting + // daemon_state. Maybe we should block python land while we're making + // this kind of update? 
+ + cluster_state.set_fsmap(new_fsmap); + + auto mds_info = new_fsmap.get_mds_info(); + for (const auto &i : mds_info) { + const auto &info = i.second; + + if (!new_fsmap.gid_exists(i.first)){ + continue; + } + + // Remember which MDS exists so that we can cull any that don't + names_exist.insert(info.name); + + const auto k = DaemonKey("mds", info.name); + if (daemon_state.is_updating(k)) { + continue; + } + + bool update = false; + if (daemon_state.exists(k)) { + auto metadata = daemon_state.get(k); + std::lock_guard l(metadata->lock); + if (metadata->metadata.empty() || + metadata->metadata.count("addr") == 0) { + update = true; + } else { + auto metadata_addrs = metadata->metadata.at("addr"); + const auto map_addrs = info.addrs; + update = metadata_addrs != stringify(map_addrs); + if (update) { + dout(4) << "MDS[" << info.name << "] addr change " << metadata_addrs + << " != " << stringify(map_addrs) << dendl; + } + } + } else { + update = true; + } + + if (update) { + auto c = new MetadataUpdate(daemon_state, k); + + // Older MDS daemons don't have addr in the metadata, so + // fake it if the returned metadata doesn't have the field. 
+ c->set_default("addr", stringify(info.addrs)); + + std::ostringstream cmd; + cmd << "{\"prefix\": \"mds metadata\", \"who\": \"" + << info.name << "\"}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + daemon_state.cull("mds", names_exist); +} + +bool Mgr::got_mgr_map(const MgrMap& m) +{ + std::lock_guard l(lock); + dout(10) << m << dendl; + + set old_modules; + cluster_state.with_mgrmap([&](const MgrMap& m) { + old_modules = m.modules; + }); + if (m.modules != old_modules) { + derr << "mgrmap module list changed to (" << m.modules << "), respawn" + << dendl; + return true; + } + + cluster_state.set_mgr_map(m); + server.got_mgr_map(); + + return false; +} + +void Mgr::handle_mgr_digest(MMgrDigest* m) +{ + dout(10) << m->mon_status_json.length() << dendl; + dout(10) << m->health_json.length() << dendl; + cluster_state.load_digest(m); + py_module_registry->notify_all("mon_status", ""); + py_module_registry->notify_all("health", ""); + + // Hack: use this as a tick/opportunity to prompt python-land that + // the pgmap might have changed since last time we were here. + py_module_registry->notify_all("pg_summary", ""); + dout(10) << "done." << dendl; + + m->put(); + + if (!digest_received) { + digest_received = true; + digest_cond.Signal(); + } +} + +std::map Mgr::get_services() const +{ + std::lock_guard l(lock); + + return py_module_registry->get_services(); +} + diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h new file mode 100644 index 00000000..0248c9df --- /dev/null +++ b/src/mgr/Mgr.h @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_MGR_H_ +#define CEPH_MGR_H_ + +// Python.h comes first because otherwise it clobbers ceph's assert +#include "PythonCompat.h" + +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/Finisher.h" +#include "mon/MgrMap.h" + +#include "DaemonServer.h" +#include "PyModuleRegistry.h" + +#include "DaemonState.h" +#include "ClusterState.h" + +class MCommand; +class MMgrDigest; +class MLog; +class MServiceMap; +class Objecter; +class Client; + +class Mgr { +protected: + MonClient *monc; + Objecter *objecter; + Client *client; + Messenger *client_messenger; + + mutable Mutex lock; + Finisher finisher; + + // Track receipt of initial data during startup + Cond fs_map_cond; + bool digest_received; + Cond digest_cond; + + PyModuleRegistry *py_module_registry; + DaemonStateIndex daemon_state; + ClusterState cluster_state; + + DaemonServer server; + + LogChannelRef clog; + LogChannelRef audit_clog; + + void load_all_metadata(); + std::map load_store(); + void init(); + + bool initialized; + bool initializing; + +public: + Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_); + ~Mgr(); + + bool is_initialized() const {return initialized;} + entity_addrvec_t get_server_addrs() const { + return server.get_myaddrs(); + } + + void handle_mgr_digest(MMgrDigest* m); + void handle_fs_map(MFSMap* m); + void handle_osd_map(); + void handle_log(MLog *m); + void handle_service_map(MServiceMap *m); + void handle_mon_map(); + + bool got_mgr_map(const MgrMap& m); + + bool ms_dispatch(Message *m); + + void background_init(Context *completion); + void shutdown(); + + std::map get_services() const; +}; + +/** + * Context for completion of metadata mon commands: take + * the result and stash it in DaemonStateIndex + */ +class MetadataUpdate : public Context +{ + 
+private: + DaemonStateIndex &daemon_state; + DaemonKey key; + + std::map defaults; + +public: + bufferlist outbl; + std::string outs; + + MetadataUpdate(DaemonStateIndex &daemon_state_, const DaemonKey &key_) + : daemon_state(daemon_state_), key(key_) + { + daemon_state.notify_updating(key); + } + + void set_default(const std::string &k, const std::string &v) + { + defaults[k] = v; + } + + void finish(int r) override; +}; + + +#endif diff --git a/src/mgr/MgrCap.cc b/src/mgr/MgrCap.cc new file mode 100644 index 00000000..ef1f3943 --- /dev/null +++ b/src/mgr/MgrCap.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MgrCap.h" +#include "include/stringify.h" +#include "include/ipaddr.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include +#include + +#include "include/ceph_assert.h" + +static inline bool is_not_alnum_space(char c) { + return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_')); +} + +static std::string maybe_quote_string(const std::string& str) { + if (find_if(str.begin(), str.end(), is_not_alnum_space) == str.end()) + return str; + return std::string("\"") + str + std::string("\""); +} + +#define dout_subsys ceph_subsys_mgr + +ostream& operator<<(ostream& out, const mgr_rwxa_t& p) { + if (p == MGR_CAP_ANY) + return out << "*"; + + if (p & MGR_CAP_R) + out << "r"; + if (p & MGR_CAP_W) + out << "w"; + if (p & MGR_CAP_X) + out << "x"; + return out; +} + +ostream& operator<<(ostream& out, const MgrCapGrantConstraint& c) { + switch (c.match_type) { + case 
MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + out << "="; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + out << " prefix "; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + out << " regex "; + break; + default: + break; + } + out << maybe_quote_string(c.value); + return out; +} + +ostream& operator<<(ostream& out, const MgrCapGrant& m) { + if (!m.profile.empty()) { + out << "profile " << maybe_quote_string(m.profile); + } else { + out << "allow"; + if (!m.service.empty()) { + out << " service " << maybe_quote_string(m.service); + } else if (!m.module.empty()) { + out << " module " << maybe_quote_string(m.module); + } else if (!m.command.empty()) { + out << " command " << maybe_quote_string(m.command); + } + } + + if (!m.arguments.empty()) { + out << (!m.profile.empty() ? "" : " with"); + for (auto& [key, constraint] : m.arguments) { + out << " " << maybe_quote_string(key) << constraint; + } + } + + if (m.allow != 0) { + out << " " << m.allow; + } + + if (m.network.size()) { + out << " network " << m.network; + } + return out; +} + +// +// fusion lets us easily populate structs via the qi parser. 
+ +typedef std::map kvmap; + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrant, + (std::string, service) + (std::string, module) + (std::string, profile) + (std::string, command) + (kvmap, arguments) + (mgr_rwxa_t, allow) + (std::string, network)) + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrantConstraint, + (MgrCapGrantConstraint::MatchType, match_type) + (std::string, value)) + +// + +void MgrCapGrant::parse_network() { + network_valid = ::parse_network(network.c_str(), &network_parsed, + &network_prefix); +} + +void MgrCapGrant::expand_profile(std::ostream *err) const { + // only generate this list once + if (!profile_grants.empty()) { + return; + } + + if (profile == "read-only") { + // grants READ-ONLY caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_R}}); + return; + } + + if (profile == "read-write") { + // grants READ-WRITE caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, + mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}}); + return; + } + + if (profile == "crash") { + profile_grants.push_back({{}, {}, {}, "crash post", {}, {}}); + return; + } + + if (profile == "osd") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "mds") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "rbd" || profile == "rbd-read-only") { + Arguments filtered_arguments; + for (auto& [key, constraint] : arguments) { + if (key == "pool" || key == "namespace") { + filtered_arguments[key] = std::move(constraint); + } else { + if (err != nullptr) { + *err << "profile '" << profile << "' does not recognize key '" << key + << "'"; + } + return; + } + } + + mgr_rwxa_t perms = mgr_rwxa_t{MGR_CAP_R}; + if (profile == "rbd") { + perms = mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}; + } + + // whitelist all 'rbd_support' commands (restricted by optional + // pool/namespace constraints) + 
profile_grants.push_back({{}, "rbd_support", {}, {}, + std::move(filtered_arguments), perms}); + return; + } + + if (err != nullptr) { + *err << "unrecognized profile '" << profile << "'"; + } +} + +bool MgrCapGrant::validate_arguments( + const std::map& args) const { + for (auto& [key, constraint] : arguments) { + auto q = args.find(key); + + // argument must be present if a constraint exists + if (q == args.end()) { + return false; + } + + switch (constraint.match_type) { + case MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + if (constraint.value != q->second) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + if (q->second.find(constraint.value) != 0) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + try { + std::regex pattern(constraint.value, std::regex::extended); + if (!std::regex_match(q->second, pattern)) { + return false; + } + } catch(const std::regex_error&) { + return false; + } + break; + default: + return false; + } + } + + return true; +} + +mgr_rwxa_t MgrCapGrant::get_allowed( + CephContext *cct, EntityName name, const std::string& s, + const std::string& m, const std::string& c, + const std::map& args) const { + if (!profile.empty()) { + expand_profile(nullptr); + mgr_rwxa_t a; + for (auto& grant : profile_grants) { + a = a | grant.get_allowed(cct, name, s, m, c, args); + } + return a; + } + + if (!service.empty()) { + if (service != s) { + return mgr_rwxa_t{}; + } + return allow; + } + + if (!module.empty()) { + if (module != m) { + return mgr_rwxa_t{}; + } + + // don't test module arguments when validating a specific command + if (c.empty() && !validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return allow; + } + + if (!command.empty()) { + if (command != c) { + return mgr_rwxa_t{}; + } + if (!validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return mgr_rwxa_t{MGR_CAP_ANY}; + } + + return allow; +} + +ostream& operator<<(ostream&out, const MgrCap& m) { + bool first = true; + for (auto& 
grant : m.grants) { + if (!first) { + out << ", "; + } + first = false; + + out << grant; + } + return out; +} + +bool MgrCap::is_allow_all() const { + for (auto& grant : grants) { + if (grant.is_allow_all()) { + return true; + } + } + return false; +} + +void MgrCap::set_allow_all() { + grants.clear(); + grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_ANY}}); + text = "allow *"; +} + +bool MgrCap::is_capable( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const { + if (cct) { + ldout(cct, 20) << "is_capable service=" << service << " " + << "module=" << module << " " + << "command=" << command + << (op_may_read ? " read":"") + << (op_may_write ? " write":"") + << (op_may_exec ? " exec":"") + << " addr " << addr + << " on cap " << *this + << dendl; + } + + mgr_rwxa_t allow; + for (auto& grant : grants) { + if (cct) + ldout(cct, 20) << " allow so far " << allow << ", doing grant " << grant + << dendl; + + if (grant.network.size() && + (!grant.network_valid || + !network_contains(grant.network_parsed, + grant.network_prefix, + addr))) { + continue; + } + + if (grant.is_allow_all()) { + if (cct) { + ldout(cct, 20) << " allow all" << dendl; + } + return true; + } + + // check enumerated caps + allow = allow | grant.get_allowed(cct, name, service, module, command, + command_args); + if ((!op_may_read || (allow & MGR_CAP_R)) && + (!op_may_write || (allow & MGR_CAP_W)) && + (!op_may_exec || (allow & MGR_CAP_X))) { + if (cct) { + ldout(cct, 20) << " match" << dendl; + } + return true; + } + } + return false; +} + +void MgrCap::encode(bufferlist& bl) const { + // remain backwards compatible w/ MgrCap + ENCODE_START(4, 4, bl); + encode(text, bl); + ENCODE_FINISH(bl); +} + +void MgrCap::decode(bufferlist::const_iterator& bl) { + // remain backwards compatible w/ MgrCap + 
std::string s; + DECODE_START(4, bl); + decode(s, bl); + DECODE_FINISH(bl); + parse(s, NULL); +} + +void MgrCap::dump(Formatter *f) const { + f->dump_string("text", text); +} + +void MgrCap::generate_test_instances(list& ls) { + ls.push_back(new MgrCap); + ls.push_back(new MgrCap); + ls.back()->parse("allow *"); + ls.push_back(new MgrCap); + ls.back()->parse("allow rwx"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo r, allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow module bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("profile rbd pool=rbd"); +} + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template +struct MgrCapParser : qi::grammar { + MgrCapParser() : MgrCapParser::base_type(mgrcap) { + using qi::char_; + using qi::int_; + using qi::ulong_long; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + + spaces = +(lit(' ') | lit('\n') | lit('\t')); + + // key <=|prefix|regex> value[ ...] 
+ str_match = -spaces >> lit('=') >> -spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_EQUAL) >> str; + str_prefix = spaces >> lit("prefix") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_PREFIX) >> str; + str_regex = spaces >> lit("regex") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_REGEX) >> str; + kv_pair = str >> (str_match | str_prefix | str_regex); + kv_map %= kv_pair >> *(spaces >> kv_pair); + + // command := command[=]cmd [k1=v1 k2=v2 ...] + command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // service foo rwxa + service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(map()) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // module foo rwxa + module_match %= -spaces >> lit("allow") >> spaces >> lit("module") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // profile foo + profile_match %= -spaces >> -(lit("allow") >> spaces) + >> lit("profile") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> -(spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa + rwxa_match %= -spaces >> lit("allow") >> spaces + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::map()) + >> rwxa + >> -(spaces >> 
lit("network") >> spaces >> network_str); + + // rwxa := * | [r][w][x] + rwxa = + (lit("*")[_val = MGR_CAP_ANY]) | + (lit("all")[_val = MGR_CAP_ANY]) | + ( eps[_val = 0] >> + ( lit('r')[_val |= MGR_CAP_R] || + lit('w')[_val |= MGR_CAP_W] || + lit('x')[_val |= MGR_CAP_X] + ) + ); + + // grant := allow ... + grant = -spaces >> (rwxa_match | profile_match | service_match | + module_match | command_match) >> -spaces; + + // mgrcap := grant [grant ...] + grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + mgrcap = grants [_val = phoenix::construct(_1)]; + } + + qi::rule spaces; + qi::rule rwxa; + qi::rule quoted_string; + qi::rule unquoted_word; + qi::rule str, network_str; + + qi::rule str_match, str_prefix, str_regex; + qi::rule()> kv_pair; + qi::rule()> kv_map; + + qi::rule rwxa_match; + qi::rule command_match; + qi::rule service_match; + qi::rule module_match; + qi::rule profile_match; + qi::rule grant; + qi::rule()> grants; + qi::rule mgrcap; +}; + +bool MgrCap::parse(const std::string& str, ostream *err) { + auto iter = str.begin(); + auto end = str.end(); + + MgrCapParser exp; + bool r = qi::parse(iter, end, exp, *this); + if (r && iter == end) { + text = str; + + std::stringstream profile_err; + for (auto& g : grants) { + g.parse_network(); + + if (!g.profile.empty()) { + g.expand_profile(&profile_err); + } + } + + if (!profile_err.str().empty()) { + if (err != nullptr) { + *err << "mgr capability parse failed during profile evaluation: " + << profile_err.str(); + } + return false; + } + return true; + } + + // Make sure no grants are kept after parsing failed! 
+ grants.clear(); + + if (err) { + if (iter != end) + *err << "mgr capability parse failed, stopped at '" + << std::string(iter, end) << "' of '" << str << "'"; + else + *err << "mgr capability parse failed, stopped at end of '" << str << "'"; + } + + return false; +} diff --git a/src/mgr/MgrCap.h b/src/mgr/MgrCap.h new file mode 100644 index 00000000..3482ff21 --- /dev/null +++ b/src/mgr/MgrCap.h @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGRCAP_H +#define CEPH_MGRCAP_H + +#include + +#include "include/types.h" +#include "common/entity_name.h" + +class CephContext; + +static const __u8 MGR_CAP_R = (1 << 1); // read +static const __u8 MGR_CAP_W = (1 << 2); // write +static const __u8 MGR_CAP_X = (1 << 3); // execute +static const __u8 MGR_CAP_ANY = 0xff; // * + +struct mgr_rwxa_t { + __u8 val = 0U; + + mgr_rwxa_t() {} + explicit mgr_rwxa_t(__u8 v) : val(v) {} + + mgr_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p); + +struct MgrCapGrantConstraint { + enum MatchType { + MATCH_TYPE_NONE, + MATCH_TYPE_EQUAL, + MATCH_TYPE_PREFIX, + MATCH_TYPE_REGEX + }; + + MatchType match_type = MATCH_TYPE_NONE; + std::string value; + + MgrCapGrantConstraint() {} + MgrCapGrantConstraint(MatchType match_type, std::string value) + : match_type(match_type), value(value) { + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c); + +struct MgrCapGrant { + /* + * A grant can come in one of four forms: + * + * - a blanket allow ('allow rw', 'allow *') + * - this will match against any service and the read/write/exec flags + * in the mgr code. semantics of what X means are somewhat ad hoc. + * + * - a service allow ('allow service mds rw') + * - this will match against a specific service and the r/w/x flags. 
+ * + * - a module allow ('allow module rbd_support rw, allow module rbd_support with pool=rbd rw') + * - this will match against a specific python add-on module and the r/w/x + * flags. + * + * - a profile ('profile read-only, profile rbd pool=rbd') + * - this will match against specific MGR-enforced semantics of what + * this type of user should need to do. examples include 'read-write', + * 'read-only', 'crash'. + * + * - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2') + * this includes the command name (the prefix string) + * + * The command, module, and profile caps can also accept an optional + * key/value map. If not provided, all command arguments and module + * meta-arguments are allowed. If a key/value pair is specified, that + * argument must be present and must match the provided constraint. + */ + typedef std::map Arguments; + + std::string service; + std::string module; + std::string profile; + std::string command; + Arguments arguments; + + // restrict by network + std::string network; + + // these are filled in by parse_network(), called by MgrCap::parse() + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + void parse_network(); + + mgr_rwxa_t allow; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() (via is_match()) and cached here. 
+ mutable std::list profile_grants; + + void expand_profile(std::ostream *err=nullptr) const; + + MgrCapGrant() : allow(0) {} + MgrCapGrant(std::string&& service, + std::string&& module, + std::string&& profile, + std::string&& command, + Arguments&& arguments, + mgr_rwxa_t allow) + : service(std::move(service)), module(std::move(module)), + profile(std::move(profile)), command(std::move(command)), + arguments(std::move(arguments)), allow(allow) { + } + + bool validate_arguments( + const std::map& arguments) const; + + /** + * check if given request parameters match our constraints + * + * @param cct context + * @param name entity name + * @param service service (if any) + * @param module module (if any) + * @param command command (if any) + * @param arguments profile/module/command args (if any) + * @return bits we allow + */ + mgr_rwxa_t get_allowed( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map& arguments) const; + + bool is_allow_all() const { + return (allow == MGR_CAP_ANY && + service.empty() && + module.empty() && + profile.empty() && + command.empty()); + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrant& g); + +struct MgrCap { + std::string text; + std::vector grants; + + MgrCap() {} + explicit MgrCap(const std::vector &g) : grants(g) {} + + std::string get_str() const { + return text; + } + + bool is_allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, std::ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. 
+ * + * @param service service name + * @param module module name + * @param command command id + * @param arguments + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param op_may_exec whether the operation may exec + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map& arguments, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); +}; +WRITE_CLASS_ENCODER(MgrCap) + +std::ostream& operator<<(std::ostream& out, const MgrCap& cap); + +#endif // CEPH_MGRCAP_H diff --git a/src/mgr/MgrClient.cc b/src/mgr/MgrClient.cc new file mode 100644 index 00000000..738e9a3b --- /dev/null +++ b/src/mgr/MgrClient.cc @@ -0,0 +1,531 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#include "MgrClient.h" + +#include "mgr/MgrContext.h" + +#include "msg/Messenger.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrReport.h" +#include "messages/MMgrOpen.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MPGStats.h" + +#define dout_subsys ceph_subsys_mgrc +#undef dout_prefix +#define dout_prefix *_dout << "mgrc " << __func__ << " " + +MgrClient::MgrClient(CephContext *cct_, Messenger *msgr_) + : Dispatcher(cct_), cct(cct_), msgr(msgr_), + timer(cct_, lock) +{ + ceph_assert(cct != nullptr); +} + +void MgrClient::init() +{ + std::lock_guard l(lock); + + ceph_assert(msgr != nullptr); + + timer.init(); + initialized = true; +} + +void MgrClient::shutdown() +{ + std::lock_guard l(lock); + ldout(cct, 10) << dendl; + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + // forget about in-flight commands if we are prematurely shut down + // (e.g., by control-C) + command_table.clear(); + if (service_daemon && + session && + session->con && + HAVE_FEATURE(session->con->get_features(), SERVER_MIMIC)) { + ldout(cct, 10) << "closing mgr session" << dendl; + MMgrClose *m = new MMgrClose(); + m->daemon_name = daemon_name; + m->service_name = service_name; + session->con->send_message(m); + utime_t timeout; + timeout.set_from_double(cct->_conf.get_val( + "mgr_client_service_daemon_unregister_timeout")); + shutdown_cond.WaitInterval(lock, timeout); + } + + timer.shutdown(); + if (session) { + session->con->mark_down(); + session.reset(); + } +} + +bool MgrClient::ms_dispatch(Message *m) +{ + std::lock_guard l(lock); + + switch(m->get_type()) { + case MSG_MGR_MAP: + return handle_mgr_map(static_cast(m)); + case MSG_MGR_CONFIGURE: + return handle_mgr_configure(static_cast(m)); + case MSG_MGR_CLOSE: + return handle_mgr_close(static_cast(m)); + case MSG_COMMAND_REPLY: + if 
(m->get_source().type() == CEPH_ENTITY_TYPE_MGR) { + handle_command_reply(static_cast(m)); + return true; + } else { + return false; + } + default: + ldout(cct, 30) << "Not handling " << *m << dendl; + return false; + } +} + +void MgrClient::reconnect() +{ + ceph_assert(lock.is_locked_by_me()); + + if (session) { + ldout(cct, 4) << "Terminating session with " + << session->con->get_peer_addr() << dendl; + session->con->mark_down(); + session.reset(); + stats_period = 0; + if (report_callback != nullptr) { + timer.cancel_event(report_callback); + report_callback = nullptr; + } + } + + if (!map.get_available()) { + ldout(cct, 4) << "No active mgr available yet" << dendl; + return; + } + + if (last_connect_attempt != utime_t()) { + utime_t now = ceph_clock_now(); + utime_t when = last_connect_attempt; + when += cct->_conf.get_val("mgr_connect_retry_interval"); + if (now < when) { + if (!connect_retry_callback) { + connect_retry_callback = timer.add_event_at( + when, + new FunctionContext([this](int r){ + connect_retry_callback = nullptr; + reconnect(); + })); + } + ldout(cct, 4) << "waiting to retry connect until " << when << dendl; + return; + } + } + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + ldout(cct, 4) << "Starting new session with " << map.get_active_addrs() + << dendl; + last_connect_attempt = ceph_clock_now(); + + session.reset(new MgrSessionState()); + session->con = msgr->connect_to(CEPH_ENTITY_TYPE_MGR, + map.get_active_addrs()); + + if (service_daemon) { + daemon_dirty_status = true; + } + + // Don't send an open if we're just a client (i.e. 
doing + // command-sending, not stats etc) + if (msgr->get_mytype() != CEPH_ENTITY_TYPE_CLIENT || service_daemon) { + _send_open(); + } + + // resend any pending commands + for (const auto &p : command_table.get_commands()) { + auto m = p.second.get_message({}); + ceph_assert(session); + ceph_assert(session->con); + session->con->send_message2(std::move(m)); + } +} + +void MgrClient::_send_open() +{ + if (session && session->con) { + auto open = new MMgrOpen(); + if (!service_name.empty()) { + open->service_name = service_name; + open->daemon_name = daemon_name; + } else { + open->daemon_name = cct->_conf->name.get_id(); + } + if (service_daemon) { + open->service_daemon = service_daemon; + open->daemon_metadata = daemon_metadata; + } + cct->_conf.get_config_bl(0, &open->config_bl, &last_config_bl_version); + cct->_conf.get_defaults_bl(&open->config_defaults_bl); + session->con->send_message(open); + } +} + +bool MgrClient::handle_mgr_map(MMgrMap *m) +{ + ceph_assert(lock.is_locked_by_me()); + + ldout(cct, 20) << *m << dendl; + + map = m->get_map(); + ldout(cct, 4) << "Got map version " << map.epoch << dendl; + m->put(); + + ldout(cct, 4) << "Active mgr is now " << map.get_active_addrs() << dendl; + + // Reset session? 
+ if (!session || + session->con->get_peer_addrs() != map.get_active_addrs()) { + reconnect(); + } + + return true; +} + +bool MgrClient::ms_handle_reset(Connection *con) +{ + std::lock_guard l(lock); + if (session && con == session->con) { + ldout(cct, 4) << __func__ << " con " << con << dendl; + reconnect(); + return true; + } + return false; +} + +bool MgrClient::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +void MgrClient::_send_stats() +{ + _send_report(); + _send_pgstats(); + if (stats_period != 0) { + report_callback = timer.add_event_after( + stats_period, + new FunctionContext([this](int) { + _send_stats(); + })); + } +} + +void MgrClient::_send_report() +{ + ceph_assert(lock.is_locked_by_me()); + ceph_assert(session); + report_callback = nullptr; + + auto report = new MMgrReport(); + auto pcc = cct->get_perfcounters_collection(); + + pcc->with_counters([this, report]( + const PerfCountersCollectionImpl::CounterMap &by_path) + { + // Helper for checking whether a counter should be included + auto include_counter = [this]( + const PerfCounters::perf_counter_data_any_d &ctr, + const PerfCounters &perf_counters) + { + return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold; + }; + + // Helper for cases where we want to forget a counter + auto undeclare = [report, this](const std::string &path) + { + report->undeclare_types.push_back(path); + ldout(cct,20) << " undeclare " << path << dendl; + session->declared.erase(path); + }; + + ENCODE_START(1, 1, report->packed); + + // Find counters that no longer exist, and undeclare them + for (auto p = session->declared.begin(); p != session->declared.end(); ) { + const auto &path = *(p++); + if (by_path.count(path) == 0) { + undeclare(path); + } + } + + for (const auto &i : by_path) { + auto& path = i.first; + auto& data = *(i.second.data); + auto& perf_counters = *(i.second.perf_counters); + + // Find counters that still exist, but are no longer permitted by 
+ // stats_threshold + if (!include_counter(data, perf_counters)) { + if (session->declared.count(path)) { + undeclare(path); + } + continue; + } + + if (session->declared.count(path) == 0) { + ldout(cct,20) << " declare " << path << dendl; + PerfCounterType type; + type.path = path; + if (data.description) { + type.description = data.description; + } + if (data.nick) { + type.nick = data.nick; + } + type.type = data.type; + type.priority = perf_counters.get_adjusted_priority(data.prio); + type.unit = data.unit; + report->declare_types.push_back(std::move(type)); + session->declared.insert(path); + } + + encode(static_cast(data.u64), report->packed); + if (data.type & PERFCOUNTER_LONGRUNAVG) { + encode(static_cast(data.avgcount), report->packed); + encode(static_cast(data.avgcount2), report->packed); + } + } + ENCODE_FINISH(report->packed); + + ldout(cct, 20) << "sending " << session->declared.size() << " counters (" + "of possible " << by_path.size() << "), " + << report->declare_types.size() << " new, " + << report->undeclare_types.size() << " removed" + << dendl; + }); + + ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl; + + if (daemon_name.size()) { + report->daemon_name = daemon_name; + } else { + report->daemon_name = cct->_conf->name.get_id(); + } + report->service_name = service_name; + + if (daemon_dirty_status) { + report->daemon_status = daemon_status; + daemon_dirty_status = false; + } + + if (task_dirty_status) { + report->task_status = task_status; + task_dirty_status = false; + } + + report->daemon_health_metrics = std::move(daemon_health_metrics); + + cct->_conf.get_config_bl(last_config_bl_version, &report->config_bl, + &last_config_bl_version); + + if (get_perf_report_cb) { + get_perf_report_cb(&report->osd_perf_metric_reports); + } + + session->con->send_message(report); +} + +void MgrClient::send_pgstats() +{ + std::lock_guard l(lock); + _send_pgstats(); +} + +void MgrClient::_send_pgstats() +{ + if (pgstats_cb && 
session) { + session->con->send_message(pgstats_cb()); + } +} + +bool MgrClient::handle_mgr_configure(MMgrConfigure *m) +{ + ceph_assert(lock.is_locked_by_me()); + + ldout(cct, 20) << *m << dendl; + + if (!session) { + lderr(cct) << "dropping unexpected configure message" << dendl; + m->put(); + return true; + } + + ldout(cct, 4) << "stats_period=" << m->stats_period << dendl; + + if (stats_threshold != m->stats_threshold) { + ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl; + stats_threshold = m->stats_threshold; + } + + if (set_perf_queries_cb) { + set_perf_queries_cb(m->osd_perf_metric_queries); + } + + bool starting = (stats_period == 0) && (m->stats_period != 0); + stats_period = m->stats_period; + if (starting) { + _send_stats(); + } + + m->put(); + return true; +} + +bool MgrClient::handle_mgr_close(MMgrClose *m) +{ + service_daemon = false; + shutdown_cond.Signal(); + m->put(); + return true; +} + +int MgrClient::start_command(const vector& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) +{ + std::lock_guard l(lock); + + ldout(cct, 20) << "cmd: " << cmd << dendl; + + if (map.epoch == 0 && mgr_optional) { + ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl; + return -EACCES; + } + + auto &op = command_table.start_command(); + op.cmd = cmd; + op.inbl = inbl; + op.outbl = outbl; + op.outs = outs; + op.on_finish = onfinish; + + if (session && session->con) { + // Leaving fsid argument null because it isn't used. 
+ auto m = op.get_message({}); + session->con->send_message2(std::move(m)); + } else { + ldout(cct, 5) << "no mgr session (no running mgr daemon?), waiting" << dendl; + } + return 0; +} + +bool MgrClient::handle_command_reply(MCommandReply *m) +{ + ceph_assert(lock.is_locked_by_me()); + + ldout(cct, 20) << *m << dendl; + + const auto tid = m->get_tid(); + if (!command_table.exists(tid)) { + ldout(cct, 4) << "handle_command_reply tid " << m->get_tid() + << " not found" << dendl; + m->put(); + return true; + } + + auto &op = command_table.get_command(tid); + if (op.outbl) { + op.outbl->claim(m->get_data()); + } + + if (op.outs) { + *(op.outs) = m->rs; + } + + if (op.on_finish) { + op.on_finish->complete(m->r); + } + + command_table.erase(tid); + + m->put(); + return true; +} + +int MgrClient::service_daemon_register( + const std::string& service, + const std::string& name, + const std::map& metadata) +{ + std::lock_guard l(lock); + if (service_daemon) { + return -EEXIST; + } + ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl; + service_daemon = true; + service_name = service; + daemon_name = name; + daemon_metadata = metadata; + daemon_dirty_status = true; + + // late register? 
+ if (msgr->get_mytype() == CEPH_ENTITY_TYPE_CLIENT && session && session->con) { + _send_open(); + } + + return 0; +} + +int MgrClient::service_daemon_update_status( + std::map&& status) +{ + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + daemon_status = std::move(status); + daemon_dirty_status = true; + return 0; +} + +int MgrClient::service_daemon_update_task_status( + std::map &&status) { + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + task_status = std::move(status); + task_dirty_status = true; + return 0; +} + +void MgrClient::update_daemon_health(std::vector&& metrics) +{ + std::lock_guard l(lock); + daemon_health_metrics = std::move(metrics); +} + diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h new file mode 100644 index 00000000..e7a6cc77 --- /dev/null +++ b/src/mgr/MgrClient.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CLIENT_H_ +#define MGR_CLIENT_H_ + +#include "msg/Connection.h" +#include "msg/Dispatcher.h" +#include "mon/MgrMap.h" +#include "mgr/DaemonHealthMetric.h" + +#include "messages/MMgrReport.h" +#include "mgr/OSDPerfMetricTypes.h" + +#include "common/perf_counters.h" +#include "common/Timer.h" +#include "common/CommandTable.h" + +class MMgrMap; +class MMgrConfigure; +class MMgrClose; +class Messenger; +class MCommandReply; +class MPGStats; + +class MgrSessionState +{ + public: + // Which performance counters have we already transmitted schema for? 
+ std::set declared; + + // Our connection to the mgr + ConnectionRef con; +}; + +class MgrCommand : public CommandOp +{ + public: + + explicit MgrCommand(ceph_tid_t t) : CommandOp(t) {} + MgrCommand() : CommandOp() {} +}; + +class MgrClient : public Dispatcher +{ +protected: + CephContext *cct; + MgrMap map; + Messenger *msgr; + + unique_ptr session; + + Mutex lock = {"MgrClient::lock"}; + Cond shutdown_cond; + + uint32_t stats_period = 0; + uint32_t stats_threshold = 0; + SafeTimer timer; + + CommandTable command_table; + + utime_t last_connect_attempt; + + uint64_t last_config_bl_version = 0; + + Context *report_callback = nullptr; + Context *connect_retry_callback = nullptr; + + // If provided, use this to compose an MPGStats to send with + // our reports (hook for use by OSD) + std::function pgstats_cb; + std::function &)> set_perf_queries_cb; + std::function *)> get_perf_report_cb; + + // for service registration and beacon + bool service_daemon = false; + bool daemon_dirty_status = false; + bool task_dirty_status = false; + std::string service_name, daemon_name; + std::map daemon_metadata; + std::map daemon_status; + std::map task_status; + std::vector daemon_health_metrics; + + void reconnect(); + void _send_open(); + + // In pre-luminous clusters, the ceph-mgr service is absent or optional, + // so we must not block in start_command waiting for it. 
+ bool mgr_optional = false; + +public: + MgrClient(CephContext *cct_, Messenger *msgr_); + + void set_messenger(Messenger *msgr_) { msgr = msgr_; } + + void init(); + void shutdown(); + + void set_mgr_optional(bool optional_) {mgr_optional = optional_;} + + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + bool handle_mgr_map(MMgrMap *m); + bool handle_mgr_configure(MMgrConfigure *m); + bool handle_mgr_close(MMgrClose *m); + bool handle_command_reply(MCommandReply *m); + + void set_perf_metric_query_cb( + std::function &)> cb_set, + std::function *)> cb_get) + { + std::lock_guard l(lock); + set_perf_queries_cb = cb_set; + get_perf_report_cb = cb_get; + } + + void send_pgstats(); + void set_pgstats_cb(std::function&& cb_) + { + std::lock_guard l(lock); + pgstats_cb = std::move(cb_); + } + + int start_command(const vector& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish); + + int service_daemon_register( + const std::string& service, + const std::string& name, + const std::map& metadata); + int service_daemon_update_status( + std::map&& status); + int service_daemon_update_task_status( + std::map &&task_status); + void update_daemon_health(std::vector&& metrics); + + bool is_initialized() const { return initialized; } + +private: + void _send_stats(); + void _send_pgstats(); + void _send_report(); + + bool initialized = false; +}; + +#endif diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h new file mode 100644 index 00000000..4116318b --- /dev/null +++ b/src/mgr/MgrCommands.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* no guard; may be included multiple times */ + +// see MonCommands.h + +COMMAND("pg stat", "show placement group status.", + "pg", "r") +COMMAND("pg getmap", "get binary pg map 
to -o/stdout", "pg", "r") + +COMMAND("pg dump " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \ + "show human-readable versions of pg map (only 'all' valid with plain)", "pg", "r") +COMMAND("pg dump_json " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \ + "show human-readable version of pg map in json only",\ + "pg", "r") +COMMAND("pg dump_pools_json", "show pg pools info in json only",\ + "pg", "r") + +COMMAND("pg ls-by-pool " \ + "name=poolstr,type=CephString " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with pool = [poolname]", "pg", "r") +COMMAND("pg ls-by-primary " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with primary = [osd]", "pg", "r") +COMMAND("pg ls-by-osd " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg on osd [osd]", "pg", "r") +COMMAND("pg ls " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with specific pool, osd, state", "pg", "r") +COMMAND("pg dump_stuck " \ + "name=stuckops,type=CephChoices,strings=inactive|unclean|stale|undersized|degraded,n=N,req=false " \ + "name=threshold,type=CephInt,req=false", + "show information about stuck pgs",\ + "pg", "r") +COMMAND("pg debug " \ + "name=debugop,type=CephChoices,strings=unfound_objects_exist|degraded_pgs_exist", \ + "show debug info about pgs", "pg", "r") + +COMMAND("pg scrub name=pgid,type=CephPgid", "start scrub on ", \ + "pg", "rw") +COMMAND("pg deep-scrub name=pgid,type=CephPgid", "start deep-scrub on ", \ + "pg", "rw") +COMMAND("pg repair name=pgid,type=CephPgid", "start repair on ", \ + "pg", "rw") + +COMMAND("pg force-recovery name=pgid,type=CephPgid,n=N", "force recovery of first", \ + "pg", "rw") +COMMAND("pg force-backfill 
name=pgid,type=CephPgid,n=N", "force backfill of first", \ + "pg", "rw") +COMMAND("pg cancel-force-recovery name=pgid,type=CephPgid,n=N", "restore normal recovery priority of ", \ + "pg", "rw") +COMMAND("pg cancel-force-backfill name=pgid,type=CephPgid,n=N", "restore normal backfill priority of ", \ + "pg", "rw") + +// stuff in osd namespace +COMMAND("osd perf", \ + "print dump of OSD perf summary stats", \ + "osd", \ + "r") +COMMAND("osd df " \ + "name=output_method,type=CephChoices,strings=plain|tree,req=false " \ + "name=filter_by,type=CephChoices,strings=class|name,req=false " \ + "name=filter,type=CephString,req=false", \ + "show OSD utilization", "osd", "r") +COMMAND("osd blocked-by", \ + "print histogram of which OSDs are blocking their peers", \ + "osd", "r") +COMMAND("osd pool stats " \ + "name=pool_name,type=CephPoolname,req=false", + "obtain stats from all pools, or from specified pool", + "osd", "r") +COMMAND("osd pool scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate scrub on pool ", \ + "osd", "rw") +COMMAND("osd pool deep-scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate deep-scrub on pool ", \ + "osd", "rw") +COMMAND("osd pool repair " \ + "name=who,type=CephPoolname,n=N", \ + "initiate repair on pool ", \ + "osd", "rw") +COMMAND("osd pool force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "force recovery of specified pool first", \ + "osd", "rw") +COMMAND("osd pool force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "force backfill of specified pool first", \ + "osd", "rw") +COMMAND("osd pool cancel-force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool ", \ + "osd", "rw") +COMMAND("osd pool cancel-force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool ", \ + "osd", "rw") +COMMAND("osd reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + 
"name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ + "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephBool,req=false",\ + "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "r") +COMMAND("osd reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "r") + +COMMAND("osd destroy " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "mark osd as being destroyed. 
Keeps the ID intact (allowing reuse), " \ + "but removes cephx keys, config-key data and lockbox keys, "\ + "rendering data permanently unreadable.", \ + "osd", "rw") +COMMAND("osd purge " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "purge all osd data from the monitors including the OSD id " \ + "and CRUSH position", \ + "osd", "rw") + +COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N", + "check whether osd(s) can be safely destroyed without reducing data durability", + "osd", "r") +COMMAND("osd ok-to-stop name=ids,type=CephString,n=N", + "check whether osd(s) can be safely stopped without reducing immediate"\ + " data availability", "osd", "r") + +COMMAND("osd scrub " \ + "name=who,type=CephString", \ + "initiate scrub on osd , or use to scrub all", \ + "osd", "rw") +COMMAND("osd deep-scrub " \ + "name=who,type=CephString", \ + "initiate deep scrub on osd , or use to deep scrub all", \ + "osd", "rw") +COMMAND("osd repair " \ + "name=who,type=CephString", \ + "initiate repair on osd , or use to repair all", \ + "osd", "rw") + +COMMAND("service dump", + "dump service map", "service", "r") +COMMAND("service status", + "dump service state", "service", "r") + +COMMAND("config show " \ + "name=who,type=CephString name=key,type=CephString,req=False", + "Show running configuration", + "mgr", "r") +COMMAND("config show-with-defaults " \ + "name=who,type=CephString", + "Show running configuration (including compiled-in defaults)", + "mgr", "r") + +COMMAND("device ls", + "Show devices", + "mgr", "r") +COMMAND("device info name=devid,type=CephString", + "Show information about a device", + "mgr", "r") +COMMAND("device ls-by-daemon name=who,type=CephString", + "Show devices associated with a daemon", + "mgr", "r") +COMMAND("device ls-by-host name=host,type=CephString", + "Show devices on a host", + "mgr", "r") +COMMAND("device 
set-life-expectancy name=devid,type=CephString "\ + "name=from,type=CephString "\ + "name=to,type=CephString,req=False", + "Set predicted device life expectancy", + "mgr", "rw") +COMMAND("device rm-life-expectancy name=devid,type=CephString", + "Clear predicted device life expectancy", + "mgr", "rw") diff --git a/src/mgr/MgrContext.h b/src/mgr/MgrContext.h new file mode 100644 index 00000000..3268e5c8 --- /dev/null +++ b/src/mgr/MgrContext.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CONTEXT_H_ +#define MGR_CONTEXT_H_ + +#include + +#include "common/ceph_json.h" +#include "mon/MonClient.h" + +class Command +{ +protected: + C_SaferCond cond; +public: + bufferlist outbl; + std::string outs; + int r; + + void run(MonClient *monc, const std::string &command) + { + monc->start_mon_command({command}, {}, + &outbl, &outs, &cond); + } + + virtual void wait() + { + r = cond.wait(); + } + + virtual ~Command() {} +}; + + +class JSONCommand : public Command +{ +public: + json_spirit::mValue json_result; + + void wait() override + { + Command::wait(); + + if (r == 0) { + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + r = -EINVAL; + } + } + } +}; + +#endif + diff --git a/src/mgr/MgrSession.h b/src/mgr/MgrSession.h new file mode 100644 index 00000000..f5ad9338 --- /dev/null +++ b/src/mgr/MgrSession.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MGRSESSION_H +#define CEPH_MGR_MGRSESSION_H + +#include "common/RefCountedObj.h" +#include "common/entity_name.h" 
+#include "msg/msg_types.h" +#include "MgrCap.h" + + +/** + * Session state associated with the Connection. + */ +struct MgrSession : public RefCountedObject { + uint64_t global_id = 0; + EntityName entity_name; + entity_inst_t inst; + + int osd_id = -1; ///< osd id (if an osd) + + MgrCap caps; + + std::set declared_types; + + explicit MgrSession(CephContext *cct) : RefCountedObject(cct, 0) {} + ~MgrSession() override {} + + const entity_addr_t& get_peer_addr() const { + return inst.addr; + } +}; + +typedef boost::intrusive_ptr MgrSessionRef; + + +#endif diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc new file mode 100644 index 00000000..9da7a196 --- /dev/null +++ b/src/mgr/MgrStandby.cc @@ -0,0 +1,515 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include + +#include "common/errno.h" +#include "common/signal.h" +#include "include/compat.h" + +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#include "mgr/MgrContext.h" +#include "mgr/mgr_commands.h" + +#include "messages/MMgrBeacon.h" +#include "messages/MMgrMap.h" +#include "Mgr.h" + +#include "MgrStandby.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +MgrStandby::MgrStandby(int argc, const char **argv) : + Dispatcher(g_ceph_context), + monc{g_ceph_context}, + client_messenger(Messenger::create( + g_ceph_context, + cct->_conf.get_val("ms_type"), + entity_name_t::MGR(), + "mgr", + getpid(), + 0)), + objecter{g_ceph_context, client_messenger.get(), &monc, NULL, 0, 0}, + client{client_messenger.get(), &monc, &objecter}, + mgrc(g_ceph_context, client_messenger.get()), + log_client(g_ceph_context, client_messenger.get(), &monc.monmap, LogClient::NO_FLAGS), + clog(log_client.create_channel(CLOG_CHANNEL_CLUSTER)), + audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)), + lock("MgrStandby::lock"), + finisher(g_ceph_context, "MgrStandby", "mgrsb-fin"), + timer(g_ceph_context, lock), + py_module_registry(clog), + active_mgr(nullptr), + orig_argc(argc), + orig_argv(argv), + available_in_map(false) +{ +} + +MgrStandby::~MgrStandby() = default; + +const char** MgrStandby::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "osd_objectstore_fuse", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "host", + "fsid", + NULL + }; + return KEYS; +} + +void MgrStandby::handle_conf_change( + const ConfigProxy& conf, + const std::set &changed) +{ + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + 
changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + _update_log_config(); + } +} + +int MgrStandby::init() +{ + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + + std::lock_guard l(lock); + + // Start finisher + finisher.start(); + + // Initialize Messenger + client_messenger->add_dispatcher_tail(this); + client_messenger->add_dispatcher_head(&objecter); + client_messenger->add_dispatcher_tail(&client); + client_messenger->start(); + + // Initialize MonClient + if (monc.build_initial_monmap() < 0) { + client_messenger->shutdown(); + client_messenger->wait(); + return -1; + } + + monc.sub_want("mgrmap", 0, 0); + + monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD + |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR); + monc.set_messenger(client_messenger.get()); + + // We must register our config callback before calling init(), so + // that we see the initial configuration message + monc.register_config_callback([this](const std::string &k, const std::string &v){ + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "config_callback: " << k << " : " << v << dendl; + dout(10) << "config_callback: " << k << " : " << dendl; + if (k.substr(0, 4) == "mgr/") { + const std::string global_key = PyModule::config_prefix + k.substr(4); + py_module_registry.handle_config(global_key, v); + + return true; + } + return false; + }); + monc.register_config_notify_callback([this]() { + py_module_registry.handle_config_notify(); + }); + dout(4) << "Registered monc callback" << dendl; + + int r = monc.init(); + if (r < 0) { + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + mgrc.init(); + 
client_messenger->add_dispatcher_tail(&mgrc); + + r = monc.authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify a mgr ID with a valid keyring?" << dendl; + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + // only forward monmap updates after authentication finishes, otherwise + // monc.authenticate() will be waiting for MgrStandy::ms_dispatch() + // to acquire the lock forever, as it is already locked in the beginning of + // this method. + monc.set_passthrough_monmap(); + + client_t whoami = monc.get_global_id(); + client_messenger->set_myname(entity_name_t::MGR(whoami.v)); + monc.set_log_client(&log_client); + _update_log_config(); + objecter.set_client_incarnation(0); + objecter.init(); + objecter.start(); + client.init(); + timer.init(); + + py_module_registry.init(); + + tick(); + + dout(4) << "Complete." << dendl; + return 0; +} + +void MgrStandby::send_beacon() +{ + ceph_assert(lock.is_locked_by_me()); + dout(4) << state_str() << dendl; + + std::list modules = py_module_registry.get_modules(); + + // Construct a list of the info about each loaded module + // which we will transmit to the monitor. + std::vector module_info; + for (const auto &module : modules) { + MgrMap::ModuleInfo info; + info.name = module->get_name(); + info.error_string = module->get_error_string(); + info.can_run = module->get_can_run(); + info.module_options = module->get_options(); + module_info.push_back(std::move(info)); + } + + // Whether I think I am available (request MgrMonitor to set me + // as available in the map) + bool available = active_mgr != nullptr && active_mgr->is_initialized(); + + auto addrs = available ? 
active_mgr->get_server_addrs() : entity_addrvec_t(); + dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl; + + map metadata; + metadata["addr"] = client_messenger->get_myaddr_legacy().ip_only_to_str(); + metadata["addrs"] = stringify(client_messenger->get_myaddrs()); + collect_sys_info(&metadata, g_ceph_context); + + MMgrBeacon *m = new MMgrBeacon(monc.get_fsid(), + monc.get_global_id(), + g_conf()->name.get_id(), + addrs, + available, + std::move(module_info), + std::move(metadata)); + + if (available) { + if (!available_in_map) { + // We are informing the mon that we are done initializing: inform + // it of our command set. This has to happen after init() because + // it needs the python modules to have loaded. + std::vector commands = mgr_commands; + std::vector py_commands = py_module_registry.get_commands(); + commands.insert(commands.end(), py_commands.begin(), py_commands.end()); + m->set_command_descs(commands); + dout(4) << "going active, including " << m->get_command_descs().size() + << " commands in beacon" << dendl; + } + + m->set_services(active_mgr->get_services()); + } + + monc.send_mon_message(m); +} + +void MgrStandby::tick() +{ + dout(10) << __func__ << dendl; + send_beacon(); + + timer.add_event_after( + g_conf().get_val("mgr_tick_period").count(), + new FunctionContext([this](int r){ + tick(); + } + )); +} + +void MgrStandby::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got signal " << sig_str(signum) << " ***" << dendl; + _exit(0); // exit with 0 result code, as if we had done an orderly shutdown + //shutdown(); +} + +void MgrStandby::shutdown() +{ + finisher.queue(new FunctionContext([&](int) { + std::lock_guard l(lock); + + dout(4) << "Shutting down" << dendl; + + // stop sending beacon first, i use monc to talk with monitors + timer.shutdown(); + // client uses monc and objecter + client.shutdown(); + mgrc.shutdown(); + // stop monc, so mon won't be able to instruct me 
to shutdown/activate after + // the active_mgr is stopped + monc.shutdown(); + if (active_mgr) { + active_mgr->shutdown(); + } + + py_module_registry.shutdown(); + + // objecter is used by monc and active_mgr + objecter.shutdown(); + // client_messenger is used by all of them, so stop it in the end + client_messenger->shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); +} + +void MgrStandby::respawn() +{ + // --- WARNING TO FUTURE COPY/PASTERS --- + // You must also add a call like + // + // ceph_pthread_setname(pthread_self(), "ceph-mgr"); + // + // to main() so that /proc/$pid/stat field 2 contains "(ceph-mgr)" + // instead of "(exe)", so that killall (and log rotation) will work. + + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i log_to_monitors; + map log_to_syslog; + map log_channel; + map log_prio; + map log_to_graylog; + map log_to_graylog_host; + map log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(cct, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host) == 0) { + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + audit_clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + } +} + +void MgrStandby::handle_mgr_map(MMgrMap* mmap) +{ + auto &map = mmap->get_map(); + dout(4) << "received map epoch " << map.get_epoch() << dendl; + const bool active_in_map = map.active_gid == monc.get_global_id(); + dout(4) << "active in map: " << active_in_map + << " active is " << map.active_gid << dendl; + + // PyModuleRegistry may ask us to respawn if it sees 
that + // this MgrMap is changing its set of enabled modules + bool need_respawn = py_module_registry.handle_mgr_map(map); + if (need_respawn) { + dout(1) << "respawning because set of enabled modules changed!" << dendl; + respawn(); + } + + if (active_in_map) { + if (!active_mgr) { + dout(1) << "Activating!" << dendl; + active_mgr.reset(new Mgr(&monc, map, &py_module_registry, + client_messenger.get(), &objecter, + &client, clog, audit_clog)); + active_mgr->background_init(new FunctionContext( + [this](int r){ + // Advertise our active-ness ASAP instead of waiting for + // next tick. + std::lock_guard l(lock); + send_beacon(); + })); + dout(1) << "I am now activating" << dendl; + } else { + dout(10) << "I was already active" << dendl; + bool need_respawn = active_mgr->got_mgr_map(map); + if (need_respawn) { + respawn(); + } + } + + if (!available_in_map && map.get_available()) { + dout(4) << "Map now says I am available" << dendl; + available_in_map = true; + } + } else if (active_mgr != nullptr) { + derr << "I was active but no longer am" << dendl; + respawn(); + } else { + if (map.active_gid != 0 && map.active_name != g_conf()->name.get_id()) { + // I am the standby and someone else is active, start modules + // in standby mode to do redirects if needed + if (!py_module_registry.is_standby_running()) { + py_module_registry.standby_start(monc, finisher); + } + } + } +} + +bool MgrStandby::ms_dispatch(Message *m) +{ + std::lock_guard l(lock); + dout(4) << state_str() << " " << *m << dendl; + + if (m->get_type() == MSG_MGR_MAP) { + handle_mgr_map(static_cast(m)); + } + bool handled = false; + if (active_mgr) { + auto am = active_mgr; + lock.Unlock(); + handled = am->ms_dispatch(m); + lock.Lock(); + } + if (m->get_type() == MSG_MGR_MAP) { + // let this pass through for mgrc + handled = false; + } + return handled; +} + + +bool MgrStandby::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) +{ + if (dest_type == CEPH_ENTITY_TYPE_MON) + return true; + + 
*authorizer = monc.build_authorizer(dest_type); + return *authorizer != NULL; +} + +bool MgrStandby::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +// A reference for use by the signal handler +static MgrStandby *signal_mgr = nullptr; + +static void handle_mgr_signal(int signum) +{ + if (signal_mgr) { + signal_mgr->handle_signal(signum); + } +} + +int MgrStandby::main(vector args) +{ + // Enable signal handlers + signal_mgr = this; + register_async_signal_handler_oneshot(SIGINT, handle_mgr_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_mgr_signal); + + client_messenger->wait(); + + // Disable signal handlers + unregister_async_signal_handler(SIGHUP, sighup_handler); + unregister_async_signal_handler(SIGINT, handle_mgr_signal); + unregister_async_signal_handler(SIGTERM, handle_mgr_signal); + shutdown_async_signal_handler(); + signal_mgr = nullptr; + + return 0; +} + + +std::string MgrStandby::state_str() +{ + if (active_mgr == nullptr) { + return "standby"; + } else if (active_mgr->is_initialized()) { + return "active"; + } else { + return "active (starting)"; + } +} + diff --git a/src/mgr/MgrStandby.h b/src/mgr/MgrStandby.h new file mode 100644 index 00000000..7adab68d --- /dev/null +++ b/src/mgr/MgrStandby.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#ifndef MGR_STANDBY_H_ +#define MGR_STANDBY_H_ + +#include "auth/Auth.h" +#include "common/Finisher.h" +#include "common/Timer.h" +#include "common/LogClient.h" + +#include "client/Client.h" +#include "mon/MonClient.h" +#include "osdc/Objecter.h" +#include "PyModuleRegistry.h" +#include "MgrClient.h" + +class MMgrMap; +class Mgr; +class PyModuleConfig; + +class MgrStandby : public Dispatcher, + public md_config_obs_t { +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override; + +protected: + MonClient monc; + std::unique_ptr client_messenger; + Objecter objecter; + Client client; + + MgrClient mgrc; + + LogClient log_client; + LogChannelRef clog, audit_clog; + + Mutex lock; + Finisher finisher; + SafeTimer timer; + + PyModuleRegistry py_module_registry; + std::shared_ptr active_mgr; + + int orig_argc; + const char **orig_argv; + + std::string state_str(); + + void handle_mgr_map(MMgrMap *m); + void _update_log_config(); + void send_beacon(); + + bool available_in_map; + +public: + MgrStandby(int argc, const char **argv); + ~MgrStandby() override; + + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override; + bool ms_handle_refused(Connection *con) override; + + int init(); + void shutdown(); + void respawn(); + int main(vector args); + void handle_signal(int signum); + void tick(); +}; + +#endif + diff --git a/src/mgr/OSDPerfMetricCollector.cc b/src/mgr/OSDPerfMetricCollector.cc new file mode 100644 index 00000000..5cd8e273 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.cc @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + 
+#include "OSDPerfMetricCollector.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.osd_perf_metric_collector " << __func__ << " " + +namespace { + +bool is_limited(const std::map> &limits) { + for (auto &it : limits) { + if (!it.second) { + return false; + } + } + return true; +} + +} // anonymous namespace + +OSDPerfMetricCollector::OSDPerfMetricCollector(Listener &listener) + : listener(listener), lock("OSDPerfMetricCollector::lock") { +} + +std::map +OSDPerfMetricCollector::get_queries() const { + std::lock_guard locker(lock); + + std::map result; + for (auto &it : queries) { + auto &query = it.first; + auto &limits = it.second; + auto result_it = result.insert({query, {}}).first; + if (is_limited(limits)) { + for (auto &iter : limits) { + result_it->second.insert(*iter.second); + } + } + } + + return result; +} + +OSDPerfMetricQueryID OSDPerfMetricCollector::add_query( + const OSDPerfMetricQuery& query, + const std::optional &limit) { + uint64_t query_id; + bool notify = false; + + { + std::lock_guard locker(lock); + + query_id = next_query_id++; + auto it = queries.find(query); + if (it == queries.end()) { + it = queries.insert({query, {}}).first; + notify = true; + } else if (is_limited(it->second)) { + notify = true; + } + it->second.insert({query_id, limit}); + counters[query_id]; + } + + dout(10) << query << " " << (limit ? 
stringify(*limit) : "unlimited") + << " query_id=" << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return query_id; +} + +int OSDPerfMetricCollector::remove_query(int query_id) { + bool found = false; + bool notify = false; + + { + std::lock_guard locker(lock); + + for (auto it = queries.begin() ; it != queries.end(); it++) { + auto iter = it->second.find(query_id); + if (iter == it->second.end()) { + continue; + } + + it->second.erase(iter); + if (it->second.empty()) { + queries.erase(it); + notify = true; + } else if (is_limited(it->second)) { + notify = true; + } + found = true; + break; + } + counters.erase(query_id); + } + + if (!found) { + dout(10) << query_id << " not found" << dendl; + return -ENOENT; + } + + dout(10) << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return 0; +} + +void OSDPerfMetricCollector::remove_all_queries() { + dout(10) << dendl; + + bool notify; + + { + std::lock_guard locker(lock); + + notify = !queries.empty(); + queries.clear(); + } + + if (notify) { + listener.handle_query_updated(); + } +} + +int OSDPerfMetricCollector::get_counters( + OSDPerfMetricQueryID query_id, + std::map *c) { + std::lock_guard locker(lock); + + auto it = counters.find(query_id); + if (it == counters.end()) { + dout(10) << "counters for " << query_id << " not found" << dendl; + return -ENOENT; + } + + *c = std::move(it->second); + it->second.clear(); + + return 0; +} + +void OSDPerfMetricCollector::process_reports( + const std::map &reports) { + + if (reports.empty()) { + return; + } + + std::lock_guard locker(lock); + + for (auto &it : reports) { + auto &query = it.first; + auto &report = it.second; + dout(10) << "report for " << query << " query: " + << report.group_packed_performance_counters.size() << " records" + << dendl; + + for (auto &it : report.group_packed_performance_counters) { + auto &key = it.first; + auto bl_it = it.second.cbegin(); + + for (auto &queries_it : 
queries[query]) { + auto query_id = queries_it.first; + auto &key_counters = counters[query_id][key]; + if (key_counters.empty()) { + key_counters.resize(query.performance_counter_descriptors.size(), + {0, 0}); + } + } + + auto desc_it = report.performance_counter_descriptors.begin(); + for (size_t i = 0; i < query.performance_counter_descriptors.size(); i++) { + if (desc_it == report.performance_counter_descriptors.end()) { + break; + } + if (*desc_it != query.performance_counter_descriptors[i]) { + continue; + } + PerformanceCounter c; + desc_it->unpack_counter(bl_it, &c); + dout(20) << "counter " << key << " " << *desc_it << ": " << c << dendl; + + for (auto &queries_it : queries[query]) { + auto query_id = queries_it.first; + auto &key_counters = counters[query_id][key]; + key_counters[i].first += c.first; + key_counters[i].second += c.second; + } + desc_it++; + } + } + } +} diff --git a/src/mgr/OSDPerfMetricCollector.h b/src/mgr/OSDPerfMetricCollector.h new file mode 100644 index 00000000..89e33091 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_COLLECTOR_H_ +#define OSD_PERF_METRIC_COLLECTOR_H_ + +#include "common/Mutex.h" + +#include "mgr/OSDPerfMetricTypes.h" + +#include + +/** + * OSD performance query class. 
+ */ +class OSDPerfMetricCollector { +public: + struct Listener { + virtual ~Listener() { + } + + virtual void handle_query_updated() = 0; + }; + + OSDPerfMetricCollector(Listener &listener); + + std::map get_queries() const; + + OSDPerfMetricQueryID add_query( + const OSDPerfMetricQuery& query, + const std::optional &limit); + int remove_query(OSDPerfMetricQueryID query_id); + void remove_all_queries(); + + int get_counters(OSDPerfMetricQueryID query_id, + std::map *counters); + + void process_reports( + const std::map &reports); + +private: + typedef std::optional OptionalLimit; + typedef std::map> Queries; + typedef std::map> Counters; + + Listener &listener; + mutable Mutex lock; + OSDPerfMetricQueryID next_query_id = 0; + Queries queries; + Counters counters; +}; + +#endif // OSD_PERF_METRIC_COLLECTOR_H_ diff --git a/src/mgr/OSDPerfMetricTypes.cc b/src/mgr/OSDPerfMetricTypes.cc new file mode 100644 index 00000000..06f7081d --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.cc @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/OSDPerfMetricTypes.h" + +#include + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d) { + switch(d.type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + os << "client_id"; + break; + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + os << "client_address"; + break; + case OSDPerfMetricSubKeyType::POOL_ID: + os << "pool_id"; + break; + case OSDPerfMetricSubKeyType::NAMESPACE: + os << "namespace"; + break; + case OSDPerfMetricSubKeyType::OSD_ID: + os << "osd_id"; + break; + case OSDPerfMetricSubKeyType::PG_ID: + os << "pg_id"; + break; + case OSDPerfMetricSubKeyType::OBJECT_NAME: + os << "object_name"; + break; + case OSDPerfMetricSubKeyType::SNAP_ID: + os << "snap_id"; + break; + default: + os << "unknown (" << static_cast(d.type) << ")"; + } + return os << "~/" << d.regex_str << "/"; +} + +void 
PerformanceCounterDescriptor::pack_counter(const PerformanceCounter &c, + bufferlist *bl) const { + using ceph::encode; + encode(c.first, *bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + encode(c.second, *bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +void PerformanceCounterDescriptor::unpack_counter( + bufferlist::const_iterator& bl, PerformanceCounter *c) const { + using ceph::decode; + decode(c->first, bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + decode(c->second, bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d) { + switch(d.type) { + case PerformanceCounterType::OPS: + return os << "ops"; + case PerformanceCounterType::WRITE_OPS: + return os << "write ops"; + case PerformanceCounterType::READ_OPS: + return os << "read ops"; + case PerformanceCounterType::BYTES: + return os << "bytes"; + case PerformanceCounterType::WRITE_BYTES: + return os << "write bytes"; + case PerformanceCounterType::READ_BYTES: + return os << "read bytes"; + case PerformanceCounterType::LATENCY: + return os << "latency"; + case PerformanceCounterType::WRITE_LATENCY: + return os << "write latency"; + case PerformanceCounterType::READ_LATENCY: 
+ return os << "read latency"; + default: + return os << "unknown (" << static_cast(d.type) << ")"; + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit) { + return os << "{order_by=" << limit.order_by << ", max_count=" + << limit.max_count << "}"; +} + +void OSDPerfMetricQuery::pack_counters(const PerformanceCounters &counters, + bufferlist *bl) const { + auto it = counters.begin(); + for (auto &descriptor : performance_counter_descriptors) { + if (it == counters.end()) { + descriptor.pack_counter(PerformanceCounter(), bl); + } else { + descriptor.pack_counter(*it, bl); + it++; + } + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query) { + return os << "{key=" << query.key_descriptor << ", counters=" + << query.performance_counter_descriptors << "}"; +} diff --git a/src/mgr/OSDPerfMetricTypes.h b/src/mgr/OSDPerfMetricTypes.h new file mode 100644 index 00000000..b40c3de8 --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.h @@ -0,0 +1,355 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_H_ +#define OSD_PERF_METRIC_H_ + +#include "include/denc.h" +#include "include/stringify.h" + +#include + +typedef std::vector OSDPerfMetricSubKey; // array of regex match +typedef std::vector OSDPerfMetricKey; + +enum class OSDPerfMetricSubKeyType : uint8_t { + CLIENT_ID = 0, + CLIENT_ADDRESS = 1, + POOL_ID = 2, + NAMESPACE = 3, + OSD_ID = 4, + PG_ID = 5, + OBJECT_NAME = 6, + SNAP_ID = 7, +}; + +struct OSDPerfMetricSubKeyDescriptor { + OSDPerfMetricSubKeyType type = static_cast(-1); + std::string regex_str; + std::regex regex; + + bool is_supported() const { + switch (type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + case OSDPerfMetricSubKeyType::POOL_ID: + case OSDPerfMetricSubKeyType::NAMESPACE: + case OSDPerfMetricSubKeyType::OSD_ID: + case OSDPerfMetricSubKeyType::PG_ID: + case 
OSDPerfMetricSubKeyType::OBJECT_NAME: + case OSDPerfMetricSubKeyType::SNAP_ID: + return true; + default: + return false; + } + } + + OSDPerfMetricSubKeyDescriptor() { + } + + OSDPerfMetricSubKeyDescriptor(OSDPerfMetricSubKeyType type, + const std::string regex) + : type(type), regex_str(regex) { + } + + bool operator<(const OSDPerfMetricSubKeyDescriptor &other) const { + if (type < other.type) { + return true; + } + if (type > other.type) { + return false; + } + return regex_str < other.regex_str; + } + + DENC(OSDPerfMetricSubKeyDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.regex_str, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricSubKeyDescriptor) + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d); + +typedef std::vector OSDPerfMetricKeyDescriptor; + +template<> +struct denc_traits { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const OSDPerfMetricKeyDescriptor& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const OSDPerfMetricKeyDescriptor& v, + bufferlist::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(OSDPerfMetricKeyDescriptor& v, + bufferptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + OSDPerfMetricSubKeyDescriptor d; + denc(d, p); + if (!d.is_supported()) { + v.clear(); + return; + } + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + v.clear(); + return; + } + if (d.regex.mark_count() == 0) { + v.clear(); + return; + } + v.push_back(std::move(d)); + } + } +}; + +typedef std::pair PerformanceCounter; +typedef std::vector 
PerformanceCounters; + +enum class PerformanceCounterType : uint8_t { + OPS = 0, + WRITE_OPS = 1, + READ_OPS = 2, + BYTES = 3, + WRITE_BYTES = 4, + READ_BYTES = 5, + LATENCY = 6, + WRITE_LATENCY = 7, + READ_LATENCY = 8, +}; + +struct PerformanceCounterDescriptor { + PerformanceCounterType type = static_cast(-1); + + bool is_supported() const { + switch (type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + return true; + default: + return false; + } + } + + PerformanceCounterDescriptor() { + } + + PerformanceCounterDescriptor(PerformanceCounterType type) : type(type) { + } + + bool operator<(const PerformanceCounterDescriptor &other) const { + return type < other.type; + } + + bool operator==(const PerformanceCounterDescriptor &other) const { + return type == other.type; + } + + bool operator!=(const PerformanceCounterDescriptor &other) const { + return type != other.type; + } + + DENC(PerformanceCounterDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + DENC_FINISH(p); + } + + void pack_counter(const PerformanceCounter &c, bufferlist *bl) const; + void unpack_counter(bufferlist::const_iterator& bl, + PerformanceCounter *c) const; +}; +WRITE_CLASS_DENC(PerformanceCounterDescriptor) + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d); + +typedef std::vector PerformanceCounterDescriptors; + +template<> +struct denc_traits { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const PerformanceCounterDescriptors& v, size_t& p) { + p += sizeof(uint32_t); 
+ const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const PerformanceCounterDescriptors& v, + bufferlist::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(PerformanceCounterDescriptors& v, + bufferptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + PerformanceCounterDescriptor d; + denc(d, p); + if (d.is_supported()) { + v.push_back(std::move(d)); + } + } + } +}; + +struct OSDPerfMetricLimit { + PerformanceCounterDescriptor order_by; + uint64_t max_count = 0; + + OSDPerfMetricLimit() { + } + + OSDPerfMetricLimit(const PerformanceCounterDescriptor &order_by, + uint64_t max_count) + : order_by(order_by), max_count(max_count) { + } + + bool operator<(const OSDPerfMetricLimit &other) const { + if (order_by != other.order_by) { + return order_by < other.order_by; + } + return max_count < other.max_count; + } + + DENC(OSDPerfMetricLimit, v, p) { + DENC_START(1, 1, p); + denc(v.order_by, p); + denc(v.max_count, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricLimit) + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit); + +typedef std::set OSDPerfMetricLimits; + +typedef int OSDPerfMetricQueryID; + +struct OSDPerfMetricQuery { + bool operator<(const OSDPerfMetricQuery &other) const { + if (key_descriptor < other.key_descriptor) { + return true; + } + if (key_descriptor > other.key_descriptor) { + return false; + } + return (performance_counter_descriptors < + other.performance_counter_descriptors); + } + + OSDPerfMetricQuery() { + } + + OSDPerfMetricQuery( + const OSDPerfMetricKeyDescriptor &key_descriptor, + const PerformanceCounterDescriptors &performance_counter_descriptors) + : key_descriptor(key_descriptor), + performance_counter_descriptors(performance_counter_descriptors) { + } + + template + bool 
get_key(L&& get_sub_key, OSDPerfMetricKey *key) const { + for (auto &sub_key_descriptor : key_descriptor) { + OSDPerfMetricSubKey sub_key; + if (!get_sub_key(sub_key_descriptor, &sub_key)) { + return false; + } + key->push_back(sub_key); + } + return true; + } + + DENC(OSDPerfMetricQuery, v, p) { + DENC_START(1, 1, p); + denc(v.key_descriptor, p); + denc(v.performance_counter_descriptors, p); + DENC_FINISH(p); + } + + void get_performance_counter_descriptors( + PerformanceCounterDescriptors *descriptors) const { + *descriptors = performance_counter_descriptors; + } + + template + void update_counters(L &&update_counter, + PerformanceCounters *counters) const { + auto it = counters->begin(); + for (auto &descriptor : performance_counter_descriptors) { + // TODO: optimize + if (it == counters->end()) { + counters->push_back(PerformanceCounter()); + it = std::prev(counters->end()); + } + update_counter(descriptor, &(*it)); + it++; + } + } + + void pack_counters(const PerformanceCounters &counters, bufferlist *bl) const; + + OSDPerfMetricKeyDescriptor key_descriptor; + PerformanceCounterDescriptors performance_counter_descriptors; +}; +WRITE_CLASS_DENC(OSDPerfMetricQuery) + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query); + +struct OSDPerfMetricReport { + PerformanceCounterDescriptors performance_counter_descriptors; + std::map group_packed_performance_counters; + + DENC(OSDPerfMetricReport, v, p) { + DENC_START(1, 1, p); + denc(v.performance_counter_descriptors, p); + denc(v.group_packed_performance_counters, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricReport) + +#endif // OSD_PERF_METRIC_H_ + diff --git a/src/mgr/PyFormatter.cc b/src/mgr/PyFormatter.cc new file mode 100644 index 00000000..a9067473 --- /dev/null +++ b/src/mgr/PyFormatter.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright 
(C) 2015 Red Hat Inc
+ *
+ * Author: John Spray
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "PyFormatter.h"
+
+// Size of the stack buffer used by dump_format_va(); longer formatted
+// output is truncated by vsnprintf().
+#define LARGE_SIZE 1024
+
+
+/**
+ * Open a nested list.  The new list is stored in the current container
+ * under `name` and becomes the cursor; the previous cursor is pushed on
+ * `stack`.
+ */
+void PyFormatter::open_array_section(const char *name)
+{
+  PyObject *list = PyList_New(0);
+  // dump_pyobject() drops our reference; the parent container keeps the
+  // list alive while it is used as the cursor.
+  dump_pyobject(name, list);
+  stack.push(cursor);
+  cursor = list;
+}
+
+/**
+ * Open a nested dict.  The new dict is stored in the current container
+ * under `name` and becomes the cursor; the previous cursor is pushed on
+ * `stack`.
+ */
+void PyFormatter::open_object_section(const char *name)
+{
+  PyObject *dict = PyDict_New();
+  // dump_pyobject() drops our reference; the parent container keeps the
+  // dict alive while it is used as the cursor.
+  dump_pyobject(name, dict);
+  stack.push(cursor);
+  cursor = dict;
+}
+
+/**
+ * Store an unsigned 64-bit value as a Python integer.
+ */
+void PyFormatter::dump_unsigned(const char *name, uint64_t u)
+{
+  // `unsigned long` may be only 32 bits wide, so use the
+  // `unsigned long long` constructor to keep the full 64-bit value.
+  PyObject *p = PyLong_FromUnsignedLongLong(u);
+  ceph_assert(p);
+  dump_pyobject(name, p);
+}
+
+/**
+ * Store a signed 64-bit value as a Python integer.
+ */
+void PyFormatter::dump_int(const char *name, int64_t u)
+{
+  PyObject *p = PyLong_FromLongLong(u);
+  ceph_assert(p);
+  dump_pyobject(name, p);
+}
+
+/**
+ * Store a double as a Python float.
+ */
+void PyFormatter::dump_float(const char *name, double d)
+{
+  dump_pyobject(name, PyFloat_FromDouble(d));
+}
+
+/**
+ * Store a string value.
+ */
+void PyFormatter::dump_string(const char *name, std::string_view s)
+{
+  // A string_view is not guaranteed to be NUL-terminated, so hand the
+  // C-string API a terminated copy instead of s.data().
+  const std::string owned(s);
+  dump_pyobject(name, PyString_FromString(owned.c_str()));
+}
+
+/**
+ * Store a boolean as Py_True / Py_False.
+ */
+void PyFormatter::dump_bool(const char *name, bool b)
+{
+  // dump_pyobject() consumes one reference, so take one on the singleton
+  // before handing it over.
+  PyObject *flag = b ? Py_True : Py_False;
+  Py_INCREF(flag);
+  dump_pyobject(name, flag);
+}
+
+std::ostream& PyFormatter::dump_stream(const char *name)
+{
+  // Give the caller an ostream, construct a PyString,
+  // and remember the association between the two.  On flush,
+  // we'll read from the ostream into the PyString
+  auto ps = std::make_shared<PendingStream>();
+  ps->cursor = cursor;
+  ps->name = name;
+
+  pending_streams.push_back(ps);
+
+  return ps->stream;
+}
+
+/**
+ * Format `fmt`/`ap` printf-style into a fixed LARGE_SIZE buffer and store
+ * the result as a string.  `ns` and `quoted` are not used here.
+ */
+void PyFormatter::dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+  vsnprintf(buf, sizeof(buf), fmt, ap);
+
+  dump_pyobject(name, PyString_FromString(buf));
+}
+
+/**
+ * Steals reference to `p`
+ *
+ * Appends `p` when the cursor is a list, or stores it under `name` when
+ * the cursor is a dict; any other cursor type aborts.
+ */
+void PyFormatter::dump_pyobject(const char *name, PyObject *p)
+{
+  // Several callers pass the result of a Python constructor straight in;
+  // stop here rather than calling Py_DECREF on a null pointer below.
+  ceph_assert(p != nullptr);
+
+  if (PyList_Check(cursor)) {
+    PyList_Append(cursor, p);
+    Py_DECREF(p);
+  } else if (PyDict_Check(cursor)) {
+    PyObject *key = PyString_FromString(name);
+    PyDict_SetItem(cursor, key, p);
+    Py_DECREF(key);
+    Py_DECREF(p);
+  } else {
+    ceph_abort();
+  }
+}
+
+/**
+ * Convert every stream handed out by dump_stream() into a string stored
+ * in the container that was the cursor when the stream was requested,
+ * then forget the streams.
+ */
+void PyFormatter::finish_pending_streams()
+{
+  for (const auto &i : pending_streams) {
+    // Temporarily point the cursor at the stream's container.
+    PyObject *tmp_cur = cursor;
+    cursor = i->cursor;
+    dump_pyobject(
+        i->name.c_str(),
+        PyString_FromString(i->stream.str().c_str()));
+    cursor = tmp_cur;
+  }
+
+  pending_streams.clear();
+}
+
diff --git a/src/mgr/PyFormatter.h b/src/mgr/PyFormatter.h
new file mode 100644
index 00000000..58fead17
--- /dev/null
+++ b/src/mgr/PyFormatter.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat Inc
+ *
+ * Author: John Spray
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PY_FORMATTER_H_
+#define PY_FORMATTER_H_
+
+// Python.h comes first because otherwise it clobbers ceph's assert
+#include "PythonCompat.h"
+
+#include <stack>
+#include <memory>
+#include <list>
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+
+/**
+ * A ceph::Formatter that builds a tree of Python objects (dicts, lists and
+ * scalars) instead of serializing to text.  The result is fetched with
+ * get().
+ */
+class PyFormatter : public ceph::Formatter
+{
+public:
+  PyFormatter (const PyFormatter&) = delete;
+  PyFormatter& operator= (const PyFormatter&) = delete;
+
+  /**
+   * @param pretty not used by this constructor
+   * @param array  when true the root object is a list, otherwise a dict
+   */
+  PyFormatter(bool pretty = false, bool array = false)
+  {
+    // It is forbidden to instantiate me outside of the GIL,
+    // because I construct python objects right away
+
+    // Initialise cursor to an empty dict (or an empty list for `array`)
+    if (!array) {
+      root = cursor = PyDict_New();
+    } else {
+      root = cursor = PyList_New(0);
+    }
+  }
+
+  ~PyFormatter() override
+  {
+    cursor = nullptr;
+    Py_DECREF(root);
+    root = nullptr;
+  }
+
+  // Obscure, don't care.
+  void open_array_section_in_ns(const char *name, const char *ns) override
+  {ceph_abort();}
+  void open_object_section_in_ns(const char *name, const char *ns) override
+  {ceph_abort();}
+
+  /**
+   * Drop everything built so far and start again with an empty root of
+   * the same kind (list or dict) as before.
+   */
+  void reset() override
+  {
+    const bool array = PyList_Check(root);
+    // Saved cursors and pending streams point into the tree released
+    // below, so forget them before they can be used again.
+    stack = std::stack<PyObject *>();
+    pending_streams.clear();
+    Py_DECREF(root);
+    if (array) {
+      root = cursor = PyList_New(0);
+    } else {
+      root = cursor = PyDict_New();
+    }
+  }
+
+  void set_status(int status, const char* status_name) override {}
+  void output_header() override {};
+  void output_footer() override {};
+  void enable_line_break() override {};
+
+  void open_array_section(const char *name) override;
+  void open_object_section(const char *name) override;
+
+  /**
+   * Leave the section opened last: restore the cursor saved by the
+   * matching open_*_section() call.
+   */
+  void close_section() override
+  {
+    ceph_assert(cursor != root);
+    ceph_assert(!stack.empty());
+    cursor = stack.top();
+    stack.pop();
+  }
+  void dump_bool(const char *name, bool b) override;
+  void dump_unsigned(const char *name, uint64_t u) override;
+  void dump_int(const char *name, int64_t u) override;
+  void dump_float(const char *name, double d) override;
+  void dump_string(const char *name, std::string_view s) override;
+  std::ostream& dump_stream(const char *name) override;
+  void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+
+  void flush(std::ostream& os) override
+  {
+      // This class is not a serializer: this doesn't make sense
+      ceph_abort();
+  }
+
+  int get_len() const override
+  {
+      // This class is not a serializer: this doesn't make sense
+      ceph_abort();
+      return 0;
+  }
+
+  void write_raw_data(const char *data) override
+  {
+      // This class is not a serializer: this doesn't make sense
+      ceph_abort();
+  }
+
+  /**
+   * Finish any pending streams and return the root object.
+   *
+   * @return a new reference to the root; the caller must release it
+   */
+  PyObject *get()
+  {
+    finish_pending_streams();
+
+    Py_INCREF(root);
+    return root;
+  }
+
+  void finish_pending_streams();
+
+private:
+  // Top-level container; this object holds one reference to it.
+  PyObject *root;
+  // Container that dump_*() calls currently write into.
+  PyObject *cursor;
+  // Cursors saved by open_*_section(), restored by close_section().
+  std::stack<PyObject *> stack;
+
+  void dump_pyobject(const char *name, PyObject *p);
+
+  // A stream handed out by dump_stream(), plus where its text must be
+  // stored once finish_pending_streams() runs.
+  class PendingStream {
+    public:
+    PyObject *cursor;
+    std::string name;
+    std::stringstream stream;
+  };
+
+  std::list<std::shared_ptr<PendingStream> > pending_streams;
+
+};
+
+#endif
+
diff --git a/src/mgr/PyModule.cc b/src/mgr/PyModule.cc
new file mode 100644
index 00000000..8c8859ae
--- /dev/null
+++ b/src/mgr/PyModule.cc
@@ -0,0 +1,726 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "BaseMgrModule.h"
+#include "BaseMgrStandbyModule.h"
+#include "PyOSDMap.h"
+#include "MgrContext.h"
+
+#include "PyModule.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mgr
+
+#undef dout_prefix
+#define dout_prefix *_dout << "mgr[py] "
+
+// definition for non-const static member
+std::string PyModule::config_prefix = "mgr/";
+
+// Courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text
+#include <boost/python.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include "include/ceph_assert.h"  // boost clobbers this
+// decode a Python exception into a string
+std::string handle_pyerror()
+{
+    using namespace boost::python;
+    using namespace boost;
+
+    PyObject *exc, *val, *tb;
+    object formatted_list, formatted;
+    PyErr_Fetch(&exc, &val, &tb);
+    PyErr_NormalizeException(&exc, &val, &tb);
+    handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb));
+    object traceback(import("traceback"));
+
+    // Fallback for when the traceback module itself raises while formatting:
+    // error while processing exception object,
+    // returning only the exception string value
+    auto describe_unformattable = [exc, val]() {
+      PyObject *name_attr = PyObject_GetAttrString(exc, "__name__");
+      // Either conversion may yield a null pointer; never stream one.
+      const char *type_name =
+        name_attr ? PyString_AsString(name_attr) : nullptr;
+      const char *value_text = val ? PyString_AsString(val) : nullptr;
+      std::stringstream ss;
+      ss << (type_name ? type_name : "<unknown>") << ": "
+         << (value_text ? value_text : "<unprintable>");
+      Py_XDECREF(name_attr);
+      ss << "\nError processing exception object: " << peek_pyerror();
+      return ss.str();
+    };
+
+    if (!tb) {
+        object format_exception_only(traceback.attr("format_exception_only"));
+        try {
+          formatted_list = format_exception_only(hexc, hval);
+        } catch (error_already_set const &) {
+          return describe_unformattable();
+        }
+    } else {
+        object format_exception(traceback.attr("format_exception"));
+        try {
+          formatted_list = format_exception(hexc, hval, htb);
+        } catch (error_already_set const &) {
+          return describe_unformattable();
+        }
+    }
+    formatted = str("").join(formatted_list);
+    return extract<std::string>(formatted);
+}
+
+/**
+ * Get the single-line exception message, without clearing any
+ * exception state.
+ */
+std::string peek_pyerror()
+{
+  PyObject *ptype, *pvalue, *ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  ceph_assert(ptype);
+  ceph_assert(pvalue);
+  PyObject *pvalue_str = PyObject_Str(pvalue);
+  // Both steps can fail and yield a null pointer; fall back to a
+  // placeholder instead of building a std::string from it.
+  const char *msg = pvalue_str ? PyString_AsString(pvalue_str) : nullptr;
+  std::string exc_msg = msg ? msg : "<unprintable exception>";
+  Py_XDECREF(pvalue_str);
+  // Put the fetched exception back, replacing anything raised above.
+  PyErr_Restore(ptype, pvalue, ptraceback);
+
+  return exc_msg;
+}
+
+
+namespace {
+  // `write` method of the ceph_logger module: forwards one message to the
+  // mgr debug log at level 4, without its trailing newline.
+  PyObject* log_write(PyObject*, PyObject* args) {
+    char* m = nullptr;
+    if (!PyArg_ParseTuple(args, "s", &m)) {
+      // PyArg_ParseTuple has set an exception; report it to the caller
+      // instead of returning None with an error pending.
+      return nullptr;
+    }
+    // The buffer returned for "s" belongs to the Python string object and
+    // must not be modified, so trim the newline on a private copy.
+    std::string line(m);
+    if (!line.empty() && line.back() == '\n') {
+      line.pop_back();
+    }
+    dout(4) << line << dendl;
+    Py_RETURN_NONE;
+  }
+
+  // `flush` method of the ceph_logger module: nothing to do.
+  PyObject* log_flush(PyObject*, PyObject*){
+    Py_RETURN_NONE;
+  }
+
+  static PyMethodDef log_methods[] = {
+    {"write", log_write, METH_VARARGS, "write stdout and stderr"},
+    {"flush", log_flush, METH_VARARGS, "flush"},
+    {nullptr, nullptr, 0, nullptr}
+  };
+
+#if PY_MAJOR_VERSION >= 3
+  static PyModuleDef ceph_logger_module = {
+    PyModuleDef_HEAD_INIT,
+    "ceph_logger",
+    nullptr,
+    -1,
+    log_methods,
+  };
+#endif
+}
+
+PyModuleConfig::PyModuleConfig() = default;
+
+PyModuleConfig::PyModuleConfig(PyModuleConfig &mconfig)
+  : config(mconfig.config)
+{}
+
+PyModuleConfig::~PyModuleConfig() = default;
+
+
+/**
+ * Set (when `val` holds a value) or remove (when it is empty) the option
+ * `mgr/<module_name>/<key>` through a mon command, wait for the command to
+ * finish, and mirror the change into `config` only if it succeeded.
+ */
+void PyModuleConfig::set_config(
+    MonClient *monc,
+    const std::string &module_name,
+    const std::string &key, const boost::optional<std::string>& val)
+{
+  const std::string global_key = PyModule::config_prefix
+                                   + module_name + "/" + key;
+  Command set_cmd;
+  {
+    std::ostringstream cmd_json;
+    JSONFormatter jf;
+    jf.open_object_section("cmd");
+    if (val) {
+      jf.dump_string("prefix", "config set");
+      jf.dump_string("value", *val);
+    } else {
+      jf.dump_string("prefix", "config rm");
+    }
+    jf.dump_string("who", "mgr");
+    jf.dump_string("name", global_key);
+    jf.close_section();
+    jf.flush(cmd_json);
+    set_cmd.run(monc, cmd_json.str());
+  }
+  set_cmd.wait();
+
+  if (set_cmd.r == 0) {
+    std::lock_guard l(lock);
+    if (val) {
+      config[global_key] = *val;
+    } else {
+      config.erase(global_key);
+    }
+  } else {
+    if (val) {
+      // Log the value itself, not the optional wrapper.
+      dout(0) << "`config set mgr " << global_key << " " << *val << "` failed: "
+        << cpp_strerror(set_cmd.r) << dendl;
+    } else {
+      dout(0) << "`config rm mgr " << global_key << "` failed: "
+        << cpp_strerror(set_cmd.r) << dendl;
+    }
+    dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+  }
+}
+
+/**
+ * Build a ':'-separated list of site-packages directories by asking the
+ * interpreter's `site` module.
+ */
+std::string PyModule::get_site_packages()
+{
+  std::stringstream site_packages;
+
+  // CPython doesn't auto-add site-packages dirs to sys.path for us,
+  // but it does provide a module that we can ask for them.
+  auto site_module = PyImport_ImportModule("site");
+  ceph_assert(site_module);
+
+  auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages");
+  if (site_packages_fn != nullptr) {
+    auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr);
+    ceph_assert(site_packages_list);
+
+    auto n = PyList_Size(site_packages_list);
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      if (i != 0) {
+        site_packages << ":";
+      }
+      site_packages << PyString_AsString(PyList_GetItem(site_packages_list, i));
+    }
+
+    Py_DECREF(site_packages_list);
+    Py_DECREF(site_packages_fn);
+  } else {
+    // Fall back to generating our own site-packages paths by imitating
+    // what the standard site.py does.  This is annoying but it lets us
+    // run inside virtualenvs :-/
+
+    // The failed attribute lookup left an exception pending; clear it
+    // before making further calls into the interpreter.
+    PyErr_Clear();
+
+    auto add_site_packages_fn =
+      PyObject_GetAttrString(site_module, "addsitepackages");
+    ceph_assert(add_site_packages_fn);
+
+    auto known_paths = PySet_New(nullptr);
+    auto pArgs = PyTuple_Pack(1, known_paths);
+    // Only the side effect on sys.path is wanted; release the result.
+    auto add_result = PyObject_CallObject(add_site_packages_fn, pArgs);
+    Py_XDECREF(add_result);
+    Py_DECREF(pArgs);
+    Py_DECREF(known_paths);
+    Py_DECREF(add_site_packages_fn);
+
+    auto sys_module = PyImport_ImportModule("sys");
+    ceph_assert(sys_module);
+    auto sys_path = PyObject_GetAttrString(sys_module, "path");
+    ceph_assert(sys_path);
+
+    dout(1) << "sys.path:" << dendl;
+    auto n = PyList_Size(sys_path);
+    bool first = true;
+    for (Py_ssize_t i = 0; i < n; ++i) {
+      dout(1) << "  " << PyString_AsString(PyList_GetItem(sys_path, i)) << dendl;
+      if (first) {
+        first = false;
+      } else {
+        site_packages << ":";
+      }
+      site_packages << PyString_AsString(PyList_GetItem(sys_path, i));
+    }
+
+    Py_DECREF(sys_path);
+    Py_DECREF(sys_module);
+  }
+
+  Py_DECREF(site_module);
+
+  return site_packages.str();
+}
+
+// Create the ceph_logger module and install it as sys.stderr and
+// sys.stdout.
+#if PY_MAJOR_VERSION >= 3
+PyObject* PyModule::init_ceph_logger()
+{
+  auto py_logger = PyModule_Create(&ceph_logger_module);
+  PySys_SetObject("stderr", py_logger);
+  PySys_SetObject("stdout", py_logger);
+  return py_logger;
+}
+#else
+void PyModule::init_ceph_logger()
+{
+  auto py_logger = Py_InitModule("ceph_logger", log_methods);
+  PySys_SetObject(const_cast<char*>("stderr"), py_logger);
+  PySys_SetObject(const_cast<char*>("stdout"), py_logger);
+}
+#endif
+
+// Create the ceph_module module and register the C++-backed base classes
+// in it.
+#if PY_MAJOR_VERSION >= 3
+PyObject* PyModule::init_ceph_module()
+#else
+void PyModule::init_ceph_module()
+#endif
+{
+  static PyMethodDef module_methods[] = {
+    {nullptr, nullptr, 0, nullptr}
+  };
+#if PY_MAJOR_VERSION >= 3
+  static PyModuleDef ceph_module_def = {
+    PyModuleDef_HEAD_INIT,
+    "ceph_module",
+    nullptr,
+    -1,
+    module_methods,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr
+  };
+  PyObject *ceph_module = PyModule_Create(&ceph_module_def);
+#else
+  PyObject *ceph_module = Py_InitModule("ceph_module", module_methods);
+#endif
+  ceph_assert(ceph_module != nullptr);
+  std::map<const char*, PyTypeObject*> classes{
+    {{"BaseMgrModule", &BaseMgrModuleType},
+     {"BaseMgrStandbyModule", &BaseMgrStandbyModuleType},
+     {"BasePyOSDMap", &BasePyOSDMapType},
+     {"BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType},
+     {"BasePyCRUSH", &BasePyCRUSHType}}
+  };
+  for (auto [name, type] : classes) {
+    type->tp_new = PyType_GenericNew;
+    if (PyType_Ready(type) < 0) {
+      ceph_abort();
+    }
+    Py_INCREF(type);
+
+    PyModule_AddObject(ceph_module, name, (PyObject *)type);
+  }
+#if PY_MAJOR_VERSION >= 3
+  return ceph_module;
+#endif
+}
+
+/**
+ * Create a sub-interpreter for this module, import the module and read its
+ * MgrModule subclass, COMMANDS, MODULE_OPTIONS and can_run() result.
+ *
+ * @param pMainThreadState thread state of the main interpreter
+ * @return 0 on success, negative errno otherwise
+ */
+int PyModule::load(PyThreadState *pMainThreadState)
+{
+  ceph_assert(pMainThreadState != nullptr);
+
+  // Configure sub-interpreter
+  {
+    SafeThreadState sts(pMainThreadState);
+    Gil gil(sts);
+
+    auto thread_state = Py_NewInterpreter();
+    if (thread_state == nullptr) {
+      derr << "Failed to create python sub-interpreter for '" << module_name << "'" << dendl;
+      return -EINVAL;
+    } else {
+      pMyThreadState.set(thread_state);
+      // Some python modules do not cope with an unpopulated argv, so lets
+      // fake one.  This step also picks up site-packages into sys.path.
+#if PY_MAJOR_VERSION >= 3
+      const wchar_t *argv[] = {L"ceph-mgr"};
+      PySys_SetArgv(1, (wchar_t**)argv);
+#else
+      const char *argv[] = {"ceph-mgr"};
+      PySys_SetArgv(1, (char**)argv);
+#endif
+      // Configure sys.path to include mgr_module_path
+      string paths = (":" + g_conf().get_val<std::string>("mgr_module_path") +
+                      ":" + get_site_packages());
+#if PY_MAJOR_VERSION >= 3
+      wstring sys_path(Py_GetPath() + wstring(begin(paths), end(paths)));
+      PySys_SetPath(const_cast<wchar_t*>(sys_path.c_str()));
+      dout(10) << "Computed sys.path '"
+               << string(begin(sys_path), end(sys_path)) << "'" << dendl;
+#else
+      string sys_path(Py_GetPath() + paths);
+      PySys_SetPath(const_cast<char*>(sys_path.c_str()));
+      dout(10) << "Computed sys.path '" << sys_path << "'" << dendl;
+#endif
+    }
+  }
+  // Environment is all good, import the external module
+  {
+    Gil gil(pMyThreadState);
+
+    int r;
+    r = load_subclass_of("MgrModule", &pClass);
+    if (r) {
+      derr << "Class not found in module '" << module_name << "'" << dendl;
+      return r;
+    }
+
+    r = load_commands();
+    if (r != 0) {
+      derr << "Missing or invalid COMMANDS attribute in module '"
+          << module_name << "'" << dendl;
+      error_string = "Missing or invalid COMMANDS attribute";
+      return r;
+    }
+
+    r = load_options();
+    if (r != 0) {
+      derr << "Missing or invalid MODULE_OPTIONS attribute in module '"
+          << module_name << "'" << dendl;
+      error_string = "Missing or invalid MODULE_OPTIONS attribute";
+      return r;
+    }
+
+    // We've imported the module and found a MgrModule subclass, at this
+    // point the module is considered loaded.  It might still not be
+    // runnable though, can_run populated later...
+    loaded = true;
+
+    r = load_subclass_of("MgrStandbyModule", &pStandbyClass);
+    if (!r) {
+      dout(4) << "Standby mode available in module '" << module_name
+              << "'" << dendl;
+    } else {
+      dout(4) << "Standby mode not provided by module '" << module_name
+              << "'" << dendl;
+    }
+
+    // Populate can_run by interrogating the module's callback that
+    // may check for dependencies etc
+    PyObject *pCanRunTuple = PyObject_CallMethod(pClass,
+      const_cast<char*>("can_run"), const_cast<char*>("()"));
+    if (pCanRunTuple != nullptr) {
+      if (PyTuple_Check(pCanRunTuple) && PyTuple_Size(pCanRunTuple) == 2) {
+        PyObject *pCanRun = PyTuple_GetItem(pCanRunTuple, 0);
+        PyObject *can_run_str = PyTuple_GetItem(pCanRunTuple, 1);
+        if (!PyBool_Check(pCanRun) || !PyString_Check(can_run_str)) {
+          derr << "Module " << get_name()
+               << " returned wrong type in can_run" << dendl;
+          error_string = "wrong type returned from can_run";
+          can_run = false;
+        } else {
+          can_run = (pCanRun == Py_True);
+          if (!can_run) {
+            error_string = PyString_AsString(can_run_str);
+            dout(4) << "Module " << get_name()
+                    << " reported that it cannot run: "
+                    << error_string << dendl;
+          }
+        }
+      } else {
+        derr << "Module " << get_name()
+             << " returned wrong type in can_run" << dendl;
+        error_string = "wrong type returned from can_run";
+        can_run = false;
+      }
+
+      Py_DECREF(pCanRunTuple);
+    } else {
+      derr << "Exception calling can_run on " << get_name() << dendl;
+      derr << handle_pyerror() << dendl;
+      can_run = false;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Look up the class attribute `attr_name`, which must be a list of dicts,
+ * and call `fn` on each entry until one returns non-zero.
+ *
+ * @return 0 if every call returned 0, the first non-zero result of `fn`,
+ *         or -EINVAL if the attribute is missing, is not a list, or
+ *         contains a non-dict entry
+ */
+int PyModule::walk_dict_list(
+    const std::string &attr_name,
+    std::function<int(PyObject*)> fn)
+{
+  PyObject *command_list = PyObject_GetAttrString(pClass, attr_name.c_str());
+  if (command_list == nullptr) {
+    derr << "Module " << get_name() << " has missing " << attr_name
+         << " member" << dendl;
+    return -EINVAL;
+  }
+  if (!PyObject_TypeCheck(command_list, &PyList_Type)) {
+    // Relatively easy mistake for human to make, e.g. defining COMMANDS
+    // as a {} instead of a []
+    derr << "Module " << get_name() << " has " << attr_name
+         << " member of wrong type (should be a list)" << dendl;
+    // We own the reference returned by PyObject_GetAttrString.
+    Py_DECREF(command_list);
+    return -EINVAL;
+  }
+
+  // Invoke fn on each item in the list
+  int r = 0;
+  const size_t list_size = PyList_Size(command_list);
+  for (size_t i = 0; i < list_size; ++i) {
+    PyObject *command = PyList_GetItem(command_list, i);
+    ceph_assert(command != nullptr);
+
+    if (!PyDict_Check(command)) {
+      derr << "Module " << get_name() << " has non-dict entry "
+          << "in " << attr_name << " list" << dendl;
+      // Leave through the common exit so the list reference is released.
+      r = -EINVAL;
+      break;
+    }
+
+    r = fn(command);
+    if (r != 0) {
+      break;
+    }
+  }
+  Py_DECREF(command_list);
+
+  return r;
+}
+
+/**
+ * Call the class's _register_commands() and then copy every entry of its
+ * COMMANDS list into `commands`.
+ */
+int PyModule::load_commands()
+{
+  PyObject *pRegCmd = PyObject_CallMethod(pClass,
+      const_cast<char*>("_register_commands"), const_cast<char*>("()"));
+  if (pRegCmd != nullptr) {
+    Py_DECREF(pRegCmd);
+  } else {
+    derr << "Exception calling _register_commands on " << get_name()
+         << dendl;
+    derr << handle_pyerror() << dendl;
+  }
+
+  int r = walk_dict_list("COMMANDS", [this](PyObject *pCommand) -> int {
+    ModuleCommand command;
+
+    PyObject *pCmd = PyDict_GetItemString(pCommand, "cmd");
+    ceph_assert(pCmd != nullptr);
+    command.cmdstring = PyString_AsString(pCmd);
+
+    dout(20) << "loaded command " << command.cmdstring << dendl;
+
+    PyObject *pDesc = PyDict_GetItemString(pCommand, "desc");
+    ceph_assert(pDesc != nullptr);
+    command.helpstring = PyString_AsString(pDesc);
+
+    PyObject *pPerm = PyDict_GetItemString(pCommand, "perm");
+    ceph_assert(pPerm != nullptr);
+    command.perm = PyString_AsString(pPerm);
+
+    // "poll" is optional; only the string "true" (any case) enables it.
+    command.polling = false;
+    PyObject *pPoll = PyDict_GetItemString(pCommand, "poll");
+    if (pPoll) {
+      std::string polling = PyString_AsString(pPoll);
+      if (boost::iequals(polling, "true")) {
+        command.polling = true;
+      }
+    }
+
+    command.module_name = module_name;
+
+    commands.push_back(std::move(command));
+
+    return 0;
+  });
+
+  dout(10) << "loaded " << commands.size() << " commands" << dendl;
+
+  return r;
+}
+
+/**
+ * Copy every entry of the class's MODULE_OPTIONS list into `options`,
+ * keyed by option name.  Only "name" is required in an entry.
+ */
+int PyModule::load_options()
+{
+  int r = walk_dict_list("MODULE_OPTIONS", [this](PyObject *pOption) -> int {
+    MgrMap::ModuleOption option;
+    PyObject *p;
+    p = PyDict_GetItemString(pOption, "name");
+    ceph_assert(p != nullptr);
+    option.name = PyString_AsString(p);
+    // The type stays TYPE_STR unless "type" names a known option type.
+    option.type = Option::TYPE_STR;
+    p = PyDict_GetItemString(pOption, "type");
+    if (p && PyObject_TypeCheck(p, &PyString_Type)) {
+      std::string s = PyString_AsString(p);
+      int t = Option::str_to_type(s);
+      if (t >= 0) {
+        option.type = t;
+      }
+    }
+    p = PyDict_GetItemString(pOption, "desc");
+    if (p && PyObject_TypeCheck(p, &PyString_Type)) {
+      option.desc = PyString_AsString(p);
+    }
+    p = PyDict_GetItemString(pOption, "long_desc");
+    if (p && PyObject_TypeCheck(p, &PyString_Type)) {
+      option.long_desc = PyString_AsString(p);
+    }
+    // default/min/max may be any Python object; store their str() form.
+    p = PyDict_GetItemString(pOption, "default");
+    if (p) {
+      auto q = PyObject_Str(p);
+      option.default_value = PyString_AsString(q);
+      Py_DECREF(q);
+    }
+    p = PyDict_GetItemString(pOption, "min");
+    if (p) {
+      auto q = PyObject_Str(p);
+      option.min = PyString_AsString(q);
+      Py_DECREF(q);
+    }
+    p = PyDict_GetItemString(pOption, "max");
+    if (p) {
+      auto q = PyObject_Str(p);
+      option.max = PyString_AsString(q);
+      Py_DECREF(q);
+    }
+    p = PyDict_GetItemString(pOption, "enum_allowed");
+    if (p && PyObject_TypeCheck(p, &PyList_Type)) {
+      for (unsigned i = 0; i < PyList_Size(p); ++i) {
+        auto q = PyList_GetItem(p, i);
+        if (q) {
+          auto r = PyObject_Str(q);
+          option.enum_allowed.insert(PyString_AsString(r));
+          Py_DECREF(r);
+        }
+      }
+    }
+    p = PyDict_GetItemString(pOption, "see_also");
+    if (p && PyObject_TypeCheck(p, &PyList_Type)) {
+      for (unsigned i = 0; i < PyList_Size(p); ++i) {
+        auto q = PyList_GetItem(p, i);
+        if (q && PyObject_TypeCheck(q, &PyString_Type)) {
+          option.see_also.insert(PyString_AsString(q));
+        }
+      }
+    }
+    p = PyDict_GetItemString(pOption, "tags");
+    if (p && PyObject_TypeCheck(p, &PyList_Type)) {
+      for (unsigned i = 0; i < PyList_Size(p); ++i) {
+        auto q = PyList_GetItem(p, i);
+        if (q && PyObject_TypeCheck(q, &PyString_Type)) {
+          option.tags.insert(PyString_AsString(q));
+        }
+      }
+    }
+    p = PyDict_GetItemString(pOption, "runtime");
+    if (p && PyObject_TypeCheck(p, &PyBool_Type)) {
+      if (p == Py_True) {
+        option.flags |= Option::FLAG_RUNTIME;
+      }
+      if (p == Py_False) {
+        option.flags &= ~Option::FLAG_RUNTIME;
+      }
+    }
+    dout(20) << "loaded module option " << option.name << dendl;
+    options[option.name] = std::move(option);
+    return 0;
+  });
+
+  dout(10) << "loaded " << options.size() << " options" << dendl;
+
+  return r;
+}
+
+bool PyModule::is_option(const std::string &option_name)
+{
+  std::lock_guard l(lock);
+  return options.count(option_name) > 0;
+}
+
+/**
+ * Convert the textual `value` of option `name` into a Python object of the
+ * option's declared type.  Unknown options and types without a special
+ * case are returned as strings.
+ *
+ * @return the converted object, as returned by the Python constructor used
+ */
+PyObject *PyModule::get_typed_option_value(const std::string& name,
+                                           const std::string& value)
+{
+  // we don't need to hold a lock here because these MODULE_OPTIONS
+  // are set up exactly once during startup.
+  auto p = options.find(name);
+  if (p != options.end()) {
+    switch (p->second.type) {
+    case Option::TYPE_INT:
+    case Option::TYPE_UINT:
+    case Option::TYPE_SIZE:
+      return PyInt_FromString((char *)value.c_str(), nullptr, 0);
+    case Option::TYPE_SECS:
+    case Option::TYPE_FLOAT:
+      {
+        PyObject *s = PyString_FromString(value.c_str());
+        PyObject *f = PyFloat_FromString(s, nullptr);
+        Py_DECREF(s);
+        return f;
+      }
+    case Option::TYPE_BOOL:
+      if (value == "1" || value == "true" || value == "True" ||
+          value == "on" || value == "yes") {
+        Py_INCREF(Py_True);
+        return Py_True;
+      } else {
+        Py_INCREF(Py_False);
+        return Py_False;
+      }
+    }
+  }
+  return PyString_FromString(value.c_str());
+}
+
+/**
+ * Import this module and find the class in it that derives from
+ * `mgr_module.<base_class>` (excluding the base class itself).
+ *
+ * @param base_class name of the base class inside `mgr_module`
+ * @param py_class   receives a new reference to the class found, or
+ *                   nullptr
+ * @return 0 if a class was found, -ENOENT if the module cannot be
+ *         imported, -EINVAL otherwise
+ */
+int PyModule::load_subclass_of(const char* base_class, PyObject** py_class)
+{
+  // load the base class
+  PyObject *mgr_module = PyImport_ImportModule("mgr_module");
+  if (!mgr_module) {
+    error_string = peek_pyerror();
+    derr << "Module not found: 'mgr_module'" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  }
+  auto mgr_module_type = PyObject_GetAttrString(mgr_module, base_class);
+  Py_DECREF(mgr_module);
+  if (!mgr_module_type) {
+    error_string = peek_pyerror();
+    derr << "Unable to import MgrModule from mgr_module" << dendl;
+    derr << handle_pyerror() << dendl;
+    return -EINVAL;
+  }
+
+  // find the sub class
+  PyObject *plugin_module = PyImport_ImportModule(module_name.c_str());
+  if (!plugin_module) {
+    error_string = peek_pyerror();
+    derr << "Module not found: '" << module_name << "'" << dendl;
+    derr << handle_pyerror() << dendl;
+    // Release the base class on this exit path as well.
+    Py_DECREF(mgr_module_type);
+    return -ENOENT;
+  }
+  auto locals = PyModule_GetDict(plugin_module);
+  Py_DECREF(plugin_module);
+  PyObject *key, *value;
+  Py_ssize_t pos = 0;
+  *py_class = nullptr;
+  while (PyDict_Next(locals, &pos, &key, &value)) {
+    if (!PyType_Check(value)) {
+      continue;
+    }
+    if (!PyObject_IsSubclass(value, mgr_module_type)) {
+      continue;
+    }
+    if (PyObject_RichCompareBool(value, mgr_module_type, Py_EQ)) {
+      continue;
+    }
+    auto class_name = PyString_AsString(key);
+    if (*py_class) {
+      derr << __func__ << ": ignoring '"
+           << module_name << "." << class_name << "'"
+           << ": only one '" << base_class
+           << "' class is loaded from each plugin" << dendl;
+      continue;
+    }
+    // PyDict_Next yields borrowed references, while ~PyModule releases
+    // the stored class with Py_XDECREF: take a reference of our own.
+    Py_INCREF(value);
+    *py_class = value;
+    dout(4) << __func__ << ": found class: '"
+            << module_name << "." << class_name << "'" << dendl;
+  }
+  Py_DECREF(mgr_module_type);
+
+  return *py_class ? 0 : -EINVAL;
+}
+
+PyModule::~PyModule()
+{
+  if (pMyThreadState.ts != nullptr) {
+    Gil gil(pMyThreadState, true);
+    Py_XDECREF(pClass);
+    Py_XDECREF(pStandbyClass);
+  }
+}
+
diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h
new file mode 100644
index 00000000..412fa38f
--- /dev/null
+++ b/src/mgr/PyModule.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <boost/optional.hpp>
+#include "common/Mutex.h"
+#include "Python.h"
+#include "Gil.h"
+#include "mon/MgrMap.h"
+
+
+class MonClient;
+
+std::string handle_pyerror();
+
+std::string peek_pyerror();
+
+/**
+ * A Ceph CLI command description provided from a Python module
+ */
+class ModuleCommand {
+public:
+  std::string cmdstring;
+  std::string helpstring;
+  std::string perm;
+  bool polling;
+
+  // Call the ActivePyModule of this name to handle the command
+  std::string module_name;
+};
+
+/**
+ * One ceph-mgr Python module: its sub-interpreter thread state, the
+ * classes found in it, and the commands and options it declares.
+ */
+class PyModule
+{
+  mutable Mutex lock{"PyModule::lock"};
+private:
+  const std::string module_name;
+  std::string get_site_packages();
+  int load_subclass_of(const char* class_name, PyObject** py_class);
+
+  // Did the MgrMap identify this module as one that should run?
+  bool enabled = false;
+
+  // Did the MgrMap flag this module as always on?
+  bool always_on = false;
+
+  // Did we successfully import this python module and look up symbols?
+  // (i.e. is it possible to instantiate a MgrModule subclass instance?)
+  bool loaded = false;
+
+  // Did the module identify itself as being able to run?
+  // (i.e. should we expect instantiating and calling serve() to work?)
+  bool can_run = false;
+
+  // Did the module encounter an unexpected error while running?
+  // (e.g. throwing an exception from serve())
+  bool failed = false;
+
+  // Populated if loaded, can_run or failed indicates a problem
+  std::string error_string;
+
+  // Helper for loading MODULE_OPTIONS and COMMANDS members
+  int walk_dict_list(
+      const std::string &attr_name,
+      std::function<int(PyObject*)> fn);
+
+  int load_commands();
+  std::vector<ModuleCommand> commands;
+
+  int load_options();
+  std::map<std::string, MgrMap::ModuleOption> options;
+
+public:
+  static std::string config_prefix;
+
+  SafeThreadState pMyThreadState;
+  PyObject *pClass = nullptr;
+  PyObject *pStandbyClass = nullptr;
+
+  explicit PyModule(const std::string &module_name_)
+    : module_name(module_name_)
+  {
+  }
+
+  ~PyModule();
+
+  bool is_option(const std::string &option_name);
+  const std::map<std::string, MgrMap::ModuleOption>& get_options() const {
+    return options;
+  }
+
+  PyObject *get_typed_option_value(
+    const std::string& option,
+    const std::string& value);
+
+  int load(PyThreadState *pMainThreadState);
+#if PY_MAJOR_VERSION >= 3
+  static PyObject* init_ceph_logger();
+  static PyObject* init_ceph_module();
+#else
+  static void init_ceph_logger();
+  static void init_ceph_module();
+#endif
+
+  void set_enabled(const bool enabled_)
+  {
+    enabled = enabled_;
+  }
+
+  void set_always_on(const bool always_on_) {
+    always_on = always_on_;
+  }
+
+  /**
+   * Extend `out` with the contents of `this->commands`
+   */
+  void get_commands(std::vector<ModuleCommand> *out) const
+  {
+    std::lock_guard l(lock);
+    ceph_assert(out != nullptr);
+    out->insert(out->end(), commands.begin(), commands.end());
+  }
+
+
+  /**
+   * Mark the module as failed, recording the reason in the error
+   * string.
+   */
+  void fail(const std::string &reason)
+  {
+    std::lock_guard l(lock);
+    failed = true;
+    error_string = reason;
+  }
+
+  bool is_enabled() const {
+    std::lock_guard l(lock);
+    return enabled || always_on;
+  }
+
+  bool is_failed() const { std::lock_guard l(lock) ; return failed; }
+  bool is_loaded() const { std::lock_guard l(lock) ; return loaded; }
+  bool is_always_on() const { std::lock_guard l(lock) ; return always_on; }
+
+  const std::string &get_name() const {
+    std::lock_guard l(lock) ; return module_name;
+  }
+  const std::string &get_error_string() const {
+    std::lock_guard l(lock) ; return error_string;
+  }
+  bool get_can_run() const {
+    std::lock_guard l(lock) ; return can_run;
+  }
+};
+
+typedef std::shared_ptr<PyModule> PyModuleRef;
+
+/**
+ * Module configuration values keyed by their full option name
+ * (`config_prefix` + module name + "/" + key), guarded by `lock`.
+ */
+class PyModuleConfig {
+public:
+  mutable Mutex lock{"PyModuleConfig::lock"};
+  std::map<std::string, std::string> config;
+
+  PyModuleConfig();
+
+  PyModuleConfig(PyModuleConfig &mconfig);
+
+  ~PyModuleConfig();
+
+  void set_config(
+    MonClient *monc,
+    const std::string &module_name,
+    const std::string &key, const boost::optional<std::string>& val);
+
+};
diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc
new file mode 100644
index 00000000..e391ad63
--- /dev/null
+++ b/src/mgr/PyModuleRegistry.cc
@@ -0,0 +1,526 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 John Spray
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */ + + +#include "include/stringify.h" +#include "common/errno.h" + +#include "BaseMgrModule.h" +#include "PyOSDMap.h" +#include "BaseMgrStandbyModule.h" +#include "Gil.h" +#include "MgrContext.h" +#include "mgr/mgr_commands.h" + +#include "ActivePyModules.h" + +#include "PyModuleRegistry.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#undef dout_prefix +#define dout_prefix *_dout << "mgr[py] " + +std::set obsolete_modules = { + "orchestrator_cli", +}; + +void PyModuleRegistry::init() +{ + std::lock_guard locker(lock); + + // Set up global python interpreter +#if PY_MAJOR_VERSION >= 3 +#define WCHAR(s) L ## #s + Py_SetProgramName(const_cast(WCHAR(MGR_PYTHON_EXECUTABLE))); +#undef WCHAR +#else + Py_SetProgramName(const_cast(MGR_PYTHON_EXECUTABLE)); +#endif + // Add more modules + if (g_conf().get_val("daemonize")) { + PyImport_AppendInittab("ceph_logger", PyModule::init_ceph_logger); + } + PyImport_AppendInittab("ceph_module", PyModule::init_ceph_module); + Py_InitializeEx(0); + + // Let CPython know that we will be calling it back from other + // threads in future. + if (! PyEval_ThreadsInitialized()) { + PyEval_InitThreads(); + } + + // Drop the GIL and remember the main thread state (current + // thread state becomes NULL) + pMainThreadState = PyEval_SaveThread(); + ceph_assert(pMainThreadState != nullptr); + + std::list failed_modules; + + const std::string module_path = g_conf().get_val("mgr_module_path"); + std::set module_names = probe_modules(module_path); + // Load python code + for (const auto& module_name : module_names) { + dout(1) << "Loading python module '" << module_name << "'" << dendl; + + // Everything starts disabled, set enabled flag on module + // when we see first MgrMap + auto mod = std::make_shared(module_name); + int r = mod->load(pMainThreadState); + if (r != 0) { + // Don't use handle_pyerror() here; we don't have the GIL + // or the right thread state (this is deliberate). 
+ derr << "Error loading module '" << module_name << "': " + << cpp_strerror(r) << dendl; + failed_modules.push_back(module_name); + // Don't drop out here, load the other modules + } + + // Record the module even if the load failed, so that we can + // report its loading error + modules[module_name] = std::move(mod); + } + if (module_names.empty()) { + clog->error() << "No ceph-mgr modules found in " << module_path; + } + if (!failed_modules.empty()) { + clog->error() << "Failed to load ceph-mgr modules: " << joinify( + failed_modules.begin(), failed_modules.end(), std::string(", ")); + } +} + +bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_) +{ + std::lock_guard l(lock); + + if (mgr_map.epoch == 0) { + mgr_map = mgr_map_; + + // First time we see MgrMap, set the enabled flags on modules + // This should always happen before someone calls standby_start + // or active_start + for (const auto &[module_name, module] : modules) { + const bool enabled = (mgr_map.modules.count(module_name) > 0); + module->set_enabled(enabled); + const bool always_on = (mgr_map.get_always_on_modules().count(module_name) > 0); + module->set_always_on(always_on); + } + + return false; + } else { + bool modules_changed = mgr_map_.modules != mgr_map.modules || + mgr_map_.always_on_modules != mgr_map.always_on_modules; + mgr_map = mgr_map_; + + if (standby_modules != nullptr) { + standby_modules->handle_mgr_map(mgr_map_); + } + + return modules_changed; + } +} + + + +void PyModuleRegistry::standby_start(MonClient &mc, Finisher &f) +{ + std::lock_guard l(lock); + ceph_assert(active_modules == nullptr); + ceph_assert(standby_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + dout(4) << "Starting modules in standby mode" << dendl; + + standby_modules.reset(new StandbyPyModules( + mgr_map, module_config, clog, mc, f)); + + std::set failed_modules; + for (const auto &i : 
modules) { + if (!(i.second->is_enabled() && i.second->get_can_run())) { + // report always_on modules with a standby mode that won't run + if (i.second->is_always_on() && i.second->pStandbyClass) { + failed_modules.insert(i.second->get_name()); + } + continue; + } + + if (i.second->pStandbyClass) { + dout(4) << "starting module " << i.second->get_name() << dendl; + standby_modules->start_one(i.second); + } else { + dout(4) << "skipping module '" << i.second->get_name() << "' because " + "it does not implement a standby mode" << dendl; + } + } + + if (!failed_modules.empty()) { + clog->error() << "Failed to execute ceph-mgr module(s) in standby mode: " + << joinify(failed_modules.begin(), failed_modules.end(), + std::string(", ")); + } +} + +void PyModuleRegistry::active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map &kv_store, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server) +{ + std::lock_guard locker(lock); + + dout(4) << "Starting modules in active mode" << dendl; + + ceph_assert(active_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + active_modules.reset(new ActivePyModules( + module_config, kv_store, ds, cs, mc, + clog_, audit_clog_, objecter_, client_, f, server, + *this)); + + for (const auto &i : modules) { + // Anything we're skipping because of !can_run will be flagged + // to the user separately via get_health_checks + if (!(i.second->is_enabled() && i.second->is_loaded())) { + continue; + } + + dout(4) << "Starting " << i.first << dendl; + active_modules->start_one(i.second); + } +} + +void PyModuleRegistry::active_shutdown() +{ + std::lock_guard locker(lock); + + if (active_modules != nullptr) { + active_modules->shutdown(); + 
active_modules.reset(); + } +} + +void PyModuleRegistry::shutdown() +{ + std::lock_guard locker(lock); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + // Ideally, now, we'd be able to do this for all modules: + // + // Py_EndInterpreter(pMyThreadState); + // PyThreadState_Swap(pMainThreadState); + // + // Unfortunately, if the module has any other *python* threads active + // at this point, Py_EndInterpreter() will abort with: + // + // Fatal Python error: Py_EndInterpreter: not the last thread + // + // This can happen when using CherryPy in a module, because CherryPy + // runs an extra thread as a timeout monitor, which spends most of its + // life inside a time.sleep(60). Unless you are very, very lucky with + // the timing calling this destructor, that thread will still be stuck + // in a sleep, and Py_EndInterpreter() will abort. + // + // This could of course also happen with a poorly written module which + // made no attempt to clean up any additional threads it created. + // + // The safest thing to do is just not call Py_EndInterpreter(), and + // let Py_Finalize() kill everything after all modules are shut down. 
+ + modules.clear(); + + PyEval_RestoreThread(pMainThreadState); + Py_Finalize(); +} + +std::set PyModuleRegistry::probe_modules(const std::string &path) const +{ + DIR *dir = opendir(path.c_str()); + if (!dir) { + return {}; + } + + std::set modules_out; + struct dirent *entry = NULL; + while ((entry = readdir(dir)) != NULL) { + string n(entry->d_name); + string fn = path + "/" + n; + struct stat st; + int r = ::stat(fn.c_str(), &st); + if (r == 0 && S_ISDIR(st.st_mode)) { + string initfn = fn + "/module.py"; + r = ::stat(initfn.c_str(), &st); + if (r == 0) { + modules_out.insert(n); + } + } + } + closedir(dir); + + return modules_out; +} + +int PyModuleRegistry::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + if (active_modules) { + return active_modules->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); + } else { + // We do not expect to be called before active modules is up, but + // it's straightforward to handle this case so let's do it. 
+ return -EAGAIN; + } +} + +std::vector PyModuleRegistry::get_py_commands() const +{ + std::lock_guard l(lock); + + std::vector result; + for (const auto& i : modules) { + i.second->get_commands(&result); + } + + return result; +} + +std::vector PyModuleRegistry::get_commands() const +{ + std::vector commands = get_py_commands(); + std::vector result; + for (auto &pyc: commands) { + uint64_t flags = MonCommand::FLAG_MGR; + if (pyc.polling) { + flags |= MonCommand::FLAG_POLL; + } + result.push_back({pyc.cmdstring, pyc.helpstring, "mgr", + pyc.perm, flags}); + } + return result; +} + +void PyModuleRegistry::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + + // Only the active mgr reports module issues + if (active_modules) { + active_modules->get_health_checks(checks); + + std::map dependency_modules; + std::map failed_modules; + + /* + * Break up broken modules into two categories: + * - can_run=false: the module is working fine but explicitly + * telling you that a dependency is missing. Advise the user to + * read the message from the module and install what's missing. + * - failed=true or loaded=false: something unexpected is broken, + * either at runtime (from serve()) or at load time. This indicates + * a bug and the user should be guided to inspect the mgr log + * to investigate and gather evidence. + */ + + for (const auto &i : modules) { + auto module = i.second; + if (module->is_enabled() && !module->get_can_run()) { + dependency_modules[module->get_name()] = module->get_error_string(); + } else if ((module->is_enabled() && !module->is_loaded()) + || (module->is_failed() && module->get_can_run())) { + // - Unloadable modules are only reported if they're enabled, + // to avoid spamming users about modules they don't have the + // dependencies installed for because they don't use it. 
+ // - Failed modules are only reported if they passed the can_run + // checks (to avoid outputting two health messages about a + // module that said can_run=false but we tried running it anyway) + failed_modules[module->get_name()] = module->get_error_string(); + } + } + + // report failed always_on modules as health errors + for (const auto& name : mgr_map.get_always_on_modules()) { + if (obsolete_modules.count(name)) { + continue; + } + if (active_modules->is_pending(name)) { + continue; + } + if (!active_modules->module_exists(name)) { + if (failed_modules.find(name) == failed_modules.end() && + dependency_modules.find(name) == dependency_modules.end()) { + failed_modules[name] = "Not found or unloadable"; + } + } + } + + if (!dependency_modules.empty()) { + std::ostringstream ss; + if (dependency_modules.size() == 1) { + auto iter = dependency_modules.begin(); + ss << "Module '" << iter->first << "' has failed dependency: " + << iter->second; + } else if (dependency_modules.size() > 1) { + ss << dependency_modules.size() + << " mgr modules have failed dependencies"; + } + auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str()); + for (auto& i : dependency_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed dependency: " << i.second; + d.detail.push_back(ss.str()); + } + } + + if (!failed_modules.empty()) { + std::ostringstream ss; + if (failed_modules.size() == 1) { + auto iter = failed_modules.begin(); + ss << "Module '" << iter->first << "' has failed: " << iter->second; + } else if (failed_modules.size() > 1) { + ss << failed_modules.size() << " mgr modules have failed"; + } + auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str()); + for (auto& i : failed_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed: " << i.second; + d.detail.push_back(ss.str()); + } + } + } +} + +void PyModuleRegistry::handle_config(const std::string &k, const std::string &v) +{ + std::lock_guard 
l(module_config.lock); + + if (!v.empty()) { + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "Loaded module_config entry " << k << ":" << v << dendl; + dout(4) << "Loaded module_config entry " << k << ":" << dendl; + module_config.config[k] = v; + } else { + module_config.config.erase(k); + } +} + +void PyModuleRegistry::handle_config_notify() +{ + std::lock_guard l(lock); + if (active_modules) { + active_modules->config_notify(); + } +} + +void PyModuleRegistry::upgrade_config( + MonClient *monc, + const std::map &old_config) +{ + // Only bother doing anything if we didn't already have + // some new-style config. + if (module_config.config.empty()) { + dout(1) << "Upgrading module configuration for Mimic" << dendl; + // Upgrade luminous->mimic: migrate config-key configuration + // into main configuration store + for (auto &i : old_config) { + auto last_slash = i.first.rfind('/'); + const std::string module_name = i.first.substr(4, i.first.substr(4).find('/')); + const std::string key = i.first.substr(last_slash + 1); + + const auto &value = i.second; + + // Heuristic to skip things that look more like stores + // than configs. 
+ bool is_config = true; + for (const auto &c : value) { + if (c == '\n' || c == '\r' || c < 0x20) { + is_config = false; + break; + } + } + + if (value.size() > 256) { + is_config = false; + } + + if (!is_config) { + dout(1) << "Not migrating config module:key " + << module_name << " : " << key << dendl; + continue; + } + + // Check that the named module exists + auto module_iter = modules.find(module_name); + if (module_iter == modules.end()) { + dout(1) << "KV store contains data for unknown module '" + << module_name << "'" << dendl; + continue; + } + PyModuleRef module = module_iter->second; + + // Parse option name out of key + std::string option_name; + auto slash_loc = key.find("/"); + if (slash_loc != std::string::npos) { + if (key.size() > slash_loc + 1) { + // Localized option + option_name = key.substr(slash_loc + 1); + } else { + // Trailing slash: garbage. + derr << "Invalid mgr store key: '" << key << "'" << dendl; + continue; + } + } else { + option_name = key; + } + + // Consult module schema to see if this is really + // a configuration value + if (!option_name.empty() && module->is_option(option_name)) { + module_config.set_config(monc, module_name, key, i.second); + dout(4) << "Rewrote configuration module:key " + << module_name << ":" << key << dendl; + } else { + dout(4) << "Leaving store module:key " << module_name + << ":" << key << " in store, not config" << dendl; + } + } + } else { + dout(10) << "Module configuration contains " + << module_config.config.size() << " keys" << dendl; + } +} + diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h new file mode 100644 index 00000000..89080cdb --- /dev/null +++ b/src/mgr/PyModuleRegistry.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms 
of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#pragma once + +// First because it includes Python.h +#include "PyModule.h" + +#include +#include +#include +#include + +#include "common/LogClient.h" + +#include "ActivePyModules.h" +#include "StandbyPyModules.h" + +class MgrSession; + +/** + * This class is responsible for setting up the python runtime environment + * and importing the python modules. + * + * It is *not* responsible for constructing instances of their BaseMgrModule + * subclasses: that is the job of ActiveMgrModule, which consumes the class + * references that we load here. + */ +class PyModuleRegistry +{ +private: + mutable Mutex lock{"PyModuleRegistry::lock"}; + LogChannelRef clog; + + std::map modules; + + std::unique_ptr active_modules; + std::unique_ptr standby_modules; + + // Main interpreter thread state, saved by init() via PyEval_SaveThread(). + // Initialized to null so it never holds an indeterminate value before then. + PyThreadState *pMainThreadState = nullptr; + + // We have our own copy of MgrMap, because we are constructed + // before ClusterState exists. + MgrMap mgr_map; + + /** + * Discover python modules from local disk + */ + std::set probe_modules(const std::string &path) const; + + PyModuleConfig module_config; + +public: + void handle_config(const std::string &k, const std::string &v); + void handle_config_notify(); + + /** + * Get references to all modules (whether they have loaded and/or + * errored) or not. 
+ */ + std::list get_modules() const + { + std::lock_guard l(lock); + std::list modules_out; + for (const auto &i : modules) { + modules_out.push_back(i.second); + } + + return modules_out; + } + + explicit PyModuleRegistry(LogChannelRef clog_) + : clog(clog_) + {} + + /** + * @return true if the mgrmap has changed such that the service needs restart + */ + bool handle_mgr_map(const MgrMap &mgr_map_); + + void init(); + + void upgrade_config( + MonClient *monc, + const std::map &old_config); + + void active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map &kv_store, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server); + void standby_start(MonClient &mc, Finisher &f); + + bool is_standby_running() const + { + return standby_modules != nullptr; + } + + void active_shutdown(); + void shutdown(); + + std::vector get_commands() const; + std::vector get_py_commands() const; + + /** + * Get the specified module. The module does not have to be + * loaded or runnable. + * + * Returns an empty reference if it does not exist. + */ + PyModuleRef get_module(const std::string &module_name) + { + std::lock_guard l(lock); + auto module_iter = modules.find(module_name); + if (module_iter == modules.end()) { + return {}; + } + return module_iter->second; + } + + /** + * Pass through command to the named module for execution. + * + * The command must exist in the COMMANDS reported by the module. If it + * doesn't then this will abort. + * + * If ActivePyModules has not been instantiated yet then this will + * return EAGAIN. + */ + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + /** + * Pass through health checks reported by modules, and report any + * modules that have failed (i.e. 
unhandled exceptions in serve()) + */ + void get_health_checks(health_check_map_t *checks); + + void get_progress_events(map *events) { + if (active_modules) { + active_modules->get_progress_events(events); + } + } + + // FIXME: breaking interface so that I don't have to go rewrite all + // the places that call into these (for now) + // >>> + // Forward a (type, id) notification to the active modules; does nothing + // when no active modules are running. + void notify_all(const std::string &notify_type, + const std::string &notify_id) + { + if (active_modules) { + active_modules->notify_all(notify_type, notify_id); + } + } + + void notify_all(const LogEntry &log_entry) + { + if (active_modules) { + active_modules->notify_all(log_entry); + } + } + + std::map get_services() const + { + ceph_assert(active_modules); + return active_modules->get_services(); + } + // <<< (end of ActivePyModules cheeky call-throughs) +}; diff --git a/src/mgr/PyModuleRunner.cc b/src/mgr/PyModuleRunner.cc new file mode 100644 index 00000000..cde54f21 --- /dev/null +++ b/src/mgr/PyModuleRunner.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +// Python.h comes first because otherwise it clobbers ceph's assert +#include "PythonCompat.h" + +#include "PyModule.h" + +#include "common/debug.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + + +PyModuleRunner::~PyModuleRunner() +{ + Gil gil(py_module->pMyThreadState, true); + + if (pClassInstance) { + Py_XDECREF(pClassInstance); + pClassInstance = nullptr; + } +} + +int PyModuleRunner::serve() +{ + ceph_assert(pClassInstance != nullptr); + + // This method is called from a separate OS thread (i.e. 
a thread not + // created by Python), so tell Gil to wrap this in a new thread state. + Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast("serve"), nullptr); + + int r = 0; + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + // This is not a very informative log message because it's an + // unknown/unexpected exception that we can't say much about. + + + // Get short exception message for the cluster log, before + // dumping the full backtrace to the local log. + std::string exc_msg = peek_pyerror(); + + clog->error() << "Unhandled exception from module '" << get_name() + << "' while running on mgr." << g_conf()->name.get_id() + << ": " << exc_msg; + derr << get_name() << ".serve:" << dendl; + derr << handle_pyerror() << dendl; + + py_module->fail(exc_msg); + + return -EINVAL; + } + + return r; +} + +void PyModuleRunner::shutdown() +{ + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast("shutdown"), nullptr); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << "Failed to invoke shutdown() on " << get_name() << dendl; + derr << handle_pyerror() << dendl; + } + + dead = true; +} + +void PyModuleRunner::log(int level, const std::string &record) +{ +#undef dout_prefix +#define dout_prefix *_dout << "mgr[" << get_name() << "] " + dout(ceph::dout::need_dynamic(level)) << record << dendl; +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " +} + +void* PyModuleRunner::PyModuleRunnerThread::entry() +{ + // No need to acquire the GIL here; the module does it. 
+ dout(4) << "Entering thread for " << mod->get_name() << dendl; + mod->serve(); + return nullptr; +} diff --git a/src/mgr/PyModuleRunner.h b/src/mgr/PyModuleRunner.h new file mode 100644 index 00000000..52f60be5 --- /dev/null +++ b/src/mgr/PyModuleRunner.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#pragma once + +#include "common/Thread.h" +#include "common/LogClient.h" +#include "mgr/Gil.h" + +#include "PyModule.h" + +/** + * Implement the pattern of calling serve() on a module in a thread, + * until shutdown() is called. + */ +class PyModuleRunner +{ +public: + // Info about the module we're going to run + PyModuleRef py_module; + +protected: + // Populated by descendent class + PyObject *pClassInstance = nullptr; + + LogChannelRef clog; + + class PyModuleRunnerThread : public Thread + { + PyModuleRunner *mod; + + public: + explicit PyModuleRunnerThread(PyModuleRunner *mod_) + : mod(mod_) {} + + void *entry() override; + }; + + bool is_dead() const { return dead; } + + std::string thread_name; + +public: + int serve(); + void shutdown(); + void log(int level, const std::string &record); + + const char *get_thread_name() const + { + return thread_name.c_str(); + } + + PyModuleRunner( + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + py_module(py_module_), + clog(clog_), + thread(this) + { + // Check the module reference before its first dereference below; + // asserting afterwards could never catch a null reference. + ceph_assert(py_module != nullptr); + + // Shortened name for use as thread name, because thread names + // are required to be <16 chars + thread_name = py_module->get_name().substr(0, 15); + } + + ~PyModuleRunner(); + + PyModuleRunnerThread thread; + + std::string const &get_name() const 
{ return py_module->get_name(); } + +private: + bool dead = false; +}; + + diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc new file mode 100644 index 00000000..681546a8 --- /dev/null +++ b/src/mgr/PyOSDMap.cc @@ -0,0 +1,667 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Mgr.h" + +#include "osd/OSDMap.h" +#include "common/errno.h" +#include "common/version.h" +#include "include/stringify.h" + +#include "PyOSDMap.h" +#include "PyFormatter.h" +#include "Gil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + + +typedef struct { + PyObject_HEAD + OSDMap *osdmap; +} BasePyOSDMap; + +typedef struct { + PyObject_HEAD + OSDMap::Incremental *inc; +} BasePyOSDMapIncremental; + +typedef struct { + PyObject_HEAD + std::shared_ptr crush; +} BasePyCRUSH; + +// ---------- + +static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj) +{ + return PyInt_FromLong(self->osdmap->get_epoch()); +} + +static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj) +{ + return PyInt_FromLong(self->osdmap->get_crush_version()); +} + +static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj) +{ + PyFormatter f; + self->osdmap->dump(&f); + return f.get(); +} + +static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj) +{ + OSDMap::Incremental *inc = new OSDMap::Incremental; + + inc->fsid = self->osdmap->get_fsid(); + inc->epoch = self->osdmap->get_epoch() + 1; + // always include latest crush map here... this is okay since we never + // actually use this map in the real world (and even if we did it would + // be a no-op). 
+ self->osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL); + dout(10) << __func__ << " " << inc << dendl; + + return construct_with_capsule("mgr_module", "OSDMapIncremental", + (void*)(inc)); +} + +static PyObject *osdmap_apply_incremental(BasePyOSDMap *self, + BasePyOSDMapIncremental *incobj) +{ + if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) { + derr << "Wrong type in osdmap_apply_incremental!" << dendl; + return nullptr; + } + + bufferlist bl; + self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + OSDMap *next = new OSDMap; + next->decode(bl); + next->apply_incremental(*(incobj->inc)); + dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc + << " next " << next << dendl; + + return construct_with_capsule("mgr_module", "OSDMap", (void*)next); +} + +static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj) +{ + return construct_with_capsule("mgr_module", "CRUSHMap", + (void*)(&(self->osdmap->crush))); +} + +static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args) +{ + int take; + if (!PyArg_ParseTuple(args, "i:get_pools_by_take", + &take)) { + return nullptr; + } + + PyFormatter f; + f.open_array_section("pools"); + for (auto& p : self->osdmap->get_pools()) { + if (self->osdmap->crush->rule_has_take(p.second.crush_rule, take)) { + f.dump_int("pool", p.first); + } + } + f.close_section(); + return f.get(); +} + +static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args) +{ + PyObject *pool_list; + BasePyOSDMapIncremental *incobj; + int max_deviation = 0; + int max_iterations = 0; + if (!PyArg_ParseTuple(args, "OiiO:calc_pg_upmaps", + &incobj, &max_deviation, + &max_iterations, &pool_list)) { + return nullptr; + } + if (!PyList_CheckExact(pool_list)) { + derr << __func__ << " pool_list not a list" << dendl; + return nullptr; + } + set pools; + for (auto i = 0; i < PyList_Size(pool_list); ++i) { + PyObject *pool_name = PyList_GET_ITEM(pool_list, i); + if 
(!PyString_Check(pool_name)) { + derr << __func__ << " " << pool_name << " not a string" << dendl; + return nullptr; + } + auto pool_id = self->osdmap->lookup_pg_pool_name( + PyString_AsString(pool_name)); + if (pool_id < 0) { + derr << __func__ << " pool '" << PyString_AsString(pool_name) + << "' does not exist" << dendl; + return nullptr; + } + pools.insert(pool_id); + } + + dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc + << " max_deviation " << max_deviation + << " max_iterations " << max_iterations + << " pools " << pools + << dendl; + PyThreadState *tstate = PyEval_SaveThread(); + int r = self->osdmap->calc_pg_upmaps(g_ceph_context, + max_deviation, + max_iterations, + pools, + incobj->inc); + PyEval_RestoreThread(tstate); + dout(10) << __func__ << " r = " << r << dendl; + return PyInt_FromLong(r); +} + +static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args) +{ + int poolid; + if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up", + &poolid)) { + return nullptr; + } + auto pi = self->osdmap->get_pg_pool(poolid); + if (!pi) + return nullptr; + map> pm; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, poolid); + self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr); + } + PyFormatter f; + for (auto p : pm) { + string pg = stringify(p.first); + f.open_array_section(pg.c_str()); + for (auto o : p.second) { + f.dump_int("osd", o); + } + f.close_section(); + } + return f.get(); +} + +static int +BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds) +{ + PyObject *osdmap_capsule = nullptr; + static const char *kwlist[] = {"osdmap_capsule", NULL}; + + if (! 
PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast(kwlist), + &osdmap_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type)); + + self->osdmap = (OSDMap*)PyCapsule_GetPointer( + osdmap_capsule, nullptr); + ceph_assert(self->osdmap); + + return 0; +} + + +static void +BasePyOSDMap_dealloc(BasePyOSDMap *self) +{ + if (self->osdmap) { + delete self->osdmap; + self->osdmap = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_pg_to_up_acting_osds(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + int ps = 0; + if (!PyArg_ParseTuple(args, "ii:pg_to_up_acting_osds", + &pool_id, &ps)) { + return nullptr; + } + + std::vector up; + int up_primary; + std::vector acting; + int acting_primary; + pg_t pg_id(ps, pool_id); + self->osdmap->pg_to_up_acting_osds(pg_id, + &up, &up_primary, + &acting, &acting_primary); + + // (Ab)use PyFormatter as a convenient way to generate a dict + PyFormatter f; + f.dump_int("up_primary", up_primary); + f.dump_int("acting_primary", acting_primary); + f.open_array_section("up"); + for (const auto &i : up) { + f.dump_int("osd", i); + } + f.close_section(); + f.open_array_section("acting"); + for (const auto &i : acting) { + f.dump_int("osd", i); + } + f.close_section(); + + return f.get(); +} + +static PyObject *osdmap_pool_raw_used_rate(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + if (!PyArg_ParseTuple(args, "i:pool_raw_used_rate", + &pool_id)) { + return nullptr; + } + + if (!self->osdmap->have_pg_pool(pool_id)) { + return nullptr; + } + + float rate = self->osdmap->pool_raw_used_rate(pool_id); + + return PyFloat_FromDouble(rate); +} + + +PyMethodDef BasePyOSDMap_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"}, + {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS, + "Get CRUSH 
version"}, + {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap"}, + {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS, + "Create OSDMap::Incremental"}, + {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O, + "Apply OSDMap::Incremental and return the resulting OSDMap"}, + {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"}, + {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS, + "Get pools that have CRUSH rules that TAKE the given root"}, + {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS, + "Calculate new pg-upmap values"}, + {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS, + "Calculate up set mappings for all PGs in a pool"}, + {"_pg_to_up_acting_osds", (PyCFunction)osdmap_pg_to_up_acting_osds, METH_VARARGS, + "Calculate up+acting OSDs for a PG ID"}, + {"_pool_raw_used_rate", (PyCFunction)osdmap_pool_raw_used_rate, METH_VARARGS, + "Get raw space to logical space ratio"}, + {NULL, NULL, 0, NULL} +}; + +// Python type object named "ceph_module.BasePyOSDMap"; its methods come from +// BasePyOSDMap_methods and its init/dealloc hooks are the BasePyOSDMap_* functions. +PyTypeObject BasePyOSDMapType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMap", /* tp_name */ + sizeof(BasePyOSDMap), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMap_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMap", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMap_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* 
tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMap_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + +// ---------- + + +static int +BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self, + PyObject *args, PyObject *kwds) +{ + PyObject *inc_capsule = nullptr; + static const char *kwlist[] = {"inc_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast(kwlist), + &inc_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type)); + + self->inc = (OSDMap::Incremental*)PyCapsule_GetPointer( + inc_capsule, nullptr); + ceph_assert(self->inc); + + return 0; +} + +static void +BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self) +{ + if (self->inc) { + delete self->inc; + self->inc = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + return PyInt_FromLong(self->inc->epoch); +} + +static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + PyFormatter f; + self->inc->dump(&f); + return f.get(); +} + +static int get_int_float_map(PyObject *obj, map *out) +{ + PyObject *ls = PyDict_Items(obj); + for (int j = 0; j < PyList_Size(ls); ++j) { + PyObject *pair = PyList_GET_ITEM(ls, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << j << " not a tuple" << dendl; + Py_DECREF(ls); + return -1; + } + int k; + double v; + if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) { + derr << __func__ << " item " << j << " not a size 2 tuple" << dendl; + Py_DECREF(ls); + return -1; + } + (*out)[k] = v; + } + + Py_DECREF(ls); + return 0; +} + +static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self, + PyObject *weightobj) +{ + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + for (auto i : wm) { + 
self->inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000; + } + Py_RETURN_NONE; +} + +static PyObject *osdmap_inc_set_compat_weight_set_weights( + BasePyOSDMapIncremental *self, PyObject *weightobj) +{ + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + CrushWrapper crush; + ceph_assert(self->inc->crush.length()); // see new_incremental + auto p = self->inc->crush.cbegin(); + decode(crush, p); + crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1); + for (auto i : wm) { + crush.choose_args_adjust_item_weightf( + g_ceph_context, + crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS), + i.first, + { i.second }, + nullptr); + } + self->inc->crush.clear(); + crush.encode(self->inc->crush, CEPH_FEATURES_ALL); + Py_RETURN_NONE; +} + +PyMethodDef BasePyOSDMapIncremental_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS, + "Get OSDMap::Incremental epoch"}, + {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS, + "Dump OSDMap::Incremental"}, + {"_set_osd_reweights", (PyCFunction)osdmap_inc_set_osd_reweights, + METH_O, "Set osd reweight values"}, + {"_set_crush_compat_weight_set_weights", + (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O, + "Set weight values in the pending CRUSH compat weight-set"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyOSDMapIncrementalType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMapIncremental", /* tp_name */ + sizeof(BasePyOSDMapIncremental), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMapIncremental_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph 
OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMapIncremental_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMapIncremental_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + + +// ---------- + +static int +BasePyCRUSH_init(BasePyCRUSH *self, + PyObject *args, PyObject *kwds) +{ + PyObject *crush_capsule = nullptr; + static const char *kwlist[] = {"crush_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast(kwlist), + &crush_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type)); + + auto ptr_ref = (std::shared_ptr*)( + PyCapsule_GetPointer(crush_capsule, nullptr)); + + // We passed a pointer to a shared pointer, which is weird, but + // just enough to get it into the constructor: this is a real shared + // pointer construction now, and then we throw away that pointer to + // the shared pointer. 
+ self->crush = *ptr_ref; + ceph_assert(self->crush); + + return 0; +} + +static void +BasePyCRUSH_dealloc(BasePyCRUSH *self) +{ + self->crush.reset(); + Py_TYPE(self)->tp_free(self); +} + +static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj) +{ + PyFormatter f; + self->crush->dump(&f); + return f.get(); +} + +static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyString_FromString(self->crush->get_item_name(item)); +} + +static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyFloat_FromDouble(self->crush->get_item_weightf(item)); +} + +static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj) +{ + set takes; + self->crush->find_takes(&takes); + PyFormatter f; + f.open_array_section("takes"); + for (auto root : takes) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args) +{ + int root; + if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map", + &root)) { + return nullptr; + } + map wmap; + + if (!self->crush->item_exists(root)) { + return nullptr; + } + + self->crush->get_take_weight_osd_map(root, &wmap); + PyFormatter f; + f.open_object_section("weights"); + for (auto& p : wmap) { + string n = stringify(p.first); // ick + f.dump_float(n.c_str(), p.second); + } + f.close_section(); + return f.get(); +} + +PyMethodDef BasePyCRUSH_methods[] = { + {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"}, + {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS, + "Get item name"}, + {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS, + "Get 
item weight"}, + {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS, + "Find distinct TAKE roots"}, + {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map, + METH_VARARGS, "Get OSD weight map for a given TAKE root node"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyCRUSHType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyCRUSH", /* tp_name */ + sizeof(BasePyCRUSH), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyCRUSH_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyCRUSH_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyCRUSH_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h new file mode 100644 index 00000000..9d737424 --- /dev/null +++ b/src/mgr/PyOSDMap.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "PythonCompat.h" + + + +extern PyTypeObject BasePyOSDMapType; +extern PyTypeObject BasePyOSDMapIncrementalType; +extern PyTypeObject BasePyCRUSHType; + +PyObject *construct_with_capsule( + const std::string &module, + const std::string &clsname, + void *wrapped); + diff --git a/src/mgr/PythonCompat.h b/src/mgr/PythonCompat.h new file mode 100644 index 00000000..4ffb2eee 
--- /dev/null +++ b/src/mgr/PythonCompat.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +// Python's pyconfig-64.h conflicts with ceph's acconfig.h +#undef HAVE_SYS_WAIT_H +#undef HAVE_UNISTD_H +#undef HAVE_UTIME_H +#undef _POSIX_C_SOURCE +#undef _XOPEN_SOURCE + +#if PY_MAJOR_VERSION >= 3 +inline PyObject* PyString_FromString(const char *v) { + return PyUnicode_FromFormat("%s", v); +} +inline const char* PyString_AsString(PyObject *string) { + return PyUnicode_AsUTF8(string); +} +inline long PyInt_AsLong(PyObject *io) { + return PyLong_AsLong(io); +} +inline PyObject* PyInt_FromLong(long ival) { + return PyLong_FromLong(ival); +} +inline int PyString_Check(PyObject *o) { + return PyUnicode_Check(o); +} +inline PyObject* PyFloat_FromString(PyObject *s, void *arg) { + return PyFloat_FromString(s); +} +inline PyObject* PyInt_FromString(const char *str, char **pend, int base) { + return PyLong_FromString(str, pend, base); +} +#define PyString_Type PyUnicode_Type +#endif diff --git a/src/mgr/ServiceMap.cc b/src/mgr/ServiceMap.cc new file mode 100644 index 00000000..054e3a99 --- /dev/null +++ b/src/mgr/ServiceMap.cc @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/ServiceMap.h" + +#include "common/Formatter.h" + +// Daemon + +void ServiceMap::Daemon::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 1, bl); + encode(gid, bl); + encode(addr, bl, features); + encode(start_epoch, bl); + encode(start_stamp, bl); + encode(metadata, bl); + encode(task_status, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Daemon::decode(bufferlist::const_iterator& p) +{ + DECODE_START(2, p); + decode(gid, p); + decode(addr, p); + decode(start_epoch, p); + decode(start_stamp, p); + decode(metadata, p); + if (struct_v >= 2) { + decode(task_status, p); + } + DECODE_FINISH(p); +} + +void 
ServiceMap::Daemon::dump(Formatter *f) const +{ + f->dump_unsigned("start_epoch", start_epoch); + f->dump_stream("start_stamp") << start_stamp; + f->dump_unsigned("gid", gid); + f->dump_string("addr", addr.get_legacy_str()); + f->open_object_section("metadata"); + for (auto& p : metadata) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); + f->open_object_section("task_status"); + for (auto& p : task_status) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Daemon::generate_test_instances(std::list& ls) +{ + ls.push_back(new Daemon); + ls.push_back(new Daemon); + ls.back()->gid = 222; + ls.back()->metadata["this"] = "that"; + ls.back()->task_status["task1"] = "running"; +} + +// Service + +void ServiceMap::Service::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(daemons, bl, features); + encode(summary, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Service::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(daemons, p); + decode(summary, p); + DECODE_FINISH(p); +} + +void ServiceMap::Service::dump(Formatter *f) const +{ + f->open_object_section("daemons"); + f->dump_string("summary", summary); + for (auto& p : daemons) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Service::generate_test_instances(std::list& ls) +{ + ls.push_back(new Service); + ls.push_back(new Service); + ls.back()->daemons["one"].gid = 1; + ls.back()->daemons["two"].gid = 2; +} + +// ServiceMap + +void ServiceMap::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(modified, bl); + encode(services, bl, features); + ENCODE_FINISH(bl); +} + +void ServiceMap::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(epoch, p); + decode(modified, p); + decode(services, p); + DECODE_FINISH(p); +} + +void ServiceMap::dump(Formatter *f) const +{ + 
f->dump_unsigned("epoch", epoch); + f->dump_stream("modified") << modified; + f->open_object_section("services"); + for (auto& p : services) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::generate_test_instances(std::list& ls) +{ + ls.push_back(new ServiceMap); + ls.push_back(new ServiceMap); + ls.back()->epoch = 123; + ls.back()->services["rgw"].daemons["one"].gid = 123; + ls.back()->services["rgw"].daemons["two"].gid = 344; + ls.back()->services["iscsi"].daemons["foo"].gid = 3222; +} diff --git a/src/mgr/ServiceMap.h b/src/mgr/ServiceMap.h new file mode 100644 index 00000000..c0cd65c8 --- /dev/null +++ b/src/mgr/ServiceMap.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include + +#include "include/utime.h" +#include "include/buffer.h" +#include "msg/msg_types.h" + +namespace ceph { + class Formatter; +} + +struct ServiceMap { + struct Daemon { + uint64_t gid = 0; + entity_addr_t addr; + epoch_t start_epoch = 0; ///< epoch first registered + utime_t start_stamp; ///< timestamp daemon started/registered + std::map metadata; ///< static metadata + std::map task_status; ///< running task status + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); + }; + + struct Service { + map daemons; + std::string summary; ///< summary status string for 'ceph -s' + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); + + std::string get_summary() const { + if (summary.size()) { + return summary; + } + if (daemons.empty()) { + return "no daemons active"; + } + std::ostringstream ss; + ss << daemons.size() << (daemons.size() > 1 ? 
" daemons" : " daemon") + << " active"; + + if (!daemons.empty()) { + ss << " ("; + for (auto p = daemons.begin(); p != daemons.end(); ++p) { + if (p != daemons.begin()) { + ss << ", "; + } + ss << p->first; + } + ss << ")"; + } + + return ss.str(); + } + + std::string get_task_summary(const std::string_view task_prefix) const { + // contruct a map similar to: + // {"service1 status" -> {"service1.0" -> "running"}} + // {"service2 status" -> {"service2.0" -> "idle"}, + // {"service2.1" -> "running"}} + std::map> by_task; + for (const auto &p : daemons) { + std::stringstream d; + d << task_prefix << "." << p.first; + for (const auto &q : p.second.task_status) { + auto p1 = by_task.emplace(q.first, std::map{}).first; + auto p2 = p1->second.emplace(d.str(), std::string()).first; + p2->second = q.second; + } + } + + std::stringstream ss; + for (const auto &p : by_task) { + ss << "\n " << p.first << ":"; + for (auto q : p.second) { + ss << "\n " << q.first << ": " << q.second; + } + } + + return ss.str(); + } + + void count_metadata(const std::string& field, + std::map *out) const { + for (auto& p : daemons) { + auto q = p.second.metadata.find(field); + if (q == p.second.metadata.end()) { + (*out)["unknown"]++; + } else { + (*out)[q->second]++; + } + } + } + + }; + + epoch_t epoch = 0; + utime_t modified; + map services; + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); + + std::pair get_daemon(const std::string& service, + const std::string& daemon) { + auto& s = services[service]; + auto [d, added] = s.daemons.try_emplace(daemon); + return {&d->second, added}; + } + + bool rm_daemon(const std::string& service, + const std::string& daemon) { + auto p = services.find(service); + if (p == services.end()) { + return false; + } + auto q = p->second.daemons.find(daemon); + if (q == p->second.daemons.end()) { + return false; + } + 
p->second.daemons.erase(q); + if (p->second.daemons.empty()) { + services.erase(p); + } + return true; + } + + static inline bool is_normal_ceph_entity(std::string_view type) { + if (type == "osd" || + type == "client" || + type == "mon" || + type == "mds" || + type == "mgr") { + return true; + } + + return false; + } +}; +WRITE_CLASS_ENCODER_FEATURES(ServiceMap) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Service) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Daemon) diff --git a/src/mgr/StandbyPyModules.cc b/src/mgr/StandbyPyModules.cc new file mode 100644 index 00000000..6240b997 --- /dev/null +++ b/src/mgr/StandbyPyModules.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "StandbyPyModules.h" + +#include "common/Finisher.h" +#include "common/debug.h" +#include "common/errno.h" + +#include "mgr/MgrContext.h" +#include "mgr/Gil.h" + + +#include +#include "include/ceph_assert.h" // boost clobbers this + +// For ::config_prefix +#include "PyModuleRegistry.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +StandbyPyModules::StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc_, + Finisher &f) + : state(module_config, monc_), + clog(clog_), + finisher(f) +{ + state.set_mgr_map(mgr_map_); +} + +// FIXME: completely identical to ActivePyModules +void StandbyPyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Signal modules to drop out of serve() and/or tear down resources + for (auto &i : modules) { + auto module = i.second.get(); + const auto& name = i.first; + dout(10) << "waiting for module " << name << " to shutdown" << dendl; + lock.Unlock(); + module->shutdown(); + lock.Lock(); + dout(10) << "module " << name << " shutdown" << dendl; + } + + // For modules implementing serve(), finish the threads where we + // were running that. + for (auto &i : modules) { + lock.Unlock(); + dout(10) << "joining thread for module " << i.first << dendl; + i.second->thread.join(); + dout(10) << "joined thread for module " << i.first << dendl; + lock.Lock(); + } + + modules.clear(); +} + +void StandbyPyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + const auto name = py_module->get_name(); + auto standby_module = new StandbyPyModule(state, py_module, clog); + + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. 
+ finisher.queue(new FunctionContext([this, standby_module, name](int) { + int r = standby_module->load(); + if (r != 0) { + derr << "Failed to run module in standby mode ('" << name << "')" + << dendl; + delete standby_module; + } else { + std::lock_guard l(lock); + auto em = modules.emplace(name, standby_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + standby_module->thread.create(standby_module->get_thread_name()); + } + })); +} + +int StandbyPyModule::load() +{ + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. + auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + ceph_assert(pThisPtr != nullptr); + auto pModuleName = PyString_FromString(get_name().c_str()); + ceph_assert(pModuleName != nullptr); + auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr); + Py_DECREF(pThisPtr); + Py_DECREF(pModuleName); + + pClassInstance = PyObject_CallObject(py_module->pStandbyClass, pArgs); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror() << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + return 0; + } +} + +bool StandbyPyModule::get_config(const std::string &key, + std::string *value) const +{ + const std::string global_key = PyModule::config_prefix + + get_name() + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + return state.with_config([global_key, value](const PyModuleConfig &config){ + if (config.config.count(global_key)) { + *value = config.config.at(global_key); + return true; + } else { + return false; + } + }); +} + +bool StandbyPyModule::get_store(const std::string &key, + std::string *value) const +{ + + const std::string global_key = PyModule::config_prefix + + get_name() + "/" + key; + + dout(4) << __func__ << " 
key: " << global_key << dendl; + + // Active modules use a cache of store values (kept up to date + // as writes pass through the active mgr), but standbys + // fetch values synchronously to get an up to date value. + // It's an acceptable cost because standby modules should not be + // doing a lot. + + MonClient &monc = state.get_monc(); + + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" + << global_key << "\"}"; + + bufferlist outbl; + std::string outs; + C_SaferCond c; + monc.start_mon_command( + {cmd_json.str()}, + {}, + &outbl, + &outs, + &c); + + int r = c.wait(); + if (r == -ENOENT) { + return false; + } else if (r != 0) { + // This is some internal error, not meaningful to python modules, + // so let them just see no value. + derr << __func__ << " error fetching store key '" << global_key << "': " + << cpp_strerror(r) << " " << outs << dendl; + return false; + } else { + *value = outbl.to_str(); + return true; + } +} + +std::string StandbyPyModule::get_active_uri() const +{ + std::string result; + state.with_mgr_map([&result, this](const MgrMap &mgr_map){ + auto iter = mgr_map.services.find(get_name()); + if (iter != mgr_map.services.end()) { + result = iter->second; + } + }); + + return result; +} + diff --git a/src/mgr/StandbyPyModules.h b/src/mgr/StandbyPyModules.h new file mode 100644 index 00000000..0de0b07e --- /dev/null +++ b/src/mgr/StandbyPyModules.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#pragma once + +#include "PythonCompat.h" + +#include +#include + +#include "common/Thread.h" +#include "common/Mutex.h" + +#include "mgr/Gil.h" +#include "mon/MonClient.h" +#include "mon/MgrMap.h" +#include "mgr/PyModuleRunner.h" + +class Finisher; + +/** + * State that is read by all modules running in standby mode + */ +class StandbyPyModuleState +{ + mutable Mutex lock{"StandbyPyModuleState::lock"}; + + MgrMap mgr_map; + PyModuleConfig &module_config; + MonClient &monc; + +public: + + + StandbyPyModuleState(PyModuleConfig &module_config_, MonClient &monc_) + : module_config(module_config_), monc(monc_) + {} + + void set_mgr_map(const MgrMap &mgr_map_) + { + std::lock_guard l(lock); + + mgr_map = mgr_map_; + } + + // MonClient does all its own locking so we're happy to hand out + // references. + MonClient &get_monc() {return monc;}; + + template + void with_mgr_map(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward(cb)(mgr_map, std::forward(args)...); + } + + template + auto with_config(Callback&& cb, Args&&... 
args) const -> + decltype(cb(module_config, std::forward(args)...)) { + std::lock_guard l(lock); + + return std::forward(cb)(module_config, std::forward(args)...); + } +}; + + +class StandbyPyModule : public PyModuleRunner +{ + StandbyPyModuleState &state; + + public: + + StandbyPyModule( + StandbyPyModuleState &state_, + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + PyModuleRunner(py_module_, clog_), + state(state_) + { + } + + bool get_config(const std::string &key, std::string *value) const; + bool get_store(const std::string &key, std::string *value) const; + std::string get_active_uri() const; + + int load(); +}; + +class StandbyPyModules +{ +private: + mutable Mutex lock{"StandbyPyModules::lock"}; + std::map> modules; + + StandbyPyModuleState state; + + LogChannelRef clog; + + Finisher &finisher; + +public: + + StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc, + Finisher &f); + + void start_one(PyModuleRef py_module); + + void shutdown(); + + void handle_mgr_map(const MgrMap &mgr_map) + { + state.set_mgr_map(mgr_map); + } + +}; diff --git a/src/mgr/mgr_commands.cc b/src/mgr/mgr_commands.cc new file mode 100644 index 00000000..206d1126 --- /dev/null +++ b/src/mgr/mgr_commands.cc @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr_commands.h" + +/* The set of statically defined (C++-handled) commands. 
This + * does not include the Python-defined commands, which are loaded + * in PyModules */ +const std::vector mgr_commands = { +#define COMMAND(parsesig, helptext, module, perm) \ + {parsesig, helptext, module, perm, 0}, +#include "MgrCommands.h" +#undef COMMAND +}; diff --git a/src/mgr/mgr_commands.h b/src/mgr/mgr_commands.h new file mode 100644 index 00000000..c6ed6c68 --- /dev/null +++ b/src/mgr/mgr_commands.h @@ -0,0 +1,9 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "mon/MonCommand.h" +#include + +extern const std::vector mgr_commands; -- cgit v1.2.3