Diffstat
67 files changed, 18690 insertions, 0 deletions
diff --git a/src/mgr/ActivePyModule.cc b/src/mgr/ActivePyModule.cc new file mode 100644 index 000000000..c244966e5 --- /dev/null +++ b/src/mgr/ActivePyModule.cc @@ -0,0 +1,279 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "PyFormatter.h" + +#include "common/debug.h" +#include "mon/MonCommand.h" + +#include "ActivePyModule.h" +#include "MgrSession.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using std::string; +using namespace std::literals; + +int ActivePyModule::load(ActivePyModules *py_modules) +{ + ceph_assert(py_modules); + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. + auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + auto pPyModules = PyCapsule_New(py_modules, nullptr, nullptr); + auto pModuleName = PyUnicode_FromString(get_name().c_str()); + auto pArgs = PyTuple_Pack(3, pModuleName, pPyModules, pThisPtr); + + pClassInstance = PyObject_CallObject(py_module->pClass, pArgs); + Py_DECREF(pModuleName); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror(true, get_name(), "ActivePyModule::load") << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + } + + return 0; +} + +void ActivePyModule::notify(const std::string ¬ify_type, const std::string ¬ify_id) +{ + if (is_dead()) { + dout(5) << "cancelling notify " << notify_type << " " << notify_id << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("notify"), const_cast<char*>("(ss)"), + notify_type.c_str(), notify_id.c_str()); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << get_name() << ".notify:" << dendl; + derr << handle_pyerror(true, get_name(), "ActivePyModule::notify") << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +void ActivePyModule::notify_clog(const LogEntry &log_entry) +{ + if (is_dead()) { + dout(5) << "cancelling notify_clog" << dendl; + return; + } + + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + // Construct python-ized LogEntry + PyFormatter f; + log_entry.dump(&f); + auto py_log_entry = f.get(); + + // Execute + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("notify"), const_cast<char*>("(sN)"), + "clog", py_log_entry); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << get_name() << ".notify_clog:" << dendl; + derr << handle_pyerror(true, get_name(), "ActivePyModule::notify_clog") << dendl; + // FIXME: callers can't be expected to handle a python module + // that has spontaneously broken, but Mgr() should provide + // a 
hook to unload misbehaving modules when they have an + // error somewhere like this + } +} + +bool ActivePyModule::method_exists(const std::string &method) const +{ + Gil gil(py_module->pMyThreadState, true); + + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + if (boundMethod == nullptr) { + return false; + } else { + Py_DECREF(boundMethod); + return true; + } +} + +PyObject *ActivePyModule::dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + ceph_assert(err != nullptr); + + // Rather than serializing arguments, pass the CPython objects. + // Works because we happen to know that the subinterpreter + // implementation shares a GIL, allocator, deallocator and GC state, so + // it's okay to pass the objects between subinterpreters. + // But in future this might involve serialization to support a CSP-aware + // future Python interpreter a la PEP554 + + Gil gil(py_module->pMyThreadState, true); + + // Fire the receiving method + auto boundMethod = PyObject_GetAttrString(pClassInstance, method.c_str()); + + // Caller should have done method_exists check first! + ceph_assert(boundMethod != nullptr); + + dout(20) << "Calling " << py_module->get_name() + << "." << method << "..." << dendl; + + auto remoteResult = PyObject_Call(boundMethod, + args, kwargs); + Py_DECREF(boundMethod); + + if (remoteResult == nullptr) { + // Because the caller is in a different context, we can't let this + // exception bubble up, need to re-raise it from the caller's + // context later. + std::string caller = "ActivePyModule::dispatch_remote "s + method; + *err = handle_pyerror(true, get_name(), caller); + } else { + dout(20) << "Success calling '" << method << "'" << dendl; + } + + return remoteResult; +} + +void ActivePyModule::config_notify() +{ + if (is_dead()) { + dout(5) << "cancelling config_notify" << dendl; + return; + } + + Gil gil(py_module->pMyThreadState, true); + dout(20) << "Calling " << py_module->get_name() << "._config_notify..." + << dendl; + auto remoteResult = PyObject_CallMethod(pClassInstance, + const_cast<char*>("_config_notify"), + (char*)NULL); + if (remoteResult != nullptr) { + Py_DECREF(remoteResult); + } +} + +int ActivePyModule::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + ceph_assert(ss != nullptr); + ceph_assert(ds != nullptr); + + if (pClassInstance == nullptr) { + // Not the friendliest error string, but we could only + // hit this in quite niche cases, if at all. + *ss << "Module not instantiated"; + return -EINVAL; + } + + Gil gil(py_module->pMyThreadState, true); + + PyFormatter f; + TOPNSPC::common::cmdmap_dump(cmdmap, &f); + PyObject *py_cmd = f.get(); + string instr; + inbuf.begin().copy(inbuf.length(), instr); + + ceph_assert(m_session == nullptr); + m_command_perms = module_command.perm; + m_session = &session; + + auto pResult = PyObject_CallMethod(pClassInstance, + const_cast<char*>("_handle_command"), const_cast<char*>("s#O"), + instr.c_str(), instr.length(), py_cmd); + + m_command_perms.clear(); + m_session = nullptr; + Py_DECREF(py_cmd); + + int r = 0; + if (pResult != NULL) { + if (PyTuple_Size(pResult) != 3) { + derr << "module '" << py_module->get_name() << "' command handler " + "returned wrong type!" 
<< dendl; + r = -EINVAL; + } else { + r = PyLong_AsLong(PyTuple_GetItem(pResult, 0)); + *ds << PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 1)); + *ss << PyUnicode_AsUTF8(PyTuple_GetItem(pResult, 2)); + } + + Py_DECREF(pResult); + } else { + derr << "module '" << py_module->get_name() << "' command handler " + "threw exception: " << peek_pyerror() << dendl; + *ds << ""; + *ss << handle_pyerror(); + r = -EINVAL; + } + + return r; +} + +void ActivePyModule::get_health_checks(health_check_map_t *checks) +{ + if (is_dead()) { + dout(5) << "cancelling get_health_checks" << dendl; + return; + } + checks->merge(health_checks); +} + +bool ActivePyModule::is_authorized( + const std::map<std::string, std::string>& arguments) const { + if (m_session == nullptr) { + return false; + } + + // No need to pass command prefix here since that would have already been + // tested before command invokation. Instead, only test for service/module + // arguments as defined by the module itself. + MonCommand mon_command {"", "", "", m_command_perms}; + return m_session->caps.is_capable(nullptr, m_session->entity_name, "py", + py_module->get_name(), "", arguments, + mon_command.requires_perm('r'), + mon_command.requires_perm('w'), + mon_command.requires_perm('x'), + m_session->get_peer_addr()); +} diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h new file mode 100644 index 000000000..187fb68f8 --- /dev/null +++ b/src/mgr/ActivePyModule.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#pragma once + +// Python.h comes first because otherwise it clobbers ceph's assert +#include "Python.h" + +#include "common/cmdparse.h" +#include "common/LogEntry.h" +#include "common/Thread.h" +#include "common/Finisher.h" +#include "mon/health_check.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" +#include "PyModule.h" + +#include <vector> +#include <string> + + +class ActivePyModule; +class ActivePyModules; +class MgrSession; +class ModuleCommand; + +class ActivePyModule : public PyModuleRunner +{ +private: + health_check_map_t health_checks; + + // Optional, URI exposed by plugins that implement serve() + std::string uri; + + std::string m_command_perms; + const MgrSession* m_session = nullptr; + std::string fin_thread_name; +public: + Finisher finisher; // per active module finisher to execute commands + +public: + ActivePyModule(const PyModuleRef &py_module_, + LogChannelRef clog_) + : PyModuleRunner(py_module_, clog_), + fin_thread_name(std::string("m-fin-" + py_module->get_name()).substr(0,15)), + finisher(g_ceph_context, thread_name, fin_thread_name) + + { + } + + int load(ActivePyModules *py_modules); + void notify(const std::string ¬ify_type, const std::string ¬ify_id); + void notify_clog(const LogEntry &le); + + bool method_exists(const std::string &method) const; + + PyObject *dispatch_remote( + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + + bool set_health_checks(health_check_map_t&& c) { + // when health checks change a report is immediately sent to the monitors. + // currently modules have static health check details, but this equality + // test could be made smarter if too much noise shows up in the future. + bool changed = health_checks != c; + health_checks = std::move(c); + return changed; + } + void get_health_checks(health_check_map_t *checks); + void config_notify(); + + void set_uri(const std::string &str) + { + uri = str; + } + + std::string get_uri() const + { + return uri; + } + + std::string get_fin_thread_name() const + { + return fin_thread_name; + } + + bool is_authorized(const std::map<std::string, std::string>& arguments) const; + +}; + + diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc new file mode 100644 index 000000000..45038e734 --- /dev/null +++ b/src/mgr/ActivePyModules.cc @@ -0,0 +1,1553 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +// Include this first to get python headers earlier +#include "Gil.h" + +#include "ActivePyModules.h" + +#include <rocksdb/version.h> + +#include "common/errno.h" +#include "include/stringify.h" + +#include "mon/MonMap.h" +#include "osd/OSDMap.h" +#include "osd/osd_types.h" +#include "mgr/MgrContext.h" +#include "mgr/TTLCache.h" +#include "mgr/mgr_perf_counters.h" + +#include "DaemonKey.h" +#include "DaemonServer.h" +#include "mgr/MgrContext.h" +#include "PyFormatter.h" +// For ::mgr_store_prefix +#include "PyModule.h" +#include "PyModuleRegistry.h" +#include "PyUtil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using std::pair; +using std::string; +using namespace std::literals; + +ActivePyModules::ActivePyModules( + PyModuleConfig &module_config_, + std::map<std::string, std::string> store_data, + bool mon_provides_kv_sub, + DaemonStateIndex &ds, ClusterState &cs, + MonClient &mc, LogChannelRef clog_, + LogChannelRef audit_clog_, Objecter &objecter_, + Client &client_, Finisher &f, DaemonServer &server, + PyModuleRegistry &pmr) +: module_config(module_config_), daemon_state(ds), cluster_state(cs), + monc(mc), clog(clog_), audit_clog(audit_clog_), objecter(objecter_), + client(client_), finisher(f), + cmd_finisher(g_ceph_context, "cmd_finisher", "cmdfin"), + server(server), py_module_registry(pmr) +{ + store_cache = std::move(store_data); + // we can only trust our ConfigMap if the mon cluster has provided + // kv sub since our startup. + have_local_config_map = mon_provides_kv_sub; + _refresh_config_map(); + cmd_finisher.start(); +} + +ActivePyModules::~ActivePyModules() = default; + +void ActivePyModules::dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f) +{ + f->dump_string("hostname", hostname); + f->open_array_section("services"); + std::string ceph_version; + + for (const auto &[key, state] : dmc) { + std::string id; + without_gil([&ceph_version, &id, state=state] { + std::lock_guard l(state->lock); + // TODO: pick the highest version, and make sure that + // somewhere else (during health reporting?) 
we are + // indicating to the user if we see mixed versions + auto ver_iter = state->metadata.find("ceph_version"); + if (ver_iter != state->metadata.end()) { + ceph_version = state->metadata.at("ceph_version"); + } + if (state->metadata.find("id") != state->metadata.end()) { + id = state->metadata.at("id"); + } + }); + f->open_object_section("service"); + f->dump_string("type", key.type); + f->dump_string("id", key.name); + f->dump_string("ceph_version", ceph_version); + if (!id.empty()) { + f->dump_string("name", id); + } + f->close_section(); + } + f->close_section(); + + f->dump_string("ceph_version", ceph_version); +} + +PyObject *ActivePyModules::get_server_python(const std::string &hostname) +{ + const auto dmc = without_gil([&]{ + std::lock_guard l(lock); + dout(10) << " (" << hostname << ")" << dendl; + return daemon_state.get_by_server(hostname); + }); + PyFormatter f; + dump_server(hostname, dmc, &f); + return f.get(); +} + + +PyObject *ActivePyModules::list_servers_python() +{ + dout(10) << " >" << dendl; + + without_gil_t no_gil; + return daemon_state.with_daemons_by_server([this, &no_gil] + (const std::map<std::string, DaemonStateCollection> &all) { + no_gil.acquire_gil(); + PyFormatter f(false, true); + for (const auto &[hostname, daemon_state] : all) { + f.open_object_section("server"); + dump_server(hostname, daemon_state, &f); + f.close_section(); + } + return f.get(); + }); +} + +PyObject *ActivePyModules::get_metadata_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey{svc_type, svc_id}); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + PyFormatter f; + f.dump_string("hostname", metadata->hostname); + for (const auto &[key, val] : metadata->metadata) { + f.dump_string(key, val); + } + + return f.get(); +} + +PyObject *ActivePyModules::get_daemon_status_python( + const std::string &svc_type, + const std::string &svc_id) +{ + auto metadata = daemon_state.get(DaemonKey{svc_type, svc_id}); + if (metadata == nullptr) { + derr << "Requested missing service " << svc_type << "." << svc_id << dendl; + Py_RETURN_NONE; + } + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + PyFormatter f; + for (const auto &[daemon, status] : metadata->service_status) { + f.dump_string(daemon, status); + } + return f.get(); +} + +void ActivePyModules::update_cache_metrics() { + auto hit_miss_ratio = ttl_cache.get_hit_miss_ratio(); + perfcounter->set(l_mgr_cache_hit, hit_miss_ratio.first); + perfcounter->set(l_mgr_cache_miss, hit_miss_ratio.second); +} + +PyObject *ActivePyModules::cacheable_get_python(const std::string &what) +{ + uint64_t ttl_seconds = g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds"); + if(ttl_seconds > 0) { + ttl_cache.set_ttl(ttl_seconds); + try{ + PyObject* cached = ttl_cache.get(what); + update_cache_metrics(); + return cached; + } catch (std::out_of_range& e) {} + } + + PyObject *obj = get_python(what); + if(ttl_seconds && ttl_cache.is_cacheable(what)) { + ttl_cache.insert(what, obj); + Py_INCREF(obj); + } + update_cache_metrics(); + return obj; +} + +PyObject *ActivePyModules::get_python(const std::string &what) +{ + uint64_t ttl_seconds = g_conf().get_val<uint64_t>("mgr_ttl_cache_expire_seconds"); + + PyFormatter pf; + PyJSONFormatter jf; + // Use PyJSONFormatter if TTL cache is enabled. + Formatter &f = ttl_seconds ? 
(Formatter&)jf : (Formatter&)pf; + + if (what == "fs_map") { + without_gil_t no_gil; + cluster_state.with_fsmap([&](const FSMap &fsmap) { + no_gil.acquire_gil(); + fsmap.dump(&f); + }); + } else if (what == "osdmap_crush_map_text") { + without_gil_t no_gil; + bufferlist rdata; + cluster_state.with_osdmap([&](const OSDMap &osd_map){ + osd_map.crush->encode(rdata, CEPH_FEATURES_SUPPORTED_DEFAULT); + }); + std::string crush_text = rdata.to_str(); + no_gil.acquire_gil(); + return PyUnicode_FromString(crush_text.c_str()); + } else if (what.substr(0, 7) == "osd_map") { + without_gil_t no_gil; + cluster_state.with_osdmap([&](const OSDMap &osd_map){ + no_gil.acquire_gil(); + if (what == "osd_map") { + osd_map.dump(&f, g_ceph_context); + } else if (what == "osd_map_tree") { + osd_map.print_tree(&f, nullptr); + } else if (what == "osd_map_crush") { + osd_map.crush->dump(&f); + } + }); + } else if (what == "modified_config_options") { + without_gil_t no_gil; + auto all_daemons = daemon_state.get_all(); + set<string> names; + for (auto& [key, daemon] : all_daemons) { + std::lock_guard l(daemon->lock); + for (auto& [name, valmap] : daemon->config) { + names.insert(name); + } + } + no_gil.acquire_gil(); + f.open_array_section("options"); + for (auto& name : names) { + f.dump_string("name", name); + } + f.close_section(); + } else if (what.substr(0, 6) == "config") { + // We make a copy of the global config to avoid printing + // to py formater (which may drop-take GIL) while holding + // the global config lock, which might deadlock with other + // thread that is holding the GIL and acquiring the global + // config lock. + ConfigProxy config{g_conf()}; + if (what == "config_options") { + config.config_options(&f); + } else if (what == "config") { + config.show_config(&f); + } + } else if (what == "mon_map") { + without_gil_t no_gil; + cluster_state.with_monmap([&](const MonMap &monmap) { + no_gil.acquire_gil(); + monmap.dump(&f); + }); + } else if (what == "service_map") { + without_gil_t no_gil; + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + no_gil.acquire_gil(); + service_map.dump(&f); + }); + } else if (what == "osd_metadata") { + without_gil_t no_gil; + auto dmc = daemon_state.get_by_service("osd"); + for (const auto &[key, state] : dmc) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&f, &name=key.name, state=state] { + f.open_object_section(name.c_str()); + f.dump_string("hostname", state->hostname); + for (const auto &[name, val] : state->metadata) { + f.dump_string(name.c_str(), val); + } + f.close_section(); + }); + } + } else if (what == "mds_metadata") { + without_gil_t no_gil; + auto dmc = daemon_state.get_by_service("mds"); + for (const auto &[key, state] : dmc) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&f, &name=key.name, state=state] { + f.open_object_section(name.c_str()); + f.dump_string("hostname", state->hostname); + for (const auto &[name, val] : state->metadata) { + f.dump_string(name.c_str(), val); + } + f.close_section(); + }); + } + } else if (what == "pg_summary") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&f, &no_gil](const PGMap &pg_map) { + std::map<std::string, std::map<std::string, uint32_t> > osds; + std::map<std::string, std::map<std::string, uint32_t> > pools; + std::map<std::string, uint32_t> all; + for (const auto &i : pg_map.pg_stat) { + const auto pool = i.first.m_pool; + const std::string state = pg_state_string(i.second.state); + // Insert to per-pool map + pools[stringify(pool)][state]++; + for (const auto 
&osd_id : i.second.acting) { + osds[stringify(osd_id)][state]++; + } + all[state]++; + } + no_gil.acquire_gil(); + f.open_object_section("by_osd"); + for (const auto &i : osds) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("by_pool"); + for (const auto &i : pools) { + f.open_object_section(i.first.c_str()); + for (const auto &j : i.second) { + f.dump_int(j.first.c_str(), j.second); + } + f.close_section(); + } + f.close_section(); + f.open_object_section("all"); + for (const auto &i : all) { + f.dump_int(i.first.c_str(), i.second); + } + f.close_section(); + f.open_object_section("pg_stats_sum"); + pg_map.pg_sum.dump(&f); + f.close_section(); + } + ); + } else if (what == "pg_status") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.print_summary(&f, nullptr); + } + ); + } else if (what == "pg_dump") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump(&f, false); + } + ); + } else if (what == "devices") { + without_gil_t no_gil; + daemon_state.with_devices2( + [&] { + with_gil(no_gil, [&] { f.open_array_section("devices"); }); + }, + [&](const DeviceState &dev) { + with_gil(no_gil, [&] { f.dump_object("device", dev); }); + }); + with_gil(no_gil, [&] { + f.close_section(); + }); + } else if (what.size() > 7 && + what.substr(0, 7) == "device ") { + without_gil_t no_gil; + string devid = what.substr(7); + if (!daemon_state.with_device(devid, + [&] (const DeviceState& dev) { + with_gil_t with_gil{no_gil}; + f.dump_object("device", dev); + })) { + // device not found + } + } else if (what == "io_rate") { + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_delta(&f); + } + ); + } else if (what == "df") { + without_gil_t no_gil; + cluster_state.with_osdmap_and_pgmap( + [&]( + const OSDMap& osd_map, + const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_cluster_stats(nullptr, &f, true); + pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true); + }); + } else if (what == "pg_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pg_stats(&f, false); + }); + } else if (what == "pool_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pool_stats(&f); + }); + } else if (what == "pg_ready") { + server.dump_pg_ready(&f); + } else if (what == "pg_progress") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_pg_progress(&f); + server.dump_pg_ready(&f); + }); + } else if (what == "osd_stats") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_osd_stats(&f, false); + }); + } else if (what == "osd_ping_times") { + without_gil_t no_gil; + cluster_state.with_pgmap([&](const PGMap &pg_map) { + no_gil.acquire_gil(); + pg_map.dump_osd_ping_times(&f); + }); + } else if (what == "osd_pool_stats") { + without_gil_t no_gil; + int64_t poolid = -ENOENT; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, + const PGMap& pg_map) { + no_gil.acquire_gil(); + f.open_array_section("pool_stats"); + for (auto &p : osdmap.get_pools()) { + poolid = p.first; + 
pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, &f, nullptr); + } + f.close_section(); + }); + } else if (what == "health") { + without_gil_t no_gil; + cluster_state.with_health([&](const ceph::bufferlist &health_json) { + no_gil.acquire_gil(); + f.dump_string("json", health_json.to_str()); + }); + } else if (what == "mon_status") { + without_gil_t no_gil; + cluster_state.with_mon_status( + [&](const ceph::bufferlist &mon_status_json) { + no_gil.acquire_gil(); + f.dump_string("json", mon_status_json.to_str()); + }); + } else if (what == "mgr_map") { + without_gil_t no_gil; + cluster_state.with_mgrmap([&](const MgrMap &mgr_map) { + no_gil.acquire_gil(); + mgr_map.dump(&f); + }); + } else if (what == "mgr_ips") { + entity_addrvec_t myaddrs = server.get_myaddrs(); + f.open_array_section("ips"); + std::set<std::string> did; + for (auto& i : myaddrs.v) { + std::string ip = i.ip_only_to_str(); + if (auto [where, inserted] = did.insert(ip); inserted) { + f.dump_string("ip", ip); + } + } + f.close_section(); + } else if (what == "have_local_config_map") { + f.dump_bool("have_local_config_map", have_local_config_map); + } else if (what == "active_clean_pgs"){ + without_gil_t no_gil; + cluster_state.with_pgmap( + [&](const PGMap &pg_map) { + no_gil.acquire_gil(); + f.open_array_section("pg_stats"); + for (auto &i : pg_map.pg_stat) { + const auto state = i.second.state; + const auto pgid_raw = i.first; + const auto pgid = stringify(pgid_raw.m_pool) + "." + stringify(pgid_raw.m_seed); + const auto reported_epoch = i.second.reported_epoch; + if (state & PG_STATE_ACTIVE && state & PG_STATE_CLEAN) { + f.open_object_section("pg_stat"); + f.dump_string("pgid", pgid); + f.dump_string("state", pg_state_string(state)); + f.dump_unsigned("reported_epoch", reported_epoch); + f.close_section(); + } + } + f.close_section(); + const auto num_pg = pg_map.num_pg; + f.dump_unsigned("total_num_pgs", num_pg); + }); + } else { + derr << "Python module requested unknown data '" << what << "'" << dendl; + Py_RETURN_NONE; + } + if(ttl_seconds) { + return jf.get(); + } else { + return pf.get(); + } +} + +void ActivePyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + + const auto name = py_module->get_name(); + auto active_module = std::make_shared<ActivePyModule>(py_module, clog); + + pending_modules.insert(name); + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. 
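+  // The queued callback below constructs the module's Python class via load();
+  // only on success is the module added to 'modules' and its serve() thread and
+  // per-module finisher started.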
+ finisher.queue(new LambdaContext([this, active_module, name](int) { + int r = active_module->load(this); + std::lock_guard l(lock); + pending_modules.erase(name); + if (r != 0) { + derr << "Failed to run module in active mode ('" << name << "')" + << dendl; + } else { + auto em = modules.emplace(name, active_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + active_module->thread.create(active_module->get_thread_name()); + dout(4) << "Starting active module " << name <<" finisher thread " + << active_module->get_fin_thread_name() << dendl; + active_module->finisher.start(); + } + })); +} + +void ActivePyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Stop per active module finisher thread + for (auto& [name, module] : modules) { + dout(4) << "Stopping active module " << name << " finisher thread" << dendl; + module->finisher.wait_for_empty(); + module->finisher.stop(); + } + + // Signal modules to drop out of serve() and/or tear down resources + for (auto& [name, module] : modules) { + lock.unlock(); + dout(10) << "calling module " << name << " shutdown()" << dendl; + module->shutdown(); + dout(10) << "module " << name << " shutdown() returned" << dendl; + lock.lock(); + } + + // For modules implementing serve(), finish the threads where we + // were running that. + for (auto& [name, module] : modules) { + lock.unlock(); + dout(10) << "joining module " << name << dendl; + module->thread.join(); + dout(10) << "joined module " << name << dendl; + lock.lock(); + } + + cmd_finisher.wait_for_empty(); + cmd_finisher.stop(); + + modules.clear(); +} + +void ActivePyModules::notify_all(const std::string ¬ify_type, + const std::string ¬ify_id) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all " << notify_type << dendl; + for (auto& [name, module] : modules) { + if (!py_module_registry.should_notify(name, notify_type)) { + continue; + } + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + dout(15) << "queuing notify (" << notify_type << ") to " << name << dendl; + Finisher& mod_finisher = py_module_registry.get_active_module_finisher(name); + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + mod_finisher.queue(new LambdaContext([module=module, notify_type, notify_id] + (int r){ + module->notify(notify_type, notify_id); + })); + } +} + +void ActivePyModules::notify_all(const LogEntry &log_entry) +{ + std::lock_guard l(lock); + + dout(10) << __func__ << ": notify_all (clog)" << dendl; + for (auto& [name, module] : modules) { + if (!py_module_registry.should_notify(name, "clog")) { + continue; + } + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + // + // Note intentional use of non-reference lambda binding on + // log_entry: we take a copy because caller's instance is + // probably ephemeral. 
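+    // The by-value capture in the lambda below keeps the LogEntry alive until
+    // the queued notify_clog() call actually runs on the module's finisher thread.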
+ dout(15) << "queuing notify (clog) to " << name << dendl; + Finisher& mod_finisher = py_module_registry.get_active_module_finisher(name); + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + mod_finisher.queue(new LambdaContext([module=module, log_entry](int r){ + module->notify_clog(log_entry); + })); + } +} + +bool ActivePyModules::get_store(const std::string &module_name, + const std::string &key, std::string *val) const +{ + without_gil_t no_gil; + std::lock_guard l(lock); + + const std::string global_key = PyModule::mgr_store_prefix + + module_name + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + auto i = store_cache.find(global_key); + if (i != store_cache.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err) +{ + auto mod_iter = modules.find(other_module); + ceph_assert(mod_iter != modules.end()); + + return mod_iter->second->dispatch_remote(method, args, kwargs, err); +} + +bool ActivePyModules::get_config(const std::string &module_name, + const std::string &key, std::string *val) const +{ + const std::string global_key = "mgr/" + module_name + "/" + key; + + dout(20) << " key: " << global_key << dendl; + + std::lock_guard lock(module_config.lock); + + auto i = module_config.config.find(global_key); + if (i != module_config.config.end()) { + *val = i->second; + return true; + } else { + return false; + } +} + +PyObject *ActivePyModules::get_typed_config( + const std::string &module_name, + const std::string &key, + const std::string &prefix) const +{ + without_gil_t no_gil; + std::string value; + std::string final_key; + bool found = false; + if (prefix.size()) { + final_key = prefix + "/" + key; + found = get_config(module_name, final_key, &value); + } + if (!found) { + final_key = key; + found = get_config(module_name, final_key, &value); + } + if (found) { + PyModuleRef module = py_module_registry.get_module(module_name); + no_gil.acquire_gil(); + if (!module) { + derr << "Module '" << module_name << "' is not available" << dendl; + Py_RETURN_NONE; + } + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << __func__ << " " << final_key << " found: " << value << dendl; + dout(10) << __func__ << " " << final_key << " found" << dendl; + return module->get_typed_option_value(key, value); + } + if (prefix.size()) { + dout(10) << " [" << prefix << "/]" << key << " not found " + << dendl; + } else { + dout(10) << " " << key << " not found " << dendl; + } + Py_RETURN_NONE; +} + +PyObject *ActivePyModules::get_store_prefix(const std::string &module_name, + const std::string &prefix) const +{ + without_gil_t no_gil; + std::lock_guard l(lock); + std::lock_guard lock(module_config.lock); + no_gil.acquire_gil(); + + const std::string base_prefix = PyModule::mgr_store_prefix + + module_name + "/"; + const std::string global_prefix = base_prefix + prefix; + dout(4) << __func__ << " prefix: " << global_prefix << dendl; + + PyFormatter f; + for (auto p = store_cache.lower_bound(global_prefix); + p != store_cache.end() && p->first.find(global_prefix) == 0; ++p) { + f.dump_string(p->first.c_str() + base_prefix.size(), p->second); + } + return f.get(); +} + +void ActivePyModules::set_store(const std::string &module_name, + const std::string &key, const std::optional<std::string>& val) +{ + const 
std::string global_key = PyModule::mgr_store_prefix + + module_name + "/" + key; + + Command set_cmd; + { + std::lock_guard l(lock); + + // NOTE: this isn't strictly necessary since we'll also get an MKVData + // update from the mon due to our subscription *before* our command is acked. + if (val) { + store_cache[global_key] = *val; + } else { + store_cache.erase(global_key); + } + + std::ostringstream cmd_json; + JSONFormatter jf; + jf.open_object_section("cmd"); + if (val) { + jf.dump_string("prefix", "config-key set"); + jf.dump_string("key", global_key); + jf.dump_string("val", *val); + } else { + jf.dump_string("prefix", "config-key del"); + jf.dump_string("key", global_key); + } + jf.close_section(); + jf.flush(cmd_json); + set_cmd.run(&monc, cmd_json.str()); + } + set_cmd.wait(); + + if (set_cmd.r != 0) { + // config-key set will fail if mgr's auth key has insufficient + // permission to set config keys + // FIXME: should this somehow raise an exception back into Python land? + dout(0) << "`config-key set " << global_key << " " << val << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl; + } +} + +std::pair<int, std::string> ActivePyModules::set_config( + const std::string &module_name, + const std::string &key, + const std::optional<std::string>& val) +{ + return module_config.set_config(&monc, module_name, key, val); +} + +std::map<std::string, std::string> ActivePyModules::get_services() const +{ + std::map<std::string, std::string> result; + std::lock_guard l(lock); + for (const auto& [name, module] : modules) { + std::string svc_str = module->get_uri(); + if (!svc_str.empty()) { + result[name] = svc_str; + } + } + + return result; +} + +void ActivePyModules::update_kv_data( + const std::string prefix, + bool incremental, + const map<std::string, std::optional<bufferlist>, std::less<>>& data) +{ + std::lock_guard l(lock); + bool do_config = false; + if (!incremental) { + dout(10) << "full update on " << prefix << dendl; + auto p = store_cache.lower_bound(prefix); + while (p != store_cache.end() && p->first.find(prefix) == 0) { + dout(20) << " rm prior " << p->first << dendl; + p = store_cache.erase(p); + } + } else { + dout(10) << "incremental update on " << prefix << dendl; + } + for (auto& i : data) { + if (i.second) { + dout(20) << " set " << i.first << " = " << i.second->to_str() << dendl; + store_cache[i.first] = i.second->to_str(); + } else { + dout(20) << " rm " << i.first << dendl; + store_cache.erase(i.first); + } + if (i.first.find("config/") == 0) { + do_config = true; + } + } + if (do_config) { + _refresh_config_map(); + } +} + +void ActivePyModules::_refresh_config_map() +{ + dout(10) << dendl; + config_map.clear(); + for (auto p = store_cache.lower_bound("config/"); + p != store_cache.end() && p->first.find("config/") == 0; + ++p) { + string key = p->first.substr(7); + if (key.find("mgr/") == 0) { + // NOTE: for now, we ignore module options. see also ceph_foreign_option_get(). 
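+      // (module option values are handled via PyModuleConfig / get_typed_config()
+      // rather than the ConfigMap assembled here)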
+ continue; + } + string value = p->second; + string name; + string who; + config_map.parse_key(key, &name, &who); + + const Option *opt = g_conf().find_option(name); + if (!opt) { + config_map.stray_options.push_back( + std::unique_ptr<Option>( + new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN))); + opt = config_map.stray_options.back().get(); + } + + string err; + int r = opt->pre_validate(&value, &err); + if (r < 0) { + dout(10) << __func__ << " pre-validate failed on '" << name << "' = '" + << value << "' for " << name << dendl; + } + + MaskedOption mopt(opt); + mopt.raw_value = value; + string section_name; + if (who.size() && + !ConfigMap::parse_mask(who, §ion_name, &mopt.mask)) { + derr << __func__ << " invalid mask for key " << key << dendl; + } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) { + dout(10) << __func__ << " NO_MON_UPDATE option '" + << name << "' = '" << value << "' for " << name + << dendl; + } else { + Section *section = &config_map.global;; + if (section_name.size() && section_name != "global") { + if (section_name.find('.') != std::string::npos) { + section = &config_map.by_id[section_name]; + } else { + section = &config_map.by_type[section_name]; + } + } + section->options.insert(make_pair(name, std::move(mopt))); + } + } +} + +PyObject* ActivePyModules::with_perf_counters( + std::function<void(PerfCounterInstance& counter_instance, PerfCounterType& counter_type, PyFormatter& f)> fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const +{ + PyFormatter f; + f.open_array_section(path); + { + without_gil_t no_gil; + std::lock_guard l(lock); + auto metadata = daemon_state.get(DaemonKey{svc_name, svc_id}); + if (metadata) { + std::lock_guard l2(metadata->lock); + if (metadata->perf_counters.instances.count(path)) { + auto counter_instance = metadata->perf_counters.instances.at(path); + auto counter_type = metadata->perf_counters.types.at(path); + with_gil(no_gil, [&] { + fct(counter_instance, counter_type, f); + }); + } else { + dout(4) << "Missing counter: '" << path << "' (" + << svc_name << "." << svc_id << ")" << dendl; + dout(20) << "Paths are:" << dendl; + for (const auto &i : metadata->perf_counters.instances) { + dout(20) << i.first << dendl; + } + } + } else { + dout(4) << "No daemon state for " << svc_name << "." 
<< svc_id << ")" + << dendl; + } + } + f.close_section(); + return f.get(); +} + +PyObject* ActivePyModules::get_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &avg_data = counter_instance.get_data_avg(); + for (const auto &datapoint : avg_data) { + f.open_array_section("datapoint"); + f.dump_float("t", datapoint.t); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + f.close_section(); + } + } else { + const auto &data = counter_instance.get_data(); + for (const auto &datapoint : data) { + f.open_array_section("datapoint"); + f.dump_float("t", datapoint.t); + f.dump_unsigned("v", datapoint.v); + f.close_section(); + } + } + }; + return with_perf_counters(extract_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_latest_counter_python( + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) +{ + auto extract_latest_counters = []( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f) + { + if (counter_type.type & PERFCOUNTER_LONGRUNAVG) { + const auto &datapoint = counter_instance.get_latest_data_avg(); + f.dump_float("t", datapoint.t); + f.dump_unsigned("s", datapoint.s); + f.dump_unsigned("c", datapoint.c); + } else { + const auto &datapoint = counter_instance.get_latest_data(); + f.dump_float("t", datapoint.t); + f.dump_unsigned("v", datapoint.v); + } + }; + return with_perf_counters(extract_latest_counters, svc_name, svc_id, path); +} + +PyObject* ActivePyModules::get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id) +{ + without_gil_t no_gil; + std::lock_guard l(lock); + + DaemonStateCollection daemons; + + if (svc_type == "") { + daemons = daemon_state.get_all(); + } else if (svc_id.empty()) { + daemons = daemon_state.get_by_service(svc_type); + } else { + auto key = DaemonKey{svc_type, svc_id}; + // so that the below can be a loop in all cases + auto got = daemon_state.get(key); + if (got != nullptr) { + daemons[key] = got; + } + } + + auto f = with_gil(no_gil, [&] { + return PyFormatter(); + }); + if (!daemons.empty()) { + for (auto& [key, state] : daemons) { + std::lock_guard l(state->lock); + with_gil(no_gil, [&, key=ceph::to_string(key), state=state] { + f.open_object_section(key.c_str()); + for (auto ctr_inst_iter : state->perf_counters.instances) { + const auto &counter_name = ctr_inst_iter.first; + f.open_object_section(counter_name.c_str()); + auto type = state->perf_counters.types[counter_name]; + f.dump_string("description", type.description); + if (!type.nick.empty()) { + f.dump_string("nick", type.nick); + } + f.dump_unsigned("type", type.type); + f.dump_unsigned("priority", type.priority); + f.dump_unsigned("units", type.unit); + f.close_section(); + } + f.close_section(); + }); + } + } else { + dout(4) << __func__ << ": No daemon state found for " + << svc_type << "." << svc_id << ")" << dendl; + } + return f.get(); +} + +PyObject* ActivePyModules::get_rocksdb_version() +{ + std::string version = std::to_string(ROCKSDB_MAJOR) + "." + + std::to_string(ROCKSDB_MINOR) + "." 
+ + std::to_string(ROCKSDB_PATCH); + + return PyUnicode_FromString(version.c_str()); +} + +PyObject *ActivePyModules::get_context() +{ + auto l = without_gil([&] { + return std::lock_guard(lock); + }); + // Construct a capsule containing ceph context. + // Not incrementing/decrementing ref count on the context because + // it's the global one and it has process lifetime. + auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr); + return capsule; +} + +/** + * Helper for our wrapped types that take a capsule in their constructor. + */ +PyObject *construct_with_capsule( + const std::string &module_name, + const std::string &clsname, + void *wrapped) +{ + // Look up the OSDMap type which we will construct + PyObject *module = PyImport_ImportModule(module_name.c_str()); + if (!module) { + derr << "Failed to import python module:" << dendl; + derr << handle_pyerror(true, module_name, + "construct_with_capsule "s + module_name + " " + clsname) << dendl; + } + ceph_assert(module); + + PyObject *wrapper_type = PyObject_GetAttrString( + module, (const char*)clsname.c_str()); + if (!wrapper_type) { + derr << "Failed to get python type:" << dendl; + derr << handle_pyerror(true, module_name, + "construct_with_capsule "s + module_name + " " + clsname) << dendl; + } + ceph_assert(wrapper_type); + + // Construct a capsule containing an OSDMap. + auto wrapped_capsule = PyCapsule_New(wrapped, nullptr, nullptr); + ceph_assert(wrapped_capsule); + + // Construct the python OSDMap + auto pArgs = PyTuple_Pack(1, wrapped_capsule); + auto wrapper_instance = PyObject_CallObject(wrapper_type, pArgs); + if (wrapper_instance == nullptr) { + derr << "Failed to construct python OSDMap:" << dendl; + derr << handle_pyerror(true, module_name, + "construct_with_capsule "s + module_name + " " + clsname) << dendl; + } + ceph_assert(wrapper_instance != nullptr); + Py_DECREF(pArgs); + Py_DECREF(wrapped_capsule); + + Py_DECREF(wrapper_type); + Py_DECREF(module); + + return wrapper_instance; +} + +PyObject *ActivePyModules::get_osdmap() +{ + auto newmap = without_gil([&] { + OSDMap *newmap = new OSDMap; + cluster_state.with_osdmap([&](const OSDMap& o) { + newmap->deepish_copy_from(o); + }); + return newmap; + }); + return construct_with_capsule("mgr_module", "OSDMap", (void*)newmap); +} + +PyObject *ActivePyModules::get_foreign_config( + const std::string& who, + const std::string& name) +{ + dout(10) << "ceph_foreign_option_get " << who << " " << name << dendl; + + // NOTE: for now this will only work with build-in options, not module options. 
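+  // Look the name up in the compiled-in option schema; unknown options are
+  // reported back to Python as a KeyError.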
+ const Option *opt = g_conf().find_option(name); + if (!opt) { + dout(4) << "ceph_foreign_option_get " << name << " not found " << dendl; + PyErr_Format(PyExc_KeyError, "option not found: %s", name.c_str()); + return nullptr; + } + + // If the monitors are not yet running pacific, we cannot rely on our local + // ConfigMap + if (!have_local_config_map) { + dout(20) << "mon cluster wasn't pacific when we started: falling back to 'config get'" + << dendl; + without_gil_t no_gil; + Command cmd; + { + std::lock_guard l(lock); + cmd.run( + &monc, + "{\"prefix\": \"config get\","s + + "\"who\": \""s + who + "\","s + + "\"key\": \""s + name + "\"}"); + } + cmd.wait(); + dout(10) << "ceph_foreign_option_get (mon command) " << who << " " << name << " = " + << cmd.outbl.to_str() << dendl; + no_gil.acquire_gil(); + return get_python_typed_option_value(opt->type, cmd.outbl.to_str()); + } + + // mimic the behavor of mon/ConfigMonitor's 'config get' command + EntityName entity; + if (!entity.from_str(who) && + !entity.from_str(who + ".")) { + dout(5) << "unrecognized entity '" << who << "'" << dendl; + PyErr_Format(PyExc_KeyError, "invalid entity: %s", who.c_str()); + return nullptr; + } + + without_gil_t no_gil; + lock.lock(); + + // FIXME: this is super inefficient, since we generate the entire daemon + // config just to extract one value from it! + + std::map<std::string,std::string,std::less<>> config; + cluster_state.with_osdmap([&](const OSDMap &osdmap) { + map<string,string> crush_location; + string device_class; + if (entity.is_osd()) { + osdmap.crush->get_full_location(who, &crush_location); + int id = atoi(entity.get_id().c_str()); + const char *c = osdmap.crush->get_item_class(id); + if (c) { + device_class = c; + } + dout(10) << __func__ << " crush_location " << crush_location + << " class " << device_class << dendl; + } + + std::map<std::string,pair<std::string,const MaskedOption*>> src; + config = config_map.generate_entity_map( + entity, + crush_location, + osdmap.crush.get(), + device_class, + &src); + }); + + // get a single value + string value; + auto p = config.find(name); + if (p != config.end()) { + value = p->second; + } else { + if (!entity.is_client() && + opt->daemon_value != Option::value_t{}) { + value = Option::to_str(opt->daemon_value); + } else { + value = Option::to_str(opt->value); + } + } + + dout(10) << "ceph_foreign_option_get (configmap) " << who << " " << name << " = " + << value << dendl; + lock.unlock(); + no_gil.acquire_gil(); + return get_python_typed_option_value(opt->type, value); +} + +void ActivePyModules::set_health_checks(const std::string& module_name, + health_check_map_t&& checks) +{ + bool changed = false; + + lock.lock(); + auto p = modules.find(module_name); + if (p != modules.end()) { + changed = p->second->set_health_checks(std::move(checks)); + } + lock.unlock(); + + // immediately schedule a report to be sent to the monitors with the new + // health checks that have changed. This is done asynchronusly to avoid + // blocking python land. ActivePyModules::lock needs to be dropped to make + // lockdep happy: + // + // send_report callers: DaemonServer::lock -> PyModuleRegistery::lock + // active_start: PyModuleRegistry::lock -> ActivePyModules::lock + // + // if we don't release this->lock before calling schedule_tick a cycle is + // formed with the addition of ActivePyModules::lock -> DaemonServer::lock. + // This is still correct as send_report is run asynchronously under + // DaemonServer::lock. 
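+  // Only poke the server when the module's check set actually changed; the
+  // scheduled tick is what sends the updated health report.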
+ if (changed) + server.schedule_tick(0); +} + +int ActivePyModules::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + lock.lock(); + auto mod_iter = modules.find(module_command.module_name); + if (mod_iter == modules.end()) { + *ss << "Module '" << module_command.module_name << "' is not available"; + lock.unlock(); + return -ENOENT; + } + + lock.unlock(); + return mod_iter->second->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); +} + +void ActivePyModules::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + dout(15) << "getting health checks for " << name << dendl; + module->get_health_checks(checks); + } +} + +void ActivePyModules::update_progress_event( + const std::string& evid, + const std::string& desc, + float progress, + bool add_to_ceph_s) +{ + std::lock_guard l(lock); + auto& pe = progress_events[evid]; + pe.message = desc; + pe.progress = progress; + pe.add_to_ceph_s = add_to_ceph_s; +} + +void ActivePyModules::complete_progress_event(const std::string& evid) +{ + std::lock_guard l(lock); + progress_events.erase(evid); +} + +void ActivePyModules::clear_all_progress_events() +{ + std::lock_guard l(lock); + progress_events.clear(); +} + +void ActivePyModules::get_progress_events(std::map<std::string,ProgressEvent> *events) +{ + std::lock_guard l(lock); + *events = progress_events; +} + +void ActivePyModules::config_notify() +{ + std::lock_guard l(lock); + for (auto& [name, module] : modules) { + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + dout(15) << "notify (config) " << name << dendl; + Finisher& mod_finisher = py_module_registry.get_active_module_finisher(name); + // workaround for https://bugs.llvm.org/show_bug.cgi?id=35984 + mod_finisher.queue(new LambdaContext([module=module](int r){ + module->config_notify(); + })); + } +} + +void ActivePyModules::set_uri(const std::string& module_name, + const std::string &uri) +{ + std::lock_guard l(lock); + + dout(4) << " module " << module_name << " set URI '" << uri << "'" << dendl; + + modules.at(module_name)->set_uri(uri); +} + +void ActivePyModules::set_device_wear_level(const std::string& devid, + float wear_level) +{ + // update mgr state + map<string,string> meta; + daemon_state.with_device( + devid, + [wear_level, &meta] (DeviceState& dev) { + dev.set_wear_level(wear_level); + meta = dev.metadata; + }); + + // tell mon + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + + Command set_cmd; + set_cmd.run(&monc, cmd, json); + set_cmd.wait(); +} + +MetricQueryID ActivePyModules::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit) +{ + return server.add_osd_perf_query(query, limit); +} + +void ActivePyModules::remove_osd_perf_query(MetricQueryID query_id) +{ + int r = server.remove_osd_perf_query(query_id); + if (r < 0) { + dout(0) << "remove_osd_perf_query for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + } +} + +PyObject *ActivePyModules::get_osd_perf_counters(MetricQueryID query_id) +{ + OSDPerfCollector 
collector(query_id); + int r = server.get_osd_perf_counters(&collector); + if (r < 0) { + dout(0) << "get_osd_perf_counters for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + Py_RETURN_NONE; + } + + PyFormatter f; + const std::map<OSDPerfMetricKey, PerformanceCounters> &counters = collector.counters; + + f.open_array_section("counters"); + for (auto &[key, instance_counters] : counters) { + f.open_object_section("i"); + f.open_array_section("k"); + for (auto &sub_key : key) { + f.open_array_section("s"); + for (size_t i = 0; i < sub_key.size(); i++) { + f.dump_string(stringify(i).c_str(), sub_key[i]); + } + f.close_section(); // s + } + f.close_section(); // k + f.open_array_section("c"); + for (auto &c : instance_counters) { + f.open_array_section("p"); + f.dump_unsigned("0", c.first); + f.dump_unsigned("1", c.second); + f.close_section(); // p + } + f.close_section(); // c + f.close_section(); // i + } + f.close_section(); // counters + + return f.get(); +} + +MetricQueryID ActivePyModules::add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit) +{ + return server.add_mds_perf_query(query, limit); +} + +void ActivePyModules::remove_mds_perf_query(MetricQueryID query_id) +{ + int r = server.remove_mds_perf_query(query_id); + if (r < 0) { + dout(0) << "remove_mds_perf_query for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + } +} + +void ActivePyModules::reregister_mds_perf_queries() +{ + server.reregister_mds_perf_queries(); +} + +PyObject *ActivePyModules::get_mds_perf_counters(MetricQueryID query_id) +{ + MDSPerfCollector collector(query_id); + int r = server.get_mds_perf_counters(&collector); + if (r < 0) { + dout(0) << "get_mds_perf_counters for query_id=" << query_id << " failed: " + << cpp_strerror(r) << dendl; + Py_RETURN_NONE; + } + + PyFormatter f; + const std::map<MDSPerfMetricKey, PerformanceCounters> &counters = collector.counters; + + f.open_array_section("metrics"); + + f.open_array_section("delayed_ranks"); + f.dump_string("ranks", stringify(collector.delayed_ranks).c_str()); + f.close_section(); // delayed_ranks + + f.open_array_section("counters"); + for (auto &[key, instance_counters] : counters) { + f.open_object_section("i"); + f.open_array_section("k"); + for (auto &sub_key : key) { + f.open_array_section("s"); + for (size_t i = 0; i < sub_key.size(); i++) { + f.dump_string(stringify(i).c_str(), sub_key[i]); + } + f.close_section(); // s + } + f.close_section(); // k + f.open_array_section("c"); + for (auto &c : instance_counters) { + f.open_array_section("p"); + f.dump_unsigned("0", c.first); + f.dump_unsigned("1", c.second); + f.close_section(); // p + } + f.close_section(); // c + f.close_section(); // i + } + f.close_section(); // counters + + f.open_array_section("last_updated"); + f.dump_float("last_updated_mono", collector.last_updated_mono); + f.close_section(); // last_updated + + f.close_section(); // metrics + + return f.get(); +} + +void ActivePyModules::cluster_log(const std::string &channel, clog_type prio, + const std::string &message) +{ + std::lock_guard l(lock); + + auto cl = monc.get_log_client()->create_channel(channel); + cl->parse_client_options(g_ceph_context); + cl->do_log(prio, message); +} + +void ActivePyModules::register_client(std::string_view name, std::string addrs, bool replace) +{ + entity_addrvec_t addrv; + addrv.parse(addrs.data()); + + dout(7) << "registering msgr client handle " << addrv << " (replace=" << replace << ")" << dendl; + 
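+  // Hand the parsed address vector to the registry under the given client name
+  // ('replace' is forwarded as-is).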
py_module_registry.register_client(name, std::move(addrv), replace); +} + +void ActivePyModules::unregister_client(std::string_view name, std::string addrs) +{ + entity_addrvec_t addrv; + addrv.parse(addrs.data()); + + dout(7) << "unregistering msgr client handle " << addrv << dendl; + py_module_registry.unregister_client(name, addrv); +} + +PyObject* ActivePyModules::get_daemon_health_metrics() +{ + without_gil_t no_gil; + return daemon_state.with_daemons_by_server([&no_gil] + (const std::map<std::string, DaemonStateCollection> &all) { + no_gil.acquire_gil(); + PyFormatter f; + for (const auto &[hostname, daemon_state] : all) { + for (const auto &[key, state] : daemon_state) { + f.open_array_section(ceph::to_string(key)); + for (const auto &metric : state->daemon_health_metrics) { + f.open_object_section(metric.get_type_name()); + f.dump_int("value", metric.get_n1()); + f.dump_string("type", metric.get_type_name()); + f.close_section(); + } + f.close_section(); + } + } + return f.get(); + }); +} diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h new file mode 100644 index 000000000..283f96a6e --- /dev/null +++ b/src/mgr/ActivePyModules.h @@ -0,0 +1,234 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include "ActivePyModule.h" + +#include "common/Finisher.h" +#include "common/ceph_mutex.h" + +#include "PyFormatter.h" + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/LogClient.h" +#include "mon/MgrMap.h" +#include "mon/MonCommand.h" +#include "mon/mon_types.h" +#include "mon/ConfigMap.h" +#include "mgr/TTLCache.h" + +#include "DaemonState.h" +#include "ClusterState.h" +#include "OSDPerfMetricTypes.h" + +class health_check_map_t; +class DaemonServer; +class MgrSession; +class ModuleCommand; +class PyModuleRegistry; + +class ActivePyModules +{ + // module class instances not yet created + std::set<std::string, std::less<>> pending_modules; + // module class instances already created + std::map<std::string, std::shared_ptr<ActivePyModule>> modules; + PyModuleConfig &module_config; + bool have_local_config_map = false; + std::map<std::string, std::string> store_cache; + ConfigMap config_map; ///< derived from store_cache config/ keys + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + MonClient &monc; + LogChannelRef clog, audit_clog; + Objecter &objecter; + Client &client; + Finisher &finisher; + TTLCache<std::string, PyObject*> ttl_cache; +public: + Finisher cmd_finisher; +private: + DaemonServer &server; + PyModuleRegistry &py_module_registry; + + std::map<std::string,ProgressEvent> progress_events; + + mutable ceph::mutex lock = ceph::make_mutex("ActivePyModules::lock"); + +public: + ActivePyModules( + PyModuleConfig &module_config, + std::map<std::string, std::string> store_data, + bool mon_provides_kv_sub, + DaemonStateIndex &ds, ClusterState &cs, MonClient &mc, + LogChannelRef clog_, LogChannelRef audit_clog_, Objecter &objecter_, Client &client_, + Finisher &f, DaemonServer &server, PyModuleRegistry &pmr); + + ~ActivePyModules(); + + // FIXME: wrap for send_command? 
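+  // Raw handles consumed by the Python binding layer (BaseMgrModule.cc),
+  // e.g. for send_command.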
+ MonClient &get_monc() {return monc;} + Objecter &get_objecter() {return objecter;} + Client &get_client() {return client;} + PyObject *cacheable_get_python(const std::string &what); + PyObject *get_python(const std::string &what); + PyObject *get_server_python(const std::string &hostname); + PyObject *list_servers_python(); + PyObject *get_metadata_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_daemon_status_python( + const std::string &svc_type, const std::string &svc_id); + PyObject *get_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_latest_counter_python( + const std::string &svc_type, + const std::string &svc_id, + const std::string &path); + PyObject *get_perf_schema_python( + const std::string &svc_type, + const std::string &svc_id); + PyObject *get_rocksdb_version(); + PyObject *get_context(); + PyObject *get_osdmap(); + /// @note @c fct is not allowed to acquire locks when holding GIL + PyObject *with_perf_counters( + std::function<void( + PerfCounterInstance& counter_instance, + PerfCounterType& counter_type, + PyFormatter& f)> fct, + const std::string &svc_name, + const std::string &svc_id, + const std::string &path) const; + + MetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit); + void remove_osd_perf_query(MetricQueryID query_id); + PyObject *get_osd_perf_counters(MetricQueryID query_id); + + MetricQueryID add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit); + void remove_mds_perf_query(MetricQueryID query_id); + void reregister_mds_perf_queries(); + PyObject *get_mds_perf_counters(MetricQueryID query_id); + + bool get_store(const std::string &module_name, + const std::string &key, std::string *val) const; + PyObject *get_store_prefix(const std::string &module_name, + const std::string &prefix) const; + void set_store(const std::string &module_name, + const std::string &key, const std::optional<std::string> &val); + + bool get_config(const std::string &module_name, + const std::string &key, std::string *val) const; + std::pair<int, std::string> set_config(const std::string &module_name, + const std::string &key, const std::optional<std::string> &val); + + PyObject *get_typed_config(const std::string &module_name, + const std::string &key, + const std::string &prefix = "") const; + PyObject *get_foreign_config( + const std::string& who, + const std::string& name); + + void set_health_checks(const std::string& module_name, + health_check_map_t&& checks); + void get_health_checks(health_check_map_t *checks); + + void update_progress_event(const std::string& evid, + const std::string& desc, + float progress, + bool add_to_ceph_s); + void complete_progress_event(const std::string& evid); + void clear_all_progress_events(); + void get_progress_events(std::map<std::string,ProgressEvent>* events); + + void register_client(std::string_view name, std::string addrs, bool replace); + void unregister_client(std::string_view name, std::string addrs); + + void config_notify(); + + void set_uri(const std::string& module_name, const std::string &uri); + void set_device_wear_level(const std::string& devid, float wear_level); + + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + std::map<std::string, std::string> 
get_services() const; + + void update_kv_data( + const std::string prefix, + bool incremental, + const map<std::string, std::optional<bufferlist>, std::less<>>& data); + void _refresh_config_map(); + + // Public so that MonCommandCompletion can use it + // FIXME: for send_command completion notifications, + // send it to only the module that sent the command, not everyone + void notify_all(const std::string ¬ify_type, + const std::string ¬ify_id); + void notify_all(const LogEntry &log_entry); + + auto& get_module_finisher(const std::string &name) { + return modules.at(name)->finisher; + } + + bool is_pending(std::string_view name) const { + return pending_modules.count(name) > 0; + } + bool module_exists(const std::string &name) const + { + return modules.count(name) > 0; + } + + bool method_exists( + const std::string &module_name, + const std::string &method_name) const + { + return modules.at(module_name)->method_exists(method_name); + } + + PyObject *dispatch_remote( + const std::string &other_module, + const std::string &method, + PyObject *args, + PyObject *kwargs, + std::string *err); + + int init(); + void shutdown(); + + void start_one(PyModuleRef py_module); + + void dump_server(const std::string &hostname, + const DaemonStateCollection &dmc, + Formatter *f); + + void cluster_log(const std::string &channel, clog_type prio, + const std::string &message); + PyObject* get_daemon_health_metrics(); + + bool inject_python_on() const; + void update_cache_metrics(); +}; + diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc new file mode 100644 index 000000000..ab64ac39f --- /dev/null +++ b/src/mgr/BaseMgrModule.cc @@ -0,0 +1,1634 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +/** + * The interface we present to python code that runs within + * ceph-mgr. This is implemented as a Python class from which + * all modules must inherit -- access to the Ceph state is then + * available as methods on that object. + */ + +#include "Python.h" + +#include "Mgr.h" + +#include "mon/MonClient.h" +#include "common/errno.h" +#include "common/version.h" +#include "mgr/Types.h" + +#include "PyUtil.h" +#include "BaseMgrModule.h" +#include "Gil.h" + +#include <algorithm> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#define PLACEHOLDER "" + +using std::list; +using std::string; + +typedef struct { + PyObject_HEAD + ActivePyModules *py_modules; + ActivePyModule *this_module; +} BaseMgrModule; + +class MonCommandCompletion : public Context +{ + ActivePyModules *py_modules; + PyObject *python_completion; + const std::string tag; + SafeThreadState pThreadState; + +public: + std::string outs; + bufferlist outbl; + + MonCommandCompletion( + ActivePyModules *py_modules_, PyObject* ev, + const std::string &tag_, PyThreadState *ts_) + : py_modules(py_modules_), python_completion(ev), + tag(tag_), pThreadState(ts_) + { + ceph_assert(python_completion != nullptr); + Py_INCREF(python_completion); + } + + ~MonCommandCompletion() override + { + if (python_completion) { + // Usually do this in finish(): this path is only for if we're + // being destroyed without completing. 
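+      // The GIL must be held around the Py_DECREF below, so acquire it on this
+      // completion's saved thread state first.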
+ Gil gil(pThreadState, true); + Py_DECREF(python_completion); + python_completion = nullptr; + } + } + + void finish(int r) override + { + ceph_assert(python_completion != nullptr); + + dout(10) << "MonCommandCompletion::finish()" << dendl; + { + // Scoped so the Gil is released before calling notify_all() + // Create new thread state because this is called via the MonClient + // Finisher, not the PyModules finisher. + Gil gil(pThreadState, true); + + auto set_fn = PyObject_GetAttrString(python_completion, "complete"); + ceph_assert(set_fn != nullptr); + + auto pyR = PyLong_FromLong(r); + auto pyOutBl = PyUnicode_FromString(outbl.to_str().c_str()); + auto pyOutS = PyUnicode_FromString(outs.c_str()); + auto args = PyTuple_Pack(3, pyR, pyOutBl, pyOutS); + Py_DECREF(pyR); + Py_DECREF(pyOutBl); + Py_DECREF(pyOutS); + + auto rtn = PyObject_CallObject(set_fn, args); + if (rtn != nullptr) { + Py_DECREF(rtn); + } + Py_DECREF(args); + Py_DECREF(set_fn); + + Py_DECREF(python_completion); + python_completion = nullptr; + } + py_modules->notify_all("command", tag); + } +}; + + +static PyObject* +ceph_send_command(BaseMgrModule *self, PyObject *args) +{ + // Like mon, osd, mds + char *type = nullptr; + + // Like "23" for an OSD or "myid" for an MDS + char *name = nullptr; + + char *cmd_json = nullptr; + char *tag = nullptr; + char *inbuf_ptr = nullptr; + Py_ssize_t inbuf_len = 0; + bufferlist inbuf = {}; + + PyObject *completion = nullptr; + if (!PyArg_ParseTuple(args, "Ossssz#:ceph_send_command", + &completion, &type, &name, &cmd_json, &tag, &inbuf_ptr, &inbuf_len)) { + return nullptr; + } + + if (inbuf_ptr) { + inbuf.append(inbuf_ptr, (unsigned)inbuf_len); + } + + auto set_fn = PyObject_GetAttrString(completion, "complete"); + if (set_fn == nullptr) { + ceph_abort(); // TODO raise python exception instead + } else { + ceph_assert(PyCallable_Check(set_fn)); + } + Py_DECREF(set_fn); + + MonCommandCompletion *command_c = new MonCommandCompletion(self->py_modules, + completion, tag, PyThreadState_Get()); + + PyThreadState *tstate = PyEval_SaveThread(); + + if (std::string(type) == "mon") { + + // Wait for the latest OSDMap after each command we send to + // the mons. This is a heavy-handed hack to make life simpler + // for python module authors, so that they know whenever they + // run a command they've gt a fresh OSDMap afterwards. + // TODO: enhance MCommand interface so that it returns + // latest cluster map versions on completion, and callers + // can wait for those. 
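+    // The LambdaContext below wraps the module's completion so that it only
+    // fires once wait_for_latest_osdmap() has called back, implementing the
+    // behaviour described above.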
+ auto c = new LambdaContext([command_c, self](int command_r){ + self->py_modules->get_objecter().wait_for_latest_osdmap( + [command_c, command_r](boost::system::error_code) { + command_c->complete(command_r); + }); + }); + + self->py_modules->get_monc().start_mon_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(c, &self->py_modules->cmd_finisher)); + } else if (std::string(type) == "osd") { + std::string err; + uint64_t osd_id = strict_strtoll(name, 10, &err); + if (!err.empty()) { + delete command_c; + string msg("invalid osd_id: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().osd_command( + osd_id, + {cmd_json}, + inbuf, + &tid, + [command_c, f = &self->py_modules->cmd_finisher] + (boost::system::error_code ec, std::string s, ceph::buffer::list bl) { + command_c->outs = std::move(s); + command_c->outbl = std::move(bl); + f->queue(command_c); + }); + } else if (std::string(type) == "mds") { + int r = self->py_modules->get_client().mds_command( + name, + {cmd_json}, + inbuf, + &command_c->outbl, + &command_c->outs, + new C_OnFinisher(command_c, &self->py_modules->cmd_finisher)); + if (r != 0) { + string msg("failed to send command to mds: "); + msg.append(cpp_strerror(r)); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_RuntimeError, msg.c_str()); + return nullptr; + } + } else if (std::string(type) == "pg") { + pg_t pgid; + if (!pgid.parse(name)) { + delete command_c; + string msg("invalid pgid: "); + msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + ceph_tid_t tid; + self->py_modules->get_objecter().pg_command( + pgid, + {cmd_json}, + inbuf, + &tid, + [command_c, f = &self->py_modules->cmd_finisher] + (boost::system::error_code ec, std::string s, ceph::buffer::list bl) { + command_c->outs = std::move(s); + command_c->outbl = std::move(bl); + f->queue(command_c); + }); + PyEval_RestoreThread(tstate); + return nullptr; + } else { + delete command_c; + string msg("unknown service type: "); + msg.append(type); + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + + PyEval_RestoreThread(tstate); + Py_RETURN_NONE; +} + +static PyObject* +ceph_set_health_checks(BaseMgrModule *self, PyObject *args) +{ + PyObject *checks = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_set_health_checks", &checks)) { + return NULL; + } + if (!PyDict_Check(checks)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + PyObject *checksls = PyDict_Items(checks); + health_check_map_t out_checks; + for (int i = 0; i < PyList_Size(checksls); ++i) { + PyObject *kv = PyList_GET_ITEM(checksls, i); + char *check_name = nullptr; + PyObject *check_info = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &check_name, &check_info)) { + derr << __func__ << " dict item " << i + << " not a size 2 tuple" << dendl; + continue; + } + if (!PyDict_Check(check_info)) { + derr << __func__ << " item " << i << " " << check_name + << " value not a dict" << dendl; + continue; + } + health_status_t severity = HEALTH_OK; + string summary; + list<string> detail; + int64_t count = 0; + PyObject *infols = PyDict_Items(check_info); + for (int j = 0; j < PyList_Size(infols); ++j) { + PyObject *pair = PyList_GET_ITEM(infols, j); + if (!PyTuple_Check(pair)) { + 
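+        // PyDict_Items() always yields 2-tuples, so reaching this branch means
+        // the entry is malformed; log it and skip.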
derr << __func__ << " item " << i << " pair " << j + << " not a tuple" << dendl; + continue; + } + char *k = nullptr; + PyObject *v = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", &k, &v)) { + derr << __func__ << " item " << i << " pair " << j + << " not a size 2 tuple" << dendl; + continue; + } + string ks(k); + if (ks == "severity") { + if (!PyUnicode_Check(v)) { + derr << __func__ << " check " << check_name + << " severity value not string" << dendl; + continue; + } + if (const string vs = PyUnicode_AsUTF8(v); vs == "warning") { + severity = HEALTH_WARN; + } else if (vs == "error") { + severity = HEALTH_ERR; + } + } else if (ks == "summary") { + if (!PyUnicode_Check(v)) { + derr << __func__ << " check " << check_name + << " summary value not [unicode] string" << dendl; + continue; + } else { + summary = PyUnicode_AsUTF8(v); + } + } else if (ks == "count") { + if (PyLong_Check(v)) { + count = PyLong_AsLong(v); + } else { + derr << __func__ << " check " << check_name + << " count value not int" << dendl; + continue; + } + } else if (ks == "detail") { + if (!PyList_Check(v)) { + derr << __func__ << " check " << check_name + << " detail value not list" << dendl; + continue; + } + for (int k = 0; k < PyList_Size(v); ++k) { + PyObject *di = PyList_GET_ITEM(v, k); + if (!PyUnicode_Check(di)) { + derr << __func__ << " check " << check_name + << " detail item " << k << " not a [unicode] string" << dendl; + continue; + } else { + detail.push_back(PyUnicode_AsUTF8(di)); + } + } + } else { + derr << __func__ << " check " << check_name + << " unexpected key " << k << dendl; + } + } + auto& d = out_checks.add(check_name, severity, summary, count); + d.detail.swap(detail); + } + + JSONFormatter jf(true); + dout(10) << "module " << self->this_module->get_name() + << " health checks:\n"; + out_checks.dump(&jf); + jf.flush(*_dout); + *_dout << dendl; + without_gil([&] { + self->py_modules->set_health_checks(self->this_module->get_name(), + std::move(out_checks)); + }); + Py_RETURN_NONE; +} + + +static PyObject* +ceph_state_get(BaseMgrModule *self, PyObject *args) +{ + char *what = NULL; + if (!PyArg_ParseTuple(args, "s:ceph_state_get", &what)) { + return NULL; + } + + return self->py_modules->cacheable_get_python(what); +} + + +static PyObject* +ceph_get_server(BaseMgrModule *self, PyObject *args) +{ + char *hostname = NULL; + if (!PyArg_ParseTuple(args, "z:ceph_get_server", &hostname)) { + return NULL; + } + + if (hostname) { + return self->py_modules->get_server_python(hostname); + } else { + return self->py_modules->list_servers_python(); + } +} + +static PyObject* +ceph_get_mgr_id(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_option_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + const Option *opt = g_conf().find_option(string(what)); + if (opt) { + std::string value; + switch (int r = g_conf().get_val(string(what), &value); r) { + case -ENOMEM: + PyErr_NoMemory(); + return nullptr; + case -ENAMETOOLONG: + PyErr_SetString(PyExc_ValueError, "value too long"); + return nullptr; + default: + ceph_assert(r == 0); + break; + } + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return get_python_typed_option_value(opt->type, value); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + PyErr_Format(PyExc_KeyError, "option not found: %s", what); + return nullptr; + } +} + +static PyObject* +ceph_foreign_option_get(BaseMgrModule *self, PyObject *args) +{ + char *who = nullptr; + char *what = nullptr; + if (!PyArg_ParseTuple(args, "ss:ceph_foreign_option_get", &who, &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + return self->py_modules->get_foreign_config(who, what); +} + +static PyObject* +ceph_get_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "ss|s:ceph_get_module_option", &module, &key, + &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + std::string str_prefix; + if (prefix) { + str_prefix = prefix; + } + assert(self->this_module->py_module); + auto pResult = self->py_modules->get_typed_config(module, key, str_prefix); + return pResult; +} + +static PyObject* +ceph_store_get_prefix(BaseMgrModule *self, PyObject *args) +{ + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get_prefix", &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + return self->py_modules->get_store_prefix(self->this_module->get_name(), + prefix); +} + +static PyObject* +ceph_set_module_option(BaseMgrModule *self, PyObject *args) +{ + char *module = nullptr; + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "ssz:ceph_set_module_option", + &module, &key, &value)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + std::optional<string> val; + if (value) { + val = value; + } + auto [ret, msg] = without_gil([&] { + return self->py_modules->set_config(module, key, val); + }); + if (ret) { + PyErr_SetString(PyExc_ValueError, msg.c_str()); + return nullptr; + } + Py_RETURN_NONE; +} + +static PyObject* +ceph_store_get(BaseMgrModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + std::string value; + bool found = self->py_modules->get_store(self->this_module->get_name(), + what, &value); + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_set(BaseMgrModule *self, PyObject *args) +{ + char *key = nullptr; + char *value = nullptr; + if (!PyArg_ParseTuple(args, "sz:ceph_store_set", &key, &value)) { + return nullptr; + } + std::optional<string> val; + if (value) { + val = value; + } + without_gil([&] { + self->py_modules->set_store(self->this_module->get_name(), key, val); + }); + Py_RETURN_NONE; +} + +static PyObject* +get_metadata(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_metadata", &svc_name, &svc_id)) { + return nullptr; + } + return self->py_modules->get_metadata_python(svc_name, svc_id); +} + +static PyObject* +get_daemon_status(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = NULL; + char *svc_id = NULL; + if (!PyArg_ParseTuple(args, "ss:get_daemon_status", &svc_name, + &svc_id)) { + return nullptr; + } + return self->py_modules->get_daemon_status_python(svc_name, svc_id); +} + +static PyObject* +ceph_log(BaseMgrModule *self, PyObject *args) +{ + char *record = nullptr; + if (!PyArg_ParseTuple(args, "s:log", &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(record); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_cluster_log(BaseMgrModule *self, PyObject *args) +{ + int prio = 0; + char *channel = nullptr; + char *message = nullptr; + + if (!PyArg_ParseTuple(args, "sis:ceph_cluster_log", &channel, &prio, &message)) { + return nullptr; + } + without_gil([&] { + self->py_modules->cluster_log(channel, (clog_type)prio, message); + }); + Py_RETURN_NONE; +} + +static PyObject * +ceph_get_version(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(pretty_version_to_str().c_str()); +} + +static PyObject * +ceph_get_ceph_conf_path(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf().get_conf_path().c_str()); +} + +static PyObject * +ceph_get_release_name(BaseMgrModule *self, PyObject *args) +{ + return PyUnicode_FromString(ceph_release_to_str()); +} + +static PyObject * +ceph_lookup_release_name(BaseMgrModule *self, PyObject *args) +{ + int major = 0; + if (!PyArg_ParseTuple(args, "i:ceph_lookup_release_name", &major)) { + return nullptr; + } + return PyUnicode_FromString(ceph_release_name(major)); +} + +static PyObject * +ceph_get_context(BaseMgrModule *self) +{ + return self->py_modules->get_context(); +} + +static PyObject* +get_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return nullptr; + } + return self->py_modules->get_counter_python( + svc_name, svc_id, counter_path); +} + +static PyObject* +get_latest_counter(BaseMgrModule *self, PyObject *args) +{ + char *svc_name = nullptr; + char *svc_id = nullptr; + char *counter_path = nullptr; + if (!PyArg_ParseTuple(args, "sss:get_counter", &svc_name, + &svc_id, &counter_path)) { + return nullptr; + } + return self->py_modules->get_latest_counter_python( + svc_name, svc_id, counter_path); +} + +static 
PyObject* +get_perf_schema(BaseMgrModule *self, PyObject *args) +{ + char *type_str = nullptr; + char *svc_id = nullptr; + if (!PyArg_ParseTuple(args, "ss:get_perf_schema", &type_str, + &svc_id)) { + return nullptr; + } + + return self->py_modules->get_perf_schema_python(type_str, svc_id); +} + +static PyObject* +ceph_get_rocksdb_version(BaseMgrModule *self) +{ + return self->py_modules->get_rocksdb_version(); +} + + +static PyObject * +ceph_get_osdmap(BaseMgrModule *self, PyObject *args) +{ + return self->py_modules->get_osdmap(); +} + +static PyObject* +ceph_set_uri(BaseMgrModule *self, PyObject *args) +{ + char *svc_str = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_advertize_service", + &svc_str)) { + return nullptr; + } + // We call down into PyModules even though we have a MgrPyModule + // reference here, because MgrPyModule's fields are protected + // by PyModules' lock. + without_gil([&] { + self->py_modules->set_uri(self->this_module->get_name(), svc_str); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_set_wear_level(BaseMgrModule *self, PyObject *args) +{ + char *devid = nullptr; + float wear_level; + if (!PyArg_ParseTuple(args, "sf:ceph_set_wear_level", + &devid, &wear_level)) { + return nullptr; + } + without_gil([&] { + self->py_modules->set_device_wear_level(devid, wear_level); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_have_mon_connection(BaseMgrModule *self, PyObject *args) +{ + if (self->py_modules->get_monc().is_connected()) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyObject* +ceph_update_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + char *desc = nullptr; + float progress = 0.0; + bool add_to_ceph_s = false; + if (!PyArg_ParseTuple(args, "ssfb:ceph_update_progress_event", + &evid, &desc, &progress, &add_to_ceph_s)) { + return nullptr; + } + without_gil([&] { + self->py_modules->update_progress_event(evid, desc, progress, add_to_ceph_s); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_complete_progress_event(BaseMgrModule *self, PyObject *args) +{ + char *evid = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_complete_progress_event", + &evid)) { + return nullptr; + } + without_gil([&] { + self->py_modules->complete_progress_event(evid); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_clear_all_progress_events(BaseMgrModule *self, PyObject *args) +{ + without_gil([&] { + self->py_modules->clear_all_progress_events(); + }); + Py_RETURN_NONE; +} + + + +static PyObject * +ceph_dispatch_remote(BaseMgrModule *self, PyObject *args) +{ + char *other_module = nullptr; + char *method = nullptr; + PyObject *remote_args = nullptr; + PyObject *remote_kwargs = nullptr; + if (!PyArg_ParseTuple(args, "ssOO:ceph_dispatch_remote", + &other_module, &method, &remote_args, &remote_kwargs)) { + return nullptr; + } + + // Early error handling, because if the module doesn't exist then we + // won't be able to use its thread state to set python error state + // inside dispatch_remote(). + if (!self->py_modules->module_exists(other_module)) { + derr << "no module '" << other_module << "'" << dendl; + PyErr_SetString(PyExc_ImportError, "Module not found"); + return nullptr; + } + + // Drop GIL from calling python thread state, it will be taken + // both for checking for method existence and for executing method. 
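+  // Illustrative only: modules typically reach this path through something like
+  //   self.remote('other_module', 'method_name', arg, kwarg=value)
+  // (assuming the usual MgrModule.remote() wrapper); the names above are
+  // placeholders, not real modules or methods.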
+ PyThreadState *tstate = PyEval_SaveThread(); + + if (!self->py_modules->method_exists(other_module, method)) { + PyEval_RestoreThread(tstate); + PyErr_SetString(PyExc_NameError, "Method not found"); + return nullptr; + } + + std::string err; + auto result = self->py_modules->dispatch_remote(other_module, method, + remote_args, remote_kwargs, &err); + + PyEval_RestoreThread(tstate); + + if (result == nullptr) { + std::stringstream ss; + ss << "Remote method threw exception: " << err; + PyErr_SetString(PyExc_RuntimeError, ss.str().c_str()); + derr << ss.str() << dendl; + } + + return result; +} + +static PyObject* +ceph_add_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor"; + static const std::string NAME_COUNTERS_DESCRIPTORS = + "performance_counter_descriptors"; + static const std::string NAME_LIMIT = "limit"; + static const std::string NAME_SUB_KEY_TYPE = "type"; + static const std::string NAME_SUB_KEY_REGEX = "regex"; + static const std::string NAME_LIMIT_ORDER_BY = "order_by"; + static const std::string NAME_LIMIT_MAX_COUNT = "max_count"; + static const std::map<std::string, OSDPerfMetricSubKeyType> sub_key_types = { + {"client_id", OSDPerfMetricSubKeyType::CLIENT_ID}, + {"client_address", OSDPerfMetricSubKeyType::CLIENT_ADDRESS}, + {"pool_id", OSDPerfMetricSubKeyType::POOL_ID}, + {"namespace", OSDPerfMetricSubKeyType::NAMESPACE}, + {"osd_id", OSDPerfMetricSubKeyType::OSD_ID}, + {"pg_id", OSDPerfMetricSubKeyType::PG_ID}, + {"object_name", OSDPerfMetricSubKeyType::OBJECT_NAME}, + {"snap_id", OSDPerfMetricSubKeyType::SNAP_ID}, + }; + static const std::map<std::string, PerformanceCounterType> counter_types = { + {"ops", PerformanceCounterType::OPS}, + {"write_ops", PerformanceCounterType::WRITE_OPS}, + {"read_ops", PerformanceCounterType::READ_OPS}, + {"bytes", PerformanceCounterType::BYTES}, + {"write_bytes", PerformanceCounterType::WRITE_BYTES}, + {"read_bytes", PerformanceCounterType::READ_BYTES}, + {"latency", PerformanceCounterType::LATENCY}, + {"write_latency", PerformanceCounterType::WRITE_LATENCY}, + {"read_latency", PerformanceCounterType::READ_LATENCY}, + }; + + PyObject *py_query = nullptr; + if (!PyArg_ParseTuple(args, "O:ceph_add_osd_perf_query", &py_query)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + if (!PyDict_Check(py_query)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + + PyObject *query_params = PyDict_Items(py_query); + OSDPerfMetricQuery query; + std::optional<OSDPerfMetricLimit> limit; + + // { + // 'key_descriptor': [ + // {'type': subkey_type, 'regex': regex_pattern}, + // ... 
+ // ], + // 'performance_counter_descriptors': [ + // list, of, descriptor, types + // ], + // 'limit': {'order_by': performance_counter_type, 'max_count': n}, + // } + + for (int i = 0; i < PyList_Size(query_params); ++i) { + PyObject *kv = PyList_GET_ITEM(query_params, i); + char *query_param_name = nullptr; + PyObject *query_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (query_param_name == NAME_KEY_DESCRIPTOR) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *sub_key = PyList_GET_ITEM(query_param_val, j); + if (!PyDict_Check(sub_key)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a dict" << dendl; + Py_RETURN_NONE; + } + OSDPerfMetricSubKeyDescriptor d; + PyObject *sub_key_params = PyDict_Items(sub_key); + for (int k = 0; k < PyList_Size(sub_key_params); ++k) { + PyObject *pair = PyList_GET_ITEM(sub_key_params, k); + if (!PyTuple_Check(pair)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a tuple" << dendl; + Py_RETURN_NONE; + } + char *param_name = nullptr; + PyObject *param_value = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", ¶m_name, ¶m_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (param_name == NAME_SUB_KEY_TYPE) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(param_value); + auto it = sub_key_types.find(type); + if (it == sub_key_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid type " << dendl; + Py_RETURN_NONE; + } + d.type = it->second; + } else if (param_name == NAME_SUB_KEY_REGEX) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + d.regex_str = PyUnicode_AsUTF8(param_value); + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid regex " << d.regex_str << dendl; + Py_RETURN_NONE; + } + if (d.regex.mark_count() == 0) { + derr << __func__ << " query " << query_param_name << " item " << j + << " regex " << d.regex_str << ": no capturing groups" + << dendl; + Py_RETURN_NONE; + } + } else { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + } + if (d.type == static_cast<OSDPerfMetricSubKeyType>(-1) || + d.regex_str.empty()) { + derr << __func__ << " query " << query_param_name << " item " << i + << " invalid" << dendl; + Py_RETURN_NONE; + } + query.key_descriptor.push_back(d); + } + } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *py_type = PyList_GET_ITEM(query_param_val, j); + if 
(!PyUnicode_Check(py_type)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a string" << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(py_type); + auto it = counter_types.find(type); + if (it == counter_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << type + << " is not valid type" << dendl; + Py_RETURN_NONE; + } + query.performance_counter_descriptors.push_back(it->second); + } + } else if (query_param_name == NAME_LIMIT) { + if (!PyDict_Check(query_param_val)) { + derr << __func__ << " query " << query_param_name << " not a dict" + << dendl; + Py_RETURN_NONE; + } + + limit = OSDPerfMetricLimit(); + PyObject *limit_params = PyDict_Items(query_param_val); + + for (int j = 0; j < PyList_Size(limit_params); ++j) { + PyObject *kv = PyList_GET_ITEM(limit_params, j); + char *limit_param_name = nullptr; + PyObject *limit_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &limit_param_name, + &limit_param_val)) { + derr << __func__ << " limit item " << j << " not a size 2 tuple" + << dendl; + Py_RETURN_NONE; + } + + if (limit_param_name == NAME_LIMIT_ORDER_BY) { + if (!PyUnicode_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not a string" + << dendl; + Py_RETURN_NONE; + } + auto order_by = PyUnicode_AsUTF8(limit_param_val); + auto it = counter_types.find(order_by); + if (it == counter_types.end()) { + derr << __func__ << " limit " << limit_param_name + << " not a valid counter type" << dendl; + Py_RETURN_NONE; + } + limit->order_by = it->second; + } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) { + if (!PyLong_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not an int" + << dendl; + Py_RETURN_NONE; + } + limit->max_count = PyLong_AsLong(limit_param_val); + } else { + derr << __func__ << " unknown limit param: " << limit_param_name + << dendl; + Py_RETURN_NONE; + } + } + } else { + derr << __func__ << " unknown query param: " << query_param_name << dendl; + Py_RETURN_NONE; + } + } + + if (query.key_descriptor.empty() || + query.performance_counter_descriptors.empty()) { + derr << __func__ << " invalid query" << dendl; + Py_RETURN_NONE; + } + + if (limit) { + auto &ds = query.performance_counter_descriptors; + if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) { + derr << __func__ << " limit order_by " << limit->order_by + << " not in performance_counter_descriptors" << dendl; + Py_RETURN_NONE; + } + } + + auto query_id = self->py_modules->add_osd_perf_query(query, limit); + return PyLong_FromLong(query_id); +} + +static PyObject* +ceph_remove_osd_perf_query(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_remove_osd_perf_query", &query_id)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + self->py_modules->remove_osd_perf_query(query_id); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_osd_perf_counters(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_get_osd_perf_counters", &query_id)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_osd_perf_counters(query_id); +} + +// MDS perf query interface -- mostly follows ceph_add_osd_perf_query() +// style + +static PyObject* +ceph_add_mds_perf_query(BaseMgrModule *self, PyObject *args) +{ + static const std::string NAME_KEY_DESCRIPTOR = "key_descriptor"; + static const std::string NAME_COUNTERS_DESCRIPTORS = + "performance_counter_descriptors"; + static const std::string NAME_LIMIT = "limit"; + static const std::string NAME_SUB_KEY_TYPE = "type"; + static const std::string NAME_SUB_KEY_REGEX = "regex"; + static const std::string NAME_LIMIT_ORDER_BY = "order_by"; + static const std::string NAME_LIMIT_MAX_COUNT = "max_count"; + static const std::map<std::string, MDSPerfMetricSubKeyType> sub_key_types = { + {"mds_rank", MDSPerfMetricSubKeyType::MDS_RANK}, + {"client_id", MDSPerfMetricSubKeyType::CLIENT_ID}, + }; + static const std::map<std::string, MDSPerformanceCounterType> counter_types = { + {"cap_hit", MDSPerformanceCounterType::CAP_HIT_METRIC}, + {"read_latency", MDSPerformanceCounterType::READ_LATENCY_METRIC}, + {"write_latency", MDSPerformanceCounterType::WRITE_LATENCY_METRIC}, + {"metadata_latency", MDSPerformanceCounterType::METADATA_LATENCY_METRIC}, + {"dentry_lease", MDSPerformanceCounterType::DENTRY_LEASE_METRIC}, + {"opened_files", MDSPerformanceCounterType::OPENED_FILES_METRIC}, + {"pinned_icaps", MDSPerformanceCounterType::PINNED_ICAPS_METRIC}, + {"opened_inodes", MDSPerformanceCounterType::OPENED_INODES_METRIC}, + {"read_io_sizes", MDSPerformanceCounterType::READ_IO_SIZES_METRIC}, + {"write_io_sizes", MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC}, + {"avg_read_latency", MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC}, + {"stdev_read_latency", MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC}, + {"avg_write_latency", MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC}, + {"stdev_write_latency", MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC}, + {"avg_metadata_latency", MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC}, + {"stdev_metadata_latency", MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC}, + }; + + PyObject *py_query = nullptr; + if (!PyArg_ParseTuple(args, "O:ceph_add_mds_perf_query", &py_query)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + if (!PyDict_Check(py_query)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_NONE; + } + + PyObject *query_params = PyDict_Items(py_query); + MDSPerfMetricQuery query; + std::optional<MDSPerfMetricLimit> limit; + + // { + // 'key_descriptor': [ + // {'type': subkey_type, 'regex': regex_pattern}, + // ... 
+ // ], + // 'performance_counter_descriptors': [ + // list, of, descriptor, types + // ], + // 'limit': {'order_by': performance_counter_type, 'max_count': n}, + // } + + for (int i = 0; i < PyList_Size(query_params); ++i) { + PyObject *kv = PyList_GET_ITEM(query_params, i); + char *query_param_name = nullptr; + PyObject *query_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &query_param_name, &query_param_val)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (query_param_name == NAME_KEY_DESCRIPTOR) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *sub_key = PyList_GET_ITEM(query_param_val, j); + if (!PyDict_Check(sub_key)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a dict" << dendl; + Py_RETURN_NONE; + } + MDSPerfMetricSubKeyDescriptor d; + PyObject *sub_key_params = PyDict_Items(sub_key); + for (int k = 0; k < PyList_Size(sub_key_params); ++k) { + PyObject *pair = PyList_GET_ITEM(sub_key_params, k); + if (!PyTuple_Check(pair)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a tuple" << dendl; + Py_RETURN_NONE; + } + char *param_name = nullptr; + PyObject *param_value = nullptr; + if (!PyArg_ParseTuple(pair, "sO:pair", ¶m_name, ¶m_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " pair " << k << " not a size 2 tuple" << dendl; + Py_RETURN_NONE; + } + if (param_name == NAME_SUB_KEY_TYPE) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(param_value); + auto it = sub_key_types.find(type); + if (it == sub_key_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid type " << dendl; + Py_RETURN_NONE; + } + d.type = it->second; + } else if (param_name == NAME_SUB_KEY_REGEX) { + if (!PyUnicode_Check(param_value)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + d.regex_str = PyUnicode_AsUTF8(param_value); + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid regex " << d.regex_str << dendl; + Py_RETURN_NONE; + } + if (d.regex.mark_count() == 0) { + derr << __func__ << " query " << query_param_name << " item " << j + << " regex " << d.regex_str << ": no capturing groups" + << dendl; + Py_RETURN_NONE; + } + } else { + derr << __func__ << " query " << query_param_name << " item " << j + << " contains invalid param " << param_name << dendl; + Py_RETURN_NONE; + } + } + if (d.type == static_cast<MDSPerfMetricSubKeyType>(-1) || + d.regex_str.empty()) { + derr << __func__ << " query " << query_param_name << " item " << i + << " invalid" << dendl; + Py_RETURN_NONE; + } + query.key_descriptor.push_back(d); + } + } else if (query_param_name == NAME_COUNTERS_DESCRIPTORS) { + if (!PyList_Check(query_param_val)) { + derr << __func__ << " " << query_param_name << " not a list" << dendl; + Py_RETURN_NONE; + } + for (int j = 0; j < PyList_Size(query_param_val); j++) { + PyObject *py_type = PyList_GET_ITEM(query_param_val, j); + if 
(!PyUnicode_Check(py_type)) { + derr << __func__ << " query " << query_param_name << " item " << j + << " not a string" << dendl; + Py_RETURN_NONE; + } + auto type = PyUnicode_AsUTF8(py_type); + auto it = counter_types.find(type); + if (it == counter_types.end()) { + derr << __func__ << " query " << query_param_name << " item " << type + << " is not valid type" << dendl; + Py_RETURN_NONE; + } + query.performance_counter_descriptors.push_back(it->second); + } + } else if (query_param_name == NAME_LIMIT) { + if (!PyDict_Check(query_param_val)) { + derr << __func__ << " query " << query_param_name << " not a dict" + << dendl; + Py_RETURN_NONE; + } + + limit = MDSPerfMetricLimit(); + PyObject *limit_params = PyDict_Items(query_param_val); + + for (int j = 0; j < PyList_Size(limit_params); ++j) { + PyObject *kv = PyList_GET_ITEM(limit_params, j); + char *limit_param_name = nullptr; + PyObject *limit_param_val = nullptr; + if (!PyArg_ParseTuple(kv, "sO:pair", &limit_param_name, + &limit_param_val)) { + derr << __func__ << " limit item " << j << " not a size 2 tuple" + << dendl; + Py_RETURN_NONE; + } + + if (limit_param_name == NAME_LIMIT_ORDER_BY) { + if (!PyUnicode_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not a string" + << dendl; + Py_RETURN_NONE; + } + auto order_by = PyUnicode_AsUTF8(limit_param_val); + auto it = counter_types.find(order_by); + if (it == counter_types.end()) { + derr << __func__ << " limit " << limit_param_name + << " not a valid counter type" << dendl; + Py_RETURN_NONE; + } + limit->order_by = it->second; + } else if (limit_param_name == NAME_LIMIT_MAX_COUNT) { + if (!PyLong_Check(limit_param_val)) { + derr << __func__ << " " << limit_param_name << " not an int" + << dendl; + Py_RETURN_NONE; + } + limit->max_count = PyLong_AsLong(limit_param_val); + } else { + derr << __func__ << " unknown limit param: " << limit_param_name + << dendl; + Py_RETURN_NONE; + } + } + } else { + derr << __func__ << " unknown query param: " << query_param_name << dendl; + Py_RETURN_NONE; + } + } + + if (query.key_descriptor.empty()) { + derr << __func__ << " invalid query" << dendl; + Py_RETURN_NONE; + } + + if (limit) { + auto &ds = query.performance_counter_descriptors; + if (std::find(ds.begin(), ds.end(), limit->order_by) == ds.end()) { + derr << __func__ << " limit order_by " << limit->order_by + << " not in performance_counter_descriptors" << dendl; + Py_RETURN_NONE; + } + } + + auto query_id = self->py_modules->add_mds_perf_query(query, limit); + return PyLong_FromLong(query_id); +} + +static PyObject* +ceph_remove_mds_perf_query(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_remove_mds_perf_query", &query_id)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + self->py_modules->remove_mds_perf_query(query_id); + Py_RETURN_NONE; +} + +static PyObject* +ceph_reregister_mds_perf_queries(BaseMgrModule *self, PyObject *args) +{ + self->py_modules->reregister_mds_perf_queries(); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_mds_perf_counters(BaseMgrModule *self, PyObject *args) +{ + MetricQueryID query_id; + if (!PyArg_ParseTuple(args, "i:ceph_get_mds_perf_counters", &query_id)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + return self->py_modules->get_mds_perf_counters(query_id); +} + +static PyObject* +ceph_is_authorized(BaseMgrModule *self, PyObject *args) +{ + PyObject *args_dict = NULL; + if (!PyArg_ParseTuple(args, "O:ceph_is_authorized", &args_dict)) { + return nullptr; + } + + if (!PyDict_Check(args_dict)) { + derr << __func__ << " arg not a dict" << dendl; + Py_RETURN_FALSE; + } + + std::map<std::string, std::string> arguments; + + PyObject *args_list = PyDict_Items(args_dict); + for (int i = 0; i < PyList_Size(args_list); ++i) { + PyObject *kv = PyList_GET_ITEM(args_list, i); + + char *arg_key = nullptr; + char *arg_value = nullptr; + if (!PyArg_ParseTuple(kv, "ss:pair", &arg_key, &arg_value)) { + derr << __func__ << " dict item " << i << " not a size 2 tuple" << dendl; + continue; + } + + arguments[arg_key] = arg_value; + } + + bool r = without_gil([&] { + return self->this_module->is_authorized(arguments); + }); + + if (r) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static PyObject* +ceph_register_client(BaseMgrModule *self, PyObject *args) +{ + const char* _name = nullptr; + char* addrs = nullptr; + int replace = 0; + if (!PyArg_ParseTuple(args, "zsp:ceph_register_client", &_name, &addrs, &replace)) { + return nullptr; + } + auto name = _name ? std::string(_name) : std::string(self->this_module->get_name()); + without_gil([&] { + self->py_modules->register_client(name, addrs, replace); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_unregister_client(BaseMgrModule *self, PyObject *args) +{ + const char* _name = nullptr; + char* addrs = nullptr; + if (!PyArg_ParseTuple(args, "zs:ceph_unregister_client", &_name, &addrs)) { + return nullptr; + } + auto name = _name ? std::string(_name) : std::string(self->this_module->get_name()); + without_gil([&] { + self->py_modules->unregister_client(name, addrs); + }); + Py_RETURN_NONE; +} + +static PyObject* +ceph_get_daemon_health_metrics(BaseMgrModule *self, PyObject *args) +{ + return self->py_modules->get_daemon_health_metrics(); +} + +PyMethodDef BaseMgrModule_methods[] = { + {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS, + "Get a cluster object"}, + + {"_ceph_get_server", (PyCFunction)ceph_get_server, METH_VARARGS, + "Get a server object"}, + + {"_ceph_get_metadata", (PyCFunction)get_metadata, METH_VARARGS, + "Get a service's metadata"}, + + {"_ceph_get_daemon_status", (PyCFunction)get_daemon_status, METH_VARARGS, + "Get a service's status"}, + + {"_ceph_send_command", (PyCFunction)ceph_send_command, METH_VARARGS, + "Send a mon command"}, + + {"_ceph_set_health_checks", (PyCFunction)ceph_set_health_checks, METH_VARARGS, + "Set health checks for this module"}, + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_ceph_conf_path", (PyCFunction)ceph_get_ceph_conf_path, METH_NOARGS, + "Get path to ceph.conf"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_foreign_option", (PyCFunction)ceph_foreign_option_get, METH_VARARGS, + "Get a native configuration option value for another entity"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_store_prefix", (PyCFunction)ceph_store_get_prefix, METH_VARARGS, + "Get all KV store values with a given prefix"}, + + {"_ceph_set_module_option", (PyCFunction)ceph_set_module_option, METH_VARARGS, + "Set a 
module configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a stored field"}, + + {"_ceph_set_store", (PyCFunction)ceph_store_set, METH_VARARGS, + "Set a stored field"}, + + {"_ceph_get_counter", (PyCFunction)get_counter, METH_VARARGS, + "Get a performance counter"}, + + {"_ceph_get_latest_counter", (PyCFunction)get_latest_counter, METH_VARARGS, + "Get the latest performance counter"}, + + {"_ceph_get_perf_schema", (PyCFunction)get_perf_schema, METH_VARARGS, + "Get the performance counter schema"}, + + {"_ceph_get_rocksdb_version", (PyCFunction)ceph_get_rocksdb_version, METH_NOARGS, + "Get the current RocksDB version number"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a (local) log message"}, + + {"_ceph_cluster_log", (PyCFunction)ceph_cluster_log, METH_VARARGS, + "Emit a cluster log message"}, + + {"_ceph_get_version", (PyCFunction)ceph_get_version, METH_NOARGS, + "Get the ceph version of this process"}, + + {"_ceph_get_release_name", (PyCFunction)ceph_get_release_name, METH_NOARGS, + "Get the ceph release name of this process"}, + + {"_ceph_lookup_release_name", (PyCFunction)ceph_lookup_release_name, METH_VARARGS, + "Get the ceph release name for a given major number"}, + + {"_ceph_get_context", (PyCFunction)ceph_get_context, METH_NOARGS, + "Get a CephContext* in a python capsule"}, + + {"_ceph_get_osdmap", (PyCFunction)ceph_get_osdmap, METH_NOARGS, + "Get an OSDMap* in a python capsule"}, + + {"_ceph_set_uri", (PyCFunction)ceph_set_uri, METH_VARARGS, + "Advertize a service URI served by this module"}, + + {"_ceph_set_device_wear_level", (PyCFunction)ceph_set_wear_level, METH_VARARGS, + "Set device wear_level value"}, + + {"_ceph_have_mon_connection", (PyCFunction)ceph_have_mon_connection, + METH_NOARGS, "Find out whether this mgr daemon currently has " + "a connection to a monitor"}, + + {"_ceph_update_progress_event", (PyCFunction)ceph_update_progress_event, + METH_VARARGS, "Update status of a progress event"}, + {"_ceph_complete_progress_event", (PyCFunction)ceph_complete_progress_event, + METH_VARARGS, "Complete a progress event"}, + {"_ceph_clear_all_progress_events", (PyCFunction)ceph_clear_all_progress_events, + METH_NOARGS, "Clear all progress events"}, + + {"_ceph_dispatch_remote", (PyCFunction)ceph_dispatch_remote, + METH_VARARGS, "Dispatch a call to another module"}, + + {"_ceph_add_osd_perf_query", (PyCFunction)ceph_add_osd_perf_query, + METH_VARARGS, "Add an osd perf query"}, + + {"_ceph_remove_osd_perf_query", (PyCFunction)ceph_remove_osd_perf_query, + METH_VARARGS, "Remove an osd perf query"}, + + {"_ceph_get_osd_perf_counters", (PyCFunction)ceph_get_osd_perf_counters, + METH_VARARGS, "Get osd perf counters"}, + + {"_ceph_add_mds_perf_query", (PyCFunction)ceph_add_mds_perf_query, + METH_VARARGS, "Add an mds perf query"}, + + {"_ceph_remove_mds_perf_query", (PyCFunction)ceph_remove_mds_perf_query, + METH_VARARGS, "Remove an mds perf query"}, + + {"_ceph_reregister_mds_perf_queries", (PyCFunction)ceph_reregister_mds_perf_queries, + METH_NOARGS, "Re-register mds perf queries"}, + + {"_ceph_get_mds_perf_counters", (PyCFunction)ceph_get_mds_perf_counters, + METH_VARARGS, "Get mds perf counters"}, + + {"_ceph_is_authorized", (PyCFunction)ceph_is_authorized, + METH_VARARGS, "Verify the current session caps are valid"}, + + {"_ceph_register_client", (PyCFunction)ceph_register_client, + METH_VARARGS, "Register RADOS instance for potential blocklisting"}, + + {"_ceph_unregister_client", 
(PyCFunction)ceph_unregister_client, + METH_VARARGS, "Unregister RADOS instance for potential blocklisting"}, + + {"_ceph_get_daemon_health_metrics", (PyCFunction)ceph_get_daemon_health_metrics, + METH_VARARGS, "Get health metrics for all daemons"}, + + {NULL, NULL, 0, NULL} +}; + + +static PyObject * +BaseMgrModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrModule *self; + + self = (BaseMgrModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int +BaseMgrModule_init(BaseMgrModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *py_modules_capsule = nullptr; + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"py_modules", "this_module", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "OO", + const_cast<char**>(kwlist), + &py_modules_capsule, + &this_module_capsule)) { + return -1; + } + + self->py_modules = static_cast<ActivePyModules*>(PyCapsule_GetPointer( + py_modules_capsule, nullptr)); + ceph_assert(self->py_modules); + self->this_module = static_cast<ActivePyModule*>(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +PyTypeObject BaseMgrModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrModule", /* tp_name */ + sizeof(BaseMgrModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BaseMgrModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrModule_new, /* tp_new */ +}; diff --git a/src/mgr/BaseMgrModule.h b/src/mgr/BaseMgrModule.h new file mode 100644 index 000000000..2c2e5deb3 --- /dev/null +++ b/src/mgr/BaseMgrModule.h @@ -0,0 +1,7 @@ + +#pragma once + +#include "Python.h" + +extern PyTypeObject BaseMgrModuleType; + diff --git a/src/mgr/BaseMgrStandbyModule.cc b/src/mgr/BaseMgrStandbyModule.cc new file mode 100644 index 000000000..22dfd3be8 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#include "BaseMgrStandbyModule.h" + +#include "StandbyPyModules.h" +#include "PyFormatter.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +using std::string; + +typedef struct { + PyObject_HEAD + StandbyPyModule *this_module; +} BaseMgrStandbyModule; + +static PyObject * +BaseMgrStandbyModule_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + BaseMgrStandbyModule *self; + + self = (BaseMgrStandbyModule *)type->tp_alloc(type, 0); + + return (PyObject *)self; +} + +static int +BaseMgrStandbyModule_init(BaseMgrStandbyModule *self, PyObject *args, PyObject *kwds) +{ + PyObject *this_module_capsule = nullptr; + static const char *kwlist[] = {"this_module", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &this_module_capsule)) { + return -1; + } + + self->this_module = static_cast<StandbyPyModule*>(PyCapsule_GetPointer( + this_module_capsule, nullptr)); + ceph_assert(self->this_module); + + return 0; +} + +static PyObject* +ceph_get_mgr_id(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyUnicode_FromString(g_conf()->name.get_id().c_str()); +} + +static PyObject* +ceph_get_module_option(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + char *prefix = nullptr; + if (!PyArg_ParseTuple(args, "s|s:ceph_get_module_option", &what, &prefix)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + PyThreadState *tstate = PyEval_SaveThread(); + std::string final_key; + std::string value; + bool found = false; + if (prefix) { + final_key = std::string(prefix) + "/" + what; + found = self->this_module->get_config(final_key, &value); + } + if (!found) { + final_key = what; + found = self->this_module->get_config(final_key, &value); + } + PyEval_RestoreThread(tstate); + if (found) { + dout(10) << __func__ << " " << final_key << " found: " << value + << dendl; + return self->this_module->py_module->get_typed_option_value(what, value); + } else { + if (prefix) { + dout(4) << __func__ << " [" << prefix << "/]" << what << " not found " + << dendl; + } else { + dout(4) << __func__ << " " << what << " not found " << dendl; + } + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_option_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_option_get", &what)) { + derr << "Invalid args!" << dendl; + return nullptr; + } + + std::string value; + int r = g_conf().get_val(string(what), &value); + if (r >= 0) { + dout(10) << "ceph_option_get " << what << " found: " << value << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_option_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_store_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *what = nullptr; + if (!PyArg_ParseTuple(args, "s:ceph_store_get", &what)) { + derr << "Invalid args!" 
<< dendl; + return nullptr; + } + + // Drop GIL for blocking mon command execution + PyThreadState *tstate = PyEval_SaveThread(); + + std::string value; + bool found = self->this_module->get_store(what, &value); + + PyEval_RestoreThread(tstate); + + if (found) { + dout(10) << "ceph_store_get " << what << " found: " << value.c_str() << dendl; + return PyUnicode_FromString(value.c_str()); + } else { + dout(4) << "ceph_store_get " << what << " not found " << dendl; + Py_RETURN_NONE; + } +} + +static PyObject* +ceph_get_active_uri(BaseMgrStandbyModule *self, PyObject *args) +{ + return PyUnicode_FromString(self->this_module->get_active_uri().c_str()); +} + +static PyObject* +ceph_log(BaseMgrStandbyModule *self, PyObject *args) +{ + char *record = nullptr; + if (!PyArg_ParseTuple(args, "s:log", &record)) { + return nullptr; + } + + ceph_assert(self->this_module); + + self->this_module->log(record); + + Py_RETURN_NONE; +} + +static PyObject* +ceph_standby_state_get(BaseMgrStandbyModule *self, PyObject *args) +{ + char *whatc = NULL; + if (!PyArg_ParseTuple(args, "s:ceph_state_get", &whatc)) { + return NULL; + } + std::string what(whatc); + + PyFormatter f; + + // Drop the GIL, as most of the following blocks will block on + // a mutex -- they are all responsible for re-taking the GIL before + // touching the PyFormatter instance or returning from the function. + without_gil_t no_gil; + + if (what == "mgr_ips") { + entity_addrvec_t myaddrs = self->this_module->get_myaddrs(); + with_gil_t with_gil{no_gil}; + f.open_array_section("ips"); + std::set<std::string> did; + for (auto& i : myaddrs.v) { + std::string ip = i.ip_only_to_str(); + if (auto [where, inserted] = did.insert(ip); inserted) { + f.dump_string("ip", ip); + } + } + f.close_section(); + return f.get(); + } else { + derr << "Python module requested unknown data '" << what << "'" << dendl; + with_gil_t with_gil{no_gil}; + Py_RETURN_NONE; + } +} + + +PyMethodDef BaseMgrStandbyModule_methods[] = { + {"_ceph_get", (PyCFunction)ceph_standby_state_get, METH_VARARGS, + "Get a cluster object (standby)"}, + + {"_ceph_get_mgr_id", (PyCFunction)ceph_get_mgr_id, METH_NOARGS, + "Get the name of the Mgr daemon where we are running"}, + + {"_ceph_get_module_option", (PyCFunction)ceph_get_module_option, METH_VARARGS, + "Get a module configuration option value"}, + + {"_ceph_get_option", (PyCFunction)ceph_option_get, METH_VARARGS, + "Get a native configuration option value"}, + + {"_ceph_get_store", (PyCFunction)ceph_store_get, METH_VARARGS, + "Get a KV store value"}, + + {"_ceph_get_active_uri", (PyCFunction)ceph_get_active_uri, METH_NOARGS, + "Get the URI of the active instance of this module, if any"}, + + {"_ceph_log", (PyCFunction)ceph_log, METH_VARARGS, + "Emit a log message"}, + + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BaseMgrStandbyModuleType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BaseMgrStandbyModule", /* tp_name */ + sizeof(BaseMgrStandbyModule), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "ceph-mgr Standby Python Plugin", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter 
*/ + 0, /* tp_iternext */ + BaseMgrStandbyModule_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BaseMgrStandbyModule_init, /* tp_init */ + 0, /* tp_alloc */ + BaseMgrStandbyModule_new, /* tp_new */ +}; diff --git a/src/mgr/BaseMgrStandbyModule.h b/src/mgr/BaseMgrStandbyModule.h new file mode 100644 index 000000000..82bda9105 --- /dev/null +++ b/src/mgr/BaseMgrStandbyModule.h @@ -0,0 +1,6 @@ +#pragma once + +#include <Python.h> + +extern PyTypeObject BaseMgrStandbyModuleType; + diff --git a/src/mgr/CMakeLists.txt b/src/mgr/CMakeLists.txt new file mode 100644 index 000000000..f9ec04317 --- /dev/null +++ b/src/mgr/CMakeLists.txt @@ -0,0 +1,54 @@ +add_library(mgr_cap_obj OBJECT + MgrCap.cc) + +if(WITH_MGR) + set(mgr_srcs + ${CMAKE_SOURCE_DIR}/src/ceph_mgr.cc + ${CMAKE_SOURCE_DIR}/src/mon/PGMap.cc + ${CMAKE_SOURCE_DIR}/src/mon/ConfigMap.cc + ActivePyModule.cc + ActivePyModules.cc + BaseMgrModule.cc + BaseMgrStandbyModule.cc + ClusterState.cc + DaemonHealthMetricCollector.cc + DaemonKey.cc + DaemonServer.cc + DaemonState.cc + Gil.cc + Mgr.cc + mgr_perf_counters.cc + MgrStandby.cc + MetricCollector.cc + OSDPerfMetricTypes.cc + OSDPerfMetricCollector.cc + MDSPerfMetricTypes.cc + MDSPerfMetricCollector.cc + PyFormatter.cc + PyUtil.cc + PyModule.cc + PyModuleRegistry.cc + PyModuleRunner.cc + PyOSDMap.cc + StandbyPyModules.cc + mgr_commands.cc + $<TARGET_OBJECTS:mgr_cap_obj>) + add_executable(ceph-mgr ${mgr_srcs}) + target_compile_definitions(ceph-mgr PRIVATE PY_SSIZE_T_CLEAN) + if(WITH_LIBCEPHSQLITE) + target_link_libraries(ceph-mgr cephsqlite SQLite3::SQLite3) + endif() + target_include_directories(ceph-mgr PRIVATE + $<TARGET_PROPERTY:RocksDB::RocksDB,INTERFACE_INCLUDE_DIRECTORIES>) + target_link_libraries(ceph-mgr + osdc client heap_profiler + global-static ceph-common + Boost::python${MGR_PYTHON_VERSION_MAJOR}${MGR_PYTHON_VERSION_MINOR} + Python3::Python + ${ALLOC_LIBS} + ${CMAKE_DL_LIBS} + ${GSSAPI_LIBRARIES}) + set_target_properties(ceph-mgr PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) + install(TARGETS ceph-mgr DESTINATION bin) +endif() diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc new file mode 100644 index 000000000..7f811a5e4 --- /dev/null +++ b/src/mgr/ClusterState.cc @@ -0,0 +1,391 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "messages/MMgrDigest.h" +#include "messages/MMonMgrReport.h" +#include "messages/MPGStats.h" + +#include "mgr/ClusterState.h" +#include <time.h> +#include <boost/range/adaptor/reversed.hpp> + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using std::ostream; +using std::set; +using std::string; +using std::stringstream; + +ClusterState::ClusterState( + MonClient *monc_, + Objecter *objecter_, + const MgrMap& mgrmap) + : monc(monc_), + objecter(objecter_), + mgr_map(mgrmap), + asok_hook(NULL) +{} + +void ClusterState::set_objecter(Objecter *objecter_) +{ + std::lock_guard l(lock); + + objecter = objecter_; +} + +void ClusterState::set_fsmap(FSMap const &new_fsmap) +{ + std::lock_guard l(lock); + + fsmap = new_fsmap; +} + +void ClusterState::set_mgr_map(MgrMap const &new_mgrmap) +{ + std::lock_guard l(lock); + mgr_map = new_mgrmap; +} + +void ClusterState::set_service_map(ServiceMap const &new_service_map) +{ + std::lock_guard l(lock); + servicemap = new_service_map; +} + +void ClusterState::load_digest(MMgrDigest *m) +{ + std::lock_guard l(lock); + health_json = std::move(m->health_json); + mon_status_json = std::move(m->mon_status_json); +} + +void ClusterState::ingest_pgstats(ref_t<MPGStats> stats) +{ + std::lock_guard l(lock); + + const int from = stats->get_orig_source().num(); + bool is_in = with_osdmap([from](const OSDMap& osdmap) { + return osdmap.is_in(from); + }); + + if (is_in) { + pending_inc.update_stat(from, std::move(stats->osd_stat)); + } else { + osd_stat_t empty_stat; + empty_stat.seq = stats->osd_stat.seq; + pending_inc.update_stat(from, std::move(empty_stat)); + } + + for (auto p : stats->pg_stat) { + pg_t pgid = p.first; + const auto &pg_stats = p.second; + + // In case we're hearing about a PG that according to last + // OSDMap update should not exist + auto r = existing_pools.find(pgid.pool()); + if (r == existing_pools.end()) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but pool not in " << existing_pools + << dendl; + continue; + } + if (pgid.ps() >= r->second) { + dout(15) << " got " << pgid + << " reported at " << pg_stats.reported_epoch << ":" + << pg_stats.reported_seq + << " state " << pg_state_string(pg_stats.state) + << " but > pg_num " << r->second + << dendl; + continue; + } + // In case we already heard about more recent stats from this PG + // from another OSD + const auto q = pg_map.pg_stat.find(pgid); + if (q != pg_map.pg_stat.end() && + q->second.get_version_pair() > pg_stats.get_version_pair()) { + dout(15) << " had " << pgid << " from " + << q->second.reported_epoch << ":" + << q->second.reported_seq << dendl; + continue; + } + + pending_inc.pg_stat_updates[pgid] = pg_stats; + } + for (auto p : stats->pool_stat) { + pending_inc.pool_statfs_updates[std::make_pair(p.first, from)] = p.second; + } +} + +void ClusterState::update_delta_stats() +{ + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << pending_inc.version << dendl; + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + 
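+  // Descriptive note (editor-added, not in the upstream commit): fold the
+  // accumulated incremental into pg_map, then reset it for the next round.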
pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); +} + +void ClusterState::notify_osdmap(const OSDMap &osd_map) +{ + assert(ceph_mutex_is_locked(lock)); + + pending_inc.stamp = ceph_clock_now(); + pending_inc.version = pg_map.version + 1; // to make apply_incremental happy + dout(10) << " v" << pending_inc.version << dendl; + + PGMapUpdater::check_osd_map(g_ceph_context, osd_map, pg_map, &pending_inc); + + // update our list of pools that exist, so that we can filter pg_map updates + // in synchrony with this OSDMap. + existing_pools.clear(); + for (auto& p : osd_map.get_pools()) { + existing_pools[p.first] = p.second.get_pg_num(); + } + + // brute force this for now (don't bother being clever by only + // checking osds that went up/down) + set<int> need_check_down_pg_osds; + PGMapUpdater::check_down_pgs(osd_map, pg_map, true, + need_check_down_pg_osds, &pending_inc); + + dout(30) << " pg_map before:\n"; + JSONFormatter jf(true); + jf.dump_object("pg_map", pg_map); + jf.flush(*_dout); + *_dout << dendl; + dout(30) << " incremental:\n"; + JSONFormatter jf(true); + jf.dump_object("pending_inc", pending_inc); + jf.flush(*_dout); + *_dout << dendl; + + pg_map.apply_incremental(g_ceph_context, pending_inc); + pending_inc = PGMap::Incremental(); + // TODO: Complete the separation of PG state handling so + // that a cut-down set of functionality remains in PGMonitor + // while the full-blown PGMap lives only here. +} + +class ClusterSocketHook : public AdminSocketHook { + ClusterState *cluster_state; +public: + explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {} + int call(std::string_view admin_command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) override { + stringstream outss; + int r = 0; + try { + r = cluster_state->asok_command(admin_command, cmdmap, f, outss); + out.append(outss); + } catch (const TOPNSPC::common::bad_cmd_get& e) { + errss << e.what(); + r = -EINVAL; + } + return r; + } +}; + +void ClusterState::final_init() +{ + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + asok_hook = new ClusterSocketHook(this); + int r = admin_socket->register_command( + "dump_osd_network name=value,type=CephInt,req=false", asok_hook, + "Dump osd heartbeat network ping times"); + ceph_assert(r == 0); +} + +void ClusterState::shutdown() +{ + // unregister commands + g_ceph_context->get_admin_socket()->unregister_commands(asok_hook); + delete asok_hook; + asok_hook = NULL; +} + +bool ClusterState::asok_command( + std::string_view admin_command, + const cmdmap_t& cmdmap, + Formatter *f, + ostream& ss) +{ + std::lock_guard l(lock); + + if (admin_command == "dump_osd_network") { + int64_t value = 0; + // Default to health warning level if nothing specified + if (!(TOPNSPC::common::cmd_getval(cmdmap, "value", value))) { + // Convert milliseconds to microseconds + value = static_cast<int64_t>(g_ceph_context->_conf.get_val<double>("mon_warn_on_slow_ping_time")) * 1000; + if (value == 0) { + double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio"); + value = g_conf().get_val<int64_t>("osd_heartbeat_grace"); + value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + } else { + // Convert user input to microseconds + value *= 1000; + } + if (value < 0) + value = 0; + + struct mgr_ping_time_t { + uint32_t pingtime; + int from; + int to; + bool back; + std::array<uint32_t,3> times; + std::array<uint32_t,3> min; + std::array<uint32_t,3> max; + 
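+      // Descriptive note (editor-added, not in the upstream commit): the
+      // fields below hold the most recent ping sample and the time (in
+      // seconds) the heartbeat stats were last updated.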
uint32_t last; + uint32_t last_update; + + bool operator<(const mgr_ping_time_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (from < rhs.from) + return true; + if (from > rhs.from) + return false; + if (to < rhs.to) + return true; + if (to > rhs.to) + return false; + return back; + } + }; + + set<mgr_ping_time_t> sorted; + utime_t now = ceph_clock_now(); + for (auto i : pg_map.osd_stat) { + for (auto j : i.second.hb_pingtime) { + + if (j.second.last_update == 0) + continue; + auto stale_time = g_ceph_context->_conf.get_val<int64_t>("osd_mon_heartbeat_stat_stale"); + if (now.sec() - j.second.last_update > stale_time) { + dout(20) << __func__ << " time out heartbeat for osd " << i.first + << " last_update " << j.second.last_update << dendl; + continue; + } + mgr_ping_time_t item; + item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.back_pingtime[0]; + item.times[1] = j.second.back_pingtime[1]; + item.times[2] = j.second.back_pingtime[2]; + item.min[0] = j.second.back_min[0]; + item.min[1] = j.second.back_min[1]; + item.min[2] = j.second.back_min[2]; + item.max[0] = j.second.back_max[0]; + item.max[1] = j.second.back_max[1]; + item.max[2] = j.second.back_max[2]; + item.last = j.second.back_last; + item.back = true; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + + if (j.second.front_last == 0) + continue; + item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]); + if (!value || item.pingtime >= value) { + item.from = i.first; + item.to = j.first; + item.times[0] = j.second.front_pingtime[0]; + item.times[1] = j.second.front_pingtime[1]; + item.times[2] = j.second.front_pingtime[2]; + item.min[0] = j.second.front_min[0]; + item.min[1] = j.second.front_min[1]; + item.min[2] = j.second.front_min[2]; + item.max[0] = j.second.front_max[0]; + item.max[1] = j.second.front_max[1]; + item.max[2] = j.second.front_max[2]; + item.last = j.second.front_last; + item.back = false; + item.last_update = j.second.last_update; + sorted.emplace(item); + } + } + } + + // Network ping times (1min 5min 15min) + f->open_object_section("network_ping_times"); + f->dump_int("threshold", value / 1000); + f->open_array_section("entries"); + for (auto &sitem : boost::adaptors::reverse(sorted)) { + ceph_assert(!value || sitem.pingtime >= value); + + f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val<int64_t>("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); + f->dump_int("from osd", sitem.from); + f->dump_int("to osd", sitem.to); + f->dump_string("interface", (sitem.back ? 
"back" : "front")); + f->open_object_section("average"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str()); + f->close_section(); // average + f->open_object_section("min"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str()); + f->close_section(); // min + f->open_object_section("max"); + f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str()); + f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str()); + f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str()); + f->close_section(); // max + f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str()); + f->close_section(); // entry + } + f->close_section(); // entries + f->close_section(); // network_ping_times + } else { + ceph_abort_msg("broken asok registration"); + } + return true; +} diff --git a/src/mgr/ClusterState.h b/src/mgr/ClusterState.h new file mode 100644 index 000000000..7939cd8eb --- /dev/null +++ b/src/mgr/ClusterState.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CLUSTER_STATE_H_ +#define CLUSTER_STATE_H_ + +#include "mds/FSMap.h" +#include "mon/MgrMap.h" +#include "common/ceph_mutex.h" + +#include "osdc/Objecter.h" +#include "mon/MonClient.h" +#include "mon/PGMap.h" +#include "mgr/ServiceMap.h" + +class MMgrDigest; +class MMonMgrReport; +class MPGStats; + + +/** + * Cluster-scope state (things like cluster maps) as opposed + * to daemon-level state (things like perf counters and smart) + */ +class ClusterState +{ +protected: + MonClient *monc; + Objecter *objecter; + FSMap fsmap; + ServiceMap servicemap; + mutable ceph::mutex lock = ceph::make_mutex("ClusterState"); + + MgrMap mgr_map; + + std::map<int64_t,unsigned> existing_pools; ///< pools that exist, and pg_num, as of PGMap epoch + PGMap pg_map; + PGMap::Incremental pending_inc; + + bufferlist health_json; + bufferlist mon_status_json; + + class ClusterSocketHook *asok_hook; + +public: + + void load_digest(MMgrDigest *m); + void ingest_pgstats(ceph::ref_t<MPGStats> stats); + + void update_delta_stats(); + + ClusterState(MonClient *monc_, Objecter *objecter_, const MgrMap& mgrmap); + + void set_objecter(Objecter *objecter_); + void set_fsmap(FSMap const &new_fsmap); + void set_mgr_map(MgrMap const &new_mgrmap); + void set_service_map(ServiceMap const &new_service_map); + + void notify_osdmap(const OSDMap &osd_map); + + bool have_fsmap() const { + std::lock_guard l(lock); + return fsmap.get_epoch() > 0; + } + + template<typename Callback, typename...Args> + auto with_servicemap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(servicemap, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_fsmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(fsmap, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mgrmap(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_pgmap(Callback&& cb, Args&&...args) const -> + decltype(cb(pg_map, std::forward<Args>(args)...)) + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mutable_pgmap(Callback&& cb, Args&&...args) -> + decltype(cb(pg_map, std::forward<Args>(args)...)) + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(pg_map, std::forward<Args>(args)...); + } + + template<typename... Args> + auto with_monmap(Args &&... args) const + { + std::lock_guard l(lock); + ceph_assert(monc != nullptr); + return monc->with_monmap(std::forward<Args>(args)...); + } + + template<typename... Args> + auto with_osdmap(Args &&... 
args) const -> + decltype(objecter->with_osdmap(std::forward<Args>(args)...)) + { + ceph_assert(objecter != nullptr); + return objecter->with_osdmap(std::forward<Args>(args)...); + } + + // call cb(osdmap, pg_map, ...args) with the appropriate locks + template <typename Callback, typename ...Args> + auto with_osdmap_and_pgmap(Callback&& cb, Args&& ...args) const { + ceph_assert(objecter != nullptr); + std::lock_guard l(lock); + return objecter->with_osdmap( + std::forward<Callback>(cb), + pg_map, + std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_health(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(health_json, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_mon_status(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + return std::forward<Callback>(cb)(mon_status_json, std::forward<Args>(args)...); + } + + void final_init(); + void shutdown(); + bool asok_command(std::string_view admin_command, + const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& ss); +}; + +#endif + diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h new file mode 100644 index 000000000..ce0dad2c8 --- /dev/null +++ b/src/mgr/DaemonHealthMetric.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <ostream> +#include "include/denc.h" + +enum class daemon_metric : uint8_t { + SLOW_OPS, + PENDING_CREATING_PGS, + NONE, +}; + +static inline const char *daemon_metric_name(daemon_metric t) { + switch (t) { + case daemon_metric::SLOW_OPS: return "SLOW_OPS"; + case daemon_metric::PENDING_CREATING_PGS: return "PENDING_CREATING_PGS"; + case daemon_metric::NONE: return "NONE"; + default: return "???"; + } +} + +union daemon_metric_t { + struct { + uint32_t n1; + uint32_t n2; + }; + uint64_t n; + daemon_metric_t(uint32_t x, uint32_t y) + : n1(x), n2(y) + {} + daemon_metric_t(uint64_t x = 0) + : n(x) + {} +}; + +class DaemonHealthMetric +{ +public: + DaemonHealthMetric() = default; + DaemonHealthMetric(daemon_metric type_, uint64_t n) + : type(type_), value(n) + {} + DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2) + : type(type_), value(n1, n2) + {} + + daemon_metric get_type() const { + return type; + } + uint64_t get_n() const { + return value.n; + } + uint32_t get_n1() const { + return value.n1; + } + uint32_t get_n2() const { + return value.n2; + } + + DENC(DaemonHealthMetric, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.value.n, p); + DENC_FINISH(p); + } + + std::string get_type_name() const { + return daemon_metric_name(get_type()); + } + + friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) { + return out << daemon_metric_name(m.get_type()) << "(" + << m.get_n() << "|(" << m.get_n1() << "," << m.get_n2() << "))"; + } +private: + daemon_metric type = daemon_metric::NONE; + daemon_metric_t value; +}; +WRITE_CLASS_DENC(DaemonHealthMetric) diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc new file mode 100644 index 000000000..bf206015a --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.cc @@ -0,0 +1,105 @@ +#include <fmt/format.h> + +#include "include/health.h" +#include "include/types.h" +#include "DaemonHealthMetricCollector.h" + +namespace { + +using std::unique_ptr; +using std::vector; +using std::ostringstream; + +class 
SlowOps final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::SLOW_OPS; + } + health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1); + } + bool _update(const DaemonKey& daemon, + const DaemonHealthMetric& metric) override { + auto num_slow = metric.get_n1(); + auto blocked_time = metric.get_n2(); + value.n1 += num_slow; + value.n2 = std::max(value.n2, blocked_time); + if (num_slow || blocked_time) { + daemons.push_back(daemon); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (daemons.empty()) { + return; + } + // Note this message format is used in mgr/prometheus, so any change in format + // requires a corresponding change in the mgr/prometheus module. + ostringstream ss; + if (daemons.size() > 1) { + if (daemons.size() > 10) { + ss << "daemons " << vector<DaemonKey>(daemons.begin(), daemons.begin()+10) + << "..." << " have slow ops."; + } else { + ss << "daemons " << daemons << " have slow ops."; + } + } else { + ss << daemons.front() << " has slow ops"; + } + check.summary = + fmt::format("{} slow ops, oldest one blocked for {} sec, {}", + value.n1, value.n2, ss.str()); + // No detail + } + vector<DaemonKey> daemons; +}; + + +class PendingPGs final : public DaemonHealthMetricCollector { + bool _is_relevant(daemon_metric type) const override { + return type == daemon_metric::PENDING_CREATING_PGS; + } + health_check_t& _get_check(health_check_map_t& cm) const override { + return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1); + } + bool _update(const DaemonKey& osd, + const DaemonHealthMetric& metric) override { + value.n += metric.get_n(); + if (metric.get_n()) { + osds.push_back(osd); + return true; + } else { + return false; + } + } + void _summarize(health_check_t& check) const override { + if (osds.empty()) { + return; + } + check.summary = fmt::format("{} PGs pending on creation", value.n); + ostringstream ss; + if (osds.size() > 1) { + ss << "osds " << osds << " have pending PGs."; + } else { + ss << osds.front() << " has pending PGs"; + } + check.detail.push_back(ss.str()); + } + vector<DaemonKey> osds; +}; + +} // anonymous namespace + +unique_ptr<DaemonHealthMetricCollector> +DaemonHealthMetricCollector::create(daemon_metric m) +{ + switch (m) { + case daemon_metric::SLOW_OPS: + return std::make_unique<SlowOps>(); + case daemon_metric::PENDING_CREATING_PGS: + return std::make_unique<PendingPGs>(); + default: + return {}; + } +} diff --git a/src/mgr/DaemonHealthMetricCollector.h b/src/mgr/DaemonHealthMetricCollector.h new file mode 100644 index 000000000..558f4e334 --- /dev/null +++ b/src/mgr/DaemonHealthMetricCollector.h @@ -0,0 +1,32 @@ +#pragma once + +#include <memory> +#include <string> + +#include "DaemonHealthMetric.h" +#include "DaemonKey.h" +#include "mon/health_check.h" + +class DaemonHealthMetricCollector { +public: + static std::unique_ptr<DaemonHealthMetricCollector> create(daemon_metric m); + void update(const DaemonKey& daemon, const DaemonHealthMetric& metric) { + if (_is_relevant(metric.get_type())) { + reported |= _update(daemon, metric); + } + } + void summarize(health_check_map_t& cm) { + if (reported) { + _summarize(_get_check(cm)); + } + } + virtual ~DaemonHealthMetricCollector() {} +private: + virtual bool _is_relevant(daemon_metric type) const = 0; + virtual health_check_t& _get_check(health_check_map_t& cm) const = 0; + virtual bool 
_update(const DaemonKey& daemon, const DaemonHealthMetric& metric) = 0; + virtual void _summarize(health_check_t& check) const = 0; +protected: + daemon_metric_t value; + bool reported = false; +}; diff --git a/src/mgr/DaemonKey.cc b/src/mgr/DaemonKey.cc new file mode 100644 index 000000000..5501ac106 --- /dev/null +++ b/src/mgr/DaemonKey.cc @@ -0,0 +1,35 @@ +#include "DaemonKey.h" + +std::pair<DaemonKey, bool> DaemonKey::parse(const std::string& s) +{ + auto p = s.find('.'); + if (p == s.npos) { + return {{}, false}; + } else { + return {DaemonKey{s.substr(0, p), s.substr(p + 1)}, true}; + } +} + +bool operator<(const DaemonKey& lhs, const DaemonKey& rhs) +{ + if (int cmp = lhs.type.compare(rhs.type); cmp < 0) { + return true; + } else if (cmp > 0) { + return false; + } else { + return lhs.name < rhs.name; + } +} + +std::ostream& operator<<(std::ostream& os, const DaemonKey& key) +{ + return os << key.type << '.' << key.name; +} + +namespace ceph { +std::string to_string(const DaemonKey& key) +{ + return key.type + '.' + key.name; +} +} + diff --git a/src/mgr/DaemonKey.h b/src/mgr/DaemonKey.h new file mode 100644 index 000000000..92bacd649 --- /dev/null +++ b/src/mgr/DaemonKey.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <string> +#include <utility> + +// Unique reference to a daemon within a cluster +struct DaemonKey +{ + std::string type; // service type, like "osd", "mon" + std::string name; // service id / name, like "1", "a" + static std::pair<DaemonKey, bool> parse(const std::string& s); +}; + +bool operator<(const DaemonKey& lhs, const DaemonKey& rhs); +std::ostream& operator<<(std::ostream& os, const DaemonKey& key); + +namespace ceph { + std::string to_string(const DaemonKey& key); +} + diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc new file mode 100644 index 000000000..0e9e6be2a --- /dev/null +++ b/src/mgr/DaemonServer.cc @@ -0,0 +1,3142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "DaemonServer.h" +#include <boost/algorithm/string.hpp> +#include "mgr/Mgr.h" + +#include "include/stringify.h" +#include "include/str_list.h" +#include "auth/RotatingKeyRing.h" +#include "json_spirit/json_spirit_writer.h" + +#include "mgr/mgr_commands.h" +#include "mgr/DaemonHealthMetricCollector.h" +#include "mgr/OSDPerfMetricCollector.h" +#include "mgr/MDSPerfMetricCollector.h" +#include "mon/MonCommand.h" + +#include "messages/MMgrOpen.h" +#include "messages/MMgrUpdate.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMonMgrReport.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MMgrCommand.h" +#include "messages/MMgrCommandReply.h" +#include "messages/MPGStats.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDForceRecovery.h" +#include "common/errno.h" +#include "common/pick_address.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.server " << __func__ << " " + +using namespace TOPNSPC::common; + +using std::list; +using std::ostringstream; +using std::string; +using std::stringstream; +using std::vector; +using std::unique_ptr; + +namespace { + template <typename Map> + bool map_compare(Map const &lhs, Map const &rhs) { + return lhs.size() == rhs.size() + && std::equal(lhs.begin(), lhs.end(), rhs.begin(), + [] (auto a, auto b) { return a.first == b.first && a.second == b.second; }); + } +} + +DaemonServer::DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef clog_, + LogChannelRef audit_clog_) + : Dispatcher(g_ceph_context), + client_byte_throttler(new Throttle(g_ceph_context, "mgr_client_bytes", + g_conf().get_val<Option::size_t>("mgr_client_bytes"))), + client_msg_throttler(new Throttle(g_ceph_context, "mgr_client_messages", + g_conf().get_val<uint64_t>("mgr_client_messages"))), + osd_byte_throttler(new Throttle(g_ceph_context, "mgr_osd_bytes", + g_conf().get_val<Option::size_t>("mgr_osd_bytes"))), + osd_msg_throttler(new Throttle(g_ceph_context, "mgr_osd_messsages", + g_conf().get_val<uint64_t>("mgr_osd_messages"))), + mds_byte_throttler(new Throttle(g_ceph_context, "mgr_mds_bytes", + g_conf().get_val<Option::size_t>("mgr_mds_bytes"))), + mds_msg_throttler(new Throttle(g_ceph_context, "mgr_mds_messsages", + g_conf().get_val<uint64_t>("mgr_mds_messages"))), + mon_byte_throttler(new Throttle(g_ceph_context, "mgr_mon_bytes", + g_conf().get_val<Option::size_t>("mgr_mon_bytes"))), + mon_msg_throttler(new Throttle(g_ceph_context, "mgr_mon_messsages", + g_conf().get_val<uint64_t>("mgr_mon_messages"))), + msgr(nullptr), + monc(monc_), + finisher(finisher_), + daemon_state(daemon_state_), + cluster_state(cluster_state_), + py_modules(py_modules_), + clog(clog_), + audit_clog(audit_clog_), + pgmap_ready(false), + timer(g_ceph_context, lock), + shutting_down(false), + tick_event(nullptr), + osd_perf_metric_collector_listener(this), + osd_perf_metric_collector(osd_perf_metric_collector_listener), + mds_perf_metric_collector_listener(this), + mds_perf_metric_collector(mds_perf_metric_collector_listener) +{ + g_conf().add_observer(this); +} + +DaemonServer::~DaemonServer() { + delete msgr; + g_conf().remove_observer(this); +} + +int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs) +{ + // Initialize Messenger + std::string public_msgr_type = 
g_conf()->ms_public_type.empty() ? + g_conf().get_val<std::string>("ms_type") : g_conf()->ms_public_type; + msgr = Messenger::create(g_ceph_context, public_msgr_type, + entity_name_t::MGR(gid), + "mgr", + Messenger::get_pid_nonce()); + msgr->set_default_policy(Messenger::Policy::stateless_server(0)); + + msgr->set_auth_client(monc); + + // throttle clients + msgr->set_policy_throttlers(entity_name_t::TYPE_CLIENT, + client_byte_throttler.get(), + client_msg_throttler.get()); + + // servers + msgr->set_policy_throttlers(entity_name_t::TYPE_OSD, + osd_byte_throttler.get(), + osd_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MDS, + mds_byte_throttler.get(), + mds_msg_throttler.get()); + msgr->set_policy_throttlers(entity_name_t::TYPE_MON, + mon_byte_throttler.get(), + mon_msg_throttler.get()); + + entity_addrvec_t addrs; + int r = pick_addresses(cct, CEPH_PICK_ADDRESS_PUBLIC, &addrs); + if (r < 0) { + return r; + } + dout(20) << __func__ << " will bind to " << addrs << dendl; + r = msgr->bindv(addrs); + if (r < 0) { + derr << "unable to bind mgr to " << addrs << dendl; + return r; + } + + msgr->set_myname(entity_name_t::MGR(gid)); + msgr->set_addr_unknowns(client_addrs); + + msgr->start(); + msgr->add_dispatcher_tail(this); + + msgr->set_auth_server(monc); + monc->set_handle_authentication_dispatcher(this); + + started_at = ceph_clock_now(); + + std::lock_guard l(lock); + timer.init(); + + schedule_tick_locked( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count()); + + return 0; +} + +entity_addrvec_t DaemonServer::get_myaddrs() const +{ + return msgr->get_myaddrs(); +} + +int DaemonServer::ms_handle_fast_authentication(Connection *con) +{ + auto s = ceph::make_ref<MgrSession>(cct); + con->set_priv(s); + s->inst.addr = con->get_peer_addr(); + s->entity_name = con->peer_name; + dout(10) << __func__ << " new session " << s << " con " << con + << " entity " << con->peer_name + << " addr " << con->get_peer_addrs() + << dendl; + + AuthCapsInfo &caps_info = con->get_peer_caps_info(); + if (caps_info.allow_all) { + dout(10) << " session " << s << " " << s->entity_name + << " allow_all" << dendl; + s->caps.set_allow_all(); + } else if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } + catch (buffer::error& e) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to decode caps" << dendl; + return -EACCES; + } + if (!s->caps.parse(str)) { + dout(10) << " session " << s << " " << s->entity_name + << " failed to parse caps '" << str << "'" << dendl; + return -EACCES; + } + dout(10) << " session " << s << " " << s->entity_name + << " has caps " << s->caps << " '" << str << "'" << dendl; + } + return 1; +} + +void DaemonServer::ms_handle_accept(Connection* con) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + auto s = ceph::ref_cast<MgrSession>(con->get_priv()); + std::lock_guard l(lock); + s->osd_id = atoi(s->entity_name.get_id().c_str()); + dout(10) << "registering osd." << s->osd_id << " session " + << s << " con " << con << dendl; + osd_cons[s->osd_id].insert(con); + } +} + +bool DaemonServer::ms_handle_reset(Connection *con) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + auto priv = con->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return false; + } + std::lock_guard l(lock); + dout(10) << "unregistering osd." 
<< session->osd_id + << " session " << session << " con " << con << dendl; + osd_cons[session->osd_id].erase(con); + + auto iter = daemon_connections.find(con); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + } + return false; +} + +bool DaemonServer::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +bool DaemonServer::ms_dispatch2(const ref_t<Message>& m) +{ + // Note that we do *not* take ::lock here, in order to avoid + // serializing all message handling. It's up to each handler + // to take whatever locks it needs. + switch (m->get_type()) { + case MSG_PGSTATS: + cluster_state.ingest_pgstats(ref_cast<MPGStats>(m)); + maybe_ready(m->get_source().num()); + return true; + case MSG_MGR_REPORT: + return handle_report(ref_cast<MMgrReport>(m)); + case MSG_MGR_OPEN: + return handle_open(ref_cast<MMgrOpen>(m)); + case MSG_MGR_UPDATE: + return handle_update(ref_cast<MMgrUpdate>(m)); + case MSG_MGR_CLOSE: + return handle_close(ref_cast<MMgrClose>(m)); + case MSG_COMMAND: + return handle_command(ref_cast<MCommand>(m)); + case MSG_MGR_COMMAND: + return handle_command(ref_cast<MMgrCommand>(m)); + default: + dout(1) << "Unhandled message type " << m->get_type() << dendl; + return false; + }; +} + +void DaemonServer::dump_pg_ready(ceph::Formatter *f) +{ + f->dump_bool("pg_ready", pgmap_ready.load()); +} + +void DaemonServer::maybe_ready(int32_t osd_id) +{ + if (pgmap_ready.load()) { + // Fast path: we don't need to take lock because pgmap_ready + // is already set + } else { + std::lock_guard l(lock); + + if (reported_osds.find(osd_id) == reported_osds.end()) { + dout(4) << "initial report from osd " << osd_id << dendl; + reported_osds.insert(osd_id); + std::set<int32_t> up_osds; + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + osdmap.get_up_osds(up_osds); + }); + + std::set<int32_t> unreported_osds; + std::set_difference(up_osds.begin(), up_osds.end(), + reported_osds.begin(), reported_osds.end(), + std::inserter(unreported_osds, unreported_osds.begin())); + + if (unreported_osds.size() == 0) { + dout(4) << "all osds have reported, sending PG state to mon" << dendl; + pgmap_ready = true; + reported_osds.clear(); + // Avoid waiting for next tick + send_report(); + } else { + dout(4) << "still waiting for " << unreported_osds.size() << " osds" + " to report in before PGMap is ready" << dendl; + } + } + } +} + +void DaemonServer::tick() +{ + dout(10) << dendl; + send_report(); + adjust_pgs(); + + schedule_tick_locked( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count()); +} + +// Currently modules do not set health checks in response to events delivered to +// all modules (e.g. notify) so we do not risk a thundering hurd situation here. +// if this pattern emerges in the future, this scheduler could be modified to +// fire after all modules have had a chance to set their health checks. +void DaemonServer::schedule_tick_locked(double delay_sec) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = nullptr; + } + + // on shutdown start rejecting explicit requests to send reports that may + // originate from python land which may still be running. 
+ if (shutting_down) + return; + + tick_event = timer.add_event_after(delay_sec, + new LambdaContext([this](int r) { + tick(); + })); +} + +void DaemonServer::schedule_tick(double delay_sec) +{ + std::lock_guard l(lock); + schedule_tick_locked(delay_sec); +} + +void DaemonServer::handle_osd_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_osd()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::handle_mds_perf_metric_query_updated() +{ + dout(10) << dendl; + + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + if (c->peer_is_mds()) { + _send_configure(c); + } + } + })); +} + +void DaemonServer::shutdown() +{ + dout(10) << "begin" << dendl; + msgr->shutdown(); + msgr->wait(); + cluster_state.shutdown(); + dout(10) << "done" << dendl; + + std::lock_guard l(lock); + shutting_down = true; + timer.shutdown(); +} + +static DaemonKey key_from_service( + const std::string& service_name, + int peer_type, + const std::string& daemon_name) +{ + if (!service_name.empty()) { + return DaemonKey{service_name, daemon_name}; + } else { + return DaemonKey{ceph_entity_type_name(peer_type), daemon_name}; + } +} + +void DaemonServer::fetch_missing_metadata(const DaemonKey& key, + const entity_addr_t& addr) +{ + if (!daemon_state.is_updating(key) && + (key.type == "osd" || key.type == "mds" || key.type == "mon")) { + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + if (key.type == "osd") { + oss << "{\"prefix\": \"osd metadata\", \"id\": " + << key.name<< "}"; + } else if (key.type == "mds") { + c->set_default("addr", stringify(addr)); + oss << "{\"prefix\": \"mds metadata\", \"who\": \"" + << key.name << "\"}"; + } else if (key.type == "mon") { + oss << "{\"prefix\": \"mon metadata\", \"id\": \"" + << key.name << "\"}"; + } else { + ceph_abort(); + } + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + } +} + +bool DaemonServer::handle_open(const ref_t<MMgrOpen>& m) +{ + std::unique_lock l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + + auto con = m->get_connection(); + dout(10) << "from " << key << " " << con->get_peer_addr() << dendl; + + _send_configure(con); + + DaemonStatePtr daemon; + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } + if (!daemon) { + if (m->service_daemon) { + dout(4) << "constructing new DaemonState for " << key << dendl; + daemon = std::make_shared<DaemonState>(daemon_state.types); + daemon->key = key; + daemon->service_daemon = true; + daemon_state.insert(daemon); + } else { + /* A normal Ceph daemon has connected but we are or should be waiting on + * metadata for it. Close the session so that it tries to reconnect. 
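+       * (Editor note, not in the upstream commit: the missing metadata is
+       * then requested in the background via fetch_missing_metadata() below.)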
+ */ + dout(2) << "ignoring open from " << key << " " << con->get_peer_addr() + << "; not ready for session (expect reconnect)" << dendl; + con->mark_down(); + l.unlock(); + fetch_missing_metadata(key, m->get_source_addr()); + return true; + } + } + if (daemon) { + if (m->service_daemon) { + // update the metadata through the daemon state index to + // ensure it's kept up-to-date + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + + std::lock_guard l(daemon->lock); + daemon->perf_counters.clear(); + + daemon->service_daemon = m->service_daemon; + if (m->service_daemon) { + daemon->service_status = m->daemon_status; + + utime_t now = ceph_clock_now(); + auto [d, added] = pending_service_map.get_daemon(m->service_name, + m->daemon_name); + if (added || d->gid != (uint64_t)m->get_source().num()) { + dout(10) << "registering " << key << " in pending_service_map" << dendl; + d->gid = m->get_source().num(); + d->addr = m->get_source_addr(); + d->start_epoch = pending_service_map.epoch; + d->start_stamp = now; + d->metadata = m->daemon_metadata; + pending_service_map_dirty = pending_service_map.epoch; + } + } + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + daemon->config_defaults_bl = m->config_defaults_bl; + daemon->config_defaults.clear(); + dout(20) << " got config_defaults_bl " << daemon->config_defaults_bl.length() + << " bytes" << dendl; + } + + if (con->get_peer_type() != entity_name_t::TYPE_CLIENT && + m->service_name.empty()) + { + // Store in set of the daemon/service connections, i.e. those + // connections that require an update in the event of stats + // configuration changes. 
+ daemon_connections.insert(con); + } + + return true; +} + +bool DaemonServer::handle_update(const ref_t<MMgrUpdate>& m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.type = m->service_name; + } else { + key.type = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.name = m->daemon_name; + + dout(10) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us update request + dout(10) << "rejecting update request from non-daemon client " << m->daemon_name + << dendl; + clog->warn() << "rejecting report from non-daemon client " << m->daemon_name + << " at " << m->get_connection()->get_peer_addrs(); + m->get_connection()->mark_down(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + + daemon = daemon_state.get(key); + if (m->need_metadata_update && + !m->daemon_metadata.empty()) { + daemon_state.update_metadata(daemon, m->daemon_metadata); + } + } + } + + return true; +} + +bool DaemonServer::handle_close(const ref_t<MMgrClose>& m) +{ + std::lock_guard l(lock); + + DaemonKey key = key_from_service(m->service_name, + m->get_connection()->get_peer_type(), + m->daemon_name); + dout(4) << "from " << m->get_connection() << " " << key << dendl; + + if (daemon_state.exists(key)) { + DaemonStatePtr daemon = daemon_state.get(key); + daemon_state.rm(key); + { + std::lock_guard l(daemon->lock); + if (daemon->service_daemon) { + pending_service_map.rm_daemon(m->service_name, m->daemon_name); + pending_service_map_dirty = pending_service_map.epoch; + } + } + } + + // send same message back as a reply + m->get_connection()->send_message2(m); + return true; +} + +void DaemonServer::update_task_status( + DaemonKey key, + const std::map<std::string,std::string>& task_status) +{ + dout(10) << "got task status from " << key << dendl; + + [[maybe_unused]] auto [daemon, added] = + pending_service_map.get_daemon(key.type, key.name); + if (daemon->task_status != task_status) { + daemon->task_status = task_status; + pending_service_map_dirty = pending_service_map.epoch; + } +} + +bool DaemonServer::handle_report(const ref_t<MMgrReport>& m) +{ + DaemonKey key; + if (!m->service_name.empty()) { + key.type = m->service_name; + } else { + key.type = ceph_entity_type_name(m->get_connection()->get_peer_type()); + } + key.name = m->daemon_name; + + dout(10) << "from " << m->get_connection() << " " << key << dendl; + + if (m->get_connection()->get_peer_type() == entity_name_t::TYPE_CLIENT && + m->service_name.empty()) { + // Clients should not be sending us stats unless they are declaring + // themselves to be a daemon for some service. + dout(10) << "rejecting report from non-daemon client " << m->daemon_name + << dendl; + clog->warn() << "rejecting report from non-daemon client " << m->daemon_name + << " at " << m->get_connection()->get_peer_addrs(); + m->get_connection()->mark_down(); + return true; + } + + + { + std::unique_lock locker(lock); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon = daemon_state.get(key); daemon != nullptr) { + dout(20) << "updating existing DaemonState for " << key << dendl; + } else { + locker.unlock(); + + // we don't know the hostname at this stage, reject MMgrReport here. 
+ dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." + << dendl; + // issue metadata request in background + fetch_missing_metadata(key, m->get_source_addr()); + + locker.lock(); + + // kill session + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return false; + } + m->get_connection()->mark_down(); + + dout(10) << "unregistering osd." << session->osd_id + << " session " << session << " con " << m->get_connection() << dendl; + + if (osd_cons.find(session->osd_id) != osd_cons.end()) { + osd_cons[session->osd_id].erase(m->get_connection()); + } + + auto iter = daemon_connections.find(m->get_connection()); + if (iter != daemon_connections.end()) { + daemon_connections.erase(iter); + } + + return false; + } + + // Update the DaemonState + ceph_assert(daemon != nullptr); + { + std::lock_guard l(daemon->lock); + auto &daemon_counters = daemon->perf_counters; + daemon_counters.update(*m.get()); + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + + utime_t now = ceph_clock_now(); + if (daemon->service_daemon) { + if (m->daemon_status) { + daemon->service_status_stamp = now; + daemon->service_status = *m->daemon_status; + } + daemon->last_service_beacon = now; + } else if (m->daemon_status) { + derr << "got status from non-daemon " << key << dendl; + } + // update task status + if (m->task_status) { + update_task_status(key, *m->task_status); + daemon->last_service_beacon = now; + } + if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { + // only OSD and MON send health_checks to me now + daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); + dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics + << dendl; + } + } + } + + // if there are any schema updates, notify the python modules + /* no users currently + if (!m->declare_types.empty() || !m->undeclare_types.empty()) { + py_modules.notify_all("perf_schema_update", ceph::to_string(key)); + } + */ + + if (m->get_connection()->peer_is_osd()) { + osd_perf_metric_collector.process_reports(m->osd_perf_metric_reports); + } + + if (m->metric_report_message) { + const MetricReportMessage &message = *m->metric_report_message; + boost::apply_visitor(HandlePayloadVisitor(this), message.payload); + } + + return true; +} + + +void DaemonServer::_generate_command_map( + cmdmap_t& cmdmap, + map<string,string> ¶m_str_map) +{ + for (auto p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector<string> cv; + if (cmd_getval(cmdmap, "caps", cv) && + cv.size() % 2 == 0) { + for (unsigned i = 0; i < cv.size(); i += 2) { + string k = string("caps_") + cv[i]; + param_str_map[k] = cv[i + 1]; + } + continue; + } + } + param_str_map[p->first] = cmd_vartype_stringify(p->second); + } +} + +const MonCommand *DaemonServer::_get_mgrcommand( + const string &cmd_prefix, + const std::vector<MonCommand> &cmds) +{ + const MonCommand *this_cmd = nullptr; + for (const auto &cmd : cmds) { + if (cmd.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) { + this_cmd = &cmd; + break; + } + } + return this_cmd; +} + +bool DaemonServer::_allowed_command( + MgrSession *s, + const string &service, + const string &module, + const string &prefix, + const 
cmdmap_t& cmdmap, + const map<string,string>& param_str_map, + const MonCommand *this_cmd) { + + if (s->entity_name.is_mon()) { + // mon is all-powerful. even when it is forwarding commands on behalf of + // old clients; we expect the mon is validating commands before proxying! + return true; + } + + bool cmd_r = this_cmd->requires_perm('r'); + bool cmd_w = this_cmd->requires_perm('w'); + bool cmd_x = this_cmd->requires_perm('x'); + + bool capable = s->caps.is_capable( + g_ceph_context, + s->entity_name, + service, module, prefix, param_str_map, + cmd_r, cmd_w, cmd_x, + s->get_peer_addr()); + + dout(10) << " " << s->entity_name << " " + << (capable ? "" : "not ") << "capable" << dendl; + return capable; +} + +/** + * The working data for processing an MCommand. This lives in + * a class to enable passing it into other threads for processing + * outside of the thread/locks that called handle_command. + */ +class CommandContext { +public: + ceph::ref_t<MCommand> m_tell; + ceph::ref_t<MMgrCommand> m_mgr; + const std::vector<std::string>& cmd; ///< ref into m_tell or m_mgr + const bufferlist& data; ///< ref into m_tell or m_mgr + bufferlist odata; + cmdmap_t cmdmap; + + explicit CommandContext(ceph::ref_t<MCommand> m) + : m_tell{std::move(m)}, + cmd(m_tell->cmd), + data(m_tell->get_data()) { + } + explicit CommandContext(ceph::ref_t<MMgrCommand> m) + : m_mgr{std::move(m)}, + cmd(m_mgr->cmd), + data(m_mgr->get_data()) { + } + + void reply(int r, const std::stringstream &ss) { + reply(r, ss.str()); + } + + void reply(int r, const std::string &rs) { + // Let the connection drop as soon as we've sent our response + ConnectionRef con = m_tell ? m_tell->get_connection() + : m_mgr->get_connection(); + if (con) { + con->mark_disposable(); + } + + if (r == 0) { + dout(20) << "success" << dendl; + } else { + derr << __func__ << " " << cpp_strerror(r) << " " << rs << dendl; + } + if (con) { + if (m_tell) { + MCommandReply *reply = new MCommandReply(r, rs); + reply->set_tid(m_tell->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } else { + MMgrCommandReply *reply = new MMgrCommandReply(r, rs); + reply->set_tid(m_mgr->get_tid()); + reply->set_data(odata); + con->send_message(reply); + } + } + } +}; + +/** + * A context for receiving a bufferlist/error string from a background + * function and then calling back to a CommandContext when it's done + */ +class ReplyOnFinish : public Context { + std::shared_ptr<CommandContext> cmdctx; + +public: + bufferlist from_mon; + string outs; + + explicit ReplyOnFinish(const std::shared_ptr<CommandContext> &cmdctx_) + : cmdctx(cmdctx_) + {} + void finish(int r) override { + cmdctx->odata.claim_append(from_mon); + cmdctx->reply(r, outs); + } +}; + +bool DaemonServer::handle_command(const ref_t<MCommand>& m) +{ + std::lock_guard l(lock); + auto cmdctx = std::make_shared<CommandContext>(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +bool DaemonServer::handle_command(const ref_t<MMgrCommand>& m) +{ + std::lock_guard l(lock); + auto cmdctx = std::make_shared<CommandContext>(m); + try { + return _handle_command(cmdctx); + } catch (const bad_cmd_get& e) { + cmdctx->reply(-EINVAL, e.what()); + return true; + } +} + +void DaemonServer::log_access_denied( + std::shared_ptr<CommandContext>& cmdctx, + MgrSession* session, std::stringstream& ss) { + dout(1) << " access denied" << dendl; + audit_clog->info() << "from='" << session->inst << "' " + << "entity='" << 
session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": access denied"; + ss << "access denied: does your client key have mgr caps? " + "See http://docs.ceph.com/en/latest/mgr/administrator/" + "#client-authentication"; +} + +void DaemonServer::_check_offlines_pgs( + const set<int>& osds, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report) +{ + // reset output + *report = offline_pg_report(); + report->osds = osds; + + for (const auto& q : pgmap.pg_stat) { + set<int32_t> pg_acting; // net acting sets (with no missing if degraded) + bool found = false; + if (q.second.state == 0) { + report->unknown.insert(q.first); + continue; + } + if (q.second.state & PG_STATE_DEGRADED) { + for (auto& anm : q.second.avail_no_missing) { + if (osds.count(anm.osd)) { + found = true; + continue; + } + if (anm.osd != CRUSH_ITEM_NONE) { + pg_acting.insert(anm.osd); + } + } + } else { + for (auto& a : q.second.acting) { + if (osds.count(a)) { + found = true; + continue; + } + if (a != CRUSH_ITEM_NONE) { + pg_acting.insert(a); + } + } + } + if (!found) { + continue; + } + const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool()); + bool dangerous = false; + if (!pi) { + report->bad_no_pool.insert(q.first); // pool is creating or deleting + dangerous = true; + } + if (!(q.second.state & PG_STATE_ACTIVE)) { + report->bad_already_inactive.insert(q.first); + dangerous = true; + } + if (pg_acting.size() < pi->min_size) { + report->bad_become_inactive.insert(q.first); + dangerous = true; + } + if (dangerous) { + report->not_ok.insert(q.first); + } else { + report->ok.insert(q.first); + if (q.second.state & PG_STATE_DEGRADED) { + report->ok_become_more_degraded.insert(q.first); + } else { + report->ok_become_degraded.insert(q.first); + } + } + } + dout(20) << osds << " -> " << report->ok.size() << " ok, " + << report->not_ok.size() << " not ok, " + << report->unknown.size() << " unknown" + << dendl; +} + +void DaemonServer::_maximize_ok_to_stop_set( + const set<int>& orig_osds, + unsigned max, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *out_report) +{ + dout(20) << "orig_osds " << orig_osds << " max " << max << dendl; + _check_offlines_pgs(orig_osds, osdmap, pgmap, out_report); + if (!out_report->ok_to_stop()) { + return; + } + if (orig_osds.size() >= max) { + // already at max + return; + } + + // semi-arbitrarily start with the first osd in the set + offline_pg_report report; + set<int> osds = orig_osds; + int parent = *osds.begin(); + set<int> children; + + while (true) { + // identify the next parent + int r = osdmap.crush->get_immediate_parent_id(parent, &parent); + if (r < 0) { + return; // just go with what we have so far! + } + + // get candidate additions that are beneath this point in the tree + children.clear(); + r = osdmap.crush->get_all_children(parent, &children); + if (r < 0) { + return; // just go with what we have so far! 
+ } + dout(20) << " parent " << parent << " children " << children << dendl; + + // try adding in more osds + int failed = 0; // how many children we failed to add to our set + for (auto o : children) { + if (o >= 0 && osdmap.is_up(o) && osds.count(o) == 0) { + osds.insert(o); + _check_offlines_pgs(osds, osdmap, pgmap, &report); + if (!report.ok_to_stop()) { + osds.erase(o); + ++failed; + continue; + } + *out_report = report; + if (osds.size() == max) { + dout(20) << " hit max" << dendl; + return; // yay, we hit the max + } + } + } + + if (failed) { + // we hit some failures; go with what we have + dout(20) << " hit some peer failures" << dendl; + return; + } + } +} + +bool DaemonServer::_handle_command( + std::shared_ptr<CommandContext>& cmdctx) +{ + MessageRef m; + bool admin_socket_cmd = false; + if (cmdctx->m_tell) { + m = cmdctx->m_tell; + // a blank fsid in MCommand signals a legacy client sending a "mon-mgr" CLI + // command. + admin_socket_cmd = (cmdctx->m_tell->fsid != uuid_d()); + } else { + m = cmdctx->m_mgr; + } + auto priv = m->get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + if (!session) { + return true; + } + if (session->inst.name == entity_name_t()) { + session->inst.name = m->get_source(); + } + + map<string,string> param_str_map; + std::stringstream ss; + int r = 0; + + if (!cmdmap_from_json(cmdctx->cmd, &(cmdctx->cmdmap), ss)) { + cmdctx->reply(-EINVAL, ss); + return true; + } + + string prefix; + cmd_getval(cmdctx->cmdmap, "prefix", prefix); + dout(10) << "decoded-size=" << cmdctx->cmdmap.size() << " prefix=" << prefix << dendl; + + boost::scoped_ptr<Formatter> f; + { + std::string format; + if (boost::algorithm::ends_with(prefix, "_json")) { + format = "json"; + } else { + format = cmd_getval_or<string>(cmdctx->cmdmap, "format", "plain"); + } + f.reset(Formatter::create(format)); + } + + // this is just for mgr commands - admin socket commands will fall + // through and use the admin socket version of + // get_command_descriptions + if (prefix == "get_command_descriptions" && !admin_socket_cmd) { + dout(10) << "reading commands from python modules" << dendl; + const auto py_commands = py_modules.get_commands(); + + int cmdnum = 0; + JSONFormatter f; + f.open_object_section("command_descriptions"); + + auto dump_cmd = [&cmdnum, &f, m](const MonCommand &mc){ + ostringstream secname; + secname << "cmd" << std::setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(&f, m->get_connection()->get_features(), + secname.str(), mc.cmdstring, mc.helpstring, + mc.module, mc.req_perms, 0); + cmdnum++; + }; + + for (const auto &pyc : py_commands) { + dump_cmd(pyc); + } + + for (const auto &mgr_cmd : mgr_commands) { + dump_cmd(mgr_cmd); + } + + f.close_section(); // command_descriptions + f.flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + // lookup command + const MonCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands); + _generate_command_map(cmdctx->cmdmap, param_str_map); + + bool is_allowed = false; + ModuleCommand py_command; + if (admin_socket_cmd) { + // admin socket commands require all capabilities + is_allowed = session->caps.is_allow_all(); + } else if (!mgr_cmd) { + // Resolve the command to the name of the module that will + // handle it (if the command exists) + auto py_commands = py_modules.get_py_commands(); + for (const auto &pyc : py_commands) { + auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring); + if (pyc_prefix == prefix) { + py_command = pyc; + break; + } + } + + MonCommand pyc = {"", "", 
"py", py_command.perm}; + is_allowed = _allowed_command(session, "py", py_command.module_name, + prefix, cmdctx->cmdmap, param_str_map, + &pyc); + } else { + // validate user's permissions for requested command + is_allowed = _allowed_command(session, mgr_cmd->module, "", + prefix, cmdctx->cmdmap, param_str_map, mgr_cmd); + } + + if (!is_allowed) { + log_access_denied(cmdctx, session, ss); + cmdctx->reply(-EACCES, ss); + return true; + } + + audit_clog->debug() + << "from='" << session->inst << "' " + << "entity='" << session->entity_name << "' " + << "cmd=" << cmdctx->cmd << ": dispatch"; + + if (admin_socket_cmd) { + cct->get_admin_socket()->queue_tell_command(cmdctx->m_tell); + return true; + } + + // ---------------- + // service map commands + if (prefix == "service dump") { + if (!f) + f.reset(Formatter::create("json-pretty")); + cluster_state.with_servicemap([&](const ServiceMap &service_map) { + f->dump_object("service_map", service_map); + }); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + if (prefix == "service status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + // only include state from services that are in the persisted service map + f->open_object_section("service_status"); + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + f->open_object_section(type.c_str()); + for (auto& q : service.daemons) { + f->open_object_section(q.first.c_str()); + DaemonKey key{type, q.first}; + ceph_assert(daemon_state.exists(key)); + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + f->dump_stream("status_stamp") << daemon->service_status_stamp; + f->dump_stream("last_beacon") << daemon->last_service_beacon; + f->open_object_section("status"); + for (auto& r : daemon->service_status) { + f->dump_string(r.first.c_str(), r.second); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + f->flush(cmdctx->odata); + cmdctx->reply(0, ss); + return true; + } + + if (prefix == "config set") { + std::string key; + std::string val; + cmd_getval(cmdctx->cmdmap, "key", key); + cmd_getval(cmdctx->cmdmap, "value", val); + r = cct->_conf.set_val(key, val, &ss); + if (r == 0) { + cct->_conf.apply_changes(nullptr); + } + cmdctx->reply(0, ss); + return true; + } + + // ----------- + // PG commands + + if (prefix == "pg scrub" || + prefix == "pg repair" || + prefix == "pg deep-scrub") { + string scrubop = prefix.substr(3, string::npos); + pg_t pgid; + spg_t spgid; + string pgidstr; + cmd_getval(cmdctx->cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool pg_exists = false; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + pg_exists = osdmap.pg_exists(pgid); + }); + if (!pg_exists) { + ss << "pg " << pgid << " does not exist"; + cmdctx->reply(-ENOENT, ss); + return true; + } + int acting_primary = -1; + epoch_t epoch; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + osdmap.get_primary_shard(pgid, &acting_primary, &spgid); + }); + if (acting_primary == -1) { + ss << "pg " << pgid << " has no primary osd"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + auto p = osd_cons.find(acting_primary); + if (p == osd_cons.end()) { + ss << "pg " << pgid << " primary osd." 
<< acting_primary + << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + assert(HAVE_FEATURE(con->get_features(), SERVER_OCTOPUS)); + vector<spg_t> pgs = { spgid }; + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + pgs, + scrubop == "repair", + scrubop == "deep-scrub")); + } + ss << "instructing pg " << spgid << " on osd." << acting_primary + << " to " << scrubop; + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd scrub" || + prefix == "osd deep-scrub" || + prefix == "osd repair") { + string whostr; + cmd_getval(cmdctx->cmdmap, "who", whostr); + vector<string> pvec; + get_str_vec(prefix, pvec); + + set<int> osds; + if (whostr == "*" || whostr == "all" || whostr == "any") { + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (int i = 0; i < osdmap.get_max_osd(); i++) + if (osdmap.is_up(i)) { + osds.insert(i); + } + }); + } else { + long osd = parse_osd_id(whostr.c_str(), &ss); + if (osd < 0) { + ss << "invalid osd '" << whostr << "'"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + if (osdmap.is_up(osd)) { + osds.insert(osd); + } + }); + if (osds.empty()) { + ss << "osd." << osd << " is not up"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + } + set<int> sent_osds, failed_osds; + for (auto osd : osds) { + vector<spg_t> spgs; + epoch_t epoch; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + epoch = osdmap.get_epoch(); + auto p = pgmap.pg_by_osd.find(osd); + if (p != pgmap.pg_by_osd.end()) { + for (auto pgid : p->second) { + int primary; + spg_t spg; + osdmap.get_primary_shard(pgid, &primary, &spg); + if (primary == osd) { + spgs.push_back(spg); + } + } + } + }); + auto p = osd_cons.find(osd); + if (p == osd_cons.end()) { + failed_osds.insert(osd); + } else { + sent_osds.insert(osd); + for (auto& con : p->second) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + spgs, + pvec.back() == "repair", + pvec.back() == "deep-scrub")); + } + } + } + if (failed_osds.size() == osds.size()) { + ss << "failed to instruct osd(s) " << osds << " to " << pvec.back() + << " (not connected)"; + r = -EAGAIN; + } else { + ss << "instructed osd(s) " << sent_osds << " to " << pvec.back(); + if (!failed_osds.empty()) { + ss << "; osd(s) " << failed_osds << " were not connected"; + } + r = 0; + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "osd pool scrub" || + prefix == "osd pool deep-scrub" || + prefix == "osd pool repair") { + vector<string> pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + epoch_t epoch; + map<int32_t, vector<pg_t>> pgs_by_primary; // legacy + map<int32_t, vector<spg_t>> spgs_by_primary; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + epoch = osdmap.get_epoch(); + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) { + pg_t pg(i, pool_id); + int primary; + spg_t spg; + auto got = osdmap.get_primary_shard(pg, &primary, &spg); + if (!got) + continue; + pgs_by_primary[primary].push_back(pg); + spgs_by_primary[primary].push_back(spg); + } + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + 
return true; + } + for (auto& it : spgs_by_primary) { + auto primary = it.first; + auto p = osd_cons.find(primary); + if (p == osd_cons.end()) { + ss << "osd." << primary << " is not currently connected"; + cmdctx->reply(-EAGAIN, ss); + return true; + } + for (auto& con : p->second) { + con->send_message(new MOSDScrub2(monc->get_fsid(), + epoch, + it.second, + prefix == "osd pool repair", + prefix == "osd pool deep-scrub")); + } + } + cmdctx->reply(0, ""); + return true; + } else if (prefix == "osd reweight-by-pg" || + prefix == "osd reweight-by-utilization" || + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization") { + bool by_pg = + prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg"; + bool dry_run = + prefix == "osd test-reweight-by-pg" || + prefix == "osd test-reweight-by-utilization"; + int64_t oload = cmd_getval_or<int64_t>(cmdctx->cmdmap, "oload", 120); + set<int64_t> pools; + vector<string> poolnames; + cmd_getval(cmdctx->cmdmap, "pools", poolnames); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (const auto& poolname : poolnames) { + int64_t pool = osdmap.lookup_pg_pool_name(poolname); + if (pool < 0) { + ss << "pool '" << poolname << "' does not exist"; + r = -ENOENT; + } + pools.insert(pool); + } + }); + if (r) { + cmdctx->reply(r, ss); + return true; + } + + double max_change = g_conf().get_val<double>("mon_reweight_max_change"); + cmd_getval(cmdctx->cmdmap, "max_change", max_change); + if (max_change <= 0.0) { + ss << "max_change " << max_change << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + int64_t max_osds = g_conf().get_val<int64_t>("mon_reweight_max_osds"); + cmd_getval(cmdctx->cmdmap, "max_osds", max_osds); + if (max_osds <= 0) { + ss << "max_osds " << max_osds << " must be positive"; + cmdctx->reply(-EINVAL, ss); + return true; + } + bool no_increasing = false; + cmd_getval_compat_cephbool(cmdctx->cmdmap, "no_increasing", no_increasing); + string out_str; + mempool::osdmap::map<int32_t, uint32_t> new_weights; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap &osdmap, const PGMap& pgmap) { + return reweight::by_utilization(osdmap, pgmap, + oload, + max_change, + max_osds, + by_pg, + pools.empty() ? 
NULL : &pools, + no_increasing, + &new_weights, + &ss, &out_str, f.get()); + }); + if (r >= 0) { + dout(10) << "reweight::by_utilization: finished with " << out_str << dendl; + } + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(out_str); + } + if (r < 0) { + ss << "FAILED reweight-by-pg"; + cmdctx->reply(r, ss); + return true; + } else if (r == 0 || dry_run) { + ss << "no change"; + cmdctx->reply(r, ss); + return true; + } else { + json_spirit::Object json_object; + for (const auto& osd_weight : new_weights) { + json_spirit::Config::add(json_object, + std::to_string(osd_weight.first), + std::to_string(osd_weight.second)); + } + string s = json_spirit::write(json_object); + std::replace(begin(s), end(s), '\"', '\''); + const string cmd = + "{" + "\"prefix\": \"osd reweightn\", " + "\"weights\": \"" + s + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, + &on_finish->from_mon, &on_finish->outs, on_finish); + return true; + } + } else if (prefix == "osd df") { + string method, filter; + cmd_getval(cmdctx->cmdmap, "output_method", method); + cmd_getval(cmdctx->cmdmap, "filter", filter); + stringstream rs; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pgmap) { + // sanity check filter(s) + if (!filter.empty() && + osdmap.lookup_pg_pool_name(filter) < 0 && + !osdmap.crush->class_exists(filter) && + !osdmap.crush->name_exists(filter)) { + rs << "'" << filter << "' not a pool, crush node or device class name"; + return -EINVAL; + } + print_osd_utilization(osdmap, pgmap, ss, + f.get(), method == "tree", filter); + cmdctx->odata.append(ss); + return 0; + }); + cmdctx->reply(r, rs); + return true; + } else if (prefix == "osd pool stats") { + string pool_name; + cmd_getval(cmdctx->cmdmap, "pool_name", pool_name); + int64_t poolid = -ENOENT; + bool one_pool = false; + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (!pool_name.empty()) { + poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + ceph_assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + one_pool = true; + } + stringstream rs; + if (f) + f->open_array_section("pool_stats"); + else { + if (osdmap.get_pools().empty()) { + ss << "there are no pools!"; + goto stats_out; + } + } + for (auto &p : osdmap.get_pools()) { + if (!one_pool) { + poolid = p.first; + } + pg_map.dump_pool_stats_and_io_rate(poolid, osdmap, f.get(), &rs); + if (one_pool) { + break; + } + } + stats_out: + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + return 0; + }); + if (r != -EOPNOTSUPP) { + cmdctx->reply(r, ss); + return true; + } + } else if (prefix == "osd safe-to-destroy" || + prefix == "osd destroy" || + prefix == "osd purge") { + set<int> osds; + int r = 0; + if (prefix == "osd safe-to-destroy") { + vector<string> ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + } else { + int64_t id; + if (!cmd_getval(cmdctx->cmdmap, "id", id)) { + r = -EINVAL; + ss << "must specify OSD id"; + } else { + osds.insert(id); + } + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + set<int> active_osds, missing_stats, stored_pgs, safe_to_destroy; + int affected_pgs = 0; + 
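+    // Classify each requested OSD against the current maps:
+    //  - active_osds:     still mapped (acting or up) for one or more PGs
+    //  - missing_stats:   no usable stats (or not up) while not all PGs are active+clean
+    //  - stored_pgs:      last reported that it still stores PG data
+    //  - safe_to_destroy: none of the above, or already gone from the OSDMap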
cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + if (pg_map.num_pg_unknown > 0) { + ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw" + << " any conclusions"; + r = -EAGAIN; + return; + } + int num_active_clean = 0; + for (auto& p : pg_map.num_pg_by_state) { + unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN; + if ((p.first & want) == want) { + num_active_clean += p.second; + } + } + for (auto osd : osds) { + if (!osdmap.exists(osd)) { + safe_to_destroy.insert(osd); + continue; // clearly safe to destroy + } + auto q = pg_map.num_pg_by_osd.find(osd); + if (q != pg_map.num_pg_by_osd.end()) { + if (q->second.acting > 0 || q->second.up_not_acting > 0) { + active_osds.insert(osd); + // XXX: For overlapping PGs, this counts them again + affected_pgs += q->second.acting + q->second.up_not_acting; + continue; + } + } + if (num_active_clean < pg_map.num_pg) { + // all pgs aren't active+clean; we need to be careful. + auto p = pg_map.osd_stat.find(osd); + if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) { + missing_stats.insert(osd); + continue; + } else if (p->second.num_pgs > 0) { + stored_pgs.insert(osd); + continue; + } + } + safe_to_destroy.insert(osd); + } + }); + if (r && prefix == "osd safe-to-destroy") { + cmdctx->reply(r, ss); // regardless of formatter + return true; + } + if (!r && (!active_osds.empty() || + !missing_stats.empty() || !stored_pgs.empty())) { + if (!safe_to_destroy.empty()) { + ss << "OSD(s) " << safe_to_destroy + << " are safe to destroy without reducing data durability. "; + } + if (!active_osds.empty()) { + ss << "OSD(s) " << active_osds << " have " << affected_pgs + << " pgs currently mapped to them. "; + } + if (!missing_stats.empty()) { + ss << "OSD(s) " << missing_stats << " have no reported stats, and not all" + << " PGs are active+clean; we cannot draw any conclusions. 
"; + } + if (!stored_pgs.empty()) { + ss << "OSD(s) " << stored_pgs << " last reported they still store some PG" + << " data, and not all PGs are active+clean; we cannot be sure they" + << " aren't still needed."; + } + if (!active_osds.empty() || !stored_pgs.empty()) { + r = -EBUSY; + } else { + r = -EAGAIN; + } + } + + if (prefix == "osd safe-to-destroy") { + if (!r) { + ss << "OSD(s) " << osds << " are safe to destroy without reducing data" + << " durability."; + } + if (f) { + f->open_object_section("osd_status"); + f->open_array_section("safe_to_destroy"); + for (auto i : safe_to_destroy) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("active"); + for (auto i : active_osds) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("missing_stats"); + for (auto i : missing_stats) + f->dump_int("osd", i); + f->close_section(); + f->open_array_section("stored_pgs"); + for (auto i : stored_pgs) + f->dump_int("osd", i); + f->close_section(); + f->close_section(); // osd_status + f->flush(cmdctx->odata); + r = 0; + std::stringstream().swap(ss); + } + cmdctx->reply(r, ss); + return true; + } + + if (r) { + bool force = false; + cmd_getval(cmdctx->cmdmap, "force", force); + if (!force) { + // Backward compat + cmd_getval(cmdctx->cmdmap, "yes_i_really_mean_it", force); + } + if (!force) { + ss << "\nYou can proceed by passing --force, but be warned that" + " this will likely mean real, permanent data loss."; + } else { + r = 0; + } + } + if (r) { + cmdctx->reply(r, ss); + return true; + } + const string cmd = + "{" + "\"prefix\": \"" + prefix + "-actual\", " + "\"id\": " + stringify(osds) + ", " + "\"yes_i_really_mean_it\": true" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, {}, nullptr, &on_finish->outs, on_finish); + return true; + } else if (prefix == "osd ok-to-stop") { + vector<string> ids; + cmd_getval(cmdctx->cmdmap, "ids", ids); + set<int> osds; + int64_t max = 1; + cmd_getval(cmdctx->cmdmap, "max", max); + int r; + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + r = osdmap.parse_osd_id_list(ids, &osds, &ss); + }); + if (!r && osds.empty()) { + ss << "must specify one or more OSDs"; + r = -EINVAL; + } + if (max < (int)osds.size()) { + max = osds.size(); + } + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + offline_pg_report out_report; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + _maximize_ok_to_stop_set( + osds, max, osdmap, pg_map, + &out_report); + }); + if (!f) { + f.reset(Formatter::create("json")); + } + f->dump_object("ok_to_stop", out_report); + f->flush(cmdctx->odata); + cmdctx->odata.append("\n"); + if (!out_report.unknown.empty()) { + ss << out_report.unknown.size() << " pgs have unknown state; " + << "cannot draw any conclusions"; + cmdctx->reply(-EAGAIN, ss); + } + if (!out_report.ok_to_stop()) { + ss << "unsafe to stop osd(s) at this time (" << out_report.not_ok.size() << " PGs are or would become offline)"; + cmdctx->reply(-EBUSY, ss); + } else { + cmdctx->reply(0, ss); + } + return true; + } else if (prefix == "pg force-recovery" || + prefix == "pg force-backfill" || + prefix == "pg cancel-force-recovery" || + prefix == "pg cancel-force-backfill" || + prefix == "osd pool force-recovery" || + prefix == "osd pool force-backfill" || + prefix == "osd pool cancel-force-recovery" || + prefix == "osd pool cancel-force-backfill") { + vector<string> vs; + get_str_vec(prefix, vs); + auto& granularity = vs.front(); + auto& forceop = vs.back(); 
+ vector<pg_t> pgs; + + // figure out actual op just once + int actual_op = 0; + if (forceop == "force-recovery") { + actual_op = OFR_RECOVERY; + } else if (forceop == "force-backfill") { + actual_op = OFR_BACKFILL; + } else if (forceop == "cancel-force-backfill") { + actual_op = OFR_BACKFILL | OFR_CANCEL; + } else if (forceop == "cancel-force-recovery") { + actual_op = OFR_RECOVERY | OFR_CANCEL; + } + + set<pg_t> candidates; // deduped + if (granularity == "pg") { + // covnert pg names to pgs, discard any invalid ones while at it + vector<string> pgids; + cmd_getval(cmdctx->cmdmap, "pgid", pgids); + for (auto& i : pgids) { + pg_t pgid; + if (!pgid.parse(i.c_str())) { + ss << "invlaid pgid '" << i << "'; "; + r = -EINVAL; + continue; + } + candidates.insert(pgid); + } + } else { + // per pool + vector<string> pool_names; + cmd_getval(cmdctx->cmdmap, "who", pool_names); + if (pool_names.empty()) { + ss << "must specify one or more pool names"; + cmdctx->reply(-EINVAL, ss); + return true; + } + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + for (auto& pool_name : pool_names) { + auto pool_id = osdmap.lookup_pg_pool_name(pool_name); + if (pool_id < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + return; + } + auto pool_pg_num = osdmap.get_pg_num(pool_id); + for (int i = 0; i < pool_pg_num; i++) + candidates.insert({(unsigned int)i, (uint64_t)pool_id}); + } + }); + if (r < 0) { + cmdctx->reply(r, ss); + return true; + } + } + + cluster_state.with_pgmap([&](const PGMap& pg_map) { + for (auto& i : candidates) { + auto it = pg_map.pg_stat.find(i); + if (it == pg_map.pg_stat.end()) { + ss << "pg " << i << " does not exist; "; + r = -ENOENT; + continue; + } + auto state = it->second.state; + // discard pgs for which user requests are pointless + switch (actual_op) { + case OFR_RECOVERY: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING)) == 0) { + // don't return error, user script may be racing with cluster. + // not fatal. 
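+	      // just note it in the command output and move on to the next pg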
+ ss << "pg " << i << " doesn't require recovery; "; + continue; + } else if (state & PG_STATE_FORCED_RECOVERY) { + ss << "pg " << i << " recovery already forced; "; + // return error, as it may be a bug in user script + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL: + if ((state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING)) == 0) { + ss << "pg " << i << " doesn't require backfilling; "; + continue; + } else if (state & PG_STATE_FORCED_BACKFILL) { + ss << "pg " << i << " backfill already forced; "; + r = -EINVAL; + continue; + } + break; + case OFR_BACKFILL | OFR_CANCEL: + if ((state & PG_STATE_FORCED_BACKFILL) == 0) { + ss << "pg " << i << " backfill not forced; "; + continue; + } + break; + case OFR_RECOVERY | OFR_CANCEL: + if ((state & PG_STATE_FORCED_RECOVERY) == 0) { + ss << "pg " << i << " recovery not forced; "; + continue; + } + break; + default: + ceph_abort_msg("actual_op value is not supported"); + } + pgs.push_back(i); + } // for + }); + + // respond with error only when no pgs are correct + // yes, in case of mixed errors, only the last one will be emitted, + // but the message presented will be fine + if (pgs.size() != 0) { + // clear error to not confuse users/scripts + r = 0; + } + + // optimize the command -> messages conversion, use only one + // message per distinct OSD + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // group pgs to process by osd + map<int, vector<spg_t>> osdpgs; + for (auto& pgid : pgs) { + int primary; + spg_t spg; + if (osdmap.get_primary_shard(pgid, &primary, &spg)) { + osdpgs[primary].push_back(spg); + } + } + for (auto& i : osdpgs) { + if (osdmap.is_up(i.first)) { + auto p = osd_cons.find(i.first); + if (p == osd_cons.end()) { + ss << "osd." << i.first << " is not currently connected"; + r = -EAGAIN; + continue; + } + for (auto& con : p->second) { + con->send_message( + new MOSDForceRecovery(monc->get_fsid(), i.second, actual_op)); + } + ss << "instructing pg(s) " << i.second << " on osd." 
<< i.first + << " to " << forceop << "; "; + } + } + }); + ss << std::endl; + cmdctx->reply(r, ss); + return true; + } else if (prefix == "config show" || + prefix == "config show-with-defaults") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + auto [key, valid] = DaemonKey::parse(who); + if (!valid) { + ss << "invalid daemon name: use <type>.<id>"; + cmdctx->reply(-EINVAL, ss); + return true; + } + DaemonStatePtr daemon = daemon_state.get(key); + if (!daemon) { + ss << "no config state for daemon " << who; + cmdctx->reply(-ENOENT, ss); + return true; + } + + std::lock_guard l(daemon->lock); + + int r = 0; + string name; + if (cmd_getval(cmdctx->cmdmap, "key", name)) { + // handle special options + if (name == "fsid") { + cmdctx->odata.append(stringify(monc->get_fsid()) + "\n"); + cmdctx->reply(r, ss); + return true; + } + auto p = daemon->config.find(name); + if (p != daemon->config.end() && + !p->second.empty()) { + cmdctx->odata.append(p->second.rbegin()->second + "\n"); + } else { + auto& defaults = daemon->_get_config_defaults(); + auto q = defaults.find(name); + if (q != defaults.end()) { + cmdctx->odata.append(q->second + "\n"); + } else { + r = -ENOENT; + } + } + } else if (daemon->config_defaults_bl.length() > 0) { + TextTable tbl; + if (f) { + f->open_array_section("config"); + } else { + tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OVERRIDES", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("IGNORES", TextTable::LEFT, TextTable::LEFT); + } + if (prefix == "config show") { + // show + for (auto& i : daemon->config) { + dout(20) << " " << i.first << " -> " << i.second << dendl; + if (i.second.empty()) { + continue; + } + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + f->dump_string("value", i.second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + i.second.rbegin()->first)); + if (i.second.size() > 1) { + f->open_array_section("overrides"); + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(j->first)); + f->dump_string("value", j->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << i.first; + tbl << i.second.rbegin()->second; + tbl << ceph_conf_level_name(i.second.rbegin()->first); + if (i.second.size() > 1) { + list<string> ov; + auto j = i.second.rend(); + for (--j; j != i.second.rbegin(); --j) { + if (j->second == i.second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(j->first) + + string("[") + j->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(j->first) + + string("[") + j->second + string("]")); + + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? 
"mon" : ""); + tbl << TextTable::endrow; + } + } + } else { + // show-with-defaults + auto& defaults = daemon->_get_config_defaults(); + for (auto& i : defaults) { + if (f) { + f->open_object_section("value"); + f->dump_string("name", i.first); + } else { + tbl << i.first; + } + auto j = daemon->config.find(i.first); + if (j != daemon->config.end() && !j->second.empty()) { + // have config + if (f) { + f->dump_string("value", j->second.rbegin()->second); + f->dump_string("source", ceph_conf_level_name( + j->second.rbegin()->first)); + if (j->second.size() > 1) { + f->open_array_section("overrides"); + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + f->open_object_section("value"); + f->dump_string("source", ceph_conf_level_name(k->first)); + f->dump_string("value", k->second); + f->close_section(); + } + f->close_section(); + } + if (daemon->ignored_mon_config.count(i.first)) { + f->dump_string("ignores", "mon"); + } + f->close_section(); + } else { + tbl << j->second.rbegin()->second; + tbl << ceph_conf_level_name(j->second.rbegin()->first); + if (j->second.size() > 1) { + list<string> ov; + auto k = j->second.rend(); + for (--k; k != j->second.rbegin(); --k) { + if (k->second == j->second.rbegin()->second) { + ov.push_front(string("(") + ceph_conf_level_name(k->first) + + string("[") + k->second + string("]") + + string(")")); + } else { + ov.push_front(ceph_conf_level_name(k->first) + + string("[") + k->second + string("]")); + } + } + tbl << ov; + } else { + tbl << ""; + } + tbl << (daemon->ignored_mon_config.count(i.first) ? "mon" : ""); + tbl << TextTable::endrow; + } + } else { + // only have default + if (f) { + f->dump_string("value", i.second); + f->dump_string("source", ceph_conf_level_name(CONF_DEFAULT)); + f->close_section(); + } else { + tbl << i.second; + tbl << ceph_conf_level_name(CONF_DEFAULT); + tbl << ""; + tbl << ""; + tbl << TextTable::endrow; + } + } + } + } + if (f) { + f->close_section(); + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(stringify(tbl)); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device ls") { + set<string> devids; + TextTable tbl; + if (f) { + f->open_array_section("devices"); + daemon_state.with_devices([&f](const DeviceState& dev) { + f->dump_object("device", dev); + }); + f->close_section(); + f->flush(cmdctx->odata); + } else { + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("WEAR", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("LIFE EXPECTANCY", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + daemon_state.with_devices([&tbl, now](const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + char wear_level_str[16] = {0}; + if (dev.wear_level >= 0) { + snprintf(wear_level_str, sizeof(wear_level_str)-1, "%d%%", + (int)(100.1 * dev.wear_level)); + } + tbl << dev.devid + << h + << d + << wear_level_str + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device ls-by-daemon") { + string who; + cmd_getval(cmdctx->cmdmap, "who", who); + if (auto [k, valid] = 
DaemonKey::parse(who); !valid) { + ss << who << " is not a valid daemon name"; + r = -EINVAL; + } else { + auto dm = daemon_state.get(k); + if (dm) { + if (f) { + f->open_array_section("devices"); + for (auto& i : dm->devices) { + daemon_state.with_device(i.first, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("HOST:DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, + TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& i : dm->devices) { + daemon_state.with_device( + i.first, [&tbl, now] (const DeviceState& dev) { + string h; + for (auto& i : dev.attachments) { + if (h.size()) { + h += " "; + } + h += std::get<0>(i) + ":" + std::get<1>(i); + } + tbl << dev.devid + << h + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + } else { + r = -ENOENT; + ss << "daemon " << who << " not found"; + } + cmdctx->reply(r, ss); + } + } else if (prefix == "device ls-by-host") { + string host; + cmd_getval(cmdctx->cmdmap, "host", host); + set<string> devids; + daemon_state.list_devids_by_server(host, &devids); + if (f) { + f->open_array_section("devices"); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&f] (const DeviceState& dev) { + f->dump_object("device", dev); + }); + } + f->close_section(); + f->flush(cmdctx->odata); + } else { + TextTable tbl; + tbl.define_column("DEVICE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DEV", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DAEMONS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("EXPECTED FAILURE", TextTable::LEFT, TextTable::LEFT); + auto now = ceph_clock_now(); + for (auto& devid : devids) { + daemon_state.with_device( + devid, [&tbl, &host, now] (const DeviceState& dev) { + string n; + for (auto& j : dev.attachments) { + if (std::get<0>(j) == host) { + if (n.size()) { + n += " "; + } + n += std::get<1>(j); + } + } + string d; + for (auto& i : dev.daemons) { + if (d.size()) { + d += " "; + } + d += to_string(i); + } + tbl << dev.devid + << n + << d + << dev.get_life_expectancy_str(now) + << TextTable::endrow; + }); + } + cmdctx->odata.append(stringify(tbl)); + } + cmdctx->reply(0, ss); + return true; + } else if (prefix == "device info") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + int r = 0; + ostringstream rs; + if (!daemon_state.with_device(devid, + [&f, &rs] (const DeviceState& dev) { + if (f) { + f->dump_object("device", dev); + } else { + dev.print(rs); + } + })) { + ss << "device " << devid << " not found"; + r = -ENOENT; + } else { + if (f) { + f->flush(cmdctx->odata); + } else { + cmdctx->odata.append(rs.str()); + } + } + cmdctx->reply(r, ss); + return true; + } else if (prefix == "device set-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + string from_str, to_str; + cmd_getval(cmdctx->cmdmap, "from", from_str); + cmd_getval(cmdctx->cmdmap, "to", to_str); + utime_t from, to; + if (!from.parse(from_str)) { + ss << "unable to parse datetime '" << from_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else if (to_str.size() && !to.parse(to_str)) { + ss << "unable to parse datetime '" << to_str << "'"; + r = -EINVAL; + cmdctx->reply(r, ss); + } else { + map<string,string> meta; + daemon_state.with_device_create( + devid, + [from, 
to, &meta] (DeviceState& dev) { + dev.set_life_expectancy(from, to, ceph_clock_now()); + meta = dev.metadata; + }); + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + bufferlist json; + json.append(json_spirit::write(json_object)); + const string cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } + return true; + } else if (prefix == "device rm-life-expectancy") { + string devid; + cmd_getval(cmdctx->cmdmap, "devid", devid); + map<string,string> meta; + if (daemon_state.with_device_write(devid, [&meta] (DeviceState& dev) { + dev.rm_life_expectancy(); + meta = dev.metadata; + })) { + string cmd; + bufferlist json; + if (meta.empty()) { + cmd = + "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } else { + json_spirit::Object json_object; + for (auto& i : meta) { + json_spirit::Config::add(json_object, i.first, i.second); + } + json.append(json_spirit::write(json_object)); + cmd = + "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"device/" + devid + "\"" + "}"; + } + auto on_finish = new ReplyOnFinish(cmdctx); + monc->start_mon_command({cmd}, json, nullptr, nullptr, on_finish); + } else { + cmdctx->reply(0, ss); + } + return true; + } else { + if (!pgmap_ready) { + ss << "Warning: due to ceph-mgr restart, some PG states may not be up to date\n"; + } + if (f) { + f->open_object_section("pg_info"); + f->dump_bool("pg_ready", pgmap_ready); + } + + // fall back to feeding command to PGMap + r = cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap, + f.get(), &ss, &cmdctx->odata); + }); + + if (f) { + f->close_section(); + } + if (r != -EOPNOTSUPP) { + if (f) { + f->flush(cmdctx->odata); + } + cmdctx->reply(r, ss); + return true; + } + } + + // Was the command unfound? 
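+  // Anything that was not handled above is assumed to belong to a python
+  // module; py_command was resolved earlier (during the permission check),
+  // so an empty cmdstring means no module advertises this prefix.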
+ if (py_command.cmdstring.empty()) { + ss << "No handler found for '" << prefix << "'"; + dout(4) << "No handler found for '" << prefix << "'" << dendl; + cmdctx->reply(-EINVAL, ss); + return true; + } + + // Validate that the module is active + auto& mod_name = py_command.module_name; + if (!py_modules.is_module_active(mod_name)) { + ss << "Module '" << mod_name << "' is not enabled/loaded (required by " + "command '" << prefix << "'): use `ceph mgr module enable " + << mod_name << "` to enable it"; + dout(4) << ss.str() << dendl; + cmdctx->reply(-EOPNOTSUPP, ss); + return true; + } + + dout(10) << "passing through command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl; + Finisher& mod_finisher = py_modules.get_active_module_finisher(mod_name); + mod_finisher.queue(new LambdaContext([this, cmdctx, session, py_command, prefix] + (int r_) mutable { + std::stringstream ss; + + dout(10) << "dispatching command '" << prefix << "' size " << cmdctx->cmdmap.size() << dendl; + + // Validate that the module is enabled + auto& py_handler_name = py_command.module_name; + PyModuleRef module = py_modules.get_module(py_handler_name); + ceph_assert(module); + if (!module->is_enabled()) { + ss << "Module '" << py_handler_name << "' is not enabled (required by " + "command '" << prefix << "'): use `ceph mgr module enable " + << py_handler_name << "` to enable it"; + dout(4) << ss.str() << dendl; + cmdctx->reply(-EOPNOTSUPP, ss); + return; + } + + // Hack: allow the self-test method to run on unhealthy modules. + // Fix this in future by creating a special path for self test rather + // than having the hook be a normal module command. + std::string self_test_prefix = py_handler_name + " " + "self-test"; + + // Validate that the module is healthy + bool accept_command; + if (module->is_loaded()) { + if (module->get_can_run() && !module->is_failed()) { + // Healthy module + accept_command = true; + } else if (self_test_prefix == prefix) { + // Unhealthy, but allow because it's a self test command + accept_command = true; + } else { + accept_command = false; + ss << "Module '" << py_handler_name << "' has experienced an error and " + "cannot handle commands: " << module->get_error_string(); + } + } else { + // Module not loaded + accept_command = false; + ss << "Module '" << py_handler_name << "' failed to load and " + "cannot handle commands: " << module->get_error_string(); + } + + if (!accept_command) { + dout(4) << ss.str() << dendl; + cmdctx->reply(-EIO, ss); + return; + } + + std::stringstream ds; + bufferlist inbl = cmdctx->data; + int r = py_modules.handle_command(py_command, *session, cmdctx->cmdmap, + inbl, &ds, &ss); + if (r == -EACCES) { + log_access_denied(cmdctx, session, ss); + } + + cmdctx->odata.append(ds); + cmdctx->reply(r, ss); + dout(10) << " command returned " << r << dendl; + })); + return true; +} + +void DaemonServer::_prune_pending_service_map() +{ + utime_t cutoff = ceph_clock_now(); + cutoff -= g_conf().get_val<double>("mgr_service_beacon_grace"); + auto p = pending_service_map.services.begin(); + while (p != pending_service_map.services.end()) { + auto q = p->second.daemons.begin(); + while (q != p->second.daemons.end()) { + DaemonKey key{p->first, q->first}; + if (!daemon_state.exists(key)) { + if (ServiceMap::is_normal_ceph_entity(p->first)) { + dout(10) << "daemon " << key << " in service map but not in daemon state " + << "index -- force pruning" << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + derr 
<< "missing key " << key << dendl; + ++q; + } + + continue; + } + + auto daemon = daemon_state.get(key); + std::lock_guard l(daemon->lock); + if (daemon->last_service_beacon == utime_t()) { + // we must have just restarted; assume they are alive now. + daemon->last_service_beacon = ceph_clock_now(); + ++q; + continue; + } + if (daemon->last_service_beacon < cutoff) { + dout(10) << "pruning stale " << p->first << "." << q->first + << " last_beacon " << daemon->last_service_beacon << dendl; + q = p->second.daemons.erase(q); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++q; + } + } + if (p->second.daemons.empty()) { + p = pending_service_map.services.erase(p); + pending_service_map_dirty = pending_service_map.epoch; + } else { + ++p; + } + } +} + +void DaemonServer::send_report() +{ + if (!pgmap_ready) { + if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) { + pgmap_ready = true; + reported_osds.clear(); + dout(1) << "Giving up on OSDs that haven't reported yet, sending " + << "potentially incomplete PG state to mon" << dendl; + } else { + dout(1) << "Not sending PG status to monitor yet, waiting for OSDs" + << dendl; + return; + } + } + + auto m = ceph::make_message<MMonMgrReport>(); + m->gid = monc->get_global_id(); + py_modules.get_health_checks(&m->health_checks); + py_modules.get_progress_events(&m->progress_events); + + cluster_state.with_mutable_pgmap([&](PGMap& pg_map) { + cluster_state.update_delta_stats(); + + if (pending_service_map.epoch) { + _prune_pending_service_map(); + if (pending_service_map_dirty >= pending_service_map.epoch) { + pending_service_map.modified = ceph_clock_now(); + encode(pending_service_map, m->service_map_bl, CEPH_FEATURES_ALL); + dout(10) << "sending service_map e" << pending_service_map.epoch + << dendl; + pending_service_map.epoch++; + } + } + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + // FIXME: no easy way to get mon features here. this will do for + // now, though, as long as we don't make a backward-incompat change. + pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL); + dout(10) << pg_map << dendl; + + pg_map.get_health_checks(g_ceph_context, osdmap, + &m->health_checks); + + dout(10) << m->health_checks.checks.size() << " health checks" + << dendl; + dout(20) << "health checks:\n"; + JSONFormatter jf(true); + jf.dump_object("health_checks", m->health_checks); + jf.flush(*_dout); + *_dout << dendl; + if (osdmap.require_osd_release >= ceph_release_t::luminous) { + clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map; + } + }); + }); + + map<daemon_metric, unique_ptr<DaemonHealthMetricCollector>> accumulated; + for (auto service : {"osd", "mon"} ) { + auto daemons = daemon_state.get_by_service(service); + for (const auto& [key,state] : daemons) { + std::lock_guard l{state->lock}; + for (const auto& metric : state->daemon_health_metrics) { + auto acc = accumulated.find(metric.get_type()); + if (acc == accumulated.end()) { + auto collector = DaemonHealthMetricCollector::create(metric.get_type()); + if (!collector) { + derr << __func__ << " " << key + << " sent me an unknown health metric: " + << std::hex << static_cast<uint8_t>(metric.get_type()) + << std::dec << dendl; + continue; + } + tie(acc, std::ignore) = accumulated.emplace(metric.get_type(), + std::move(collector)); + } + acc->second->update(key, metric); + } + } + } + for (const auto& acc : accumulated) { + acc.second->summarize(m->health_checks); + } + // TODO? 
We currently do not notify the PyModules + // TODO: respect needs_send, so we send the report only if we are asked to do + // so, or the state is updated. + monc->send_mon_message(std::move(m)); +} + +void DaemonServer::adjust_pgs() +{ + dout(20) << dendl; + unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs); + double max_misplaced = g_conf().get_val<double>("target_max_misplaced_ratio"); + bool aggro = g_conf().get_val<bool>("mgr_debug_aggressive_pg_num_changes"); + + map<string,unsigned> pg_num_to_set; + map<string,unsigned> pgp_num_to_set; + set<pg_t> upmaps_to_clear; + cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { + unsigned creating_or_unknown = 0; + for (auto& i : pg_map.num_pg_by_state) { + if ((i.first & (PG_STATE_CREATING)) || + i.first == 0) { + creating_or_unknown += i.second; + } + } + unsigned left = max; + if (creating_or_unknown >= max) { + return; + } + left -= creating_or_unknown; + dout(10) << "creating_or_unknown " << creating_or_unknown + << " max_creating " << max + << " left " << left + << dendl; + + // FIXME: These checks are fundamentally racy given that adjust_pgs() + // can run more frequently than we get updated pg stats from OSDs. We + // may make multiple adjustments with stale informaiton. + double misplaced_ratio, degraded_ratio; + double inactive_pgs_ratio, unknown_pgs_ratio; + pg_map.get_recovery_stats(&misplaced_ratio, °raded_ratio, + &inactive_pgs_ratio, &unknown_pgs_ratio); + dout(20) << "misplaced_ratio " << misplaced_ratio + << " degraded_ratio " << degraded_ratio + << " inactive_pgs_ratio " << inactive_pgs_ratio + << " unknown_pgs_ratio " << unknown_pgs_ratio + << "; target_max_misplaced_ratio " << max_misplaced + << dendl; + + for (auto& i : osdmap.get_pools()) { + const pg_pool_t& p = i.second; + + // adjust pg_num? + if (p.get_pg_num_target() != p.get_pg_num()) { + dout(20) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " target " << p.get_pg_num_target() + << dendl; + if (p.has_flag(pg_pool_t::FLAG_CREATING)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - still creating initial pgs" + << dendl; + } else if (p.get_pg_num_target() < p.get_pg_num()) { + // pg_num decrease (merge) + pg_t merge_source(p.get_pg_num() - 1, i.first); + pg_t merge_target = merge_source.get_parent(); + bool ok = true; + + if (p.get_pg_num() != p.get_pg_num_pending()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease and pg_num_pending != pg_num, waiting" + << dendl; + ok = false; + } else if (p.get_pg_num() == p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - decrease blocked by pgp_num " + << p.get_pgp_num() + << dendl; + ok = false; + } + vector<int32_t> source_acting; + for (auto &merge_participant : {merge_source, merge_target}) { + bool is_merge_source = merge_participant == merge_source; + if (osdmap.have_pg_upmaps(merge_participant)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? 
" - merge source " : " - merge target ") + << merge_participant + << " has upmap" << dendl; + upmaps_to_clear.insert(merge_participant); + ok = false; + } + auto q = pg_map.pg_stat.find(merge_participant); + if (q == pg_map.pg_stat.end()) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - no state for " << merge_participant + << (is_merge_source ? " (merge source)" : " (merge target)") + << dendl; + ok = false; + } else if ((q->second.state & (PG_STATE_ACTIVE | PG_STATE_CLEAN)) != + (PG_STATE_ACTIVE | PG_STATE_CLEAN)) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " not clean (" << pg_state_string(q->second.state) + << ")" << dendl; + ok = false; + } + if (is_merge_source) { + source_acting = q->second.acting; + } else if (ok && q->second.acting != source_acting) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << (is_merge_source ? " - merge source " : " - merge target ") + << merge_participant + << " acting does not match (source " << source_acting + << " != target " << q->second.acting + << ")" << dendl; + ok = false; + } + } + + if (ok) { + unsigned target = p.get_pg_num() - 1; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target + << " (merging " << merge_source + << " and " << merge_target + << ")" << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + continue; + } + } else if (p.get_pg_num_target() > p.get_pg_num()) { + // pg_num increase (split) + bool active = true; + auto q = pg_map.num_pg_by_pool_state.find(i.first); + if (q != pg_map.num_pg_by_pool_state.end()) { + for (auto& j : q->second) { + if ((j.first & (PG_STATE_ACTIVE|PG_STATE_PEERED)) == 0) { + dout(20) << "pool " << i.first << " has " << j.second + << " pgs in " << pg_state_string(j.first) + << dendl; + active = false; + break; + } + } + } else { + active = false; + } + unsigned pg_gap = p.get_pg_num() - p.get_pgp_num(); + unsigned max_jump = cct->_conf->mgr_max_pg_num_change; + if (!active) { + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " - not all pgs active" + << dendl; + } else if (pg_gap >= max_jump) { + dout(10) << "pool " << i.first + << " pg_num " << p.get_pg_num() + << " - pgp_num " << p.get_pgp_num() + << " gap >= max_pg_num_change " << max_jump + << " - must scale pgp_num first" + << dendl; + } else { + unsigned add = std::min( + std::min(left, max_jump - pg_gap), + p.get_pg_num_target() - p.get_pg_num()); + unsigned target = p.get_pg_num() + add; + left -= add; + dout(10) << "pool " << i.first + << " pg_num_target " << p.get_pg_num_target() + << " pg_num " << p.get_pg_num() + << " -> " << target << dendl; + pg_num_to_set[osdmap.get_pool_name(i.first)] = target; + } + } + } + + // adjust pgp_num? 
+ unsigned target = std::min(p.get_pg_num_pending(), + p.get_pgp_num_target()); + if (target != p.get_pgp_num()) { + dout(20) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << target << dendl; + if (target > p.get_pgp_num() && + p.get_pgp_num() == p.get_pg_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - increase blocked by pg_num " << p.get_pg_num() + << dendl; + } else if (!aggro && (inactive_pgs_ratio > 0 || + degraded_ratio > 0 || + unknown_pgs_ratio > 0)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - inactive|degraded|unknown pgs, deferring pgp_num" + << " update" << dendl; + } else if (!aggro && (misplaced_ratio > max_misplaced)) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " - misplaced_ratio " << misplaced_ratio + << " > max " << max_misplaced + << ", deferring pgp_num update" << dendl; + } else { + // NOTE: this calculation assumes objects are + // basically uniformly distributed across all PGs + // (regardless of pool), which is probably not + // perfectly correct, but it's a start. make no + // single adjustment that's more than half of the + // max_misplaced, to somewhat limit the magnitude of + // our potential error here. + unsigned next; + static constexpr unsigned MAX_NUM_OBJECTS_PER_PG_FOR_LEAP = 1; + pool_stat_t s = pg_map.get_pg_pool_sum_stat(i.first); + if (aggro || + // pool is (virtually) empty; just jump to final pgp_num? + (p.get_pgp_num_target() > p.get_pgp_num() && + s.stats.sum.num_objects <= (MAX_NUM_OBJECTS_PER_PG_FOR_LEAP * + p.get_pgp_num_target()))) { + next = target; + } else { + double room = + std::min<double>(max_misplaced - misplaced_ratio, + max_misplaced / 2.0); + unsigned estmax = std::max<unsigned>( + (double)p.get_pg_num() * room, 1u); + unsigned next_min = 0; + if (p.get_pgp_num() > estmax) { + next_min = p.get_pgp_num() - estmax; + } + next = std::clamp(target, + next_min, + p.get_pgp_num() + estmax); + dout(20) << " room " << room << " estmax " << estmax + << " delta " << (target-p.get_pgp_num()) + << " next " << next << dendl; + if (p.get_pgp_num_target() == p.get_pg_num_target() && + p.get_pgp_num_target() < p.get_pg_num()) { + // since pgp_num is tracking pg_num, ceph is handling + // pgp_num. so, be responsible: don't let pgp_num get + // too far out ahead of merges (if we are merging). + // this avoids moving lots of unmerged pgs onto a + // small number of OSDs where we might blow out the + // per-osd pg max. 
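+		// e.g. with pg_num = 1024 and max_misplaced = 0.05,
+		// max_outpace_merges = max(8, 51) = 51, so pgp_num is
+		// kept within 51 of the current pg_num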
+ unsigned max_outpace_merges = + std::max<unsigned>(8, p.get_pg_num() * max_misplaced); + if (next + max_outpace_merges < p.get_pg_num()) { + next = p.get_pg_num() - max_outpace_merges; + dout(10) << " using next " << next + << " to avoid outpacing merges (max_outpace_merges " + << max_outpace_merges << ")" << dendl; + } + } + } + if (next != p.get_pgp_num()) { + dout(10) << "pool " << i.first + << " pgp_num_target " << p.get_pgp_num_target() + << " pgp_num " << p.get_pgp_num() + << " -> " << next << dendl; + pgp_num_to_set[osdmap.get_pool_name(i.first)] = next; + } + } + } + if (left == 0) { + return; + } + } + }); + for (auto i : pg_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pg_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto i : pgp_num_to_set) { + const string cmd = + "{" + "\"prefix\": \"osd pool set\", " + "\"pool\": \"" + i.first + "\", " + "\"var\": \"pgp_num_actual\", " + "\"val\": \"" + stringify(i.second) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + } + for (auto pg : upmaps_to_clear) { + const string cmd = + "{" + "\"prefix\": \"osd rm-pg-upmap\", " + "\"pgid\": \"" + stringify(pg) + "\"" + "}"; + monc->start_mon_command({cmd}, {}, nullptr, nullptr, nullptr); + const string cmd2 = + "{" + "\"prefix\": \"osd rm-pg-upmap-items\", " + "\"pgid\": \"" + stringify(pg) + "\"" + + "}"; + monc->start_mon_command({cmd2}, {}, nullptr, nullptr, nullptr); + } +} + +void DaemonServer::got_service_map() +{ + std::lock_guard l(lock); + + cluster_state.with_servicemap([&](const ServiceMap& service_map) { + if (pending_service_map.epoch == 0) { + // we just started up + dout(10) << "got initial map e" << service_map.epoch << dendl; + ceph_assert(pending_service_map_dirty == 0); + pending_service_map = service_map; + pending_service_map.epoch = service_map.epoch + 1; + } else if (pending_service_map.epoch <= service_map.epoch) { + // we just started up but got one more not our own map + dout(10) << "got newer initial map e" << service_map.epoch << dendl; + ceph_assert(pending_service_map_dirty == 0); + pending_service_map = service_map; + pending_service_map.epoch = service_map.epoch + 1; + } else { + // we already active and therefore must have persisted it, + // which means ours is the same or newer. 
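+      // nothing to copy; keep the pending map we already have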
+ dout(10) << "got updated map e" << service_map.epoch << dendl; + } + }); + + // cull missing daemons, populate new ones + std::set<std::string> types; + for (auto& [type, service] : pending_service_map.services) { + if (ServiceMap::is_normal_ceph_entity(type)) { + continue; + } + + types.insert(type); + + std::set<std::string> names; + for (auto& q : service.daemons) { + names.insert(q.first); + DaemonKey key{type, q.first}; + if (!daemon_state.exists(key)) { + auto daemon = std::make_shared<DaemonState>(daemon_state.types); + daemon->key = key; + daemon->set_metadata(q.second.metadata); + daemon->service_daemon = true; + daemon_state.insert(daemon); + dout(10) << "added missing " << key << dendl; + } + } + daemon_state.cull(type, names); + } + daemon_state.cull_services(types); +} + +void DaemonServer::got_mgr_map() +{ + std::lock_guard l(lock); + set<std::string> have; + cluster_state.with_mgrmap([&](const MgrMap& mgrmap) { + auto md_update = [&] (DaemonKey key) { + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + // FIXME remove post-nautilus: include 'id' for luminous mons + oss << "{\"prefix\": \"mgr metadata\", \"who\": \"" + << key.name << "\", \"id\": \"" << key.name << "\"}"; + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); + }; + if (mgrmap.active_name.size()) { + DaemonKey key{"mgr", mgrmap.active_name}; + have.insert(mgrmap.active_name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + for (auto& i : mgrmap.standbys) { + DaemonKey key{"mgr", i.second.name}; + have.insert(i.second.name); + if (!daemon_state.exists(key) && !daemon_state.is_updating(key)) { + md_update(key); + dout(10) << "triggered addition of " << key << " via metadata update" << dendl; + } + } + }); + daemon_state.cull("mgr", have); +} + +const char** DaemonServer::get_tracked_conf_keys() const +{ + static const char *KEYS[] = { + "mgr_stats_threshold", + "mgr_stats_period", + nullptr + }; + + return KEYS; +} + +void DaemonServer::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + + if (changed.count("mgr_stats_threshold") || changed.count("mgr_stats_period")) { + dout(4) << "Updating stats threshold/period on " + << daemon_connections.size() << " clients" << dendl; + // Send a fresh MMgrConfigure to all clients, so that they can follow + // the new policy for transmitting stats + finisher.queue(new LambdaContext([this](int r) { + std::lock_guard l(lock); + for (auto &c : daemon_connections) { + _send_configure(c); + } + })); + } +} + +void DaemonServer::_send_configure(ConnectionRef c) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + auto configure = make_message<MMgrConfigure>(); + configure->stats_period = g_conf().get_val<int64_t>("mgr_stats_period"); + configure->stats_threshold = g_conf().get_val<int64_t>("mgr_stats_threshold"); + + if (c->peer_is_osd()) { + configure->osd_perf_metric_queries = + osd_perf_metric_collector.get_queries(); + } else if (c->peer_is_mds()) { + configure->metric_config_message = + MetricConfigMessage(MDSConfigPayload(mds_perf_metric_collector.get_queries())); + } + + c->send_message2(configure); +} + +MetricQueryID DaemonServer::add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit) +{ + return osd_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_osd_perf_query(MetricQueryID 
query_id) +{ + return osd_perf_metric_collector.remove_query(query_id); +} + +int DaemonServer::get_osd_perf_counters(OSDPerfCollector *collector) +{ + return osd_perf_metric_collector.get_counters(collector); +} + +MetricQueryID DaemonServer::add_mds_perf_query( + const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit) +{ + return mds_perf_metric_collector.add_query(query, limit); +} + +int DaemonServer::remove_mds_perf_query(MetricQueryID query_id) +{ + return mds_perf_metric_collector.remove_query(query_id); +} + +void DaemonServer::reregister_mds_perf_queries() +{ + mds_perf_metric_collector.reregister_queries(); +} + +int DaemonServer::get_mds_perf_counters(MDSPerfCollector *collector) +{ + return mds_perf_metric_collector.get_counters(collector); +} diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h new file mode 100644 index 000000000..a7b645610 --- /dev/null +++ b/src/mgr/DaemonServer.h @@ -0,0 +1,317 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_SERVER_H_ +#define DAEMON_SERVER_H_ + +#include "PyModuleRegistry.h" + +#include <set> +#include <string> +#include <boost/variant.hpp> + +#include "common/ceph_mutex.h" +#include "common/LogClient.h" +#include "common/Timer.h" + +#include <msg/Messenger.h> +#include <mon/MonClient.h> + +#include "ServiceMap.h" +#include "MgrSession.h" +#include "DaemonState.h" +#include "MetricCollector.h" +#include "OSDPerfMetricCollector.h" +#include "MDSPerfMetricCollector.h" + +class MMgrReport; +class MMgrOpen; +class MMgrUpdate; +class MMgrClose; +class MMonMgrReport; +class MCommand; +class MMgrCommand; +struct MonCommand; +class CommandContext; +struct OSDPerfMetricQuery; +struct MDSPerfMetricQuery; + + +struct offline_pg_report { + set<int> osds; + set<pg_t> ok, not_ok, unknown; + set<pg_t> ok_become_degraded, ok_become_more_degraded; // ok + set<pg_t> bad_no_pool, bad_already_inactive, bad_become_inactive; // not ok + + bool ok_to_stop() const { + return not_ok.empty() && unknown.empty(); + } + + void dump(Formatter *f) const { + f->dump_bool("ok_to_stop", ok_to_stop()); + f->open_array_section("osds"); + for (auto o : osds) { + f->dump_int("osd", o); + } + f->close_section(); + f->dump_unsigned("num_ok_pgs", ok.size()); + f->dump_unsigned("num_not_ok_pgs", not_ok.size()); + + // ambiguous + if (!unknown.empty()) { + f->open_array_section("unknown_pgs"); + for (auto pg : unknown) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + + // bad news + if (!bad_no_pool.empty()) { + f->open_array_section("bad_no_pool_pgs"); + for (auto pg : bad_no_pool) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!bad_already_inactive.empty()) { + f->open_array_section("bad_already_inactive"); + for (auto pg : bad_already_inactive) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!bad_become_inactive.empty()) { + f->open_array_section("bad_become_inactive"); + for (auto pg : bad_become_inactive) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + + // informative + if (!ok_become_degraded.empty()) { + f->open_array_section("ok_become_degraded"); + for (auto pg : 
ok_become_degraded) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + if (!ok_become_more_degraded.empty()) { + f->open_array_section("ok_become_more_degraded"); + for (auto pg : ok_become_more_degraded) { + f->dump_stream("pg") << pg; + } + f->close_section(); + } + } +}; + + +/** + * Server used in ceph-mgr to communicate with Ceph daemons like + * MDSs and OSDs. + */ +class DaemonServer : public Dispatcher, public md_config_obs_t +{ +protected: + boost::scoped_ptr<Throttle> client_byte_throttler; + boost::scoped_ptr<Throttle> client_msg_throttler; + boost::scoped_ptr<Throttle> osd_byte_throttler; + boost::scoped_ptr<Throttle> osd_msg_throttler; + boost::scoped_ptr<Throttle> mds_byte_throttler; + boost::scoped_ptr<Throttle> mds_msg_throttler; + boost::scoped_ptr<Throttle> mon_byte_throttler; + boost::scoped_ptr<Throttle> mon_msg_throttler; + + Messenger *msgr; + MonClient *monc; + Finisher &finisher; + DaemonStateIndex &daemon_state; + ClusterState &cluster_state; + PyModuleRegistry &py_modules; + LogChannelRef clog, audit_clog; + + // Connections for daemons, and clients with service names set + // (i.e. those MgrClients that are allowed to send MMgrReports) + std::set<ConnectionRef> daemon_connections; + + /// connections for osds + ceph::unordered_map<int,std::set<ConnectionRef>> osd_cons; + + ServiceMap pending_service_map; // uncommitted + + epoch_t pending_service_map_dirty = 0; + + ceph::mutex lock = ceph::make_mutex("DaemonServer"); + + static void _generate_command_map(cmdmap_t& cmdmap, + std::map<std::string,std::string> ¶m_str_map); + static const MonCommand *_get_mgrcommand(const std::string &cmd_prefix, + const std::vector<MonCommand> &commands); + bool _allowed_command( + MgrSession *s, const std::string &service, const std::string &module, + const std::string &prefix, const cmdmap_t& cmdmap, + const std::map<std::string,std::string>& param_str_map, + const MonCommand *this_cmd); + +private: + friend class ReplyOnFinish; + bool _reply(MCommand* m, + int ret, const std::string& s, const bufferlist& payload); + + void _prune_pending_service_map(); + + void _check_offlines_pgs( + const std::set<int>& osds, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report); + void _maximize_ok_to_stop_set( + const set<int>& orig_osds, + unsigned max, + const OSDMap& osdmap, + const PGMap& pgmap, + offline_pg_report *report); + + utime_t started_at; + std::atomic<bool> pgmap_ready; + std::set<int32_t> reported_osds; + void maybe_ready(int32_t osd_id); + + SafeTimer timer; + bool shutting_down; + Context *tick_event; + void tick(); + void schedule_tick_locked(double delay_sec); + + class OSDPerfMetricCollectorListener : public MetricListener { + public: + OSDPerfMetricCollectorListener(DaemonServer *server) + : server(server) { + } + void handle_query_updated() override { + server->handle_osd_perf_metric_query_updated(); + } + private: + DaemonServer *server; + }; + OSDPerfMetricCollectorListener osd_perf_metric_collector_listener; + OSDPerfMetricCollector osd_perf_metric_collector; + void handle_osd_perf_metric_query_updated(); + + class MDSPerfMetricCollectorListener : public MetricListener { + public: + MDSPerfMetricCollectorListener(DaemonServer *server) + : server(server) { + } + void handle_query_updated() override { + server->handle_mds_perf_metric_query_updated(); + } + private: + DaemonServer *server; + }; + MDSPerfMetricCollectorListener mds_perf_metric_collector_listener; + MDSPerfMetricCollector mds_perf_metric_collector; + void 
handle_mds_perf_metric_query_updated(); + + void handle_metric_payload(const OSDMetricPayload &payload) { + osd_perf_metric_collector.process_reports(payload); + } + + void handle_metric_payload(const MDSMetricPayload &payload) { + mds_perf_metric_collector.process_reports(payload); + } + + void handle_metric_payload(const UnknownMetricPayload &payload) { + ceph_abort(); + } + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + DaemonServer *server; + + HandlePayloadVisitor(DaemonServer *server) + : server(server) { + } + + template <typename MetricPayload> + inline void operator()(const MetricPayload &payload) const { + server->handle_metric_payload(payload); + } + }; + + void update_task_status(DaemonKey key, + const std::map<std::string,std::string>& task_status); + +public: + int init(uint64_t gid, entity_addrvec_t client_addrs); + void shutdown(); + + entity_addrvec_t get_myaddrs() const; + + DaemonServer(MonClient *monc_, + Finisher &finisher_, + DaemonStateIndex &daemon_state_, + ClusterState &cluster_state_, + PyModuleRegistry &py_modules_, + LogChannelRef cl, + LogChannelRef auditcl); + ~DaemonServer() override; + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + int ms_handle_fast_authentication(Connection *con) override; + void ms_handle_accept(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + void fetch_missing_metadata(const DaemonKey& key, const entity_addr_t& addr); + bool handle_open(const ceph::ref_t<MMgrOpen>& m); + bool handle_update(const ceph::ref_t<MMgrUpdate>& m); + bool handle_close(const ceph::ref_t<MMgrClose>& m); + bool handle_report(const ceph::ref_t<MMgrReport>& m); + bool handle_command(const ceph::ref_t<MCommand>& m); + bool handle_command(const ceph::ref_t<MMgrCommand>& m); + bool _handle_command(std::shared_ptr<CommandContext>& cmdctx); + void send_report(); + void got_service_map(); + void got_mgr_map(); + void adjust_pgs(); + + void _send_configure(ConnectionRef c); + + MetricQueryID add_osd_perf_query( + const OSDPerfMetricQuery &query, + const std::optional<OSDPerfMetricLimit> &limit); + int remove_osd_perf_query(MetricQueryID query_id); + int get_osd_perf_counters(OSDPerfCollector *collector); + + MetricQueryID add_mds_perf_query(const MDSPerfMetricQuery &query, + const std::optional<MDSPerfMetricLimit> &limit); + int remove_mds_perf_query(MetricQueryID query_id); + void reregister_mds_perf_queries(); + int get_mds_perf_counters(MDSPerfCollector *collector); + + virtual const char** get_tracked_conf_keys() const override; + virtual void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + + void schedule_tick(double delay_sec); + + void log_access_denied(std::shared_ptr<CommandContext>& cmdctx, + MgrSession* session, std::stringstream& ss); + void dump_pg_ready(ceph::Formatter *f); +}; + +#endif + diff --git a/src/mgr/DaemonState.cc b/src/mgr/DaemonState.cc new file mode 100644 index 000000000..044ddadad --- /dev/null +++ b/src/mgr/DaemonState.cc @@ -0,0 +1,434 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the 
Free Software + * Foundation. See file COPYING. + */ + +#include "DaemonState.h" + +#include <experimental/iterator> + +#include "MgrSession.h" +#include "include/stringify.h" +#include "common/Formatter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using std::list; +using std::make_pair; +using std::map; +using std::ostream; +using std::ostringstream; +using std::string; +using std::stringstream; +using std::unique_ptr; + +void DeviceState::set_metadata(map<string,string>&& m) +{ + metadata = std::move(m); + auto p = metadata.find("life_expectancy_min"); + if (p != metadata.end()) { + life_expectancy.first.parse(p->second); + } + p = metadata.find("life_expectancy_max"); + if (p != metadata.end()) { + life_expectancy.second.parse(p->second); + } + p = metadata.find("life_expectancy_stamp"); + if (p != metadata.end()) { + life_expectancy_stamp.parse(p->second); + } + p = metadata.find("wear_level"); + if (p != metadata.end()) { + wear_level = atof(p->second.c_str()); + } +} + +void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now) +{ + life_expectancy = make_pair(from, to); + life_expectancy_stamp = now; + if (from != utime_t()) { + metadata["life_expectancy_min"] = stringify(from); + } else { + metadata["life_expectancy_min"] = ""; + } + if (to != utime_t()) { + metadata["life_expectancy_max"] = stringify(to); + } else { + metadata["life_expectancy_max"] = ""; + } + if (now != utime_t()) { + metadata["life_expectancy_stamp"] = stringify(now); + } else { + metadata["life_expectancy_stamp"] = ""; + } +} + +void DeviceState::rm_life_expectancy() +{ + life_expectancy = make_pair(utime_t(), utime_t()); + life_expectancy_stamp = utime_t(); + metadata.erase("life_expectancy_min"); + metadata.erase("life_expectancy_max"); + metadata.erase("life_expectancy_stamp"); +} + +void DeviceState::set_wear_level(float wear) +{ + wear_level = wear; + if (wear >= 0) { + metadata["wear_level"] = stringify(wear); + } else { + metadata.erase("wear_level"); + } +} + +string DeviceState::get_life_expectancy_str(utime_t now) const +{ + if (life_expectancy.first == utime_t()) { + return string(); + } + if (now >= life_expectancy.first) { + return "now"; + } + utime_t min = life_expectancy.first - now; + utime_t max = life_expectancy.second - now; + if (life_expectancy.second == utime_t()) { + return string(">") + timespan_str(make_timespan(min)); + } + string a = timespan_str(make_timespan(min)); + string b = timespan_str(make_timespan(max)); + if (a == b) { + return a; + } + return a + " to " + b; +} + +void DeviceState::dump(Formatter *f) const +{ + f->dump_string("devid", devid); + f->open_array_section("location"); + for (auto& i : attachments) { + f->open_object_section("attachment"); + f->dump_string("host", std::get<0>(i)); + f->dump_string("dev", std::get<1>(i)); + f->dump_string("path", std::get<2>(i)); + f->close_section(); + } + f->close_section(); + f->open_array_section("daemons"); + for (auto& i : daemons) { + f->dump_stream("daemon") << i; + } + f->close_section(); + if (life_expectancy.first != utime_t()) { + f->dump_stream("life_expectancy_min") << life_expectancy.first; + f->dump_stream("life_expectancy_max") << life_expectancy.second; + f->dump_stream("life_expectancy_stamp") + << life_expectancy_stamp; + } + if (wear_level >= 0) { + f->dump_float("wear_level", wear_level); + } +} + +void DeviceState::print(ostream& out) const +{ + out << "device " << devid << 
"\n"; + for (auto& i : attachments) { + out << "attachment " << std::get<0>(i) << " " << std::get<1>(i) << " " + << std::get<2>(i) << "\n"; + out << "\n"; + } + std::copy(std::begin(daemons), std::end(daemons), + std::experimental::make_ostream_joiner(out, ",")); + out << '\n'; + if (life_expectancy.first != utime_t()) { + out << "life_expectancy " << life_expectancy.first << " to " + << life_expectancy.second + << " (as of " << life_expectancy_stamp << ")\n"; + } + if (wear_level >= 0) { + out << "wear_level " << wear_level << "\n"; + } +} + +void DaemonState::set_metadata(const std::map<std::string,std::string>& m) +{ + devices.clear(); + devices_bypath.clear(); + metadata = m; + if (auto found = m.find("device_ids"); found != m.end()) { + auto& device_ids = found->second; + std::map<std::string,std::string> paths; // devname -> id or path + if (auto found = m.find("device_paths"); found != m.end()) { + get_str_map(found->second, &paths, ",; "); + } + for_each_pair( + device_ids, ",; ", + [&paths, this](std::string_view devname, std::string_view id) { + // skip blank ids + if (id.empty()) { + return; + } + // id -> devname + devices.emplace(id, devname); + if (auto path = paths.find(std::string(id)); path != paths.end()) { + // id -> path + devices_bypath.emplace(id, path->second); + } + }); + } + if (auto found = m.find("hostname"); found != m.end()) { + hostname = found->second; + } +} + +const std::map<std::string,std::string>& DaemonState::_get_config_defaults() +{ + if (config_defaults.empty() && + config_defaults_bl.length()) { + auto p = config_defaults_bl.cbegin(); + try { + decode(config_defaults, p); + } catch (buffer::error& e) { + } + } + return config_defaults; +} + +void DaemonStateIndex::insert(DaemonStatePtr dm) +{ + std::unique_lock l{lock}; + _insert(dm); +} + +void DaemonStateIndex::_insert(DaemonStatePtr dm) +{ + if (all.count(dm->key)) { + _erase(dm->key); + } + + by_server[dm->hostname][dm->key] = dm; + all[dm->key] = dm; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + d->daemons.insert(dm->key); + auto p = dm->devices_bypath.find(i.first); + if (p != dm->devices_bypath.end()) { + d->attachments.insert(std::make_tuple(dm->hostname, i.second, p->second)); + } else { + d->attachments.insert(std::make_tuple(dm->hostname, i.second, + std::string())); + } + } +} + +void DaemonStateIndex::_erase(const DaemonKey& dmk) +{ + ceph_assert(ceph_mutex_is_wlocked(lock)); + + const auto to_erase = all.find(dmk); + ceph_assert(to_erase != all.end()); + const auto dm = to_erase->second; + + for (auto& i : dm->devices) { + auto d = _get_or_create_device(i.first); + ceph_assert(d->daemons.count(dmk)); + d->daemons.erase(dmk); + auto p = dm->devices_bypath.find(i.first); + if (p != dm->devices_bypath.end()) { + d->attachments.erase(make_tuple(dm->hostname, i.second, p->second)); + } else { + d->attachments.erase(make_tuple(dm->hostname, i.second, std::string())); + } + if (d->empty()) { + _erase_device(d); + } + } + + auto &server_collection = by_server[dm->hostname]; + server_collection.erase(dm->key); + if (server_collection.empty()) { + by_server.erase(dm->hostname); + } + + all.erase(to_erase); +} + +DaemonStateCollection DaemonStateIndex::get_by_service( + const std::string& svc) const +{ + std::shared_lock l{lock}; + + DaemonStateCollection result; + + for (const auto& [key, state] : all) { + if (key.type == svc) { + result[key] = state; + } + } + + return result; +} + +DaemonStateCollection DaemonStateIndex::get_by_server( + const std::string 
&hostname) const +{ + std::shared_lock l{lock}; + + if (auto found = by_server.find(hostname); found != by_server.end()) { + return found->second; + } else { + return {}; + } +} + +bool DaemonStateIndex::exists(const DaemonKey &key) const +{ + std::shared_lock l{lock}; + + return all.count(key) > 0; +} + +DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key) +{ + std::shared_lock l{lock}; + + auto iter = all.find(key); + if (iter != all.end()) { + return iter->second; + } else { + return nullptr; + } +} + +void DaemonStateIndex::rm(const DaemonKey &key) +{ + std::unique_lock l{lock}; + _rm(key); +} + +void DaemonStateIndex::_rm(const DaemonKey &key) +{ + if (all.count(key)) { + _erase(key); + } +} + +void DaemonStateIndex::cull(const std::string& svc_name, + const std::set<std::string>& names_exist) +{ + std::vector<string> victims; + + std::unique_lock l{lock}; + auto begin = all.lower_bound({svc_name, ""}); + auto end = all.end(); + for (auto &i = begin; i != end; ++i) { + const auto& daemon_key = i->first; + if (daemon_key.type != svc_name) + break; + if (names_exist.count(daemon_key.name) == 0) { + victims.push_back(daemon_key.name); + } + } + + for (auto &i : victims) { + DaemonKey daemon_key{svc_name, i}; + dout(4) << "Removing data for " << daemon_key << dendl; + _erase(daemon_key); + } +} + +void DaemonStateIndex::cull_services(const std::set<std::string>& types_exist) +{ + std::set<DaemonKey> victims; + + std::unique_lock l{lock}; + for (auto it = all.begin(); it != all.end(); ++it) { + const auto& daemon_key = it->first; + if (it->second->service_daemon && + types_exist.count(daemon_key.type) == 0) { + victims.insert(daemon_key); + } + } + + for (auto &i : victims) { + dout(4) << "Removing data for " << i << dendl; + _erase(i); + } +} + +void DaemonPerfCounters::update(const MMgrReport& report) +{ + dout(20) << "loading " << report.declare_types.size() << " new types, " + << report.undeclare_types.size() << " old types, had " + << types.size() << " types, got " + << report.packed.length() << " bytes of data" << dendl; + + // Retrieve session state + auto priv = report.get_connection()->get_priv(); + auto session = static_cast<MgrSession*>(priv.get()); + + // Load any newly declared types + for (const auto &t : report.declare_types) { + types.insert(std::make_pair(t.path, t)); + session->declared_types.insert(t.path); + } + // Remove any old types + for (const auto &t : report.undeclare_types) { + session->declared_types.erase(t); + } + + const auto now = ceph_clock_now(); + + // Parse packed data according to declared set of types + auto p = report.packed.cbegin(); + DECODE_START(1, p); + for (const auto &t_path : session->declared_types) { + const auto &t = types.at(t_path); + auto instances_it = instances.find(t_path); + // Always check the instance exists, as we don't prevent yet + // multiple sessions from daemons with the same name, and one + // session clearing stats created by another on open. 
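+    //
+    // (Editorial note, not from the original source.) Packed layout per
+    // declared counter decoded below: one u64 value, plus two extra u64
+    // fields (avgcount, avgcount2) only when the counter type has
+    // PERFCOUNTER_LONGRUNAVG set.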
+ if (instances_it == instances.end()) { + instances_it = instances.insert({t_path, t.type}).first; + } + uint64_t val = 0; + uint64_t avgcount = 0; + uint64_t avgcount2 = 0; + + decode(val, p); + if (t.type & PERFCOUNTER_LONGRUNAVG) { + decode(avgcount, p); + decode(avgcount2, p); + instances_it->second.push_avg(now, val, avgcount); + } else { + instances_it->second.push(now, val); + } + } + DECODE_FINISH(p); +} + +void PerfCounterInstance::push(utime_t t, uint64_t const &v) +{ + buffer.push_back({t, v}); +} + +void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s, + uint64_t const &c) +{ + avg_buffer.push_back({t, s, c}); +} diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h new file mode 100644 index 000000000..0688db81b --- /dev/null +++ b/src/mgr/DaemonState.h @@ -0,0 +1,370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef DAEMON_STATE_H_ +#define DAEMON_STATE_H_ + +#include <map> +#include <string> +#include <memory> +#include <set> +#include <boost/circular_buffer.hpp> + +#include "include/str_map.h" + +#include "msg/msg_types.h" + +// For PerfCounterType +#include "messages/MMgrReport.h" +#include "DaemonKey.h" + +namespace ceph { + class Formatter; +} + +// An instance of a performance counter type, within +// a particular daemon. +class PerfCounterInstance +{ + class DataPoint + { + public: + utime_t t; + uint64_t v; + DataPoint(utime_t t_, uint64_t v_) + : t(t_), v(v_) + {} + }; + + class AvgDataPoint + { + public: + utime_t t; + uint64_t s; + uint64_t c; + AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_) + : t(t_), s(s_), c(c_) + {} + }; + + boost::circular_buffer<DataPoint> buffer; + boost::circular_buffer<AvgDataPoint> avg_buffer; + + uint64_t get_current() const; + + public: + const boost::circular_buffer<DataPoint> & get_data() const + { + return buffer; + } + const DataPoint& get_latest_data() const + { + return buffer.back(); + } + const boost::circular_buffer<AvgDataPoint> & get_data_avg() const + { + return avg_buffer; + } + const AvgDataPoint& get_latest_data_avg() const + { + return avg_buffer.back(); + } + void push(utime_t t, uint64_t const &v); + void push_avg(utime_t t, uint64_t const &s, uint64_t const &c); + + PerfCounterInstance(enum perfcounter_type_d type) + { + if (type & PERFCOUNTER_LONGRUNAVG) + avg_buffer = boost::circular_buffer<AvgDataPoint>(20); + else + buffer = boost::circular_buffer<DataPoint>(20); + }; +}; + + +typedef std::map<std::string, PerfCounterType> PerfCounterTypes; + +// Performance counters for one daemon +class DaemonPerfCounters +{ + public: + // The record of perf stat types, shared between daemons + PerfCounterTypes &types; + + explicit DaemonPerfCounters(PerfCounterTypes &types_) + : types(types_) + {} + + std::map<std::string, PerfCounterInstance> instances; + + void update(const MMgrReport& report); + + void clear() + { + instances.clear(); + } +}; + +// The state that we store about one daemon +class DaemonState +{ + public: + ceph::mutex lock = ceph::make_mutex("DaemonState::lock"); + + DaemonKey key; + + // The hostname where daemon was last seen running (extracted + // from the metadata) + std::string hostname; + + 
// The metadata (hostname, version, etc) sent from the daemon + std::map<std::string, std::string> metadata; + + /// device ids -> devname, derived from metadata[device_ids] + std::map<std::string,std::string> devices; + + /// device ids -> by-path, derived from metadata[device_ids] + std::map<std::string,std::string> devices_bypath; + + // TODO: this can be generalized to other daemons + std::vector<DaemonHealthMetric> daemon_health_metrics; + + // Ephemeral state + bool service_daemon = false; + utime_t service_status_stamp; + std::map<std::string, std::string> service_status; + utime_t last_service_beacon; + + // running config + std::map<std::string,std::map<int32_t,std::string>> config; + + // mon config values we failed to set + std::map<std::string,std::string> ignored_mon_config; + + // compiled-in config defaults (rarely used, so we leave them encoded!) + bufferlist config_defaults_bl; + std::map<std::string,std::string> config_defaults; + + // The perf counters received in MMgrReport messages + DaemonPerfCounters perf_counters; + + explicit DaemonState(PerfCounterTypes &types_) + : perf_counters(types_) + { + } + void set_metadata(const std::map<std::string,std::string>& m); + const std::map<std::string,std::string>& _get_config_defaults(); +}; + +typedef std::shared_ptr<DaemonState> DaemonStatePtr; +typedef std::map<DaemonKey, DaemonStatePtr> DaemonStateCollection; + + +struct DeviceState : public RefCountedObject +{ + std::string devid; + /// (server,devname,path) + std::set<std::tuple<std::string,std::string,std::string>> attachments; + std::set<DaemonKey> daemons; + + std::map<std::string,std::string> metadata; ///< persistent metadata + + std::pair<utime_t,utime_t> life_expectancy; ///< when device failure is expected + utime_t life_expectancy_stamp; ///< when life expectency was recorded + float wear_level = -1; ///< SSD wear level (negative if unknown) + + void set_metadata(std::map<std::string,std::string>&& m); + + void set_life_expectancy(utime_t from, utime_t to, utime_t now); + void rm_life_expectancy(); + + void set_wear_level(float wear); + + std::string get_life_expectancy_str(utime_t now) const; + + /// true of we can be safely forgotten/removed from memory + bool empty() const { + return daemons.empty() && metadata.empty(); + } + + void dump(Formatter *f) const; + void print(std::ostream& out) const; + +private: + FRIEND_MAKE_REF(DeviceState); + DeviceState(const std::string& n) : devid(n) {} +}; + +/** + * Fuse the collection of per-daemon metadata from Ceph into + * a view that can be queried by service type, ID or also + * by server (aka fqdn). + */ +class DaemonStateIndex +{ +private: + mutable ceph::shared_mutex lock = + ceph::make_shared_mutex("DaemonStateIndex", true, true, true); + + std::map<std::string, DaemonStateCollection> by_server; + DaemonStateCollection all; + std::set<DaemonKey> updating; + + std::map<std::string,ceph::ref_t<DeviceState>> devices; + + void _erase(const DaemonKey& dmk); + + ceph::ref_t<DeviceState> _get_or_create_device(const std::string& dev) { + auto em = devices.try_emplace(dev, nullptr); + auto& d = em.first->second; + if (em.second) { + d = ceph::make_ref<DeviceState>(dev); + } + return d; + } + void _erase_device(const ceph::ref_t<DeviceState>& d) { + devices.erase(d->devid); + } + +public: + DaemonStateIndex() {} + + // FIXME: shouldn't really be public, maybe construct DaemonState + // objects internally to avoid this. 
+ PerfCounterTypes types; + + void insert(DaemonStatePtr dm); + void _insert(DaemonStatePtr dm); + bool exists(const DaemonKey &key) const; + DaemonStatePtr get(const DaemonKey &key); + void rm(const DaemonKey &key); + void _rm(const DaemonKey &key); + + // Note that these return by value rather than reference to avoid + // callers needing to stay in lock while using result. Callers must + // still take the individual DaemonState::lock on each entry though. + DaemonStateCollection get_by_server(const std::string &hostname) const; + DaemonStateCollection get_by_service(const std::string &svc_name) const; + DaemonStateCollection get_all() const {return all;} + + template<typename Callback, typename...Args> + auto with_daemons_by_server(Callback&& cb, Args&&... args) const -> + decltype(cb(by_server, std::forward<Args>(args)...)) { + std::shared_lock l{lock}; + + return std::forward<Callback>(cb)(by_server, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + bool with_device(const std::string& dev, + Callback&& cb, Args&&... args) const { + std::shared_lock l{lock}; + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...); + return true; + } + + template<typename Callback, typename...Args> + bool with_device_write(const std::string& dev, + Callback&& cb, Args&&... args) { + std::unique_lock l{lock}; + auto p = devices.find(dev); + if (p == devices.end()) { + return false; + } + std::forward<Callback>(cb)(*p->second, std::forward<Args>(args)...); + if (p->second->empty()) { + _erase_device(p->second); + } + return true; + } + + template<typename Callback, typename...Args> + void with_device_create(const std::string& dev, + Callback&& cb, Args&&... args) { + std::unique_lock l{lock}; + auto d = _get_or_create_device(dev); + std::forward<Callback>(cb)(*d, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + void with_devices(Callback&& cb, Args&&... args) const { + std::shared_lock l{lock}; + for (auto& i : devices) { + std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...); + } + } + + template<typename CallbackInitial, typename Callback, typename...Args> + void with_devices2(CallbackInitial&& cbi, // with lock taken + Callback&& cb, // for each device + Args&&... args) const { + std::shared_lock l{lock}; + cbi(); + for (auto& i : devices) { + std::forward<Callback>(cb)(*i.second, std::forward<Args>(args)...); + } + } + + void list_devids_by_server(const std::string& server, + std::set<std::string> *ls) { + auto m = get_by_server(server); + for (auto& i : m) { + std::lock_guard l(i.second->lock); + for (auto& j : i.second->devices) { + ls->insert(j.first); + } + } + } + + void notify_updating(const DaemonKey &k) { + std::unique_lock l{lock}; + updating.insert(k); + } + void clear_updating(const DaemonKey &k) { + std::unique_lock l{lock}; + updating.erase(k); + } + bool is_updating(const DaemonKey &k) { + std::shared_lock l{lock}; + return updating.count(k) > 0; + } + + void update_metadata(DaemonStatePtr state, + const std::map<std::string,std::string>& meta) { + // remove and re-insert in case the device metadata changed + std::unique_lock l{lock}; + _rm(state->key); + { + std::lock_guard l2{state->lock}; + state->set_metadata(meta); + } + _insert(state); + } + + /** + * Remove state for all daemons of this type whose names are + * not present in `names_exist`. 
Use this function when you have + * a cluster map and want to ensure that anything absent in the map + * is also absent in this class. + */ + void cull(const std::string& svc_name, + const std::set<std::string>& names_exist); + void cull_services(const std::set<std::string>& types_exist); +}; + +#endif + diff --git a/src/mgr/Gil.cc b/src/mgr/Gil.cc new file mode 100644 index 000000000..de27b9acd --- /dev/null +++ b/src/mgr/Gil.cc @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "Python.h" + +#include "common/debug.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +#include "Gil.h" + +SafeThreadState::SafeThreadState(PyThreadState *ts_) + : ts(ts_) +{ + ceph_assert(ts != nullptr); + thread = pthread_self(); +} + +Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts) +{ + // Acquire the GIL, set the current thread state + PyEval_RestoreThread(pThreadState.ts); + dout(25) << "GIL acquired for thread state " << pThreadState.ts << dendl; + + // + // If called from a separate OS thread (i.e. a thread not created + // by Python, that doesn't already have a python thread state that + // was created when that thread was active), we need to manually + // create and switch to a python thread state specifically for this + // OS thread. + // + // Note that instead of requiring the caller to set new_thread == true + // when calling this from a separate OS thread, we could figure out + // if this was necessary automatically, as follows: + // + // if (pThreadState->thread_id != PyThread_get_thread_ident()) { + // + // However, this means we're accessing pThreadState->thread_id, but + // the Python C API docs say that "The only public data member is + // PyInterpreterState *interp", i.e. doing this would violate + // something that's meant to be a black box.
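+  //
+  // Illustrative caller-side sketch (editorial addition, not part of the
+  // original source). A non-Python OS thread, e.g. a finisher thread, would
+  // typically pass new_thread == true; 'module_thread_state' is a
+  // hypothetical SafeThreadState owned by the module:
+  //
+  //   Gil gil(module_thread_state, true);
+  //   // ... call into Python (PyObject_CallMethod, etc.) with the GIL held ...
+  //   // the temporary thread state is destroyed and the GIL released when
+  //   // 'gil' goes out of scope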
+ // + if (new_thread) { + pNewThreadState = PyThreadState_New(pThreadState.ts->interp); + PyThreadState_Swap(pNewThreadState); + dout(20) << "Switched to new thread state " << pNewThreadState << dendl; + } else { + ceph_assert(pthread_self() == pThreadState.thread); + } +} + +Gil::~Gil() +{ + if (pNewThreadState != nullptr) { + dout(20) << "Destroying new thread state " << pNewThreadState << dendl; + PyThreadState_Swap(pThreadState.ts); + PyThreadState_Clear(pNewThreadState); + PyThreadState_Delete(pNewThreadState); + } + // Release the GIL, reset the thread state to NULL + PyEval_SaveThread(); + dout(25) << "GIL released for thread state " << pThreadState.ts << dendl; +} + +without_gil_t::without_gil_t() +{ + assert(PyGILState_Check()); + release_gil(); +} + +without_gil_t::~without_gil_t() +{ + if (save) { + acquire_gil(); + } +} + +void without_gil_t::release_gil() +{ + save = PyEval_SaveThread(); +} + +void without_gil_t::acquire_gil() +{ + assert(save); + PyEval_RestoreThread(save); + save = nullptr; +} + +with_gil_t::with_gil_t(without_gil_t& allow_threads) + : allow_threads{allow_threads} +{ + allow_threads.acquire_gil(); +} + +with_gil_t::~with_gil_t() +{ + allow_threads.release_gil(); +} diff --git a/src/mgr/Gil.h b/src/mgr/Gil.h new file mode 100644 index 000000000..72675a503 --- /dev/null +++ b/src/mgr/Gil.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <cassert> +#include <functional> + +struct _ts; +typedef struct _ts PyThreadState; + +#include <pthread.h> + + +/** + * Wrap PyThreadState to carry a record of which POSIX thread + * the thread state relates to. This allows the Gil class to + * validate that we're being used from the right thread. + */ +class SafeThreadState +{ + public: + explicit SafeThreadState(PyThreadState *ts_); + + SafeThreadState() + : ts(nullptr), thread(0) + { + } + + PyThreadState *ts; + pthread_t thread; + + void set(PyThreadState *ts_) + { + ts = ts_; + thread = pthread_self(); + } +}; + +// +// Use one of these in any scope in which you need to hold Python's +// Global Interpreter Lock. +// +// Do *not* nest these, as a second GIL acquire will deadlock (see +// https://docs.python.org/2/c-api/init.html#c.PyEval_RestoreThread) +// +// If in doubt, explicitly put a scope around the block of code you +// know you need the GIL in. +// +// See the comment in Gil::Gil for when to set new_thread == true +// +class Gil { +public: + Gil(const Gil&) = delete; + Gil& operator=(const Gil&) = delete; + + Gil(SafeThreadState &ts, bool new_thread = false); + ~Gil(); + +private: + SafeThreadState &pThreadState; + PyThreadState *pNewThreadState = nullptr; +}; + +// because the Python runtime could relinquish the GIL when performing GC +// and re-acquire it afterwards, we should enforce following locking policy: +// 1. do not acquire locks when holding the GIL, use a without_gil or +// without_gil_t to guard the code which acquires non-gil locks. +// 2. always hold a GIL when calling python functions, for example, when +// constructing a PyFormatter instance. 
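+//
+// Illustrative sketch of that policy (editorial addition, not part of the
+// original header); 'module_lock' stands in for any hypothetical non-GIL
+// mutex:
+//
+//   without_gil_t no_gil;               // 1. drop the GIL before locking
+//   std::lock_guard l(module_lock);
+//   with_gil(no_gil, [&] {
+//     PyFormatter f;                    // 2. GIL re-acquired for Python work
+//   });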
+// +// a wrapper that provides a convenient RAII-style mechinary for acquiring +// and releasing GIL, like the macros of Py_BEGIN_ALLOW_THREADS and +// Py_END_ALLOW_THREADS. +struct without_gil_t { + without_gil_t(); + ~without_gil_t(); + void release_gil(); + void acquire_gil(); +private: + PyThreadState *save = nullptr; + friend struct with_gil_t; +}; + +struct with_gil_t { + with_gil_t(without_gil_t& allow_threads); + ~with_gil_t(); +private: + without_gil_t& allow_threads; +}; + +// invoke func with GIL acquired +template<typename Func> +auto with_gil(without_gil_t& no_gil, Func&& func) { + with_gil_t gil{no_gil}; + return std::invoke(std::forward<Func>(func)); +} + +template<typename Func> +auto without_gil(Func&& func) { + without_gil_t no_gil; + return std::invoke(std::forward<Func>(func)); +} diff --git a/src/mgr/MDSPerfMetricCollector.cc b/src/mgr/MDSPerfMetricCollector.cc new file mode 100644 index 000000000..62298aba3 --- /dev/null +++ b/src/mgr/MDSPerfMetricCollector.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "messages/MMgrReport.h" +#include "mgr/MDSPerfMetricTypes.h" +#include "mgr/MDSPerfMetricCollector.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.mds_perf_metric_collector " << __func__ << " " + +MDSPerfMetricCollector::MDSPerfMetricCollector(MetricListener &listener) + : MetricCollector<MDSPerfMetricQuery, + MDSPerfMetricLimit, + MDSPerfMetricKey, + MDSPerfMetrics>(listener) { +} + +void MDSPerfMetricCollector::process_reports(const MetricPayload &payload) { + const MDSPerfMetricReport &metric_report = boost::get<MDSMetricPayload>(payload).metric_report; + + std::lock_guard locker(lock); + process_reports_generic( + metric_report.reports, [](PerformanceCounter *counter, const PerformanceCounter &update) { + counter->first = update.first; + counter->second = update.second; + }); + + // update delayed rank set + delayed_ranks = metric_report.rank_metrics_delayed; + dout(20) << ": delayed ranks=[" << delayed_ranks << "]" << dendl; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &last_updated_mono); +} + +int MDSPerfMetricCollector::get_counters(PerfCollector *collector) { + MDSPerfCollector *c = static_cast<MDSPerfCollector *>(collector); + + std::lock_guard locker(lock); + + int r = get_counters_generic(c->query_id, &c->counters); + if (r != 0) { + return r; + } + + get_delayed_ranks(&c->delayed_ranks); + + get_last_updated(&c->last_updated_mono); + return r; +} + +void MDSPerfMetricCollector::get_delayed_ranks(std::set<mds_rank_t> *ranks) { + ceph_assert(ceph_mutex_is_locked(lock)); + *ranks = delayed_ranks; +} + +void MDSPerfMetricCollector::get_last_updated(utime_t *ts) { + ceph_assert(ceph_mutex_is_locked(lock)); + *ts = utime_t(last_updated_mono); +} diff --git a/src/mgr/MDSPerfMetricCollector.h b/src/mgr/MDSPerfMetricCollector.h new file mode 100644 index 000000000..c72bce091 --- /dev/null +++ b/src/mgr/MDSPerfMetricCollector.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MDS_PERF_COLLECTOR_H +#define CEPH_MGR_MDS_PERF_COLLECTOR_H + +#include "mgr/MetricCollector.h" +#include "mgr/MDSPerfMetricTypes.h" + +// MDS performance query class +class MDSPerfMetricCollector + : public MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey, + 
MDSPerfMetrics> { +private: + std::set<mds_rank_t> delayed_ranks; + struct timespec last_updated_mono; + + void get_delayed_ranks(std::set<mds_rank_t> *ranks); + + void get_last_updated(utime_t *ts); +public: + MDSPerfMetricCollector(MetricListener &listener); + + void process_reports(const MetricPayload &payload) override; + int get_counters(PerfCollector *collector) override; +}; + +#endif // CEPH_MGR_MDS_PERF_COLLECTOR_H diff --git a/src/mgr/MDSPerfMetricTypes.cc b/src/mgr/MDSPerfMetricTypes.cc new file mode 100644 index 000000000..a16003774 --- /dev/null +++ b/src/mgr/MDSPerfMetricTypes.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <ostream> +#include "mgr/MDSPerfMetricTypes.h" + +std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d) { + switch (d.type) { + case MDSPerfMetricSubKeyType::MDS_RANK: + os << "mds_rank"; + break; + case MDSPerfMetricSubKeyType::CLIENT_ID: + os << "client_id"; + break; + default: + os << "unknown (" << static_cast<int>(d.type) << ")"; + } + + return os << "~/" << d.regex_str << "/"; +} + +void MDSPerformanceCounterDescriptor::pack_counter( + const PerformanceCounter &c, bufferlist *bl) const { + using ceph::encode; + encode(c.first, *bl); + encode(c.second, *bl); + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +void MDSPerformanceCounterDescriptor::unpack_counter( + bufferlist::const_iterator& bl, PerformanceCounter *c) const { + using ceph::decode; + decode(c->first, bl); + decode(c->second, bl); + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + 
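+// Illustrative round trip (editorial addition, not part of the original
+// source): pack_counter() and unpack_counter() are symmetric, so a counter
+// survives encoding through a bufferlist; 'hits' and 'total' are
+// hypothetical uint64_t values:
+//
+//   MDSPerformanceCounterDescriptor d(MDSPerformanceCounterType::CAP_HIT_METRIC);
+//   bufferlist bl;
+//   d.pack_counter({hits, total}, &bl);
+//   auto it = bl.cbegin();
+//   PerformanceCounter out;
+//   d.unpack_counter(it, &out);   // out.first == hits, out.second == total
+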
+std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d) { + switch(d.type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + os << "cap_hit_metric"; + break; + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + os << "read_latency_metric"; + break; + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + os << "write_latency_metric"; + break; + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + os << "metadata_latency_metric"; + break; + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + os << "dentry_lease_metric"; + break; + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + os << "opened_files_metric"; + break; + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + os << "pinned_icaps_metric"; + break; + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + os << "opened_inodes_metric"; + break; + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + os << "read_io_sizes_metric"; + break; + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + os << "write_io_sizes_metric"; + break; + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + os << "avg_read_latency"; + break; + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + os << "stdev_read_latency"; + break; + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + os << "avg_write_latency"; + break; + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + os << "stdev_write_latency"; + break; + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + os << "avg_metadata_latency"; + break; + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + os << "stdev_metadata_latency"; + break; + } + + return os; +} + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit) { + return os << "[order_by=" << limit.order_by << ", max_count=" << limit.max_count << "]"; +} + +void MDSPerfMetricQuery::pack_counters(const PerformanceCounters &counters, + bufferlist *bl) const { + auto it = counters.begin(); + for (auto &descriptor : performance_counter_descriptors) { + if (it == counters.end()) { + descriptor.pack_counter(PerformanceCounter(), bl); + } else { + descriptor.pack_counter(*it, bl); + it++; + } + } +} + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query) { + return os << "[key=" << query.key_descriptor << ", counter=" + << query.performance_counter_descriptors << "]"; +} diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h new file mode 100644 index 000000000..aa35b8cab --- /dev/null +++ b/src/mgr/MDSPerfMetricTypes.h @@ -0,0 +1,367 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MDS_PERF_METRIC_TYPES_H +#define CEPH_MGR_MDS_PERF_METRIC_TYPES_H + +#include <regex> +#include <vector> +#include <iostream> + +#include "include/denc.h" +#include "include/stringify.h" + +#include "mds/mdstypes.h" +#include "mgr/Types.h" + +typedef std::vector<std::string> MDSPerfMetricSubKey; // array of regex match +typedef std::vector<MDSPerfMetricSubKey> MDSPerfMetricKey; + +enum class MDSPerfMetricSubKeyType : uint8_t { + MDS_RANK = 0, + CLIENT_ID = 1, +}; + +struct MDSPerfMetricSubKeyDescriptor { + MDSPerfMetricSubKeyType type = static_cast<MDSPerfMetricSubKeyType>(-1); + std::string regex_str; + std::regex regex; + + bool is_supported() const { + switch (type) { + case MDSPerfMetricSubKeyType::MDS_RANK: + case MDSPerfMetricSubKeyType::CLIENT_ID: + return true; + default: + return false; + } + } + + 
MDSPerfMetricSubKeyDescriptor() { + } + MDSPerfMetricSubKeyDescriptor(MDSPerfMetricSubKeyType type, const std::string ®ex_str) + : type(type), regex_str(regex_str) { + } + + bool operator<(const MDSPerfMetricSubKeyDescriptor &other) const { + if (type < other.type) { + return true; + } + if (type > other.type) { + return false; + } + return regex_str < other.regex_str; + } + + DENC(MDSPerfMetricSubKeyDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.regex_str, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(MDSPerfMetricSubKeyDescriptor) + +std::ostream& operator<<(std::ostream& os, const MDSPerfMetricSubKeyDescriptor &d); +typedef std::vector<MDSPerfMetricSubKeyDescriptor> MDSPerfMetricKeyDescriptor; + +template<> +struct denc_traits<MDSPerfMetricKeyDescriptor> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const MDSPerfMetricKeyDescriptor& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const MDSPerfMetricKeyDescriptor& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(MDSPerfMetricKeyDescriptor& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + MDSPerfMetricSubKeyDescriptor d; + denc(d, p); + if (!d.is_supported()) { + v.clear(); + return; + } + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + v.clear(); + return; + } + if (d.regex.mark_count() == 0) { + v.clear(); + return; + } + v.push_back(std::move(d)); + } + } +}; + +enum class MDSPerformanceCounterType : uint8_t { + CAP_HIT_METRIC = 0, + READ_LATENCY_METRIC = 1, + WRITE_LATENCY_METRIC = 2, + METADATA_LATENCY_METRIC = 3, + DENTRY_LEASE_METRIC = 4, + OPENED_FILES_METRIC = 5, + PINNED_ICAPS_METRIC = 6, + OPENED_INODES_METRIC = 7, + READ_IO_SIZES_METRIC = 8, + WRITE_IO_SIZES_METRIC = 9, + AVG_READ_LATENCY_METRIC = 10, + STDEV_READ_LATENCY_METRIC = 11, + AVG_WRITE_LATENCY_METRIC = 12, + STDEV_WRITE_LATENCY_METRIC = 13, + AVG_METADATA_LATENCY_METRIC = 14, + STDEV_METADATA_LATENCY_METRIC = 15, +}; + +struct MDSPerformanceCounterDescriptor { + MDSPerformanceCounterType type = static_cast<MDSPerformanceCounterType>(-1); + + bool is_supported() const { + switch(type) { + case MDSPerformanceCounterType::CAP_HIT_METRIC: + case MDSPerformanceCounterType::READ_LATENCY_METRIC: + case MDSPerformanceCounterType::WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::DENTRY_LEASE_METRIC: + case MDSPerformanceCounterType::OPENED_FILES_METRIC: + case MDSPerformanceCounterType::PINNED_ICAPS_METRIC: + case MDSPerformanceCounterType::OPENED_INODES_METRIC: + case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: + case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + return true; + default: + return false; + 
} + } + + MDSPerformanceCounterDescriptor() { + } + MDSPerformanceCounterDescriptor(MDSPerformanceCounterType type) : type(type) { + } + + bool operator<(const MDSPerformanceCounterDescriptor &other) const { + return type < other.type; + } + + bool operator==(const MDSPerformanceCounterDescriptor &other) const { + return type == other.type; + } + + bool operator!=(const MDSPerformanceCounterDescriptor &other) const { + return type != other.type; + } + + DENC(MDSPerformanceCounterDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + DENC_FINISH(p); + } + + void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const; + void unpack_counter(ceph::buffer::list::const_iterator& bl, PerformanceCounter *c) const; +}; +WRITE_CLASS_DENC(MDSPerformanceCounterDescriptor) + +std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor &d); +typedef std::vector<MDSPerformanceCounterDescriptor> MDSPerformanceCounterDescriptors; + +template<> +struct denc_traits<MDSPerformanceCounterDescriptors> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const MDSPerformanceCounterDescriptors& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const MDSPerformanceCounterDescriptors& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(MDSPerformanceCounterDescriptors& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + MDSPerformanceCounterDescriptor d; + denc(d, p); + if (d.is_supported()) { + v.push_back(std::move(d)); + } + } + } +}; + +struct MDSPerfMetricLimit { + MDSPerformanceCounterDescriptor order_by; + uint64_t max_count; + + MDSPerfMetricLimit() { + } + MDSPerfMetricLimit(const MDSPerformanceCounterDescriptor &order_by, uint64_t max_count) + : order_by(order_by), max_count(max_count) { + } + + bool operator<(const MDSPerfMetricLimit &other) const { + if (order_by != other.order_by) { + return order_by < other.order_by; + } + + return max_count < other.max_count; + } + + DENC(MDSPerfMetricLimit, v, p) { + DENC_START(1, 1, p); + denc(v.order_by, p); + denc(v.max_count, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(MDSPerfMetricLimit) + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricLimit &limit); +typedef std::set<MDSPerfMetricLimit> MDSPerfMetricLimits; + +struct MDSPerfMetricQuery { + MDSPerfMetricKeyDescriptor key_descriptor; + MDSPerformanceCounterDescriptors performance_counter_descriptors; + + MDSPerfMetricQuery() { + } + MDSPerfMetricQuery(const MDSPerfMetricKeyDescriptor &key_descriptor, + const MDSPerformanceCounterDescriptors &performance_counter_descriptors) + : key_descriptor(key_descriptor), + performance_counter_descriptors(performance_counter_descriptors) + { + } + + bool operator<(const MDSPerfMetricQuery &other) const { + if (key_descriptor < other.key_descriptor) { + return true; + } + if (key_descriptor > other.key_descriptor) { + return false; + } + return performance_counter_descriptors < other.performance_counter_descriptors; + } + + template <typename L> + bool get_key(L&& get_sub_key, MDSPerfMetricKey *key) const { + for (auto &sub_key_descriptor : 
key_descriptor) { + MDSPerfMetricSubKey sub_key; + if (!get_sub_key(sub_key_descriptor, &sub_key)) { + return false; + } + key->push_back(sub_key); + } + return true; + } + + void get_performance_counter_descriptors(MDSPerformanceCounterDescriptors *descriptors) const { + *descriptors = performance_counter_descriptors; + } + + template <typename L> + void update_counters(L &&update_counter, PerformanceCounters *counters) const { + auto it = counters->begin(); + for (auto &descriptor : performance_counter_descriptors) { + // TODO: optimize + if (it == counters->end()) { + counters->push_back(PerformanceCounter()); + it = std::prev(counters->end()); + } + update_counter(descriptor, &(*it)); + it++; + } + } + + DENC(MDSPerfMetricQuery, v, p) { + DENC_START(1, 1, p); + denc(v.key_descriptor, p); + denc(v.performance_counter_descriptors, p); + DENC_FINISH(p); + } + + void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; +}; +WRITE_CLASS_DENC(MDSPerfMetricQuery) + +std::ostream &operator<<(std::ostream &os, const MDSPerfMetricQuery &query); + +struct MDSPerfCollector : PerfCollector { + std::map<MDSPerfMetricKey, PerformanceCounters> counters; + std::set<mds_rank_t> delayed_ranks; + utime_t last_updated_mono; + + MDSPerfCollector(MetricQueryID query_id) + : PerfCollector(query_id) { + } +}; + +struct MDSPerfMetrics { + MDSPerformanceCounterDescriptors performance_counter_descriptors; + std::map<MDSPerfMetricKey, ceph::buffer::list> group_packed_performance_counters; + + DENC(MDSPerfMetrics, v, p) { + DENC_START(1, 1, p); + denc(v.performance_counter_descriptors, p); + denc(v.group_packed_performance_counters, p); + DENC_FINISH(p); + } +}; + +struct MDSPerfMetricReport { + std::map<MDSPerfMetricQuery, MDSPerfMetrics> reports; + // set of active ranks that have delayed (stale) metrics + std::set<mds_rank_t> rank_metrics_delayed; + + DENC(MDSPerfMetricReport, v, p) { + DENC_START(1, 1, p); + denc(v.reports, p); + denc(v.rank_metrics_delayed, p); + DENC_FINISH(p); + } +}; + +WRITE_CLASS_DENC(MDSPerfMetrics) +WRITE_CLASS_DENC(MDSPerfMetricReport) + +#endif // CEPH_MGR_MDS_PERF_METRIC_TYPES_H diff --git a/src/mgr/MetricCollector.cc b/src/mgr/MetricCollector.cc new file mode 100644 index 000000000..c31dcf0b9 --- /dev/null +++ b/src/mgr/MetricCollector.cc @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "mgr/MetricCollector.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "mgr/MDSPerfMetricTypes.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.metric_collector " << __func__ << ": " + +template <typename Query, typename Limit, typename Key, typename Report> +MetricCollector<Query, Limit, Key, Report>::MetricCollector(MetricListener &listener) + : listener(listener) +{ +} + +template <typename Query, typename Limit, typename Key, typename Report> +MetricQueryID MetricCollector<Query, Limit, Key, Report>::add_query( + const Query &query, + const std::optional<Limit> &limit) { + dout(20) << "query=" << query << ", limit=" << limit << dendl; + uint64_t query_id; + bool notify = false; + + { + std::lock_guard locker(lock); + + query_id = next_query_id++; + auto it = queries.find(query); + if (it == queries.end()) { + it = queries.emplace(query, std::map<MetricQueryID, OptionalLimit>{}).first; + notify = true; + } else if (is_limited(it->second)) { + notify 
= true; + } + + it->second.emplace(query_id, limit); + counters.emplace(query_id, std::map<Key, PerformanceCounters>{}); + } + + dout(10) << query << " " << (limit ? stringify(*limit) : "unlimited") + << " query_id=" << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return query_id; +} + +template <typename Query, typename Limit, typename Key, typename Report> +int MetricCollector<Query, Limit, Key, Report>::remove_query(MetricQueryID query_id) { + dout(20) << "query_id=" << query_id << dendl; + bool found = false; + bool notify = false; + + { + std::lock_guard locker(lock); + + for (auto it = queries.begin() ; it != queries.end();) { + auto iter = it->second.find(query_id); + if (iter == it->second.end()) { + ++it; + continue; + } + + it->second.erase(iter); + if (it->second.empty()) { + it = queries.erase(it); + notify = true; + } else if (is_limited(it->second)) { + ++it; + notify = true; + } + found = true; + break; + } + counters.erase(query_id); + } + + if (!found) { + dout(10) << query_id << " not found" << dendl; + return -ENOENT; + } + + dout(10) << query_id << dendl; + + if (notify) { + listener.handle_query_updated(); + } + + return 0; +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::remove_all_queries() { + dout(20) << dendl; + bool notify; + + { + std::lock_guard locker(lock); + + notify = !queries.empty(); + queries.clear(); + } + + if (notify) { + listener.handle_query_updated(); + } +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::reregister_queries() { + dout(20) << dendl; + listener.handle_query_updated(); +} + +template <typename Query, typename Limit, typename Key, typename Report> +int MetricCollector<Query, Limit, Key, Report>::get_counters_generic( + MetricQueryID query_id, std::map<Key, PerformanceCounters> *c) { + dout(20) << dendl; + ceph_assert(ceph_mutex_is_locked(lock)); + + auto it = counters.find(query_id); + if (it == counters.end()) { + dout(10) << "counters for " << query_id << " not found" << dendl; + return -ENOENT; + } + + *c = std::move(it->second); + it->second.clear(); + + return 0; +} + +template <typename Query, typename Limit, typename Key, typename Report> +void MetricCollector<Query, Limit, Key, Report>::process_reports_generic( + const std::map<Query, Report> &reports, UpdateCallback callback) { + ceph_assert(ceph_mutex_is_locked(lock)); + + if (reports.empty()) { + return; + } + + for (auto& [query, report] : reports) { + dout(10) << "report for " << query << " query: " + << report.group_packed_performance_counters.size() << " records" + << dendl; + + for (auto& [key, bl] : report.group_packed_performance_counters) { + auto bl_it = bl.cbegin(); + + for (auto& p : queries[query]) { + auto &key_counters = counters[p.first][key]; + if (key_counters.empty()) { + key_counters.resize(query.performance_counter_descriptors.size(), + {0, 0}); + } + } + + auto desc_it = report.performance_counter_descriptors.begin(); + for (size_t i = 0; i < query.performance_counter_descriptors.size(); i++) { + if (desc_it == report.performance_counter_descriptors.end()) { + break; + } + if (*desc_it != query.performance_counter_descriptors[i]) { + continue; + } + PerformanceCounter c; + desc_it->unpack_counter(bl_it, &c); + dout(20) << "counter " << key << " " << *desc_it << ": " << c << dendl; + + for (auto& p : queries[query]) { + auto &key_counters = counters[p.first][key]; + 
callback(&key_counters[i], c); + } + desc_it++; + } + } + } +} + +template class +MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey, OSDPerfMetricReport>; +template class +MetricCollector<MDSPerfMetricQuery, MDSPerfMetricLimit, MDSPerfMetricKey, MDSPerfMetrics>; diff --git a/src/mgr/MetricCollector.h b/src/mgr/MetricCollector.h new file mode 100644 index 000000000..91fa78781 --- /dev/null +++ b/src/mgr/MetricCollector.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_METRIC_COLLECTOR_H +#define CEPH_MGR_METRIC_COLLECTOR_H + +#include <map> +#include <set> +#include <tuple> +#include <vector> +#include <utility> +#include <algorithm> + +#include "common/ceph_mutex.h" +#include "msg/Message.h" +#include "mgr/Types.h" +#include "mgr/MetricTypes.h" + +class MMgrReport; + +template <typename Query, typename Limit, typename Key, typename Report> +class MetricCollector { +public: + virtual ~MetricCollector() { + } + + using Limits = std::set<Limit>; + + MetricCollector(MetricListener &listener); + + MetricQueryID add_query(const Query &query, const std::optional<Limit> &limit); + + int remove_query(MetricQueryID query_id); + + void remove_all_queries(); + + void reregister_queries(); + + std::map<Query, Limits> get_queries() const { + std::lock_guard locker(lock); + + std::map<Query, Limits> result; + for (auto& [query, limits] : queries) { + auto result_it = result.insert({query, {}}).first; + if (is_limited(limits)) { + for (auto& limit : limits) { + if (limit.second) { + result_it->second.insert(*limit.second); + } + } + } + } + + return result; + } + + virtual void process_reports(const MetricPayload &payload) = 0; + virtual int get_counters(PerfCollector *collector) = 0; + +protected: + typedef std::optional<Limit> OptionalLimit; + typedef std::map<MetricQueryID, OptionalLimit> QueryIDLimit; + typedef std::map<Query, QueryIDLimit> Queries; + typedef std::map<MetricQueryID, std::map<Key, PerformanceCounters>> Counters; + typedef std::function<void(PerformanceCounter *, const PerformanceCounter &)> UpdateCallback; + + mutable ceph::mutex lock = ceph::make_mutex("mgr::metric::collector::lock"); + + Queries queries; + Counters counters; + + void process_reports_generic(const std::map<Query, Report> &reports, UpdateCallback callback); + int get_counters_generic(MetricQueryID query_id, std::map<Key, PerformanceCounters> *counters); + +private: + MetricListener &listener; + MetricQueryID next_query_id = 0; + + bool is_limited(const std::map<MetricQueryID, OptionalLimit> &limits) const { + return std::any_of(begin(limits), end(limits), + [](auto &limits) { return limits.second.has_value(); }); + } +}; + +#endif // CEPH_MGR_METRIC_COLLECTOR_H diff --git a/src/mgr/MetricTypes.h b/src/mgr/MetricTypes.h new file mode 100644 index 000000000..586c470ca --- /dev/null +++ b/src/mgr/MetricTypes.h @@ -0,0 +1,277 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_METRIC_TYPES_H +#define CEPH_MGR_METRIC_TYPES_H + +#include <boost/variant.hpp> +#include "include/denc.h" +#include "include/ceph_features.h" +#include "mgr/OSDPerfMetricTypes.h" +#include "mgr/MDSPerfMetricTypes.h" + +enum class MetricReportType { + METRIC_REPORT_TYPE_OSD = 0, + METRIC_REPORT_TYPE_MDS = 1, +}; + +struct OSDMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_OSD; + std::map<OSDPerfMetricQuery, 
OSDPerfMetricReport> report; + + OSDMetricPayload() { + } + OSDMetricPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &report) + : report(report) { + } + + DENC(OSDMetricPayload, v, p) { + DENC_START(1, 1, p); + denc(v.report, p); + DENC_FINISH(p); + } +}; + +struct MDSMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = MetricReportType::METRIC_REPORT_TYPE_MDS; + MDSPerfMetricReport metric_report; + + MDSMetricPayload() { + } + MDSMetricPayload(const MDSPerfMetricReport &metric_report) + : metric_report(metric_report) { + } + + DENC(MDSMetricPayload, v, p) { + DENC_START(1, 1, p); + denc(v.metric_report, p); + DENC_FINISH(p); + } +}; + +struct UnknownMetricPayload { + static const MetricReportType METRIC_REPORT_TYPE = static_cast<MetricReportType>(-1); + + UnknownMetricPayload() { } + + DENC(UnknownMetricPayload, v, p) { + ceph_abort(); + } +}; + +WRITE_CLASS_DENC(OSDMetricPayload) +WRITE_CLASS_DENC(MDSMetricPayload) +WRITE_CLASS_DENC(UnknownMetricPayload) + +typedef boost::variant<OSDMetricPayload, + MDSMetricPayload, + UnknownMetricPayload> MetricPayload; + +class EncodeMetricPayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodeMetricPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { + } + + template <typename MetricPayload> + inline void operator()(const MetricPayload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(MetricPayload::METRIC_REPORT_TYPE), m_bl); + encode(payload, m_bl); + } + +private: + ceph::buffer::list &m_bl; +}; + +class DecodeMetricPayloadVisitor : public boost::static_visitor<void> { +public: + DecodeMetricPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) { + } + + template <typename MetricPayload> + inline void operator()(MetricPayload &payload) const { + using ceph::decode; + decode(payload, m_iter); + } + +private: + ceph::buffer::list::const_iterator &m_iter; +}; + +struct MetricReportMessage { + MetricPayload payload; + + MetricReportMessage(const MetricPayload &payload = UnknownMetricPayload()) + : payload(payload) { + } + + bool should_encode(uint64_t features) const { + if (!HAVE_FEATURE(features, SERVER_PACIFIC) && + boost::get<MDSMetricPayload>(&payload)) { + return false; + } + return true; + } + + void encode(ceph::buffer::list &bl) const { + boost::apply_visitor(EncodeMetricPayloadVisitor(bl), payload); + } + + void decode(ceph::buffer::list::const_iterator &iter) { + using ceph::decode; + + uint32_t metric_report_type; + decode(metric_report_type, iter); + + switch (static_cast<MetricReportType>(metric_report_type)) { + case MetricReportType::METRIC_REPORT_TYPE_OSD: + payload = OSDMetricPayload(); + break; + case MetricReportType::METRIC_REPORT_TYPE_MDS: + payload = MDSMetricPayload(); + break; + default: + payload = UnknownMetricPayload(); + break; + } + + boost::apply_visitor(DecodeMetricPayloadVisitor(iter), payload); + } +}; + +WRITE_CLASS_ENCODER(MetricReportMessage); + +// variant for sending configure message to mgr clients + +enum MetricConfigType { + METRIC_CONFIG_TYPE_OSD = 0, + METRIC_CONFIG_TYPE_MDS = 1, +}; + +struct OSDConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_OSD; + std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> config; + + OSDConfigPayload() { + } + OSDConfigPayload(const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &config) + : config(config) { + } + + DENC(OSDConfigPayload, v, p) { + DENC_START(1, 1, p); + denc(v.config, p); + DENC_FINISH(p); + } +}; + +struct 
MDSConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = MetricConfigType::METRIC_CONFIG_TYPE_MDS; + std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> config; + + MDSConfigPayload() { + } + MDSConfigPayload(const std::map<MDSPerfMetricQuery, MDSPerfMetricLimits> &config) + : config(config) { + } + + DENC(MDSConfigPayload, v, p) { + DENC_START(1, 1, p); + denc(v.config, p); + DENC_FINISH(p); + } +}; + +struct UnknownConfigPayload { + static const MetricConfigType METRIC_CONFIG_TYPE = static_cast<MetricConfigType>(-1); + + UnknownConfigPayload() { } + + DENC(UnknownConfigPayload, v, p) { + ceph_abort(); + } +}; + +WRITE_CLASS_DENC(OSDConfigPayload) +WRITE_CLASS_DENC(MDSConfigPayload) +WRITE_CLASS_DENC(UnknownConfigPayload) + +typedef boost::variant<OSDConfigPayload, + MDSConfigPayload, + UnknownConfigPayload> ConfigPayload; + +class EncodeConfigPayloadVisitor : public boost::static_visitor<void> { +public: + explicit EncodeConfigPayloadVisitor(ceph::buffer::list &bl) : m_bl(bl) { + } + + template <typename ConfigPayload> + inline void operator()(const ConfigPayload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(ConfigPayload::METRIC_CONFIG_TYPE), m_bl); + encode(payload, m_bl); + } + +private: + ceph::buffer::list &m_bl; +}; + +class DecodeConfigPayloadVisitor : public boost::static_visitor<void> { +public: + DecodeConfigPayloadVisitor(ceph::buffer::list::const_iterator &iter) : m_iter(iter) { + } + + template <typename ConfigPayload> + inline void operator()(ConfigPayload &payload) const { + using ceph::decode; + decode(payload, m_iter); + } + +private: + ceph::buffer::list::const_iterator &m_iter; +}; + +struct MetricConfigMessage { + ConfigPayload payload; + + MetricConfigMessage(const ConfigPayload &payload = UnknownConfigPayload()) + : payload(payload) { + } + + bool should_encode(uint64_t features) const { + if (!HAVE_FEATURE(features, SERVER_PACIFIC) && + boost::get<MDSConfigPayload>(&payload)) { + return false; + } + return true; + } + + void encode(ceph::buffer::list &bl) const { + boost::apply_visitor(EncodeConfigPayloadVisitor(bl), payload); + } + + void decode(ceph::buffer::list::const_iterator &iter) { + using ceph::decode; + + uint32_t metric_config_type; + decode(metric_config_type, iter); + + switch (metric_config_type) { + case MetricConfigType::METRIC_CONFIG_TYPE_OSD: + payload = OSDConfigPayload(); + break; + case MetricConfigType::METRIC_CONFIG_TYPE_MDS: + payload = MDSConfigPayload(); + break; + default: + payload = UnknownConfigPayload(); + break; + } + + boost::apply_visitor(DecodeConfigPayloadVisitor(iter), payload); + } +}; + +WRITE_CLASS_ENCODER(MetricConfigMessage); + +#endif // CEPH_MGR_METRIC_TYPES_H diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc new file mode 100644 index 000000000..cb988cf76 --- /dev/null +++ b/src/mgr/Mgr.cc @@ -0,0 +1,832 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include <Python.h> + +#include "osdc/Objecter.h" +#include "client/Client.h" +#include "common/errno.h" +#include "mon/MonClient.h" +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#ifdef WITH_LIBCEPHSQLITE +# include <sqlite3.h> +# include "include/libcephsqlite.h" +#endif + +#include "mgr/MgrContext.h" + +#include "DaemonServer.h" +#include "messages/MMgrDigest.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MLog.h" +#include "messages/MServiceMap.h" +#include "messages/MKVData.h" +#include "PyModule.h" +#include "Mgr.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using namespace std::literals; + +using std::map; +using std::ostringstream; +using std::string; + +Mgr::Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) : + monc(monc_), + objecter(objecter_), + client(client_), + client_messenger(clientm_), + finisher(g_ceph_context, "Mgr", "mgr-fin"), + digest_received(false), + py_module_registry(py_module_registry_), + cluster_state(monc, nullptr, mgrmap), + server(monc, finisher, daemon_state, cluster_state, *py_module_registry, + clog_, audit_clog_), + clog(clog_), + audit_clog(audit_clog_), + initialized(false), + initializing(false) +{ + cluster_state.set_objecter(objecter); +} + + +Mgr::~Mgr() +{ +} + +void MetadataUpdate::finish(int r) +{ + daemon_state.clear_updating(key); + if (r == 0) { + if (key.type == "mds" || key.type == "osd" || + key.type == "mgr" || key.type == "mon") { + json_spirit::mValue json_result; + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + dout(1) << "mon returned invalid JSON for " << key << dendl; + return; + } + if (json_result.type() != json_spirit::obj_type) { + dout(1) << "mon returned valid JSON " << key + << " but not an object: '" << outbl.to_str() << "'" << dendl; + return; + } + dout(4) << "mon returned valid metadata JSON for " << key << dendl; + + json_spirit::mObject daemon_meta = json_result.get_obj(); + + // Skip daemon who doesn't have hostname yet + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry for " << key << dendl; + return; + } + + // Apply any defaults + for (const auto &i : defaults) { + if (daemon_meta.find(i.first) == daemon_meta.end()) { + daemon_meta[i.first] = i.second; + } + } + + if (daemon_state.exists(key)) { + DaemonStatePtr state = daemon_state.get(key); + map<string,string> m; + { + std::lock_guard l(state->lock); + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.type == "mds" || key.type == "mgr" || key.type == "mon") { + daemon_meta.erase("name"); + } else if (key.type == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + for (const auto &[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + } + daemon_state.update_metadata(state, m); + } else { + auto state = std::make_shared<DaemonState>(daemon_state.types); + state->key = key; + state->hostname = daemon_meta.at("hostname").get_str(); + + if (key.type == "mds" || key.type == "mgr" || key.type == "mon") { + daemon_meta.erase("name"); + } else if (key.type == "osd") { + daemon_meta.erase("id"); + } + daemon_meta.erase("hostname"); + + map<string,string> m; + for (const auto 
&[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + state->set_metadata(m); + + daemon_state.insert(state); + } + } else { + ceph_abort(); + } + } else { + dout(1) << "mon failed to return metadata for " << key + << ": " << cpp_strerror(r) << dendl; + } +} + +void Mgr::background_init(Context *completion) +{ + std::lock_guard l(lock); + ceph_assert(!initializing); + ceph_assert(!initialized); + initializing = true; + + finisher.start(); + + finisher.queue(new LambdaContext([this, completion](int r){ + init(); + completion->complete(0); + })); +} + +std::map<std::string, std::string> Mgr::load_store() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + dout(10) << "listing keys" << dendl; + JSONCommand cmd; + cmd.run(monc, "{\"prefix\": \"config-key ls\"}"); + lock.unlock(); + cmd.wait(); + lock.lock(); + ceph_assert(cmd.r == 0); + + std::map<std::string, std::string> loaded; + + for (auto &key_str : cmd.json_result.get_array()) { + std::string const key = key_str.get_str(); + + dout(20) << "saw key '" << key << "'" << dendl; + + const std::string store_prefix = PyModule::mgr_store_prefix; + const std::string device_prefix = "device/"; + + if (key.substr(0, device_prefix.size()) == device_prefix || + key.substr(0, store_prefix.size()) == store_prefix) { + dout(20) << "fetching '" << key << "'" << dendl; + Command get_cmd; + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" << key << "\"}"; + get_cmd.run(monc, cmd_json.str()); + lock.unlock(); + get_cmd.wait(); + lock.lock(); + if (get_cmd.r == 0) { // tolerate racing config-key change + loaded[key] = get_cmd.outbl.to_str(); + } + } + } + + return loaded; +} + +void Mgr::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + shutdown(); +} + +static void handle_mgr_signal(int signum) +{ + derr << " *** Got signal " << sig_str(signum) << " ***" << dendl; + + // The python modules don't reliably shut down, so don't even + // try. The mon will blocklist us (and all of our rados/cephfs + // clients) anyway. Just exit! + + _exit(0); // exit with 0 result code, as if we had done an orderly shutdown +} + +void Mgr::init() +{ + std::unique_lock l(lock); + ceph_assert(initializing); + ceph_assert(!initialized); + + // Enable signal handlers + register_async_signal_handler_oneshot(SIGINT, handle_mgr_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_mgr_signal); + + // Only pacific+ monitors support subscribe to kv updates + bool mon_allows_kv_sub = false; + monc->with_monmap( + [&](const MonMap &monmap) { + if (monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_PACIFIC)) { + mon_allows_kv_sub = true; + } + }); + if (!mon_allows_kv_sub) { + // mons are still pre-pacific. wait long enough to ensure our + // next beacon is processed so that our module options are + // propagated. See https://tracker.ceph.com/issues/49778 + lock.unlock(); + dout(10) << "waiting a bit for the pre-pacific mon to process our beacon" << dendl; + sleep(g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count() * 3); + lock.lock(); + } + + // subscribe to all the maps + monc->sub_want("log-info", 0, 0); + monc->sub_want("mgrdigest", 0, 0); + monc->sub_want("fsmap", 0, 0); + monc->sub_want("servicemap", 0, 0); + if (mon_allows_kv_sub) { + monc->sub_want("kv:config/", 0, 0); + monc->sub_want("kv:mgr/", 0, 0); + monc->sub_want("kv:device/", 0, 0); + } + + dout(4) << "waiting for OSDMap..." 
<< dendl; + // Subscribe to OSDMap update to pass on to ClusterState + objecter->maybe_request_map(); + + // reset the mon session. we get these maps through subscriptions which + // are stateful with the connection, so even if *we* don't have them a + // previous incarnation sharing the same MonClient may have. + monc->reopen_session(); + + // Start Objecter and wait for OSD map + lock.unlock(); // Drop lock because OSDMap dispatch calls into my ms_dispatch + epoch_t e; + cluster_state.with_mgrmap([&e](const MgrMap& m) { + e = m.last_failure_osd_epoch; + }); + /* wait for any blocklists to be applied to previous mgr instance */ + dout(4) << "Waiting for new OSDMap (e=" << e + << ") that may blocklist prior active." << dendl; + objecter->wait_for_osd_map(e); + lock.lock(); + + // Start communicating with daemons to learn statistics etc + int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs()); + if (r < 0) { + derr << "Initialize server fail: " << cpp_strerror(r) << dendl; + // This is typically due to a bind() failure, so let's let + // systemd restart us. + exit(1); + } + dout(4) << "Initialized server at " << server.get_myaddrs() << dendl; + + // Preload all daemon metadata (will subsequently keep this + // up to date by watching maps, so do the initial load before + // we subscribe to any maps) + dout(4) << "Loading daemon metadata..." << dendl; + load_all_metadata(); + + // Populate PGs in ClusterState + cluster_state.with_osdmap_and_pgmap([this](const OSDMap &osd_map, + const PGMap& pg_map) { + cluster_state.notify_osdmap(osd_map); + }); + + // Wait for FSMap + dout(4) << "waiting for FSMap..." << dendl; + fs_map_cond.wait(l, [this] { return cluster_state.have_fsmap();}); + + // Wait for MgrDigest... + dout(4) << "waiting for MgrDigest..." << dendl; + digest_cond.wait(l, [this] { return digest_received; }); + + if (!mon_allows_kv_sub) { + dout(4) << "loading config-key data from pre-pacific mon cluster..." << dendl; + pre_init_store = load_store(); + } + + dout(4) << "initializing device state..." << dendl; + // Note: we only have to do this during startup because once we are + // active the only changes to this state will originate from one of our + // own modules. + for (auto p = pre_init_store.lower_bound("device/"); + p != pre_init_store.end() && p->first.find("device/") == 0; + ++p) { + string devid = p->first.substr(7); + dout(10) << " updating " << devid << dendl; + map<string,string> meta; + ostringstream ss; + int r = get_json_str_map(p->second, ss, &meta, false); + if (r < 0) { + derr << __func__ << " failed to parse " << p->second << ": " << ss.str() + << dendl; + } else { + daemon_state.with_device_create( + devid, [&meta] (DeviceState& dev) { + dev.set_metadata(std::move(meta)); + }); + } + } + + // assume finisher already initialized in background_init + dout(4) << "starting python modules..." << dendl; + py_module_registry->active_start( + daemon_state, cluster_state, + pre_init_store, mon_allows_kv_sub, + *monc, clog, audit_clog, *objecter, *client, + finisher, server); + + cluster_state.final_init(); + + AdminSocket *admin_socket = g_ceph_context->get_admin_socket(); + r = admin_socket->register_command( + "mgr_status", this, + "Dump mgr status"); + ceph_assert(r == 0); + +#ifdef WITH_LIBCEPHSQLITE + dout(4) << "Using sqlite3 version: " << sqlite3_libversion() << dendl; + /* See libcephsqlite.h for rationale of this code. 
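In short: the code below registers libcephsqlite as an automatic sqlite3 extension, opens and closes an in-memory database as a sanity check (aborting on failure), then binds this CephContext and registers the resulting client address with the python module registry.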
*/ + sqlite3_auto_extension((void (*)())sqlite3_cephsqlite_init); + { + sqlite3* db = nullptr; + if (int rc = sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, nullptr); rc == SQLITE_OK) { + sqlite3_close(db); + } else { + derr << "could not open sqlite3: " << rc << dendl; + ceph_abort(); + } + } + { + char *ident = nullptr; + if (int rc = cephsqlite_setcct(g_ceph_context, &ident); rc < 0) { + derr << "could not set libcephsqlite cct: " << rc << dendl; + ceph_abort(); + } + entity_addrvec_t addrv; + addrv.parse(ident); + ident = (char*)realloc(ident, 0); + py_module_registry->register_client("libcephsqlite", addrv, true); + } +#endif + + dout(4) << "Complete." << dendl; + initializing = false; + initialized = true; +} + +void Mgr::load_all_metadata() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + JSONCommand mds_cmd; + mds_cmd.run(monc, "{\"prefix\": \"mds metadata\"}"); + JSONCommand osd_cmd; + osd_cmd.run(monc, "{\"prefix\": \"osd metadata\"}"); + JSONCommand mon_cmd; + mon_cmd.run(monc, "{\"prefix\": \"mon metadata\"}"); + + lock.unlock(); + mds_cmd.wait(); + osd_cmd.wait(); + mon_cmd.wait(); + lock.lock(); + + ceph_assert(mds_cmd.r == 0); + ceph_assert(mon_cmd.r == 0); + ceph_assert(osd_cmd.r == 0); + + for (auto &metadata_val : mds_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"mds", + daemon_meta.at("name").get_str()}; + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + for (const auto &[key, val] : daemon_meta) { + dm->metadata.emplace(key, val.get_str()); + } + + daemon_state.insert(dm); + } + + for (auto &metadata_val : mon_cmd.json_result.get_array()) { + json_spirit::mObject daemon_meta = metadata_val.get_obj(); + if (daemon_meta.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"mon", + daemon_meta.at("name").get_str()}; + dm->hostname = daemon_meta.at("hostname").get_str(); + + daemon_meta.erase("name"); + daemon_meta.erase("hostname"); + + map<string,string> m; + for (const auto &[key, val] : daemon_meta) { + m.emplace(key, val.get_str()); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } + + for (auto &osd_metadata_val : osd_cmd.json_result.get_array()) { + json_spirit::mObject osd_metadata = osd_metadata_val.get_obj(); + if (osd_metadata.count("hostname") == 0) { + dout(1) << "Skipping incomplete metadata entry" << dendl; + continue; + } + dout(4) << osd_metadata.at("hostname").get_str() << dendl; + + DaemonStatePtr dm = std::make_shared<DaemonState>(daemon_state.types); + dm->key = DaemonKey{"osd", + stringify(osd_metadata.at("id").get_int())}; + dm->hostname = osd_metadata.at("hostname").get_str(); + + osd_metadata.erase("id"); + osd_metadata.erase("hostname"); + + map<string,string> m; + for (const auto &i : osd_metadata) { + m[i.first] = i.second.get_str(); + } + dm->set_metadata(m); + + daemon_state.insert(dm); + } +} + + +void Mgr::shutdown() +{ + dout(10) << "mgr shutdown init" << dendl; + finisher.queue(new LambdaContext([&](int) { + { + std::lock_guard l(lock); + // First stop the server so that we're not taking any more incoming + // requests + server.shutdown(); + 
} + // after the messenger is stopped, signal modules to shutdown via finisher + py_module_registry->active_shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); +} + +void Mgr::handle_osd_map() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + std::set<std::string> names_exist; + + /** + * When we see a new OSD map, inspect the entity addrs to + * see if they have changed (service restart), and if so + * reload the metadata. + */ + cluster_state.with_osdmap_and_pgmap([this, &names_exist](const OSDMap &osd_map, + const PGMap &pg_map) { + for (int osd_id = 0; osd_id < osd_map.get_max_osd(); ++osd_id) { + if (!osd_map.exists(osd_id)) { + continue; + } + + // Remember which OSDs exist so that we can cull any that don't + names_exist.insert(stringify(osd_id)); + + // Consider whether to update the daemon metadata (new/restarted daemon) + const auto k = DaemonKey{"osd", std::to_string(osd_id)}; + if (daemon_state.is_updating(k)) { + continue; + } + + bool update_meta = false; + if (daemon_state.exists(k)) { + if (osd_map.get_up_from(osd_id) == osd_map.get_epoch()) { + dout(4) << "Mgr::handle_osd_map: osd." << osd_id + << " joined cluster at " << "e" << osd_map.get_epoch() + << dendl; + update_meta = true; + } + } else { + update_meta = true; + } + if (update_meta) { + auto c = new MetadataUpdate(daemon_state, k); + std::ostringstream cmd; + cmd << "{\"prefix\": \"osd metadata\", \"id\": " + << osd_id << "}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + + cluster_state.notify_osdmap(osd_map); + }); + + // TODO: same culling for MonMap + daemon_state.cull("osd", names_exist); +} + +void Mgr::handle_log(ref_t<MLog> m) +{ + for (const auto &e : m->entries) { + py_module_registry->notify_all(e); + } +} + +void Mgr::handle_service_map(ref_t<MServiceMap> m) +{ + dout(10) << "e" << m->service_map.epoch << dendl; + monc->sub_got("servicemap", m->service_map.epoch); + cluster_state.set_service_map(m->service_map); + server.got_service_map(); +} + +void Mgr::handle_mon_map() +{ + dout(20) << __func__ << dendl; + assert(ceph_mutex_is_locked_by_me(lock)); + std::set<std::string> names_exist; + cluster_state.with_monmap([&] (auto &monmap) { + for (unsigned int i = 0; i < monmap.size(); i++) { + names_exist.insert(monmap.get_name(i)); + } + }); + for (const auto& name : names_exist) { + const auto k = DaemonKey{"mon", name}; + if (daemon_state.is_updating(k)) { + continue; + } + auto c = new MetadataUpdate(daemon_state, k); + constexpr std::string_view cmd = R"({{"prefix": "mon metadata", "id": "{}"}})"; + monc->start_mon_command({fmt::format(cmd, name)}, {}, + &c->outbl, &c->outs, c); + } + daemon_state.cull("mon", names_exist); +} + +bool Mgr::ms_dispatch2(const ref_t<Message>& m) +{ + dout(10) << *m << dendl; + std::lock_guard l(lock); + + switch (m->get_type()) { + case MSG_MGR_DIGEST: + handle_mgr_digest(ref_cast<MMgrDigest>(m)); + break; + case CEPH_MSG_MON_MAP: + py_module_registry->notify_all("mon_map", ""); + handle_mon_map(); + break; + case CEPH_MSG_FS_MAP: + py_module_registry->notify_all("fs_map", ""); + handle_fs_map(ref_cast<MFSMap>(m)); + return false; // I shall let this pass through for Client + case CEPH_MSG_OSD_MAP: + handle_osd_map(); + + py_module_registry->notify_all("osd_map", ""); + + // Continuous subscribe, so that we can generate notifications + // for our MgrPyModules + 
objecter->maybe_request_map(); + break; + case MSG_SERVICE_MAP: + handle_service_map(ref_cast<MServiceMap>(m)); + //no users: py_module_registry->notify_all("service_map", ""); + break; + case MSG_LOG: + handle_log(ref_cast<MLog>(m)); + break; + case MSG_KV_DATA: + { + auto msg = ref_cast<MKVData>(m); + monc->sub_got("kv:"s + msg->prefix, msg->version); + if (!msg->data.empty()) { + if (initialized) { + py_module_registry->update_kv_data( + msg->prefix, + msg->incremental, + msg->data + ); + } else { + // before we have created the ActivePyModules, we need to + // track the store regions we're monitoring + if (!msg->incremental) { + dout(10) << "full update on " << msg->prefix << dendl; + auto p = pre_init_store.lower_bound(msg->prefix); + while (p != pre_init_store.end() && p->first.find(msg->prefix) == 0) { + dout(20) << " rm prior " << p->first << dendl; + p = pre_init_store.erase(p); + } + } else { + dout(10) << "incremental update on " << msg->prefix << dendl; + } + for (auto& i : msg->data) { + if (i.second) { + dout(20) << " set " << i.first << " = " << i.second->to_str() << dendl; + pre_init_store[i.first] = i.second->to_str(); + } else { + dout(20) << " rm " << i.first << dendl; + pre_init_store.erase(i.first); + } + } + } + } + } + break; + + default: + return false; + } + return true; +} + + +void Mgr::handle_fs_map(ref_t<MFSMap> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + std::set<std::string> names_exist; + const FSMap &new_fsmap = m->get_fsmap(); + + monc->sub_got("fsmap", m->epoch); + + fs_map_cond.notify_all(); + + // TODO: callers (e.g. from python land) are potentially going to see + // the new fsmap before we've bothered populating all the resulting + // daemon_state. Maybe we should block python land while we're making + // this kind of update? + + cluster_state.set_fsmap(new_fsmap); + + auto mds_info = new_fsmap.get_mds_info(); + for (const auto &i : mds_info) { + const auto &info = i.second; + + if (!new_fsmap.gid_exists(i.first)){ + continue; + } + + // Remember which MDS exists so that we can cull any that don't + names_exist.insert(info.name); + + const auto k = DaemonKey{"mds", info.name}; + if (daemon_state.is_updating(k)) { + continue; + } + + bool update = false; + if (daemon_state.exists(k)) { + auto metadata = daemon_state.get(k); + std::lock_guard l(metadata->lock); + if (metadata->metadata.empty() || + metadata->metadata.count("addr") == 0) { + update = true; + } else { + auto metadata_addrs = metadata->metadata.at("addr"); + const auto map_addrs = info.addrs; + update = metadata_addrs != stringify(map_addrs); + if (update) { + dout(4) << "MDS[" << info.name << "] addr change " << metadata_addrs + << " != " << stringify(map_addrs) << dendl; + } + } + } else { + update = true; + } + + if (update) { + auto c = new MetadataUpdate(daemon_state, k); + + // Older MDS daemons don't have addr in the metadata, so + // fake it if the returned metadata doesn't have the field. 
+ c->set_default("addr", stringify(info.addrs)); + + std::ostringstream cmd; + cmd << "{\"prefix\": \"mds metadata\", \"who\": \"" + << info.name << "\"}"; + monc->start_mon_command( + {cmd.str()}, + {}, &c->outbl, &c->outs, c); + } + } + daemon_state.cull("mds", names_exist); +} + +bool Mgr::got_mgr_map(const MgrMap& m) +{ + std::lock_guard l(lock); + dout(10) << m << dendl; + + set<string> old_modules; + cluster_state.with_mgrmap([&](const MgrMap& m) { + old_modules = m.modules; + }); + if (m.modules != old_modules) { + derr << "mgrmap module list changed to (" << m.modules << "), respawn" + << dendl; + return true; + } + + cluster_state.set_mgr_map(m); + server.got_mgr_map(); + + return false; +} + +void Mgr::handle_mgr_digest(ref_t<MMgrDigest> m) +{ + dout(10) << m->mon_status_json.length() << dendl; + dout(10) << m->health_json.length() << dendl; + cluster_state.load_digest(m.get()); + //no users: py_module_registry->notify_all("mon_status", ""); + py_module_registry->notify_all("health", ""); + + // Hack: use this as a tick/opportunity to prompt python-land that + // the pgmap might have changed since last time we were here. + py_module_registry->notify_all("pg_summary", ""); + dout(10) << "done." << dendl; + m.reset(); + + if (!digest_received) { + digest_received = true; + digest_cond.notify_all(); + } +} + +std::map<std::string, std::string> Mgr::get_services() const +{ + std::lock_guard l(lock); + + return py_module_registry->get_services(); +} + +int Mgr::call( + std::string_view admin_command, + const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) +{ + try { + if (admin_command == "mgr_status") { + f->open_object_section("mgr_status"); + cluster_state.with_mgrmap( + [f](const MgrMap& mm) { + f->dump_unsigned("mgrmap_epoch", mm.get_epoch()); + }); + f->dump_bool("initialized", initialized); + f->close_section(); + return 0; + } else { + return -ENOSYS; + } + } catch (const TOPNSPC::common::bad_cmd_get& e) { + errss << e.what(); + return -EINVAL; + } + return 0; +} diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h new file mode 100644 index 000000000..22ebdb680 --- /dev/null +++ b/src/mgr/Mgr.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray <john.spray@inktank.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_MGR_H_ +#define CEPH_MGR_H_ + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/Finisher.h" +#include "mon/MgrMap.h" + +#include "DaemonServer.h" +#include "PyModuleRegistry.h" + +#include "DaemonState.h" +#include "ClusterState.h" + +class MCommand; +class MMgrDigest; +class MLog; +class MServiceMap; +class Objecter; +class Client; + +class Mgr : public AdminSocketHook { +protected: + MonClient *monc; + Objecter *objecter; + Client *client; + Messenger *client_messenger; + + mutable ceph::mutex lock = ceph::make_mutex("Mgr::lock"); + Finisher finisher; + + // Track receipt of initial data during startup + ceph::condition_variable fs_map_cond; + bool digest_received; + ceph::condition_variable digest_cond; + + PyModuleRegistry *py_module_registry; + DaemonStateIndex daemon_state; + ClusterState cluster_state; + + DaemonServer server; + + LogChannelRef clog; + LogChannelRef audit_clog; + + std::map<std::string, std::string> pre_init_store; + + void load_all_metadata(); + std::map<std::string, std::string> load_store(); + void init(); + + bool initialized; + bool initializing; + +public: + Mgr(MonClient *monc_, const MgrMap& mgrmap, + PyModuleRegistry *py_module_registry_, + Messenger *clientm_, Objecter *objecter_, + Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_); + ~Mgr(); + + bool is_initialized() const {return initialized;} + entity_addrvec_t get_server_addrs() const { + return server.get_myaddrs(); + } + + void handle_mgr_digest(ceph::ref_t<MMgrDigest> m); + void handle_fs_map(ceph::ref_t<MFSMap> m); + void handle_osd_map(); + void handle_log(ceph::ref_t<MLog> m); + void handle_service_map(ceph::ref_t<MServiceMap> m); + void handle_mon_map(); + + bool got_mgr_map(const MgrMap& m); + + bool ms_dispatch2(const ceph::ref_t<Message>& m); + + void background_init(Context *completion); + void shutdown(); + + void handle_signal(int signum); + + std::map<std::string, std::string> get_services() const; + + int call( + std::string_view command, + const cmdmap_t& cmdmap, + const bufferlist& inbl, + Formatter *f, + std::ostream& errss, + ceph::buffer::list& out) override; +}; + +/** + * Context for completion of metadata mon commands: take + * the result and stash it in DaemonStateIndex + */ +class MetadataUpdate : public Context +{ + +private: + DaemonStateIndex &daemon_state; + DaemonKey key; + + std::map<std::string, std::string> defaults; + +public: + bufferlist outbl; + std::string outs; + + MetadataUpdate(DaemonStateIndex &daemon_state_, const DaemonKey &key_) + : daemon_state(daemon_state_), key(key_) + { + daemon_state.notify_updating(key); + } + + void set_default(const std::string &k, const std::string &v) + { + defaults[k] = v; + } + + void finish(int r) override; +}; + + +#endif diff --git a/src/mgr/MgrCap.cc b/src/mgr/MgrCap.cc new file mode 100644 index 000000000..1563516a9 --- /dev/null +++ b/src/mgr/MgrCap.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/config/warning_disable.hpp> +#include <boost/fusion/adapted/struct/adapt_struct.hpp> +#include <boost/fusion/include/adapt_struct.hpp> +#include <boost/fusion/include/std_pair.hpp> +#include <boost/phoenix.hpp> +#include <boost/spirit/include/qi.hpp> +#include <boost/spirit/include/qi_uint.hpp> + +#include "MgrCap.h" +#include "include/stringify.h" +#include "include/ipaddr.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include <algorithm> +#include <regex> + +#include "include/ceph_assert.h" + +static inline bool is_not_alnum_space(char c) { + return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_')); +} + +static std::string maybe_quote_string(const std::string& str) { + if (find_if(str.begin(), str.end(), is_not_alnum_space) == str.end()) + return str; + return std::string("\"") + str + std::string("\""); +} + +#define dout_subsys ceph_subsys_mgr + +std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p) { + if (p == MGR_CAP_ANY) + return out << "*"; + + if (p & MGR_CAP_R) + out << "r"; + if (p & MGR_CAP_W) + out << "w"; + if (p & MGR_CAP_X) + out << "x"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c) { + switch (c.match_type) { + case MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + out << "="; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + out << " prefix "; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + out << " regex "; + break; + default: + break; + } + out << maybe_quote_string(c.value); + return out; +} + +std::ostream& operator<<(std::ostream& out, const MgrCapGrant& m) { + if (!m.profile.empty()) { + out << "profile " << maybe_quote_string(m.profile); + } else { + out << "allow"; + if (!m.service.empty()) { + out << " service " << maybe_quote_string(m.service); + } else if (!m.module.empty()) { + out << " module " << maybe_quote_string(m.module); + } else if (!m.command.empty()) { + out << " command " << maybe_quote_string(m.command); + } + } + + if (!m.arguments.empty()) { + out << (!m.profile.empty() ? "" : " with"); + for (auto& [key, constraint] : m.arguments) { + out << " " << maybe_quote_string(key) << constraint; + } + } + + if (m.allow != 0) { + out << " " << m.allow; + } + + if (m.network.size()) { + out << " network " << m.network; + } + return out; +} + +// <magic> +// fusion lets us easily populate structs via the qi parser. 
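+// Note: the member order in each BOOST_FUSION_ADAPT_STRUCT invocation below
+// must match the order of attributes exposed by the corresponding rules in
+// MgrCapParser, since Spirit/Fusion populates the adapted struct positionally.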
+ +typedef std::map<std::string, MgrCapGrantConstraint> kvmap; + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrant, + (std::string, service) + (std::string, module) + (std::string, profile) + (std::string, command) + (kvmap, arguments) + (mgr_rwxa_t, allow) + (std::string, network)) + +BOOST_FUSION_ADAPT_STRUCT(MgrCapGrantConstraint, + (MgrCapGrantConstraint::MatchType, match_type) + (std::string, value)) + +// </magic> + +void MgrCapGrant::parse_network() { + network_valid = ::parse_network(network.c_str(), &network_parsed, + &network_prefix); +} + +void MgrCapGrant::expand_profile(std::ostream *err) const { + // only generate this list once + if (!profile_grants.empty()) { + return; + } + + if (profile == "read-only") { + // grants READ-ONLY caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_R}}); + return; + } + + if (profile == "read-write") { + // grants READ-WRITE caps MGR-wide + profile_grants.push_back({{}, {}, {}, {}, {}, + mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}}); + return; + } + + if (profile == "crash") { + profile_grants.push_back({{}, {}, {}, "crash post", {}, {}}); + return; + } + + if (profile == "osd") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "mds") { + // this is a documented profile (so we need to accept it as valid), but it + // currently doesn't do anything + return; + } + + if (profile == "rbd" || profile == "rbd-read-only") { + Arguments filtered_arguments; + for (auto& [key, constraint] : arguments) { + if (key == "pool" || key == "namespace") { + filtered_arguments[key] = std::move(constraint); + } else { + if (err != nullptr) { + *err << "profile '" << profile << "' does not recognize key '" << key + << "'"; + } + return; + } + } + + mgr_rwxa_t perms = mgr_rwxa_t{MGR_CAP_R}; + if (profile == "rbd") { + perms = mgr_rwxa_t{MGR_CAP_R | MGR_CAP_W}; + } + + // allow all 'rbd_support' commands (restricted by optional + // pool/namespace constraints) + profile_grants.push_back({{}, "rbd_support", {}, {}, + std::move(filtered_arguments), perms}); + return; + } + + if (err != nullptr) { + *err << "unrecognized profile '" << profile << "'"; + } +} + +bool MgrCapGrant::validate_arguments( + const std::map<std::string, std::string>& args) const { + for (auto& [key, constraint] : arguments) { + auto q = args.find(key); + + // argument must be present if a constraint exists + if (q == args.end()) { + return false; + } + + switch (constraint.match_type) { + case MgrCapGrantConstraint::MATCH_TYPE_EQUAL: + if (constraint.value != q->second) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_PREFIX: + if (q->second.find(constraint.value) != 0) + return false; + break; + case MgrCapGrantConstraint::MATCH_TYPE_REGEX: + try { + std::regex pattern(constraint.value, std::regex::extended); + if (!std::regex_match(q->second, pattern)) { + return false; + } + } catch(const std::regex_error&) { + return false; + } + break; + default: + return false; + } + } + + return true; +} + +mgr_rwxa_t MgrCapGrant::get_allowed( + CephContext *cct, EntityName name, const std::string& s, + const std::string& m, const std::string& c, + const std::map<std::string, std::string>& args) const { + if (!profile.empty()) { + expand_profile(nullptr); + mgr_rwxa_t a; + for (auto& grant : profile_grants) { + a = a | grant.get_allowed(cct, name, s, m, c, args); + } + return a; + } + + if (!service.empty()) { + if (service != s) { + return mgr_rwxa_t{}; + } + return allow; + } + + if 
(!module.empty()) { + if (module != m) { + return mgr_rwxa_t{}; + } + + // don't test module arguments when validating a specific command + if (c.empty() && !validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return allow; + } + + if (!command.empty()) { + if (command != c) { + return mgr_rwxa_t{}; + } + if (!validate_arguments(args)) { + return mgr_rwxa_t{}; + } + return mgr_rwxa_t{MGR_CAP_ANY}; + } + + return allow; +} + +std::ostream& operator<<(std::ostream&out, const MgrCap& m) { + bool first = true; + for (auto& grant : m.grants) { + if (!first) { + out << ", "; + } + first = false; + + out << grant; + } + return out; +} + +bool MgrCap::is_allow_all() const { + for (auto& grant : grants) { + if (grant.is_allow_all()) { + return true; + } + } + return false; +} + +void MgrCap::set_allow_all() { + grants.clear(); + grants.push_back({{}, {}, {}, {}, {}, mgr_rwxa_t{MGR_CAP_ANY}}); + text = "allow *"; +} + +bool MgrCap::is_capable( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const { + if (cct) { + ldout(cct, 20) << "is_capable service=" << service << " " + << "module=" << module << " " + << "command=" << command + << (op_may_read ? " read":"") + << (op_may_write ? " write":"") + << (op_may_exec ? " exec":"") + << " addr " << addr + << " on cap " << *this + << dendl; + } + + mgr_rwxa_t allow; + for (auto& grant : grants) { + if (cct) + ldout(cct, 20) << " allow so far " << allow << ", doing grant " << grant + << dendl; + + if (grant.network.size() && + (!grant.network_valid || + !network_contains(grant.network_parsed, + grant.network_prefix, + addr))) { + continue; + } + + if (grant.is_allow_all()) { + if (cct) { + ldout(cct, 20) << " allow all" << dendl; + } + return true; + } + + // check enumerated caps + allow = allow | grant.get_allowed(cct, name, service, module, command, + command_args); + if ((!op_may_read || (allow & MGR_CAP_R)) && + (!op_may_write || (allow & MGR_CAP_W)) && + (!op_may_exec || (allow & MGR_CAP_X))) { + if (cct) { + ldout(cct, 20) << " match" << dendl; + } + return true; + } + } + return false; +} + +void MgrCap::encode(ceph::buffer::list& bl) const { + // remain backwards compatible w/ MgrCap + ENCODE_START(4, 4, bl); + encode(text, bl); + ENCODE_FINISH(bl); +} + +void MgrCap::decode(ceph::buffer::list::const_iterator& bl) { + // remain backwards compatible w/ MgrCap + std::string s; + DECODE_START(4, bl); + decode(s, bl); + DECODE_FINISH(bl); + parse(s, NULL); +} + +void MgrCap::dump(ceph::Formatter *f) const { + f->dump_string("text", text); +} + +void MgrCap::generate_test_instances(std::list<MgrCap*>& ls) { + ls.push_back(new MgrCap); + ls.push_back(new MgrCap); + ls.back()->parse("allow *"); + ls.push_back(new MgrCap); + ls.back()->parse("allow rwx"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow service foo r, allow command bar x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow command bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("allow module bar with k1=v1 k2=v2 x"); + ls.push_back(new MgrCap); + ls.back()->parse("profile rbd pool=rbd"); +} + +// grammar 
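+// A few capability strings the parser below accepts (all exercised by
+// MgrCap::generate_test_instances() above):
+//   "allow *"
+//   "allow service foo r, allow command bar x"
+//   "allow module bar with k1=v1 k2=v2 x"
+//   "profile rbd pool=rbd"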
+namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + +template <typename Iterator> +struct MgrCapParser : qi::grammar<Iterator, MgrCap()> { + MgrCapParser() : MgrCapParser::base_type(mgrcap) { + using qi::char_; + using qi::int_; + using qi::ulong_long; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + + spaces = +(lit(' ') | lit('\n') | lit('\t')); + + // key <=|prefix|regex> value[ ...] + str_match = -spaces >> lit('=') >> -spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_EQUAL) >> str; + str_prefix = spaces >> lit("prefix") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_PREFIX) >> str; + str_regex = spaces >> lit("regex") >> spaces >> + qi::attr(MgrCapGrantConstraint::MATCH_TYPE_REGEX) >> str; + kv_pair = str >> (str_match | str_prefix | str_regex); + kv_map %= kv_pair >> *(spaces >> kv_pair); + + // command := command[=]cmd [k1=v1 k2=v2 ...] + command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // service foo rwxa + service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::map<std::string, MgrCapGrantConstraint>()) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // module foo rwxa + module_match %= -spaces >> lit("allow") >> spaces >> lit("module") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // profile foo + profile_match %= -spaces >> -(lit("allow") >> spaces) + >> lit("profile") >> (lit('=') | spaces) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> str + >> qi::attr(std::string()) + >> -(spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa + rwxa_match %= -spaces >> lit("allow") >> spaces + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::string()) + >> qi::attr(std::map<std::string,MgrCapGrantConstraint>()) + >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa := * | [r][w][x] + rwxa = + (lit("*")[_val = MGR_CAP_ANY]) | + (lit("all")[_val = MGR_CAP_ANY]) | + ( eps[_val = 0] >> + ( lit('r')[_val |= MGR_CAP_R] || + lit('w')[_val |= MGR_CAP_W] || + lit('x')[_val |= MGR_CAP_X] + ) + ); + + // grant := allow ... + grant = -spaces >> (rwxa_match | profile_match | service_match | + module_match | command_match) >> -spaces; + + // mgrcap := grant [grant ...] 
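+ // e.g. "allow service foo r, allow command bar x" parses into two grants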
+ grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + mgrcap = grants [_val = phoenix::construct<MgrCap>(_1)]; + } + + qi::rule<Iterator> spaces; + qi::rule<Iterator, unsigned()> rwxa; + qi::rule<Iterator, std::string()> quoted_string; + qi::rule<Iterator, std::string()> unquoted_word; + qi::rule<Iterator, std::string()> str, network_str; + + qi::rule<Iterator, MgrCapGrantConstraint()> str_match, str_prefix, str_regex; + qi::rule<Iterator, std::pair<std::string, MgrCapGrantConstraint>()> kv_pair; + qi::rule<Iterator, std::map<std::string, MgrCapGrantConstraint>()> kv_map; + + qi::rule<Iterator, MgrCapGrant()> rwxa_match; + qi::rule<Iterator, MgrCapGrant()> command_match; + qi::rule<Iterator, MgrCapGrant()> service_match; + qi::rule<Iterator, MgrCapGrant()> module_match; + qi::rule<Iterator, MgrCapGrant()> profile_match; + qi::rule<Iterator, MgrCapGrant()> grant; + qi::rule<Iterator, std::vector<MgrCapGrant>()> grants; + qi::rule<Iterator, MgrCap()> mgrcap; +}; + +bool MgrCap::parse(const std::string& str, std::ostream *err) { + auto iter = str.begin(); + auto end = str.end(); + + MgrCapParser<std::string::const_iterator> exp; + bool r = qi::parse(iter, end, exp, *this); + if (r && iter == end) { + text = str; + + std::stringstream profile_err; + for (auto& g : grants) { + g.parse_network(); + + if (!g.profile.empty()) { + g.expand_profile(&profile_err); + } + } + + if (!profile_err.str().empty()) { + if (err != nullptr) { + *err << "mgr capability parse failed during profile evaluation: " + << profile_err.str(); + } + return false; + } + return true; + } + + // Make sure no grants are kept after parsing failed! + grants.clear(); + + if (err) { + if (iter != end) + *err << "mgr capability parse failed, stopped at '" + << std::string(iter, end) << "' of '" << str << "'"; + else + *err << "mgr capability parse failed, stopped at end of '" << str << "'"; + } + + return false; +} diff --git a/src/mgr/MgrCap.h b/src/mgr/MgrCap.h new file mode 100644 index 000000000..f7a8bd5f8 --- /dev/null +++ b/src/mgr/MgrCap.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGRCAP_H +#define CEPH_MGRCAP_H + +#include <iosfwd> + +#include "include/common_fwd.h" +#include "include/types.h" +#include "common/entity_name.h" + +static const __u8 MGR_CAP_R = (1 << 1); // read +static const __u8 MGR_CAP_W = (1 << 2); // write +static const __u8 MGR_CAP_X = (1 << 3); // execute +static const __u8 MGR_CAP_ANY = 0xff; // * + +struct mgr_rwxa_t { + __u8 val = 0U; + + mgr_rwxa_t() {} + explicit mgr_rwxa_t(__u8 v) : val(v) {} + + mgr_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +std::ostream& operator<<(std::ostream& out, const mgr_rwxa_t& p); + +struct MgrCapGrantConstraint { + enum MatchType { + MATCH_TYPE_NONE, + MATCH_TYPE_EQUAL, + MATCH_TYPE_PREFIX, + MATCH_TYPE_REGEX + }; + + MatchType match_type = MATCH_TYPE_NONE; + std::string value; + + MgrCapGrantConstraint() {} + MgrCapGrantConstraint(MatchType match_type, std::string value) + : match_type(match_type), value(value) { + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrantConstraint& c); + +struct MgrCapGrant { + /* + * A grant can come in one of four forms: + * + * - a blanket allow ('allow rw', 'allow *') + * - this will match against any service and the read/write/exec flags + * in the mgr code. semantics of what X means are somewhat ad hoc. 
+ * + * - a service allow ('allow service mds rw') + * - this will match against a specific service and the r/w/x flags. + * + * - a module allow ('allow module rbd_support rw, allow module rbd_support with pool=rbd rw') + * - this will match against a specific python add-on module and the r/w/x + * flags. + * + * - a profile ('profile read-only, profile rbd pool=rbd') + * - this will match against specific MGR-enforced semantics of what + * this type of user should need to do. examples include 'read-write', + * 'read-only', 'crash'. + * + * - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2') + * this includes the command name (the prefix string) + * + * The command, module, and profile caps can also accept an optional + * key/value map. If not provided, all command arguments and module + * meta-arguments are allowed. If a key/value pair is specified, that + * argument must be present and must match the provided constraint. + */ + typedef std::map<std::string, MgrCapGrantConstraint> Arguments; + + std::string service; + std::string module; + std::string profile; + std::string command; + Arguments arguments; + + // restrict by network + std::string network; + + // these are filled in by parse_network(), called by MgrCap::parse() + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + void parse_network(); + + mgr_rwxa_t allow; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() (via is_match()) and cached here. + mutable std::list<MgrCapGrant> profile_grants; + + void expand_profile(std::ostream *err=nullptr) const; + + MgrCapGrant() : allow(0) {} + MgrCapGrant(std::string&& service, + std::string&& module, + std::string&& profile, + std::string&& command, + Arguments&& arguments, + mgr_rwxa_t allow) + : service(std::move(service)), module(std::move(module)), + profile(std::move(profile)), command(std::move(command)), + arguments(std::move(arguments)), allow(allow) { + } + + bool validate_arguments( + const std::map<std::string, std::string>& arguments) const; + + /** + * check if given request parameters match our constraints + * + * @param cct context + * @param name entity name + * @param service service (if any) + * @param module module (if any) + * @param command command (if any) + * @param arguments profile/module/command args (if any) + * @return bits we allow + */ + mgr_rwxa_t get_allowed( + CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& arguments) const; + + bool is_allow_all() const { + return (allow == MGR_CAP_ANY && + service.empty() && + module.empty() && + profile.empty() && + command.empty()); + } +}; + +std::ostream& operator<<(std::ostream& out, const MgrCapGrant& g); + +struct MgrCap { + std::string text; + std::vector<MgrCapGrant> grants; + + MgrCap() {} + explicit MgrCap(const std::vector<MgrCapGrant> &g) : grants(g) {} + + std::string get_str() const { + return text; + } + + bool is_allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, std::ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. 
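+   * The operation is allowed only if the matching grants collectively
+   * cover every read/write/exec bit the operation requires.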
+ * + * @param service service name + * @param module module name + * @param command command id + * @param arguments + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param op_may_exec whether the operation may exec + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(CephContext *cct, + EntityName name, + const std::string& service, + const std::string& module, + const std::string& command, + const std::map<std::string, std::string>& arguments, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<MgrCap*>& ls); +}; +WRITE_CLASS_ENCODER(MgrCap) + +std::ostream& operator<<(std::ostream& out, const MgrCap& cap); + +#endif // CEPH_MGRCAP_H diff --git a/src/mgr/MgrClient.cc b/src/mgr/MgrClient.cc new file mode 100644 index 000000000..6250ea3b9 --- /dev/null +++ b/src/mgr/MgrClient.cc @@ -0,0 +1,667 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "MgrClient.h" + +#include "common/perf_counters_key.h" +#include "mgr/MgrContext.h" +#include "mon/MonMap.h" + +#include "msg/Messenger.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrReport.h" +#include "messages/MMgrOpen.h" +#include "messages/MMgrUpdate.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MMgrCommand.h" +#include "messages/MMgrCommandReply.h" +#include "messages/MPGStats.h" + +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::make_message; +using ceph::ref_cast; +using ceph::ref_t; + +#define dout_subsys ceph_subsys_mgrc +#undef dout_prefix +#define dout_prefix *_dout << "mgrc " << __func__ << " " + +MgrClient::MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap_) + : Dispatcher(cct_), + cct(cct_), + msgr(msgr_), + monmap(monmap_), + timer(cct_, lock) +{ + ceph_assert(cct != nullptr); +} + +void MgrClient::init() +{ + std::lock_guard l(lock); + + ceph_assert(msgr != nullptr); + + timer.init(); + initialized = true; +} + +void MgrClient::shutdown() +{ + std::unique_lock l(lock); + ldout(cct, 10) << dendl; + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + // forget about in-flight commands if we are prematurely shut down + // (e.g., by control-C) + command_table.clear(); + if (service_daemon && + session && + session->con && + HAVE_FEATURE(session->con->get_features(), SERVER_MIMIC)) { + ldout(cct, 10) << "closing mgr session" << dendl; + auto m = make_message<MMgrClose>(); + m->daemon_name = daemon_name; + m->service_name = service_name; + session->con->send_message2(m); + auto timeout = ceph::make_timespan(cct->_conf.get_val<double>( + "mgr_client_service_daemon_unregister_timeout")); + shutdown_cond.wait_for(l, timeout); + } + + timer.shutdown(); + if (session) { + 
session->con->mark_down(); + session.reset(); + } +} + +bool MgrClient::ms_dispatch2(const ref_t<Message>& m) +{ + std::lock_guard l(lock); + + switch(m->get_type()) { + case MSG_MGR_MAP: + return handle_mgr_map(ref_cast<MMgrMap>(m)); + case MSG_MGR_CONFIGURE: + return handle_mgr_configure(ref_cast<MMgrConfigure>(m)); + case MSG_MGR_CLOSE: + return handle_mgr_close(ref_cast<MMgrClose>(m)); + case MSG_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_MGR) { + MCommandReply *c = static_cast<MCommandReply*>(m.get()); + handle_command_reply(c->get_tid(), c->get_data(), c->rs, c->r); + return true; + } else { + return false; + } + case MSG_MGR_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_MGR) { + MMgrCommandReply *c = static_cast<MMgrCommandReply*>(m.get()); + handle_command_reply(c->get_tid(), c->get_data(), c->rs, c->r); + return true; + } else { + return false; + } + default: + ldout(cct, 30) << "Not handling " << *m << dendl; + return false; + } +} + +void MgrClient::reconnect() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + if (session) { + ldout(cct, 4) << "Terminating session with " + << session->con->get_peer_addr() << dendl; + session->con->mark_down(); + session.reset(); + stats_period = 0; + if (report_callback != nullptr) { + timer.cancel_event(report_callback); + report_callback = nullptr; + } + } + + if (!map.get_available()) { + ldout(cct, 4) << "No active mgr available yet" << dendl; + return; + } + + if (!clock_t::is_zero(last_connect_attempt)) { + auto now = clock_t::now(); + auto when = last_connect_attempt + + ceph::make_timespan( + cct->_conf.get_val<double>("mgr_connect_retry_interval")); + if (now < when) { + if (!connect_retry_callback) { + connect_retry_callback = timer.add_event_at( + when, + new LambdaContext([this](int r){ + connect_retry_callback = nullptr; + reconnect(); + })); + } + ldout(cct, 4) << "waiting to retry connect until " << when << dendl; + return; + } + } + + if (connect_retry_callback) { + timer.cancel_event(connect_retry_callback); + connect_retry_callback = nullptr; + } + + ldout(cct, 4) << "Starting new session with " << map.get_active_addrs() + << dendl; + last_connect_attempt = clock_t::now(); + + session.reset(new MgrSessionState()); + session->con = msgr->connect_to(CEPH_ENTITY_TYPE_MGR, + map.get_active_addrs()); + + if (service_daemon) { + daemon_dirty_status = true; + } + task_dirty_status = true; + + // Don't send an open if we're just a client (i.e. doing + // command-sending, not stats etc) + if (msgr->get_mytype() != CEPH_ENTITY_TYPE_CLIENT || service_daemon) { + _send_open(); + } + + // resend any pending commands + auto p = command_table.get_commands().begin(); + while (p != command_table.get_commands().end()) { + auto tid = p->first; + auto& op = p->second; + ldout(cct,10) << "resending " << tid << (op.tell ? " (tell)":" (cli)") << dendl; + MessageRef m; + if (op.tell) { + if (op.name.size() && op.name != map.active_name) { + ldout(cct, 10) << "active mgr " << map.active_name << " != target " + << op.name << dendl; + if (op.on_finish) { + op.on_finish->complete(-ENXIO); + } + ++p; + command_table.erase(tid); + continue; + } + // Set fsid argument to signal that this is really a tell message (and + // we are not a legacy client sending a non-tell command via MCommand). 
+ m = op.get_message(monmap->fsid, false); + } else { + m = op.get_message( + {}, + HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS)); + } + ceph_assert(session); + ceph_assert(session->con); + session->con->send_message2(std::move(m)); + ++p; + } +} + +void MgrClient::_send_open() +{ + if (session && session->con) { + auto open = make_message<MMgrOpen>(); + if (!service_name.empty()) { + open->service_name = service_name; + open->daemon_name = daemon_name; + } else { + open->daemon_name = cct->_conf->name.get_id(); + } + if (service_daemon) { + open->service_daemon = service_daemon; + open->daemon_metadata = daemon_metadata; + } + cct->_conf.get_config_bl(0, &open->config_bl, &last_config_bl_version); + cct->_conf.get_defaults_bl(&open->config_defaults_bl); + session->con->send_message2(open); + } +} + +void MgrClient::_send_update() +{ + if (session && session->con) { + auto update = make_message<MMgrUpdate>(); + if (!service_name.empty()) { + update->service_name = service_name; + update->daemon_name = daemon_name; + } else { + update->daemon_name = cct->_conf->name.get_id(); + } + if (need_metadata_update) { + update->daemon_metadata = daemon_metadata; + } + update->need_metadata_update = need_metadata_update; + session->con->send_message2(update); + } +} + +bool MgrClient::handle_mgr_map(ref_t<MMgrMap> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << *m << dendl; + + map = m->get_map(); + ldout(cct, 4) << "Got map version " << map.epoch << dendl; + + ldout(cct, 4) << "Active mgr is now " << map.get_active_addrs() << dendl; + + // Reset session? + if (!session || + session->con->get_peer_addrs() != map.get_active_addrs()) { + reconnect(); + } + + return true; +} + +bool MgrClient::ms_handle_reset(Connection *con) +{ + std::lock_guard l(lock); + if (session && con == session->con) { + ldout(cct, 4) << __func__ << " con " << con << dendl; + reconnect(); + return true; + } + return false; +} + +bool MgrClient::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +void MgrClient::_send_stats() +{ + _send_report(); + _send_pgstats(); + if (stats_period != 0) { + report_callback = timer.add_event_after( + stats_period, + new LambdaContext([this](int) { + _send_stats(); + })); + } +} + +void MgrClient::_send_report() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + ceph_assert(session); + report_callback = nullptr; + + auto report = make_message<MMgrReport>(); + auto pcc = cct->get_perfcounters_collection(); + + pcc->with_counters([this, report]( + const PerfCountersCollectionImpl::CounterMap &by_path) + { + // Helper for checking whether a counter should be included + auto include_counter = [this]( + const PerfCounters::perf_counter_data_any_d &ctr, + const PerfCounters &perf_counters) + { + // FIXME: We don't send labeled perf counters to the mgr currently. 
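+      // Labeled counters are skipped here; the rest are filtered by the
+      // mgr-configured priority threshold below.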
+ auto labels = ceph::perf_counters::key_labels(perf_counters.get_name()); + if (labels.begin() != labels.end()) { + return false; + } + + return perf_counters.get_adjusted_priority(ctr.prio) >= (int)stats_threshold; + }; + + // Helper for cases where we want to forget a counter + auto undeclare = [report, this](const std::string &path) + { + report->undeclare_types.push_back(path); + ldout(cct,20) << " undeclare " << path << dendl; + session->declared.erase(path); + }; + + ENCODE_START(1, 1, report->packed); + + // Find counters that no longer exist, and undeclare them + for (auto p = session->declared.begin(); p != session->declared.end(); ) { + const auto &path = *(p++); + if (by_path.count(path) == 0) { + undeclare(path); + } + } + + for (const auto &i : by_path) { + auto& path = i.first; + auto& data = *(i.second.data); + auto& perf_counters = *(i.second.perf_counters); + + // Find counters that still exist, but are no longer permitted by + // stats_threshold + if (!include_counter(data, perf_counters)) { + if (session->declared.count(path)) { + undeclare(path); + } + continue; + } + + if (session->declared.count(path) == 0) { + ldout(cct, 20) << " declare " << path << dendl; + PerfCounterType type; + type.path = path; + if (data.description) { + type.description = data.description; + } + if (data.nick) { + type.nick = data.nick; + } + type.type = data.type; + type.priority = perf_counters.get_adjusted_priority(data.prio); + type.unit = data.unit; + report->declare_types.push_back(std::move(type)); + session->declared.insert(path); + } + + encode(static_cast<uint64_t>(data.u64), report->packed); + if (data.type & PERFCOUNTER_LONGRUNAVG) { + encode(static_cast<uint64_t>(data.avgcount), report->packed); + encode(static_cast<uint64_t>(data.avgcount2), report->packed); + } + } + ENCODE_FINISH(report->packed); + + ldout(cct, 20) << "sending " << session->declared.size() << " counters (" + "of possible " << by_path.size() << "), " + << report->declare_types.size() << " new, " + << report->undeclare_types.size() << " removed" + << dendl; + }); + + ldout(cct, 20) << "encoded " << report->packed.length() << " bytes" << dendl; + + if (daemon_name.size()) { + report->daemon_name = daemon_name; + } else { + report->daemon_name = cct->_conf->name.get_id(); + } + report->service_name = service_name; + + if (daemon_dirty_status) { + report->daemon_status = daemon_status; + daemon_dirty_status = false; + } + + if (task_dirty_status) { + report->task_status = task_status; + task_dirty_status = false; + } + + report->daemon_health_metrics = std::move(daemon_health_metrics); + + cct->_conf.get_config_bl(last_config_bl_version, &report->config_bl, + &last_config_bl_version); + + if (get_perf_report_cb) { + report->metric_report_message = MetricReportMessage(get_perf_report_cb()); + } + + session->con->send_message2(report); +} + +void MgrClient::send_pgstats() +{ + std::lock_guard l(lock); + _send_pgstats(); +} + +void MgrClient::_send_pgstats() +{ + if (pgstats_cb && session) { + session->con->send_message(pgstats_cb()); + } +} + +bool MgrClient::handle_mgr_configure(ref_t<MMgrConfigure> m) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << *m << dendl; + + if (!session) { + lderr(cct) << "dropping unexpected configure message" << dendl; + return true; + } + + ldout(cct, 4) << "stats_period=" << m->stats_period << dendl; + + if (stats_threshold != m->stats_threshold) { + ldout(cct, 4) << "updated stats threshold: " << m->stats_threshold << dendl; + stats_threshold = 
m->stats_threshold; + } + + if (!m->osd_perf_metric_queries.empty()) { + handle_config_payload(m->osd_perf_metric_queries); + } else if (m->metric_config_message) { + const MetricConfigMessage &message = *m->metric_config_message; + boost::apply_visitor(HandlePayloadVisitor(this), message.payload); + } + + bool starting = (stats_period == 0) && (m->stats_period != 0); + stats_period = m->stats_period; + if (starting) { + _send_stats(); + } + + return true; +} + +bool MgrClient::handle_mgr_close(ref_t<MMgrClose> m) +{ + service_daemon = false; + shutdown_cond.notify_all(); + return true; +} + +int MgrClient::start_command(const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) +{ + std::lock_guard l(lock); + + ldout(cct, 20) << "cmd: " << cmd << dendl; + + if (map.epoch == 0 && mgr_optional) { + ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl; + return -EACCES; + } + + auto &op = command_table.start_command(); + op.cmd = cmd; + op.inbl = inbl; + op.outbl = outbl; + op.outs = outs; + op.on_finish = onfinish; + + if (session && session->con) { + // Leaving fsid argument null because it isn't used historically, and + // we can use it as a signal that we are sending a non-tell command. + auto m = op.get_message( + {}, + HAVE_FEATURE(map.active_mgr_features, SERVER_OCTOPUS)); + session->con->send_message2(std::move(m)); + } else { + ldout(cct, 5) << "no mgr session (no running mgr daemon?), waiting" << dendl; + } + return 0; +} + +int MgrClient::start_tell_command( + const string& name, + const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) +{ + std::lock_guard l(lock); + + ldout(cct, 20) << "target: " << name << " cmd: " << cmd << dendl; + + if (map.epoch == 0 && mgr_optional) { + ldout(cct,20) << " no MgrMap, assuming EACCES" << dendl; + return -EACCES; + } + + auto &op = command_table.start_command(); + op.tell = true; + op.name = name; + op.cmd = cmd; + op.inbl = inbl; + op.outbl = outbl; + op.outs = outs; + op.on_finish = onfinish; + + if (session && session->con && (name.size() == 0 || map.active_name == name)) { + // Set fsid argument to signal that this is really a tell message (and + // we are not a legacy client sending a non-tell command via MCommand). + auto m = op.get_message(monmap->fsid, false); + session->con->send_message2(std::move(m)); + } else { + ldout(cct, 5) << "no mgr session (no running mgr daemon?), or " + << name << " not active mgr, waiting" << dendl; + } + return 0; +} + +bool MgrClient::handle_command_reply( + uint64_t tid, + bufferlist& data, + const std::string& rs, + int r) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + + ldout(cct, 20) << "tid " << tid << " r " << r << dendl; + + if (!command_table.exists(tid)) { + ldout(cct, 4) << "handle_command_reply tid " << tid + << " not found" << dendl; + return true; + } + + auto &op = command_table.get_command(tid); + if (op.outbl) { + *op.outbl = std::move(data); + } + + if (op.outs) { + *(op.outs) = rs; + } + + if (op.on_finish) { + op.on_finish->complete(r); + } + + command_table.erase(tid); + return true; +} + +int MgrClient::update_daemon_metadata( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata) +{ + std::lock_guard l(lock); + if (service_daemon) { + return -EEXIST; + } + ldout(cct,1) << service << "." 
<< name << " metadata " << metadata << dendl; + service_name = service; + daemon_name = name; + daemon_metadata = metadata; + daemon_dirty_status = true; + + if (need_metadata_update && + !daemon_metadata.empty()) { + _send_update(); + need_metadata_update = false; + } + + return 0; +} + +int MgrClient::service_daemon_register( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata) +{ + std::lock_guard l(lock); + if (service_daemon) { + return -EEXIST; + } + ldout(cct,1) << service << "." << name << " metadata " << metadata << dendl; + service_daemon = true; + service_name = service; + daemon_name = name; + daemon_metadata = metadata; + daemon_dirty_status = true; + + // late register? + if (msgr->get_mytype() == CEPH_ENTITY_TYPE_CLIENT && session && session->con) { + _send_open(); + } + + return 0; +} + +int MgrClient::service_daemon_update_status( + std::map<std::string,std::string>&& status) +{ + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + daemon_status = std::move(status); + daemon_dirty_status = true; + return 0; +} + +int MgrClient::service_daemon_update_task_status( + std::map<std::string,std::string> &&status) { + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + task_status = std::move(status); + task_dirty_status = true; + return 0; +} + +void MgrClient::update_daemon_health(std::vector<DaemonHealthMetric>&& metrics) +{ + std::lock_guard l(lock); + daemon_health_metrics = std::move(metrics); +} + diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h new file mode 100644 index 000000000..a48ae163e --- /dev/null +++ b/src/mgr/MgrClient.h @@ -0,0 +1,215 @@ + +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CLIENT_H_ +#define MGR_CLIENT_H_ + +#include <boost/variant.hpp> + +#include "msg/Connection.h" +#include "msg/Dispatcher.h" +#include "mon/MgrMap.h" +#include "mgr/DaemonHealthMetric.h" + +#include "messages/MMgrReport.h" +#include "mgr/MetricTypes.h" + +#include "common/perf_counters.h" +#include "common/Timer.h" +#include "common/CommandTable.h" + +class MMgrMap; +class MMgrConfigure; +class MMgrClose; +class Messenger; +class MCommandReply; +class MPGStats; +class MonMap; + +class MgrSessionState +{ + public: + // Which performance counters have we already transmitted schema for? 
+ std::set<std::string> declared; + + // Our connection to the mgr + ConnectionRef con; +}; + +class MgrCommand : public CommandOp +{ + public: + std::string name; + bool tell = false; + + explicit MgrCommand(ceph_tid_t t) : CommandOp(t) {} + MgrCommand() : CommandOp() {} +}; + +class MgrClient : public Dispatcher +{ +protected: + CephContext *cct; + MgrMap map; + Messenger *msgr; + MonMap *monmap; + + std::unique_ptr<MgrSessionState> session; + + ceph::mutex lock = ceph::make_mutex("MgrClient::lock"); + ceph::condition_variable shutdown_cond; + + uint32_t stats_period = 0; + uint32_t stats_threshold = 0; + SafeTimer timer; + + CommandTable<MgrCommand> command_table; + + using clock_t = ceph::mono_clock; + clock_t::time_point last_connect_attempt; + + uint64_t last_config_bl_version = 0; + + Context *report_callback = nullptr; + Context *connect_retry_callback = nullptr; + + // If provided, use this to compose an MPGStats to send with + // our reports (hook for use by OSD) + std::function<MPGStats*()> pgstats_cb; + std::function<void(const ConfigPayload &)> set_perf_queries_cb; + std::function<MetricPayload()> get_perf_report_cb; + + // for service registration and beacon + bool service_daemon = false; + bool daemon_dirty_status = false; + bool task_dirty_status = false; + bool need_metadata_update = true; + std::string service_name, daemon_name; + std::map<std::string,std::string> daemon_metadata; + std::map<std::string,std::string> daemon_status; + std::map<std::string,std::string> task_status; + std::vector<DaemonHealthMetric> daemon_health_metrics; + + void reconnect(); + void _send_open(); + void _send_update(); + + // In pre-luminous clusters, the ceph-mgr service is absent or optional, + // so we must not block in start_command waiting for it. 
+ bool mgr_optional = false; + +public: + MgrClient(CephContext *cct_, Messenger *msgr_, MonMap *monmap); + + void set_messenger(Messenger *msgr_) { msgr = msgr_; } + + void init(); + void shutdown(); + + void set_mgr_optional(bool optional_) {mgr_optional = optional_;} + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + bool handle_mgr_map(ceph::ref_t<MMgrMap> m); + bool handle_mgr_configure(ceph::ref_t<MMgrConfigure> m); + bool handle_mgr_close(ceph::ref_t<MMgrClose> m); + bool handle_command_reply( + uint64_t tid, + ceph::buffer::list& data, + const std::string& rs, + int r); + + void set_perf_metric_query_cb( + std::function<void(const ConfigPayload &)> cb_set, + std::function<MetricPayload()> cb_get) + { + std::lock_guard l(lock); + set_perf_queries_cb = cb_set; + get_perf_report_cb = cb_get; + } + + void send_pgstats(); + void set_pgstats_cb(std::function<MPGStats*()>&& cb_) + { + std::lock_guard l(lock); + pgstats_cb = std::move(cb_); + } + + int start_command( + const std::vector<std::string>& cmd, const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, std::string *outs, + Context *onfinish); + int start_tell_command( + const std::string& name, + const std::vector<std::string>& cmd, const ceph::buffer::list& inbl, + ceph::buffer::list *outbl, std::string *outs, + Context *onfinish); + + int update_daemon_metadata( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata); + int service_daemon_register( + const std::string& service, + const std::string& name, + const std::map<std::string,std::string>& metadata); + int service_daemon_update_status( + std::map<std::string,std::string>&& status); + int service_daemon_update_task_status( + std::map<std::string,std::string> &&task_status); + void update_daemon_health(std::vector<DaemonHealthMetric>&& metrics); + + bool is_initialized() const { return initialized; } + +private: + void handle_config_payload(const OSDConfigPayload &payload) { + if (set_perf_queries_cb) { + set_perf_queries_cb(payload); + } + } + + void handle_config_payload(const MDSConfigPayload &payload) { + if (set_perf_queries_cb) { + set_perf_queries_cb(payload); + } + } + + void handle_config_payload(const UnknownConfigPayload &payload) { + ceph_abort(); + } + + struct HandlePayloadVisitor : public boost::static_visitor<void> { + MgrClient *mgrc; + + HandlePayloadVisitor(MgrClient *mgrc) + : mgrc(mgrc) { + } + + template <typename ConfigPayload> + inline void operator()(const ConfigPayload &payload) const { + mgrc->handle_config_payload(payload); + } + }; + + void _send_stats(); + void _send_pgstats(); + void _send_report(); + + bool initialized = false; +}; + +#endif diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h new file mode 100644 index 000000000..439f07c67 --- /dev/null +++ b/src/mgr/MgrCommands.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* no guard; may be included multiple times */ + +// see MonCommands.h + +COMMAND("pg stat", "show placement group status.", + "pg", "r") +COMMAND("pg getmap", "get binary pg map to -o/stdout", "pg", "r") + +COMMAND("pg dump " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \ + "show human-readable versions of pg map (only 'all' valid with plain)", 
"pg", "r") +COMMAND("pg dump_json " \ + "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \ + "show human-readable version of pg map in json only",\ + "pg", "r") +COMMAND("pg dump_pools_json", "show pg pools info in json only",\ + "pg", "r") + +COMMAND("pg ls-by-pool " \ + "name=poolstr,type=CephString " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with pool = [poolname]", "pg", "r") +COMMAND("pg ls-by-primary " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with primary = [osd]", "pg", "r") +COMMAND("pg ls-by-osd " \ + "name=osd,type=CephOsdName " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg on osd [osd]", "pg", "r") +COMMAND("pg ls " \ + "name=pool,type=CephInt,req=false " \ + "name=states,type=CephString,n=N,req=false", \ + "list pg with specific pool, osd, state", "pg", "r") +COMMAND("pg dump_stuck " \ + "name=stuckops,type=CephChoices,strings=inactive|unclean|stale|undersized|degraded,n=N,req=false " \ + "name=threshold,type=CephInt,req=false", + "show information about stuck pgs",\ + "pg", "r") +COMMAND("pg debug " \ + "name=debugop,type=CephChoices,strings=unfound_objects_exist|degraded_pgs_exist", \ + "show debug info about pgs", "pg", "r") + +COMMAND("pg scrub name=pgid,type=CephPgid", "start scrub on <pgid>", \ + "pg", "rw") +COMMAND("pg deep-scrub name=pgid,type=CephPgid", "start deep-scrub on <pgid>", \ + "pg", "rw") +COMMAND("pg repair name=pgid,type=CephPgid", "start repair on <pgid>", \ + "pg", "rw") + +COMMAND("pg force-recovery name=pgid,type=CephPgid,n=N", "force recovery of <pgid> first", \ + "pg", "rw") +COMMAND("pg force-backfill name=pgid,type=CephPgid,n=N", "force backfill of <pgid> first", \ + "pg", "rw") +COMMAND("pg cancel-force-recovery name=pgid,type=CephPgid,n=N", "restore normal recovery priority of <pgid>", \ + "pg", "rw") +COMMAND("pg cancel-force-backfill name=pgid,type=CephPgid,n=N", "restore normal backfill priority of <pgid>", \ + "pg", "rw") + +// stuff in osd namespace +COMMAND("osd perf", \ + "print dump of OSD perf summary stats", \ + "osd", \ + "r") +COMMAND("osd df " \ + "name=output_method,type=CephChoices,strings=plain|tree,req=false " \ + "name=filter_by,type=CephChoices,strings=class|name,req=false " \ + "name=filter,type=CephString,req=false", \ + "show OSD utilization", "osd", "r") +COMMAND("osd blocked-by", \ + "print histogram of which OSDs are blocking their peers", \ + "osd", "r") +COMMAND("osd pool stats " \ + "name=pool_name,type=CephPoolname,req=false", + "obtain stats from all pools, or from specified pool", + "osd", "r") +COMMAND("osd pool scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate scrub on pool <who>", \ + "osd", "rw") +COMMAND("osd pool deep-scrub " \ + "name=who,type=CephPoolname,n=N", \ + "initiate deep-scrub on pool <who>", \ + "osd", "rw") +COMMAND("osd pool repair " \ + "name=who,type=CephPoolname,n=N", \ + "initiate repair on pool <who>", \ + "osd", "rw") +COMMAND("osd pool force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "force recovery of specified pool <who> first", \ + "osd", "rw") +COMMAND("osd pool force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "force backfill of specified pool <who> first", \ + "osd", "rw") +COMMAND("osd pool cancel-force-recovery " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool <who>", \ + "osd", "rw") +COMMAND("osd 
pool cancel-force-backfill " \ + "name=who,type=CephPoolname,n=N", \ + "restore normal recovery priority of specified pool <who>", \ + "osd", "rw") +COMMAND("osd reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephBool,req=false",\ + "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephBool,req=false",\ + "dry run of reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "r") +COMMAND("osd reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw") +COMMAND("osd test-reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "r") + +COMMAND("osd destroy " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "mark osd as being destroyed. Keeps the ID intact (allowing reuse), " \ + "but removes cephx keys, config-key data and lockbox keys, "\ + "rendering data permanently unreadable.", \ + "osd", "rw") +COMMAND("osd purge " \ + "name=id,type=CephOsdName " \ + "name=force,type=CephBool,req=false " + // backward compat synonym for --force + "name=yes_i_really_mean_it,type=CephBool,req=false", \ + "purge all osd data from the monitors including the OSD id " \ + "and CRUSH position", \ + "osd", "rw") + +COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N", + "check whether osd(s) can be safely destroyed without reducing data durability", + "osd", "r") +COMMAND("osd ok-to-stop name=ids,type=CephString,n=N "\ + "name=max,type=CephInt,req=false", + "check whether osd(s) can be safely stopped without reducing immediate"\ + " data availability", "osd", "r") + +COMMAND("osd scrub " \ + "name=who,type=CephString", \ + "initiate scrub on osd <who>, or use <all|any> to scrub all", \ + "osd", "rw") +COMMAND("osd deep-scrub " \ + "name=who,type=CephString", \ + "initiate deep scrub on osd <who>, or use <all|any> to deep scrub all", \ + "osd", "rw") +COMMAND("osd repair " \ + "name=who,type=CephString", \ + "initiate repair on osd <who>, or use <all|any> to repair all", \ + "osd", "rw") + +COMMAND("service dump", + "dump service map", "service", "r") +COMMAND("service status", + "dump service state", "service", "r") + +COMMAND("config show " \ + "name=who,type=CephString name=key,type=CephString,req=false", + "Show running configuration", + "mgr", "r") +COMMAND("config show-with-defaults " \ + "name=who,type=CephString", + "Show running configuration (including compiled-in defaults)", + "mgr", "r") + +COMMAND("device ls", + "Show devices", + "mgr", "r") +COMMAND("device info name=devid,type=CephString", + "Show information about a device", + "mgr", "r") 
+COMMAND("device ls-by-daemon name=who,type=CephString", + "Show devices associated with a daemon", + "mgr", "r") +COMMAND("device ls-by-host name=host,type=CephString", + "Show devices on a host", + "mgr", "r") +COMMAND("device set-life-expectancy name=devid,type=CephString "\ + "name=from,type=CephString "\ + "name=to,type=CephString,req=false", + "Set predicted device life expectancy", + "mgr", "rw") +COMMAND("device rm-life-expectancy name=devid,type=CephString", + "Clear predicted device life expectancy", + "mgr", "rw") diff --git a/src/mgr/MgrContext.h b/src/mgr/MgrContext.h new file mode 100644 index 000000000..a5490bef3 --- /dev/null +++ b/src/mgr/MgrContext.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MGR_CONTEXT_H_ +#define MGR_CONTEXT_H_ + +#include <memory> + +#include "common/ceph_json.h" +#include "common/Cond.h" +#include "mon/MonClient.h" + +class Command +{ +protected: + C_SaferCond cond; +public: + ceph::buffer::list outbl; + std::string outs; + int r; + + void run(MonClient *monc, const std::string &command) + { + monc->start_mon_command({command}, {}, + &outbl, &outs, &cond); + } + + void run(MonClient *monc, const std::string &command, const ceph::buffer::list &inbl) + { + monc->start_mon_command({command}, inbl, + &outbl, &outs, &cond); + } + + virtual void wait() + { + r = cond.wait(); + } + + virtual ~Command() {} +}; + + +class JSONCommand : public Command +{ +public: + json_spirit::mValue json_result; + + void wait() override + { + Command::wait(); + + if (r == 0) { + bool read_ok = json_spirit::read( + outbl.to_str(), json_result); + if (!read_ok) { + r = -EINVAL; + } + } + } +}; + +#endif + diff --git a/src/mgr/MgrSession.h b/src/mgr/MgrSession.h new file mode 100644 index 000000000..40b50220b --- /dev/null +++ b/src/mgr/MgrSession.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_MGRSESSION_H +#define CEPH_MGR_MGRSESSION_H + +#include "common/RefCountedObj.h" +#include "common/entity_name.h" +#include "msg/msg_types.h" +#include "MgrCap.h" + + +/** + * Session state associated with the Connection. 
+ */ +struct MgrSession : public RefCountedObject { + uint64_t global_id = 0; + EntityName entity_name; + entity_inst_t inst; + + int osd_id = -1; ///< osd id (if an osd) + + MgrCap caps; + + std::set<std::string> declared_types; + + const entity_addr_t& get_peer_addr() const { + return inst.addr; + } + +private: + FRIEND_MAKE_REF(MgrSession); + explicit MgrSession(CephContext *cct) : RefCountedObject(cct) {} + ~MgrSession() override = default; +}; + +using MgrSessionRef = ceph::ref_t<MgrSession>; + + +#endif diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc new file mode 100644 index 000000000..764929843 --- /dev/null +++ b/src/mgr/MgrStandby.cc @@ -0,0 +1,493 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include <Python.h> +#include <boost/algorithm/string/replace.hpp> + +#include "common/errno.h" +#include "common/signal.h" +#include "include/compat.h" + +#include "include/stringify.h" +#include "global/global_context.h" +#include "global/signal_handler.h" + +#include "mgr/MgrContext.h" +#include "mgr/mgr_commands.h" +#include "mgr/mgr_perf_counters.h" + +#include "messages/MMgrBeacon.h" +#include "messages/MMgrMap.h" +#include "Mgr.h" + +#include "MgrStandby.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + +using std::map; +using std::string; +using std::vector; + +MgrStandby::MgrStandby(int argc, const char **argv) : + Dispatcher(g_ceph_context), + monc{g_ceph_context, poolctx}, + client_messenger(Messenger::create( + g_ceph_context, + cct->_conf.get_val<std::string>("ms_public_type").empty() ? 
+ cct->_conf.get_val<std::string>("ms_type") : cct->_conf.get_val<std::string>("ms_public_type"), + entity_name_t::MGR(), + "mgr", + Messenger::get_pid_nonce())), + objecter{g_ceph_context, client_messenger.get(), &monc, poolctx}, + client{client_messenger.get(), &monc, &objecter}, + mgrc(g_ceph_context, client_messenger.get(), &monc.monmap), + log_client(g_ceph_context, client_messenger.get(), &monc.monmap, LogClient::NO_FLAGS), + clog(log_client.create_channel(CLOG_CHANNEL_CLUSTER)), + audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)), + finisher(g_ceph_context, "MgrStandby", "mgrsb-fin"), + timer(g_ceph_context, lock), + py_module_registry(clog), + active_mgr(nullptr), + orig_argc(argc), + orig_argv(argv), + available_in_map(false) +{ +} + +MgrStandby::~MgrStandby() = default; + +const char** MgrStandby::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "mgr_standby_modules", + "host", + "fsid", + NULL + }; + return KEYS; +} + +void MgrStandby::handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + _update_log_config(); + } + if (changed.count("mgr_standby_modules") && !active_mgr) { + if (g_conf().get_val<bool>("mgr_standby_modules") != py_module_registry.have_standby_modules()) { + dout(1) << "mgr_standby_modules now " + << (int)g_conf().get_val<bool>("mgr_standby_modules") + << ", standby modules are " + << (py_module_registry.have_standby_modules() ? 
"":"not ") + << "active, respawning" + << dendl; + respawn(); + } + } +} + +int MgrStandby::init() +{ + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + + cct->_conf.add_observer(this); + + std::lock_guard l(lock); + + // Start finisher + finisher.start(); + + // Initialize Messenger + client_messenger->add_dispatcher_tail(this); + client_messenger->add_dispatcher_head(&objecter); + client_messenger->add_dispatcher_tail(&client); + client_messenger->start(); + + poolctx.start(2); + + // Initialize MonClient + if (monc.build_initial_monmap() < 0) { + client_messenger->shutdown(); + client_messenger->wait(); + return -1; + } + + monc.sub_want("mgrmap", 0, 0); + + monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD + |CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR); + monc.set_messenger(client_messenger.get()); + + // We must register our config callback before calling init(), so + // that we see the initial configuration message + monc.register_config_callback([this](const std::string &k, const std::string &v){ + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "config_callback: " << k << " : " << v << dendl; + dout(10) << "config_callback: " << k << " : " << dendl; + if (k.substr(0, 4) == "mgr/") { + py_module_registry.handle_config(k, v); + return true; + } + return false; + }); + monc.register_config_notify_callback([this]() { + py_module_registry.handle_config_notify(); + }); + dout(4) << "Registered monc callback" << dendl; + + int r = monc.init(); + if (r < 0) { + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + mgrc.init(); + client_messenger->add_dispatcher_tail(&mgrc); + + r = monc.authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify a mgr ID with a valid keyring?" << dendl; + monc.shutdown(); + client_messenger->shutdown(); + client_messenger->wait(); + return r; + } + // only forward monmap updates after authentication finishes, otherwise + // monc.authenticate() will be waiting for MgrStandy::ms_dispatch() + // to acquire the lock forever, as it is already locked in the beginning of + // this method. + monc.set_passthrough_monmap(); + + client_t whoami = monc.get_global_id(); + client_messenger->set_myname(entity_name_t::MGR(whoami.v)); + monc.set_log_client(&log_client); + _update_log_config(); + objecter.set_client_incarnation(0); + objecter.init(); + objecter.start(); + client.init(); + timer.init(); + + py_module_registry.init(); + mgr_perf_start(g_ceph_context); + + + tick(); + + dout(4) << "Complete." << dendl; + return 0; +} + +void MgrStandby::send_beacon() +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + dout(20) << state_str() << dendl; + + auto modules = py_module_registry.get_modules(); + + // Construct a list of the info about each loaded module + // which we will transmit to the monitor. 
+ std::vector<MgrMap::ModuleInfo> module_info; + for (const auto &module : modules) { + MgrMap::ModuleInfo info; + info.name = module->get_name(); + info.error_string = module->get_error_string(); + info.can_run = module->get_can_run(); + info.module_options = module->get_options(); + module_info.push_back(std::move(info)); + } + + auto clients = py_module_registry.get_clients(); + for (const auto& client : clients) { + dout(15) << "noting RADOS client for blocklist: " << client << dendl; + } + + // Whether I think I am available (request MgrMonitor to set me + // as available in the map) + bool available = active_mgr != nullptr && active_mgr->is_initialized(); + + auto addrs = available ? active_mgr->get_server_addrs() : entity_addrvec_t(); + dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl; + + map<string,string> metadata; + metadata["addr"] = client_messenger->get_myaddr_legacy().ip_only_to_str(); + metadata["addrs"] = stringify(client_messenger->get_myaddrs()); + collect_sys_info(&metadata, g_ceph_context); + + auto m = ceph::make_message<MMgrBeacon>(monc.get_fsid(), + monc.get_global_id(), + g_conf()->name.get_id(), + addrs, + available, + std::move(module_info), + std::move(metadata), + std::move(clients), + CEPH_FEATURES_ALL); + + if (available) { + if (!available_in_map) { + // We are informing the mon that we are done initializing: inform + // it of our command set. This has to happen after init() because + // it needs the python modules to have loaded. + std::vector<MonCommand> commands = mgr_commands; + std::vector<MonCommand> py_commands = py_module_registry.get_commands(); + commands.insert(commands.end(), py_commands.begin(), py_commands.end()); + if (monc.monmap.min_mon_release < ceph_release_t::quincy) { + dout(10) << " stripping out positional=false quincy-ism" << dendl; + for (auto& i : commands) { + boost::replace_all(i.cmdstring, ",positional=false", ""); + } + } + m->set_command_descs(commands); + dout(4) << "going active, including " << m->get_command_descs().size() + << " commands in beacon" << dendl; + } + + m->set_services(active_mgr->get_services()); + } + + monc.send_mon_message(std::move(m)); +} + +void MgrStandby::tick() +{ + dout(10) << __func__ << dendl; + send_beacon(); + + timer.add_event_after( + g_conf().get_val<std::chrono::seconds>("mgr_tick_period").count(), + new LambdaContext([this](int r){ + tick(); + } + )); +} + +void MgrStandby::shutdown() +{ + finisher.queue(new LambdaContext([&](int) { + std::lock_guard l(lock); + + dout(4) << "Shutting down" << dendl; + + py_module_registry.shutdown(); + // stop sending beacon first, I use monc to talk with monitors + timer.shutdown(); + // client uses monc and objecter + client.shutdown(); + mgrc.shutdown(); + // Stop asio threads, so leftover events won't call into shut down + // monclient/objecter. 
+ poolctx.finish(); + // stop monc, so mon won't be able to instruct me to shutdown/activate after + // the active_mgr is stopped + monc.shutdown(); + if (active_mgr) { + active_mgr->shutdown(); + } + // objecter is used by monc and active_mgr + objecter.shutdown(); + // client_messenger is used by all of them, so stop it in the end + client_messenger->shutdown(); + })); + + // Then stop the finisher to ensure its enqueued contexts aren't going + // to touch references to the things we're about to tear down + finisher.wait_for_empty(); + finisher.stop(); + mgr_perf_stop(g_ceph_context); +} + +void MgrStandby::respawn() +{ + // --- WARNING TO FUTURE COPY/PASTERS --- + // You must also add a call like + // + // ceph_pthread_setname(pthread_self(), "ceph-mgr"); + // + // to main() so that /proc/$pid/stat field 2 contains "(ceph-mgr)" + // instead of "(exe)", so that killall (and log rotation) will work. + + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i<orig_argc; i++) { + new_argv[i] = (char *)orig_argv[i]; + dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl; + } + new_argv[orig_argc] = NULL; + + /* Determine the path to our executable, test if Linux /proc/self/exe exists. + * This allows us to exec the same executable even if it has since been + * unlinked. + */ + char exe_path[PATH_MAX] = ""; + if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) == -1) { + /* Print CWD for the user's interest */ + char buf[PATH_MAX]; + char *cwd = getcwd(buf, sizeof(buf)); + ceph_assert(cwd); + dout(1) << " cwd " << cwd << dendl; + + /* Fall back to a best-effort: just running in our CWD */ + strncpy(exe_path, orig_argv[0], PATH_MAX-1); + } else { + dout(1) << "respawning with exe " << exe_path << dendl; + strcpy(exe_path, PROCPREFIX "/proc/self/exe"); + } + + dout(1) << " exe_path " << exe_path << dendl; + + unblock_all_signals(NULL); + execv(exe_path, new_argv); + + derr << "respawn execv " << orig_argv[0] + << " failed with " << cpp_strerror(errno) << dendl; + ceph_abort(); +} + +void MgrStandby::_update_log_config() +{ + clog->parse_client_options(cct); + audit_clog->parse_client_options(cct); +} + +void MgrStandby::handle_mgr_map(ref_t<MMgrMap> mmap) +{ + auto &map = mmap->get_map(); + dout(4) << "received map epoch " << map.get_epoch() << dendl; + const bool active_in_map = map.active_gid == monc.get_global_id(); + dout(4) << "active in map: " << active_in_map + << " active is " << map.active_gid << dendl; + + // PyModuleRegistry may ask us to respawn if it sees that + // this MgrMap is changing its set of enabled modules + bool need_respawn = py_module_registry.handle_mgr_map(map); + if (need_respawn) { + dout(1) << "respawning because set of enabled modules changed!" << dendl; + respawn(); + } + + if (active_in_map) { + if (!active_mgr) { + dout(1) << "Activating!" << dendl; + active_mgr.reset(new Mgr(&monc, map, &py_module_registry, + client_messenger.get(), &objecter, + &client, clog, audit_clog)); + active_mgr->background_init(new LambdaContext( + [this](int r){ + // Advertise our active-ness ASAP instead of waiting for + // next tick. 
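+          // send_beacon() asserts that MgrStandby::lock is held.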
+ std::lock_guard l(lock); + send_beacon(); + })); + dout(1) << "I am now activating" << dendl; + } else { + dout(10) << "I was already active" << dendl; + bool need_respawn = active_mgr->got_mgr_map(map); + if (need_respawn) { + respawn(); + } + } + + if (!available_in_map && map.get_available()) { + dout(4) << "Map now says I am available" << dendl; + available_in_map = true; + } + } else if (active_mgr != nullptr) { + derr << "I was active but no longer am" << dendl; + respawn(); + } else { + if (map.active_gid != 0 && map.active_name != g_conf()->name.get_id()) { + // I am the standby and someone else is active, start modules + // in standby mode to do redirects if needed + if (!py_module_registry.is_standby_running() && + g_conf().get_val<bool>("mgr_standby_modules")) { + py_module_registry.standby_start(monc, finisher); + } + } + } +} + +bool MgrStandby::ms_dispatch2(const ref_t<Message>& m) +{ + std::lock_guard l(lock); + dout(10) << state_str() << " " << *m << dendl; + + if (m->get_type() == MSG_MGR_MAP) { + handle_mgr_map(ref_cast<MMgrMap>(m)); + } + bool handled = false; + if (active_mgr) { + auto am = active_mgr; + lock.unlock(); + handled = am->ms_dispatch2(m); + lock.lock(); + } + if (m->get_type() == MSG_MGR_MAP) { + // let this pass through for mgrc + handled = false; + } + return handled; +} + + +bool MgrStandby::ms_handle_refused(Connection *con) +{ + // do nothing for now + return false; +} + +int MgrStandby::main(vector<const char *> args) +{ + client_messenger->wait(); + + // Disable signal handlers + unregister_async_signal_handler(SIGHUP, sighup_handler); + shutdown_async_signal_handler(); + + return 0; +} + + +std::string MgrStandby::state_str() +{ + if (active_mgr == nullptr) { + return "standby"; + } else if (active_mgr->is_initialized()) { + return "active"; + } else { + return "active (starting)"; + } +} diff --git a/src/mgr/MgrStandby.h b/src/mgr/MgrStandby.h new file mode 100644 index 000000000..0f06e3074 --- /dev/null +++ b/src/mgr/MgrStandby.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#ifndef MGR_STANDBY_H_ +#define MGR_STANDBY_H_ + +#include "auth/Auth.h" +#include "common/async/context_pool.h" +#include "common/Finisher.h" +#include "common/Timer.h" +#include "common/LogClient.h" + +#include "client/Client.h" +#include "mon/MonClient.h" +#include "osdc/Objecter.h" +#include "PyModuleRegistry.h" +#include "MgrClient.h" + +class MMgrMap; +class Mgr; +class PyModuleConfig; + +class MgrStandby : public Dispatcher, + public md_config_obs_t { +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + +protected: + ceph::async::io_context_pool poolctx; + MonClient monc; + std::unique_ptr<Messenger> client_messenger; + Objecter objecter; + Client client; + + MgrClient mgrc; + + LogClient log_client; + LogChannelRef clog, audit_clog; + + ceph::mutex lock = ceph::make_mutex("MgrStandby::lock"); + Finisher finisher; + SafeTimer timer; + + PyModuleRegistry py_module_registry; + std::shared_ptr<Mgr> active_mgr; + + int orig_argc; + const char **orig_argv; + + std::string state_str(); + + void handle_mgr_map(ceph::ref_t<MMgrMap> m); + void _update_log_config(); + void send_beacon(); + + bool available_in_map; + +public: + MgrStandby(int argc, const char **argv); + ~MgrStandby() override; + + bool ms_dispatch2(const ceph::ref_t<Message>& m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + int init(); + void shutdown(); + void respawn(); + int main(std::vector<const char *> args); + void tick(); +}; + +#endif + diff --git a/src/mgr/OSDPerfMetricCollector.cc b/src/mgr/OSDPerfMetricCollector.cc new file mode 100644 index 000000000..eb548ce70 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/debug.h" +#include "common/errno.h" + +#include "messages/MMgrReport.h" +#include "OSDPerfMetricCollector.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr.osd_perf_metric_collector " << __func__ << " " + +OSDPerfMetricCollector::OSDPerfMetricCollector(MetricListener &listener) + : MetricCollector<OSDPerfMetricQuery, + OSDPerfMetricLimit, + OSDPerfMetricKey, + OSDPerfMetricReport>(listener) { +} + +void OSDPerfMetricCollector::process_reports(const MetricPayload &payload) { + const std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = + boost::get<OSDMetricPayload>(payload).report; + + std::lock_guard locker(lock); + process_reports_generic( + reports, [](PerformanceCounter *counter, const PerformanceCounter &update) { + counter->first += update.first; + counter->second += update.second; + }); +} + +int OSDPerfMetricCollector::get_counters(PerfCollector *collector) { + OSDPerfCollector *c = static_cast<OSDPerfCollector *>(collector); + + std::lock_guard locker(lock); + return get_counters_generic(c->query_id, &c->counters); +} diff --git a/src/mgr/OSDPerfMetricCollector.h b/src/mgr/OSDPerfMetricCollector.h new file mode 100644 index 000000000..c531dbf63 --- /dev/null +++ b/src/mgr/OSDPerfMetricCollector.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_COLLECTOR_H_ +#define 
OSD_PERF_METRIC_COLLECTOR_H_ + +#include "mgr/MetricCollector.h" +#include "mgr/OSDPerfMetricTypes.h" + +/** + * OSD performance query class. + */ +class OSDPerfMetricCollector + : public MetricCollector<OSDPerfMetricQuery, OSDPerfMetricLimit, OSDPerfMetricKey, + OSDPerfMetricReport> { +public: + OSDPerfMetricCollector(MetricListener &listener); + + void process_reports(const MetricPayload &payload) override; + int get_counters(PerfCollector *collector) override; +}; + +#endif // OSD_PERF_METRIC_COLLECTOR_H_ diff --git a/src/mgr/OSDPerfMetricTypes.cc b/src/mgr/OSDPerfMetricTypes.cc new file mode 100644 index 000000000..bce95e0ae --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.cc @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/OSDPerfMetricTypes.h" + +#include <ostream> + +using ceph::bufferlist; + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d) { + switch(d.type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + os << "client_id"; + break; + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + os << "client_address"; + break; + case OSDPerfMetricSubKeyType::POOL_ID: + os << "pool_id"; + break; + case OSDPerfMetricSubKeyType::NAMESPACE: + os << "namespace"; + break; + case OSDPerfMetricSubKeyType::OSD_ID: + os << "osd_id"; + break; + case OSDPerfMetricSubKeyType::PG_ID: + os << "pg_id"; + break; + case OSDPerfMetricSubKeyType::OBJECT_NAME: + os << "object_name"; + break; + case OSDPerfMetricSubKeyType::SNAP_ID: + os << "snap_id"; + break; + default: + os << "unknown (" << static_cast<int>(d.type) << ")"; + } + return os << "~/" << d.regex_str << "/"; +} + +void PerformanceCounterDescriptor::pack_counter(const PerformanceCounter &c, + bufferlist *bl) const { + using ceph::encode; + encode(c.first, *bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + encode(c.second, *bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +void PerformanceCounterDescriptor::unpack_counter( + bufferlist::const_iterator& bl, PerformanceCounter *c) const { + using ceph::decode; + decode(c->first, bl); + switch(type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + break; + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + decode(c->second, bl); + break; + default: + ceph_abort_msg("unknown counter type"); + } +} + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d) { + switch(d.type) { + case PerformanceCounterType::OPS: + return os << "ops"; + case PerformanceCounterType::WRITE_OPS: + return os << "write ops"; + case PerformanceCounterType::READ_OPS: + return os << "read ops"; + case PerformanceCounterType::BYTES: + return os << "bytes"; + case PerformanceCounterType::WRITE_BYTES: + return os << "write bytes"; + case PerformanceCounterType::READ_BYTES: + return os << "read bytes"; + case PerformanceCounterType::LATENCY: 
+ return os << "latency"; + case PerformanceCounterType::WRITE_LATENCY: + return os << "write latency"; + case PerformanceCounterType::READ_LATENCY: + return os << "read latency"; + default: + return os << "unknown (" << static_cast<int>(d.type) << ")"; + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit) { + return os << "{order_by=" << limit.order_by << ", max_count=" + << limit.max_count << "}"; +} + +void OSDPerfMetricQuery::pack_counters(const PerformanceCounters &counters, + bufferlist *bl) const { + auto it = counters.begin(); + for (auto &descriptor : performance_counter_descriptors) { + if (it == counters.end()) { + descriptor.pack_counter(PerformanceCounter(), bl); + } else { + descriptor.pack_counter(*it, bl); + it++; + } + } +} + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query) { + return os << "{key=" << query.key_descriptor << ", counters=" + << query.performance_counter_descriptors << "}"; +} diff --git a/src/mgr/OSDPerfMetricTypes.h b/src/mgr/OSDPerfMetricTypes.h new file mode 100644 index 000000000..1b5904e13 --- /dev/null +++ b/src/mgr/OSDPerfMetricTypes.h @@ -0,0 +1,360 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef OSD_PERF_METRIC_H_ +#define OSD_PERF_METRIC_H_ + +#include "include/denc.h" +#include "include/stringify.h" + +#include "mgr/Types.h" + +#include <regex> + +typedef std::vector<std::string> OSDPerfMetricSubKey; // array of regex match +typedef std::vector<OSDPerfMetricSubKey> OSDPerfMetricKey; + +enum class OSDPerfMetricSubKeyType : uint8_t { + CLIENT_ID = 0, + CLIENT_ADDRESS = 1, + POOL_ID = 2, + NAMESPACE = 3, + OSD_ID = 4, + PG_ID = 5, + OBJECT_NAME = 6, + SNAP_ID = 7, +}; + +struct OSDPerfMetricSubKeyDescriptor { + OSDPerfMetricSubKeyType type = static_cast<OSDPerfMetricSubKeyType>(-1); + std::string regex_str; + std::regex regex; + + bool is_supported() const { + switch (type) { + case OSDPerfMetricSubKeyType::CLIENT_ID: + case OSDPerfMetricSubKeyType::CLIENT_ADDRESS: + case OSDPerfMetricSubKeyType::POOL_ID: + case OSDPerfMetricSubKeyType::NAMESPACE: + case OSDPerfMetricSubKeyType::OSD_ID: + case OSDPerfMetricSubKeyType::PG_ID: + case OSDPerfMetricSubKeyType::OBJECT_NAME: + case OSDPerfMetricSubKeyType::SNAP_ID: + return true; + default: + return false; + } + } + + OSDPerfMetricSubKeyDescriptor() { + } + + OSDPerfMetricSubKeyDescriptor(OSDPerfMetricSubKeyType type, + const std::string regex) + : type(type), regex_str(regex) { + } + + bool operator<(const OSDPerfMetricSubKeyDescriptor &other) const { + if (type < other.type) { + return true; + } + if (type > other.type) { + return false; + } + return regex_str < other.regex_str; + } + + DENC(OSDPerfMetricSubKeyDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.regex_str, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricSubKeyDescriptor) + +std::ostream& operator<<(std::ostream& os, + const OSDPerfMetricSubKeyDescriptor &d); + +typedef std::vector<OSDPerfMetricSubKeyDescriptor> OSDPerfMetricKeyDescriptor; + +template<> +struct denc_traits<OSDPerfMetricKeyDescriptor> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const OSDPerfMetricKeyDescriptor& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } 
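+  // encode() below writes the descriptor vector as a varint element count
+  // followed by each sub-key descriptor; decode() re-validates every entry
+  // and clears the whole vector if it sees an unsupported sub-key type, a
+  // regex that fails to compile, or a regex without a capture group.
+  // e.g. (illustrative regexes only) a key grouping ops by pool and object
+  // name could be described as:
+  //   {{OSDPerfMetricSubKeyType::POOL_ID,     "(\\d+)"},
+  //    {OSDPerfMetricSubKeyType::OBJECT_NAME, "(rbd_data\\..*)"}}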
+ static void encode(const OSDPerfMetricKeyDescriptor& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(OSDPerfMetricKeyDescriptor& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + OSDPerfMetricSubKeyDescriptor d; + denc(d, p); + if (!d.is_supported()) { + v.clear(); + return; + } + try { + d.regex = d.regex_str.c_str(); + } catch (const std::regex_error& e) { + v.clear(); + return; + } + if (d.regex.mark_count() == 0) { + v.clear(); + return; + } + v.push_back(std::move(d)); + } + } +}; + +enum class PerformanceCounterType : uint8_t { + OPS = 0, + WRITE_OPS = 1, + READ_OPS = 2, + BYTES = 3, + WRITE_BYTES = 4, + READ_BYTES = 5, + LATENCY = 6, + WRITE_LATENCY = 7, + READ_LATENCY = 8, +}; + +struct PerformanceCounterDescriptor { + PerformanceCounterType type = static_cast<PerformanceCounterType>(-1); + + bool is_supported() const { + switch (type) { + case PerformanceCounterType::OPS: + case PerformanceCounterType::WRITE_OPS: + case PerformanceCounterType::READ_OPS: + case PerformanceCounterType::BYTES: + case PerformanceCounterType::WRITE_BYTES: + case PerformanceCounterType::READ_BYTES: + case PerformanceCounterType::LATENCY: + case PerformanceCounterType::WRITE_LATENCY: + case PerformanceCounterType::READ_LATENCY: + return true; + default: + return false; + } + } + + PerformanceCounterDescriptor() { + } + + PerformanceCounterDescriptor(PerformanceCounterType type) : type(type) { + } + + bool operator<(const PerformanceCounterDescriptor &other) const { + return type < other.type; + } + + bool operator==(const PerformanceCounterDescriptor &other) const { + return type == other.type; + } + + bool operator!=(const PerformanceCounterDescriptor &other) const { + return type != other.type; + } + + DENC(PerformanceCounterDescriptor, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + DENC_FINISH(p); + } + + void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const; + void unpack_counter(ceph::buffer::list::const_iterator& bl, + PerformanceCounter *c) const; +}; +WRITE_CLASS_DENC(PerformanceCounterDescriptor) + +std::ostream& operator<<(std::ostream& os, + const PerformanceCounterDescriptor &d); + +typedef std::vector<PerformanceCounterDescriptor> PerformanceCounterDescriptors; + +template<> +struct denc_traits<PerformanceCounterDescriptors> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const PerformanceCounterDescriptors& v, size_t& p) { + p += sizeof(uint32_t); + const auto size = v.size(); + if (size) { + size_t per = 0; + denc(v.front(), per); + p += per * size; + } + } + static void encode(const PerformanceCounterDescriptors& v, + ceph::buffer::list::contiguous_appender& p) { + denc_varint(v.size(), p); + for (auto& i : v) { + denc(i, p); + } + } + static void decode(PerformanceCounterDescriptors& v, + ceph::buffer::ptr::const_iterator& p) { + unsigned num; + denc_varint(num, p); + v.clear(); + v.reserve(num); + for (unsigned i=0; i < num; ++i) { + PerformanceCounterDescriptor d; + denc(d, p); + if (d.is_supported()) { + v.push_back(std::move(d)); + } + } + } +}; + +struct OSDPerfMetricLimit { + PerformanceCounterDescriptor order_by; + uint64_t max_count = 0; + + OSDPerfMetricLimit() { + } + + OSDPerfMetricLimit(const 
PerformanceCounterDescriptor &order_by, + uint64_t max_count) + : order_by(order_by), max_count(max_count) { + } + + bool operator<(const OSDPerfMetricLimit &other) const { + if (order_by != other.order_by) { + return order_by < other.order_by; + } + return max_count < other.max_count; + } + + DENC(OSDPerfMetricLimit, v, p) { + DENC_START(1, 1, p); + denc(v.order_by, p); + denc(v.max_count, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricLimit) + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricLimit &limit); + +typedef std::set<OSDPerfMetricLimit> OSDPerfMetricLimits; + +struct OSDPerfMetricQuery { + bool operator<(const OSDPerfMetricQuery &other) const { + if (key_descriptor < other.key_descriptor) { + return true; + } + if (key_descriptor > other.key_descriptor) { + return false; + } + return (performance_counter_descriptors < + other.performance_counter_descriptors); + } + + OSDPerfMetricQuery() { + } + + OSDPerfMetricQuery( + const OSDPerfMetricKeyDescriptor &key_descriptor, + const PerformanceCounterDescriptors &performance_counter_descriptors) + : key_descriptor(key_descriptor), + performance_counter_descriptors(performance_counter_descriptors) { + } + + template <typename L> + bool get_key(L&& get_sub_key, OSDPerfMetricKey *key) const { + for (auto &sub_key_descriptor : key_descriptor) { + OSDPerfMetricSubKey sub_key; + if (!get_sub_key(sub_key_descriptor, &sub_key)) { + return false; + } + key->push_back(sub_key); + } + return true; + } + + DENC(OSDPerfMetricQuery, v, p) { + DENC_START(1, 1, p); + denc(v.key_descriptor, p); + denc(v.performance_counter_descriptors, p); + DENC_FINISH(p); + } + + void get_performance_counter_descriptors( + PerformanceCounterDescriptors *descriptors) const { + *descriptors = performance_counter_descriptors; + } + + template <typename L> + void update_counters(L &&update_counter, + PerformanceCounters *counters) const { + auto it = counters->begin(); + for (auto &descriptor : performance_counter_descriptors) { + // TODO: optimize + if (it == counters->end()) { + counters->push_back(PerformanceCounter()); + it = std::prev(counters->end()); + } + update_counter(descriptor, &(*it)); + it++; + } + } + + void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; + + OSDPerfMetricKeyDescriptor key_descriptor; + PerformanceCounterDescriptors performance_counter_descriptors; +}; +WRITE_CLASS_DENC(OSDPerfMetricQuery) + +struct OSDPerfCollector : PerfCollector { + std::map<OSDPerfMetricKey, PerformanceCounters> counters; + + OSDPerfCollector(MetricQueryID query_id) + : PerfCollector(query_id) { + } +}; + +std::ostream& operator<<(std::ostream& os, const OSDPerfMetricQuery &query); + +struct OSDPerfMetricReport { + PerformanceCounterDescriptors performance_counter_descriptors; + std::map<OSDPerfMetricKey, ceph::buffer::list> group_packed_performance_counters; + + DENC(OSDPerfMetricReport, v, p) { + DENC_START(1, 1, p); + denc(v.performance_counter_descriptors, p); + denc(v.group_packed_performance_counters, p); + DENC_FINISH(p); + } +}; +WRITE_CLASS_DENC(OSDPerfMetricReport) + +#endif // OSD_PERF_METRIC_H_ + diff --git a/src/mgr/PyFormatter.cc b/src/mgr/PyFormatter.cc new file mode 100644 index 000000000..6a7f3e982 --- /dev/null +++ b/src/mgr/PyFormatter.cc @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat Inc + * + * Author: John Spray <john.spray@redhat.com> 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "PyFormatter.h" +#include <fstream> + +#define LARGE_SIZE 1024 + + +void PyFormatter::open_array_section(std::string_view name) +{ + PyObject *list = PyList_New(0); + dump_pyobject(name, list); + stack.push(cursor); + cursor = list; +} + +void PyFormatter::open_object_section(std::string_view name) +{ + PyObject *dict = PyDict_New(); + dump_pyobject(name, dict); + stack.push(cursor); + cursor = dict; +} + +void PyFormatter::dump_null(std::string_view name) +{ + dump_pyobject(name, Py_None); +} + +void PyFormatter::dump_unsigned(std::string_view name, uint64_t u) +{ + PyObject *p = PyLong_FromUnsignedLong(u); + ceph_assert(p); + dump_pyobject(name, p); +} + +void PyFormatter::dump_int(std::string_view name, int64_t u) +{ + PyObject *p = PyLong_FromLongLong(u); + ceph_assert(p); + dump_pyobject(name, p); +} + +void PyFormatter::dump_float(std::string_view name, double d) +{ + dump_pyobject(name, PyFloat_FromDouble(d)); +} + +void PyFormatter::dump_string(std::string_view name, std::string_view s) +{ + dump_pyobject(name, PyUnicode_FromString(s.data())); +} + +void PyFormatter::dump_bool(std::string_view name, bool b) +{ + if (b) { + Py_INCREF(Py_True); + dump_pyobject(name, Py_True); + } else { + Py_INCREF(Py_False); + dump_pyobject(name, Py_False); + } +} + +std::ostream& PyFormatter::dump_stream(std::string_view name) +{ + // Give the caller an ostream, construct a PyString, + // and remember the association between the two. On flush, + // we'll read from the ostream into the PyString + auto ps = std::make_shared<PendingStream>(); + ps->cursor = cursor; + ps->name = name; + + pending_streams.push_back(ps); + + return ps->stream; +} + +void PyFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) +{ + char buf[LARGE_SIZE]; + vsnprintf(buf, LARGE_SIZE, fmt, ap); + + dump_pyobject(name, PyUnicode_FromString(buf)); +} + +/** + * Steals reference to `p` + */ +void PyFormatter::dump_pyobject(std::string_view name, PyObject *p) +{ + if (PyList_Check(cursor)) { + PyList_Append(cursor, p); + Py_DECREF(p); + } else if (PyDict_Check(cursor)) { + PyObject *key = PyUnicode_DecodeUTF8(name.data(), name.size(), nullptr); + PyDict_SetItem(cursor, key, p); + Py_DECREF(key); + Py_DECREF(p); + } else { + ceph_abort(); + } +} + +void PyFormatter::finish_pending_streams() +{ + for (const auto &i : pending_streams) { + PyObject *tmp_cur = cursor; + cursor = i->cursor; + dump_pyobject( + i->name.c_str(), + PyUnicode_FromString(i->stream.str().c_str())); + cursor = tmp_cur; + } + + pending_streams.clear(); +} + +PyObject* PyJSONFormatter::get() +{ + if(json_formatter::stack_size()) { + close_section(); + } + ceph_assert(!json_formatter::stack_size()); + std::ostringstream ss; + flush(ss); + std::string s = ss.str(); + PyObject* obj = PyBytes_FromStringAndSize(std::move(s.c_str()), s.size()); + return obj; +} diff --git a/src/mgr/PyFormatter.h b/src/mgr/PyFormatter.h new file mode 100644 index 000000000..b45fbf162 --- /dev/null +++ b/src/mgr/PyFormatter.h @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat Inc + * + * Author: John Spray <john.spray@redhat.com> + * + * This is free 
software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PY_FORMATTER_H_ +#define PY_FORMATTER_H_ + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include <stack> +#include <string> +#include <string_view> +#include <sstream> +#include <memory> +#include <list> + +#include "common/Formatter.h" +#include "include/ceph_assert.h" + +class PyFormatter : public ceph::Formatter +{ +public: + PyFormatter (const PyFormatter&) = delete; + PyFormatter& operator= (const PyFormatter&) = delete; + PyFormatter(bool pretty = false, bool array = false) + { + // It is forbidden to instantiate me outside of the GIL, + // because I construct python objects right away + + // Initialise cursor to an empty dict + if (!array) { + root = cursor = PyDict_New(); + } else { + root = cursor = PyList_New(0); + } + } + + ~PyFormatter() override + { + cursor = NULL; + Py_DECREF(root); + root = NULL; + } + + // Obscure, don't care. + void open_array_section_in_ns(std::string_view name, const char *ns) override + {ceph_abort();} + void open_object_section_in_ns(std::string_view name, const char *ns) override + {ceph_abort();} + + void reset() override + { + const bool array = PyList_Check(root); + Py_DECREF(root); + if (array) { + root = cursor = PyList_New(0); + } else { + root = cursor = PyDict_New(); + } + } + + void set_status(int status, const char* status_name) override {} + void output_header() override {}; + void output_footer() override {}; + void enable_line_break() override {}; + + void open_array_section(std::string_view name) override; + void open_object_section(std::string_view name) override; + void close_section() override + { + ceph_assert(cursor != root); + ceph_assert(!stack.empty()); + cursor = stack.top(); + stack.pop(); + } + void dump_bool(std::string_view name, bool b) override; + void dump_null(std::string_view name) override; + void dump_unsigned(std::string_view name, uint64_t u) override; + void dump_int(std::string_view name, int64_t u) override; + void dump_float(std::string_view name, double d) override; + void dump_string(std::string_view name, std::string_view s) override; + std::ostream& dump_stream(std::string_view name) override; + void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override; + + void flush(std::ostream& os) override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + } + + int get_len() const override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + return 0; + } + + void write_raw_data(const char *data) override + { + // This class is not a serializer: this doesn't make sense + ceph_abort(); + } + + PyObject *get() + { + finish_pending_streams(); + + Py_INCREF(root); + return root; + } + + void finish_pending_streams(); + +private: + PyObject *root; + PyObject *cursor; + std::stack<PyObject *> stack; + + void dump_pyobject(std::string_view name, PyObject *p); + + class PendingStream { + public: + PyObject *cursor; + std::string name; + std::stringstream stream; + }; + + std::list<std::shared_ptr<PendingStream> > pending_streams; + +}; + +class PyJSONFormatter : public JSONFormatter { +public: + PyObject *get(); + PyJSONFormatter (const PyJSONFormatter&) = default; + PyJSONFormatter(bool pretty=false, bool is_array=false) : JSONFormatter(pretty) { + 
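+    // Open the root section up front; get() closes it and returns the
+    // serialized JSON as a Python bytes object.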
if(is_array) { + open_array_section(""); + } else { + open_object_section(""); + } +} + +private: + using json_formatter = JSONFormatter; + template <class T> void add_value(std::string_view name, T val); + void add_value(std::string_view name, std::string_view val, bool quoted); +}; + +#endif + diff --git a/src/mgr/PyModule.cc b/src/mgr/PyModule.cc new file mode 100644 index 000000000..084cf3ffc --- /dev/null +++ b/src/mgr/PyModule.cc @@ -0,0 +1,771 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "BaseMgrModule.h" +#include "BaseMgrStandbyModule.h" +#include "PyOSDMap.h" +#include "MgrContext.h" +#include "PyUtil.h" + +#include "PyModule.h" + +#include "include/stringify.h" +#include "common/BackTrace.h" +#include "global/signal_handler.h" + +#include "common/debug.h" +#include "common/errno.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#undef dout_prefix +#define dout_prefix *_dout << "mgr[py] " + +// definition for non-const static member +std::string PyModule::mgr_store_prefix = "mgr/"; + +// Courtesy of http://stackoverflow.com/questions/1418015/how-to-get-python-exception-text +#define BOOST_BIND_GLOBAL_PLACEHOLDERS +// Boost apparently can't be bothered to fix its own usage of its own +// deprecated features. +#include <boost/python/extract.hpp> +#include <boost/python/import.hpp> +#include <boost/python/object.hpp> +#undef BOOST_BIND_GLOBAL_PLACEHOLDERS +#include <boost/algorithm/string/predicate.hpp> +#include "include/ceph_assert.h" // boost clobbers this + + +using std::string; +using std::wstring; + +// decode a Python exception into a string +std::string handle_pyerror( + bool crash_dump, + std::string module, + std::string caller) +{ + using namespace boost::python; + using namespace boost; + + PyObject *exc, *val, *tb; + object formatted_list, formatted; + PyErr_Fetch(&exc, &val, &tb); + PyErr_NormalizeException(&exc, &val, &tb); + handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb)); + + object traceback(import("traceback")); + if (!tb) { + object format_exception_only(traceback.attr("format_exception_only")); + try { + formatted_list = format_exception_only(hexc, hval); + } catch (error_already_set const &) { + // error while processing exception object + // returning only the exception string value + PyObject *name_attr = PyObject_GetAttrString(exc, "__name__"); + std::stringstream ss; + ss << PyUnicode_AsUTF8(name_attr) << ": " << PyUnicode_AsUTF8(val); + Py_XDECREF(name_attr); + ss << "\nError processing exception object: " << peek_pyerror(); + return ss.str(); + } + } else { + object format_exception(traceback.attr("format_exception")); + try { + formatted_list = format_exception(hexc, hval, htb); + } catch (error_already_set const &) { + // error while processing exception object + // returning only the exception string value + PyObject *name_attr = PyObject_GetAttrString(exc, "__name__"); + std::stringstream ss; + ss << PyUnicode_AsUTF8(name_attr) << ": " << PyUnicode_AsUTF8(val); + Py_XDECREF(name_attr); + ss << "\nError processing exception object: " << peek_pyerror(); + return ss.str(); + } + } + formatted = 
str("").join(formatted_list); + + if (!module.empty()) { + std::list<std::string> bt_strings; + std::map<std::string, std::string> extra; + + extra["mgr_module"] = module; + extra["mgr_module_caller"] = caller; + PyObject *name_attr = PyObject_GetAttrString(exc, "__name__"); + extra["mgr_python_exception"] = stringify(PyUnicode_AsUTF8(name_attr)); + Py_XDECREF(name_attr); + + PyObject *l = get_managed_object(formatted_list, boost::python::tag); + if (PyList_Check(l)) { + // skip first line, which is: "Traceback (most recent call last):\n" + for (unsigned i = 1; i < PyList_Size(l); ++i) { + PyObject *val = PyList_GET_ITEM(l, i); + std::string s = PyUnicode_AsUTF8(val); + s.resize(s.size() - 1); // strip off newline character + bt_strings.push_back(s); + } + } + PyBackTrace bt(bt_strings); + + char crash_path[PATH_MAX]; + generate_crash_dump(crash_path, bt, &extra); + } + + return extract<std::string>(formatted); +} + +/** + * Get the single-line exception message, without clearing any + * exception state. + */ +std::string peek_pyerror() +{ + PyObject *ptype, *pvalue, *ptraceback; + PyErr_Fetch(&ptype, &pvalue, &ptraceback); + ceph_assert(ptype); + ceph_assert(pvalue); + PyObject *pvalue_str = PyObject_Str(pvalue); + std::string exc_msg = PyUnicode_AsUTF8(pvalue_str); + Py_DECREF(pvalue_str); + PyErr_Restore(ptype, pvalue, ptraceback); + + return exc_msg; +} + + +namespace { + PyObject* log_write(PyObject*, PyObject* args) { + char* m = nullptr; + if (PyArg_ParseTuple(args, "s", &m)) { + auto len = strlen(m); + if (len && m[len-1] == '\n') { + m[len-1] = '\0'; + } + dout(4) << m << dendl; + } + Py_RETURN_NONE; + } + + PyObject* log_flush(PyObject*, PyObject*){ + Py_RETURN_NONE; + } + + static PyMethodDef log_methods[] = { + {"write", log_write, METH_VARARGS, "write stdout and stderr"}, + {"flush", log_flush, METH_VARARGS, "flush"}, + {nullptr, nullptr, 0, nullptr} + }; + + static PyModuleDef ceph_logger_module = { + PyModuleDef_HEAD_INIT, + "ceph_logger", + nullptr, + -1, + log_methods, + }; +} + +PyModuleConfig::PyModuleConfig() = default; + +PyModuleConfig::PyModuleConfig(PyModuleConfig &mconfig) + : config(mconfig.config) +{} + +PyModuleConfig::~PyModuleConfig() = default; + + +std::pair<int, std::string> PyModuleConfig::set_config( + MonClient *monc, + const std::string &module_name, + const std::string &key, const std::optional<std::string>& val) +{ + const std::string global_key = "mgr/" + module_name + "/" + key; + Command set_cmd; + { + std::ostringstream cmd_json; + JSONFormatter jf; + jf.open_object_section("cmd"); + if (val) { + jf.dump_string("prefix", "config set"); + jf.dump_string("value", *val); + } else { + jf.dump_string("prefix", "config rm"); + } + jf.dump_string("who", "mgr"); + jf.dump_string("name", global_key); + jf.close_section(); + jf.flush(cmd_json); + set_cmd.run(monc, cmd_json.str()); + } + set_cmd.wait(); + + if (set_cmd.r == 0) { + std::lock_guard l(lock); + if (val) { + config[global_key] = *val; + } else { + config.erase(global_key); + } + return {0, ""}; + } else { + if (val) { + dout(0) << "`config set mgr " << global_key << " " << val << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + } else { + dout(0) << "`config rm mgr " << global_key << "` failed: " + << cpp_strerror(set_cmd.r) << dendl; + } + dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl; + return {set_cmd.r, set_cmd.outs}; + } +} + +std::string PyModule::get_site_packages() +{ + std::stringstream site_packages; + + // CPython doesn't auto-add site-packages dirs to 
sys.path for us, + // but it does provide a module that we can ask for them. + auto site_module = PyImport_ImportModule("site"); + ceph_assert(site_module); + + auto site_packages_fn = PyObject_GetAttrString(site_module, "getsitepackages"); + if (site_packages_fn != nullptr) { + auto site_packages_list = PyObject_CallObject(site_packages_fn, nullptr); + ceph_assert(site_packages_list); + + auto n = PyList_Size(site_packages_list); + for (Py_ssize_t i = 0; i < n; ++i) { + if (i != 0) { + site_packages << ":"; + } + site_packages << PyUnicode_AsUTF8(PyList_GetItem(site_packages_list, i)); + } + + Py_DECREF(site_packages_list); + Py_DECREF(site_packages_fn); + } else { + // Fall back to generating our own site-packages paths by imitating + // what the standard site.py does. This is annoying but it lets us + // run inside virtualenvs :-/ + + auto site_packages_fn = PyObject_GetAttrString(site_module, "addsitepackages"); + ceph_assert(site_packages_fn); + + auto known_paths = PySet_New(nullptr); + auto pArgs = PyTuple_Pack(1, known_paths); + PyObject_CallObject(site_packages_fn, pArgs); + Py_DECREF(pArgs); + Py_DECREF(known_paths); + Py_DECREF(site_packages_fn); + + auto sys_module = PyImport_ImportModule("sys"); + ceph_assert(sys_module); + auto sys_path = PyObject_GetAttrString(sys_module, "path"); + ceph_assert(sys_path); + + dout(1) << "sys.path:" << dendl; + auto n = PyList_Size(sys_path); + bool first = true; + for (Py_ssize_t i = 0; i < n; ++i) { + dout(1) << " " << PyUnicode_AsUTF8(PyList_GetItem(sys_path, i)) << dendl; + if (first) { + first = false; + } else { + site_packages << ":"; + } + site_packages << PyUnicode_AsUTF8(PyList_GetItem(sys_path, i)); + } + + Py_DECREF(sys_path); + Py_DECREF(sys_module); + } + + Py_DECREF(site_module); + + return site_packages.str(); +} + +PyObject* PyModule::init_ceph_logger() +{ + auto py_logger = PyModule_Create(&ceph_logger_module); + PySys_SetObject("stderr", py_logger); + PySys_SetObject("stdout", py_logger); + return py_logger; +} + +PyObject* PyModule::init_ceph_module() +{ + static PyMethodDef module_methods[] = { + {nullptr, nullptr, 0, nullptr} + }; + static PyModuleDef ceph_module_def = { + PyModuleDef_HEAD_INIT, + "ceph_module", + nullptr, + -1, + module_methods, + nullptr, + nullptr, + nullptr, + nullptr + }; + PyObject *ceph_module = PyModule_Create(&ceph_module_def); + ceph_assert(ceph_module != nullptr); + std::map<const char*, PyTypeObject*> classes{ + {{"BaseMgrModule", &BaseMgrModuleType}, + {"BaseMgrStandbyModule", &BaseMgrStandbyModuleType}, + {"BasePyOSDMap", &BasePyOSDMapType}, + {"BasePyOSDMapIncremental", &BasePyOSDMapIncrementalType}, + {"BasePyCRUSH", &BasePyCRUSHType}} + }; + for (auto [name, type] : classes) { + type->tp_new = PyType_GenericNew; + if (PyType_Ready(type) < 0) { + ceph_abort(); + } + Py_INCREF(type); + + PyModule_AddObject(ceph_module, name, (PyObject *)type); + } + return ceph_module; +} + +int PyModule::load(PyThreadState *pMainThreadState) +{ + ceph_assert(pMainThreadState != nullptr); + + // Configure sub-interpreter + { + SafeThreadState sts(pMainThreadState); + Gil gil(sts); + + auto thread_state = Py_NewInterpreter(); + if (thread_state == nullptr) { + derr << "Failed to create python sub-interpreter for '" << module_name << '"' << dendl; + return -EINVAL; + } else { + pMyThreadState.set(thread_state); + // Some python modules do not cope with an unpopulated argv, so lets + // fake one. This step also picks up site-packages into sys.path. 
+ const wchar_t *argv[] = {L"ceph-mgr"}; + PySys_SetArgv(1, (wchar_t**)argv); + // Configure sys.path to include mgr_module_path + string paths = (g_conf().get_val<std::string>("mgr_module_path") + ':' + + get_site_packages() + ':'); + wstring sys_path(wstring(begin(paths), end(paths)) + Py_GetPath()); + PySys_SetPath(const_cast<wchar_t*>(sys_path.c_str())); + dout(10) << "Computed sys.path '" + << string(begin(sys_path), end(sys_path)) << "'" << dendl; + } + } + // Environment is all good, import the external module + { + Gil gil(pMyThreadState); + + int r; + r = load_subclass_of("MgrModule", &pClass); + if (r) { + derr << "Class not found in module '" << module_name << "'" << dendl; + return r; + } + + r = load_commands(); + if (r != 0) { + derr << "Missing or invalid COMMANDS attribute in module '" + << module_name << "'" << dendl; + error_string = "Missing or invalid COMMANDS attribute"; + return r; + } + + register_options(pClass); + r = load_options(); + if (r != 0) { + derr << "Missing or invalid MODULE_OPTIONS attribute in module '" + << module_name << "'" << dendl; + error_string = "Missing or invalid MODULE_OPTIONS attribute"; + return r; + } + + load_notify_types(); + + // We've imported the module and found a MgrModule subclass, at this + // point the module is considered loaded. It might still not be + // runnable though, can_run populated later... + loaded = true; + + r = load_subclass_of("MgrStandbyModule", &pStandbyClass); + if (!r) { + dout(4) << "Standby mode available in module '" << module_name + << "'" << dendl; + register_options(pStandbyClass); + } else { + dout(4) << "Standby mode not provided by module '" << module_name + << "'" << dendl; + } + + // Populate can_run by interrogating the module's callback that + // may check for dependencies etc + PyObject *pCanRunTuple = PyObject_CallMethod(pClass, + const_cast<char*>("can_run"), const_cast<char*>("()")); + if (pCanRunTuple != nullptr) { + if (PyTuple_Check(pCanRunTuple) && PyTuple_Size(pCanRunTuple) == 2) { + PyObject *pCanRun = PyTuple_GetItem(pCanRunTuple, 0); + PyObject *can_run_str = PyTuple_GetItem(pCanRunTuple, 1); + if (!PyBool_Check(pCanRun) || !PyUnicode_Check(can_run_str)) { + derr << "Module " << get_name() + << " returned wrong type in can_run" << dendl; + error_string = "wrong type returned from can_run"; + can_run = false; + } else { + can_run = (pCanRun == Py_True); + if (!can_run) { + error_string = PyUnicode_AsUTF8(can_run_str); + dout(4) << "Module " << get_name() + << " reported that it cannot run: " + << error_string << dendl; + } + } + } else { + derr << "Module " << get_name() + << " returned wrong type in can_run" << dendl; + error_string = "wrong type returned from can_run"; + can_run = false; + } + + Py_DECREF(pCanRunTuple); + } else { + derr << "Exception calling can_run on " << get_name() << dendl; + derr << handle_pyerror(true, get_name(), "PyModule::load") << dendl; + can_run = false; + } + } + return 0; +} + +int PyModule::walk_dict_list( + const std::string &attr_name, + std::function<int(PyObject*)> fn) +{ + PyObject *command_list = PyObject_GetAttrString(pClass, attr_name.c_str()); + if (command_list == nullptr) { + derr << "Module " << get_name() << " has missing " << attr_name + << " member" << dendl; + return -EINVAL; + } + if (!PyObject_TypeCheck(command_list, &PyList_Type)) { + // Relatively easy mistake for human to make, e.g. 
defining COMMANDS + // as a {} instead of a [] + derr << "Module " << get_name() << " has " << attr_name + << " member of wrong type (should be a list)" << dendl; + return -EINVAL; + } + + // Invoke fn on each item in the list + int r = 0; + const size_t list_size = PyList_Size(command_list); + for (size_t i = 0; i < list_size; ++i) { + PyObject *command = PyList_GetItem(command_list, i); + ceph_assert(command != nullptr); + + if (!PyDict_Check(command)) { + derr << "Module " << get_name() << " has non-dict entry " + << "in " << attr_name << " list" << dendl; + return -EINVAL; + } + + r = fn(command); + if (r != 0) { + break; + } + } + Py_DECREF(command_list); + + return r; +} + +int PyModule::register_options(PyObject *cls) +{ + PyObject *pRegCmd = PyObject_CallMethod( + cls, + const_cast<char*>("_register_options"), const_cast<char*>("(s)"), + module_name.c_str()); + if (pRegCmd != nullptr) { + Py_DECREF(pRegCmd); + } else { + derr << "Exception calling _register_options on " << get_name() + << dendl; + derr << handle_pyerror(true, module_name, "PyModule::register_options") << dendl; + } + return 0; +} + +int PyModule::load_notify_types() +{ + PyObject *ls = PyObject_GetAttrString(pClass, "NOTIFY_TYPES"); + if (ls == nullptr) { + derr << "Module " << get_name() << " has missing NOTIFY_TYPES member" << dendl; + return -EINVAL; + } + if (!PyObject_TypeCheck(ls, &PyList_Type)) { + // Relatively easy mistake for human to make, e.g. defining COMMANDS + // as a {} instead of a [] + derr << "Module " << get_name() << " has NOTIFY_TYPES that is not a list" << dendl; + return -EINVAL; + } + + const size_t list_size = PyList_Size(ls); + for (size_t i = 0; i < list_size; ++i) { + PyObject *notify_type = PyList_GetItem(ls, i); + ceph_assert(notify_type != nullptr); + + if (!PyObject_TypeCheck(notify_type, &PyUnicode_Type)) { + derr << "Module " << get_name() << " has non-string entry in NOTIFY_TYPES list" + << dendl; + return -EINVAL; + } + + notify_types.insert(PyUnicode_AsUTF8(notify_type)); + } + Py_DECREF(ls); + dout(10) << "Module " << get_name() << " notify_types " << notify_types << dendl; + + return 0; +} + +int PyModule::load_commands() +{ + PyObject *pRegCmd = PyObject_CallMethod(pClass, + const_cast<char*>("_register_commands"), const_cast<char*>("(s)"), + module_name.c_str()); + if (pRegCmd != nullptr) { + Py_DECREF(pRegCmd); + } else { + derr << "Exception calling _register_commands on " << get_name() + << dendl; + derr << handle_pyerror(true, module_name, "PyModule::load_commands") << dendl; + } + + int r = walk_dict_list("COMMANDS", [this](PyObject *pCommand) -> int { + ModuleCommand command; + + PyObject *pCmd = PyDict_GetItemString(pCommand, "cmd"); + ceph_assert(pCmd != nullptr); + command.cmdstring = PyUnicode_AsUTF8(pCmd); + + dout(20) << "loaded command " << command.cmdstring << dendl; + + PyObject *pDesc = PyDict_GetItemString(pCommand, "desc"); + ceph_assert(pDesc != nullptr); + command.helpstring = PyUnicode_AsUTF8(pDesc); + + PyObject *pPerm = PyDict_GetItemString(pCommand, "perm"); + ceph_assert(pPerm != nullptr); + command.perm = PyUnicode_AsUTF8(pPerm); + + command.polling = false; + if (PyObject *pPoll = PyDict_GetItemString(pCommand, "poll"); + pPoll && PyObject_IsTrue(pPoll)) { + command.polling = true; + } + + command.module_name = module_name; + + commands.push_back(std::move(command)); + + return 0; + }); + + dout(10) << "loaded " << commands.size() << " commands" << dendl; + + return r; +} + +int PyModule::load_options() +{ + int r = 
walk_dict_list("MODULE_OPTIONS", [this](PyObject *pOption) -> int { + MgrMap::ModuleOption option; + PyObject *p; + p = PyDict_GetItemString(pOption, "name"); + ceph_assert(p != nullptr); + option.name = PyUnicode_AsUTF8(p); + option.type = Option::TYPE_STR; + p = PyDict_GetItemString(pOption, "type"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + std::string s = PyUnicode_AsUTF8(p); + int t = Option::str_to_type(s); + if (t >= 0) { + option.type = t; + } + } + p = PyDict_GetItemString(pOption, "desc"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + option.desc = PyUnicode_AsUTF8(p); + } + p = PyDict_GetItemString(pOption, "long_desc"); + if (p && PyObject_TypeCheck(p, &PyUnicode_Type)) { + option.long_desc = PyUnicode_AsUTF8(p); + } + p = PyDict_GetItemString(pOption, "default"); + if (p) { + auto q = PyObject_Str(p); + option.default_value = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "min"); + if (p) { + auto q = PyObject_Str(p); + option.min = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "max"); + if (p) { + auto q = PyObject_Str(p); + option.max = PyUnicode_AsUTF8(q); + Py_DECREF(q); + } + p = PyDict_GetItemString(pOption, "enum_allowed"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (Py_ssize_t i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q) { + auto r = PyObject_Str(q); + option.enum_allowed.insert(PyUnicode_AsUTF8(r)); + Py_DECREF(r); + } + } + } + p = PyDict_GetItemString(pOption, "see_also"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (Py_ssize_t i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q && PyObject_TypeCheck(q, &PyUnicode_Type)) { + option.see_also.insert(PyUnicode_AsUTF8(q)); + } + } + } + p = PyDict_GetItemString(pOption, "tags"); + if (p && PyObject_TypeCheck(p, &PyList_Type)) { + for (Py_ssize_t i = 0; i < PyList_Size(p); ++i) { + auto q = PyList_GetItem(p, i); + if (q && PyObject_TypeCheck(q, &PyUnicode_Type)) { + option.tags.insert(PyUnicode_AsUTF8(q)); + } + } + } + p = PyDict_GetItemString(pOption, "runtime"); + if (p && PyObject_TypeCheck(p, &PyBool_Type)) { + if (p == Py_True) { + option.flags |= Option::FLAG_RUNTIME; + } + if (p == Py_False) { + option.flags &= ~Option::FLAG_RUNTIME; + } + } + dout(20) << "loaded module option " << option.name << dendl; + options[option.name] = std::move(option); + return 0; + }); + + dout(10) << "loaded " << options.size() << " options" << dendl; + + return r; +} + +bool PyModule::is_option(const std::string &option_name) +{ + std::lock_guard l(lock); + return options.count(option_name) > 0; +} + +PyObject *PyModule::get_typed_option_value(const std::string& name, + const std::string& value) +{ + // we don't need to hold a lock here because these MODULE_OPTIONS + // are set up exactly once during startup. 
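+  // (PyModule::load() has already populated `options` via load_options()
+  // by the time any module instance exists to ask for a value.)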
+ auto p = options.find(name); + if (p != options.end()) { + return get_python_typed_option_value((Option::type_t)p->second.type, value); + } + return PyUnicode_FromString(value.c_str()); +} + +int PyModule::load_subclass_of(const char* base_class, PyObject** py_class) +{ + // load the base class + PyObject *mgr_module = PyImport_ImportModule("mgr_module"); + if (!mgr_module) { + error_string = peek_pyerror(); + derr << "Module not found: 'mgr_module'" << dendl; + derr << handle_pyerror(true, module_name, "PyModule::load_subclass_of") << dendl; + return -EINVAL; + } + auto mgr_module_type = PyObject_GetAttrString(mgr_module, base_class); + Py_DECREF(mgr_module); + if (!mgr_module_type) { + error_string = peek_pyerror(); + derr << "Unable to import MgrModule from mgr_module" << dendl; + derr << handle_pyerror(true, module_name, "PyModule::load_subclass_of") << dendl; + return -EINVAL; + } + + // find the sub class + PyObject *plugin_module = PyImport_ImportModule(module_name.c_str()); + if (!plugin_module) { + error_string = peek_pyerror(); + derr << "Module not found: '" << module_name << "'" << dendl; + derr << handle_pyerror(true, module_name, "PyModule::load_subclass_of") << dendl; + return -ENOENT; + } + auto locals = PyModule_GetDict(plugin_module); + Py_DECREF(plugin_module); + PyObject *key, *value; + Py_ssize_t pos = 0; + *py_class = nullptr; + while (PyDict_Next(locals, &pos, &key, &value)) { + if (!PyType_Check(value)) { + continue; + } + if (!PyObject_IsSubclass(value, mgr_module_type)) { + continue; + } + if (PyObject_RichCompareBool(value, mgr_module_type, Py_EQ)) { + continue; + } + auto class_name = PyUnicode_AsUTF8(key); + if (*py_class) { + derr << __func__ << ": ignoring '" + << module_name << "." << class_name << "'" + << ": only one '" << base_class + << "' class is loaded from each plugin" << dendl; + continue; + } + *py_class = value; + dout(4) << __func__ << ": found class: '" + << module_name << "." << class_name << "'" << dendl; + } + Py_DECREF(mgr_module_type); + + return *py_class ? 0 : -EINVAL; +} + +PyModule::~PyModule() +{ + if (pMyThreadState.ts != nullptr) { + Gil gil(pMyThreadState, true); + Py_XDECREF(pClass); + Py_XDECREF(pStandbyClass); + } +} + diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h new file mode 100644 index 000000000..8d88ff94c --- /dev/null +++ b/src/mgr/PyModule.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <vector> +#include <boost/optional.hpp> +#include "common/ceph_mutex.h" +#include "Python.h" +#include "Gil.h" +#include "mon/MgrMap.h" + + +class MonClient; + +std::string handle_pyerror(bool generate_crash_dump = false, + std::string module = {}, + std::string caller = {}); + +std::string peek_pyerror(); + +/** + * A Ceph CLI command description provided from a Python module + */ +class ModuleCommand { +public: + std::string cmdstring; + std::string helpstring; + std::string perm; + bool polling; + + // Call the ActivePyModule of this name to handle the command + std::string module_name; +}; + +class PyModule +{ + mutable ceph::mutex lock = ceph::make_mutex("PyModule::lock"); +private: + const std::string module_name; + std::string get_site_packages(); + int load_subclass_of(const char* class_name, PyObject** py_class); + + // Did the MgrMap identify this module as one that should run? + bool enabled = false; + + // Did the MgrMap flag this module as always on? + bool always_on = false; + + // Did we successfully import this python module and look up symbols? + // (i.e. is it possible to instantiate a MgrModule subclass instance?) + bool loaded = false; + + // Did the module identify itself as being able to run? + // (i.e. should we expect instantiating and calling serve() to work?) + bool can_run = false; + + // Did the module encounter an unexpected error while running? + // (e.g. throwing an exception from serve()) + bool failed = false; + + // Populated if loaded, can_run or failed indicates a problem + std::string error_string; + + // Helper for loading MODULE_OPTIONS and COMMANDS members + int walk_dict_list( + const std::string &attr_name, + std::function<int(PyObject*)> fn); + + int load_commands(); + std::vector<ModuleCommand> commands; + + int register_options(PyObject *cls); + int load_options(); + std::map<std::string, MgrMap::ModuleOption> options; + + int load_notify_types(); + std::set<std::string> notify_types; + +public: + static std::string mgr_store_prefix; + + SafeThreadState pMyThreadState; + PyObject *pClass = nullptr; + PyObject *pStandbyClass = nullptr; + + explicit PyModule(const std::string &module_name_) + : module_name(module_name_) + { + } + + ~PyModule(); + + bool is_option(const std::string &option_name); + const std::map<std::string,MgrMap::ModuleOption>& get_options() const { + return options; + } + + PyObject *get_typed_option_value( + const std::string& option, + const std::string& value); + + int load(PyThreadState *pMainThreadState); + static PyObject* init_ceph_logger(); + static PyObject* init_ceph_module(); + + void set_enabled(const bool enabled_) + { + enabled = enabled_; + } + + void set_always_on(const bool always_on_) { + always_on = always_on_; + } + + /** + * Extend `out` with the contents of `this->commands` + */ + void get_commands(std::vector<ModuleCommand> *out) const + { + std::lock_guard l(lock); + ceph_assert(out != nullptr); + out->insert(out->end(), commands.begin(), commands.end()); + } + + + /** + * Mark the module as failed, recording the reason in the error + * string. 
+ */ + void fail(const std::string &reason) + { + std::lock_guard l(lock); + failed = true; + error_string = reason; + } + + bool is_enabled() const { + std::lock_guard l(lock); + return enabled || always_on; + } + + bool is_failed() const { std::lock_guard l(lock) ; return failed; } + bool is_loaded() const { std::lock_guard l(lock) ; return loaded; } + bool is_always_on() const { std::lock_guard l(lock) ; return always_on; } + + bool should_notify(const std::string& notify_type) const { + return notify_types.count(notify_type); + } + + const std::string &get_name() const { + std::lock_guard l(lock) ; return module_name; + } + const std::string &get_error_string() const { + std::lock_guard l(lock) ; return error_string; + } + bool get_can_run() const { + std::lock_guard l(lock) ; return can_run; + } +}; + +typedef std::shared_ptr<PyModule> PyModuleRef; + +class PyModuleConfig { +public: + mutable ceph::mutex lock = ceph::make_mutex("PyModuleConfig::lock"); + std::map<std::string, std::string> config; + + PyModuleConfig(); + + PyModuleConfig(PyModuleConfig &mconfig); + + ~PyModuleConfig(); + + std::pair<int, std::string> set_config( + MonClient *monc, + const std::string &module_name, + const std::string &key, const std::optional<std::string>& val); + +}; diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc new file mode 100644 index 000000000..f5f500802 --- /dev/null +++ b/src/mgr/PyModuleRegistry.cc @@ -0,0 +1,448 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "PyModuleRegistry.h" + +#include <filesystem> + +#include "include/stringify.h" +#include "common/errno.h" +#include "common/split.h" + +#include "BaseMgrModule.h" +#include "PyOSDMap.h" +#include "BaseMgrStandbyModule.h" +#include "Gil.h" +#include "MgrContext.h" +#include "mgr/mgr_commands.h" + +#include "ActivePyModules.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +#undef dout_prefix +#define dout_prefix *_dout << "mgr[py] " + +namespace fs = std::filesystem; + +std::set<std::string> obsolete_modules = { + "orchestrator_cli", +}; + +void PyModuleRegistry::init() +{ + std::lock_guard locker(lock); + + // Set up global python interpreter +#define WCHAR(s) L ## #s + Py_SetProgramName(const_cast<wchar_t*>(WCHAR(MGR_PYTHON_EXECUTABLE))); +#undef WCHAR + // Add more modules + if (g_conf().get_val<bool>("daemonize")) { + PyImport_AppendInittab("ceph_logger", PyModule::init_ceph_logger); + } + PyImport_AppendInittab("ceph_module", PyModule::init_ceph_module); + Py_InitializeEx(0); +#if PY_VERSION_HEX < 0x03090000 + // Let CPython know that we will be calling it back from other + // threads in future. + if (! 
PyEval_ThreadsInitialized()) { + PyEval_InitThreads(); + } +#endif + // Drop the GIL and remember the main thread state (current + // thread state becomes NULL) + pMainThreadState = PyEval_SaveThread(); + ceph_assert(pMainThreadState != nullptr); + + std::list<std::string> failed_modules; + + const std::string module_path = g_conf().get_val<std::string>("mgr_module_path"); + auto module_names = probe_modules(module_path); + // Load python code + for (const auto& module_name : module_names) { + dout(1) << "Loading python module '" << module_name << "'" << dendl; + + // Everything starts disabled, set enabled flag on module + // when we see first MgrMap + auto mod = std::make_shared<PyModule>(module_name); + int r = mod->load(pMainThreadState); + if (r != 0) { + // Don't use handle_pyerror() here; we don't have the GIL + // or the right thread state (this is deliberate). + derr << "Error loading module '" << module_name << "': " + << cpp_strerror(r) << dendl; + failed_modules.push_back(module_name); + // Don't drop out here, load the other modules + } + + // Record the module even if the load failed, so that we can + // report its loading error + modules[module_name] = std::move(mod); + } + if (module_names.empty()) { + clog->error() << "No ceph-mgr modules found in " << module_path; + } + if (!failed_modules.empty()) { + clog->error() << "Failed to load ceph-mgr modules: " << joinify( + failed_modules.begin(), failed_modules.end(), std::string(", ")); + } +} + +bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_) +{ + std::lock_guard l(lock); + + if (mgr_map.epoch == 0) { + mgr_map = mgr_map_; + + // First time we see MgrMap, set the enabled flags on modules + // This should always happen before someone calls standby_start + // or active_start + for (const auto &[module_name, module] : modules) { + const bool enabled = (mgr_map.modules.count(module_name) > 0); + module->set_enabled(enabled); + const bool always_on = (mgr_map.get_always_on_modules().count(module_name) > 0); + module->set_always_on(always_on); + } + + return false; + } else { + bool modules_changed = mgr_map_.modules != mgr_map.modules || + mgr_map_.always_on_modules != mgr_map.always_on_modules; + mgr_map = mgr_map_; + + if (standby_modules != nullptr) { + standby_modules->handle_mgr_map(mgr_map_); + } + + return modules_changed; + } +} + + + +void PyModuleRegistry::standby_start(MonClient &mc, Finisher &f) +{ + std::lock_guard l(lock); + ceph_assert(active_modules == nullptr); + ceph_assert(standby_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + dout(4) << "Starting modules in standby mode" << dendl; + + standby_modules.reset(new StandbyPyModules( + mgr_map, module_config, clog, mc, f)); + + std::set<std::string> failed_modules; + for (const auto &i : modules) { + if (!(i.second->is_enabled() && i.second->get_can_run())) { + // report always_on modules with a standby mode that won't run + if (i.second->is_always_on() && i.second->pStandbyClass) { + failed_modules.insert(i.second->get_name()); + } + continue; + } + + if (i.second->pStandbyClass) { + dout(4) << "starting module " << i.second->get_name() << dendl; + standby_modules->start_one(i.second); + } else { + dout(4) << "skipping module '" << i.second->get_name() << "' because " + "it does not implement a standby mode" << dendl; + } + } + + if (!failed_modules.empty()) { + clog->error() << "Failed to execute ceph-mgr module(s) in standby mode: " + << 
joinify(failed_modules.begin(), failed_modules.end(), + std::string(", ")); + } +} + +void PyModuleRegistry::active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map<std::string, std::string> &kv_store, + bool mon_provides_kv_sub, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server) +{ + std::lock_guard locker(lock); + + dout(4) << "Starting modules in active mode" << dendl; + + ceph_assert(active_modules == nullptr); + + // Must have seen a MgrMap by this point, in order to know + // which modules should be enabled + ceph_assert(mgr_map.epoch > 0); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + active_modules.reset( + new ActivePyModules( + module_config, + kv_store, mon_provides_kv_sub, + ds, cs, mc, + clog_, audit_clog_, objecter_, client_, f, server, + *this)); + + for (const auto &i : modules) { + // Anything we're skipping because of !can_run will be flagged + // to the user separately via get_health_checks + if (!(i.second->is_enabled() && i.second->is_loaded())) { + continue; + } + + dout(4) << "Starting " << i.first << dendl; + active_modules->start_one(i.second); + } +} + +void PyModuleRegistry::active_shutdown() +{ + std::lock_guard locker(lock); + + if (active_modules != nullptr) { + active_modules->shutdown(); + active_modules.reset(); + } +} + +void PyModuleRegistry::shutdown() +{ + std::lock_guard locker(lock); + + if (standby_modules != nullptr) { + standby_modules->shutdown(); + standby_modules.reset(); + } + + // Ideally, now, we'd be able to do this for all modules: + // + // Py_EndInterpreter(pMyThreadState); + // PyThreadState_Swap(pMainThreadState); + // + // Unfortunately, if the module has any other *python* threads active + // at this point, Py_EndInterpreter() will abort with: + // + // Fatal Python error: Py_EndInterpreter: not the last thread + // + // This can happen when using CherryPy in a module, becuase CherryPy + // runs an extra thread as a timeout monitor, which spends most of its + // life inside a time.sleep(60). Unless you are very, very lucky with + // the timing calling this destructor, that thread will still be stuck + // in a sleep, and Py_EndInterpreter() will abort. + // + // This could of course also happen with a poorly written module which + // made no attempt to clean up any additional threads it created. + // + // The safest thing to do is just not call Py_EndInterpreter(), and + // let Py_Finalize() kill everything after all modules are shut down. 
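+  // So: drop our PyModule references, then re-acquire the GIL with the
+  // main thread state saved in init() so that Py_Finalize() can run.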
+ + modules.clear(); + + PyEval_RestoreThread(pMainThreadState); + Py_Finalize(); +} + +std::vector<std::string> PyModuleRegistry::probe_modules(const std::string &path) const +{ + const auto opt = g_conf().get_val<std::string>("mgr_disabled_modules"); + const auto disabled_modules = ceph::split(opt); + + std::vector<std::string> modules; + for (const auto& entry: fs::directory_iterator(path)) { + if (!fs::is_directory(entry)) { + continue; + } + const std::string name = entry.path().filename(); + if (std::count(disabled_modules.begin(), disabled_modules.end(), name)) { + dout(10) << "ignoring disabled module " << name << dendl; + continue; + } + auto module_path = entry.path() / "module.py"; + if (fs::exists(module_path)) { + modules.push_back(name); + } + } + return modules; +} + +int PyModuleRegistry::handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss) +{ + if (active_modules) { + return active_modules->handle_command(module_command, session, cmdmap, + inbuf, ds, ss); + } else { + // We do not expect to be called before active modules is up, but + // it's straightfoward to handle this case so let's do it. + return -EAGAIN; + } +} + +std::vector<ModuleCommand> PyModuleRegistry::get_py_commands() const +{ + std::lock_guard l(lock); + + std::vector<ModuleCommand> result; + for (const auto& i : modules) { + i.second->get_commands(&result); + } + + return result; +} + +std::vector<MonCommand> PyModuleRegistry::get_commands() const +{ + std::vector<ModuleCommand> commands = get_py_commands(); + std::vector<MonCommand> result; + for (auto &pyc: commands) { + uint64_t flags = MonCommand::FLAG_MGR; + if (pyc.polling) { + flags |= MonCommand::FLAG_POLL; + } + result.push_back({pyc.cmdstring, pyc.helpstring, "mgr", + pyc.perm, flags}); + } + return result; +} + +void PyModuleRegistry::get_health_checks(health_check_map_t *checks) +{ + std::lock_guard l(lock); + + // Only the active mgr reports module issues + if (active_modules) { + active_modules->get_health_checks(checks); + + std::map<std::string, std::string> dependency_modules; + std::map<std::string, std::string> failed_modules; + + /* + * Break up broken modules into two categories: + * - can_run=false: the module is working fine but explicitly + * telling you that a dependency is missing. Advise the user to + * read the message from the module and install what's missing. + * - failed=true or loaded=false: something unexpected is broken, + * either at runtime (from serve()) or at load time. This indicates + * a bug and the user should be guided to inspect the mgr log + * to investigate and gather evidence. + */ + + for (const auto &i : modules) { + auto module = i.second; + if (module->is_enabled() && !module->get_can_run()) { + dependency_modules[module->get_name()] = module->get_error_string(); + } else if ((module->is_enabled() && !module->is_loaded()) + || (module->is_failed() && module->get_can_run())) { + // - Unloadable modules are only reported if they're enabled, + // to avoid spamming users about modules they don't have the + // dependencies installed for because they don't use it. 
+ // - Failed modules are only reported if they passed the can_run + // checks (to avoid outputting two health messages about a + // module that said can_run=false but we tried running it anyway) + failed_modules[module->get_name()] = module->get_error_string(); + } + } + + // report failed always_on modules as health errors + for (const auto& name : mgr_map.get_always_on_modules()) { + if (obsolete_modules.count(name)) { + continue; + } + if (active_modules->is_pending(name)) { + continue; + } + if (!active_modules->module_exists(name)) { + if (failed_modules.find(name) == failed_modules.end() && + dependency_modules.find(name) == dependency_modules.end()) { + failed_modules[name] = "Not found or unloadable"; + } + } + } + + if (!dependency_modules.empty()) { + std::ostringstream ss; + if (dependency_modules.size() == 1) { + auto iter = dependency_modules.begin(); + ss << "Module '" << iter->first << "' has failed dependency: " + << iter->second; + } else if (dependency_modules.size() > 1) { + ss << dependency_modules.size() + << " mgr modules have failed dependencies"; + } + auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str(), + dependency_modules.size()); + for (auto& i : dependency_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed dependency: " << i.second; + d.detail.push_back(ss.str()); + } + } + + if (!failed_modules.empty()) { + std::ostringstream ss; + if (failed_modules.size() == 1) { + auto iter = failed_modules.begin(); + ss << "Module '" << iter->first << "' has failed: " << iter->second; + } else if (failed_modules.size() > 1) { + ss << failed_modules.size() << " mgr modules have failed"; + } + auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str(), + failed_modules.size()); + for (auto& i : failed_modules) { + std::ostringstream ss; + ss << "Module '" << i.first << "' has failed: " << i.second; + d.detail.push_back(ss.str()); + } + } + } +} + +void PyModuleRegistry::handle_config(const std::string &k, const std::string &v) +{ + std::lock_guard l(module_config.lock); + + if (!v.empty()) { + // removing value to hide sensitive data going into mgr logs + // leaving this for debugging purposes + // dout(10) << "Loaded module_config entry " << k << ":" << v << dendl; + dout(10) << "Loaded module_config entry " << k << ":" << dendl; + module_config.config[k] = v; + } else { + module_config.config.erase(k); + } +} + +void PyModuleRegistry::handle_config_notify() +{ + std::lock_guard l(lock); + if (active_modules) { + active_modules->config_notify(); + } +} diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h new file mode 100644 index 000000000..9af9abb57 --- /dev/null +++ b/src/mgr/PyModuleRegistry.h @@ -0,0 +1,243 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#pragma once + +// First because it includes Python.h +#include "PyModule.h" + +#include <string> +#include <map> +#include <set> +#include <memory> + +#include "common/LogClient.h" + +#include "ActivePyModules.h" +#include "StandbyPyModules.h" + +class MgrSession; + +/** + * This class is responsible for setting up the python runtime environment + * and importing the python modules. + * + * It is *not* responsible for constructing instances of their BaseMgrModule + * subclasses: that is the job of ActiveMgrModule, which consumes the class + * references that we load here. + */ +class PyModuleRegistry +{ +private: + mutable ceph::mutex lock = ceph::make_mutex("PyModuleRegistry::lock"); + LogChannelRef clog; + + std::map<std::string, PyModuleRef> modules; + std::multimap<std::string, entity_addrvec_t> clients; + + std::unique_ptr<ActivePyModules> active_modules; + std::unique_ptr<StandbyPyModules> standby_modules; + + PyThreadState *pMainThreadState; + + // We have our own copy of MgrMap, because we are constructed + // before ClusterState exists. + MgrMap mgr_map; + + /** + * Discover python modules from local disk + */ + std::vector<std::string> probe_modules(const std::string &path) const; + + PyModuleConfig module_config; + +public: + void handle_config(const std::string &k, const std::string &v); + void handle_config_notify(); + + void update_kv_data( + const std::string prefix, + bool incremental, + const map<std::string, std::optional<bufferlist>, std::less<>>& data) { + ceph_assert(active_modules); + active_modules->update_kv_data(prefix, incremental, data); + } + + /** + * Get references to all modules (whether they have loaded and/or + * errored) or not. + */ + auto get_modules() const + { + std::vector<PyModuleRef> modules_out; + std::lock_guard l(lock); + for (const auto &i : modules) { + modules_out.push_back(i.second); + } + + return modules_out; + } + + explicit PyModuleRegistry(LogChannelRef clog_) + : clog(clog_) + {} + + /** + * @return true if the mgrmap has changed such that the service needs restart + */ + bool handle_mgr_map(const MgrMap &mgr_map_); + + bool have_standby_modules() const { + return !!standby_modules; + } + + void init(); + + void upgrade_config( + MonClient *monc, + const std::map<std::string, std::string> &old_config); + + void active_start( + DaemonStateIndex &ds, ClusterState &cs, + const std::map<std::string, std::string> &kv_store, + bool mon_provides_kv_sub, + MonClient &mc, LogChannelRef clog_, LogChannelRef audit_clog_, + Objecter &objecter_, Client &client_, Finisher &f, + DaemonServer &server); + void standby_start(MonClient &mc, Finisher &f); + + bool is_standby_running() const + { + return standby_modules != nullptr; + } + + void active_shutdown(); + void shutdown(); + + std::vector<MonCommand> get_commands() const; + std::vector<ModuleCommand> get_py_commands() const; + + /** + * Get the specified module. The module does not have to be + * loaded or runnable. + * + * Returns an empty reference if it does not exist. + */ + PyModuleRef get_module(const std::string &module_name) + { + std::lock_guard l(lock); + auto module_iter = modules.find(module_name); + if (module_iter == modules.end()) { + return {}; + } + return module_iter->second; + } + + /** + * Pass through command to the named module for execution. + * + * The command must exist in the COMMANDS reported by the module. If it + * doesn't then this will abort. + * + * If ActivePyModules has not been instantiated yet then this will + * return EAGAIN. 
+ */ + int handle_command( + const ModuleCommand& module_command, + const MgrSession& session, + const cmdmap_t &cmdmap, + const bufferlist &inbuf, + std::stringstream *ds, + std::stringstream *ss); + + /** + * Pass through health checks reported by modules, and report any + * modules that have failed (i.e. unhandled exceptions in serve()) + */ + void get_health_checks(health_check_map_t *checks); + + void get_progress_events(map<std::string,ProgressEvent> *events) { + if (active_modules) { + active_modules->get_progress_events(events); + } + } + + // FIXME: breaking interface so that I don't have to go rewrite all + // the places that call into these (for now) + // >>> + void notify_all(const std::string ¬ify_type, + const std::string ¬ify_id) + { + if (active_modules) { + active_modules->notify_all(notify_type, notify_id); + } + } + + void notify_all(const LogEntry &log_entry) + { + if (active_modules) { + active_modules->notify_all(log_entry); + } + } + + bool should_notify(const std::string& name, + const std::string& notify_type) { + return modules.at(name)->should_notify(notify_type); + } + + std::map<std::string, std::string> get_services() const + { + ceph_assert(active_modules); + return active_modules->get_services(); + } + + void register_client(std::string_view name, entity_addrvec_t addrs, bool replace) + { + std::lock_guard l(lock); + auto n = std::string(name); + if (replace) { + clients.erase(n); + } + clients.emplace(n, std::move(addrs)); + } + void unregister_client(std::string_view name, const entity_addrvec_t& addrs) + { + std::lock_guard l(lock); + auto itp = clients.equal_range(std::string(name)); + for (auto it = itp.first; it != itp.second; ++it) { + if (it->second == addrs) { + clients.erase(it); + return; + } + } + } + + auto get_clients() const + { + std::lock_guard l(lock); + return clients; + } + + bool is_module_active(const std::string &name) { + ceph_assert(active_modules); + return active_modules->module_exists(name); + } + + auto& get_active_module_finisher(const std::string &name) { + ceph_assert(active_modules); + return active_modules->get_module_finisher(name); + } + + // <<< (end of ActivePyModules cheeky call-throughs) +}; diff --git a/src/mgr/PyModuleRunner.cc b/src/mgr/PyModuleRunner.cc new file mode 100644 index 000000000..57c90fdab --- /dev/null +++ b/src/mgr/PyModuleRunner.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +// Python.h comes first because otherwise it clobbers ceph's assert +#include <Python.h> + +#include "PyModule.h" + +#include "common/debug.h" +#include "mgr/Gil.h" + +#include "PyModuleRunner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + + +PyModuleRunner::~PyModuleRunner() +{ + Gil gil(py_module->pMyThreadState, true); + + if (pClassInstance) { + Py_XDECREF(pClassInstance); + pClassInstance = nullptr; + } +} + +int PyModuleRunner::serve() +{ + ceph_assert(pClassInstance != nullptr); + + // This method is called from a separate OS thread (i.e. a thread not + // created by Python), so tell Gil to wrap this in a new thread state. 
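+  // (Gil is an RAII guard: constructing it acquires the GIL for this
+  // thread, and the lock is dropped again when the object goes out of
+  // scope at the end of serve().)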
+ Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("serve"), nullptr); + + int r = 0; + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + // This is not a very informative log message because it's an + // unknown/unexpected exception that we can't say much about. + + + // Get short exception message for the cluster log, before + // dumping the full backtrace to the local log. + std::string exc_msg = peek_pyerror(); + + clog->error() << "Unhandled exception from module '" << get_name() + << "' while running on mgr." << g_conf()->name.get_id() + << ": " << exc_msg; + derr << get_name() << ".serve:" << dendl; + derr << handle_pyerror(true, get_name(), "PyModuleRunner::serve") << dendl; + + py_module->fail(exc_msg); + + return -EINVAL; + } + + return r; +} + +void PyModuleRunner::shutdown() +{ + ceph_assert(pClassInstance != nullptr); + + Gil gil(py_module->pMyThreadState, true); + + auto pValue = PyObject_CallMethod(pClassInstance, + const_cast<char*>("shutdown"), nullptr); + + if (pValue != NULL) { + Py_DECREF(pValue); + } else { + derr << "Failed to invoke shutdown() on " << get_name() << dendl; + derr << handle_pyerror(true, get_name(), "PyModuleRunner::shutdown") << dendl; + } + + dead = true; +} + +void PyModuleRunner::log(const std::string &record) +{ +#undef dout_prefix +#define dout_prefix *_dout + dout(0) << record << dendl; +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " +} + +void* PyModuleRunner::PyModuleRunnerThread::entry() +{ + // No need to acquire the GIL here; the module does it. + dout(4) << "Entering thread for " << mod->get_name() << dendl; + mod->serve(); + return nullptr; +} diff --git a/src/mgr/PyModuleRunner.h b/src/mgr/PyModuleRunner.h new file mode 100644 index 000000000..88d9f755a --- /dev/null +++ b/src/mgr/PyModuleRunner.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#pragma once + +#include "common/Thread.h" +#include "common/LogClient.h" +#include "mgr/Gil.h" + +#include "PyModule.h" + +/** + * Implement the pattern of calling serve() on a module in a thread, + * until shutdown() is called. 
+ */ +class PyModuleRunner +{ +public: + // Info about the module we're going to run + PyModuleRef py_module; + +protected: + // Populated by descendent class + PyObject *pClassInstance = nullptr; + + LogChannelRef clog; + + class PyModuleRunnerThread : public Thread + { + PyModuleRunner *mod; + + public: + explicit PyModuleRunnerThread(PyModuleRunner *mod_) + : mod(mod_) {} + + void *entry() override; + }; + + bool is_dead() const { return dead; } + + std::string thread_name; + +public: + int serve(); + void shutdown(); + void log(const std::string &record); + + const char *get_thread_name() const + { + return thread_name.c_str(); + } + + PyModuleRunner( + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + py_module(py_module_), + clog(clog_), + thread(this) + { + // Shortened name for use as thread name, because thread names + // required to be <16 chars + thread_name = py_module->get_name().substr(0, 15); + + ceph_assert(py_module != nullptr); + } + + ~PyModuleRunner(); + + PyModuleRunnerThread thread; + + std::string const &get_name() const { return py_module->get_name(); } + +private: + bool dead = false; +}; + + diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc new file mode 100644 index 000000000..83475f5ee --- /dev/null +++ b/src/mgr/PyOSDMap.cc @@ -0,0 +1,721 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Mgr.h" + +#include "osd/OSDMap.h" +#include "common/errno.h" +#include "common/version.h" +#include "include/stringify.h" + +#include "PyOSDMap.h" +#include "PyFormatter.h" +#include "Gil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +using std::map; +using std::set; +using std::string; +using std::vector; + +typedef struct { + PyObject_HEAD + OSDMap *osdmap; +} BasePyOSDMap; + +typedef struct { + PyObject_HEAD + OSDMap::Incremental *inc; +} BasePyOSDMapIncremental; + +typedef struct { + PyObject_HEAD + std::shared_ptr<CrushWrapper> crush; +} BasePyCRUSH; + +// ---------- + +static PyObject *osdmap_get_epoch(BasePyOSDMap *self, PyObject *obj) +{ + return PyLong_FromLong(self->osdmap->get_epoch()); +} + +static PyObject *osdmap_get_crush_version(BasePyOSDMap* self, PyObject *obj) +{ + return PyLong_FromLong(self->osdmap->get_crush_version()); +} + +static PyObject *osdmap_dump(BasePyOSDMap* self, PyObject *obj) +{ + PyFormatter f; + self->osdmap->dump(&f, g_ceph_context); + return f.get(); +} + +static PyObject *osdmap_new_incremental(BasePyOSDMap *self, PyObject *obj) +{ + OSDMap::Incremental *inc = new OSDMap::Incremental; + + inc->fsid = self->osdmap->get_fsid(); + inc->epoch = self->osdmap->get_epoch() + 1; + // always include latest crush map here... this is okay since we never + // actually use this map in the real world (and even if we did it would + // be a no-op). + self->osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL); + dout(10) << __func__ << " " << inc << dendl; + + return construct_with_capsule("mgr_module", "OSDMapIncremental", + (void*)(inc)); +} + +static PyObject *osdmap_apply_incremental(BasePyOSDMap *self, + BasePyOSDMapIncremental *incobj) +{ + if (!PyObject_TypeCheck(incobj, &BasePyOSDMapIncrementalType)) { + derr << "Wrong type in osdmap_apply_incremental!" 
<< dendl; + return nullptr; + } + + bufferlist bl; + self->osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + OSDMap *next = new OSDMap; + next->decode(bl); + next->apply_incremental(*(incobj->inc)); + dout(10) << __func__ << " map " << self->osdmap << " inc " << incobj->inc + << " next " << next << dendl; + + return construct_with_capsule("mgr_module", "OSDMap", (void*)next); +} + +static PyObject *osdmap_get_crush(BasePyOSDMap* self, PyObject *obj) +{ + return construct_with_capsule("mgr_module", "CRUSHMap", + (void*)(&(self->osdmap->crush))); +} + +static PyObject *osdmap_get_pools_by_take(BasePyOSDMap* self, PyObject *args) +{ + int take; + if (!PyArg_ParseTuple(args, "i:get_pools_by_take", + &take)) { + return nullptr; + } + + PyFormatter f; + f.open_array_section("pools"); + for (auto& p : self->osdmap->get_pools()) { + if (self->osdmap->crush->rule_has_take(p.second.crush_rule, take)) { + f.dump_int("pool", p.first); + } + } + f.close_section(); + return f.get(); +} + +static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args) +{ + PyObject *pool_list; + BasePyOSDMapIncremental *incobj; + int max_deviation = 0; + int max_iterations = 0; + if (!PyArg_ParseTuple(args, "OiiO:calc_pg_upmaps", + &incobj, &max_deviation, + &max_iterations, &pool_list)) { + return nullptr; + } + if (!PyList_CheckExact(pool_list)) { + derr << __func__ << " pool_list not a list" << dendl; + return nullptr; + } + set<int64_t> pools; + for (auto i = 0; i < PyList_Size(pool_list); ++i) { + PyObject *pool_name = PyList_GET_ITEM(pool_list, i); + if (!PyUnicode_Check(pool_name)) { + derr << __func__ << " " << pool_name << " not a string" << dendl; + return nullptr; + } + auto pool_id = self->osdmap->lookup_pg_pool_name( + PyUnicode_AsUTF8(pool_name)); + if (pool_id < 0) { + derr << __func__ << " pool '" << PyUnicode_AsUTF8(pool_name) + << "' does not exist" << dendl; + return nullptr; + } + pools.insert(pool_id); + } + + dout(10) << __func__ << " osdmap " << self->osdmap << " inc " << incobj->inc + << " max_deviation " << max_deviation + << " max_iterations " << max_iterations + << " pools " << pools + << dendl; + PyThreadState *tstate = PyEval_SaveThread(); + int r = self->osdmap->calc_pg_upmaps(g_ceph_context, + max_deviation, + max_iterations, + pools, + incobj->inc); + PyEval_RestoreThread(tstate); + dout(10) << __func__ << " r = " << r << dendl; + return PyLong_FromLong(r); +} + +static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args) +{ + int poolid; + if (!PyArg_ParseTuple(args, "i:map_pool_pgs_up", + &poolid)) { + return nullptr; + } + auto pi = self->osdmap->get_pg_pool(poolid); + if (!pi) + return nullptr; + map<pg_t,vector<int>> pm; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, poolid); + self->osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr); + } + PyFormatter f; + for (auto p : pm) { + string pg = stringify(p.first); + f.open_array_section(pg.c_str()); + for (auto o : p.second) { + f.dump_int("osd", o); + } + f.close_section(); + } + return f.get(); +} + +static int +BasePyOSDMap_init(BasePyOSDMap *self, PyObject *args, PyObject *kwds) +{ + PyObject *osdmap_capsule = nullptr; + static const char *kwlist[] = {"osdmap_capsule", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &osdmap_capsule)) { + return -1; + } + if (!PyObject_TypeCheck(osdmap_capsule, &PyCapsule_Type)) { + PyErr_Format(PyExc_TypeError, + "Expected a PyCapsule_Type, not %s", + 
Py_TYPE(osdmap_capsule)->tp_name); + return -1; + } + + self->osdmap = (OSDMap*)PyCapsule_GetPointer( + osdmap_capsule, nullptr); + ceph_assert(self->osdmap); + + return 0; +} + + +static void +BasePyOSDMap_dealloc(BasePyOSDMap *self) +{ + if (self->osdmap) { + delete self->osdmap; + self->osdmap = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_pg_to_up_acting_osds(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + int ps = 0; + if (!PyArg_ParseTuple(args, "ii:pg_to_up_acting_osds", + &pool_id, &ps)) { + return nullptr; + } + + std::vector<int> up; + int up_primary; + std::vector<int> acting; + int acting_primary; + pg_t pg_id(ps, pool_id); + self->osdmap->pg_to_up_acting_osds(pg_id, + &up, &up_primary, + &acting, &acting_primary); + + // (Ab)use PyFormatter as a convenient way to generate a dict + PyFormatter f; + f.dump_int("up_primary", up_primary); + f.dump_int("acting_primary", acting_primary); + f.open_array_section("up"); + for (const auto &i : up) { + f.dump_int("osd", i); + } + f.close_section(); + f.open_array_section("acting"); + for (const auto &i : acting) { + f.dump_int("osd", i); + } + f.close_section(); + + return f.get(); +} + +static PyObject *osdmap_pool_raw_used_rate(BasePyOSDMap *self, PyObject *args) +{ + int pool_id = 0; + if (!PyArg_ParseTuple(args, "i:pool_raw_used_rate", + &pool_id)) { + return nullptr; + } + + if (!self->osdmap->have_pg_pool(pool_id)) { + return nullptr; + } + + float rate = self->osdmap->pool_raw_used_rate(pool_id); + + return PyFloat_FromDouble(rate); +} + +static PyObject *osdmap_build_simple(PyObject *cls, PyObject *args, PyObject *kwargs) +{ + static const char *kwlist[] = {"epoch", "uuid", "num_osd", nullptr}; + int epoch = 1; + char* uuid_str = nullptr; + int num_osd = -1; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "izi", + const_cast<char**>(kwlist), + &epoch, &uuid_str, &num_osd)) { + Py_RETURN_NONE; + } + uuid_d uuid; + if (uuid_str) { + if (!uuid.parse(uuid_str)) { + PyErr_Format(PyExc_ValueError, "bad uuid %s", uuid_str); + Py_RETURN_NONE; + } + } else { + uuid.generate_random(); + } + + auto osdmap = without_gil([&] { + OSDMap* osdmap = new OSDMap(); + // negative osd is allowed, in that case i just count all osds in ceph.conf + osdmap->build_simple(g_ceph_context, epoch, uuid, num_osd); + return osdmap; + }); + return construct_with_capsule("mgr_module", "OSDMap", reinterpret_cast<void*>(osdmap)); +} + +PyMethodDef BasePyOSDMap_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_get_epoch, METH_NOARGS, "Get OSDMap epoch"}, + {"_get_crush_version", (PyCFunction)osdmap_get_crush_version, METH_NOARGS, + "Get CRUSH version"}, + {"_dump", (PyCFunction)osdmap_dump, METH_NOARGS, "Dump OSDMap::Incremental"}, + {"_new_incremental", (PyCFunction)osdmap_new_incremental, METH_NOARGS, + "Create OSDMap::Incremental"}, + {"_apply_incremental", (PyCFunction)osdmap_apply_incremental, METH_O, + "Apply OSDMap::Incremental and return the resulting OSDMap"}, + {"_get_crush", (PyCFunction)osdmap_get_crush, METH_NOARGS, "Get CrushWrapper"}, + {"_get_pools_by_take", (PyCFunction)osdmap_get_pools_by_take, METH_VARARGS, + "Get pools that have CRUSH rules that TAKE the given root"}, + {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS, + "Calculate new pg-upmap values"}, + {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS, + "Calculate up set mappings for all PGs in a pool"}, + 
{"_pg_to_up_acting_osds", (PyCFunction)osdmap_pg_to_up_acting_osds, METH_VARARGS, + "Calculate up+acting OSDs for a PG ID"}, + {"_pool_raw_used_rate", (PyCFunction)osdmap_pool_raw_used_rate, METH_VARARGS, + "Get raw space to logical space ratio"}, + {"_build_simple", (PyCFunction)osdmap_build_simple, METH_VARARGS | METH_CLASS, + "Create a simple OSDMap"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyOSDMapType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMap", /* tp_name */ + sizeof(BasePyOSDMap), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMap_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMap", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMap_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMap_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + +// ---------- + + +static int +BasePyOSDMapIncremental_init(BasePyOSDMapIncremental *self, + PyObject *args, PyObject *kwds) +{ + PyObject *inc_capsule = nullptr; + static const char *kwlist[] = {"inc_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &inc_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(inc_capsule, &PyCapsule_Type)); + + self->inc = (OSDMap::Incremental*)PyCapsule_GetPointer( + inc_capsule, nullptr); + ceph_assert(self->inc); + + return 0; +} + +static void +BasePyOSDMapIncremental_dealloc(BasePyOSDMapIncremental *self) +{ + if (self->inc) { + delete self->inc; + self->inc = nullptr; + } else { + derr << "Destroying improperly initialized BasePyOSDMap " << self << dendl; + } + Py_TYPE(self)->tp_free(self); +} + +static PyObject *osdmap_inc_get_epoch(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + return PyLong_FromLong(self->inc->epoch); +} + +static PyObject *osdmap_inc_dump(BasePyOSDMapIncremental *self, + PyObject *obj) +{ + PyFormatter f; + self->inc->dump(&f); + return f.get(); +} + +static int get_int_float_map(PyObject *obj, map<int,double> *out) +{ + PyObject *ls = PyDict_Items(obj); + for (int j = 0; j < PyList_Size(ls); ++j) { + PyObject *pair = PyList_GET_ITEM(ls, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << j << " not a tuple" << dendl; + Py_DECREF(ls); + return -1; + } + int k; + double v; + if (!PyArg_ParseTuple(pair, "id:pair", &k, &v)) { + derr << __func__ << " item " << j << " not a size 2 tuple" << dendl; + Py_DECREF(ls); + return -1; + } + (*out)[k] = v; + } + + Py_DECREF(ls); + return 0; +} + +static PyObject *osdmap_inc_set_osd_reweights(BasePyOSDMapIncremental *self, + PyObject *weightobj) +{ + map<int,double> wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + for (auto i : wm) { + self->inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000; + } + Py_RETURN_NONE; +} + +static PyObject *osdmap_inc_set_compat_weight_set_weights( + BasePyOSDMapIncremental *self, PyObject *weightobj) +{ 
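+  // Decode the CRUSH map carried by the pending incremental, make sure a
+  // compat weight-set (choose_args) exists, adjust the weights of the
+  // requested items in it, then re-encode the modified CRUSH map back
+  // into the incremental.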
+ map<int,double> wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + CrushWrapper crush; + ceph_assert(self->inc->crush.length()); // see new_incremental + auto p = self->inc->crush.cbegin(); + decode(crush, p); + crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1); + for (auto i : wm) { + crush.choose_args_adjust_item_weightf( + g_ceph_context, + crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS), + i.first, + { i.second }, + nullptr); + } + self->inc->crush.clear(); + crush.encode(self->inc->crush, CEPH_FEATURES_ALL); + Py_RETURN_NONE; +} + +PyMethodDef BasePyOSDMapIncremental_methods[] = { + {"_get_epoch", (PyCFunction)osdmap_inc_get_epoch, METH_NOARGS, + "Get OSDMap::Incremental epoch"}, + {"_dump", (PyCFunction)osdmap_inc_dump, METH_NOARGS, + "Dump OSDMap::Incremental"}, + {"_set_osd_reweights", (PyCFunction)osdmap_inc_set_osd_reweights, + METH_O, "Set osd reweight values"}, + {"_set_crush_compat_weight_set_weights", + (PyCFunction)osdmap_inc_set_compat_weight_set_weights, METH_O, + "Set weight values in the pending CRUSH compat weight-set"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyOSDMapIncrementalType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyOSDMapIncremental", /* tp_name */ + sizeof(BasePyOSDMapIncremental), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyOSDMapIncremental_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyOSDMapIncremental_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyOSDMapIncremental_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + + +// ---------- + +static int +BasePyCRUSH_init(BasePyCRUSH *self, + PyObject *args, PyObject *kwds) +{ + PyObject *crush_capsule = nullptr; + static const char *kwlist[] = {"crush_capsule", NULL}; + + if (! PyArg_ParseTupleAndKeywords(args, kwds, "O", + const_cast<char**>(kwlist), + &crush_capsule)) { + ceph_abort(); + return -1; + } + ceph_assert(PyObject_TypeCheck(crush_capsule, &PyCapsule_Type)); + + auto ptr_ref = (std::shared_ptr<CrushWrapper>*)( + PyCapsule_GetPointer(crush_capsule, nullptr)); + + // We passed a pointer to a shared pointer, which is weird, but + // just enough to get it into the constructor: this is a real shared + // pointer construction now, and then we throw away that pointer to + // the shared pointer. 
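+  // (Copying *ptr_ref below takes shared ownership of the CrushWrapper,
+  // so this Python-side object keeps the map alive on its own and the
+  // pointer-to-shared_ptr from the capsule is not needed afterwards.)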
+ self->crush = *ptr_ref; + ceph_assert(self->crush); + + return 0; +} + +static void +BasePyCRUSH_dealloc(BasePyCRUSH *self) +{ + self->crush.reset(); + Py_TYPE(self)->tp_free(self); +} + +static PyObject *crush_dump(BasePyCRUSH *self, PyObject *obj) +{ + PyFormatter f; + self->crush->dump(&f); + return f.get(); +} + +static PyObject *crush_get_item_name(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_name", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyUnicode_FromString(self->crush->get_item_name(item)); +} + +static PyObject *crush_get_item_weight(BasePyCRUSH *self, PyObject *args) +{ + int item; + if (!PyArg_ParseTuple(args, "i:get_item_weight", &item)) { + return nullptr; + } + if (!self->crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyFloat_FromDouble(self->crush->get_item_weightf(item)); +} + +static PyObject *crush_find_roots(BasePyCRUSH *self) +{ + set<int> roots; + self->crush->find_roots(&roots); + PyFormatter f; + f.open_array_section("roots"); + for (auto root : roots) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_find_takes(BasePyCRUSH *self, PyObject *obj) +{ + set<int> takes; + self->crush->find_takes(&takes); + PyFormatter f; + f.open_array_section("takes"); + for (auto root : takes) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_get_take_weight_osd_map(BasePyCRUSH *self, PyObject *args) +{ + int root; + if (!PyArg_ParseTuple(args, "i:get_take_weight_osd_map", + &root)) { + return nullptr; + } + map<int,float> wmap; + + if (!self->crush->item_exists(root)) { + return nullptr; + } + + self->crush->get_take_weight_osd_map(root, &wmap); + PyFormatter f; + f.open_object_section("weights"); + for (auto& p : wmap) { + string n = stringify(p.first); // ick + f.dump_float(n.c_str(), p.second); + } + f.close_section(); + return f.get(); +} + +PyMethodDef BasePyCRUSH_methods[] = { + {"_dump", (PyCFunction)crush_dump, METH_NOARGS, "Dump map"}, + {"_get_item_name", (PyCFunction)crush_get_item_name, METH_VARARGS, + "Get item name"}, + {"_get_item_weight", (PyCFunction)crush_get_item_weight, METH_VARARGS, + "Get item weight"}, + {"_find_roots", (PyCFunction)crush_find_roots, METH_NOARGS, + "Find all tree roots"}, + {"_find_takes", (PyCFunction)crush_find_takes, METH_NOARGS, + "Find distinct TAKE roots"}, + {"_get_take_weight_osd_map", (PyCFunction)crush_get_take_weight_osd_map, + METH_VARARGS, "Get OSD weight map for a given TAKE root node"}, + {NULL, NULL, 0, NULL} +}; + +PyTypeObject BasePyCRUSHType = { + PyVarObject_HEAD_INIT(NULL, 0) + "ceph_module.BasePyCRUSH", /* tp_name */ + sizeof(BasePyCRUSH), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)BasePyCRUSH_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Ceph OSDMapIncremental", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + BasePyCRUSH_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* 
tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)BasePyCRUSH_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h new file mode 100644 index 000000000..2cc30dfe2 --- /dev/null +++ b/src/mgr/PyOSDMap.h @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <Python.h> + +#include <string> + +extern PyTypeObject BasePyOSDMapType; +extern PyTypeObject BasePyOSDMapIncrementalType; +extern PyTypeObject BasePyCRUSHType; + +PyObject *construct_with_capsule( + const std::string &module, + const std::string &clsname, + void *wrapped); + diff --git a/src/mgr/PyUtil.cc b/src/mgr/PyUtil.cc new file mode 100644 index 000000000..a8efc2f28 --- /dev/null +++ b/src/mgr/PyUtil.cc @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <Python.h> + +#include "PyUtil.h" + +PyObject *get_python_typed_option_value( + Option::type_t type, + const std::string& value) +{ + switch (type) { + case Option::TYPE_INT: + case Option::TYPE_UINT: + case Option::TYPE_SIZE: + return PyLong_FromString((char *)value.c_str(), nullptr, 0); + case Option::TYPE_SECS: + case Option::TYPE_MILLISECS: + case Option::TYPE_FLOAT: + { + PyObject *s = PyUnicode_FromString(value.c_str()); + PyObject *f = PyFloat_FromString(s); + Py_DECREF(s); + return f; + } + case Option::TYPE_BOOL: + if (value == "1" || value == "true" || value == "True" || + value == "on" || value == "yes") { + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } + case Option::TYPE_STR: + case Option::TYPE_ADDR: + case Option::TYPE_ADDRVEC: + case Option::TYPE_UUID: + break; + } + return PyUnicode_FromString(value.c_str()); +} diff --git a/src/mgr/PyUtil.h b/src/mgr/PyUtil.h new file mode 100644 index 000000000..188b3d28f --- /dev/null +++ b/src/mgr/PyUtil.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> + +#include <Python.h> + +#include "common/options.h" + +PyObject *get_python_typed_option_value( + Option::type_t type, + const std::string& value); diff --git a/src/mgr/ServiceMap.cc b/src/mgr/ServiceMap.cc new file mode 100644 index 000000000..fd91d19ab --- /dev/null +++ b/src/mgr/ServiceMap.cc @@ -0,0 +1,242 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr/ServiceMap.h" + +#include <fmt/format.h> + +#include "common/Formatter.h" + +using ceph::bufferlist; +using ceph::Formatter; + +// Daemon + +void ServiceMap::Daemon::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(2, 1, bl); + encode(gid, bl); + encode(addr, bl, features); + encode(start_epoch, bl); + encode(start_stamp, bl); + encode(metadata, bl); + encode(task_status, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Daemon::decode(bufferlist::const_iterator& p) +{ + DECODE_START(2, p); + decode(gid, p); + decode(addr, p); + decode(start_epoch, p); + decode(start_stamp, p); + decode(metadata, p); + if (struct_v >= 2) { + decode(task_status, p); + } + DECODE_FINISH(p); +} + +void ServiceMap::Daemon::dump(Formatter *f) const +{ + f->dump_unsigned("start_epoch", start_epoch); + f->dump_stream("start_stamp") << start_stamp; + f->dump_unsigned("gid", gid); + f->dump_string("addr", addr.get_legacy_str()); + 
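+  // per-daemon static metadata and task status are dumped as nested objects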
f->open_object_section("metadata"); + for (auto& p : metadata) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); + f->open_object_section("task_status"); + for (auto& p : task_status) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Daemon::generate_test_instances(std::list<Daemon*>& ls) +{ + ls.push_back(new Daemon); + ls.push_back(new Daemon); + ls.back()->gid = 222; + ls.back()->metadata["this"] = "that"; + ls.back()->task_status["task1"] = "running"; +} + +// Service + +std::string ServiceMap::Service::get_summary() const +{ + if (!summary.empty()) { + return summary; + } + if (daemons.empty()) { + return "no daemons active"; + } + + // If "daemon_type" is present, this will be used in place of "daemon" when + // reporting the count (e.g., "${N} daemons"). + // + // We will additional break down the count by various groupings, based + // on the following keys: + // + // "hostname" -> host(s) + // "zone_id" -> zone(s) + // + // The `ceph -s` will be something likes: + // iscsi: 3 portals active (3 hosts) + // rgw: 3 gateways active (3 hosts, 1 zone) + + std::map<std::string, std::set<std::string>> groupings; + std::string type("daemon"); + int num = 0; + for (auto& d : daemons) { + ++num; + if (auto p = d.second.metadata.find("daemon_type"); + p != d.second.metadata.end()) { + type = p->second; + } + for (auto k : {std::make_pair("zone", "zone_id"), + std::make_pair("host", "hostname")}) { + auto p = d.second.metadata.find(k.second); + if (p != d.second.metadata.end()) { + groupings[k.first].insert(p->second); + } + } + } + + std::ostringstream ss; + ss << num << " " << type << (num > 1 ? "s" : "") << " active"; + if (groupings.size()) { + ss << " ("; + for (auto i = groupings.begin(); i != groupings.end(); ++i) { + if (i != groupings.begin()) { + ss << ", "; + } + ss << i->second.size() << " " << i->first << (i->second.size() ? 
"s" : ""); + } + ss << ")"; + } + + return ss.str(); +} + +bool ServiceMap::Service::has_running_tasks() const +{ + return std::any_of(daemons.begin(), daemons.end(), [](auto& daemon) { + return !daemon.second.task_status.empty(); + }); +} + +std::string ServiceMap::Service::get_task_summary(const std::string_view task_prefix) const +{ + // contruct a map similar to: + // {"service1 status" -> {"service1.0" -> "running"}} + // {"service2 status" -> {"service2.0" -> "idle"}, + // {"service2.1" -> "running"}} + std::map<std::string, std::map<std::string, std::string>> by_task; + for (const auto& [service_id, daemon] : daemons) { + for (const auto& [task_name, status] : daemon.task_status) { + by_task[task_name].emplace(fmt::format("{}.{}", task_prefix, service_id), + status); + } + } + std::stringstream ss; + for (const auto &[task_name, status_by_service] : by_task) { + ss << "\n " << task_name << ":"; + for (auto& [service, status] : status_by_service) { + ss << "\n " << service << ": " << status; + } + } + return ss.str(); +} + +void ServiceMap::Service::count_metadata(const std::string& field, + std::map<std::string,int> *out) const +{ + for (auto& p : daemons) { + auto q = p.second.metadata.find(field); + if (q == p.second.metadata.end()) { + (*out)["unknown"]++; + } else { + (*out)[q->second]++; + } + } +} + +void ServiceMap::Service::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(daemons, bl, features); + encode(summary, bl); + ENCODE_FINISH(bl); +} + +void ServiceMap::Service::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(daemons, p); + decode(summary, p); + DECODE_FINISH(p); +} + +void ServiceMap::Service::dump(Formatter *f) const +{ + f->open_object_section("daemons"); + f->dump_string("summary", summary); + for (auto& p : daemons) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::Service::generate_test_instances(std::list<Service*>& ls) +{ + ls.push_back(new Service); + ls.push_back(new Service); + ls.back()->daemons["one"].gid = 1; + ls.back()->daemons["two"].gid = 2; +} + +// ServiceMap + +void ServiceMap::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + encode(epoch, bl); + encode(modified, bl); + encode(services, bl, features); + ENCODE_FINISH(bl); +} + +void ServiceMap::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(epoch, p); + decode(modified, p); + decode(services, p); + DECODE_FINISH(p); +} + +void ServiceMap::dump(Formatter *f) const +{ + f->dump_unsigned("epoch", epoch); + f->dump_stream("modified") << modified; + f->open_object_section("services"); + for (auto& p : services) { + f->dump_object(p.first.c_str(), p.second); + } + f->close_section(); +} + +void ServiceMap::generate_test_instances(std::list<ServiceMap*>& ls) +{ + ls.push_back(new ServiceMap); + ls.push_back(new ServiceMap); + ls.back()->epoch = 123; + ls.back()->services["rgw"].daemons["one"].gid = 123; + ls.back()->services["rgw"].daemons["two"].gid = 344; + ls.back()->services["iscsi"].daemons["foo"].gid = 3222; +} diff --git a/src/mgr/ServiceMap.h b/src/mgr/ServiceMap.h new file mode 100644 index 000000000..ed027907c --- /dev/null +++ b/src/mgr/ServiceMap.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> +#include <list> +#include <sstream> + +#include "include/utime.h" +#include "include/buffer.h" +#include 
"msg/msg_types.h" + +namespace ceph { + class Formatter; +} + +struct ServiceMap { + struct Daemon { + uint64_t gid = 0; + entity_addr_t addr; + epoch_t start_epoch = 0; ///< epoch first registered + utime_t start_stamp; ///< timestamp daemon started/registered + std::map<std::string,std::string> metadata; ///< static metadata + std::map<std::string,std::string> task_status; ///< running task status + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Daemon*>& ls); + }; + + struct Service { + std::map<std::string,Daemon> daemons; + std::string summary; ///< summary status std::string for 'ceph -s' + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Service*>& ls); + + std::string get_summary() const; + bool has_running_tasks() const; + std::string get_task_summary(const std::string_view task_prefix) const; + void count_metadata(const std::string& field, + std::map<std::string,int> *out) const; + }; + + epoch_t epoch = 0; + utime_t modified; + std::map<std::string,Service> services; + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ServiceMap*>& ls); + + std::pair<Daemon*,bool> get_daemon(const std::string& service, + const std::string& daemon) { + auto& s = services[service]; + auto [d, added] = s.daemons.try_emplace(daemon); + return {&d->second, added}; + } + + bool rm_daemon(const std::string& service, + const std::string& daemon) { + auto p = services.find(service); + if (p == services.end()) { + return false; + } + auto q = p->second.daemons.find(daemon); + if (q == p->second.daemons.end()) { + return false; + } + p->second.daemons.erase(q); + if (p->second.daemons.empty()) { + services.erase(p); + } + return true; + } + + static inline bool is_normal_ceph_entity(std::string_view type) { + if (type == "osd" || + type == "client" || + type == "mon" || + type == "mds" || + type == "mgr") { + return true; + } + + return false; + } +}; +WRITE_CLASS_ENCODER_FEATURES(ServiceMap) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Service) +WRITE_CLASS_ENCODER_FEATURES(ServiceMap::Daemon) diff --git a/src/mgr/StandbyPyModules.cc b/src/mgr/StandbyPyModules.cc new file mode 100644 index 000000000..337aab029 --- /dev/null +++ b/src/mgr/StandbyPyModules.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "StandbyPyModules.h" + +#include "common/Finisher.h" +#include "common/debug.h" +#include "common/errno.h" + +#include "mgr/MgrContext.h" +#include "mgr/Gil.h" + +// For ::mgr_store_prefix +#include "PyModuleRegistry.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr +#undef dout_prefix +#define dout_prefix *_dout << "mgr " << __func__ << " " + + +StandbyPyModules::StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc_, + Finisher &f) + : state(module_config, monc_), + clog(clog_), + finisher(f) +{ + state.set_mgr_map(mgr_map_); +} + +// FIXME: completely identical to ActivePyModules +void StandbyPyModules::shutdown() +{ + std::lock_guard locker(lock); + + // Signal modules to drop out of serve() and/or tear down resources + for (auto &i : modules) { + auto module = i.second.get(); + const auto& name = i.first; + dout(10) << "waiting for module " << name << " to shutdown" << dendl; + lock.unlock(); + module->shutdown(); + lock.lock(); + dout(10) << "module " << name << " shutdown" << dendl; + } + + // For modules implementing serve(), finish the threads where we + // were running that. + for (auto &i : modules) { + lock.unlock(); + dout(10) << "joining thread for module " << i.first << dendl; + i.second->thread.join(); + dout(10) << "joined thread for module " << i.first << dendl; + lock.lock(); + } + + modules.clear(); +} + +void StandbyPyModules::start_one(PyModuleRef py_module) +{ + std::lock_guard l(lock); + const auto name = py_module->get_name(); + auto standby_module = new StandbyPyModule(state, py_module, clog); + + // Send all python calls down a Finisher to avoid blocking + // C++ code, and avoid any potential lock cycles. + finisher.queue(new LambdaContext([this, standby_module, name](int) { + int r = standby_module->load(); + if (r != 0) { + derr << "Failed to run module in standby mode ('" << name << "')" + << dendl; + delete standby_module; + } else { + std::lock_guard l(lock); + auto em = modules.emplace(name, standby_module); + ceph_assert(em.second); // actually inserted + + dout(4) << "Starting thread for " << name << dendl; + standby_module->thread.create(standby_module->get_thread_name()); + } + })); +} + +int StandbyPyModule::load() +{ + Gil gil(py_module->pMyThreadState, true); + + // We tell the module how we name it, so that it can be consistent + // with us in logging etc. 
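+  // Standby modules are constructed with just (module_name, this-capsule):
+  // there is no ActivePyModules context to hand over in standby mode.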
+ auto pThisPtr = PyCapsule_New(this, nullptr, nullptr); + ceph_assert(pThisPtr != nullptr); + auto pModuleName = PyUnicode_FromString(get_name().c_str()); + ceph_assert(pModuleName != nullptr); + auto pArgs = PyTuple_Pack(2, pModuleName, pThisPtr); + Py_DECREF(pThisPtr); + Py_DECREF(pModuleName); + + pClassInstance = PyObject_CallObject(py_module->pStandbyClass, pArgs); + Py_DECREF(pArgs); + if (pClassInstance == nullptr) { + derr << "Failed to construct class in '" << get_name() << "'" << dendl; + derr << handle_pyerror(true, get_name(), "StandbyPyModule::load") << dendl; + return -EINVAL; + } else { + dout(1) << "Constructed class from module: " << get_name() << dendl; + return 0; + } +} + +bool StandbyPyModule::get_config(const std::string &key, + std::string *value) const +{ + const std::string global_key = "mgr/" + get_name() + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + return state.with_config([global_key, value](const PyModuleConfig &config){ + if (config.config.count(global_key)) { + *value = config.config.at(global_key); + return true; + } else { + return false; + } + }); +} + +bool StandbyPyModule::get_store(const std::string &key, + std::string *value) const +{ + + const std::string global_key = PyModule::mgr_store_prefix + + get_name() + "/" + key; + + dout(4) << __func__ << " key: " << global_key << dendl; + + // Active modules use a cache of store values (kept up to date + // as writes pass through the active mgr), but standbys + // fetch values synchronously to get an up to date value. + // It's an acceptable cost because standby modules should not be + // doing a lot. + + MonClient &monc = state.get_monc(); + + std::ostringstream cmd_json; + cmd_json << "{\"prefix\": \"config-key get\", \"key\": \"" + << global_key << "\"}"; + + bufferlist outbl; + std::string outs; + C_SaferCond c; + monc.start_mon_command( + {cmd_json.str()}, + {}, + &outbl, + &outs, + &c); + + int r = c.wait(); + if (r == -ENOENT) { + return false; + } else if (r != 0) { + // This is some internal error, not meaningful to python modules, + // so let them just see no value. + derr << __func__ << " error fetching store key '" << global_key << "': " + << cpp_strerror(r) << " " << outs << dendl; + return false; + } else { + *value = outbl.to_str(); + return true; + } +} + +std::string StandbyPyModule::get_active_uri() const +{ + std::string result; + state.with_mgr_map([&result, this](const MgrMap &mgr_map){ + auto iter = mgr_map.services.find(get_name()); + if (iter != mgr_map.services.end()) { + result = iter->second; + } + }); + + return result; +} + diff --git a/src/mgr/StandbyPyModules.h b/src/mgr/StandbyPyModules.h new file mode 100644 index 000000000..501dfc8c7 --- /dev/null +++ b/src/mgr/StandbyPyModules.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#pragma once + +#include <string> +#include <map> + +#include <Python.h> + +#include "common/Thread.h" +#include "common/ceph_mutex.h" + +#include "mgr/Gil.h" +#include "mon/MonClient.h" +#include "mon/MgrMap.h" +#include "mgr/PyModuleRunner.h" + +class Finisher; + +/** + * State that is read by all modules running in standby mode + */ +class StandbyPyModuleState +{ + mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModuleState::lock"); + + MgrMap mgr_map; + PyModuleConfig &module_config; + MonClient &monc; + +public: + + + StandbyPyModuleState(PyModuleConfig &module_config_, MonClient &monc_) + : module_config(module_config_), monc(monc_) + {} + + void set_mgr_map(const MgrMap &mgr_map_) + { + std::lock_guard l(lock); + + mgr_map = mgr_map_; + } + + // MonClient does all its own locking so we're happy to hand out + // references. + MonClient &get_monc() {return monc;}; + + template<typename Callback, typename...Args> + void with_mgr_map(Callback&& cb, Args&&...args) const + { + std::lock_guard l(lock); + std::forward<Callback>(cb)(mgr_map, std::forward<Args>(args)...); + } + + template<typename Callback, typename...Args> + auto with_config(Callback&& cb, Args&&... args) const -> + decltype(cb(module_config, std::forward<Args>(args)...)) { + std::lock_guard l(lock); + + return std::forward<Callback>(cb)(module_config, std::forward<Args>(args)...); + } +}; + + +class StandbyPyModule : public PyModuleRunner +{ + StandbyPyModuleState &state; + + public: + + StandbyPyModule( + StandbyPyModuleState &state_, + const PyModuleRef &py_module_, + LogChannelRef clog_) + : + PyModuleRunner(py_module_, clog_), + state(state_) + { + } + + bool get_config(const std::string &key, std::string *value) const; + bool get_store(const std::string &key, std::string *value) const; + std::string get_active_uri() const; + entity_addrvec_t get_myaddrs() const { + return state.get_monc().get_myaddrs(); + } + + int load(); +}; + +class StandbyPyModules +{ +private: + mutable ceph::mutex lock = ceph::make_mutex("StandbyPyModules::lock"); + std::map<std::string, std::unique_ptr<StandbyPyModule>> modules; + + StandbyPyModuleState state; + + LogChannelRef clog; + + Finisher &finisher; + +public: + + StandbyPyModules( + const MgrMap &mgr_map_, + PyModuleConfig &module_config, + LogChannelRef clog_, + MonClient &monc, + Finisher &f); + + void start_one(PyModuleRef py_module); + + void shutdown(); + + void handle_mgr_map(const MgrMap &mgr_map) + { + state.set_mgr_map(mgr_map); + } + +}; diff --git a/src/mgr/TTLCache.cc b/src/mgr/TTLCache.cc new file mode 100644 index 000000000..05fe95987 --- /dev/null +++ b/src/mgr/TTLCache.cc @@ -0,0 +1,100 @@ +#include "TTLCache.h" + +#include <chrono> +#include <functional> +#include <string> + +#include "PyUtil.h" + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::insert(Key key, Value value) { + auto now = std::chrono::steady_clock::now(); + + if (!ttl) return; + int16_t random_ttl_offset = + ttl * ttl_spread_ratio * (2l * rand() / float(RAND_MAX) - 1); + // in order not to have spikes of misses we increase or decrease by 25% of + // the ttl + int16_t spreaded_ttl = ttl + random_ttl_offset; + auto expiration_date = now + std::chrono::seconds(spreaded_ttl); + cache::insert(key, {value, expiration_date}); +} + +template <class Key, class Value> Value TTLCacheBase<Key, Value>::get(Key key) { + if (!exists(key)) { + throw_key_not_found(key); + } + if (expired(key)) { + erase(key); + throw_key_not_found(key); + } + Value value = {get_value(key)}; + return 
value; +} + +template <class Key> PyObject* TTLCache<Key, PyObject*>::get(Key key) { + if (!this->exists(key)) { + this->throw_key_not_found(key); + } + if (this->expired(key)) { + this->erase(key); + this->throw_key_not_found(key); + } + PyObject* cached_value = this->get_value(key); + Py_INCREF(cached_value); + return cached_value; +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::erase(Key key) { + cache::erase(key); +} + +template <class Key> void TTLCache<Key, PyObject*>::erase(Key key) { + Py_DECREF(this->get_value(key, false)); + ttl_base::erase(key); +} + +template <class Key, class Value> +bool TTLCacheBase<Key, Value>::expired(Key key) { + ttl_time_point expiration_date = get_value_time_point(key); + auto now = std::chrono::steady_clock::now(); + if (now >= expiration_date) { + return true; + } else { + return false; + } +} + +template <class Key, class Value> void TTLCacheBase<Key, Value>::clear() { + cache::clear(); +} + +template <class Key, class Value> +Value TTLCacheBase<Key, Value>::get_value(Key key, bool count_hit) { + value_type stored_value = cache::get(key, count_hit); + Value value = std::get<0>(stored_value); + return value; +} + +template <class Key, class Value> +ttl_time_point TTLCacheBase<Key, Value>::get_value_time_point(Key key) { + value_type stored_value = cache::get(key, false); + ttl_time_point tp = std::get<1>(stored_value); + return tp; +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::set_ttl(uint16_t ttl) { + this->ttl = ttl; +} + +template <class Key, class Value> +bool TTLCacheBase<Key, Value>::exists(Key key) { + return cache::exists(key); +} + +template <class Key, class Value> +void TTLCacheBase<Key, Value>::throw_key_not_found(Key key) { + cache::throw_key_not_found(key); +} diff --git a/src/mgr/TTLCache.h b/src/mgr/TTLCache.h new file mode 100644 index 000000000..d29b787bb --- /dev/null +++ b/src/mgr/TTLCache.h @@ -0,0 +1,122 @@ +#pragma once + +#include <atomic> +#include <chrono> +#include <functional> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "PyUtil.h" + +template <class Key, class Value> class Cache { + private: + std::atomic<uint64_t> hits, misses; + + protected: + unsigned int capacity; + Cache(unsigned int size = UINT16_MAX) : hits{0}, misses{0}, capacity{size} {}; + std::map<Key, Value> content; + std::vector<std::string> allowed_keys = {"osd_map", "pg_dump", "pg_stats"}; + + void mark_miss() { + misses++; + } + + void mark_hit() { + hits++; + } + + unsigned int get_misses() { return misses; } + unsigned int get_hits() { return hits; } + void throw_key_not_found(Key key) { + std::stringstream ss; + ss << "Key " << key << " couldn't be found\n"; + throw std::out_of_range(ss.str()); + } + + public: + void insert(Key key, Value value) { + mark_miss(); + if (content.size() < capacity) { + content.insert({key, value}); + } + } + Value get(Key key, bool count_hit = true) { + if (count_hit) { + mark_hit(); + } + return content[key]; + } + void erase(Key key) { content.erase(content.find(key)); } + void clear() { content.clear(); } + bool exists(Key key) { return content.find(key) != content.end(); } + std::pair<uint64_t, uint64_t> get_hit_miss_ratio() { + return std::make_pair(hits.load(), misses.load()); + } + bool is_cacheable(Key key) { + for (auto k : allowed_keys) { + if (key == k) return true; + } + return false; + } + int size() { return content.size(); } + + ~Cache(){}; +}; + +using ttl_time_point = std::chrono::time_point<std::chrono::steady_clock>; 
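+// A minimal usage sketch (illustrative only, not part of the mgr code
+// paths; the key/value types and the 10s TTL are arbitrary choices):
+//
+//   TTLCache<std::string, int> cache(10);  // entries expire after ~10s (+/- spread)
+//   cache.insert("osd_map", 42);
+//   try {
+//     int v = cache.get("osd_map");        // hit while the entry is fresh
+//   } catch (const std::out_of_range&) {
+//     // missing or expired: recompute the value and insert it again
+//   }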
+template <class Key, class Value> +class TTLCacheBase : public Cache<Key, std::pair<Value, ttl_time_point>> { + private: + uint16_t ttl; + float ttl_spread_ratio; + using value_type = std::pair<Value, ttl_time_point>; + using cache = Cache<Key, value_type>; + + protected: + Value get_value(Key key, bool count_hit = true); + ttl_time_point get_value_time_point(Key key); + bool exists(Key key); + bool expired(Key key); + void finish_get(Key key); + void finish_erase(Key key); + void throw_key_not_found(Key key); + + public: + TTLCacheBase(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, + float spread = 0.25) + : Cache<Key, value_type>(size), ttl{ttl_}, ttl_spread_ratio{spread} {} + ~TTLCacheBase(){}; + void insert(Key key, Value value); + Value get(Key key); + void erase(Key key); + void clear(); + uint16_t get_ttl() { return ttl; }; + void set_ttl(uint16_t ttl); +}; + +template <class Key, class Value> +class TTLCache : public TTLCacheBase<Key, Value> { + public: + TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25) + : TTLCacheBase<Key, Value>(ttl_, size, spread) {} + ~TTLCache(){}; +}; + +template <class Key> +class TTLCache<Key, PyObject*> : public TTLCacheBase<Key, PyObject*> { + public: + TTLCache(uint16_t ttl_ = 0, uint16_t size = UINT16_MAX, float spread = 0.25) + : TTLCacheBase<Key, PyObject*>(ttl_, size, spread) {} + ~TTLCache(){}; + PyObject* get(Key key); + void erase(Key key); + + private: + using ttl_base = TTLCacheBase<Key, PyObject*>; +}; + +#include "TTLCache.cc" + diff --git a/src/mgr/Types.h b/src/mgr/Types.h new file mode 100644 index 000000000..ab90bbbe9 --- /dev/null +++ b/src/mgr/Types.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MGR_TYPES_H +#define CEPH_MGR_TYPES_H + +typedef int MetricQueryID; + +typedef std::pair<uint64_t,uint64_t> PerformanceCounter; +typedef std::vector<PerformanceCounter> PerformanceCounters; + +struct MetricListener { + virtual ~MetricListener() { + } + + virtual void handle_query_updated() = 0; +}; + +struct PerfCollector { + MetricQueryID query_id; + PerfCollector(MetricQueryID query_id) + : query_id(query_id) { + } +}; + +#endif // CEPH_MGR_TYPES_H diff --git a/src/mgr/mgr_commands.cc b/src/mgr/mgr_commands.cc new file mode 100644 index 000000000..206d1126a --- /dev/null +++ b/src/mgr/mgr_commands.cc @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mgr_commands.h" + +/* The set of statically defined (C++-handled) commands. 
This + * does not include the Python-defined commands, which are loaded + * in PyModules */ +const std::vector<MonCommand> mgr_commands = { +#define COMMAND(parsesig, helptext, module, perm) \ + {parsesig, helptext, module, perm, 0}, +#include "MgrCommands.h" +#undef COMMAND +}; diff --git a/src/mgr/mgr_commands.h b/src/mgr/mgr_commands.h new file mode 100644 index 000000000..c6ed6c68d --- /dev/null +++ b/src/mgr/mgr_commands.h @@ -0,0 +1,9 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "mon/MonCommand.h" +#include <vector> + +extern const std::vector<MonCommand> mgr_commands; diff --git a/src/mgr/mgr_perf_counters.cc b/src/mgr/mgr_perf_counters.cc new file mode 100644 index 000000000..1b5585f9e --- /dev/null +++ b/src/mgr/mgr_perf_counters.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "mgr_perf_counters.h" +#include "common/perf_counters.h" +#include "common/ceph_context.h" + +PerfCounters *perfcounter = NULL; + +int mgr_perf_start(CephContext *cct) +{ + PerfCountersBuilder plb(cct, "mgr", l_mgr_first, l_mgr_last); + plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + plb.add_u64_counter(l_mgr_cache_hit, "cache_hit", "Cache hits"); + plb.add_u64_counter(l_mgr_cache_miss, "cache_miss", "Cache miss"); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); + return 0; +} + +void mgr_perf_stop(CephContext *cct) +{ + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; +} diff --git a/src/mgr/mgr_perf_counters.h b/src/mgr/mgr_perf_counters.h new file mode 100644 index 000000000..d695d905f --- /dev/null +++ b/src/mgr/mgr_perf_counters.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once +#include "include/common_fwd.h" + +extern PerfCounters* perfcounter; + +extern int mgr_perf_start(CephContext* cct); +extern void mgr_perf_stop(CephContext* cct); + +enum { + l_mgr_first, + + l_mgr_cache_hit, + l_mgr_cache_miss, + + l_mgr_last, +}; + |
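+// Typical use of these counters elsewhere in the mgr (illustrative sketch):
+//   if (perfcounter) {
+//     perfcounter->inc(l_mgr_cache_hit);
+//   }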