summaryrefslogtreecommitdiffstats
path: root/src/mgr/DaemonHealthMetricCollector.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/mgr/DaemonHealthMetricCollector.cc')
-rw-r--r--src/mgr/DaemonHealthMetricCollector.cc125
1 files changed, 125 insertions, 0 deletions
diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc
new file mode 100644
index 00000000..1c3dc431
--- /dev/null
+++ b/src/mgr/DaemonHealthMetricCollector.cc
@@ -0,0 +1,125 @@
+#include <boost/format.hpp>
+
+#include "include/health.h"
+#include "include/types.h"
+#include "DaemonHealthMetricCollector.h"
+
+
+
+// Stream a daemon key as "<first>.<second>".
+ostream& operator<<(ostream& os,
+                    const DaemonHealthMetricCollector::DaemonKey& daemon) {
+  os << daemon.first << "." << daemon.second;
+  return os;
+}
+
+// Keep this overload below operator<<(ostream&, const DaemonKey&): the
+// per-element output in the loop is meant to pick that overload instead of
+// the generic operator<<(ostream&, const std::pair<A,B>&).
+//
+// Streams the keys as a comma-separated list wrapped in square brackets.
+ostream& operator<<(
+  ostream& os,
+  const vector<DaemonHealthMetricCollector::DaemonKey>& daemons)
+{
+  os << "[";
+  bool first = true;
+  for (const auto& key : daemons) {
+    // separator goes before every element except the first one
+    if (!first) {
+      os << ",";
+    }
+    first = false;
+    os << key;
+  }
+  os << "]";
+  return os;
+}
+
+namespace {
+
+// Collector for daemon_metric::SLOW_OPS. It sums the reported slow-op counts,
+// tracks the largest reported blocked time, and writes a one-line summary
+// into the "SLOW_OPS" health check. `value` is not declared in this class;
+// presumably it is inherited from DaemonHealthMetricCollector.
+class SlowOps final : public DaemonHealthMetricCollector {
+  // Daemons that reported a non-zero slow-op count or blocked time.
+  vector<DaemonKey> daemons;
+
+  // Only SLOW_OPS metrics are handled by this collector.
+  bool _is_relevant(daemon_metric type) const override {
+    return daemon_metric::SLOW_OPS == type;
+  }
+
+  // Fetch (or create) the "SLOW_OPS" warning entry in the health check map.
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "");
+  }
+
+  // Fold one daemon's metric into the running totals.
+  // Returns true when the daemon reported anything non-zero, in which case
+  // it is also remembered for the summary; returns false otherwise.
+  bool _update(const DaemonKey& daemon,
+               const DaemonHealthMetric& metric) override {
+    const auto slow_count = metric.get_n1();
+    const auto blocked_for = metric.get_n2();
+    value.n1 += slow_count;
+    value.n2 = std::max(value.n2, blocked_for);
+    if (!slow_count && !blocked_for) {
+      return false;
+    }
+    daemons.push_back(daemon);
+    return true;
+  }
+
+  // Build check.summary from the accumulated totals and the affected daemons.
+  // Does nothing when no daemon was recorded. At most the first 10 daemons
+  // are listed; a longer list is cut off and followed by "...".
+  void _summarize(health_check_t& check) const override {
+    if (daemons.empty()) {
+      return;
+    }
+    // Note this message format is used in mgr/prometheus, so any change in format
+    // requires a corresponding change in the mgr/prometheus module.
+    static const char* fmt = "%1% slow ops, oldest one blocked for %2% sec, %3%";
+    ostringstream ss;
+    if (daemons.size() == 1) {
+      ss << daemons.front() << " has slow ops";
+    } else if (daemons.size() <= 10) {
+      ss << "daemons " << daemons << " have slow ops.";
+    } else {
+      const vector<DaemonKey> shown(daemons.begin(), daemons.begin() + 10);
+      ss << "daemons " << shown << "..." << " have slow ops.";
+    }
+    check.summary = boost::str(boost::format(fmt) % value.n1 % value.n2 % ss.str());
+    // No detail
+  }
+};
+
+
+// Collector for daemon_metric::PENDING_CREATING_PGS. It sums the reported
+// counts and fills the "PENDING_CREATING_PGS" health check with a summary
+// plus one detail line naming the reporting OSDs. `value` is not declared in
+// this class; presumably it is inherited from DaemonHealthMetricCollector.
+class PendingPGs final : public DaemonHealthMetricCollector {
+  // OSDs that reported a non-zero count.
+  vector<DaemonKey> osds;
+
+  // Only PENDING_CREATING_PGS metrics are handled by this collector.
+  bool _is_relevant(daemon_metric type) const override {
+    return daemon_metric::PENDING_CREATING_PGS == type;
+  }
+
+  // Fetch (or create) the "PENDING_CREATING_PGS" warning entry.
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "");
+  }
+
+  // Add one OSD's count to the running total.
+  // Returns true (and remembers the OSD) when the count is non-zero,
+  // false otherwise.
+  bool _update(const DaemonKey& osd,
+               const DaemonHealthMetric& metric) override {
+    const auto pending = metric.get_n();
+    value.n += pending;
+    if (!pending) {
+      return false;
+    }
+    osds.push_back(osd);
+    return true;
+  }
+
+  // Write the total into check.summary and append one detail line listing
+  // the OSDs involved. Does nothing when no OSD was recorded.
+  void _summarize(health_check_t& check) const override {
+    if (osds.empty()) {
+      return;
+    }
+    static const char* fmt = "%1% PGs pending on creation";
+    check.summary = boost::str(boost::format(fmt) % value.n);
+
+    ostringstream ss;
+    if (osds.size() == 1) {
+      ss << osds.front() << " has pending PGs";
+    } else {
+      ss << "osds " << osds << " have pending PGs.";
+    }
+    check.detail.push_back(ss.str());
+  }
+};
+
+} // anonymous namespace
+
+// Factory: returns a freshly allocated collector for the given metric type,
+// or an empty unique_ptr when no collector exists for that type.
+unique_ptr<DaemonHealthMetricCollector>
+DaemonHealthMetricCollector::create(daemon_metric m)
+{
+  unique_ptr<DaemonHealthMetricCollector> collector;
+  switch (m) {
+  case daemon_metric::SLOW_OPS:
+    collector.reset(new SlowOps);
+    break;
+  case daemon_metric::PENDING_CREATING_PGS:
+    collector.reset(new PendingPGs);
+    break;
+  default:
+    // unknown metric type: leave the pointer empty
+    break;
+  }
+  return collector;
+}