diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/mon/PGMap.cc | |
parent | Initial commit. (diff) | |
download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/mon/PGMap.cc | 4041 |
1 files changed, 4041 insertions, 0 deletions
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc new file mode 100644 index 00000000..38bc1877 --- /dev/null +++ b/src/mon/PGMap.cc @@ -0,0 +1,4041 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string.hpp> + +#include "PGMap.h" + +#define dout_subsys ceph_subsys_mon +#include "common/debug.h" +#include "common/Clock.h" +#include "common/Formatter.h" +#include "global/global_context.h" +#include "include/ceph_features.h" +#include "include/stringify.h" + +#include "osd/osd_types.h" +#include "osd/OSDMap.h" +#include <boost/range/adaptor/reversed.hpp> + +#define dout_context g_ceph_context + +using std::list; +using std::make_pair; +using std::map; +using std::pair; +using std::ostream; +using std::ostringstream; +using std::set; +using std::string; +using std::stringstream; +using std::vector; + +using ceph::bufferlist; + +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap); + + +// --------------------- +// PGMapDigest + +void PGMapDigest::encode(bufferlist& bl, uint64_t features) const +{ + // NOTE: see PGMap::encode_digest + uint8_t v = 4; + if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 1; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 3; + } + ENCODE_START(v, 1, bl); + encode(num_pg, bl); + encode(num_pg_active, bl); + encode(num_pg_unknown, bl); + encode(num_osd, bl); + encode(pg_pool_sum, bl, features); + encode(pg_sum, bl, features); + encode(osd_sum, bl, features); + if (v >= 2) { + encode(num_pg_by_state, bl); + } else { + uint32_t n = num_pg_by_state.size(); + encode(n, bl); + for (auto p : num_pg_by_state) { + encode((int32_t)p.first, bl); + encode(p.second, bl); + } + } + encode(num_pg_by_osd, bl); + encode(num_pg_by_pool, bl); + encode(osd_last_seq, bl); + encode(per_pool_sum_delta, bl, features); + encode(per_pool_sum_deltas_stamps, bl); + encode(pg_sum_delta, bl, features); + encode(stamp_delta, bl); + encode(avail_space_by_rule, bl); + if (struct_v >= 3) { + encode(purged_snaps, bl); + } + if (struct_v >= 4) { + encode(osd_sum_by_class, bl, features); + } + ENCODE_FINISH(bl); +} + +void PGMapDigest::decode(bufferlist::const_iterator& p) +{ + DECODE_START(4, p); + decode(num_pg, p); + decode(num_pg_active, p); + decode(num_pg_unknown, p); + decode(num_osd, p); + decode(pg_pool_sum, p); + decode(pg_sum, p); + decode(osd_sum, p); + if (struct_v >= 2) { + decode(num_pg_by_state, p); + } else { + map<int32_t, int32_t> nps; + decode(nps, p); + num_pg_by_state.clear(); + for (auto i : nps) { + num_pg_by_state[i.first] = i.second; + } + } + decode(num_pg_by_osd, p); + decode(num_pg_by_pool, p); + decode(osd_last_seq, p); + decode(per_pool_sum_delta, p); + decode(per_pool_sum_deltas_stamps, p); + decode(pg_sum_delta, p); + decode(stamp_delta, p); + decode(avail_space_by_rule, p); + if (struct_v >= 3) { + decode(purged_snaps, p); + } + if (struct_v >= 4) { + decode(osd_sum_by_class, p); + } + DECODE_FINISH(p); +} + +void PGMapDigest::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("num_pg", num_pg); + f->dump_unsigned("num_pg_active", num_pg_active); + f->dump_unsigned("num_pg_unknown", num_pg_unknown); + f->dump_unsigned("num_osd", num_osd); + f->dump_object("pool_sum", pg_sum); + f->dump_object("osd_sum", osd_sum); + + f->open_object_section("osd_sum_by_class"); + for (auto& i : osd_sum_by_class) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + + f->open_array_section("pool_stats"); + for (auto& p : pg_pool_sum) { + f->open_object_section("pool_stat"); + f->dump_int("poolid", p.first); + auto q = num_pg_by_pool.find(p.first); + if (q != num_pg_by_pool.end()) + f->dump_unsigned("num_pg", q->second); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("osd_stats"); + int i = 0; + // TODO: this isn't really correct since we can dump non-existent OSDs + // I dunno what osd_last_seq is set to in that case... + for (auto& p : osd_last_seq) { + f->open_object_section("osd_stat"); + f->dump_int("osd", i); + f->dump_unsigned("seq", p); + f->close_section(); + ++i; + } + f->close_section(); + f->open_array_section("num_pg_by_state"); + for (auto& p : num_pg_by_state) { + f->open_object_section("count"); + f->dump_string("state", pg_state_string(p.first)); + f->dump_unsigned("num", p.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("num_pg_by_osd"); + for (auto& p : num_pg_by_osd) { + f->open_object_section("count"); + f->dump_unsigned("osd", p.first); + f->dump_unsigned("num_primary_pg", p.second.primary); + f->dump_unsigned("num_acting_pg", p.second.acting); + f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting); + f->close_section(); + } + f->close_section(); + f->open_array_section("purged_snaps"); + for (auto& j : purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", j.first); + f->open_object_section("purged_snaps"); + for (auto i = j.second.begin(); i != j.second.end(); ++i) { + f->open_object_section("interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls) +{ + ls.push_back(new PGMapDigest); +} + +inline std::string percentify(const float& a) { + std::stringstream ss; + if (a < 0.01) + ss << "0"; + else + ss << std::fixed << std::setprecision(2) << a; + return ss.str(); +} + +void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const +{ + if (f) + f->open_array_section("pgs_by_state"); + + // list is descending numeric order (by count) + std::multimap<int,uint64_t> state_by_count; // count -> state + for (auto p = num_pg_by_state.begin(); + p != num_pg_by_state.end(); + ++p) { + state_by_count.insert(make_pair(p->second, p->first)); + } + if (f) { + for (auto p = state_by_count.rbegin(); + p != state_by_count.rend(); + ++p) + { + f->open_object_section("pgs_by_state_element"); + f->dump_string("state_name", pg_state_string(p->second)); + f->dump_unsigned("count", p->first); + f->close_section(); + } + } + if (f) + f->close_section(); + + if (f) { + f->dump_unsigned("num_pgs", num_pg); + f->dump_unsigned("num_pools", pg_pool_sum.size()); + f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects); + f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes); + f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw()); + f->dump_unsigned("bytes_avail", osd_sum.statfs.available); + f->dump_unsigned("bytes_total", osd_sum.statfs.total); + } else { + *out << " pools: " << pg_pool_sum.size() << " pools, " + << num_pg << " pgs\n"; + *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, " + << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n"; + *out << " usage: " + << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, " + << byte_u_t(osd_sum.statfs.available) << " / " + << byte_u_t(osd_sum.statfs.total) << " avail\n"; + *out << " pgs: "; + } + + bool pad = false; + + if (num_pg_unknown > 0) { + float p = (float)num_pg_unknown / (float)num_pg; + if (f) { + f->dump_float("unknown_pgs_ratio", p); + } else { + char b[20]; + snprintf(b, sizeof(b), "%.3lf", p * 100.0); + *out << b << "% pgs unknown\n"; + pad = true; + } + } + + int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown; + if (num_pg_inactive > 0) { + float p = (float)num_pg_inactive / (float)num_pg; + if (f) { + f->dump_float("inactive_pgs_ratio", p); + } else { + if (pad) { + *out << " "; + } + char b[20]; + snprintf(b, sizeof(b), "%.3f", p * 100.0); + *out << b << "% pgs not active\n"; + pad = true; + } + } + + list<string> sl; + overall_recovery_summary(f, &sl); + if (!f && !sl.empty()) { + for (auto p = sl.begin(); p != sl.end(); ++p) { + if (pad) { + *out << " "; + } + *out << *p << "\n"; + pad = true; + } + } + sl.clear(); + + if (!f) { + unsigned max_width = 1; + for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p) + { + std::stringstream ss; + ss << p->first; + max_width = std::max<size_t>(ss.str().size(), max_width); + } + + for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p) + { + if (pad) { + *out << " "; + } + pad = true; + out->setf(std::ios::left); + *out << std::setw(max_width) << p->first + << " " << pg_state_string(p->second) << "\n"; + out->unsetf(std::ios::left); + } + } + + ostringstream ss_rec_io; + overall_recovery_rate_summary(f, &ss_rec_io); + ostringstream ss_client_io; + overall_client_io_rate_summary(f, &ss_client_io); + ostringstream ss_cache_io; + overall_cache_io_rate_summary(f, &ss_cache_io); + + if (!f && (ss_client_io.str().length() || ss_rec_io.str().length() + || ss_cache_io.str().length())) { + *out << "\n \n"; + *out << " io:\n"; + } + + if (!f && ss_client_io.str().length()) + *out << " client: " << ss_client_io.str() << "\n"; + if (!f && ss_rec_io.str().length()) + *out << " recovery: " << ss_rec_io.str() << "\n"; + if (!f && ss_cache_io.str().length()) + *out << " cache: " << ss_cache_io.str() << "\n"; +} + +void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const +{ + std::stringstream ss; + + if (f) + f->open_array_section("num_pg_by_state"); + for (auto p = num_pg_by_state.begin(); + p != num_pg_by_state.end(); + ++p) { + if (f) { + f->open_object_section("state"); + f->dump_string("name", pg_state_string(p->first)); + f->dump_unsigned("num", p->second); + f->close_section(); + } + if (p != num_pg_by_state.begin()) + ss << ", "; + ss << p->second << " " << pg_state_string(p->first); + } + if (f) + f->close_section(); + + string states = ss.str(); + if (out) + *out << num_pg << " pgs: " + << states << "; " + << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, " + << byte_u_t(osd_sum.statfs.get_used()) << " used, " + << byte_u_t(osd_sum.statfs.available) << " / " + << byte_u_t(osd_sum.statfs.total) << " avail"; + if (f) { + f->dump_unsigned("num_pgs", num_pg); + f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes); + f->dump_int("total_bytes", osd_sum.statfs.total); + f->dump_int("total_avail_bytes", osd_sum.statfs.available); + f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); + f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); + } + + // make non-negative; we can get negative values if osds send + // uncommitted stats and then "go backward" or if they are just + // buggy/wrong. + pool_stat_t pos_delta = pg_sum_delta; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_rd || + pos_delta.stats.sum.num_wr) { + if (out) + *out << "; "; + if (pos_delta.stats.sum.num_rd) { + int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta; + if (out) + *out << byte_u_t(rd) << "/s rd, "; + if (f) + f->dump_unsigned("read_bytes_sec", rd); + } + if (pos_delta.stats.sum.num_wr) { + int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta; + if (out) + *out << byte_u_t(wr) << "/s wr, "; + if (f) + f->dump_unsigned("write_bytes_sec", wr); + } + int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta; + if (out) + *out << si_u_t(iops) << " op/s"; + if (f) + f->dump_unsigned("io_sec", iops); + } + + list<string> sl; + overall_recovery_summary(f, &sl); + if (out) + for (auto p = sl.begin(); p != sl.end(); ++p) + *out << "; " << *p; + std::stringstream ssr; + overall_recovery_rate_summary(f, &ssr); + if (out && ssr.str().length()) + *out << "; " << ssr.str() << " recovering"; +} + +void PGMapDigest::get_recovery_stats( + double *misplaced_ratio, + double *degraded_ratio, + double *inactive_pgs_ratio, + double *unknown_pgs_ratio) const +{ + if (pg_sum.stats.sum.num_objects_degraded && + pg_sum.stats.sum.num_object_copies > 0) { + *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded / + (double)pg_sum.stats.sum.num_object_copies; + } else { + *degraded_ratio = 0; + } + if (pg_sum.stats.sum.num_objects_misplaced && + pg_sum.stats.sum.num_object_copies > 0) { + *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced / + (double)pg_sum.stats.sum.num_object_copies; + } else { + *misplaced_ratio = 0; + } + if (num_pg > 0) { + int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown; + *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg; + *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg; + } else { + *inactive_pgs_ratio = 0; + *unknown_pgs_ratio = 0; + } +} + +void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl, + const pool_stat_t& pool_sum) const +{ + if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pool_sum.stats.sum.num_objects_degraded / + (double)pool_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded); + f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies); + f->dump_float("degraded_ratio", pc / 100.0); + } else { + ostringstream ss; + ss << pool_sum.stats.sum.num_objects_degraded + << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)"; + psl->push_back(ss.str()); + } + } + if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pool_sum.stats.sum.num_objects_misplaced / + (double)pool_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced); + f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies); + f->dump_float("misplaced_ratio", pc / 100.0); + } else { + ostringstream ss; + ss << pool_sum.stats.sum.num_objects_misplaced + << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)"; + psl->push_back(ss.str()); + } + } + if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) { + double pc = (double)pool_sum.stats.sum.num_objects_unfound / + (double)pool_sum.stats.sum.num_objects * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound); + f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects); + f->dump_float("unfound_ratio", pc / 100.0); + } else { + ostringstream ss; + ss << pool_sum.stats.sum.num_objects_unfound + << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)"; + psl->push_back(ss.str()); + } + } +} + +void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + // make non-negative; we can get negative values if osds send + // uncommitted stats and then "go backward" or if they are just + // buggy/wrong. + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_objects_recovered || + pos_delta.stats.sum.num_bytes_recovered || + pos_delta.stats.sum.num_keys_recovered) { + int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp; + int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp; + int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp; + if (f) { + f->dump_int("recovering_objects_per_sec", objps); + f->dump_int("recovering_bytes_per_sec", bps); + f->dump_int("recovering_keys_per_sec", kps); + f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered); + f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered); + f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered); + } else { + *out << byte_u_t(bps) << "/s"; + if (pos_delta.stats.sum.num_keys_recovered) + *out << ", " << si_u_t(kps) << " keys/s"; + *out << ", " << si_u_t(objps) << " objects/s"; + } + } +} + +void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const +{ + recovery_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const +{ + recovery_summary(f, psl, pg_sum); +} + +void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + recovery_rate_summary(f, out, p->second.first, ts->second); +} + +void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl, + uint64_t poolid) const +{ + auto p = pg_pool_sum.find(poolid); + if (p == pg_pool_sum.end()) + return; + + recovery_summary(f, psl, p->second); +} + +void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_rd || + pos_delta.stats.sum.num_wr) { + if (pos_delta.stats.sum.num_rd) { + int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("read_bytes_sec", rd); + } else { + *out << byte_u_t(rd) << "/s rd, "; + } + } + if (pos_delta.stats.sum.num_wr) { + int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("write_bytes_sec", wr); + } else { + *out << byte_u_t(wr) << "/s wr, "; + } + } + int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp; + int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp; + if (f) { + f->dump_int("read_op_per_sec", iops_rd); + f->dump_int("write_op_per_sec", iops_wr); + } else { + *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr"; + } + } +} + +void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const +{ + client_io_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + client_io_rate_summary(f, out, p->second.first, ts->second); +} + +void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + bool have_output = false; + + if (pos_delta.stats.sum.num_flush) { + int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("flush_bytes_sec", flush); + } else { + *out << byte_u_t(flush) << "/s flush"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict) { + int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("evict_bytes_sec", evict); + } else { + if (have_output) + *out << ", "; + *out << byte_u_t(evict) << "/s evict"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_promote) { + int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp; + if (f) { + f->dump_int("promote_op_per_sec", promote); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(promote) << " op/s promote"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_flush_mode_low) { + if (f) { + f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_flush_mode_high) { + if (f) { + f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict_mode_some) { + if (f) { + f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict_mode_full) { + if (f) { + f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)"; + } + } +} + +void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const +{ + cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + cache_io_rate_summary(f, out, p->second.first, ts->second); +} + +ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap, + boost::optional<int64_t> data_pool) const +{ + ceph_statfs statfs; + bool filter = false; + object_stat_sum_t sum; + + if (data_pool) { + auto i = pg_pool_sum.find(*data_pool); + if (i != pg_pool_sum.end()) { + sum = i->second.stats.sum; + filter = true; + } + } + + if (filter) { + statfs.kb_used = (sum.num_bytes >> 10); + statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10; + statfs.num_objects = sum.num_objects; + statfs.kb = statfs.kb_used + statfs.kb_avail; + } else { + // these are in KB. + statfs.kb = osd_sum.statfs.kb(); + statfs.kb_used = osd_sum.statfs.kb_used_raw(); + statfs.kb_avail = osd_sum.statfs.kb_avail(); + statfs.num_objects = pg_sum.stats.sum.num_objects; + } + + return statfs; +} + +void PGMapDigest::dump_pool_stats_full( + const OSDMap &osd_map, + stringstream *ss, + ceph::Formatter *f, + bool verbose) const +{ + TextTable tbl; + + if (f) { + f->open_array_section("pools"); + } else { + tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("PGS", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT); + + if (verbose) { + tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT); + } + } + + map<int,uint64_t> avail_by_rule; + for (auto p = osd_map.get_pools().begin(); + p != osd_map.get_pools().end(); ++p) { + int64_t pool_id = p->first; + if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0)) + continue; + + const string& pool_name = osd_map.get_pool_name(pool_id); + auto pool_pg_num = osd_map.get_pg_num(pool_id); + const pool_stat_t &stat = pg_pool_sum.at(pool_id); + + const pg_pool_t *pool = osd_map.get_pg_pool(pool_id); + int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(), + pool->get_type(), + pool->get_size()); + int64_t avail; + if (avail_by_rule.count(ruleno) == 0) { + // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked + avail = get_rule_avail(ruleno); + if (avail < 0) + avail = 0; + avail_by_rule[ruleno] = avail; + } else { + avail = avail_by_rule[ruleno]; + } + if (f) { + f->open_object_section("pool"); + f->dump_string("name", pool_name); + f->dump_int("id", pool_id); + f->open_object_section("stats"); + } else { + tbl << pool_name + << pool_id + << pool_pg_num; + } + float raw_used_rate = osd_map.pool_raw_used_rate(pool_id); + bool per_pool = use_per_pool_stats(); + dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool, + pool); + if (f) { + f->close_section(); // stats + f->close_section(); // pool + } else { + tbl << TextTable::endrow; + } + } + if (f) + f->close_section(); + else { + ceph_assert(ss != nullptr); + *ss << "POOLS:\n"; + tbl.set_indent(4); + *ss << tbl; + } +} + +void PGMapDigest::dump_cluster_stats(stringstream *ss, + ceph::Formatter *f, + bool verbose) const +{ + if (f) { + f->open_object_section("stats"); + f->dump_int("total_bytes", osd_sum.statfs.total); + f->dump_int("total_avail_bytes", osd_sum.statfs.available); + f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); + f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); + f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio()); + f->dump_unsigned("num_osds", osd_sum.num_osds); + f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds); + f->close_section(); + f->open_object_section("stats_by_class"); + for (auto& i : osd_sum_by_class) { + f->open_object_section(i.first.c_str()); + f->dump_int("total_bytes", i.second.statfs.total); + f->dump_int("total_avail_bytes", i.second.statfs.available); + f->dump_int("total_used_bytes", i.second.statfs.get_used()); + f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw()); + f->dump_float("total_used_raw_ratio", + i.second.statfs.get_used_raw_ratio()); + f->close_section(); + } + f->close_section(); + } else { + ceph_assert(ss != nullptr); + TextTable tbl; + tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT); + + + for (auto& i : osd_sum_by_class) { + tbl << i.first; + tbl << stringify(byte_u_t(i.second.statfs.total)) + << stringify(byte_u_t(i.second.statfs.available)) + << stringify(byte_u_t(i.second.statfs.get_used())) + << stringify(byte_u_t(i.second.statfs.get_used_raw())) + << percentify(i.second.statfs.get_used_raw_ratio()*100.0) + << TextTable::endrow; + } + tbl << "TOTAL"; + tbl << stringify(byte_u_t(osd_sum.statfs.total)) + << stringify(byte_u_t(osd_sum.statfs.available)) + << stringify(byte_u_t(osd_sum.statfs.get_used())) + << stringify(byte_u_t(osd_sum.statfs.get_used_raw())) + << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0) + << TextTable::endrow; + + *ss << "RAW STORAGE:\n"; + tbl.set_indent(4); + *ss << tbl; + } +} + +void PGMapDigest::dump_object_stat_sum( + TextTable &tbl, ceph::Formatter *f, + const pool_stat_t &pool_stat, uint64_t avail, + float raw_used_rate, bool verbose, bool per_pool, + const pg_pool_t *pool) +{ + const object_stat_sum_t &sum = pool_stat.stats.sum; + const store_statfs_t statfs = pool_stat.store_stats; + + if (sum.num_object_copies > 0) { + raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies; + } + + uint64_t used_bytes = pool_stat.get_allocated_bytes(per_pool); + + float used = 0.0; + // note avail passed in is raw_avail, calc raw_used here. + if (avail) { + used = used_bytes; + used /= used + avail; + } else if (used_bytes) { + used = 1.0; + } + auto avail_res = raw_used_rate ? avail / raw_used_rate : 0; + // an approximation for actually stored user data + auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate, per_pool); + if (f) { + f->dump_int("stored", stored_normalized); + f->dump_int("objects", sum.num_objects); + f->dump_int("kb_used", shift_round_up(used_bytes, 10)); + f->dump_int("bytes_used", used_bytes); + f->dump_float("percent_used", used); + f->dump_unsigned("max_avail", avail_res); + if (verbose) { + f->dump_int("quota_objects", pool->quota_max_objects); + f->dump_int("quota_bytes", pool->quota_max_bytes); + f->dump_int("dirty", sum.num_objects_dirty); + f->dump_int("rd", sum.num_rd); + f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull); + f->dump_int("wr", sum.num_wr); + f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull); + f->dump_int("compress_bytes_used", statfs.data_compressed_allocated); + f->dump_int("compress_under_bytes", statfs.data_compressed_original); + // Stored by user amplified by replication + f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0, per_pool)); + f->dump_unsigned("avail_raw", avail); + } + } else { + tbl << stringify(byte_u_t(stored_normalized)); + tbl << stringify(si_u_t(sum.num_objects)); + tbl << stringify(byte_u_t(used_bytes)); + tbl << percentify(used*100); + tbl << stringify(byte_u_t(avail_res)); + if (verbose) { + if (pool->quota_max_objects == 0) + tbl << "N/A"; + else + tbl << stringify(si_u_t(pool->quota_max_objects)); + + if (pool->quota_max_bytes == 0) + tbl << "N/A"; + else + tbl << stringify(byte_u_t(pool->quota_max_bytes)); + + tbl << stringify(si_u_t(sum.num_objects_dirty)) + << stringify(byte_u_t(statfs.data_compressed_allocated)) + << stringify(byte_u_t(statfs.data_compressed_original)) + ; + } + } +} + +int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map, + int64_t poolid) const +{ + const pg_pool_t *pool = osd_map.get_pg_pool(poolid); + int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(), + pool->get_type(), + pool->get_size()); + int64_t avail; + avail = get_rule_avail(ruleno); + if (avail < 0) + avail = 0; + + return avail / osd_map.pool_raw_used_rate(poolid); +} + +int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const +{ + map<int,float> wm; + int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm); + if (r < 0) { + return r; + } + if (wm.empty()) { + return 0; + } + + float fratio = osdmap.get_full_ratio(); + + int64_t min = -1; + for (auto p = wm.begin(); p != wm.end(); ++p) { + auto osd_info = osd_stat.find(p->first); + if (osd_info != osd_stat.end()) { + if (osd_info->second.statfs.total == 0 || p->second == 0) { + // osd must be out, hence its stats have been zeroed + // (unless we somehow managed to have a disk with size 0...) + // + // (p->second == 0), if osd weight is 0, no need to + // calculate proj below. + continue; + } + double unusable = (double)osd_info->second.statfs.kb() * + (1.0 - fratio); + double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable); + avail *= 1024.0; + int64_t proj = (int64_t)(avail / (double)p->second); + if (min < 0 || proj < min) { + min = proj; + } + } else { + if (osdmap.is_up(p->first)) { + // This is a level 4 rather than an error, because we might have + // only just started, and not received the first stats message yet. + dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl; + } + } + } + return min; +} + +void PGMap::get_rules_avail(const OSDMap& osdmap, + std::map<int,int64_t> *avail_map) const +{ + avail_map->clear(); + for (auto p : osdmap.get_pools()) { + int64_t pool_id = p.first; + if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0)) + continue; + const pg_pool_t *pool = osdmap.get_pg_pool(pool_id); + int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(), + pool->get_type(), + pool->get_size()); + if (avail_map->count(ruleno) == 0) + (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno); + } +} + +// --------------------- +// PGMap + +void PGMap::Incremental::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; + f->dump_unsigned("osdmap_epoch", osdmap_epoch); + f->dump_unsigned("pg_scan_epoch", pg_scan); + + f->open_array_section("pg_stat_updates"); + for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_stat_updates"); + for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) { + f->open_object_section("osd_stat"); + f->dump_int("osd", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("pool_statfs_updates"); + for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) { + f->open_object_section("pool_statfs"); + f->dump_stream("poolid/osd") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_stat_removals"); + for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + + f->open_array_section("pg_removals"); + for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p) + f->dump_stream("pgid") << *p; + f->close_section(); +} + +void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) +{ + o.push_back(new Incremental); + o.push_back(new Incremental); + o.back()->version = 1; + o.back()->stamp = utime_t(123,345); + o.push_back(new Incremental); + o.back()->version = 2; + o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t(); + o.back()->osd_stat_updates[5] = osd_stat_t(); + o.push_back(new Incremental); + o.back()->version = 3; + o.back()->osdmap_epoch = 1; + o.back()->pg_scan = 2; + o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t(); + o.back()->osd_stat_updates[6] = osd_stat_t(); + o.back()->pg_remove.insert(pg_t(1,2)); + o.back()->osd_stat_rm.insert(5); + o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t(); +} + +// -- + +void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) +{ + ceph_assert(inc.version == version+1); + version++; + + pool_stat_t pg_sum_old = pg_sum; + mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old; + pg_pool_sum_old = pg_pool_sum; + + for (auto p = inc.pg_stat_updates.begin(); + p != inc.pg_stat_updates.end(); + ++p) { + const pg_t &update_pg(p->first); + auto update_pool = update_pg.pool(); + const pg_stat_t &update_stat(p->second); + + auto pg_stat_iter = pg_stat.find(update_pg); + pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool]; + if (pg_stat_iter == pg_stat.end()) { + pg_stat.insert(make_pair(update_pg, update_stat)); + } else { + stat_pg_sub(update_pg, pg_stat_iter->second); + pool_sum_ref.sub(pg_stat_iter->second); + pg_stat_iter->second = update_stat; + } + stat_pg_add(update_pg, update_stat); + pool_sum_ref.add(update_stat); + } + + for (auto p = inc.pool_statfs_updates.begin(); + p != inc.pool_statfs_updates.end(); + ++p) { + auto update_pool = p->first.first; + auto update_osd = p->first.second; + auto& statfs_inc = p->second; + + auto pool_statfs_iter = + pool_statfs.find(std::make_pair(update_pool, update_osd)); + if (pg_pool_sum.count(update_pool)) { + pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool]; + if (pool_statfs_iter == pool_statfs.end()) { + pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc); + } else { + pool_sum_ref.sub(pool_statfs_iter->second); + pool_statfs_iter->second = statfs_inc; + } + pool_sum_ref.add(statfs_inc); + } + } + + for (auto p = inc.get_osd_stat_updates().begin(); + p != inc.get_osd_stat_updates().end(); + ++p) { + int osd = p->first; + const osd_stat_t &new_stats(p->second); + + auto t = osd_stat.find(osd); + if (t == osd_stat.end()) { + osd_stat.insert(make_pair(osd, new_stats)); + } else { + stat_osd_sub(t->first, t->second); + t->second = new_stats; + } + stat_osd_add(osd, new_stats); + } + set<int64_t> deleted_pools; + for (auto p = inc.pg_remove.begin(); + p != inc.pg_remove.end(); + ++p) { + const pg_t &removed_pg(*p); + auto s = pg_stat.find(removed_pg); + bool pool_erased = false; + if (s != pg_stat.end()) { + pool_erased = stat_pg_sub(removed_pg, s->second); + + // decrease pool stats if pg was removed + auto pool_stats_it = pg_pool_sum.find(removed_pg.pool()); + if (pool_stats_it != pg_pool_sum.end()) { + pool_stats_it->second.sub(s->second); + } + + pg_stat.erase(s); + if (pool_erased) { + deleted_pools.insert(removed_pg.pool()); + } + } + } + + for (auto p = inc.get_osd_stat_rm().begin(); + p != inc.get_osd_stat_rm().end(); + ++p) { + auto t = osd_stat.find(*p); + if (t != osd_stat.end()) { + stat_osd_sub(t->first, t->second); + osd_stat.erase(t); + } + for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) { + if (i->first.second == *p) { + pg_pool_sum[i->first.first].sub(i->second); + pool_statfs.erase(i); + } + } + } + + // skip calculating delta while sum was not synchronized + if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) { + utime_t delta_t; + delta_t = inc.stamp; + delta_t -= stamp; + // calculate a delta, and average over the last 2 deltas. + pool_stat_t d = pg_sum; + d.stats.sub(pg_sum_old.stats); + pg_sum_deltas.push_back(make_pair(d, delta_t)); + stamp_delta += delta_t; + pg_sum_delta.stats.add(d.stats); + auto smooth_intervals = + cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1; + while (pg_sum_deltas.size() > smooth_intervals) { + pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats); + stamp_delta -= pg_sum_deltas.front().second; + pg_sum_deltas.pop_front(); + } + } + stamp = inc.stamp; + + update_pool_deltas(cct, inc.stamp, pg_pool_sum_old); + + for (auto p : deleted_pools) { + if (cct) + dout(20) << " deleted pool " << p << dendl; + deleted_pool(p); + } + + if (inc.osdmap_epoch) + last_osdmap_epoch = inc.osdmap_epoch; + if (inc.pg_scan) + last_pg_scan = inc.pg_scan; +} + +void PGMap::calc_stats() +{ + num_pg = 0; + num_pg_active = 0; + num_pg_unknown = 0; + num_osd = 0; + pg_pool_sum.clear(); + num_pg_by_pool.clear(); + pg_by_osd.clear(); + pg_sum = pool_stat_t(); + osd_sum = osd_stat_t(); + osd_sum_by_class.clear(); + num_pg_by_state.clear(); + num_pg_by_pool_state.clear(); + num_pg_by_osd.clear(); + + for (auto p = pg_stat.begin(); + p != pg_stat.end(); + ++p) { + auto pg = p->first; + stat_pg_add(pg, p->second); + pg_pool_sum[pg.pool()].add(p->second); + } + for (auto p = pool_statfs.begin(); + p != pool_statfs.end(); + ++p) { + auto pool = p->first.first; + pg_pool_sum[pool].add(p->second); + } + for (auto p = osd_stat.begin(); + p != osd_stat.end(); + ++p) + stat_osd_add(p->first, p->second); +} + +void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s, + bool sameosds) +{ + auto pool = pgid.pool(); + pg_sum.add(s); + + num_pg++; + num_pg_by_state[s.state]++; + num_pg_by_pool_state[pgid.pool()][s.state]++; + num_pg_by_pool[pool]++; + + if ((s.state & PG_STATE_CREATING) && + s.parent_split_bits == 0) { + creating_pgs.insert(pgid); + if (s.acting_primary >= 0) { + creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid); + } + } + + if (s.state & PG_STATE_ACTIVE) { + ++num_pg_active; + } + if (s.state == 0) { + ++num_pg_unknown; + } + + if (sameosds) + return; + + for (auto p = s.blocked_by.begin(); + p != s.blocked_by.end(); + ++p) { + ++blocked_by_sum[*p]; + } + + for (auto p = s.acting.begin(); p != s.acting.end(); ++p) { + pg_by_osd[*p].insert(pgid); + num_pg_by_osd[*p].acting++; + } + for (auto p = s.up.begin(); p != s.up.end(); ++p) { + auto& t = pg_by_osd[*p]; + if (t.find(pgid) == t.end()) { + t.insert(pgid); + num_pg_by_osd[*p].up_not_acting++; + } + } + + if (s.up_primary >= 0) { + num_pg_by_osd[s.up_primary].primary++; + } +} + +bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, + bool sameosds) +{ + bool pool_erased = false; + pg_sum.sub(s); + + num_pg--; + int end = --num_pg_by_state[s.state]; + ceph_assert(end >= 0); + if (end == 0) + num_pg_by_state.erase(s.state); + if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) { + num_pg_by_pool_state[pgid.pool()].erase(s.state); + } + end = --num_pg_by_pool[pgid.pool()]; + if (end == 0) { + pool_erased = true; + } + + if ((s.state & PG_STATE_CREATING) && + s.parent_split_bits == 0) { + creating_pgs.erase(pgid); + if (s.acting_primary >= 0) { + map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary]; + r[s.mapping_epoch].erase(pgid); + if (r[s.mapping_epoch].empty()) + r.erase(s.mapping_epoch); + if (r.empty()) + creating_pgs_by_osd_epoch.erase(s.acting_primary); + } + } + + if (s.state & PG_STATE_ACTIVE) { + --num_pg_active; + } + if (s.state == 0) { + --num_pg_unknown; + } + + if (sameosds) + return pool_erased; + + for (auto p = s.blocked_by.begin(); + p != s.blocked_by.end(); + ++p) { + auto q = blocked_by_sum.find(*p); + ceph_assert(q != blocked_by_sum.end()); + --q->second; + if (q->second == 0) + blocked_by_sum.erase(q); + } + + set<int32_t> actingset; + for (auto p = s.acting.begin(); p != s.acting.end(); ++p) { + actingset.insert(*p); + auto& oset = pg_by_osd[*p]; + oset.erase(pgid); + if (oset.empty()) + pg_by_osd.erase(*p); + auto it = num_pg_by_osd.find(*p); + if (it != num_pg_by_osd.end() && it->second.acting > 0) + it->second.acting--; + } + for (auto p = s.up.begin(); p != s.up.end(); ++p) { + auto& oset = pg_by_osd[*p]; + oset.erase(pgid); + if (oset.empty()) + pg_by_osd.erase(*p); + if (actingset.count(*p)) + continue; + auto it = num_pg_by_osd.find(*p); + if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0) + it->second.up_not_acting--; + } + + if (s.up_primary >= 0) { + auto it = num_pg_by_osd.find(s.up_primary); + if (it != num_pg_by_osd.end() && it->second.primary > 0) + it->second.primary--; + } + return pool_erased; +} + +void PGMap::calc_purged_snaps() +{ + purged_snaps.clear(); + set<int64_t> unknown; + for (auto& i : pg_stat) { + if (i.second.state == 0) { + unknown.insert(i.first.pool()); + purged_snaps.erase(i.first.pool()); + continue; + } else if (unknown.count(i.first.pool())) { + continue; + } + auto j = purged_snaps.find(i.first.pool()); + if (j == purged_snaps.end()) { + // base case + purged_snaps[i.first.pool()] = i.second.purged_snaps; + } else { + j->second.intersection_of(i.second.purged_snaps); + } + } +} + +void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap) +{ + osd_sum_by_class.clear(); + for (auto& i : osd_stat) { + const char *class_name = osdmap.crush->get_item_class(i.first); + if (class_name) { + osd_sum_by_class[class_name].add(i.second); + } + } +} + +void PGMap::stat_osd_add(int osd, const osd_stat_t &s) +{ + num_osd++; + osd_sum.add(s); + if (osd >= (int)osd_last_seq.size()) { + osd_last_seq.resize(osd + 1); + } + osd_last_seq[osd] = s.seq; +} + +void PGMap::stat_osd_sub(int osd, const osd_stat_t &s) +{ + num_osd--; + osd_sum.sub(s); + ceph_assert(osd < (int)osd_last_seq.size()); + osd_last_seq[osd] = 0; +} + +void PGMap::encode_digest(const OSDMap& osdmap, + bufferlist& bl, uint64_t features) +{ + get_rules_avail(osdmap, &avail_space_by_rule); + calc_osd_sum_by_class(osdmap); + calc_purged_snaps(); + PGMapDigest::encode(bl, features); +} + +void PGMap::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(8, 8, bl); + encode(version, bl); + encode(pg_stat, bl); + encode(osd_stat, bl, features); + encode(last_osdmap_epoch, bl); + encode(last_pg_scan, bl); + encode(stamp, bl); + encode(pool_statfs, bl, features); + ENCODE_FINISH(bl); +} + +void PGMap::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(8, bl); + decode(version, bl); + decode(pg_stat, bl); + decode(osd_stat, bl); + decode(last_osdmap_epoch, bl); + decode(last_pg_scan, bl); + decode(stamp, bl); + decode(pool_statfs, bl); + DECODE_FINISH(bl); + + calc_stats(); +} + +void PGMap::dump(ceph::Formatter *f, bool with_net) const +{ + dump_basic(f); + dump_pg_stats(f, false); + dump_pool_stats(f); + dump_osd_stats(f, with_net); +} + +void PGMap::dump_basic(ceph::Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; + f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch); + f->dump_unsigned("last_pg_scan", last_pg_scan); + + f->open_object_section("pg_stats_sum"); + pg_sum.dump(f); + f->close_section(); + + f->open_object_section("osd_stats_sum"); + osd_sum.dump(f); + f->close_section(); + + dump_delta(f); +} + +void PGMap::dump_delta(ceph::Formatter *f) const +{ + f->open_object_section("pg_stats_delta"); + pg_sum_delta.dump(f); + f->dump_stream("stamp_delta") << stamp_delta; + f->close_section(); +} + +void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const +{ + f->open_array_section("pg_stats"); + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << i->first; + if (brief) + i->second.dump_brief(f); + else + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_pool_stats(ceph::Formatter *f) const +{ + f->open_array_section("pool_stats"); + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + f->open_object_section("pool_stat"); + f->dump_int("poolid", p->first); + auto q = num_pg_by_pool.find(p->first); + if (q != num_pg_by_pool.end()) + f->dump_unsigned("num_pg", q->second); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const +{ + f->open_array_section("osd_stats"); + for (auto q = osd_stat.begin(); + q != osd_stat.end(); + ++q) { + f->open_object_section("osd_stat"); + f->dump_int("osd", q->first); + q->second.dump(f, with_net); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_osd_ping_times(ceph::Formatter *f) const +{ + f->open_array_section("osd_ping_times"); + for (auto& [osd, stat] : osd_stat) { + f->open_object_section("osd_ping_time"); + f->dump_int("osd", osd); + stat.dump_ping_time(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_pg_stats_plain( + ostream& ss, + const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats, + bool brief) const +{ + TextTable tab; + + if (brief){ + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + } + else { + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT); + } + + for (auto i = pg_stats.begin(); + i != pg_stats.end(); ++i) { + const pg_stat_t &st(i->second); + if (brief) { + tab << i->first + << pg_state_string(st.state) + << st.up + << st.up_primary + << st.acting + << st.acting_primary + << TextTable::endrow; + } else { + ostringstream reported; + reported << st.reported_epoch << ":" << st.reported_seq; + + tab << i->first + << st.stats.sum.num_objects + << st.stats.sum.num_objects_missing_on_primary + << st.stats.sum.num_objects_degraded + << st.stats.sum.num_objects_misplaced + << st.stats.sum.num_objects_unfound + << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys + << st.log_size + << st.ondisk_log_size + << pg_state_string(st.state) + << st.last_change + << st.version + << reported.str() + << pg_vector_string(st.up) + << st.up_primary + << pg_vector_string(st.acting) + << st.acting_primary + << st.last_scrub + << st.last_scrub_stamp + << st.last_deep_scrub + << st.last_deep_scrub_stamp + << st.snaptrimq_len + << TextTable::endrow; + } + } + + ss << tab; +} + +void PGMap::dump(ostream& ss) const +{ + dump_basic(ss); + dump_pg_stats(ss, false); + dump_pool_stats(ss, false); + dump_pg_sum_stats(ss, false); + dump_osd_stats(ss); +} + +void PGMap::dump_basic(ostream& ss) const +{ + ss << "version " << version << std::endl; + ss << "stamp " << stamp << std::endl; + ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl; + ss << "last_pg_scan " << last_pg_scan << std::endl; +} + +void PGMap::dump_pg_stats(ostream& ss, bool brief) const +{ + dump_pg_stats_plain(ss, pg_stat, brief); +} + +void PGMap::dump_pool_stats(ostream& ss, bool header) const +{ + TextTable tab; + + if (header) { + tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + } else { + tab.define_column("", TextTable::LEFT, TextTable::LEFT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + } + + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + tab << p->first + << p->second.stats.sum.num_objects + << p->second.stats.sum.num_objects_missing_on_primary + << p->second.stats.sum.num_objects_degraded + << p->second.stats.sum.num_objects_misplaced + << p->second.stats.sum.num_objects_unfound + << p->second.stats.sum.num_bytes + << p->second.stats.sum.num_omap_bytes + << p->second.stats.sum.num_omap_keys + << p->second.log_size + << p->second.ondisk_log_size + << TextTable::endrow; + } + + ss << tab; +} + +void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const +{ + TextTable tab; + + if (header) { + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + } else { + tab.define_column("", TextTable::LEFT, TextTable::LEFT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + }; + + tab << "sum" + << pg_sum.stats.sum.num_objects + << pg_sum.stats.sum.num_objects_missing_on_primary + << pg_sum.stats.sum.num_objects_degraded + << pg_sum.stats.sum.num_objects_misplaced + << pg_sum.stats.sum.num_objects_unfound + << pg_sum.stats.sum.num_bytes + << pg_sum.stats.sum.num_omap_bytes + << pg_sum.stats.sum.num_omap_keys + << pg_sum.log_size + << pg_sum.ondisk_log_size + << TextTable::endrow; + + ss << tab; +} + +void PGMap::dump_osd_stats(ostream& ss) const +{ + TextTable tab; + + tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT); + + for (auto p = osd_stat.begin(); + p != osd_stat.end(); + ++p) { + tab << p->first + << byte_u_t(p->second.statfs.get_used()) + << byte_u_t(p->second.statfs.available) + << byte_u_t(p->second.statfs.get_used_raw()) + << byte_u_t(p->second.statfs.total) + << p->second.hb_peers + << get_num_pg_by_osd(p->first) + << get_num_primary_pg_by_osd(p->first) + << TextTable::endrow; + } + + tab << "sum" + << byte_u_t(osd_sum.statfs.get_used()) + << byte_u_t(osd_sum.statfs.available) + << byte_u_t(osd_sum.statfs.get_used_raw()) + << byte_u_t(osd_sum.statfs.total) + << TextTable::endrow; + + ss << tab; +} + +void PGMap::dump_osd_sum_stats(ostream& ss) const +{ + TextTable tab; + + tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT); + + tab << "sum" + << byte_u_t(osd_sum.statfs.get_used()) + << byte_u_t(osd_sum.statfs.available) + << byte_u_t(osd_sum.statfs.get_used_raw()) + << byte_u_t(osd_sum.statfs.total) + << TextTable::endrow; + + ss << tab; +} + +void PGMap::get_stuck_stats( + int types, const utime_t cutoff, + mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const +{ + ceph_assert(types != 0); + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + utime_t val = cutoff; // don't care about >= cutoff so that is infinity + + if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) { + if (i->second.last_active < val) + val = i->second.last_active; + } + + if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) { + if (i->second.last_clean < val) + val = i->second.last_clean; + } + + if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) { + if (i->second.last_undegraded < val) + val = i->second.last_undegraded; + } + + if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) { + if (i->second.last_fullsized < val) + val = i->second.last_fullsized; + } + + if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) { + if (i->second.last_unstale < val) + val = i->second.last_unstale; + } + + // val is now the earliest any of the requested stuck states began + if (val < cutoff) { + stuck_pgs[i->first] = i->second; + } + } +} + +bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const +{ + int inactive = 0; + int unclean = 0; + int degraded = 0; + int undersized = 0; + int stale = 0; + + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + if (! (i->second.state & PG_STATE_ACTIVE)) { + if (i->second.last_active < cutoff) + ++inactive; + } + if (! (i->second.state & PG_STATE_CLEAN)) { + if (i->second.last_clean < cutoff) + ++unclean; + } + if (i->second.state & PG_STATE_DEGRADED) { + if (i->second.last_undegraded < cutoff) + ++degraded; + } + if (i->second.state & PG_STATE_UNDERSIZED) { + if (i->second.last_fullsized < cutoff) + ++undersized; + } + if (i->second.state & PG_STATE_STALE) { + if (i->second.last_unstale < cutoff) + ++stale; + } + } + + if (inactive) + note["stuck inactive"] = inactive; + + if (unclean) + note["stuck unclean"] = unclean; + + if (undersized) + note["stuck undersized"] = undersized; + + if (degraded) + note["stuck degraded"] = degraded; + + if (stale) + note["stuck stale"] = stale; + + return inactive || unclean || undersized || degraded || stale; +} + +void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const +{ + mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats; + get_stuck_stats(types, cutoff, stuck_pg_stats); + f->open_array_section("stuck_pg_stats"); + for (auto i = stuck_pg_stats.begin(); + i != stuck_pg_stats.end(); + ++i) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << i->first; + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const +{ + mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats; + get_stuck_stats(types, cutoff, stuck_pg_stats); + if (!stuck_pg_stats.empty()) + dump_pg_stats_plain(ss, stuck_pg_stats, true); +} + +int PGMap::dump_stuck_pg_stats( + stringstream &ds, + ceph::Formatter *f, + int threshold, + vector<string>& args) const +{ + int stuck_types = 0; + + for (auto i = args.begin(); i != args.end(); ++i) { + if (*i == "inactive") + stuck_types |= PGMap::STUCK_INACTIVE; + else if (*i == "unclean") + stuck_types |= PGMap::STUCK_UNCLEAN; + else if (*i == "undersized") + stuck_types |= PGMap::STUCK_UNDERSIZED; + else if (*i == "degraded") + stuck_types |= PGMap::STUCK_DEGRADED; + else if (*i == "stale") + stuck_types |= PGMap::STUCK_STALE; + else { + ds << "Unknown type: " << *i << std::endl; + return -EINVAL; + } + } + + utime_t now(ceph_clock_now()); + utime_t cutoff = now - utime_t(threshold, 0); + + if (!f) { + dump_stuck_plain(ds, stuck_types, cutoff); + } else { + dump_stuck(f, stuck_types, cutoff); + f->flush(ds); + } + + return 0; +} + +void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const +{ + f->open_array_section("osd_perf_infos"); + for (auto i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + f->open_object_section("osd"); + f->dump_int("id", i->first); + { + f->open_object_section("perf_stats"); + i->second.os_perf_stat.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} +void PGMap::print_osd_perf_stats(std::ostream *ss) const +{ + TextTable tab; + tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT); + for (auto i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + tab << i->first; + tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull; + tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull; + tab << TextTable::endrow; + } + (*ss) << tab; +} + +void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const +{ + f->open_array_section("osd_blocked_by_infos"); + for (auto i = blocked_by_sum.begin(); + i != blocked_by_sum.end(); + ++i) { + f->open_object_section("osd"); + f->dump_int("id", i->first); + f->dump_int("num_blocked", i->second); + f->close_section(); + } + f->close_section(); +} +void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const +{ + TextTable tab; + tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT); + for (auto i = blocked_by_sum.begin(); + i != blocked_by_sum.end(); + ++i) { + tab << i->first; + tab << i->second; + tab << TextTable::endrow; + } + (*ss) << tab; +} + + +/** + * update aggregated delta + * + * @param cct ceph context + * @param ts Timestamp for the stats being delta'ed + * @param old_pool_sum Previous stats sum + * @param last_ts Last timestamp for pool + * @param result_pool_sum Resulting stats + * @param result_pool_delta Resulting pool delta + * @param result_ts_delta Resulting timestamp delta + * @param delta_avg_list List of last N computed deltas, used to average + */ +void PGMap::update_delta( + CephContext *cct, + const utime_t ts, + const pool_stat_t& old_pool_sum, + utime_t *last_ts, + const pool_stat_t& current_pool_sum, + pool_stat_t *result_pool_delta, + utime_t *result_ts_delta, + mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list) +{ + /* @p ts is the timestamp we want to associate with the data + * in @p old_pool_sum, and on which we will base ourselves to + * calculate the delta, stored in 'delta_t'. + */ + utime_t delta_t; + delta_t = ts; // start with the provided timestamp + delta_t -= *last_ts; // take the last timestamp we saw + *last_ts = ts; // @p ts becomes the last timestamp we saw + + // adjust delta_t, quick start if there is no update in a long period + delta_t = std::min(delta_t, + utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0)); + + // calculate a delta, and average over the last 6 deltas by default. + /* start by taking a copy of our current @p result_pool_sum, and by + * taking out the stats from @p old_pool_sum. This generates a stats + * delta. Stash this stats delta in @p delta_avg_list, along with the + * timestamp delta for these results. + */ + pool_stat_t d = current_pool_sum; + d.stats.sub(old_pool_sum.stats); + + /* Aggregate current delta, and take out the last seen delta (if any) to + * average it out. + * Skip calculating delta while sum was not synchronized. + */ + if(!old_pool_sum.stats.sum.is_zero()) { + delta_avg_list->push_back(make_pair(d,delta_t)); + *result_ts_delta += delta_t; + result_pool_delta->stats.add(d.stats); + } + size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1; + while (delta_avg_list->size() > s) { + result_pool_delta->stats.sub(delta_avg_list->front().first.stats); + *result_ts_delta -= delta_avg_list->front().second; + delta_avg_list->pop_front(); + } +} + +/** + * Update a given pool's deltas + * + * @param cct Ceph Context + * @param ts Timestamp for the stats being delta'ed + * @param pool Pool's id + * @param old_pool_sum Previous stats sum + */ +void PGMap::update_one_pool_delta( + CephContext *cct, + const utime_t ts, + const int64_t pool, + const pool_stat_t& old_pool_sum) +{ + if (per_pool_sum_deltas.count(pool) == 0) { + ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0); + ceph_assert(per_pool_sum_delta.count(pool) == 0); + } + + auto& sum_delta = per_pool_sum_delta[pool]; + + update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool], + &sum_delta.first, &per_pool_sum_deltas_stamps[pool], + &per_pool_sum_deltas[pool]); +} + +/** + * Update pools' deltas + * + * @param cct CephContext + * @param ts Timestamp for the stats being delta'ed + * @param pg_pool_sum_old Map of pool stats for delta calcs. + */ +void PGMap::update_pool_deltas( + CephContext *cct, const utime_t ts, + const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old) +{ + for (auto it = pg_pool_sum_old.begin(); + it != pg_pool_sum_old.end(); ++it) { + update_one_pool_delta(cct, ts, it->first, it->second); + } +} + +void PGMap::clear_delta() +{ + pg_sum_delta = pool_stat_t(); + pg_sum_deltas.clear(); + stamp_delta = utime_t(); +} + +void PGMap::generate_test_instances(list<PGMap*>& o) +{ + o.push_back(new PGMap); + list<Incremental*> inc; + Incremental::generate_test_instances(inc); + delete inc.front(); + inc.pop_front(); + while (!inc.empty()) { + PGMap *pmp = new PGMap(); + *pmp = *o.back(); + o.push_back(pmp); + o.back()->apply_incremental(NULL, *inc.front()); + delete inc.front(); + inc.pop_front(); + } +} + +void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid, + bool primary, set<pg_t>& pgs) const +{ + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + if ((poolid >= 0) && (poolid != i->first.pool())) + continue; + if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary))) + continue; + if (state == (uint64_t)-1 || // "all" + (i->second.state & state) || // matches a state bit + (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0) + pgs.insert(i->first); + } + } +} + +void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const +{ + f->open_array_section("pg_stats"); + for (auto i = pgs.begin(); i != pgs.end(); ++i) { + const pg_stat_t& st = pg_stat.at(*i); + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << *i; + st.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const +{ + TextTable tab; + utime_t now = ceph_clock_now(); + + tab.define_column("PG", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + + for (auto i = pgs.begin(); i != pgs.end(); ++i) { + const pg_stat_t& st = pg_stat.at(*i); + + ostringstream reported; + reported << st.reported_epoch << ":" << st.reported_seq; + + ostringstream upstr, actingstr; + upstr << st.up << 'p' << st.up_primary; + actingstr << st.acting << 'p' << st.acting_primary; + tab << *i + << st.stats.sum.num_objects + << st.stats.sum.num_objects_degraded + << st.stats.sum.num_objects_misplaced + << st.stats.sum.num_objects_unfound + << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys + << st.log_size + << pg_state_string(st.state) + << utimespan_str(now - st.last_change) + << st.version + << reported.str() + << upstr.str() + << actingstr.str() + << st.last_scrub_stamp + << st.last_deep_scrub_stamp + << TextTable::endrow; + } + + ss << tab; +} + +void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, + ceph::Formatter *f, + stringstream *rs) const { + string pool_name = osd_map.get_pool_name(poolid); + if (f) { + f->open_object_section("pool"); + f->dump_string("pool_name", pool_name.c_str()); + f->dump_int("pool_id", poolid); + f->open_object_section("recovery"); + } + list<string> sl; + stringstream tss; + pool_recovery_summary(f, &sl, poolid); + if (!f && !sl.empty()) { + for (auto &p : sl) + tss << " " << p << "\n"; + } + if (f) { + f->close_section(); // object section recovery + f->open_object_section("recovery_rate"); + } + ostringstream rss; + pool_recovery_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " recovery io " << rss.str() << "\n"; + if (f) { + f->close_section(); // object section recovery_rate + f->open_object_section("client_io_rate"); + } + rss.clear(); + rss.str(""); + pool_client_io_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " client io " << rss.str() << "\n"; + // dump cache tier IO rate for cache pool + const pg_pool_t *pool = osd_map.get_pg_pool(poolid); + if (pool->is_tier()) { + if (f) { + f->close_section(); // object section client_io_rate + f->open_object_section("cache_io_rate"); + } + rss.clear(); + rss.str(""); + pool_cache_io_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " cache tier io " << rss.str() << "\n"; + } + if (f) { + f->close_section(); // object section cache_io_rate + f->close_section(); // object section pool + } else { + *rs << "pool " << pool_name << " id " << poolid << "\n"; + if (!tss.str().empty()) + *rs << tss.str() << "\n"; + else + *rs << " nothing is going on\n\n"; + } +} + +void PGMap::get_health_checks( + CephContext *cct, + const OSDMap& osdmap, + health_check_map_t *checks) const +{ + utime_t now = ceph_clock_now(); + const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail"); + const auto& pools = osdmap.get_pools(); + + typedef enum pg_consequence_t { + UNAVAILABLE = 1, // Client IO to the pool may block + DEGRADED = 2, // Fewer than the requested number of replicas are present + BACKFILL_FULL = 3, // Backfill is blocked for space considerations + // This may or may not be a deadlock condition. + DAMAGED = 4, // The data may be missing or inconsistent on disk and + // requires repair + RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full + } pg_consequence_t; + + // For a given PG state, how should it be reported at the pool level? + class PgStateResponse { + public: + pg_consequence_t consequence; + typedef std::function< utime_t(const pg_stat_t&) > stuck_cb; + stuck_cb stuck_since; + bool invert; + + PgStateResponse(const pg_consequence_t& c, stuck_cb&& s) + : consequence(c), stuck_since(std::move(s)), invert(false) + { + } + + PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i) + : consequence(c), stuck_since(std::move(s)), invert(i) + { + } + }; + + // Record the PG state counts that contributed to a reported pool state + class PgCauses { + public: + // Map of PG_STATE_* to number of pgs in that state. + std::map<unsigned, unsigned> states; + + // List of all PG IDs that had a state contributing + // to this health condition. + std::set<pg_t> pgs; + + std::map<pg_t, std::string> pg_messages; + }; + + // Map of PG state to how to respond to it + std::map<unsigned, PgStateResponse> state_to_response = { + // Immediate reports + { PG_STATE_INCONSISTENT, {DAMAGED, {}} }, + { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} }, + { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} }, + { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} }, + { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} }, + { PG_STATE_DEGRADED, {DEGRADED, {}} }, + { PG_STATE_DOWN, {UNAVAILABLE, {}} }, + // Delayed (wait until stuck) reports + { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } }, + { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } }, + { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } }, + // Delayed and inverted reports + { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} } + }; + + // Specialized state printer that takes account of inversion of + // ACTIVE, CLEAN checks. + auto state_name = [](const uint64_t &state) { + // Special cases for the states that are inverted checks + if (state == PG_STATE_CLEAN) { + return std::string("unclean"); + } else if (state == PG_STATE_ACTIVE) { + return std::string("inactive"); + } else { + return pg_state_string(state); + } + }; + + // Map of what is wrong to information about why, implicitly also stores + // the list of what is wrong. + std::map<pg_consequence_t, PgCauses> detected; + + // Optimisation: trim down the number of checks to apply based on + // the summary counters + std::map<unsigned, PgStateResponse> possible_responses; + for (const auto &i : num_pg_by_state) { + for (const auto &j : state_to_response) { + if (!j.second.invert) { + // Check for normal tests by seeing if any pgs have the flag + if (i.first & j.first) { + possible_responses.insert(j); + } + } + } + } + + for (const auto &j : state_to_response) { + if (j.second.invert) { + // Check for inverted tests by seeing if not-all pgs have the flag + const auto &found = num_pg_by_state.find(j.first); + if (found == num_pg_by_state.end() || found->second != num_pg) { + possible_responses.insert(j); + } + } + } + + utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0); + // Loop over all PGs, if there are any possibly-unhealthy states in there + if (!possible_responses.empty()) { + for (const auto& i : pg_stat) { + const auto &pg_id = i.first; + const auto &pg_info = i.second; + + for (const auto &j : state_to_response) { + const auto &pg_response_state = j.first; + const auto &pg_response = j.second; + + // Apply the state test + if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) { + continue; + } + + // Apply stuckness test if needed + if (pg_response.stuck_since) { + // Delayed response, check for stuckness + utime_t last_whatever = pg_response.stuck_since(pg_info); + if (last_whatever.is_zero() && + pg_info.last_change >= cutoff) { + // still moving, ignore + continue; + } else if (last_whatever >= cutoff) { + // Not stuck enough, ignore. + continue; + } else { + + } + } + + auto &causes = detected[pg_response.consequence]; + causes.states[pg_response_state]++; + causes.pgs.insert(pg_id); + + // Don't bother composing detail string if we have already recorded + // too many + if (causes.pg_messages.size() > max) { + continue; + } + + std::ostringstream ss; + if (pg_response.stuck_since) { + utime_t since = pg_response.stuck_since(pg_info); + ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state); + if (since == utime_t()) { + ss << " since forever"; + } else { + utime_t dur = now - since; + ss << " for " << dur; + } + ss << ", current state " << pg_state_string(pg_info.state) + << ", last acting " << pg_info.acting; + } else { + ss << "pg " << pg_id << " is " + << pg_state_string(pg_info.state); + ss << ", acting " << pg_info.acting; + if (pg_info.stats.sum.num_objects_unfound) { + ss << ", " << pg_info.stats.sum.num_objects_unfound + << " unfound"; + } + } + + if (pg_info.state & PG_STATE_INCOMPLETE) { + const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool()); + if (pi && pi->min_size > 1) { + ss << " (reducing pool " + << osdmap.get_pool_name(pg_id.pool()) + << " min_size from " << (int)pi->min_size + << " may help; search ceph.com/docs for 'incomplete')"; + } + } + + causes.pg_messages[pg_id] = ss.str(); + } + } + } else { + dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl; + } + + for (const auto &i : detected) { + std::string health_code; + health_status_t sev; + std::string summary; + switch(i.first) { + case UNAVAILABLE: + health_code = "PG_AVAILABILITY"; + sev = HEALTH_WARN; + summary = "Reduced data availability: "; + break; + case DEGRADED: + health_code = "PG_DEGRADED"; + summary = "Degraded data redundancy: "; + sev = HEALTH_WARN; + break; + case BACKFILL_FULL: + health_code = "PG_BACKFILL_FULL"; + summary = "Low space hindering backfill (add storage if this doesn't resolve itself): "; + sev = HEALTH_WARN; + break; + case DAMAGED: + health_code = "PG_DAMAGED"; + summary = "Possible data damage: "; + sev = HEALTH_ERR; + break; + case RECOVERY_FULL: + health_code = "PG_RECOVERY_FULL"; + summary = "Full OSDs blocking recovery: "; + sev = HEALTH_ERR; + break; + default: + ceph_abort(); + } + + if (i.first == DEGRADED) { + if (pg_sum.stats.sum.num_objects_degraded && + pg_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pg_sum.stats.sum.num_objects_degraded / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_degraded + << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" + << b << "%)"; + + // Throw in a comma for the benefit of the following PG counts + summary += ss.str() + ", "; + } + } + + // Compose summary message saying how many PGs in what states led + // to this health check failing + std::vector<std::string> pg_msgs; + for (const auto &j : i.second.states) { + std::ostringstream msg; + msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first); + pg_msgs.push_back(msg.str()); + } + summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", ")); + + + + health_check_t *check = &checks->add( + health_code, + sev, + summary); + + // Compose list of PGs contributing to this health check failing + for (const auto &j : i.second.pg_messages) { + check->detail.push_back(j.second); + } + } + + // OSD_SCRUB_ERRORS + if (pg_sum.stats.sum.num_scrub_errors) { + ostringstream ss; + ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors"; + checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str()); + } + + // LARGE_OMAP_OBJECTS + if (pg_sum.stats.sum.num_large_omap_objects) { + list<string> detail; + for (auto &pool : pools) { + const string& pool_name = osdmap.get_pool_name(pool.first); + auto it2 = pg_pool_sum.find(pool.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + if (pstat == nullptr) { + continue; + } + const object_stat_sum_t& sum = pstat->stats.sum; + if (sum.num_large_omap_objects) { + stringstream ss; + ss << sum.num_large_omap_objects << " large objects found in pool " + << "'" << pool_name << "'"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects"; + auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str()); + stringstream tip; + tip << "Search the cluster log for 'Large omap object found' for more " + << "details."; + detail.push_back(tip.str()); + d.detail.swap(detail); + } + } + + // CACHE_POOL_NEAR_FULL + { + list<string> detail; + unsigned num_pools = 0; + for (auto& p : pools) { + if ((!p.second.target_max_objects && !p.second.target_max_bytes) || + !pg_pool_sum.count(p.first)) { + continue; + } + bool nearfull = false; + const string& name = osdmap.get_pool_name(p.first); + const pool_stat_t& st = get_pg_pool_sum_stat(p.first); + uint64_t ratio = p.second.cache_target_full_ratio_micro + + ((1000000 - p.second.cache_target_full_ratio_micro) * + cct->_conf->mon_cache_target_full_warn_ratio); + if (p.second.target_max_objects && + (uint64_t)(st.stats.sum.num_objects - + st.stats.sum.num_objects_hit_set_archive) > + p.second.target_max_objects * (ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name << "' with " + << si_u_t(st.stats.sum.num_objects) + << " objects at/near target max " + << si_u_t(p.second.target_max_objects) << " objects"; + detail.push_back(ss.str()); + nearfull = true; + } + if (p.second.target_max_bytes && + (uint64_t)(st.stats.sum.num_bytes - + st.stats.sum.num_bytes_hit_set_archive) > + p.second.target_max_bytes * (ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name + << "' with " << byte_u_t(st.stats.sum.num_bytes) + << " at/near target max " + << byte_u_t(p.second.target_max_bytes); + detail.push_back(ss.str()); + nearfull = true; + } + if (nearfull) { + ++num_pools; + } + } + if (!detail.empty()) { + ostringstream ss; + ss << num_pools << " cache pools at or near target size"; + auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // TOO_FEW_PGS + unsigned num_in = osdmap.get_num_in_osds(); + auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size()); + const auto min_pg_per_osd = + cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd"); + if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) { + auto per = sum_pg_up / num_in; + if (per < min_pg_per_osd && per) { + ostringstream ss; + ss << "too few PGs per OSD (" << per + << " < min " << min_pg_per_osd << ")"; + checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str()); + } + } + + // TOO_MANY_PGS + auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd"); + if (num_in && max_pg_per_osd > 0) { + auto per = sum_pg_up / num_in; + if (per > max_pg_per_osd) { + ostringstream ss; + ss << "too many PGs per OSD (" << per + << " > max " << max_pg_per_osd << ")"; + checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str()); + } + } + + // TOO_FEW_OSDS + auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds"); + auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size"); + if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) { + ostringstream ss; + ss << "OSD count " << osdmap.get_num_osds() + << " < osd_pool_default_size " << osd_pool_default_size; + checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str()); + } + + // SLOW_PING_TIME + // Convert milliseconds to microseconds + auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000; + auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace"); + if (warn_slow_ping_time == 0) { + double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio"); + warn_slow_ping_time = grace; + warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + if (warn_slow_ping_time > 0) { + + struct mon_ping_item_t { + uint32_t pingtime; + int from; + int to; + bool improving; + + bool operator<(const mon_ping_item_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (from < rhs.from) + return true; + if (from > rhs.from) + return false; + return to < rhs.to; + } + }; + + list<string> detail_back; + list<string> detail_front; + list<string> detail; + set<mon_ping_item_t> back_sorted, front_sorted; + for (auto i : osd_stat) { + for (auto j : i.second.hb_pingtime) { + + // Maybe source info is old + if (now.sec() - j.second.last_update > grace * 60) + continue; + + mon_ping_item_t back; + back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); + back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]); + back.from = i.first; + back.to = j.first; + if (back.pingtime > warn_slow_ping_time) { + back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1] + && j.second.back_pingtime[1] < j.second.back_pingtime[2]); + back_sorted.emplace(back); + } + + mon_ping_item_t front; + front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]); + front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]); + front.from = i.first; + front.to = j.first; + if (front.pingtime > warn_slow_ping_time) { + front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1] + && j.second.front_pingtime[1] < j.second.back_pingtime[2]); + front_sorted.emplace(front); + } + } + if (i.second.num_shards_repaired > + cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) { + ostringstream ss; + ss << "osd." << i.first << " had " << i.second.num_shards_repaired << " reads repaired"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << "Too many repaired reads on " << detail.size() << " OSDs"; + auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + int max_detail = 10; + for (auto &sback : boost::adaptors::reverse(back_sorted)) { + ostringstream ss; + if (max_detail == 0) { + ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information"; + detail_back.push_back(ss.str()); + break; + } + max_detail--; + ss << "Slow heartbeat ping on back interface from osd." << sback.from + << (osdmap.is_down(sback.from) ? " (down)" : "") + << " to osd." << sback.to + << (osdmap.is_down(sback.to) ? " (down)" : "") + << " " << fixed_u_to_string(sback.pingtime, 3) << " msec" + << (sback.improving ? " possibly improving" : ""); + detail_back.push_back(ss.str()); + } + max_detail = 10; + for (auto &sfront : boost::adaptors::reverse(front_sorted)) { + ostringstream ss; + if (max_detail == 0) { + ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information"; + detail_front.push_back(ss.str()); + break; + } + max_detail--; + ss << "Slow heartbeat ping on front interface from osd." << sfront.from + << (osdmap.is_down(sfront.from) ? " (down)" : "") + << " to osd." << sfront.to + << (osdmap.is_down(sfront.to) ? " (down)" : "") + << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec" + << (sfront.improving ? " possibly improving" : ""); + detail_front.push_back(ss.str()); + } + if (detail_back.size() != 0) { + ostringstream ss; + ss << "Long heartbeat ping times on back interface seen, longest is " + << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec"; + auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str()); + d.detail.swap(detail_back); + } + if (detail_front.size() != 0) { + ostringstream ss; + ss << "Long heartbeat ping times on front interface seen, longest is " + << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec"; + auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str()); + d.detail.swap(detail_front); + } + } + + // SMALLER_PGP_NUM + // MANY_OBJECTS_PER_PG + if (!pg_stat.empty()) { + list<string> pgp_detail, many_detail; + const auto mon_pg_warn_min_objects = + cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects"); + const auto mon_pg_warn_min_pool_objects = + cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects"); + const auto mon_pg_warn_max_object_skew = + cct->_conf.get_val<double>("mon_pg_warn_max_object_skew"); + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + const pg_pool_t *pi = osdmap.get_pg_pool(p->first); + if (!pi) + continue; // in case osdmap changes haven't propagated to PGMap yet + const string& name = osdmap.get_pool_name(p->first); + // NOTE: we use pg_num_target and pgp_num_target for the purposes of + // the warnings. If the cluster is failing to converge on the target + // values that is a separate issue! + if (pi->get_pg_num_target() > pi->get_pgp_num_target() && + !(name.find(".DELETED") != string::npos && + cct->_conf->mon_fake_pool_delete)) { + ostringstream ss; + ss << "pool " << name << " pg_num " + << pi->get_pg_num_target() + << " > pgp_num " << pi->get_pgp_num_target(); + pgp_detail.push_back(ss.str()); + } + int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size(); + if (average_objects_per_pg > 0 && + pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects && + p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) { + int objects_per_pg = p->second.stats.sum.num_objects / + pi->get_pg_num_target(); + float ratio = (float)objects_per_pg / (float)average_objects_per_pg; + if (mon_pg_warn_max_object_skew > 0 && + ratio > mon_pg_warn_max_object_skew) { + ostringstream ss; + ss << "pool " << name << " objects per pg (" + << objects_per_pg << ") is more than " << ratio + << " times cluster average (" + << average_objects_per_pg << ")"; + many_detail.push_back(ss.str()); + } + } + } + if (!pgp_detail.empty()) { + ostringstream ss; + ss << pgp_detail.size() << " pools have pg_num > pgp_num"; + auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str()); + d.detail.swap(pgp_detail); + } + if (!many_detail.empty()) { + ostringstream ss; + ss << many_detail.size() << " pools have many more objects per pg than" + << " average"; + auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str()); + d.detail.swap(many_detail); + } + } + + // POOL_FULL + // POOL_NEAR_FULL + { + float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100; + float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100; + list<string> full_detail, nearfull_detail; + unsigned full_pools = 0, nearfull_pools = 0; + for (auto it : pools) { + auto it2 = pg_pool_sum.find(it.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + const object_stat_sum_t& sum = pstat->stats.sum; + const string& pool_name = osdmap.get_pool_name(it.first); + const pg_pool_t &pool = it.second; + bool full = false, nearfull = false; + if (pool.quota_max_objects > 0) { + stringstream ss; + if ((uint64_t)sum.num_objects >= pool.quota_max_objects) { + } else if (crit_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (pool.quota_max_bytes > 0) { + stringstream ss; + if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) { + } else if (crit_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << byte_u_t(sum.num_bytes) + << " (max " << byte_u_t(pool.quota_max_bytes) << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << byte_u_t(sum.num_bytes) + << " (max " << byte_u_t(pool.quota_max_bytes) << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (full) { + ++full_pools; + } + if (nearfull) { + ++nearfull_pools; + } + } + if (full_pools) { + ostringstream ss; + ss << full_pools << " pools full"; + auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str()); + d.detail.swap(full_detail); + } + if (nearfull_pools) { + ostringstream ss; + ss << nearfull_pools << " pools nearfull"; + auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(nearfull_detail); + } + } + + // OBJECT_MISPLACED + if (pg_sum.stats.sum.num_objects_misplaced && + pg_sum.stats.sum.num_object_copies > 0 && + cct->_conf->mon_warn_on_misplaced) { + double pc = (double)pg_sum.stats.sum.num_objects_misplaced / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_misplaced + << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced (" + << b << "%)"; + checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str()); + } + + // OBJECT_UNFOUND + if (pg_sum.stats.sum.num_objects_unfound && + pg_sum.stats.sum.num_objects) { + double pc = (double)pg_sum.stats.sum.num_objects_unfound / + (double)pg_sum.stats.sum.num_objects * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_unfound + << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)"; + auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str()); + + for (auto& p : pg_stat) { + if (p.second.stats.sum.num_objects_unfound) { + ostringstream ss; + ss << "pg " << p.first + << " has " << p.second.stats.sum.num_objects_unfound + << " unfound objects"; + d.detail.push_back(ss.str()); + if (d.detail.size() > max) { + d.detail.push_back("(additional pgs left out for brevity)"); + break; + } + } + } + } + + // REQUEST_SLOW + // REQUEST_STUCK + // SLOW_OPS unifies them in mimic. + if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC && + cct->_conf->mon_osd_warn_op_age > 0 && + !osd_sum.op_queue_age_hist.h.empty() && + osd_sum.op_queue_age_hist.upper_bound() / 1000.0 > + cct->_conf->mon_osd_warn_op_age) { + list<string> warn_detail, error_detail; + unsigned warn = 0, error = 0; + float err_age = + cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio; + const pow2_hist_t& h = osd_sum.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + ostringstream ss; + ss << h.h[i] << " ops are blocked > " << ub << " sec"; + if (ub > err_age) { + error += h.h[i]; + error_detail.push_back(ss.str()); + } else { + warn += h.h[i]; + warn_detail.push_back(ss.str()); + } + } + } + + map<float,set<int>> warn_osd_by_max; // max -> osds + map<float,set<int>> error_osd_by_max; // max -> osds + if (!warn_detail.empty() || !error_detail.empty()) { + for (auto& p : osd_stat) { + const pow2_hist_t& h = p.second.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + if (ub > err_age) { + error_osd_by_max[ub].insert(p.first); + } else { + warn_osd_by_max[ub].insert(p.first); + } + break; + } + } + } + } + + if (!warn_detail.empty()) { + ostringstream ss; + ss << warn << " slow requests are blocked > " + << cct->_conf->mon_osd_warn_op_age << " sec"; + auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str()); + d.detail.swap(warn_detail); + int left = max; + for (auto& p : warn_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second + << " have blocked requests > " << p.first << " sec"; + } else { + ss << "osd." << *p.second.begin() + << " has blocked requests > " << p.first << " sec"; + } + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + if (!error_detail.empty()) { + ostringstream ss; + ss << error << " stuck requests are blocked > " + << err_age << " sec"; + auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str()); + d.detail.swap(error_detail); + int left = max; + for (auto& p : error_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second + << " have stuck requests > " << p.first << " sec"; + } else { + ss << "osd." << *p.second.begin() + << " has stuck requests > " << p.first << " sec"; + } + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + } + + // OBJECT_STORE_WARN + if (osd_sum.os_alerts.size()) { + map<string, pair<size_t, list<string>>> os_alerts_sum; + + for (auto& a : osd_sum.os_alerts) { + int left = max; + string s0 = " osd."; + s0 += stringify(a.first); + for (auto& aa : a.second) { + string s(s0); + s += " "; + s += aa.second; + auto it = os_alerts_sum.find(aa.first); + if (it == os_alerts_sum.end()) { + list<string> d; + d.emplace_back(s); + os_alerts_sum.emplace(aa.first, std::make_pair(1, d)); + } else { + auto& p = it->second; + ++p.first; + p.second.emplace_back(s); + } + if (--left == 0) { + break; + } + } + } + + for (auto& asum : os_alerts_sum) { + string summary; + if (asum.first == "BLUEFS_SPILLOVER") { + summary = "BlueFS spillover detected"; + } else if (asum.first == "BLUESTORE_NO_COMPRESSION") { + summary = "BlueStore compression broken"; + } else if (asum.first == "BLUESTORE_LEGACY_STATFS") { + summary = "Legacy BlueStore stats reporting detected"; + } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") { + summary = "BlueStore has dangerous mismatch between block device and free list sizes"; + } + summary += " on "; + summary += stringify(asum.second.first); + summary += " OSD(s)"; + auto& d = checks->add(asum.first, HEALTH_WARN, summary); + for (auto& s : asum.second.second) { + d.detail.push_back(s); + } + } + } + // PG_NOT_SCRUBBED + // PG_NOT_DEEP_SCRUBBED + if (cct->_conf->mon_warn_pg_not_scrubbed_ratio || + cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) { + list<string> detail, deep_detail; + int detail_max = max, deep_detail_max = max; + int detail_more = 0, deep_detail_more = 0; + int detail_total = 0, deep_detail_total = 0; + for (auto& p : pg_stat) { + int64_t pnum = p.first.pool(); + auto pool = osdmap.get_pg_pool(pnum); + if (!pool) + continue; + if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) { + double scrub_max_interval = 0; + pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); + if (scrub_max_interval <= 0) { + scrub_max_interval = cct->_conf->osd_scrub_max_interval; + } + const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) + + scrub_max_interval; + utime_t cutoff = now; + cutoff -= age; + if (p.second.last_scrub_stamp < cutoff) { + if (detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not scrubbed since " + << p.second.last_scrub_stamp; + detail.push_back(ss.str()); + --detail_max; + } else { + ++detail_more; + } + ++detail_total; + } + } + if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) { + double deep_scrub_interval = 0; + pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval); + if (deep_scrub_interval <= 0) { + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + } + double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) + + deep_scrub_interval; + utime_t deep_cutoff = now; + deep_cutoff -= deep_age; + if (p.second.last_deep_scrub_stamp < deep_cutoff) { + if (deep_detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not deep-scrubbed since " + << p.second.last_deep_scrub_stamp; + deep_detail.push_back(ss.str()); + --deep_detail_max; + } else { + ++deep_detail_more; + } + ++deep_detail_total; + } + } + } + if (detail_total) { + ostringstream ss; + ss << detail_total << " pgs not scrubbed in time"; + auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str()); + + if (!detail.empty()) { + d.detail.swap(detail); + + if (detail_more) { + ostringstream ss; + ss << detail_more << " more pgs... "; + d.detail.push_back(ss.str()); + } + } + } + if (deep_detail_total) { + ostringstream ss; + ss << deep_detail_total << " pgs not deep-scrubbed in time"; + auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str()); + + if (!deep_detail.empty()) { + d.detail.swap(deep_detail); + + if (deep_detail_more) { + ostringstream ss; + ss << deep_detail_more << " more pgs... "; + d.detail.push_back(ss.str()); + } + } + } + } + + // POOL_APP + if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) { + list<string> detail; + for (auto &it : pools) { + const pg_pool_t &pool = it.second; + const string& pool_name = osdmap.get_pool_name(it.first); + auto it2 = pg_pool_sum.find(it.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + if (pstat == nullptr) { + continue; + } + const object_stat_sum_t& sum = pstat->stats.sum; + // application metadata is not encoded until luminous is minimum + // required release + if (sum.num_objects > 0 && pool.application_metadata.empty() && + !pool.is_tier()) { + stringstream ss; + ss << "application not enabled on pool '" << pool_name << "'"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << "application not enabled on " << detail.size() << " pool(s)"; + auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str()); + stringstream tip; + tip << "use 'ceph osd pool application enable <pool-name> " + << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', " + << "or freeform for custom applications."; + detail.push_back(tip.str()); + d.detail.swap(detail); + } + } + + // PG_SLOW_SNAP_TRIMMING + if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) { + uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on; + uint64_t snaptrimq_exceeded = 0; + uint32_t longest_queue = 0; + const pg_t* longest_q_pg = nullptr; + list<string> detail; + + for (auto& i: pg_stat) { + uint32_t current_len = i.second.snaptrimq_len; + if (current_len >= snapthreshold) { + snaptrimq_exceeded++; + if (longest_queue <= current_len) { + longest_q_pg = &i.first; + longest_queue = current_len; + } + if (detail.size() < max - 1) { + stringstream ss; + ss << "snap trim queue for pg " << i.first << " at " << current_len; + detail.push_back(ss.str()); + continue; + } + if (detail.size() < max) { + detail.push_back("...more pgs affected"); + continue; + } + } + } + + if (snaptrimq_exceeded) { + { + ostringstream ss; + ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue; + detail.push_back(ss.str()); + } + + stringstream ss; + ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)"; + auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str()); + detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\"."); + d.detail.swap(detail); + } + } +} + +int process_pg_map_command( + const string& orig_prefix, + const cmdmap_t& orig_cmdmap, + const PGMap& pg_map, + const OSDMap& osdmap, + ceph::Formatter *f, + stringstream *ss, + bufferlist *odata) +{ + string prefix = orig_prefix; + auto cmdmap = orig_cmdmap; + + string omap_stats_note = + "\n* NOTE: Omap statistics are gathered during deep scrub and " + "may be inaccurate soon afterwards depending on utilisation. See " + "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics " + "for further details.\n"; + bool omap_stats_note_required = false; + + // perhaps these would be better in the parsing, but it's weird + bool primary = false; + if (prefix == "pg dump_json") { + vector<string> v; + v.push_back(string("all")); + cmd_putval(g_ceph_context, cmdmap, "format", string("json")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); + prefix = "pg dump"; + } else if (prefix == "pg dump_pools_json") { + vector<string> v; + v.push_back(string("pools")); + cmd_putval(g_ceph_context, cmdmap, "format", string("json")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); + prefix = "pg dump"; + } else if (prefix == "pg ls-by-primary") { + primary = true; + prefix = "pg ls"; + } else if (prefix == "pg ls-by-osd") { + prefix = "pg ls"; + } else if (prefix == "pg ls-by-pool") { + prefix = "pg ls"; + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + *ss << "pool " << poolstr << " does not exist"; + return -ENOENT; + } + cmd_putval(g_ceph_context, cmdmap, "pool", pool); + } + + stringstream ds; + if (prefix == "pg stat") { + if (f) { + f->open_object_section("pg_summary"); + pg_map.print_oneline_summary(f, NULL); + f->close_section(); + f->flush(ds); + } else { + ds << pg_map; + } + odata->append(ds); + return 0; + } + + if (prefix == "pg getmap") { + pg_map.encode(*odata); + *ss << "got pgmap version " << pg_map.version; + return 0; + } + + if (prefix == "pg dump") { + string val; + vector<string> dumpcontents; + set<string> what; + if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) { + copy(dumpcontents.begin(), dumpcontents.end(), + inserter(what, what.end())); + } + if (what.empty()) + what.insert("all"); + if (f) { + if (what.count("all")) { + f->open_object_section("pg_map"); + pg_map.dump(f); + f->close_section(); + } else if (what.count("summary") || what.count("sum")) { + f->open_object_section("pg_map"); + pg_map.dump_basic(f); + f->close_section(); + } else { + if (what.count("pools")) { + pg_map.dump_pool_stats(f); + } + if (what.count("osds")) { + pg_map.dump_osd_stats(f); + } + if (what.count("pgs")) { + pg_map.dump_pg_stats(f, false); + } + if (what.count("pgs_brief")) { + pg_map.dump_pg_stats(f, true); + } + if (what.count("delta")) { + f->open_object_section("delta"); + pg_map.dump_delta(f); + f->close_section(); + } + } + f->flush(*odata); + } else { + if (what.count("all")) { + pg_map.dump(ds); + omap_stats_note_required = true; + } else if (what.count("summary") || what.count("sum")) { + pg_map.dump_basic(ds); + pg_map.dump_pg_sum_stats(ds, true); + pg_map.dump_osd_sum_stats(ds); + omap_stats_note_required = true; + } else { + if (what.count("pgs_brief")) { + pg_map.dump_pg_stats(ds, true); + } + bool header = true; + if (what.count("pgs")) { + pg_map.dump_pg_stats(ds, false); + header = false; + omap_stats_note_required = true; + } + if (what.count("pools")) { + pg_map.dump_pool_stats(ds, header); + omap_stats_note_required = true; + } + if (what.count("osds")) { + pg_map.dump_osd_stats(ds); + } + } + odata->append(ds); + if (omap_stats_note_required) { + odata->append(omap_stats_note); + } + } + *ss << "dumped " << what; + return 0; + } + + if (prefix == "pg ls") { + int64_t osd = -1; + int64_t pool = -1; + vector<string>states; + set<pg_t> pgs; + cmd_getval(g_ceph_context, cmdmap, "pool", pool); + cmd_getval(g_ceph_context, cmdmap, "osd", osd); + cmd_getval(g_ceph_context, cmdmap, "states", states); + if (pool >= 0 && !osdmap.have_pg_pool(pool)) { + *ss << "pool " << pool << " does not exist"; + return -ENOENT; + } + if (osd >= 0 && !osdmap.is_up(osd)) { + *ss << "osd " << osd << " is not up"; + return -EAGAIN; + } + if (states.empty()) + states.push_back("all"); + + uint64_t state = 0; + + while (!states.empty()) { + string state_str = states.back(); + + if (state_str == "all") { + state = -1; + break; + } else { + auto filter = pg_string_state(state_str); + if (!filter) { + *ss << "'" << state_str << "' is not a valid pg state," + << " available choices: " << pg_state_string(0xFFFFFFFF); + return -EINVAL; + } + state |= *filter; + } + + states.pop_back(); + } + + pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs); + + if (f && !pgs.empty()) { + pg_map.dump_filtered_pg_stats(f, pgs); + f->flush(*odata); + } else if (!pgs.empty()) { + pg_map.dump_filtered_pg_stats(ds, pgs); + odata->append(ds); + odata->append(omap_stats_note); + } + return 0; + } + + if (prefix == "pg dump_stuck") { + vector<string> stuckop_vec; + cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec); + if (stuckop_vec.empty()) + stuckop_vec.push_back("unclean"); + int64_t threshold; + cmd_getval(g_ceph_context, cmdmap, "threshold", threshold, + g_conf().get_val<int64_t>("mon_pg_stuck_threshold")); + + if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) { + *ss << "failed"; + } else { + *ss << "ok"; + } + odata->append(ds); + return 0; + } + + if (prefix == "pg debug") { + string debugop; + cmd_getval(g_ceph_context, cmdmap, "debugop", debugop, + string("unfound_objects_exist")); + if (debugop == "unfound_objects_exist") { + bool unfound_objects_exist = false; + for (const auto& p : pg_map.pg_stat) { + if (p.second.stats.sum.num_objects_unfound > 0) { + unfound_objects_exist = true; + break; + } + } + if (unfound_objects_exist) + ds << "TRUE"; + else + ds << "FALSE"; + odata->append(ds); + return 0; + } + if (debugop == "degraded_pgs_exist") { + bool degraded_pgs_exist = false; + for (const auto& p : pg_map.pg_stat) { + if (p.second.stats.sum.num_objects_degraded > 0) { + degraded_pgs_exist = true; + break; + } + } + if (degraded_pgs_exist) + ds << "TRUE"; + else + ds << "FALSE"; + odata->append(ds); + return 0; + } + } + + if (prefix == "osd perf") { + if (f) { + f->open_object_section("osdstats"); + pg_map.dump_osd_perf_stats(f); + f->close_section(); + f->flush(ds); + } else { + pg_map.print_osd_perf_stats(&ds); + } + odata->append(ds); + return 0; + } + + if (prefix == "osd blocked-by") { + if (f) { + f->open_object_section("osd_blocked_by"); + pg_map.dump_osd_blocked_by_stats(f); + f->close_section(); + f->flush(ds); + } else { + pg_map.print_osd_blocked_by_stats(&ds); + } + odata->append(ds); + return 0; + } + + return -EOPNOTSUPP; +} + +void PGMapUpdater::check_osd_map( + CephContext *cct, + const OSDMap& osdmap, + const PGMap& pgmap, + PGMap::Incremental *pending_inc) +{ + for (auto& p : pgmap.osd_stat) { + if (!osdmap.exists(p.first)) { + // remove osd_stat + pending_inc->rm_stat(p.first); + } else if (osdmap.is_out(p.first)) { + // zero osd_stat + if (p.second.statfs.total != 0) { + pending_inc->stat_osd_out(p.first); + } + } else if (!osdmap.is_up(p.first)) { + // zero the op_queue_age_hist + if (!p.second.op_queue_age_hist.empty()) { + pending_inc->stat_osd_down_up(p.first, pgmap); + } + } + } + + // deleted pgs (pools)? + for (auto& p : pgmap.pg_pool_sum) { + if (!osdmap.have_pg_pool(p.first)) { + ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs" + << dendl; + for (auto& q : pgmap.pg_stat) { + if (q.first.pool() == p.first) { + pending_inc->pg_remove.insert(q.first); + } + } + auto q = pending_inc->pg_stat_updates.begin(); + while (q != pending_inc->pg_stat_updates.end()) { + if (q->first.pool() == p.first) { + q = pending_inc->pg_stat_updates.erase(q); + } else { + ++q; + } + } + } + } + + // new (split or new pool) or merged pgs? + map<int64_t,unsigned> new_pg_num; + for (auto& p : osdmap.get_pools()) { + int64_t poolid = p.first; + const pg_pool_t& pi = p.second; + auto q = pgmap.num_pg_by_pool.find(poolid); + unsigned my_pg_num = 0; + if (q != pgmap.num_pg_by_pool.end()) + my_pg_num = q->second; + unsigned pg_num = pi.get_pg_num(); + new_pg_num[poolid] = pg_num; + if (my_pg_num < pg_num) { + ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num + << " > my pg_num " << my_pg_num << dendl; + for (unsigned ps = my_pg_num; ps < pg_num; ++ps) { + pg_t pgid(ps, poolid); + if (pending_inc->pg_stat_updates.count(pgid) == 0) { + ldout(cct,20) << __func__ << " adding " << pgid << dendl; + pg_stat_t &stats = pending_inc->pg_stat_updates[pgid]; + stats.last_fresh = osdmap.get_modified(); + stats.last_active = osdmap.get_modified(); + stats.last_change = osdmap.get_modified(); + stats.last_peered = osdmap.get_modified(); + stats.last_clean = osdmap.get_modified(); + stats.last_unstale = osdmap.get_modified(); + stats.last_undegraded = osdmap.get_modified(); + stats.last_fullsized = osdmap.get_modified(); + stats.last_scrub_stamp = osdmap.get_modified(); + stats.last_deep_scrub_stamp = osdmap.get_modified(); + stats.last_clean_scrub_stamp = osdmap.get_modified(); + } + } + } else if (my_pg_num > pg_num) { + ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num + << " < my pg_num " << my_pg_num << dendl; + for (unsigned i = pg_num; i < my_pg_num; ++i) { + pg_t pgid(i, poolid); + ldout(cct,20) << __func__ << " removing merged " << pgid << dendl; + if (pgmap.pg_stat.count(pgid)) { + pending_inc->pg_remove.insert(pgid); + } + pending_inc->pg_stat_updates.erase(pgid); + } + } + } + auto i = pending_inc->pg_stat_updates.begin(); + while (i != pending_inc->pg_stat_updates.end()) { + auto j = new_pg_num.find(i->first.pool()); + if (j == new_pg_num.end() || + i->first.ps() >= j->second) { + ldout(cct,20) << __func__ << " removing pending update to old " + << i->first << dendl; + i = pending_inc->pg_stat_updates.erase(i); + } else { + ++i; + } + } +} + +static void _try_mark_pg_stale( + const OSDMap& osdmap, + pg_t pgid, + const pg_stat_t& cur, + PGMap::Incremental *pending_inc) +{ + if ((cur.state & PG_STATE_STALE) == 0 && + cur.acting_primary != -1 && + osdmap.is_down(cur.acting_primary)) { + pg_stat_t *newstat; + auto q = pending_inc->pg_stat_updates.find(pgid); + if (q != pending_inc->pg_stat_updates.end()) { + if ((q->second.acting_primary == cur.acting_primary) || + ((q->second.state & PG_STATE_STALE) == 0 && + q->second.acting_primary != -1 && + osdmap.is_down(q->second.acting_primary))) { + newstat = &q->second; + } else { + // pending update is no longer down or already stale + return; + } + } else { + newstat = &pending_inc->pg_stat_updates[pgid]; + *newstat = cur; + } + dout(10) << __func__ << " marking pg " << pgid + << " stale (acting_primary " << newstat->acting_primary + << ")" << dendl; + newstat->state |= PG_STATE_STALE; + newstat->last_unstale = ceph_clock_now(); + } +} + +void PGMapUpdater::check_down_pgs( + const OSDMap &osdmap, + const PGMap &pg_map, + bool check_all, + const set<int>& need_check_down_pg_osds, + PGMap::Incremental *pending_inc) +{ + // if a large number of osds changed state, just iterate over the whole + // pg map. + if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() * + g_conf().get_val<double>("mon_pg_check_down_all_threshold")) { + check_all = true; + } + + if (check_all) { + for (const auto& p : pg_map.pg_stat) { + _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc); + } + } else { + for (auto osd : need_check_down_pg_osds) { + if (osdmap.is_down(osd)) { + auto p = pg_map.pg_by_osd.find(osd); + if (p == pg_map.pg_by_osd.end()) { + continue; + } + for (auto pgid : p->second) { + const pg_stat_t &stat = pg_map.pg_stat.at(pgid); + ceph_assert(stat.acting_primary == osd); + _try_mark_pg_stale(osdmap, pgid, stat, pending_inc); + } + } + } + } +} + +int reweight::by_utilization( + const OSDMap &osdmap, + const PGMap &pgm, + int oload, + double max_changef, + int max_osds, + bool by_pg, const set<int64_t> *pools, + bool no_increasing, + mempool::osdmap::map<int32_t, uint32_t>* new_weights, + std::stringstream *ss, + std::string *out_str, + ceph::Formatter *f) +{ + if (oload <= 100) { + *ss << "You must give a percentage higher than 100. " + "The reweighting threshold will be calculated as <average-utilization> " + "times <input-percentage>. For example, an argument of 200 would " + "reweight OSDs which are twice as utilized as the average OSD.\n"; + return -EINVAL; + } + + vector<int> pgs_by_osd(osdmap.get_max_osd()); + + // Avoid putting a small number (or 0) in the denominator when calculating + // average_util + double average_util; + if (by_pg) { + // by pg mapping + double weight_sum = 0.0; // sum up the crush weights + unsigned num_pg_copies = 0; + int num_osds = 0; + for (const auto& pg : pgm.pg_stat) { + if (pools && pools->count(pg.first.pool()) == 0) + continue; + for (const auto acting : pg.second.acting) { + if (!osdmap.exists(acting)) { + continue; + } + if (acting >= (int)pgs_by_osd.size()) + pgs_by_osd.resize(acting); + if (pgs_by_osd[acting] == 0) { + if (osdmap.crush->get_item_weightf(acting) <= 0) { + //skip if we currently can not identify item + continue; + } + weight_sum += osdmap.crush->get_item_weightf(acting); + ++num_osds; + } + ++pgs_by_osd[acting]; + ++num_pg_copies; + } + } + + if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) { + *ss << "Refusing to reweight: we only have " << num_pg_copies + << " PGs across " << num_osds << " osds!\n"; + return -EDOM; + } + + average_util = (double)num_pg_copies / weight_sum; + } else { + // by osd utilization + int num_osd = std::max<size_t>(1, pgm.osd_stat.size()); + if ((uint64_t)pgm.osd_sum.statfs.total / num_osd + < g_conf()->mon_reweight_min_bytes_per_osd) { + *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb() + << " kb across all osds!\n"; + return -EDOM; + } + if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd + < g_conf()->mon_reweight_min_bytes_per_osd) { + *ss << "Refusing to reweight: we only have " + << pgm.osd_sum.statfs.kb_used_raw() + << " kb used across all osds!\n"; + return -EDOM; + } + + average_util = (double)pgm.osd_sum.statfs.get_used_raw() / + (double)pgm.osd_sum.statfs.total; + } + + // adjust down only if we are above the threshold + const double overload_util = average_util * (double)oload / 100.0; + + // but aggressively adjust weights up whenever possible. + const double underload_util = average_util; + + const unsigned max_change = (unsigned)(max_changef * (double)0x10000); + + ostringstream oss; + if (f) { + f->open_object_section("reweight_by_utilization"); + f->dump_int("overload_min", oload); + f->dump_float("max_change", max_changef); + f->dump_int("max_change_osds", max_osds); + f->dump_float("average_utilization", average_util); + f->dump_float("overload_utilization", overload_util); + } else { + oss << "oload " << oload << "\n"; + oss << "max_change " << max_changef << "\n"; + oss << "max_change_osds " << max_osds << "\n"; + oss.precision(4); + oss << "average_utilization " << std::fixed << average_util << "\n"; + oss << "overload_utilization " << overload_util << "\n"; + } + int num_changed = 0; + + // precompute util for each OSD + std::vector<std::pair<int, float> > util_by_osd; + for (const auto& p : pgm.osd_stat) { + std::pair<int, float> osd_util; + osd_util.first = p.first; + if (by_pg) { + if (p.first >= (int)pgs_by_osd.size() || + pgs_by_osd[p.first] == 0) { + // skip if this OSD does not contain any pg + // belonging to the specified pool(s). + continue; + } + + if (osdmap.crush->get_item_weightf(p.first) <= 0) { + // skip if we are unable to locate item. + continue; + } + + osd_util.second = + pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first); + } else { + osd_util.second = + (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total; + } + util_by_osd.push_back(osd_util); + } + + // sort by absolute deviation from the mean utilization, + // in descending order. + std::sort(util_by_osd.begin(), util_by_osd.end(), + [average_util](std::pair<int, float> l, std::pair<int, float> r) { + return abs(l.second - average_util) > abs(r.second - average_util); + } + ); + + if (f) + f->open_array_section("reweights"); + + for (const auto& p : util_by_osd) { + unsigned weight = osdmap.get_weight(p.first); + if (weight == 0) { + // skip if OSD is currently out + continue; + } + float util = p.second; + + if (util >= overload_util) { + // Assign a lower weight to overloaded OSDs. The current weight + // is a factor to take into account the original weights, + // to represent e.g. differing storage capacities + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + if (weight > max_change) + new_weight = std::max(new_weight, weight - max_change); + new_weights->insert({p.first, new_weight}); + if (f) { + f->open_object_section("osd"); + f->dump_int("osd", p.first); + f->dump_float("weight", (float)weight / (float)0x10000); + f->dump_float("new_weight", (float)new_weight / (float)0x10000); + f->close_section(); + } else { + oss << "osd." << p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + } + if (++num_changed >= max_osds) + break; + } + if (!no_increasing && util <= underload_util) { + // assign a higher weight.. if we can. + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + new_weight = std::min(new_weight, weight + max_change); + if (new_weight > 0x10000) + new_weight = 0x10000; + if (new_weight > weight) { + new_weights->insert({p.first, new_weight}); + oss << "osd." << p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + if (++num_changed >= max_osds) + break; + } + } + } + if (f) { + f->close_section(); + } + + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + OSDMap::Incremental newinc; + newinc.fsid = newmap.get_fsid(); + newinc.epoch = newmap.get_epoch() + 1; + newinc.new_weight = *new_weights; + newmap.apply_incremental(newinc); + + osdmap.summarize_mapping_stats(&newmap, pools, out_str, f); + + if (f) { + f->close_section(); + } else { + *out_str += "\n"; + *out_str += oss.str(); + } + return num_changed; +} |