Diffstat (limited to 'src/osd/OSDMap.cc')
-rw-r--r-- | src/osd/OSDMap.cc | 7377
1 file changed, 7377 insertions, 0 deletions
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc new file mode 100644 index 000000000..11f9a87d7 --- /dev/null +++ b/src/osd/OSDMap.cc @@ -0,0 +1,7377 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <algorithm> +#include <bit> +#include <optional> +#include <random> +#include <fmt/format.h> + +#include <boost/algorithm/string.hpp> + +#include "OSDMap.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "include/ceph_features.h" +#include "include/common_fwd.h" +#include "include/str_map.h" + +#include "common/code_environment.h" +#include "mon/health_check.h" + +#include "crush/CrushTreeDumper.h" +#include "common/Clock.h" +#include "mon/PGMap.h" + +using std::list; +using std::make_pair; +using std::map; +using std::multimap; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::unordered_map; +using std::vector; + +using ceph::decode; +using ceph::encode; +using ceph::Formatter; + +#define dout_subsys ceph_subsys_osd + +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap); + + +// ---------------------------------- +// osd_info_t + +void osd_info_t::dump(Formatter *f) const +{ + f->dump_int("last_clean_begin", last_clean_begin); + f->dump_int("last_clean_end", last_clean_end); + f->dump_int("up_from", up_from); + f->dump_int("up_thru", up_thru); + f->dump_int("down_at", down_at); + f->dump_int("lost_at", lost_at); +} + +void osd_info_t::encode(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u8 struct_v = 1; + encode(struct_v, bl); + encode(last_clean_begin, bl); + encode(last_clean_end, bl); + encode(up_from, bl); + encode(up_thru, bl); + encode(down_at, bl); + encode(lost_at, bl); +} + +void osd_info_t::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + __u8 struct_v; + decode(struct_v, bl); + decode(last_clean_begin, bl); + decode(last_clean_end, bl); + decode(up_from, bl); + decode(up_thru, bl); + decode(down_at, bl); + decode(lost_at, bl); +} + +void osd_info_t::generate_test_instances(list<osd_info_t*>& o) +{ + o.push_back(new osd_info_t); + o.push_back(new osd_info_t); + o.back()->last_clean_begin = 1; + o.back()->last_clean_end = 2; + o.back()->up_from = 30; + o.back()->up_thru = 40; + o.back()->down_at = 5; + o.back()->lost_at = 6; +} + +ostream& operator<<(ostream& out, const osd_info_t& info) +{ + out << "up_from " << info.up_from + << " up_thru " << info.up_thru + << " down_at " << info.down_at + << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")"; + if (info.lost_at) + out << " lost_at " << info.lost_at; + return out; +} + +// ---------------------------------- +// osd_xinfo_t + +void osd_xinfo_t::dump(Formatter *f) const +{ + f->dump_stream("down_stamp") << down_stamp; + f->dump_float("laggy_probability", laggy_probability); + f->dump_int("laggy_interval", 
laggy_interval); + f->dump_int("features", features); + f->dump_unsigned("old_weight", old_weight); + f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub; + f->dump_int("dead_epoch", dead_epoch); +} + +void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const +{ + uint8_t v = 4; + if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) { + v = 3; + } + ENCODE_START(v, 1, bl); + encode(down_stamp, bl); + __u32 lp = laggy_probability * float(0xfffffffful); + encode(lp, bl); + encode(laggy_interval, bl); + encode(features, bl); + encode(old_weight, bl); + if (v >= 4) { + encode(last_purged_snaps_scrub, bl); + encode(dead_epoch, bl); + } + ENCODE_FINISH(bl); +} + +void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl) +{ + DECODE_START(4, bl); + decode(down_stamp, bl); + __u32 lp; + decode(lp, bl); + laggy_probability = (float)lp / (float)0xffffffff; + decode(laggy_interval, bl); + if (struct_v >= 2) + decode(features, bl); + else + features = 0; + if (struct_v >= 3) + decode(old_weight, bl); + else + old_weight = 0; + if (struct_v >= 4) { + decode(last_purged_snaps_scrub, bl); + decode(dead_epoch, bl); + } else { + dead_epoch = 0; + } + DECODE_FINISH(bl); +} + +void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o) +{ + o.push_back(new osd_xinfo_t); + o.push_back(new osd_xinfo_t); + o.back()->down_stamp = utime_t(2, 3); + o.back()->laggy_probability = .123; + o.back()->laggy_interval = 123456; + o.back()->old_weight = 0x7fff; +} + +ostream& operator<<(ostream& out, const osd_xinfo_t& xi) +{ + return out << "down_stamp " << xi.down_stamp + << " laggy_probability " << xi.laggy_probability + << " laggy_interval " << xi.laggy_interval + << " old_weight " << xi.old_weight + << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub + << " dead_epoch " << xi.dead_epoch; +} + +// ---------------------------------- +// OSDMap::Incremental + +int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const +{ + int n = 0; + for (auto &weight : new_weight) { + if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first)) + n++; // marked out + else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first)) + n--; // marked in + } + return n; +} + +int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const +{ + int n = 0; + for (auto &state : new_state) { // + if (state.second & CEPH_OSD_UP) { + if (previous->is_up(state.first)) + n++; // marked down + else + n--; // marked up + } + } + return n; +} + +int OSDMap::Incremental::identify_osd(uuid_d u) const +{ + for (auto &uuid : new_uuid) + if (uuid.second == u) + return uuid.first; + return -1; +} + +int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct, + const OSDMap& osdmap) +{ + ceph_assert(epoch == osdmap.get_epoch() + 1); + + for (auto &new_pool : new_pools) { + if (!new_pool.second.tiers.empty()) { + pg_pool_t& base = new_pool.second; + + auto new_rem_it = new_removed_snaps.find(new_pool.first); + + for (const auto &tier_pool : base.tiers) { + const auto &r = new_pools.find(tier_pool); + pg_pool_t *tier = 0; + if (r == new_pools.end()) { + const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool); + if (!orig) { + lderr(cct) << __func__ << " no pool " << tier_pool << dendl; + return -EIO; + } + tier = get_new_pool(tier_pool, orig); + } else { + tier = &r->second; + } + if (tier->tier_of != new_pool.first) { + lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl; + return -EIO; + } + + ldout(cct, 10) 
<< __func__ << " from " << new_pool.first << " to " + << tier_pool << dendl; + tier->snap_seq = base.snap_seq; + tier->snap_epoch = base.snap_epoch; + tier->snaps = base.snaps; + tier->removed_snaps = base.removed_snaps; + tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS| + pg_pool_t::FLAG_POOL_SNAPS); + + if (new_rem_it != new_removed_snaps.end()) { + new_removed_snaps[tier_pool] = new_rem_it->second; + } + + tier->application_metadata = base.application_metadata; + } + } + } + return 0; +} + +// ---------------------------------- +// OSDMap + +bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const +{ + if (id >= 0) + return is_down(id); + + if (down_cache && + down_cache->count(id)) { + return true; + } + + list<int> children; + crush->get_children(id, &children); + for (const auto &child : children) { + if (!subtree_is_down(child, down_cache)) { + return false; + } + } + if (down_cache) { + down_cache->insert(id); + } + return true; +} + +bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const +{ + // use a stack-local down_cache if we didn't get one from the + // caller. then at least this particular call will avoid duplicated + // work. + set<int> local_down_cache; + if (!down_cache) { + down_cache = &local_down_cache; + } + + int current = id; + while (true) { + int type; + if (current >= 0) { + type = 0; + } else { + type = crush->get_bucket_type(current); + } + ceph_assert(type >= 0); + + if (!subtree_is_down(current, down_cache)) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl; + return false; + } + + // is this a big enough subtree to be marked as down? + if (type >= subtree_type) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... 
" << type << " >= " << subtree_type << dendl; + return true; + } + + int r = crush->get_immediate_parent_id(current, ¤t); + if (r < 0) { + return false; + } + } +} + +bool OSDMap::subtree_type_is_down( + CephContext *cct, + int id, + int subtree_type, + set<int> *down_in_osds, + set<int> *up_in_osds, + set<int> *subtree_up, + unordered_map<int, set<int> > *subtree_type_down) const +{ + if (id >= 0) { + bool is_down_ret = is_down(id); + if (!is_out(id)) { + if (is_down_ret) { + down_in_osds->insert(id); + } else { + up_in_osds->insert(id); + } + } + return is_down_ret; + } + + if (subtree_type_down && + (*subtree_type_down)[subtree_type].count(id)) { + return true; + } + + list<int> children; + crush->get_children(id, &children); + for (const auto &child : children) { + if (!subtree_type_is_down( + cct, child, crush->get_bucket_type(child), + down_in_osds, up_in_osds, subtree_up, subtree_type_down)) { + subtree_up->insert(id); + return false; + } + } + if (subtree_type_down) { + (*subtree_type_down)[subtree_type].insert(id); + } + return true; +} + +void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u16 v = 5; + encode(v, bl); + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + int32_t new_t = new_pool_max; + encode(new_t, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + // for encode(new_pools, bl); + __u32 n = new_pools.size(); + encode(n, bl); + for (const auto &new_pool : new_pools) { + n = new_pool.first; + encode(n, bl); + encode(new_pool.second, bl, 0); + } + // for encode(new_pool_names, bl); + n = new_pool_names.size(); + encode(n, bl); + + for (const auto &new_pool_name : new_pool_names) { + n = new_pool_name.first; + encode(n, bl); + encode(new_pool_name.second, bl); + } + // for encode(old_pools, bl); + n = old_pools.size(); + encode(n, bl); + for (auto &old_pool : old_pools) { + n = old_pool; + encode(n, bl); + } + encode(new_up_client, bl, 0); + { + // legacy is map<int32_t,uint8_t> + map<int32_t, uint8_t> os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! + uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + // for encode(new_pg_temp, bl); + n = new_pg_temp.size(); + encode(n, bl); + + for (const auto &pg_temp : new_pg_temp) { + old_pg_t opg = pg_temp.first.get_old_pg(); + encode(opg, bl); + encode(pg_temp.second, bl); + } +} + +void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + // base + __u16 v = 6; + encode(v, bl); + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + encode(new_up_client, bl, features); + { + map<int32_t, uint8_t> os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! 
+ uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + encode(new_hb_back_up, bl, features); + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blocklist, bl, features); + encode(old_blocklist, bl, features); + encode(new_up_cluster, bl, features); + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl, features); + encode(new_hb_front_up, bl, features); +} + +template<class T> +static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + encode(i.first, bl); + encode(i.second.legacy_addr(), bl, f); + } +} + +template<class T> +static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f) +{ + uint32_t n = m.size(); + encode(n, bl); + for (auto& i : m) { + if (i) { + encode(i->legacy_addr(), bl, f); + } else { + encode(entity_addr_t(), bl, f); + } + } +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. + ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional<ceph::buffer::list::contiguous_filler> crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + uint8_t v = 9; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 5; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 6; + } else if (!HAVE_FEATURE(features, SERVER_REEF)) { + v = 8; + } + ENCODE_START(v, 1, bl); // client-usable data + encode(fsid, bl); + encode(epoch, bl); + encode(modified, bl); + encode(new_pool_max, bl); + encode(new_flags, bl); + encode(fullmap, bl); + encode(crush, bl); + + encode(new_max_osd, bl); + encode(new_pools, bl, features); + encode(new_pool_names, bl); + encode(old_pools, bl); + if (v >= 7) { + encode(new_up_client, bl, features); + } else { + encode_addrvec_map_as_addr(new_up_client, bl, features); + } + if (v >= 5) { + encode(new_state, bl); + } else { + map<int32_t, uint8_t> os; + for (auto p : new_state) { + // new_state may only inculde some new flags(e.g., CEPH_OSD_NOOUT) + // that an old client could not understand. + // skip those! 
+ uint8_t s = p.second; + if (p.second != 0 && s == 0) + continue; + os[p.first] = s; + } + uint32_t n = os.size(); + encode(n, bl); + for (auto p : os) { + encode(p.first, bl); + encode(p.second, bl); + } + } + encode(new_weight, bl); + encode(new_pg_temp, bl); + encode(new_primary_temp, bl); + encode(new_primary_affinity, bl); + encode(new_erasure_code_profiles, bl); + encode(old_erasure_code_profiles, bl); + if (v >= 4) { + encode(new_pg_upmap, bl); + encode(old_pg_upmap, bl); + encode(new_pg_upmap_items, bl); + encode(old_pg_upmap_items, bl); + } + if (v >= 6) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 8) { + encode(new_last_up_change, bl); + encode(new_last_in_change, bl); + } + if (v >= 9) { + encode(new_pg_upmap_primary, bl); + encode(old_pg_upmap_primary, bl); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + uint8_t target_v = 9; // if bumping this, be aware of allow_crimson 12 + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 2; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + if (change_stretch_mode) { + target_v = std::max((uint8_t)10, target_v); + } + if (!new_range_blocklist.empty() || + !old_range_blocklist.empty()) { + target_v = std::max((uint8_t)11, target_v); + } + if (mutate_allow_crimson != mutate_allow_crimson_t::NONE) { + target_v = std::max((uint8_t)12, target_v); + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_back_up, bl, features); + } else { + encode(new_hb_back_up, bl, features); + } + encode(new_up_thru, bl); + encode(new_last_clean_interval, bl); + encode(new_lost, bl); + encode(new_blocklist, bl, features); + encode(old_blocklist, bl, features); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_up_cluster, bl, features); + } else { + encode(new_up_cluster, bl, features); + } + encode(cluster_snapshot, bl); + encode(new_uuid, bl); + encode(new_xinfo, bl, features); + if (target_v < 7) { + encode_addrvec_map_as_addr(new_hb_front_up, bl, features); + } else { + encode(new_hb_front_up, bl, features); + } + encode(features, bl); // NOTE: features arg, not the member + if (target_v >= 3) { + encode(new_nearfull_ratio, bl); + encode(new_full_ratio, bl); + encode(new_backfillfull_ratio, bl); + } + // 5 was string-based new_require_min_compat_client + if (target_v >= 6) { + encode(new_require_min_compat_client, bl); + encode(new_require_osd_release, bl); + } + if (target_v >= 8) { + encode(new_crush_node_flags, bl); + } + if (target_v >= 9) { + encode(new_device_class_flags, bl); + } + if (target_v >= 10) { + encode(change_stretch_mode, bl); + encode(new_stretch_bucket_count, bl); + encode(new_degraded_stretch_mode, bl); + encode(new_recovering_stretch_mode, bl); + encode(new_stretch_mode_bucket, bl); + encode(stretch_mode_enabled, bl); + } + if (target_v >= 11) { + encode(new_range_blocklist, bl, features); + encode(old_range_blocklist, bl, features); + } + if (target_v >= 12) { + encode(mutate_allow_crimson, bl); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + encode(full_crc, bl); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + ceph::buffer::list front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + inc_crc = front.crc32c(-1); + ceph::buffer::list tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + inc_crc = tail.crc32c(inc_crc); + 
ceph_le32 crc_le; + crc_le = inc_crc; + crc_filler->copy_in(4u, (char*)&crc_le); + have_crc = true; +} + +void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p) +{ + using ceph::decode; + __u32 n, t; + // base + __u16 v; + decode(v, p); + decode(fsid, p); + decode(epoch, p); + decode(modified, p); + if (v == 4 || v == 5) { + decode(n, p); + new_pool_max = n; + } else if (v >= 6) + decode(new_pool_max, p); + decode(new_flags, p); + decode(fullmap, p); + decode(crush, p); + + decode(new_max_osd, p); + if (v < 6) { + new_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pools[t], p); + } + } else { + decode(new_pools, p); + } + if (v == 5) { + new_pool_names.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(new_pool_names[t], p); + } + } else if (v >= 6) { + decode(new_pool_names, p); + } + if (v < 6) { + old_pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + old_pools.insert(t); + } + } else { + decode(old_pools, p); + } + decode(new_up_client, p); + { + map<int32_t,uint8_t> ns; + decode(ns, p); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, p); + + if (v < 6) { + new_pg_temp.clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ceph::decode_raw(opg, p); + decode(new_pg_temp[pg_t(opg)], p); + } + } else { + decode(new_pg_temp, p); + } + + // decode short map, too. + if (v == 5 && p.end()) + return; + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(new_hb_back_up, p); + if (v < 5) + decode(new_pool_names, p); + decode(new_up_thru, p); + decode(new_last_clean_interval, p); + decode(new_lost, p); + decode(new_blocklist, p); + decode(old_blocklist, p); + if (ev >= 6) + decode(new_up_cluster, p); + if (ev >= 7) + decode(cluster_snapshot, p); + if (ev >= 8) + decode(new_uuid, p); + if (ev >= 9) + decode(new_xinfo, p); + if (ev >= 10) + decode(new_hb_front_up, p); +} + +/* for a description of osdmap incremental versions, and when they were + * introduced, please refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the Incremental had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. 
+ */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + ceph::buffer::list crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + encode_features = 0; + if (struct_v >= 6) + encode_features = CEPH_FEATURE_PGID64; + else + encode_features = 0; + return; + } + { + DECODE_START(8, bl); // client-usable data + decode(fsid, bl); + decode(epoch, bl); + decode(modified, bl); + decode(new_pool_max, bl); + decode(new_flags, bl); + decode(fullmap, bl); + decode(crush, bl); + + decode(new_max_osd, bl); + decode(new_pools, bl); + decode(new_pool_names, bl); + decode(old_pools, bl); + decode(new_up_client, bl); + if (struct_v >= 5) { + decode(new_state, bl); + } else { + map<int32_t,uint8_t> ns; + decode(ns, bl); + for (auto q : ns) { + new_state[q.first] = q.second; + } + } + decode(new_weight, bl); + decode(new_pg_temp, bl); + decode(new_primary_temp, bl); + if (struct_v >= 2) + decode(new_primary_affinity, bl); + else + new_primary_affinity.clear(); + if (struct_v >= 3) { + decode(new_erasure_code_profiles, bl); + decode(old_erasure_code_profiles, bl); + } else { + new_erasure_code_profiles.clear(); + old_erasure_code_profiles.clear(); + } + if (struct_v >= 4) { + decode(new_pg_upmap, bl); + decode(old_pg_upmap, bl); + decode(new_pg_upmap_items, bl); + decode(old_pg_upmap_items, bl); + } + if (struct_v >= 6) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + if (struct_v >= 8) { + decode(new_last_up_change, bl); + decode(new_last_in_change, bl); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(10, bl); // extended, osd-only data + decode(new_hb_back_up, bl); + decode(new_up_thru, bl); + decode(new_last_clean_interval, bl); + decode(new_lost, bl); + decode(new_blocklist, bl); + decode(old_blocklist, bl); + decode(new_up_cluster, bl); + decode(cluster_snapshot, bl); + decode(new_uuid, bl); + decode(new_xinfo, bl); + decode(new_hb_front_up, bl); + if (struct_v >= 2) + decode(encode_features, bl); + else + encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC; + if (struct_v >= 3) { + decode(new_nearfull_ratio, bl); + decode(new_full_ratio, bl); + } else { + new_nearfull_ratio = -1; + new_full_ratio = -1; + } + if (struct_v >= 4) { + decode(new_backfillfull_ratio, bl); + } else { + new_backfillfull_ratio = -1; + } + if (struct_v == 5) { + string r; + decode(r, bl); + if (r.length()) { + new_require_min_compat_client = ceph_release_from_name(r); + } + } + if (struct_v >= 6) { + decode(new_require_min_compat_client, bl); + decode(new_require_osd_release, bl); + } else { + if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) { + // only for compat with post-kraken pre-luminous test clusters + new_require_osd_release = ceph_release_t::luminous; + new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) { + new_require_osd_release = ceph_release_t::kraken; + } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) { + new_require_osd_release = ceph_release_t::jewel; + } else { + new_require_osd_release = ceph_release_t::unknown; + } + } + if (struct_v >= 8) { + decode(new_crush_node_flags, bl); + } + if (struct_v >= 9) { + decode(new_device_class_flags, bl); + } + if (struct_v >= 10) { + decode(change_stretch_mode, bl); + decode(new_stretch_bucket_count, bl); + decode(new_degraded_stretch_mode, bl); + decode(new_recovering_stretch_mode, bl); + 
decode(new_stretch_mode_bucket, bl); + decode(stretch_mode_enabled, bl); + } + if (struct_v >= 11) { + decode(new_range_blocklist, bl); + decode(old_range_blocklist, bl); + } + if (struct_v >= 12) { + decode(mutate_allow_crimson, bl); + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + have_crc = true; + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(inc_crc, bl); + tail_offset = bl.get_off(); + decode(full_crc, bl); + } else { + have_crc = false; + full_crc = 0; + inc_crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (have_crc) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + ceph::buffer::list tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (inc_crc != actual) { + ostringstream ss; + ss << "bad crc, actual " << actual << " != expected " << inc_crc; + string s = ss.str(); + throw ceph::buffer::malformed_input(s.c_str()); + } + } +} + +void OSDMap::Incremental::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + f->dump_stream("fsid") << fsid; + f->dump_stream("modified") << modified; + f->dump_stream("new_last_up_change") << new_last_up_change; + f->dump_stream("new_last_in_change") << new_last_in_change; + f->dump_int("new_pool_max", new_pool_max); + f->dump_int("new_flags", new_flags); + f->dump_float("new_full_ratio", new_full_ratio); + f->dump_float("new_nearfull_ratio", new_nearfull_ratio); + f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio); + f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client)); + f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release)); + f->dump_unsigned("mutate_allow_crimson", static_cast<unsigned>(mutate_allow_crimson)); + + if (fullmap.length()) { + f->open_object_section("full_map"); + OSDMap full; + ceph::buffer::list fbl = fullmap; // kludge around constness. + auto p = fbl.cbegin(); + full.decode(p); + full.dump(f); + f->close_section(); + } + if (crush.length()) { + f->open_object_section("crush"); + CrushWrapper c; + ceph::buffer::list tbl = crush; // kludge around constness. 
+ auto p = tbl.cbegin(); + c.decode(p); + c.dump(f); + f->close_section(); + } + + f->dump_int("new_max_osd", new_max_osd); + + f->open_array_section("new_pools"); + + for (const auto &new_pool : new_pools) { + f->open_object_section("pool"); + f->dump_int("pool", new_pool.first); + new_pool.second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_pool_names"); + + for (const auto &new_pool_name : new_pool_names) { + f->open_object_section("pool_name"); + f->dump_int("pool", new_pool_name.first); + f->dump_string("name", new_pool_name.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pools"); + + for (const auto &old_pool : old_pools) + f->dump_int("pool", old_pool); + f->close_section(); + + f->open_array_section("new_up_osds"); + + for (const auto &upclient : new_up_client) { + f->open_object_section("osd"); + f->dump_int("osd", upclient.first); + f->dump_stream("public_addr") << upclient.second.legacy_addr(); + f->dump_object("public_addrs", upclient.second); + if (auto p = new_up_cluster.find(upclient.first); + p != new_up_cluster.end()) { + f->dump_stream("cluster_addr") << p->second.legacy_addr(); + f->dump_object("cluster_addrs", p->second); + } + if (auto p = new_hb_back_up.find(upclient.first); + p != new_hb_back_up.end()) { + f->dump_object("heartbeat_back_addrs", p->second); + } + if (auto p = new_hb_front_up.find(upclient.first); + p != new_hb_front_up.end()) { + f->dump_object("heartbeat_front_addrs", p->second); + } + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_weight"); + + for (const auto &weight : new_weight) { + f->open_object_section("osd"); + f->dump_int("osd", weight.first); + f->dump_int("weight", weight.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_state_xor"); + for (const auto &ns : new_state) { + f->open_object_section("osd"); + f->dump_int("osd", ns.first); + set<string> st; + calc_state_set(new_state.find(ns.first)->second, st); + f->open_array_section("state_xor"); + for (auto &state : st) + f->dump_string("state", state); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_pg_temp"); + + for (const auto &pg_temp : new_pg_temp) { + f->open_object_section("pg"); + f->dump_stream("pgid") << pg_temp.first; + f->open_array_section("osds"); + + for (const auto &osd : pg_temp.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("primary_temp"); + + for (const auto &primary_temp : new_primary_temp) { + f->dump_stream("pgid") << primary_temp.first; + f->dump_int("osd", primary_temp.second); + } + f->close_section(); // primary_temp + + f->open_array_section("new_pg_upmap"); + for (auto& i : new_pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("osds"); + for (auto osd : i.second) { + f->dump_int("osd", osd); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap"); + for (auto& i : old_pg_upmap) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + f->open_array_section("new_pg_upmap_items"); + for (auto& i : new_pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << i.first; + f->open_array_section("mappings"); + for (auto& p : i.second) { + f->open_object_section("mapping"); + f->dump_int("from", p.first); + f->dump_int("to", p.second); + 
f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("old_pg_upmap_items"); + for (auto& i : old_pg_upmap_items) { + f->dump_stream("pgid") << i; + } + f->close_section(); + + // dump upmap_primaries + f->open_array_section("new_pg_upmap_primaries"); + for (auto& [pg, osd] : new_pg_upmap_primary) { + f->open_object_section("primary_mapping"); + f->dump_stream("pgid") << pg; + f->dump_int("primary_osd", osd); + f->close_section(); + } + f->close_section(); // new_pg_upmap_primaries + + // dump old_pg_upmap_primaries (removed primary mappings) + f->open_array_section("old_pg_upmap_primaries"); + for (auto& pg : old_pg_upmap_primary) { + f->dump_stream("pgid") << pg; + } + f->close_section(); // old_pg_upmap_primaries + + f->open_array_section("new_up_thru"); + + for (const auto &up_thru : new_up_thru) { + f->open_object_section("osd"); + f->dump_int("osd", up_thru.first); + f->dump_int("up_thru", up_thru.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_lost"); + + for (const auto &lost : new_lost) { + f->open_object_section("osd"); + f->dump_int("osd", lost.first); + f->dump_int("epoch_lost", lost.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_last_clean_interval"); + + for (const auto &last_clean_interval : new_last_clean_interval) { + f->open_object_section("osd"); + f->dump_int("osd", last_clean_interval.first); + f->dump_int("first", last_clean_interval.second.first); + f->dump_int("last", last_clean_interval.second.second); + f->close_section(); + } + f->close_section(); + + f->open_array_section("new_blocklist"); + for (const auto &blist : new_blocklist) { + stringstream ss; + ss << blist.first; + f->dump_stream(ss.str().c_str()) << blist.second; + } + f->close_section(); + f->open_array_section("old_blocklist"); + for (const auto &blist : old_blocklist) + f->dump_stream("addr") << blist; + f->close_section(); + f->open_array_section("new_range_blocklist"); + for (const auto &blist : new_range_blocklist) { + stringstream ss; + ss << blist.first; + f->dump_stream(ss.str().c_str()) << blist.second; + } + f->close_section(); + f->open_array_section("old_range_blocklist"); + for (const auto &blist : old_range_blocklist) + f->dump_stream("addr") << blist; + f->close_section(); + + f->open_array_section("new_xinfo"); + for (const auto &xinfo : new_xinfo) { + f->open_object_section("xinfo"); + f->dump_int("osd", xinfo.first); + xinfo.second.dump(f); + f->close_section(); + } + f->close_section(); + + if (cluster_snapshot.size()) + f->dump_string("cluster_snapshot", cluster_snapshot); + + f->open_array_section("new_uuid"); + for (const auto &uuid : new_uuid) { + f->open_object_section("osd"); + f->dump_int("osd", uuid.first); + f->dump_stream("uuid") << uuid.second; + f->close_section(); + } + f->close_section(); + + OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f); + f->open_array_section("old_erasure_code_profiles"); + for (const auto &erasure_code_profile : old_erasure_code_profiles) { + f->dump_string("old", erasure_code_profile); + } + f->close_section(); + + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + 
f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->open_array_section("new_crush_node_flags"); + for (auto& i : new_crush_node_flags) { + f->open_object_section("node"); + f->dump_int("id", i.first); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_array_section("new_device_class_flags"); + for (auto& i : new_device_class_flags) { + f->open_object_section("device_class"); + f->dump_int("id", i.first); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("stretch_mode"); + { + f->dump_bool("change_stretch_mode", change_stretch_mode); + f->dump_bool("stretch_mode_enabled", stretch_mode_enabled); + f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count); + f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode); + f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode); + f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket); + } + f->close_section(); + f->close_section(); +} + +void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o) +{ + o.push_back(new Incremental); +} + +// ---------------------------------- +// OSDMap + +void OSDMap::set_epoch(epoch_t e) +{ + epoch = e; + for (auto &pool : pools) + pool.second.last_change = e; +} + +OSDMap::range_bits::range_bits() : ipv6(false) { + memset(&bits, 0, sizeof(bits)); +} + +OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) { + memset(&bits, 0, sizeof(bits)); + parse(addr); +} + +void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr, + uint64_t *upper, uint64_t *lower) +{ + *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 | + ((uint64_t)(ntohl(*(uint32_t*)(&addr[4])))); + *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 | + ((uint64_t)(ntohl(*(uint32_t*)(&addr[12])))); +} + +void OSDMap::range_bits::parse(const entity_addr_t& addr) { + // parse it into meaningful data + if (addr.is_ipv6()) { + get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, + &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits); + int32_t lower_shift = std::min(128- + static_cast<int32_t>(addr.get_nonce()), 64); + int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64 + static_cast<int32_t>(addr.get_nonce()), 0); + + auto get_mask = [](int32_t shift) -> uint64_t { + if (shift >= 0 && shift < 64) { + return UINT64_MAX << shift; + } + return 0; + }; + + bits.ipv6.lower_mask = get_mask(lower_shift); + bits.ipv6.upper_mask = get_mask(upper_shift); + ipv6 = true; + } else if (addr.is_ipv4()) { + bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr); + if (addr.get_nonce() > 0) { + bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce()); + } else { + bits.ipv4.mask = 0; + } + } else { + // uh... 
+ } +} + +bool OSDMap::range_bits::matches(const entity_addr_t& addr) const { + if (addr.is_ipv4() && !ipv6) { + return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) == + (bits.ipv4.ip_32_bits & bits.ipv4.mask)); + } else if (addr.is_ipv6() && ipv6) { + uint64_t upper_64, lower_64; + get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64); + return (((upper_64 & bits.ipv6.upper_mask) == + (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) && + ((lower_64 & bits.ipv6.lower_mask) == + (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask))); + } + return false; +} + +bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const +{ + if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl; + if (blocklist.empty() && range_blocklist.empty()) { + if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl; + return false; + } + + // all blocklist entries are type ANY for nautilus+ + // FIXME: avoid this copy! + entity_addr_t a = orig; + if (require_osd_release < ceph_release_t::nautilus) { + a.set_type(entity_addr_t::TYPE_LEGACY); + } else { + a.set_type(entity_addr_t::TYPE_ANY); + } + + // this specific instance? + if (blocklist.count(a)) { + if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl; + return true; + } + + // is entire ip blocklisted? + if (a.is_ip()) { + a.set_port(0); + a.set_nonce(0); + if (blocklist.count(a)) { + if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl; + return true; + } + } + + // is it in a blocklisted range? + for (const auto& i : calculated_ranges) { + bool blocked = i.second.matches(a); + if (blocked) { + if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl; + return true; + } + } + + if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl; + return false; +} + +bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const +{ + if (blocklist.empty() && range_blocklist.empty()) + return false; + + for (auto& a : av.v) { + if (is_blocklisted(a, cct)) { + return true; + } + } + + return false; +} + +void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl, + std::list<std::pair<entity_addr_t,utime_t> > *rl) const +{ + std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl)); + std::copy(range_blocklist.begin(), range_blocklist.end(), + std::back_inserter(*rl)); +} + +void OSDMap::get_blocklist(std::set<entity_addr_t> *bl, + std::set<entity_addr_t> *rl) const +{ + for (const auto &i : blocklist) { + bl->insert(i.first); + } + for (const auto &i : range_blocklist) { + rl->insert(i.first); + } +} + +void OSDMap::set_max_osd(int m) +{ + max_osd = m; + osd_state.resize(max_osd, 0); + osd_weight.resize(max_osd, CEPH_OSD_OUT); + osd_info.resize(max_osd); + osd_xinfo.resize(max_osd); + osd_addrs->client_addrs.resize(max_osd); + osd_addrs->cluster_addrs.resize(max_osd); + osd_addrs->hb_back_addrs.resize(max_osd); + osd_addrs->hb_front_addrs.resize(max_osd); + osd_uuid->resize(max_osd); + if (osd_primary_affinity) + osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + + calc_num_osds(); +} + +int OSDMap::calc_num_osds() +{ + num_osd = 0; + num_up_osd = 0; + num_in_osd = 0; + for (int i=0; i<max_osd; i++) { + if (osd_state[i] & CEPH_OSD_EXISTS) { + ++num_osd; + if (osd_state[i] & CEPH_OSD_UP) { + ++num_up_osd; + } + if (get_weight(i) != CEPH_OSD_OUT) { + ++num_in_osd; + } + } + } + return num_osd; +} + +void OSDMap::get_full_pools(CephContext *cct, + set<int64_t> *full, + set<int64_t> *backfillfull, + set<int64_t> 
*nearfull) const +{ + ceph_assert(full); + ceph_assert(backfillfull); + ceph_assert(nearfull); + full->clear(); + backfillfull->clear(); + nearfull->clear(); + + vector<int> full_osds; + vector<int> backfillfull_osds; + vector<int> nearfull_osds; + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full_osds.push_back(i); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfillfull_osds.push_back(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull_osds.push_back(i); + } + } + + for (auto i: full_osds) { + get_pool_ids_by_osd(cct, i, full); + } + for (auto i: backfillfull_osds) { + get_pool_ids_by_osd(cct, i, backfillfull); + } + for (auto i: nearfull_osds) { + get_pool_ids_by_osd(cct, i, nearfull); + } +} + +void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill, + set<int> *nearfull) const +{ + full->clear(); + backfill->clear(); + nearfull->clear(); + for (int i = 0; i < max_osd; ++i) { + if (exists(i) && is_up(i) && is_in(i)) { + if (osd_state[i] & CEPH_OSD_FULL) + full->emplace(i); + else if (osd_state[i] & CEPH_OSD_BACKFILLFULL) + backfill->emplace(i); + else if (osd_state[i] & CEPH_OSD_NEARFULL) + nearfull->emplace(i); + } + } +} + +void OSDMap::get_all_osds(set<int32_t>& ls) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i)) + ls.insert(i); +} + +void OSDMap::get_up_osds(set<int32_t>& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (is_up(i)) + ls.insert(i); + } +} + +void OSDMap::get_out_existing_osds(set<int32_t>& ls) const +{ + for (int i = 0; i < max_osd; i++) { + if (exists(i) && get_weight(i) == CEPH_OSD_OUT) + ls.insert(i); + } +} + +void OSDMap::get_flag_set(set<string> *flagset) const +{ + for (unsigned i = 0; i < sizeof(flags) * 8; ++i) { + if (flags & (1<<i)) { + flagset->insert(get_flag_string(flags & (1<<i))); + } + } +} + +void OSDMap::calc_state_set(int state, set<string>& st) +{ + unsigned t = state; + for (unsigned s = 1; t; s <<= 1) { + if (t & s) { + t &= ~s; + st.insert(ceph_osd_state_name(s)); + } + } +} + +void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const +{ + float max = 0; + for (const auto &weight : weights) { + if (weight.second > max) + max = weight.second; + } + + for (const auto &weight : weights) { + inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN); + } +} + +int OSDMap::identify_osd(const entity_addr_t& addr) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).contains(addr) || + get_cluster_addrs(i).contains(addr))) + return i; + return -1; +} + +int OSDMap::identify_osd(const uuid_d& u) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && get_uuid(i) == u) + return i; + return -1; +} + +int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).contains(addr) || + get_cluster_addrs(i).contains(addr) || + get_hb_back_addrs(i).contains(addr) || + get_hb_front_addrs(i).contains(addr))) + return i; + return -1; +} + +int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const +{ + for (int i=0; i<max_osd; i++) + if (exists(i) && (get_addrs(i).is_same_host(ip) || + get_cluster_addrs(i).is_same_host(ip))) + return i; + return -1; +} + + +uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const +{ + uint64_t features = 0; // things we actually have + uint64_t mask = 0; // things we could have + + if (crush->has_nondefault_tunables()) + features |= 
CEPH_FEATURE_CRUSH_TUNABLES; + if (crush->has_nondefault_tunables2()) + features |= CEPH_FEATURE_CRUSH_TUNABLES2; + if (crush->has_nondefault_tunables3()) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->has_v4_buckets()) + features |= CEPH_FEATURE_CRUSH_V4; + if (crush->has_nondefault_tunables5()) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + if (crush->has_incompat_choose_args()) { + features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; + } + mask |= CEPH_FEATURES_CRUSH; + + if (!pg_upmap.empty() || !pg_upmap_items.empty() || !pg_upmap_primaries.empty()) + features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP; + + for (auto &pool: pools) { + if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) { + features |= CEPH_FEATURE_OSDHASHPSPOOL; + } + if (!pool.second.tiers.empty() || + pool.second.is_tier()) { + features |= CEPH_FEATURE_OSD_CACHEPOOL; + } + int ruleid = pool.second.get_crush_rule(); + if (ruleid >= 0) { + if (crush->is_v2_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_V2; + if (crush->is_v3_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES3; + if (crush->is_v5_rule(ruleid)) + features |= CEPH_FEATURE_CRUSH_TUNABLES5; + } + } + mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL; + + if (osd_primary_affinity) { + for (int i = 0; i < max_osd; ++i) { + if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + break; + } + } + } + mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY; + + if (entity_type == CEPH_ENTITY_TYPE_OSD) { + const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL; + if (require_osd_release >= ceph_release_t::jewel) { + features |= jewel_features; + } + mask |= jewel_features; + + const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN + | CEPH_FEATURE_MSG_ADDR2; + if (require_osd_release >= ceph_release_t::kraken) { + features |= kraken_features; + } + mask |= kraken_features; + + if (stretch_mode_enabled) { + features |= CEPH_FEATUREMASK_STRETCH_MODE; + mask |= CEPH_FEATUREMASK_STRETCH_MODE; + } + } + + if (require_min_compat_client >= ceph_release_t::nautilus) { + // if min_compat_client is >= nautilus, require v2 cephx signatures + // from everyone + features |= CEPH_FEATUREMASK_CEPHX_V2; + } else if (require_osd_release >= ceph_release_t::nautilus && + entity_type == CEPH_ENTITY_TYPE_OSD) { + // if osds are >= nautilus, at least require the signatures from them + features |= CEPH_FEATUREMASK_CEPHX_V2; + } + mask |= CEPH_FEATUREMASK_CEPHX_V2; + + if (pmask) + *pmask = mask; + return features; +} + +ceph_release_t OSDMap::get_min_compat_client() const +{ + uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr); + + if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43 + HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28 + return ceph_release_t::luminous; // v12.2.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737 + return ceph_release_t::jewel; // v10.2.0 + } + if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56 + return ceph_release_t::hammer; // v0.94.0 + } + if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624 + HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d + HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5 + return ceph_release_t::firefly; // v0.80.0 + } + if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff + HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f + return ceph_release_t::dumpling; // v0.67.0 + } + if (HAVE_FEATURE(f, 
CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af + return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af + } + return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af +} + +ceph_release_t OSDMap::get_require_min_compat_client() const +{ + return require_min_compat_client; +} + +void OSDMap::_calc_up_osd_features() +{ + bool first = true; + cached_up_osd_features = 0; + for (int osd = 0; osd < max_osd; ++osd) { + if (!is_up(osd)) + continue; + const osd_xinfo_t &xi = get_xinfo(osd); + if (xi.features == 0) + continue; // bogus xinfo, maybe #20751 or similar, skipping + if (first) { + cached_up_osd_features = xi.features; + first = false; + } else { + cached_up_osd_features &= xi.features; + } + } +} + +uint64_t OSDMap::get_up_osd_features() const +{ + return cached_up_osd_features; +} + +void OSDMap::dedup(const OSDMap *o, OSDMap *n) +{ + using ceph::encode; + if (o->epoch == n->epoch) + return; + + int diff = 0; + + // do addrs match? + if (o->max_osd != n->max_osd) + diff++; + for (int i = 0; i < o->max_osd && i < n->max_osd; i++) { + if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] && + *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i]) + n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i]; + else + diff++; + if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] && + *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i]) + n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] && + *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i]) + n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i]; + else + diff++; + if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] && + *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i]) + n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i]; + else + diff++; + } + if (diff == 0) { + // zoinks, no differences at all! + n->osd_addrs = o->osd_addrs; + } + + // does crush match? + ceph::buffer::list oc, nc; + encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT); + encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT); + if (oc.contents_equal(nc)) { + n->crush = o->crush; + } + + // does pg_temp match? + if (*o->pg_temp == *n->pg_temp) + n->pg_temp = o->pg_temp; + + // does primary_temp match? + if (o->primary_temp->size() == n->primary_temp->size()) { + if (*o->primary_temp == *n->primary_temp) + n->primary_temp = o->primary_temp; + } + + // do uuids match? + if (o->osd_uuid->size() == n->osd_uuid->size() && + *o->osd_uuid == *n->osd_uuid) + n->osd_uuid = o->osd_uuid; +} + +void OSDMap::clean_temps(CephContext *cct, + const OSDMap& oldmap, + const OSDMap& nextmap, + Incremental *pending_inc) +{ + ldout(cct, 10) << __func__ << dendl; + + for (auto pg : *nextmap.pg_temp) { + // if pool does not exist, remove any existing pg_temps associated with + // it. we don't care about pg_temps on the pending_inc either; if there + // are new_pg_temp entries on the pending, clear them out just as well. 
+ if (!nextmap.have_pg_pool(pg.first.pool())) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " for nonexistent pool " << pg.first.pool() << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + if (!nextmap.pg_exists(pg.first)) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " for nonexistent pg " << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // all osds down? + unsigned num_up = 0; + for (auto o : pg.second) { + if (!nextmap.is_down(o)) { + ++num_up; + break; + } + } + if (num_up == 0) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first + << " with all down osds" << pg.second << dendl; + pending_inc->new_pg_temp[pg.first].clear(); + continue; + } + // redundant pg_temp? + vector<int> raw_up; + int primary; + nextmap.pg_to_raw_up(pg.first, &raw_up, &primary); + bool remove = false; + if (raw_up == pg.second) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " that matches raw_up mapping" << dendl; + remove = true; + } + // oversized pg_temp? + if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) { + ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " " + << pg.second << " exceeds pool size" << dendl; + remove = true; + } + if (remove) { + if (oldmap.pg_temp->count(pg.first)) + pending_inc->new_pg_temp[pg.first].clear(); + else + pending_inc->new_pg_temp.erase(pg.first); + } + } + + for (auto &pg : *nextmap.primary_temp) { + // primary down? + if (nextmap.is_down(pg.second)) { + ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first + << " to down " << pg.second << dendl; + pending_inc->new_primary_temp[pg.first] = -1; + continue; + } + // redundant primary_temp? 
+ vector<int> real_up, templess_up; + int real_primary, templess_primary; + pg_t pgid = pg.first; + nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary); + nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary); + if (real_primary == templess_primary){ + ldout(cct, 10) << __func__ << " removing primary_temp " + << pgid << " -> " << real_primary + << " (unnecessary/redundant)" << dendl; + if (oldmap.primary_temp->count(pgid)) + pending_inc->new_primary_temp[pgid] = -1; + else + pending_inc->new_primary_temp.erase(pgid); + } + } +} + +void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const +{ + upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size()); + for (auto& p : pg_upmap) + upmap_pgs->push_back(p.first); + for (auto& p : pg_upmap_items) + upmap_pgs->push_back(p.first); +} + +bool OSDMap::check_pg_upmaps( + CephContext *cct, + const vector<pg_t>& to_check, + vector<pg_t> *to_cancel, + map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const +{ + bool any_change = false; + map<int, map<int, float>> rule_weight_map; + for (auto& pg : to_check) { + const pg_pool_t *pi = get_pg_pool(pg.pool()); + if (!pi || pg.ps() >= pi->get_pg_num_pending()) { + ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source" + << dendl; + to_cancel->push_back(pg); + continue; + } + if (pi->is_pending_merge(pg, nullptr)) { + ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge" + << dendl; + to_cancel->push_back(pg); + continue; + } + vector<int> raw, up; + pg_to_raw_upmap(pg, &raw, &up); + auto crush_rule = get_pg_pool_crush_rule(pg); + auto r = crush->verify_upmap(cct, + crush_rule, + get_pg_pool_size(pg), + up); + if (r < 0) { + ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg + << " returning " << r + << dendl; + to_cancel->push_back(pg); + continue; + } + // below we check against crush-topology changing.. 
+ map<int, float> weight_map; + auto it = rule_weight_map.find(crush_rule); + if (it == rule_weight_map.end()) { + auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map); + if (r < 0) { + lderr(cct) << __func__ << " unable to get crush weight_map for " + << "crush_rule " << crush_rule + << dendl; + continue; + } + rule_weight_map[crush_rule] = weight_map; + } else { + weight_map = it->second; + } + ldout(cct, 10) << __func__ << " pg " << pg + << " weight_map " << weight_map + << dendl; + for (auto osd : up) { + auto it = weight_map.find(osd); + if (it == weight_map.end()) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has " + << "been moved out of the specific crush-tree" + << dendl; + to_cancel->push_back(pg); + break; + } + auto adjusted_weight = get_weightf(it->first) * it->second; + if (adjusted_weight == 0) { + ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd + << " is out/crush-out" + << dendl; + to_cancel->push_back(pg); + break; + } + } + if (!to_cancel->empty() && to_cancel->back() == pg) + continue; + // okay, upmap is valid + // continue to check if it is still necessary + auto i = pg_upmap.find(pg); + if (i != pg_upmap.end()) { + if (i->second == raw) { + ldout(cct, 10) << __func__ << "removing redundant pg_upmap " << i->first << " " + << i->second << dendl; + to_cancel->push_back(pg); + continue; + } + if ((int)i->second.size() != get_pg_pool_size(pg)) { + ldout(cct, 10) << __func__ << "removing pg_upmap " << i->first << " " + << i->second << " != pool size " << get_pg_pool_size(pg) + << dendl; + to_cancel->push_back(pg); + continue; + } + } + auto j = pg_upmap_items.find(pg); + if (j != pg_upmap_items.end()) { + mempool::osdmap::vector<pair<int,int>> newmap; + for (auto& p : j->second) { + auto osd_from = p.first; + auto osd_to = p.second; + if (std::find(raw.begin(), raw.end(), osd_from) == raw.end()) { + // cancel mapping if source osd does not exist anymore + ldout(cct, 20) << __func__ << " pg_upmap_items (source osd does not exist) " << pg_upmap_items << dendl; + continue; + } + if (osd_to != CRUSH_ITEM_NONE && osd_to < max_osd && + osd_to >= 0 && osd_weight[osd_to] == 0) { + // cancel mapping if target osd is out + ldout(cct, 20) << __func__ << " pg_upmap_items (target osd is out) " << pg_upmap_items << dendl; + continue; + } + newmap.push_back(p); + } + if (newmap.empty()) { + ldout(cct, 10) << __func__ << " removing no-op pg_upmap_items " + << j->first << " " << j->second + << dendl; + to_cancel->push_back(pg); + } else if (newmap != j->second) { + // check partial no-op here. 
+ ldout(cct, 10) << __func__ << " simplifying partially no-op pg_upmap_items " + << j->first << " " << j->second + << " -> " << newmap + << dendl; + to_remap->insert({pg, newmap}); + any_change = true; + } + } + } + any_change = any_change || !to_cancel->empty(); + return any_change; +} + +void OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc, + const vector<pg_t>& to_cancel, + const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const +{ + for (auto &pg: to_cancel) { + auto i = pending_inc->new_pg_upmap.find(pg); + if (i != pending_inc->new_pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap entry " + << i->first << "->" << i->second + << dendl; + pending_inc->new_pg_upmap.erase(i); + } + auto j = pg_upmap.find(pg); + if (j != pg_upmap.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry " + << j->first << "->" << j->second + << dendl; + pending_inc->old_pg_upmap.insert(pg); + } + auto p = pending_inc->new_pg_upmap_items.find(pg); + if (p != pending_inc->new_pg_upmap_items.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap_items entry " + << p->first << "->" << p->second + << dendl; + pending_inc->new_pg_upmap_items.erase(p); + } + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + ldout(cct, 10) << __func__ << " cancel invalid " + << "pg_upmap_items entry " + << q->first << "->" << q->second + << dendl; + pending_inc->old_pg_upmap_items.insert(pg); + } + } + for (auto& i : to_remap) + pending_inc->new_pg_upmap_items[i.first] = i.second; +} + +bool OSDMap::clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc) const +{ + ldout(cct, 10) << __func__ << dendl; + vector<pg_t> to_check; + vector<pg_t> to_cancel; + map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap; + + get_upmap_pgs(&to_check); + auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); + clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap); + //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked + // and cleaned in the same way as pg_upmap. This is not critical since invalid + // pg_upmap_primaries are never applied, (the final check is in _apply_upmap). + return any_change; +} + +int OSDMap::apply_incremental(const Incremental &inc) +{ + new_blocklist_entries = false; + if (inc.epoch == 1) + fsid = inc.fsid; + else if (inc.fsid != fsid) + return -EINVAL; + + ceph_assert(inc.epoch == epoch+1); + + epoch++; + modified = inc.modified; + + // full map? + if (inc.fullmap.length()) { + ceph::buffer::list bl(inc.fullmap); + decode(bl); + return 0; + } + + // nope, incremental. + if (inc.new_flags >= 0) { + flags = inc.new_flags; + // the below is just to cover a newly-upgraded luminous mon + // cluster that has to set require_jewel_osds or + // require_kraken_osds before the osds can be upgraded to + // luminous. 
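+ // i.e. translate the legacy REQUIRE_* flags into the corresponding
+ // require_osd_release value below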
+ if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + if (require_osd_release < ceph_release_t::kraken) { + require_osd_release = ceph_release_t::kraken; + } + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + if (require_osd_release < ceph_release_t::jewel) { + require_osd_release = ceph_release_t::jewel; + } + } + } + + if (inc.new_max_osd >= 0) + set_max_osd(inc.new_max_osd); + + if (inc.new_pool_max != -1) + pool_max = inc.new_pool_max; + + for (const auto &pool : inc.new_pools) { + pools[pool.first] = pool.second; + pools[pool.first].last_change = epoch; + } + + new_removed_snaps = inc.new_removed_snaps; + new_purged_snaps = inc.new_purged_snaps; + for (auto p = new_removed_snaps.begin(); + p != new_removed_snaps.end(); + ++p) { + removed_snaps_queue[p->first].union_of(p->second); + } + for (auto p = new_purged_snaps.begin(); + p != new_purged_snaps.end(); + ++p) { + auto q = removed_snaps_queue.find(p->first); + ceph_assert(q != removed_snaps_queue.end()); + q->second.subtract(p->second); + if (q->second.empty()) { + removed_snaps_queue.erase(q); + } + } + + if (inc.new_last_up_change != utime_t()) { + last_up_change = inc.new_last_up_change; + } + if (inc.new_last_in_change != utime_t()) { + last_in_change = inc.new_last_in_change; + } + + for (const auto &pname : inc.new_pool_names) { + auto pool_name_entry = pool_name.find(pname.first); + if (pool_name_entry != pool_name.end()) { + name_pool.erase(pool_name_entry->second); + pool_name_entry->second = pname.second; + } else { + pool_name[pname.first] = pname.second; + } + name_pool[pname.second] = pname.first; + } + + for (const auto &pool : inc.old_pools) { + pools.erase(pool); + name_pool.erase(pool_name[pool]); + pool_name.erase(pool); + } + + for (const auto &weight : inc.new_weight) { + set_weight(weight.first, weight.second); + + // if we are marking in, clear the AUTOOUT and NEW bits, and clear + // xinfo old_weight. + if (weight.second) { + osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW); + osd_xinfo[weight.first].old_weight = 0; + } + } + + for (const auto &primary_affinity : inc.new_primary_affinity) { + set_primary_affinity(primary_affinity.first, primary_affinity.second); + } + + // erasure_code_profiles + for (const auto &profile : inc.old_erasure_code_profiles) + erasure_code_profiles.erase(profile); + + for (const auto &profile : inc.new_erasure_code_profiles) { + set_erasure_code_profile(profile.first, profile.second); + } + + // up/down + for (const auto &state : inc.new_state) { + const auto osd = state.first; + int s = state.second ? state.second : CEPH_OSD_UP; + if ((osd_state[osd] & CEPH_OSD_UP) && + (s & CEPH_OSD_UP)) { + osd_info[osd].down_at = epoch; + osd_xinfo[osd].down_stamp = modified; + } + if ((osd_state[osd] & CEPH_OSD_EXISTS) && + (s & CEPH_OSD_EXISTS)) { + // osd is destroyed; clear out anything interesting. 
+ (*osd_uuid)[osd] = uuid_d(); + osd_info[osd] = osd_info_t(); + osd_xinfo[osd] = osd_xinfo_t(); + set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + osd_addrs->client_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t()); + osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t()); + osd_state[osd] = 0; + } else { + osd_state[osd] ^= s; + } + } + + for (const auto &client : inc.new_up_client) { + osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + osd_state[client.first] &= ~CEPH_OSD_STOP; // if any + osd_addrs->client_addrs[client.first].reset( + new entity_addrvec_t(client.second)); + osd_addrs->hb_back_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second)); + osd_addrs->hb_front_addrs[client.first].reset( + new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second)); + + osd_info[client.first].up_from = epoch; + } + + for (const auto &cluster : inc.new_up_cluster) + osd_addrs->cluster_addrs[cluster.first].reset( + new entity_addrvec_t(cluster.second)); + + // info + for (const auto &thru : inc.new_up_thru) + osd_info[thru.first].up_thru = thru.second; + + for (const auto &interval : inc.new_last_clean_interval) { + osd_info[interval.first].last_clean_begin = interval.second.first; + osd_info[interval.first].last_clean_end = interval.second.second; + } + + for (const auto &lost : inc.new_lost) + osd_info[lost.first].lost_at = lost.second; + + // xinfo + for (const auto &xinfo : inc.new_xinfo) + osd_xinfo[xinfo.first] = xinfo.second; + + // uuid + for (const auto &uuid : inc.new_uuid) + (*osd_uuid)[uuid.first] = uuid.second; + + // pg rebuild + for (const auto &pg : inc.new_pg_temp) { + if (pg.second.empty()) + pg_temp->erase(pg.first); + else + pg_temp->set(pg.first, pg.second); + } + if (!inc.new_pg_temp.empty()) { + // make sure pg_temp is efficiently stored + pg_temp->rebuild(); + } + + for (const auto &pg : inc.new_primary_temp) { + if (pg.second == -1) + primary_temp->erase(pg.first); + else + (*primary_temp)[pg.first] = pg.second; + } + + for (auto& p : inc.new_pg_upmap) { + pg_upmap[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap) { + pg_upmap.erase(pg); + } + for (auto& p : inc.new_pg_upmap_items) { + pg_upmap_items[p.first] = p.second; + } + for (auto& pg : inc.old_pg_upmap_items) { + pg_upmap_items.erase(pg); + } + + for (auto& [pg, prim] : inc.new_pg_upmap_primary) { + pg_upmap_primaries[pg] = prim; + } + for (auto& pg : inc.old_pg_upmap_primary) { + pg_upmap_primaries.erase(pg); + } + + // blocklist + if (!inc.new_blocklist.empty()) { + blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end()); + new_blocklist_entries = true; + } + for (const auto &addr : inc.old_blocklist) + blocklist.erase(addr); + + for (const auto& addr_p : inc.new_range_blocklist) { + range_blocklist.insert(addr_p); + calculated_ranges.emplace(addr_p.first, addr_p.first); + new_blocklist_entries = true; + } + for (const auto &addr : inc.old_range_blocklist) { + calculated_ranges.erase(addr); + range_blocklist.erase(addr); + } + + for (auto& i : inc.new_crush_node_flags) { + if (i.second) { + crush_node_flags[i.first] = i.second; + } else { + crush_node_flags.erase(i.first); + } + } + + for (auto& i : inc.new_device_class_flags) { + if (i.second) { + device_class_flags[i.first] = i.second; + } else { + device_class_flags.erase(i.first); + } + } + + // cluster snapshot? 
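+ // (if the incremental carries a snapshot name it is recorded together with
+ // its epoch; otherwise any previously recorded snapshot marker is cleared)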
+ if (inc.cluster_snapshot.length()) { + cluster_snapshot = inc.cluster_snapshot; + cluster_snapshot_epoch = inc.epoch; + } else { + cluster_snapshot.clear(); + cluster_snapshot_epoch = 0; + } + + if (inc.new_nearfull_ratio >= 0) { + nearfull_ratio = inc.new_nearfull_ratio; + } + if (inc.new_backfillfull_ratio >= 0) { + backfillfull_ratio = inc.new_backfillfull_ratio; + } + if (inc.new_full_ratio >= 0) { + full_ratio = inc.new_full_ratio; + } + if (inc.new_require_min_compat_client > ceph_release_t::unknown) { + require_min_compat_client = inc.new_require_min_compat_client; + } + if (inc.new_require_osd_release >= ceph_release_t::unknown) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= ceph_release_t::luminous) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } + + if (inc.new_require_osd_release >= ceph_release_t::unknown) { + require_osd_release = inc.new_require_osd_release; + if (require_osd_release >= ceph_release_t::nautilus) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + } + // do new crush map last (after up/down stuff) + if (inc.crush.length()) { + ceph::buffer::list bl(inc.crush); + auto blp = bl.cbegin(); + crush.reset(new CrushWrapper); + crush->decode(blp); + if (require_osd_release >= ceph_release_t::luminous) { + // only increment if this is a luminous-encoded osdmap, lest + // the mon's crush_version diverge from what the osds or others + // are decoding and applying on their end. if we won't encode + // it in the canonical version, don't change it. + ++crush_version; + } + for (auto it = device_class_flags.begin(); + it != device_class_flags.end();) { + const char* class_name = crush->get_class_name(it->first); + if (!class_name) // device class is gone + it = device_class_flags.erase(it); + else + it++; + } + } + + if (inc.change_stretch_mode) { + stretch_mode_enabled = inc.stretch_mode_enabled; + stretch_bucket_count = inc.new_stretch_bucket_count; + degraded_stretch_mode = inc.new_degraded_stretch_mode; + recovering_stretch_mode = inc.new_recovering_stretch_mode; + stretch_mode_bucket = inc.new_stretch_mode_bucket; + } + + switch (inc.mutate_allow_crimson) { + case Incremental::mutate_allow_crimson_t::NONE: + break; + case Incremental::mutate_allow_crimson_t::SET: + allow_crimson = true; + break; + case Incremental::mutate_allow_crimson_t::CLEAR: + allow_crimson = false; + break; + } + + calc_num_osds(); + _calc_up_osd_features(); + return 0; +} + +// mapping +int OSDMap::map_to_pg( + int64_t poolid, + const string& name, + const string& key, + const string& nspace, + pg_t *pg) const +{ + // calculate ps (placement seed) + const pg_pool_t *pool = get_pg_pool(poolid); + if (!pool) + return -ENOENT; + ps_t ps; + if (!key.empty()) + ps = pool->hash_key(key, nspace); + else + ps = pool->hash_key(name, nspace); + *pg = pg_t(ps, poolid); + return 0; +} + +int OSDMap::object_locator_to_pg( + const object_t& oid, const object_locator_t& loc, pg_t &pg) const +{ + if (loc.hash >= 0) { + if (!get_pg_pool(loc.get_pool())) { + return -ENOENT; + } + pg = pg_t(loc.hash, loc.get_pool()); + return 0; + } + return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg); +} + +ceph_object_layout OSDMap::make_object_layout( + object_t oid, int pg_pool, string nspace) const +{ + object_locator_t loc(pg_pool, nspace); + + ceph_object_layout ol; + pg_t pgid = object_locator_to_pg(oid, loc); + ol.ol_pgid = pgid.get_old_pg().v; + ol.ol_stripe_unit = 0; + return ol; +} + +void 
OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool, + vector<int>& osds) const +{ + if (pool.can_shift_osds()) { + unsigned removed = 0; + for (unsigned i = 0; i < osds.size(); i++) { + if (!exists(osds[i])) { + removed++; + continue; + } + if (removed) { + osds[i - removed] = osds[i]; + } + } + if (removed) + osds.resize(osds.size() - removed); + } else { + for (auto& osd : osds) { + if (!exists(osd)) + osd = CRUSH_ITEM_NONE; + } + } +} + +void OSDMap::_pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + vector<int> *osds, + ps_t *ppps) const +{ + // map to osds[] + ps_t pps = pool.raw_pg_to_pps(pg); // placement ps + unsigned size = pool.get_size(); + + // what crush rule? + int ruleno = pool.get_crush_rule(); + if (ruleno >= 0) + crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool()); + + _remove_nonexistent_osds(pool, *osds); + + if (ppps) + *ppps = pps; +} + +int OSDMap::_pick_primary(const vector<int>& osds) const +{ + for (auto osd : osds) { + if (osd != CRUSH_ITEM_NONE) { + return osd; + } + } + return -1; +} + +void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const +{ + pg_t pg = pi.raw_pg_to_pg(raw_pg); + auto p = pg_upmap.find(pg); + if (p != pg_upmap.end()) { + // make sure targets aren't marked out + for (auto osd : p->second) { + if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 && + osd_weight[osd] == 0) { + // reject/ignore the explicit mapping + return; + } + } + *raw = vector<int>(p->second.begin(), p->second.end()); + // continue to check and apply pg_upmap_items if any + } + + auto q = pg_upmap_items.find(pg); + if (q != pg_upmap_items.end()) { + // NOTE: this approach does not allow a bidirectional swap, + // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + for (auto& [osd_from, osd_to] : q->second) { + // A capacity change upmap (replace osd in the pg with osd not in the pg) + // make sure the replacement value doesn't already appear + bool exists = false; + ssize_t pos = -1; + for (unsigned i = 0; i < raw->size(); ++i) { + int osd = (*raw)[i]; + if (osd == osd_to) { + exists = true; + break; + } + // ignore mapping if target is marked out (or invalid osd id) + if (osd == osd_from && + pos < 0 && + !(osd_to != CRUSH_ITEM_NONE && osd_to < max_osd && + osd_to >= 0 && osd_weight[osd_to] == 0)) { + pos = i; + } + } + if (!exists && pos >= 0) { + (*raw)[pos] = osd_to; + } + } + } + auto r = pg_upmap_primaries.find(pg); + if (r != pg_upmap_primaries.end()) { + auto new_prim = r->second; + // Apply mapping only if new primary is not marked out and valid osd id + if (new_prim != CRUSH_ITEM_NONE && new_prim < max_osd && new_prim >= 0 && + osd_weight[new_prim] != 0) { + int new_prim_idx = 0; + for (int i = 1 ; i < (int)raw->size(); i++) { // start from 1 on purpose + if ((*raw)[i] == new_prim) { + new_prim_idx = i; + break; + } + } + if (new_prim_idx > 0) { + // swap primary + (*raw)[new_prim_idx] = (*raw)[0]; + (*raw)[0] = new_prim; + } + } + } +} + +// pg -> (up osd list) +void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw, + vector<int> *up) const +{ + if (pool.can_shift_osds()) { + // shift left + up->clear(); + up->reserve(raw.size()); + for (unsigned i=0; i<raw.size(); i++) { + if (!exists(raw[i]) || is_down(raw[i])) + continue; + up->push_back(raw[i]); + } + } else { + // set down/dne devices to NONE + up->resize(raw.size()); + for (int i = raw.size() - 1; i >= 0; --i) { + if (!exists(raw[i]) || is_down(raw[i])) { + (*up)[i] = CRUSH_ITEM_NONE; + } else { + (*up)[i] = raw[i]; + } + } + } +} + +void 
OSDMap::_apply_primary_affinity(ps_t seed, + const pg_pool_t& pool, + vector<int> *osds, + int *primary) const +{ + // do we have any non-default primary_affinity values for these osds? + if (!osd_primary_affinity) + return; + + bool any = false; + for (const auto osd : *osds) { + if (osd != CRUSH_ITEM_NONE && + (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + any = true; + break; + } + } + if (!any) + return; + + // pick the primary. feed both the seed (for the pg) and the osd + // into the hash/rng so that a proportional fraction of an osd's pgs + // get rejected as primary. + int pos = -1; + for (unsigned i = 0; i < osds->size(); ++i) { + int o = (*osds)[i]; + if (o == CRUSH_ITEM_NONE) + continue; + unsigned a = (*osd_primary_affinity)[o]; + if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + seed, o) >> 16) >= a) { + // we chose not to use this primary. note it anyway as a + // fallback in case we don't pick anyone else, but keep looking. + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = (*osds)[pos]; + + if (pool.can_shift_osds() && pos > 0) { + // move the new primary to the front. + for (int i = pos; i > 0; --i) { + (*osds)[i] = (*osds)[i-1]; + } + (*osds)[0] = *primary; + } +} + +void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg, + vector<int> *temp_pg, int *temp_primary) const +{ + pg = pool.raw_pg_to_pg(pg); + const auto p = pg_temp->find(pg); + temp_pg->clear(); + if (p != pg_temp->end()) { + for (unsigned i=0; i<p->second.size(); i++) { + if (!exists(p->second[i]) || is_down(p->second[i])) { + if (pool.can_shift_osds()) { + continue; + } else { + temp_pg->push_back(CRUSH_ITEM_NONE); + } + } else { + temp_pg->push_back(p->second[i]); + } + } + } + const auto &pp = primary_temp->find(pg); + *temp_primary = -1; + if (pp != primary_temp->end()) { + *temp_primary = pp->second; + } else if (!temp_pg->empty()) { // apply pg_temp's primary + for (unsigned i = 0; i < temp_pg->size(); ++i) { + if ((*temp_pg)[i] != CRUSH_ITEM_NONE) { + *temp_primary = (*temp_pg)[i]; + break; + } + } + } +} + +void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + raw->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *primary = _pick_primary(*raw); +} + +void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw, + vector<int> *raw_upmap) const +{ + auto pool = get_pg_pool(pg.pool()); + if (!pool) { + raw_upmap->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw, NULL); + *raw_upmap = *raw; + _apply_upmap(*pool, pg, raw_upmap); +} + +void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) { + *primary = -1; + up->clear(); + return; + } + vector<int> raw; + ps_t pps; + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, up); + *primary = _pick_primary(raw); + _apply_primary_affinity(pps, *pool, up, primary); +} + +void OSDMap::_pg_to_up_acting_osds( + const pg_t& pg, vector<int> *up, int *up_primary, + vector<int> *acting, int *acting_primary, + bool raw_pg_to_pg) const +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool || + (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) { + if (up) + up->clear(); + if (up_primary) + *up_primary = -1; + if (acting) + acting->clear(); + if (acting_primary) + *acting_primary = -1; + return; + 
} + vector<int> raw; + vector<int> _up; + vector<int> _acting; + int _up_primary; + int _acting_primary; + ps_t pps; + _get_temp_osds(*pool, pg, &_acting, &_acting_primary); + if (_acting.empty() || up || up_primary) { + _pg_to_raw_osds(*pool, pg, &raw, &pps); + _apply_upmap(*pool, pg, &raw); + _raw_to_up_osds(*pool, raw, &_up); + _up_primary = _pick_primary(_up); + _apply_primary_affinity(pps, *pool, &_up, &_up_primary); + if (_acting.empty()) { + _acting = _up; + if (_acting_primary == -1) { + _acting_primary = _up_primary; + } + } + + if (up) + up->swap(_up); + if (up_primary) + *up_primary = _up_primary; + } + + if (acting) + acting->swap(_acting); + if (acting_primary) + *acting_primary = _acting_primary; +} + +int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep) +{ + // This implementation is broken for EC PGs since the osd may appear + // multiple times in the acting set. See + // https://tracker.ceph.com/issues/43213 + if (!nrep) + nrep = acting.size(); + for (int i=0; i<nrep; i++) + if (acting[i] == osd) + return i; + return -1; +} + +int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting) +{ + int nrep = acting.size(); + if (who.shard == shard_id_t::NO_SHARD) { + for (int i=0; i<nrep; i++) { + if (acting[i] == who.osd) { + return i; + } + } + } else { + if (who.shard < nrep && acting[who.shard] == who.osd) { + return who.shard; + } + } + return -1; +} + +bool OSDMap::primary_changed_broken( + int oldprimary, + const vector<int> &oldacting, + int newprimary, + const vector<int> &newacting) +{ + if (oldacting.empty() && newacting.empty()) + return false; // both still empty + if (oldacting.empty() ^ newacting.empty()) + return true; // was empty, now not, or vice versa + if (oldprimary != newprimary) + return true; // primary changed + if (calc_pg_role_broken(oldprimary, oldacting) != + calc_pg_role_broken(newprimary, newacting)) + return true; + return false; // same primary (tho replicas may have changed) +} + +uint64_t OSDMap::get_encoding_features() const +{ + uint64_t f = SIGNIFICANT_FEATURES; + if (require_osd_release < ceph_release_t::reef) { + f &= ~CEPH_FEATURE_SERVER_REEF; + } + if (require_osd_release < ceph_release_t::octopus) { + f &= ~CEPH_FEATURE_SERVER_OCTOPUS; + } + if (require_osd_release < ceph_release_t::nautilus) { + f &= ~CEPH_FEATURE_SERVER_NAUTILUS; + } + if (require_osd_release < ceph_release_t::mimic) { + f &= ~CEPH_FEATURE_SERVER_MIMIC; + } + if (require_osd_release < ceph_release_t::luminous) { + f &= ~(CEPH_FEATURE_SERVER_LUMINOUS | + CEPH_FEATURE_CRUSH_CHOOSE_ARGS); + } + if (require_osd_release < ceph_release_t::kraken) { + f &= ~(CEPH_FEATURE_SERVER_KRAKEN | + CEPH_FEATURE_MSG_ADDR2); + } + if (require_osd_release < ceph_release_t::jewel) { + f &= ~(CEPH_FEATURE_SERVER_JEWEL | + CEPH_FEATURE_NEW_OSDOP_ENCODING | + CEPH_FEATURE_CRUSH_TUNABLES5); + } + return f; +} + +// serialize, unserialize +void OSDMap::encode_client_old(ceph::buffer::list& bl) const +{ + using ceph::encode; + __u16 v = 5; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + // for encode(pools, bl); + __u32 n = pools.size(); + encode(n, bl); + + for (const auto &pool : pools) { + n = pool.first; + encode(n, bl); + encode(pool.second, bl, 0); + } + // for encode(pool_name, bl); + n = pool_name.size(); + encode(n, bl); + for (const auto &pname : pool_name) { + n = pname.first; + encode(n, bl); + encode(pname.second, bl); + } + // for encode(pool_max, bl); + n = pool_max; + 
encode(n, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, 0); + + // for encode(pg_temp, bl); + n = pg_temp->size(); + encode(n, bl); + for (const auto& pg : *pg_temp) { + old_pg_t opg = pg.first.get_old_pg(); + encode(opg, bl); + encode(pg.second, bl); + } + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); +} + +void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + encode_client_old(bl); + return; + } + + __u16 v = 6; + encode(v, bl); + + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + encode(flags, bl); + + encode(max_osd, bl); + { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + encode(osd_addrs->client_addrs, bl, features); + + encode(*pg_temp, bl); + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, 0 /* legacy (no) features */); + encode(cbl, bl); + + // extended + __u16 ev = 10; + encode(ev, bl); + encode(osd_addrs->hb_back_addrs, bl, features); + encode(osd_info, bl); + encode(blocklist, bl, features); + encode(osd_addrs->cluster_addrs, bl, features); + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl, features); + encode(osd_addrs->hb_front_addrs, bl, features); +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) { + encode_classic(bl, features); + return; + } + + // only a select set of callers should *ever* be encoding new + // OSDMaps. others should be passing around the canonical encoded + // buffers from on high. select out those callers by passing in an + // "impossible" feature bit. 
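+ // such callers are expected to OR the bit in, roughly:
+ //   osdmap.encode(bl, features | CEPH_FEATURE_RESERVED);
+ // the bit is asserted on and then stripped before use below.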
+ ceph_assert(features & CEPH_FEATURE_RESERVED); + features &= ~CEPH_FEATURE_RESERVED; + + size_t start_offset = bl.length(); + size_t tail_offset; + size_t crc_offset; + std::optional<ceph::buffer::list::contiguous_filler> crc_filler; + + // meta-encoding: how we include client-used and osd-specific data + ENCODE_START(8, 7, bl); + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t v = 10; + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + v = 3; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 6; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 7; + } else if (!HAVE_FEATURE(features, SERVER_REEF)) { + v = 9; + } + ENCODE_START(v, 1, bl); // client-usable data + // base + encode(fsid, bl); + encode(epoch, bl); + encode(created, bl); + encode(modified, bl); + + encode(pools, bl, features); + encode(pool_name, bl); + encode(pool_max, bl); + + if (v < 4) { + decltype(flags) f = flags; + if (require_osd_release >= ceph_release_t::luminous) + f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES; + else if (require_osd_release == ceph_release_t::kraken) + f |= CEPH_OSDMAP_REQUIRE_KRAKEN; + else if (require_osd_release == ceph_release_t::jewel) + f |= CEPH_OSDMAP_REQUIRE_JEWEL; + encode(f, bl); + } else { + encode(flags, bl); + } + + encode(max_osd, bl); + if (v >= 5) { + encode(osd_state, bl); + } else { + uint32_t n = osd_state.size(); + encode(n, bl); + for (auto s : osd_state) { + encode((uint8_t)s, bl); + } + } + encode(osd_weight, bl); + if (v >= 8) { + encode(osd_addrs->client_addrs, bl, features); + } else { + encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features); + } + + encode(*pg_temp, bl); + encode(*primary_temp, bl); + if (osd_primary_affinity) { + encode(*osd_primary_affinity, bl); + } else { + vector<__u32> v; + encode(v, bl); + } + + // crush + ceph::buffer::list cbl; + crush->encode(cbl, features); + encode(cbl, bl); + encode(erasure_code_profiles, bl); + + if (v >= 4) { + encode(pg_upmap, bl); + encode(pg_upmap_items, bl); + } else { + ceph_assert(pg_upmap.empty()); + ceph_assert(pg_upmap_items.empty()); + } + if (v >= 6) { + encode(crush_version, bl); + } + if (v >= 7) { + encode(new_removed_snaps, bl); + encode(new_purged_snaps, bl); + } + if (v >= 9) { + encode(last_up_change, bl); + encode(last_in_change, bl); + } + if (v >= 10) { + encode(pg_upmap_primaries, bl); + } else { + ceph_assert(pg_upmap_primaries.empty()); + } + ENCODE_FINISH(bl); // client-usable data + } + + { + // NOTE: any new encoding dependencies must be reflected by + // SIGNIFICANT_FEATURES + uint8_t target_v = 9; // when bumping this, be aware of allow_crimson + if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) { + target_v = 1; + } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + target_v = 5; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + target_v = 6; + } + if (stretch_mode_enabled) { + target_v = std::max((uint8_t)10, target_v); + } + if (!range_blocklist.empty()) { + target_v = std::max((uint8_t)11, target_v); + } + if (allow_crimson) { + target_v = std::max((uint8_t)12, target_v); + } + ENCODE_START(target_v, 1, bl); // extended, osd-only data + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features); + } else { + encode(osd_addrs->hb_back_addrs, bl, features); + } + encode(osd_info, bl); + { + // put this in a sorted, ordered map<> so that we encode in a + // deterministic order. 
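+ // (deterministic ordering matters because the canonical encoding is
+ // crc-checked; every encoder must produce byte-identical output)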
+ map<entity_addr_t,utime_t> blocklist_map; + for (const auto &addr : blocklist) + blocklist_map.insert(make_pair(addr.first, addr.second)); + encode(blocklist_map, bl, features); + } + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features); + } else { + encode(osd_addrs->cluster_addrs, bl, features); + } + encode(cluster_snapshot_epoch, bl); + encode(cluster_snapshot, bl); + encode(*osd_uuid, bl); + encode(osd_xinfo, bl, features); + if (target_v < 7) { + encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features); + } else { + encode(osd_addrs->hb_front_addrs, bl, features); + } + if (target_v >= 2) { + encode(nearfull_ratio, bl); + encode(full_ratio, bl); + encode(backfillfull_ratio, bl); + } + // 4 was string-based new_require_min_compat_client + if (target_v >= 5) { + encode(require_min_compat_client, bl); + encode(require_osd_release, bl); + } + if (target_v >= 6) { + encode(removed_snaps_queue, bl); + } + if (target_v >= 8) { + encode(crush_node_flags, bl); + } + if (target_v >= 9) { + encode(device_class_flags, bl); + } + if (target_v >= 10) { + encode(stretch_mode_enabled, bl); + encode(stretch_bucket_count, bl); + encode(degraded_stretch_mode, bl); + encode(recovering_stretch_mode, bl); + encode(stretch_mode_bucket, bl); + } + if (target_v >= 11) { + ::encode(range_blocklist, bl, features); + } + if (target_v >= 12) { + ::encode(allow_crimson, bl); + } + ENCODE_FINISH(bl); // osd-only data + } + + crc_offset = bl.length(); + crc_filler = bl.append_hole(sizeof(uint32_t)); + tail_offset = bl.length(); + + ENCODE_FINISH(bl); // meta-encoding wrapper + + // fill in crc + ceph::buffer::list front; + front.substr_of(bl, start_offset, crc_offset - start_offset); + crc = front.crc32c(-1); + if (tail_offset < bl.length()) { + ceph::buffer::list tail; + tail.substr_of(bl, tail_offset, bl.length() - tail_offset); + crc = tail.crc32c(crc); + } + ceph_le32 crc_le; + crc_le = crc; + crc_filler->copy_in(4, (char*)&crc_le); + crc_defined = true; +} + +/* for a description of osdmap versions, and when they were introduced, please + * refer to + * doc/dev/osd_internals/osdmap_versions.txt + */ +void OSDMap::decode(ceph::buffer::list& bl) +{ + auto p = bl.cbegin(); + decode(p); +} + +void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p) +{ + using ceph::decode; + __u32 n, t; + __u16 v; + decode(v, p); + + // base + decode(fsid, p); + decode(epoch, p); + decode(created, p); + decode(modified, p); + + if (v < 6) { + if (v < 4) { + int32_t max_pools = 0; + decode(max_pools, p); + pool_max = max_pools; + } + pools.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pools[t], p); + } + if (v == 4) { + decode(n, p); + pool_max = n; + } else if (v == 5) { + pool_name.clear(); + decode(n, p); + while (n--) { + decode(t, p); + decode(pool_name[t], p); + } + decode(n, p); + pool_max = n; + } + } else { + decode(pools, p); + decode(pool_name, p); + decode(pool_max, p); + } + // kludge around some old bug that zeroed out pool_max (#2307) + if (pools.size() && pool_max < pools.rbegin()->first) { + pool_max = pools.rbegin()->first; + } + + decode(flags, p); + + decode(max_osd, p); + { + vector<uint8_t> os; + decode(os, p); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, p); + decode(osd_addrs->client_addrs, p); + if (v <= 5) { + pg_temp->clear(); + decode(n, p); + while (n--) { + old_pg_t opg; + ceph::decode_raw(opg, p); + mempool::osdmap::vector<int32_t> v; + 
decode(v, p); + pg_temp->set(pg_t(opg), v); + } + } else { + decode(*pg_temp, p); + } + + // crush + ceph::buffer::list cbl; + decode(cbl, p); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + + // extended + __u16 ev = 0; + if (v >= 5) + decode(ev, p); + decode(osd_addrs->hb_back_addrs, p); + decode(osd_info, p); + if (v < 5) + decode(pool_name, p); + + decode(blocklist, p); + if (ev >= 6) + decode(osd_addrs->cluster_addrs, p); + else + osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size()); + + if (ev >= 7) { + decode(cluster_snapshot_epoch, p); + decode(cluster_snapshot, p); + } + + if (ev >= 8) { + decode(*osd_uuid, p); + } else { + osd_uuid->resize(max_osd); + } + if (ev >= 9) + decode(osd_xinfo, p); + else + osd_xinfo.resize(max_osd); + + if (ev >= 10) + decode(osd_addrs->hb_front_addrs, p); + else + osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size()); + + osd_primary_affinity.reset(); + + post_decode(); +} + +void OSDMap::decode(ceph::buffer::list::const_iterator& bl) +{ + using ceph::decode; + /** + * Older encodings of the OSDMap had a single struct_v which + * covered the whole encoding, and was prior to our modern + * stuff which includes a compatv and a size. So if we see + * a struct_v < 7, we must rewind to the beginning and use our + * classic decoder. + */ + size_t start_offset = bl.get_off(); + size_t tail_offset = 0; + ceph::buffer::list crc_front, crc_tail; + + DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper + if (struct_v < 7) { + bl.seek(start_offset); + decode_classic(bl); + return; + } + /** + * Since we made it past that hurdle, we can use our normal paths. + */ + { + DECODE_START(9, bl); // client-usable data + // base + decode(fsid, bl); + decode(epoch, bl); + decode(created, bl); + decode(modified, bl); + + decode(pools, bl); + decode(pool_name, bl); + decode(pool_max, bl); + + decode(flags, bl); + + decode(max_osd, bl); + if (struct_v >= 5) { + decode(osd_state, bl); + } else { + vector<uint8_t> os; + decode(os, bl); + osd_state.resize(os.size()); + for (unsigned i = 0; i < os.size(); ++i) { + osd_state[i] = os[i]; + } + } + decode(osd_weight, bl); + decode(osd_addrs->client_addrs, bl); + + decode(*pg_temp, bl); + decode(*primary_temp, bl); + // dates back to firefly. version increased from 2 to 3 still in firefly. + // do we really still need to keep this around? even for old clients? + if (struct_v >= 2) { + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>); + decode(*osd_primary_affinity, bl); + if (osd_primary_affinity->empty()) + osd_primary_affinity.reset(); + } else { + osd_primary_affinity.reset(); + } + + // crush + ceph::buffer::list cbl; + decode(cbl, bl); + auto cblp = cbl.cbegin(); + crush->decode(cblp); + // added in firefly; version increased in luminous, so it affects + // giant, hammer, infernallis, jewel, and kraken. probably should be left + // alone until we require clients to be all luminous? + if (struct_v >= 3) { + decode(erasure_code_profiles, bl); + } else { + erasure_code_profiles.clear(); + } + // version increased from 3 to 4 still in luminous, so same as above + // applies. + if (struct_v >= 4) { + decode(pg_upmap, bl); + decode(pg_upmap_items, bl); + } else { + pg_upmap.clear(); + pg_upmap_items.clear(); + } + // again, version increased from 5 to 6 still in luminous, so above + // applies. 
+ if (struct_v >= 6) { + decode(crush_version, bl); + } + // version increase from 6 to 7 in mimic + if (struct_v >= 7) { + decode(new_removed_snaps, bl); + decode(new_purged_snaps, bl); + } + // version increase from 7 to 8, 8 to 9, in nautilus. + if (struct_v >= 9) { + decode(last_up_change, bl); + decode(last_in_change, bl); + } + if (struct_v >= 10) { + decode(pg_upmap_primaries, bl); + } else { + pg_upmap_primaries.clear(); + } + DECODE_FINISH(bl); // client-usable data + } + + { + DECODE_START(10, bl); // extended, osd-only data + decode(osd_addrs->hb_back_addrs, bl); + decode(osd_info, bl); + decode(blocklist, bl); + decode(osd_addrs->cluster_addrs, bl); + decode(cluster_snapshot_epoch, bl); + decode(cluster_snapshot, bl); + decode(*osd_uuid, bl); + decode(osd_xinfo, bl); + decode(osd_addrs->hb_front_addrs, bl); + // + if (struct_v >= 2) { + decode(nearfull_ratio, bl); + decode(full_ratio, bl); + } else { + nearfull_ratio = 0; + full_ratio = 0; + } + if (struct_v >= 3) { + decode(backfillfull_ratio, bl); + } else { + backfillfull_ratio = 0; + } + if (struct_v == 4) { + string r; + decode(r, bl); + if (r.length()) + require_min_compat_client = ceph_release_from_name(r.c_str()); + } + if (struct_v >= 5) { + decode(require_min_compat_client, bl); + decode(require_osd_release, bl); + if (require_osd_release >= ceph_release_t::nautilus) { + flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT; + } + if (require_osd_release >= ceph_release_t::luminous) { + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } + } else { + if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) { + // only for compat with post-kraken pre-luminous test clusters + require_osd_release = ceph_release_t::luminous; + flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS); + flags |= CEPH_OSDMAP_RECOVERY_DELETES; + } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) { + require_osd_release = ceph_release_t::kraken; + } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) { + require_osd_release = ceph_release_t::jewel; + } else { + require_osd_release = ceph_release_t::unknown; + } + } + if (struct_v >= 6) { + decode(removed_snaps_queue, bl); + } + if (struct_v >= 8) { + decode(crush_node_flags, bl); + } else { + crush_node_flags.clear(); + } + if (struct_v >= 9) { + decode(device_class_flags, bl); + } else { + device_class_flags.clear(); + } + if (struct_v >= 10) { + decode(stretch_mode_enabled, bl); + decode(stretch_bucket_count, bl); + decode(degraded_stretch_mode, bl); + decode(recovering_stretch_mode, bl); + decode(stretch_mode_bucket, bl); + } else { + stretch_mode_enabled = false; + stretch_bucket_count = 0; + degraded_stretch_mode = 0; + recovering_stretch_mode = 0; + stretch_mode_bucket = 0; + } + if (struct_v >= 11) { + decode(range_blocklist, bl); + calculated_ranges.clear(); + for (const auto& i : range_blocklist) { + calculated_ranges.emplace(i.first, i.first); + } + } + if (struct_v >= 12) { + decode(allow_crimson, bl); + } + DECODE_FINISH(bl); // osd-only data + } + + if (struct_v >= 8) { + crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset); + decode(crc, bl); + tail_offset = bl.get_off(); + crc_defined = true; + } else { + crc_defined = false; + crc = 0; + } + + DECODE_FINISH(bl); // wrapper + + if (tail_offset) { + // verify crc + uint32_t actual = crc_front.crc32c(-1); + if (tail_offset < bl.get_off()) { + ceph::buffer::list tail; + tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset); + actual = tail.crc32c(actual); + } + if (crc != actual) { + ostringstream ss; + ss << 
"bad crc, actual " << actual << " != expected " << crc; + string s = ss.str(); + throw ceph::buffer::malformed_input(s.c_str()); + } + } + + post_decode(); +} + +void OSDMap::post_decode() +{ + // index pool names + name_pool.clear(); + for (const auto &pname : pool_name) { + name_pool[pname.second] = pname.first; + } + + calc_num_osds(); + _calc_up_osd_features(); +} + +void OSDMap::dump_erasure_code_profiles( + const mempool::osdmap::map<string,map<string,string>>& profiles, + Formatter *f) +{ + f->open_object_section("erasure_code_profiles"); + for (const auto &profile : profiles) { + f->open_object_section(profile.first.c_str()); + for (const auto &profm : profile.second) { + f->dump_string(profm.first.c_str(), profm.second); + } + f->close_section(); + } + f->close_section(); +} + +void OSDMap::dump_osds(Formatter *f) const +{ + f->open_array_section("osds"); + for (int i=0; i<get_max_osd(); i++) { + if (exists(i)) { + dump_osd(i, f); + } + } + f->close_section(); +} + +void OSDMap::dump_osd(int id, Formatter *f) const +{ + ceph_assert(f != nullptr); + if (!exists(id)) { + return; + } + + f->open_object_section("osd_info"); + f->dump_int("osd", id); + f->dump_stream("uuid") << get_uuid(id); + f->dump_int("up", is_up(id)); + f->dump_int("in", is_in(id)); + f->dump_float("weight", get_weightf(id)); + f->dump_float("primary_affinity", get_primary_affinityf(id)); + get_info(id).dump(f); + f->dump_object("public_addrs", get_addrs(id)); + f->dump_object("cluster_addrs", get_cluster_addrs(id)); + f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id)); + f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id)); + // compat + f->dump_stream("public_addr") << get_addrs(id).get_legacy_str(); + f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str(); + f->dump_stream("heartbeat_back_addr") + << get_hb_back_addrs(id).get_legacy_str(); + f->dump_stream("heartbeat_front_addr") + << get_hb_front_addrs(id).get_legacy_str(); + + set<string> st; + get_state(id, st); + f->open_array_section("state"); + for (const auto &state : st) + f->dump_string("state", state); + f->close_section(); + + f->close_section(); +} + +void OSDMap::dump_pool(CephContext *cct, + int64_t pid, + const pg_pool_t &pdata, + ceph::Formatter *f) const +{ + std::string name("<unknown>"); + const auto &pni = pool_name.find(pid); + if (pni != pool_name.end()) + name = pni->second; + f->open_object_section("pool"); + f->dump_int("pool", pid); + f->dump_string("pool_name", name); + pdata.dump(f); + dump_read_balance_score(cct, pid, pdata, f); + f->close_section(); // pool +} + +void OSDMap::dump_read_balance_score(CephContext *cct, + int64_t pid, + const pg_pool_t &pdata, + ceph::Formatter *f) const +{ + if (pdata.is_replicated()) { + // Add rb section with values for score, optimal score, raw score + // // and primary_affinity average + OSDMap::read_balance_info_t rb_info; + auto rc = calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) { + f->open_object_section("read_balance"); + f->dump_float("score_acting", rb_info.acting_adj_score); + f->dump_float("score_stable", rb_info.adjusted_score); + f->dump_float("optimal_score", rb_info.optimal_score); + f->dump_float("raw_score_acting", rb_info.acting_raw_score); + f->dump_float("raw_score_stable", rb_info.raw_score); + f->dump_float("primary_affinity_weighted", rb_info.pa_weighted); + f->dump_float("average_primary_affinity", rb_info.pa_avg); + f->dump_float("average_primary_affinity_weighted", rb_info.pa_weighted_avg); + if (rb_info.err_msg.length() 
> 0) { + f->dump_string("error_message", rb_info.err_msg); + } + f->close_section(); // read_balance + } + else { + if (rb_info.err_msg.length() > 0) { + f->open_object_section("read_balance"); + f->dump_string("error_message", rb_info.err_msg); + f->dump_float("score_acting", rb_info.acting_adj_score); + f->dump_float("score_stable", rb_info.adjusted_score); + f->close_section(); // read_balance + } + } + } +} + +void OSDMap::dump(Formatter *f, CephContext *cct) const +{ + f->dump_int("epoch", get_epoch()); + f->dump_stream("fsid") << get_fsid(); + f->dump_stream("created") << get_created(); + f->dump_stream("modified") << get_modified(); + f->dump_stream("last_up_change") << last_up_change; + f->dump_stream("last_in_change") << last_in_change; + f->dump_string("flags", get_flag_string()); + f->dump_unsigned("flags_num", flags); + f->open_array_section("flags_set"); + set<string> flagset; + get_flag_set(&flagset); + for (auto p : flagset) { + f->dump_string("flag", p); + } + f->close_section(); + f->dump_unsigned("crush_version", get_crush_version()); + f->dump_float("full_ratio", full_ratio); + f->dump_float("backfillfull_ratio", backfillfull_ratio); + f->dump_float("nearfull_ratio", nearfull_ratio); + f->dump_string("cluster_snapshot", get_cluster_snapshot()); + f->dump_int("pool_max", get_pool_max()); + f->dump_int("max_osd", get_max_osd()); + f->dump_string("require_min_compat_client", + to_string(require_min_compat_client)); + f->dump_string("min_compat_client", + to_string(get_min_compat_client())); + f->dump_string("require_osd_release", + to_string(require_osd_release)); + + f->dump_bool("allow_crimson", allow_crimson); + f->open_array_section("pools"); + for (const auto &[pid, pdata] : pools) { + dump_pool(cct, pid, pdata, f); + } + f->close_section(); + + dump_osds(f); + + f->open_array_section("osd_xinfo"); + for (int i=0; i<get_max_osd(); i++) { + if (exists(i)) { + f->open_object_section("xinfo"); + f->dump_int("osd", i); + osd_xinfo[i].dump(f); + f->close_section(); + } + } + f->close_section(); + + f->open_array_section("pg_upmap"); + for (auto& p : pg_upmap) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << p.first; + f->open_array_section("osds"); + for (auto q : p.second) { + f->dump_int("osd", q); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("pg_upmap_items"); + for (auto& [pgid, mappings] : pg_upmap_items) { + f->open_object_section("mapping"); + f->dump_stream("pgid") << pgid; + f->open_array_section("mappings"); + for (auto& [from, to] : mappings) { + f->open_object_section("mapping"); + f->dump_int("from", from); + f->dump_int("to", to); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("pg_upmap_primaries"); + for (const auto& [pg, osd] : pg_upmap_primaries) { + f->open_object_section("primary_mapping"); + f->dump_stream("pgid") << pg; + f->dump_int("primary_osd", osd); + f->close_section(); + } + f->close_section(); // primary_temp + + f->open_array_section("pg_temp"); + pg_temp->dump(f); + f->close_section(); + + f->open_array_section("primary_temp"); + for (const auto &pg : *primary_temp) { + f->dump_stream("pgid") << pg.first; + f->dump_int("osd", pg.second); + } + f->close_section(); // primary_temp + + f->open_object_section("blocklist"); + for (const auto &addr : blocklist) { + stringstream ss; + ss << addr.first; + f->dump_stream(ss.str().c_str()) << addr.second; + } + f->close_section(); + 
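+ // the range blocklist below is dumped like the plain blocklist above:
+ // one entry per address, keyed by the address with the expiry time as value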
f->open_object_section("range_blocklist"); + for (const auto &addr : range_blocklist) { + stringstream ss; + ss << addr.first; + f->dump_stream(ss.str().c_str()) << addr.second; + } + f->close_section(); + + dump_erasure_code_profiles(erasure_code_profiles, f); + + f->open_array_section("removed_snaps_queue"); + for (auto& p : removed_snaps_queue) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_removed_snaps"); + for (auto& p : new_removed_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_array_section("new_purged_snaps"); + for (auto& p : new_purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", p.first); + f->open_array_section("snaps"); + for (auto q = p.second.begin(); q != p.second.end(); ++q) { + f->open_object_section("interval"); + f->dump_unsigned("begin", q.get_start()); + f->dump_unsigned("length", q.get_len()); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + f->open_object_section("crush_node_flags"); + for (auto& i : crush_node_flags) { + string s = crush->item_exists(i.first) ? crush->get_item_name(i.first) + : stringify(i.first); + f->open_array_section(s.c_str()); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("device_class_flags"); + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + string s = class_name ? 
class_name : stringify(i.first); + f->open_array_section(s.c_str()); + set<string> st; + calc_state_set(i.second, st); + for (auto& j : st) { + f->dump_string("flag", j); + } + f->close_section(); + } + f->close_section(); + f->open_object_section("stretch_mode"); + { + f->dump_bool("stretch_mode_enabled", stretch_mode_enabled); + f->dump_unsigned("stretch_bucket_count", stretch_bucket_count); + f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode); + f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode); + f->dump_int("stretch_mode_bucket", stretch_mode_bucket); + } + f->close_section(); +} + +void OSDMap::generate_test_instances(list<OSDMap*>& o) +{ + o.push_back(new OSDMap); + + CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY); + o.push_back(new OSDMap); + uuid_d fsid; + o.back()->build_simple(cct, 1, fsid, 16); + o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp + o.back()->blocklist[entity_addr_t()] = utime_t(5, 6); + cct->put(); +} + +string OSDMap::get_flag_string(unsigned f) +{ + string s; + if (f & CEPH_OSDMAP_PAUSERD) + s += ",pauserd"; + if (f & CEPH_OSDMAP_PAUSEWR) + s += ",pausewr"; + if (f & CEPH_OSDMAP_PAUSEREC) + s += ",pauserec"; + if (f & CEPH_OSDMAP_NOUP) + s += ",noup"; + if (f & CEPH_OSDMAP_NODOWN) + s += ",nodown"; + if (f & CEPH_OSDMAP_NOOUT) + s += ",noout"; + if (f & CEPH_OSDMAP_NOIN) + s += ",noin"; + if (f & CEPH_OSDMAP_NOBACKFILL) + s += ",nobackfill"; + if (f & CEPH_OSDMAP_NOREBALANCE) + s += ",norebalance"; + if (f & CEPH_OSDMAP_NORECOVER) + s += ",norecover"; + if (f & CEPH_OSDMAP_NOSCRUB) + s += ",noscrub"; + if (f & CEPH_OSDMAP_NODEEP_SCRUB) + s += ",nodeep-scrub"; + if (f & CEPH_OSDMAP_NOTIERAGENT) + s += ",notieragent"; + if (f & CEPH_OSDMAP_NOSNAPTRIM) + s += ",nosnaptrim"; + if (f & CEPH_OSDMAP_SORTBITWISE) + s += ",sortbitwise"; + if (f & CEPH_OSDMAP_REQUIRE_JEWEL) + s += ",require_jewel_osds"; + if (f & CEPH_OSDMAP_REQUIRE_KRAKEN) + s += ",require_kraken_osds"; + if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS) + s += ",require_luminous_osds"; + if (f & CEPH_OSDMAP_RECOVERY_DELETES) + s += ",recovery_deletes"; + if (f & CEPH_OSDMAP_PURGED_SNAPDIRS) + s += ",purged_snapdirs"; + if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT) + s += ",pglog_hardlimit"; + if (f & CEPH_OSDMAP_NOAUTOSCALE) + s += ",noautoscale"; + if (s.length()) + s.erase(0, 1); + return s; +} + +string OSDMap::get_flag_string() const +{ + return get_flag_string(flags); +} + +void OSDMap::print_pools(CephContext *cct, ostream& out) const +{ + for (const auto &[pid, pdata] : pools) { + std::string name("<unknown>"); + const auto &pni = pool_name.find(pid); + if (pni != pool_name.end()) + name = pni->second; + char rb_score_str[32] = ""; + int rc = 0; + read_balance_info_t rb_info; + if (pdata.is_replicated()) { + rc = calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) + snprintf (rb_score_str, sizeof(rb_score_str), + " read_balance_score %.2f", rb_info.acting_adj_score); + } + + out << "pool " << pid + << " '" << name + << "' " << pdata + << rb_score_str << "\n"; + if (rb_info.err_msg.length() > 0) { + out << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << "\n"; + } + + //TODO - print error messages here. 
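+ // per-pool snapshots, removed_snaps and any pending removed_snaps_queue
+ // entries for the pool are printed next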
+ + for (const auto &snap : pdata.snaps) + out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n"; + + if (!pdata.removed_snaps.empty()) + out << "\tremoved_snaps " << pdata.removed_snaps << "\n"; + auto p = removed_snaps_queue.find(pid); + if (p != removed_snaps_queue.end()) { + out << "\tremoved_snaps_queue " << p->second << "\n"; + } + } + out << std::endl; +} + +void OSDMap::print_osds(ostream& out) const +{ + for (int i=0; i<get_max_osd(); i++) { + if (exists(i)) { + print_osd(i, out); + } + } +} +void OSDMap::print_osd(int id, ostream& out) const +{ + if (!exists(id)) { + return; + } + + out << "osd." << id; + out << (is_up(id) ? " up ":" down"); + out << (is_in(id) ? " in ":" out"); + out << " weight " << get_weightf(id); + if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + out << " primary_affinity " << get_primary_affinityf(id); + } + const osd_info_t& info(get_info(id)); + out << " " << info; + out << " " << get_addrs(id) << " " << get_cluster_addrs(id); + set<string> st; + get_state(id, st); + out << " " << st; + if (!get_uuid(id).is_zero()) { + out << " " << get_uuid(id); + } + out << "\n"; +} + +void OSDMap::print(CephContext *cct, ostream& out) const +{ + out << "epoch " << get_epoch() << "\n" + << "fsid " << get_fsid() << "\n" + << "created " << get_created() << "\n" + << "modified " << get_modified() << "\n"; + + out << "flags " << get_flag_string() << "\n"; + out << "crush_version " << get_crush_version() << "\n"; + out << "full_ratio " << full_ratio << "\n"; + out << "backfillfull_ratio " << backfillfull_ratio << "\n"; + out << "nearfull_ratio " << nearfull_ratio << "\n"; + if (require_min_compat_client != ceph_release_t::unknown) { + out << "require_min_compat_client " + << require_min_compat_client << "\n"; + } + out << "min_compat_client " << get_min_compat_client() + << "\n"; + if (require_osd_release > ceph_release_t::unknown) { + out << "require_osd_release " << require_osd_release + << "\n"; + } + out << "stretch_mode_enabled " << (stretch_mode_enabled ? 
"true" : "false") << "\n"; + if (stretch_mode_enabled) { + out << "stretch_bucket_count " << stretch_bucket_count << "\n"; + out << "degraded_stretch_mode " << degraded_stretch_mode << "\n"; + out << "recovering_stretch_mode " << recovering_stretch_mode << "\n"; + out << "stretch_mode_bucket " << stretch_mode_bucket << "\n"; + } + if (get_cluster_snapshot().length()) + out << "cluster_snapshot " << get_cluster_snapshot() << "\n"; + if (allow_crimson) { + out << "allow_crimson=true\n"; + } + out << "\n"; + + print_pools(cct, out); + + out << "max_osd " << get_max_osd() << "\n"; + print_osds(out); + out << std::endl; + + for (auto& p : pg_upmap) { + out << "pg_upmap " << p.first << " " << p.second << "\n"; + } + for (auto& p : pg_upmap_items) { + out << "pg_upmap_items " << p.first << " " << p.second << "\n"; + } + + for (auto& [pg, osd] : pg_upmap_primaries) { + out << "pg_upmap_primary " << pg << " " << osd << "\n"; + } + + for (const auto& pg : *pg_temp) + out << "pg_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto& pg : *primary_temp) + out << "primary_temp " << pg.first << " " << pg.second << "\n"; + + for (const auto &addr : blocklist) + out << "blocklist " << addr.first << " expires " << addr.second << "\n"; + for (const auto &addr : range_blocklist) + out << "range blocklist " << addr.first << " expires " << addr.second << "\n"; +} + +class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> { +public: + typedef CrushTreeDumper::Dumper<TextTable> Parent; + + OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void init_table(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT); + } + void dump(TextTable *tbl, string& bucket) { + init_table(tbl); + + if (!bucket.empty()) { + set_root(bucket); + Parent::dump(tbl); + } else { + Parent::dump(tbl); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) { + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl); + } + } + } + } + +protected: + void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight); + + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." 
<< qi.id; + } + *tbl << name.str(); + + if (!qi.is_bucket()) { + if (!osdmap->exists(qi.id)) { + *tbl << "DNE" + << 0; + } else { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + *tbl << s + << weightf_t(osdmap->get_weightf(qi.id)) + << weightf_t(osdmap->get_primary_affinityf(qi.id)); + } + } + *tbl << TextTable::endrow; + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper { +public: + typedef CrushTreeDumper::FormattingDumper Parent; + + OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + unsigned f) + : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { } + + bool should_dump_leaf(int i) const override { + if (!filter) { + return true; // normal case + } + if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) || + ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) || + ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) || + ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) || + ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) { + return true; + } + return false; + } + + bool should_dump_empty_bucket() const override { + return !filter; + } + + void dump(Formatter *f, string& bucket) { + if (!bucket.empty()) { + set_root(bucket); + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + } else { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + f->open_array_section("stray"); + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + f->close_section(); + } + } + +protected: + void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override { + Parent::dump_item_fields(qi, f); + if (!qi.is_bucket()) + { + string s; + if (osdmap->is_up(qi.id)) { + s = "up"; + } else if (osdmap->is_destroyed(qi.id)) { + s = "destroyed"; + } else { + s = "down"; + } + f->dump_unsigned("exists", (int)osdmap->exists(qi.id)); + f->dump_string("status", s); + f->dump_float("reweight", osdmap->get_weightf(qi.id)); + f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id)); + } + } + +private: + const OSDMap *osdmap; + const unsigned filter; +}; + +void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const +{ + if (f) { + OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket); + } else { + ceph_assert(out); + TextTable tbl; + OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket); + *out << tbl; + } +} + +void OSDMap::print_summary(Formatter *f, ostream& out, + const string& prefix, bool extra) const +{ + if (f) { + f->dump_int("epoch", get_epoch()); + f->dump_int("num_osds", get_num_osds()); + f->dump_int("num_up_osds", get_num_up_osds()); + f->dump_int("osd_up_since", last_up_change.to_msec() / 1000); + f->dump_int("num_in_osds", get_num_in_osds()); + f->dump_int("osd_in_since", last_in_change.to_msec() / 1000); + f->dump_unsigned("num_remapped_pgs", get_num_pg_temp()); + } else { + utime_t now = ceph_clock_now(); + out << get_num_osds() << " osds: " + << get_num_up_osds() << " up"; + if (last_up_change != utime_t()) { + out << " (since " << utimespan_str(now - last_up_change) << ")"; + } + out << ", " << get_num_in_osds() << " in"; + if (last_in_change != utime_t()) { + out << " (since " << utimespan_str(now - last_in_change) 
<< ")"; + } + if (extra) + out << "; epoch: e" << get_epoch(); + if (get_num_pg_temp()) + out << "; " << get_num_pg_temp() << " remapped pgs"; + out << "\n"; + uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS; + if (important_flags) + out << prefix << "flags " << get_flag_string(important_flags) << "\n"; + } +} + +void OSDMap::print_oneline_summary(ostream& out) const +{ + out << "e" << get_epoch() << ": " + << get_num_osds() << " total, " + << get_num_up_osds() << " up, " + << get_num_in_osds() << " in"; +} + +bool OSDMap::crush_rule_in_use(int rule_id) const +{ + for (const auto &pool : pools) { + if (pool.second.crush_rule == rule_id) + return true; + } + return false; +} + +int OSDMap::validate_crush_rules(CrushWrapper *newcrush, + ostream *ss) const +{ + for (auto& i : pools) { + auto& pool = i.second; + int ruleno = pool.get_crush_rule(); + if (!newcrush->rule_exists(ruleno)) { + *ss << "pool " << i.first << " references crush_rule " << ruleno + << " but it is not present"; + return -EINVAL; + } + if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) { + *ss << "pool " << i.first << " type does not match rule " << ruleno; + return -EINVAL; + } + } + return 0; +} + +int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int nosd, int pg_bits, int pgp_bits, + bool default_pool) +{ + ldout(cct, 10) << "build_simple on " << nosd + << " osds" << dendl; + epoch = e; + set_fsid(fsid); + created = modified = ceph_clock_now(); + + if (nosd >= 0) { + set_max_osd(nosd); + } else { + // count osds + int maxosd = 0; + const auto& conf = cct->_conf; + vector<string> sections; + conf.get_all_sections(sections); + + for (auto §ion : sections) { + if (section.find("osd.") != 0) + continue; + + const char *begin = section.c_str() + 4; + char *end = (char*)begin; + int o = strtol(begin, &end, 10); + if (*end != '\0') + continue; + + if (o > cct->_conf->mon_max_osd) { + lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl; + return -ERANGE; + } + + if (o > maxosd) + maxosd = o; + } + + set_max_osd(maxosd + 1); + } + + + stringstream ss; + int r; + if (nosd >= 0) + r = build_simple_crush_map(cct, *crush, nosd, &ss); + else + r = build_simple_crush_map_from_conf(cct, *crush, &ss); + ceph_assert(r == 0); + + int poolbase = get_max_osd() ? 
get_max_osd() : 1; + + const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_rule(cct); + ceph_assert(default_replicated_rule >= 0); + + if (default_pool) { + // pgp_num <= pg_num + if (pgp_bits > pg_bits) + pgp_bits = pg_bits; + + vector<string> pool_names; + pool_names.push_back("rbd"); + for (auto &plname : pool_names) { + int64_t pool = ++pool_max; + pools[pool].type = pg_pool_t::TYPE_REPLICATED; + pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL); + if (cct->_conf->osd_pool_default_flag_nodelete) + pools[pool].set_flag(pg_pool_t::FLAG_NODELETE); + if (cct->_conf->osd_pool_default_flag_nopgchange) + pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE); + if (cct->_conf->osd_pool_default_flag_nosizechange) + pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + if (cct->_conf->osd_pool_default_flag_bulk) + pools[pool].set_flag(pg_pool_t::FLAG_BULK); + pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size"); + pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size( + pools[pool].size); + pools[pool].crush_rule = default_replicated_rule; + pools[pool].object_hash = CEPH_STR_HASH_RJENKINS; + pools[pool].set_pg_num(poolbase << pg_bits); + pools[pool].set_pgp_num(poolbase << pgp_bits); + pools[pool].set_pg_num_target(poolbase << pg_bits); + pools[pool].set_pgp_num_target(poolbase << pgp_bits); + pools[pool].last_change = epoch; + pools[pool].application_metadata.insert( + {pg_pool_t::APPLICATION_NAME_RBD, {}}); + if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name( + cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode")); + m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + pools[pool].pg_autoscale_mode = m; + } else { + pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF; + } + pool_name[pool] = plname; + name_pool[plname] = pool; + } + } + + map<string,string> profile_map; + r = get_erasure_code_profile_default(cct, profile_map, &ss); + if (r < 0) { + lderr(cct) << ss.str() << dendl; + return r; + } + set_erasure_code_profile("default", profile_map); + return 0; +} + +int OSDMap::get_erasure_code_profile_default(CephContext *cct, + map<string,string> &profile_map, + ostream *ss) +{ + int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"), + *ss, + &profile_map); + return r; +} + +int OSDMap::_build_crush_types(CrushWrapper& crush) +{ + crush.set_type_name(0, "osd"); + crush.set_type_name(1, "host"); + crush.set_type_name(2, "chassis"); + crush.set_type_name(3, "rack"); + crush.set_type_name(4, "row"); + crush.set_type_name(5, "pdu"); + crush.set_type_name(6, "pod"); + crush.set_type_name(7, "room"); + crush.set_type_name(8, "datacenter"); + crush.set_type_name(9, "zone"); + crush.set_type_name(10, "region"); + crush.set_type_name(11, "root"); + return 11; +} + +int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int nosd, ostream *ss) +{ + crush.create(); + + // root + int root_type = _build_crush_types(crush); + int rootid; + int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, + root_type, 0, NULL, NULL, &rootid); + ceph_assert(r == 0); + crush.set_item_name(rootid, "default"); + + map<string,string> loc{ + {"host", "localhost"}, + {"rack", "localrack"}, + {"root", "default"} + }; + for (int o=0; o<nosd; o++) { + ldout(cct, 10) << " adding osd." 
<< o << " at " << loc << dendl; + char name[32]; + snprintf(name, sizeof(name), "osd.%d", o); + crush.insert_item(cct, o, 1.0, name, loc); + } + + build_simple_crush_rules(cct, crush, "default", ss); + + crush.finalize(); + + return 0; +} + +int OSDMap::build_simple_crush_map_from_conf(CephContext *cct, + CrushWrapper& crush, + ostream *ss) +{ + const auto& conf = cct->_conf; + + crush.create(); + + // root + int root_type = _build_crush_types(crush); + int rootid; + int r = crush.add_bucket(0, 0, + CRUSH_HASH_DEFAULT, + root_type, 0, NULL, NULL, &rootid); + ceph_assert(r == 0); + crush.set_item_name(rootid, "default"); + + // add osds + vector<string> sections; + conf.get_all_sections(sections); + + for (auto §ion : sections) { + if (section.find("osd.") != 0) + continue; + + const char *begin = section.c_str() + 4; + char *end = (char*)begin; + int o = strtol(begin, &end, 10); + if (*end != '\0') + continue; + + string host, rack, row, room, dc, pool; + vector<string> sectiontmp; + sectiontmp.push_back("osd"); + sectiontmp.push_back(section); + conf.get_val_from_conf_file(sectiontmp, "host", host, false); + conf.get_val_from_conf_file(sectiontmp, "rack", rack, false); + conf.get_val_from_conf_file(sectiontmp, "row", row, false); + conf.get_val_from_conf_file(sectiontmp, "room", room, false); + conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false); + conf.get_val_from_conf_file(sectiontmp, "root", pool, false); + + if (host.length() == 0) + host = "unknownhost"; + if (rack.length() == 0) + rack = "unknownrack"; + + map<string,string> loc; + loc["host"] = host; + loc["rack"] = rack; + if (row.size()) + loc["row"] = row; + if (room.size()) + loc["room"] = room; + if (dc.size()) + loc["datacenter"] = dc; + loc["root"] = "default"; + + ldout(cct, 5) << " adding osd." 
<< o << " at " << loc << dendl; + crush.insert_item(cct, o, 1.0, section, loc); + } + + build_simple_crush_rules(cct, crush, "default", ss); + + crush.finalize(); + + return 0; +} + + +int OSDMap::build_simple_crush_rules( + CephContext *cct, + CrushWrapper& crush, + const string& root, + ostream *ss) +{ + int crush_rule = crush.get_osd_pool_default_crush_replicated_rule(cct); + string failure_domain = + crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type); + + int r; + r = crush.add_simple_rule_at( + "replicated_rule", root, failure_domain, "", + "firstn", pg_pool_t::TYPE_REPLICATED, + crush_rule, ss); + if (r < 0) + return r; + // do not add an erasure rule by default or else we will implicitly + // require the crush_v2 feature of clients + return 0; +} + +int OSDMap::summarize_mapping_stats( + OSDMap *newmap, + const set<int64_t> *pools, + std::string *out, + Formatter *f) const +{ + set<int64_t> ls; + if (pools) { + ls = *pools; + } else { + for (auto &p : get_pools()) + ls.insert(p.first); + } + + unsigned total_pg = 0; + unsigned moved_pg = 0; + vector<unsigned> base_by_osd(get_max_osd(), 0); + vector<unsigned> new_by_osd(get_max_osd(), 0); + for (int64_t pool_id : ls) { + const pg_pool_t *pi = get_pg_pool(pool_id); + vector<int> up, up2; + int up_primary; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, pool_id); + total_pg += pi->get_size(); + pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr); + for (int osd : up) { + if (osd >= 0 && osd < get_max_osd()) + ++base_by_osd[osd]; + } + if (newmap) { + newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr); + for (int osd : up2) { + if (osd >= 0 && osd < get_max_osd()) + ++new_by_osd[osd]; + } + if (pi->is_erasure()) { + for (unsigned i=0; i<up.size(); ++i) { + if (up[i] != up2[i]) { + ++moved_pg; + } + } + } else if (pi->is_replicated()) { + for (int osd : up) { + if (std::find(up2.begin(), up2.end(), osd) == up2.end()) { + ++moved_pg; + } + } + } else { + ceph_abort_msg("unhandled pool type"); + } + } + } + } + + unsigned num_up_in = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) + ++num_up_in; + } + if (!num_up_in) { + return -EINVAL; + } + + float avg_pg = (float)total_pg / (float)num_up_in; + float base_stddev = 0, new_stddev = 0; + int min = -1, max = -1; + unsigned min_base_pg = 0, max_base_pg = 0; + unsigned min_new_pg = 0, max_new_pg = 0; + for (int osd = 0; osd < get_max_osd(); ++osd) { + if (is_up(osd) && is_in(osd)) { + float base_diff = (float)base_by_osd[osd] - avg_pg; + base_stddev += base_diff * base_diff; + float new_diff = (float)new_by_osd[osd] - avg_pg; + new_stddev += new_diff * new_diff; + if (min < 0 || base_by_osd[osd] < min_base_pg) { + min = osd; + min_base_pg = base_by_osd[osd]; + min_new_pg = new_by_osd[osd]; + } + if (max < 0 || base_by_osd[osd] > max_base_pg) { + max = osd; + max_base_pg = base_by_osd[osd]; + max_new_pg = new_by_osd[osd]; + } + } + } + base_stddev = sqrt(base_stddev / num_up_in); + new_stddev = sqrt(new_stddev / num_up_in); + + float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in))); + + ostringstream ss; + if (f) + f->open_object_section("utilization"); + if (newmap) { + if (f) { + f->dump_unsigned("moved_pgs", moved_pg); + f->dump_unsigned("total_pgs", total_pg); + } else { + float percent = 0; + if (total_pg) + percent = (float)moved_pg * 100.0 / (float)total_pg; + ss << "moved " << moved_pg << " / " << total_pg + << " (" << percent << "%)\n"; + } + } + if (f) { + 
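+    // The summary below reports the average PGs per up+in OSD, the measured
+    // standard deviation of the per-OSD PG counts, and an expected baseline:
+    // sqrt(avg_pg * (1 - 1/num_up_in)), i.e. the deviation a uniformly random
+    // placement of total_pg mappings across num_up_in OSDs would produce.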
f->dump_float("avg_pgs", avg_pg); + f->dump_float("std_dev", base_stddev); + f->dump_float("expected_baseline_std_dev", edev); + if (newmap) + f->dump_float("new_std_dev", new_stddev); + } else { + ss << "avg " << avg_pg << "\n"; + ss << "stddev " << base_stddev; + if (newmap) + ss << " -> " << new_stddev; + ss << " (expected baseline " << edev << ")\n"; + } + if (min >= 0) { + if (f) { + f->dump_unsigned("min_osd", min); + f->dump_unsigned("min_osd_pgs", min_base_pg); + if (newmap) + f->dump_unsigned("new_min_osd_pgs", min_new_pg); + } else { + ss << "min osd." << min << " with " << min_base_pg; + if (newmap) + ss << " -> " << min_new_pg; + ss << " pgs (" << (float)min_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)min_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (max >= 0) { + if (f) { + f->dump_unsigned("max_osd", max); + f->dump_unsigned("max_osd_pgs", max_base_pg); + if (newmap) + f->dump_unsigned("new_max_osd_pgs", max_new_pg); + } else { + ss << "max osd." << max << " with " << max_base_pg; + if (newmap) + ss << " -> " << max_new_pg; + ss << " pgs (" << (float)max_base_pg / avg_pg; + if (newmap) + ss << " -> " << (float)max_new_pg / avg_pg; + ss << " * mean)\n"; + } + } + if (f) + f->close_section(); + if (out) + *out = ss.str(); + return 0; +} + +bool OSDMap::try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const set<int>& overfull, ///< osds we'd want to evacuate + const vector<int>& underfull, ///< osds to move to, in order of preference + const vector<int>& more_underfull, ///< more osds only slightly underfull + vector<int> *orig, + vector<int> *out) ///< resulting alternative mapping +{ + const pg_pool_t *pool = get_pg_pool(pg.pool()); + if (!pool) + return false; + int rule = pool->get_crush_rule(); + if (rule < 0) + return false; + + // make sure there is something there to remap + bool any = false; + for (auto osd : *orig) { + if (overfull.count(osd)) { + any = true; + break; + } + } + if (!any) { + return false; + } + + int r = crush->try_remap_rule( + cct, + rule, + pool->get_size(), + overfull, underfull, + more_underfull, + *orig, + out); + if (r < 0) + return false; + if (*out == *orig) + return false; + return true; +} + + +int OSDMap::balance_primaries( + CephContext *cct, + int64_t pid, + OSDMap::Incremental *pending_inc, + OSDMap& tmp_osd_map) const +{ + // This function only handles replicated pools. + const pg_pool_t* pool = get_pg_pool(pid); + if (! pool->is_replicated()) { + ldout(cct, 10) << __func__ << " skipping erasure pool " + << get_pool_name(pid) << dendl; + return -EINVAL; + } + + // Info to be used in verify_upmap + int pool_size = pool->get_size(); + int crush_rule = pool->get_crush_rule(); + + // Get pgs by osd (map of osd -> pgs) + // Get primaries by osd (map of osd -> primary) + map<uint64_t,set<pg_t>> pgs_by_osd; + map<uint64_t,set<pg_t>> prim_pgs_by_osd; + map<uint64_t,set<pg_t>> acting_prims_by_osd; + pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pid, &prim_pgs_by_osd, &acting_prims_by_osd); + + // Construct information about the pgs and osds we will consider in new primary mappings, + // as well as a map of all pgs and their original primary osds. 
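+  // prim_pgs_to_check: every PG whose primary may be moved, with a flag
+  //   recording whether the balancer has moved its primary away from the
+  //   original OSD.
+  // osds_to_check:     the OSDs that currently hold at least one primary.
+  // orig_prims:        the primary each PG started with, so a later swap
+  //   back to the original OSD can simply drop the pending upmap entry.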
+ map<pg_t,bool> prim_pgs_to_check; + vector<uint64_t> osds_to_check; + map<pg_t, uint64_t> orig_prims; + for (const auto & [osd, pgs] : prim_pgs_by_osd) { + osds_to_check.push_back(osd); + for (const auto & pg : pgs) { + prim_pgs_to_check.insert({pg, false}); + orig_prims.insert({pg, osd}); + } + } + + // calculate desired primary distribution for each osd + map<uint64_t,float> desired_prim_dist; + int rc = 0; + rc = calc_desired_primary_distribution(cct, pid, osds_to_check, desired_prim_dist); + if (rc < 0) { + ldout(cct, 10) << __func__ << " Error in calculating desired primary distribution" << dendl; + return -EINVAL; + } + map<uint64_t,float> prim_dist_scores; + float actual; + float desired; + for (auto osd : osds_to_check) { + actual = prim_pgs_by_osd[osd].size(); + desired = desired_prim_dist[osd]; + prim_dist_scores[osd] = actual - desired; + ldout(cct, 10) << __func__ << " desired distribution for osd." << osd << " " << desired << dendl; + } + + // get read balance score before balancing + float read_balance_score_before = 0.0; + read_balance_info_t rb_info; + rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) { + read_balance_score_before = rb_info.adjusted_score; + } + if (rb_info.err_msg.length() > 0) { + ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl; + return -EINVAL; + } + + // get ready to swap pgs + while (true) { + int curr_num_changes = 0; + vector<int> up_osds; + vector<int> acting_osds; + int up_primary, acting_primary; + for (const auto & [pg, mapped] : prim_pgs_to_check) { + // fill in the up, up primary, acting, and acting primary for the current PG + tmp_osd_map.pg_to_up_acting_osds(pg, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + // find the OSD that would make the best swap based on its score + // We start by first testing the OSD that is currently primary for the PG we are checking. + uint64_t curr_best_osd = up_primary; + float prim_score = prim_dist_scores[up_primary]; + for (auto potential_osd : up_osds) { + float potential_score = prim_dist_scores[potential_osd]; + if ((prim_score > 0) && // taking 1 pg from the prim would not make its score worse + (potential_score < 0) && // adding 1 pg to the potential would not make its score worse + ((prim_score - potential_score) > 1) && // swapping a pg would not just keep the scores the same + (desired_prim_dist[potential_osd] > 0)) // the potential is not off limits (the primary affinity is above 0) + { + curr_best_osd = potential_osd; + } + } + + // Make the swap only if: + // 1. The swap is legal + // 2. The balancer has chosen a new primary + auto legal_swap = crush->verify_upmap(cct, + crush_rule, + pool_size, + {(int)curr_best_osd}); + if (legal_swap >= 0 && + ((int)curr_best_osd != up_primary)) { + // Update prim_dist_scores + prim_dist_scores[curr_best_osd] += 1; + prim_dist_scores[up_primary] -= 1; + + // Update the mappings + tmp_osd_map.pg_upmap_primaries[pg] = curr_best_osd; + if (curr_best_osd == orig_prims[pg]) { + pending_inc->new_pg_upmap_primary.erase(pg); + prim_pgs_to_check[pg] = false; + } else { + pending_inc->new_pg_upmap_primary[pg] = curr_best_osd; + prim_pgs_to_check[pg] = true; // mark that this pg changed mappings + } + + curr_num_changes++; + } + ldout(cct, 20) << __func__ << " curr_num_changes: " << curr_num_changes << dendl; + } + // If there are no changes after one pass through the pgs, then no further optimizations can be made. 
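+    // Each accepted swap requires the two scores to differ by more than 1,
+    // so it strictly lowers the total squared deviation from the desired
+    // distribution; the passes therefore cannot cycle, and the loop stops
+    // as soon as a full pass makes no changes.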
+ if (curr_num_changes == 0) { + ldout(cct, 20) << __func__ << " curr_num_changes is 0; no further optimizations can be made." << dendl; + break; + } + } + + // get read balance score after balancing + float read_balance_score_after = 0.0; + rc = tmp_osd_map.calc_read_balance_score(cct, pid, &rb_info); + if (rc >= 0) { + read_balance_score_after = rb_info.adjusted_score; + } + if (rb_info.err_msg.length() > 0) { + ldout(cct, 10) << __func__ << (rc < 0 ? " ERROR: " : " Warning: ") << rb_info.err_msg << dendl; + return -EINVAL; + } + + // Tally total number of changes + int num_changes = 0; + if (read_balance_score_after < read_balance_score_before) { + for (auto [pg, mapped] : prim_pgs_to_check) { + if (mapped) { + num_changes++; + } + } + } + + ldout(cct, 10) << __func__ << " num_changes " << num_changes << dendl; + return num_changes; +} + +int OSDMap::calc_desired_primary_distribution( + CephContext *cct, + int64_t pid, + const vector<uint64_t> &osds, + std::map<uint64_t, float>& desired_primary_distribution) const +{ + // will return a perfect distribution of floats + // without calculating the floor of each value + // + // This function only handles replicated pools. + const pg_pool_t* pool = get_pg_pool(pid); + if (pool->is_replicated()) { + ldout(cct, 20) << __func__ << " calculating distribution for replicated pool " + << get_pool_name(pid) << dendl; + uint64_t replica_count = pool->get_size(); + + map<uint64_t,set<pg_t>> pgs_by_osd; + pgs_by_osd = get_pgs_by_osd(cct, pid); + + // First calculate the distribution using primary affinity and tally up the sum + auto distribution_sum = 0.0; + for (const auto & osd : osds) { + float osd_primary_count = ((float)pgs_by_osd[osd].size() / (float)replica_count) * get_primary_affinityf(osd); + desired_primary_distribution.insert({osd, osd_primary_count}); + distribution_sum += osd_primary_count; + } + if (distribution_sum <= 0) { + ldout(cct, 10) << __func__ << " Unable to calculate primary distribution, likely because primary affinity is" + << " set to 0 on all OSDs." 
<< dendl; + return -EINVAL; + } + + // Then, stretch the value (necessary when primary affinity is smaller than 1) + float factor = (float)pool->get_pg_num() / (float)distribution_sum; + float distribution_sum_desired = 0.0; + + ceph_assert(factor >= 1.0); + for (const auto & [osd, osd_primary_count] : desired_primary_distribution) { + desired_primary_distribution[osd] *= factor; + distribution_sum_desired += desired_primary_distribution[osd]; + } + ceph_assert(fabs(distribution_sum_desired - pool->get_pg_num()) < 0.01); + } else { + ldout(cct, 10) << __func__ <<" skipping erasure pool " + << get_pool_name(pid) << dendl; + return -EINVAL; + } + + return 0; +} + +int OSDMap::calc_pg_upmaps( + CephContext *cct, + uint32_t max_deviation, + int max, + const set<int64_t>& only_pools, + OSDMap::Incremental *pending_inc, + std::random_device::result_type *p_seed) +{ + ldout(cct, 10) << __func__ << " pools " << only_pools << dendl; + OSDMap tmp_osd_map; + // Can't be less than 1 pg + if (max_deviation < 1) + max_deviation = 1; + tmp_osd_map.deepish_copy_from(*this); + int num_changed = 0; + map<int,set<pg_t>> pgs_by_osd; + int total_pgs = 0; + float osd_weight_total = 0; + map<int,float> osd_weight; + + if (max <= 0) { + lderr(cct) << __func__ << " abort due to max <= 0" << dendl; + return 0; + } + + osd_weight_total = build_pool_pgs_info(cct, only_pools, tmp_osd_map, + total_pgs, pgs_by_osd, osd_weight); + if (osd_weight_total == 0) { + lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl; + return 0; + } + + float pgs_per_weight = total_pgs / osd_weight_total; + ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl; + ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl; + + float stddev = 0; + map<int,float> osd_deviation; // osd, deviation(pgs) + multimap<float,int> deviation_osd; // deviation(pgs), osd + float cur_max_deviation = calc_deviations(cct, pgs_by_osd, osd_weight, pgs_per_weight, + osd_deviation, deviation_osd, stddev); + + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " distribution is almost perfect" + << dendl; + return 0; + } + + bool skip_overfull = false; + auto aggressive = + cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively"); + auto fast_aggressive = aggressive && + cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively_fast"); + auto local_fallback_retries = + cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries"); + + while (max--) { + ldout(cct, 30) << "Top of loop #" << max+1 << dendl; + // build overfull and underfull + set<int> overfull; + set<int> more_overfull; + bool using_more_overfull = false; + vector<int> underfull; + vector<int> more_underfull; + fill_overfull_underfull(cct, deviation_osd, max_deviation, + overfull, more_overfull, + underfull, more_underfull); + + if (underfull.empty() && overfull.empty()) { + ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl; + break; + } + if (overfull.empty() && !underfull.empty()) { + ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl; + overfull = more_overfull; + using_more_overfull = true; + } + + ldout(cct, 10) << " overfull " << overfull + << " underfull " << underfull + << dendl; + set<pg_t> to_skip; + uint64_t local_fallback_retried = 0; + + // Used to prevent some of the unsuccessful loop iterations (save runtime) + // If we can't find a change per OSD 
we skip further iterations for this OSD + uint n_changes = 0, prev_n_changes = 0; + set<int> osd_to_skip; + + retry: + + set<pg_t> to_unmap; + map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap; + auto temp_pgs_by_osd = pgs_by_osd; + // always start with fullest, break if we find any changes to make + for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) { + if (skip_overfull && !underfull.empty()) { + ldout(cct, 10) << " skipping overfull " << dendl; + break; // fall through to check underfull + } + int osd = p->second; + float deviation = p->first; + if (fast_aggressive && osd_to_skip.count(osd)) { + ldout(cct, 20) << " Fast aggressive mode: skipping osd " << osd + << " osd_to_skip size = " << osd_to_skip.size() << dendl; + continue; + } + + if (deviation < 0) { + ldout(cct, 10) << " hitting underfull osds now" + << " when trying to remap overfull osds" + << dendl; + break; + } + float target = osd_weight[osd] * pgs_per_weight; + ldout(cct, 10) << " Overfull search osd." << osd + << " target " << target + << " deviation " << deviation + << dendl; + ceph_assert(target > 0); + if (!using_more_overfull && deviation <= max_deviation) { + ldout(cct, 10) << " osd." << osd + << " target " << target + << " deviation " << deviation + << " < max deviation " << max_deviation + << dendl; + break; + } + + vector<pg_t> pgs; + pgs.reserve(pgs_by_osd[osd].size()); + for (auto& pg : pgs_by_osd[osd]) { + if (to_skip.count(pg)) + continue; + pgs.push_back(pg); + } + if (aggressive) { + // shuffle PG list so they all get equal (in)attention + std::shuffle(pgs.begin(), pgs.end(), get_random_engine(cct, p_seed)); + } + // look for remaps we can un-remap + if (try_drop_remap_overfull(cct, pgs, tmp_osd_map, osd, + temp_pgs_by_osd, to_unmap, to_upmap)) + goto test_change; + + // try upmap + for (auto pg : pgs) { + auto temp_it = tmp_osd_map.pg_upmap.find(pg); + if (temp_it != tmp_osd_map.pg_upmap.end()) { + // leave pg_upmap alone + // it must be specified by admin since balancer does not + // support pg_upmap yet + ldout(cct, 10) << " " << pg << " already has pg_upmap " + << temp_it->second << ", skipping" + << dendl; + continue; + } + auto pg_pool_size = tmp_osd_map.get_pg_pool_size(pg); + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + set<int> existing; + auto it = tmp_osd_map.pg_upmap_items.find(pg); + if (it != tmp_osd_map.pg_upmap_items.end()) { + auto& um_items = it->second; + if (um_items.size() >= (size_t)pg_pool_size) { + ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items " + << um_items << ", skipping" + << dendl; + continue; + } else { + ldout(cct, 10) << " " << pg << " already has pg_upmap_items " + << um_items + << dendl; + new_upmap_items = um_items; + // build existing too (for dedup) + for (auto [um_from, um_to] : um_items) { + existing.insert(um_from); + existing.insert(um_to); + } + } + // fall through + // to see if we can append more remapping pairs + } + ldout(cct, 10) << " trying " << pg << dendl; + vector<int> raw, orig, out; + tmp_osd_map.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too + if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) { + continue; + } + ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl; + if (orig.size() != out.size()) { + continue; + } + ceph_assert(orig != out); + int pos = find_best_remap(cct, orig, out, existing, osd_deviation); + if (pos != -1) { + // append new remapping pairs slowly + // This way we can make sure that each tiny 
change will + // definitely make distribution of PGs converging to + // the perfect status. + add_remap_pair(cct, orig[pos], out[pos], pg, (size_t)pg_pool_size, + osd, existing, temp_pgs_by_osd, + new_upmap_items, to_upmap); + goto test_change; + } + } + if (fast_aggressive) { + if (prev_n_changes == n_changes) { // no changes for prev OSD + osd_to_skip.insert(osd); + } + else { + prev_n_changes = n_changes; + } + } + + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for overfull osds" + << dendl; + for (auto& [deviation, osd] : deviation_osd) { + if (std::find(underfull.begin(), underfull.end(), osd) == + underfull.end()) + break; + float target = osd_weight[osd] * pgs_per_weight; + ceph_assert(target > 0); + if (fabsf(deviation) < max_deviation) { + // respect max_deviation too + ldout(cct, 10) << " osd." << osd + << " target " << target + << " deviation " << deviation + << " -> absolute " << fabsf(deviation) + << " < max " << max_deviation + << dendl; + break; + } + // look for remaps we can un-remap + candidates_t candidates = build_candidates(cct, tmp_osd_map, to_skip, + only_pools, aggressive, p_seed); + if (try_drop_remap_underfull(cct, candidates, osd, temp_pgs_by_osd, + to_unmap, to_upmap)) { + goto test_change; + } + } + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for underfull osds" + << dendl; + if (!aggressive) { + ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl; + break; + } else if (!skip_overfull) { + // safe to quit because below here we know + // we've done checking both overfull and underfull osds.. + ldout(cct, 10) << " break due to not being able to find any" + << " further optimizations" + << dendl; + break; + } + // restart with fullest and do exhaustive searching + skip_overfull = false; + continue; + + test_change: + + // test change, apply if change is good + ceph_assert(to_unmap.size() || to_upmap.size()); + float new_stddev = 0; + map<int,float> temp_osd_deviation; + multimap<float,int> temp_deviation_osd; + float cur_max_deviation = calc_deviations(cct, temp_pgs_by_osd, osd_weight, + pgs_per_weight, temp_osd_deviation, + temp_deviation_osd, new_stddev); + ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl; + if (new_stddev >= stddev) { + if (!aggressive) { + ldout(cct, 10) << " break because stddev is not decreasing" + << " and aggressive mode is not enabled" + << dendl; + break; + } + local_fallback_retried++; + if (local_fallback_retried >= local_fallback_retries) { + // does not make progress + // flip *skip_overfull* so both overfull and underfull + // get equal (in)attention + skip_overfull = !skip_overfull; + ldout(cct, 10) << " hit local_fallback_retries " + << local_fallback_retries + << dendl; + continue; + } + for (auto& i : to_unmap) + to_skip.insert(i); + for (auto& i : to_upmap) + to_skip.insert(i.first); + ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried + << " to_skip " << to_skip + << dendl; + goto retry; + } + + // ready to go + ceph_assert(new_stddev < stddev); + stddev = new_stddev; + pgs_by_osd = temp_pgs_by_osd; + osd_deviation = temp_osd_deviation; + deviation_osd = temp_deviation_osd; + n_changes++; + + + num_changed += pack_upmap_results(cct, to_unmap, to_upmap, tmp_osd_map, pending_inc); + + ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl; + if (cur_max_deviation <= max_deviation) { + ldout(cct, 10) << __func__ << " 
Optimization plan is almost perfect" + << dendl; + break; + } + } + ldout(cct, 10) << " num_changed = " << num_changed << dendl; + return num_changed; +} + +map<uint64_t,set<pg_t>> OSDMap::get_pgs_by_osd( + CephContext *cct, + int64_t pid, + map<uint64_t, set<pg_t>> *p_primaries_by_osd, + map<uint64_t, set<pg_t>> *p_acting_primaries_by_osd) const +{ + // Set up the OSDMap + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(*this); + + // Get the pool from the provided pool id + const pg_pool_t* pool = get_pg_pool(pid); + + // build array of pgs from the pool + map<uint64_t,set<pg_t>> pgs_by_osd; + for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) { + pg_t pg(ps, pid); + vector<int> up; + int primary; + int acting_prim; + tmp_osd_map.pg_to_up_acting_osds(pg, &up, &primary, nullptr, &acting_prim); + if (cct != nullptr) + ldout(cct, 20) << __func__ << " " << pg + << " up " << up + << " primary " << primary + << " acting_primary " << acting_prim + << dendl; + + if (!up.empty()) { // up can be empty is test generated files + // in this case, we return empty result + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + if (p_primaries_by_osd != nullptr) { + if (primary != CRUSH_ITEM_NONE) + (*p_primaries_by_osd)[primary].insert(pg); + } + if (p_acting_primaries_by_osd != nullptr) { + if (acting_prim != CRUSH_ITEM_NONE) + (*p_acting_primaries_by_osd)[acting_prim].insert(pg); + } + } + } + return pgs_by_osd; +} + +float OSDMap::get_osds_weight( + CephContext *cct, + const OSDMap& tmp_osd_map, + int64_t pid, + map<int,float>& osds_weight) const +{ + map<int,float> pmap; + ceph_assert(pools.count(pid)); + int ruleno = pools.at(pid).get_crush_rule(); + tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &pmap); + ldout(cct,20) << __func__ << " pool " << pid + << " ruleno " << ruleno + << " weight-map " << pmap + << dendl; + float osds_weight_total = 0; + for (auto [oid, oweight] : pmap) { + auto adjusted_weight = tmp_osd_map.get_weightf(oid) * oweight; + if (adjusted_weight != 0) { + osds_weight[oid] += adjusted_weight; + osds_weight_total += adjusted_weight; + } + } + return osds_weight_total; +} + +float OSDMap::build_pool_pgs_info ( + CephContext *cct, + const std::set<int64_t>& only_pools, ///< [optional] restrict to pool + const OSDMap& tmp_osd_map, + int& total_pgs, + map<int,set<pg_t>>& pgs_by_osd, + map<int,float>& osds_weight) +{ + // + // This function builds some data structures that are used by calc_pg_upmaps. + // Specifically it builds pgs_by_osd and osd_weight maps, updates total_pgs + // and returns the osd_weight_total + // + float osds_weight_total = 0.0; + for (auto& [pid, pdata] : pools) { + if (!only_pools.empty() && !only_pools.count(pid)) + continue; + for (unsigned ps = 0; ps < pdata.get_pg_num(); ++ps) { + pg_t pg(ps, pid); + vector<int> up; + tmp_osd_map.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + total_pgs += pdata.get_size() * pdata.get_pg_num(); + + osds_weight_total = get_osds_weight(cct, tmp_osd_map, pid, osds_weight); + } + for (auto& [oid, oweight] : osds_weight) { + int pgs = 0; + auto p = pgs_by_osd.find(oid); + if (p != pgs_by_osd.end()) + pgs = p->second.size(); + else + pgs_by_osd.emplace(oid, set<pg_t>()); + ldout(cct, 20) << " osd." 
<< oid << " weight " << oweight
+		   << " pgs " << pgs << dendl;
+  }
+  return osds_weight_total;
+
+} // return total weight of all OSDs
+
+float OSDMap::calc_deviations (
+  CephContext *cct,
+  const map<int,set<pg_t>>& pgs_by_osd,
+  const map<int,float>& osd_weight,
+  float pgs_per_weight,
+  map<int,float>& osd_deviation,
+  multimap<float,int>& deviation_osd,
+  float& stddev)  // return current max deviation
+{
+  //
+  // This function calculates the 2 maps osd_deviation and deviation_osd which
+  // hold the deviation between the current number of PGs which map to an OSD
+  // and the optimal number. It also calculates the stddev of the deviations and
+  // returns the current max deviation.
+  // NOTE - the calculation is not exactly stddev it is actually stddev^2 but as
+  // long as it is monotonic with stddev (and it is), it is sufficient for
+  // the balancer code.
+  //
+  float cur_max_deviation = 0.0;
+  stddev = 0.0;
+  for (auto& [oid, opgs] : pgs_by_osd) {
+    // make sure osd is still there (belongs to this crush-tree)
+    ceph_assert(osd_weight.count(oid));
+    float target = osd_weight.at(oid) * pgs_per_weight;
+    float deviation = (float)opgs.size() - target;
+    ldout(cct, 20) << " osd." << oid
+		   << "\tpgs " << opgs.size()
+		   << "\ttarget " << target
+		   << "\tdeviation " << deviation
+		   << dendl;
+    osd_deviation[oid] = deviation;
+    deviation_osd.insert(make_pair(deviation, oid));
+    stddev += deviation * deviation;
+    if (fabsf(deviation) > cur_max_deviation)
+      cur_max_deviation = fabsf(deviation);
+  }
+  return cur_max_deviation;
+}
+
+void OSDMap::fill_overfull_underfull (
+  CephContext *cct,
+  const std::multimap<float,int>& deviation_osd,
+  int max_deviation,
+  std::set<int>& overfull,
+  std::set<int>& more_overfull,
+  std::vector<int>& underfull,
+  std::vector<int>& more_underfull)
+{
+  //
+  // This function just fills the overfull and underfull data structures for the
+  // use of calc_pg_upmaps
+  //
+  for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
+    auto& odev = i->first;
+    auto& oid = i->second;
+    ldout(cct, 30) << " check " << odev << " <= " << max_deviation << dendl;
+    if (odev <= 0)
+      break;
+    if (odev > max_deviation) {
+      ldout(cct, 30) << " add overfull osd." << oid << dendl;
+      overfull.insert(oid);
+    } else {
+      more_overfull.insert(oid);
+    }
+  }
+
+  for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
+    auto& odev = i->first;
+    auto& oid = i->second;
+    ldout(cct, 30) << " check " << odev << " >= " << -(int)max_deviation << dendl;
+    if (odev >= 0)
+      break;
+    if (odev < -(int)max_deviation) {
+      ldout(cct, 30) << " add underfull osd."
<< oid << dendl; + underfull.push_back(oid); + } else { + more_underfull.push_back(oid); + } + } +} + +int OSDMap::pack_upmap_results( + CephContext *cct, + const std::set<pg_t>& to_unmap, + const std::map<pg_t, mempool::osdmap::vector<std::pair<int, int>>>& to_upmap, + OSDMap& tmp_osd_map, + OSDMap::Incremental *pending_inc) +{ + // + // This function takes the input from the local variables to_unmap and to_upmap + // and updates tmp_osd_map (so that another iteration can run) and pending_inc + // (so that the results are visible outside calc_pg_upmaps) + // + int num_changed = 0; + for (auto& i : to_unmap) { + ldout(cct, 10) << " unmap pg " << i << dendl; + ceph_assert(tmp_osd_map.pg_upmap_items.count(i)); + tmp_osd_map.pg_upmap_items.erase(i); + pending_inc->old_pg_upmap_items.insert(i); + ++num_changed; + } + for (auto& [pg, um_items] : to_upmap) { + ldout(cct, 10) << " upmap pg " << pg + << " new pg_upmap_items " << um_items + << dendl; + tmp_osd_map.pg_upmap_items[pg] = um_items; + pending_inc->new_pg_upmap_items[pg] = um_items; + ++num_changed; + } + + return num_changed; +} + +std::default_random_engine OSDMap::get_random_engine( + CephContext *cct, + std::random_device::result_type *p_seed) +{ + // + // This function creates a random_engine to be used for shuffling. + // When p_seed == nullptr it generates random engine with a seed from /dev/random + // when p_seed is not null, it uses (*p_seed + seed_set) as the seed and + // increments seed_set. This is used in order to craete regression test without + // random effect on the results. + // + static std::random_device::result_type seed_set = 0; + std::random_device::result_type seed; + if (p_seed == nullptr) { + std::random_device rd; + seed = rd(); + } + else { + seed = *p_seed + seed_set; + ldout(cct, 30) << " Starting random engine with seed " + << seed << dendl; + seed_set++; + } + return std::default_random_engine{seed}; +} + +bool OSDMap::try_drop_remap_overfull( + CephContext *cct, + const std::vector<pg_t>& pgs, + const OSDMap& tmp_osd_map, + int osd, + map<int,std::set<pg_t>>& temp_pgs_by_osd, + set<pg_t>& to_unmap, + map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap) +{ + // + // This function tries to drop existimg upmap items which map data to overfull + // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true + // if it found an item that can be dropped, false if not. + // + for (auto pg : pgs) { + auto p = tmp_osd_map.pg_upmap_items.find(pg); + if (p == tmp_osd_map.pg_upmap_items.end()) + continue; + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + auto& pg_upmap_items = p->second; + for (auto um_pair : pg_upmap_items) { + auto& um_from = um_pair.first; + auto& um_to = um_pair.second; + if (um_to == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << um_from << " -> " << um_to + << " which remapped " << pg + << " into overfull osd." << osd + << dendl; + temp_pgs_by_osd[um_to].erase(pg); + temp_pgs_by_osd[um_from].insert(pg); + } else { + new_upmap_items.push_back(um_pair); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items + << " remapped " << pg << " into overfull osd." 
<< osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + return true; + } else if (new_upmap_items.size() != pg_upmap_items.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < pg_upmap_items.size()); + ldout(cct, 10) << " existing pg_upmap_items " << pg_upmap_items + << " remapped " << pg << " into overfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + return true; + } + } + return false; +} + +bool OSDMap::try_drop_remap_underfull( + CephContext *cct, + const candidates_t& candidates, + int osd, + map<int,std::set<pg_t>>& temp_pgs_by_osd, + set<pg_t>& to_unmap, + map<pg_t, mempool::osdmap::vector<std::pair<int32_t,int32_t>>>& to_upmap) +{ + // + // This function tries to drop existimg upmap items which map data from underfull + // OSDs. It updates temp_pgs_by_osd, to_unmap and to_upmap and rerturns true + // if it found an item that can be dropped, false if not. + // + for (auto& [pg, um_pairs] : candidates) { + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items; + for (auto& ump : um_pairs) { + auto& um_from = ump.first; + auto& um_to = ump.second; + if (um_from == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << um_from << " -> " << um_to + << " which remapped " << pg + << " out from underfull osd." << osd + << dendl; + temp_pgs_by_osd[um_to].erase(pg); + temp_pgs_by_osd[um_from].insert(pg); + } else { + new_upmap_items.push_back(ump); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << um_pairs + << " remapped " << pg + << " out from underfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + return true; + } else if (new_upmap_items.size() != um_pairs.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < um_pairs.size()); + ldout(cct, 10) << " existing pg_upmap_items " << um_pairs + << " remapped " << pg + << " out from underfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + return true; + } + } + return false; +} + +void OSDMap::add_remap_pair( + CephContext *cct, + int orig, + int out, + pg_t pg, + size_t pg_pool_size, + int osd, + set<int>& existing, + map<int,set<pg_t>>& temp_pgs_by_osd, + mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items, + map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>>& to_upmap) +{ + // + // add a single remap pair (in pg <pg> remap osd from <orig> to <out>) to all + // the relevant data structures + // + ldout(cct, 10) << " will try adding new remapping pair " + << orig << " -> " << out << " for " << pg + << (orig != osd ? " NOT selected osd" : "") + << dendl; + existing.insert(orig); + existing.insert(out); + temp_pgs_by_osd[orig].erase(pg); + temp_pgs_by_osd[out].insert(pg); + ceph_assert(new_upmap_items.size() < pg_pool_size); + new_upmap_items.push_back(make_pair(orig, out)); + // append new remapping pairs slowly + // This way we can make sure that each tiny change will + // definitely make distribution of PGs converging to + // the perfect status. 
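+  // Note that new_upmap_items is taken by value, so the caller's copy is
+  // left untouched; the extended vector is only published via the to_upmap
+  // out-parameter (and temp_pgs_by_osd above is updated to reflect it).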
+ to_upmap[pg] = new_upmap_items; + +} + +int OSDMap::find_best_remap ( + CephContext *cct, + const vector<int>& orig, + const vector<int>& out, + const set<int>& existing, + const map<int,float> osd_deviation) +{ + // + // Find the best remap from the suggestions in orig and out - the best remap + // is the one which maps from the OSD with the largest deviatoion (from the + // OSDs which are part of orig) + // + int best_pos = -1; + float max_dev = 0; + for (unsigned i = 0; i < out.size(); ++i) { + if (orig[i] == out[i]) + continue; // skip invalid remappings + if (existing.count(orig[i]) || existing.count(out[i])) + continue; // we want new remappings only! + if (osd_deviation.at(orig[i]) > max_dev) { + max_dev = osd_deviation.at(orig[i]); + best_pos = i; + ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation.at(orig[i]) << dendl; + } + } + return best_pos; +} + +OSDMap::candidates_t OSDMap::build_candidates( + CephContext *cct, + const OSDMap& tmp_osd_map, + const set<pg_t> to_skip, + const set<int64_t>& only_pools, + bool aggressive, + std::random_device::result_type *p_seed) +{ + // + // build the candidates data structure + // + candidates_t candidates; + candidates.reserve(tmp_osd_map.pg_upmap_items.size()); + for (auto& [pg, um_pair] : tmp_osd_map.pg_upmap_items) { + if (to_skip.count(pg)) + continue; + if (!only_pools.empty() && !only_pools.count(pg.pool())) + continue; + candidates.push_back(make_pair(pg, um_pair)); + } + if (aggressive) { + // shuffle candidates so they all get equal (in)attention + std::shuffle(candidates.begin(), candidates.end(), get_random_engine(cct, p_seed)); + } + return candidates; +} + +// return -1 if all PGs are OK, else the first PG which includes only zero PA OSDs +int64_t OSDMap::has_zero_pa_pgs(CephContext *cct, int64_t pool_id) const +{ + const pg_pool_t* pool = get_pg_pool(pool_id); + for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) { + pg_t pg(ps, pool_id); + vector<int> acting; + pg_to_up_acting_osds(pg, nullptr, nullptr, &acting, nullptr); + if (cct != nullptr) { + ldout(cct, 30) << __func__ << " " << pg << " acting " << acting << dendl; + } + bool pg_zero_pa = true; + for (auto osd : acting) { + if (get_primary_affinityf(osd) != 0) { + pg_zero_pa = false; + break; + } + } + if (pg_zero_pa) { + if (cct != nullptr) { + ldout(cct, 20) << __func__ << " " << pg << " - maps only to OSDs with primiary affinity 0" << dendl; + } + return (int64_t)ps; + } + } + return -1; +} + +void OSDMap::zero_rbi(read_balance_info_t &rbi) const { + rbi.pa_avg = 0.; + rbi.pa_weighted = 0.; + rbi.pa_weighted_avg = 0.; + rbi.raw_score = 0.; + rbi.optimal_score = 0.; + rbi.adjusted_score = 0.; + rbi.acting_raw_score = 0.; + rbi.acting_adj_score = 0.; + rbi.err_msg = ""; +} + +int OSDMap::set_rbi( + CephContext *cct, + read_balance_info_t &rbi, + int64_t pool_id, + float total_w_pa, + float pa_sum, + int num_osds, + int osd_pa_count, + float total_osd_weight, + uint max_prims_per_osd, + uint max_acting_prims_per_osd, + float avg_prims_per_osd, + bool prim_on_zero_pa, + bool acting_on_zero_pa, + float max_osd_score) const +{ + // put all the ugly code here, so rest of code is nicer. + const pg_pool_t* pool = get_pg_pool(pool_id); + zero_rbi(rbi); + + if (total_w_pa / total_osd_weight < 1. / float(pool->get_size())) { + ldout(cct, 20) << __func__ << " pool " << pool_id << " average primary affinity is lower than" + << 1. 
/ float(pool->get_size()) << dendl; + rbi.err_msg = fmt::format( + "pool {} average primary affinity is lower than {:.2f}, read balance score is not reliable", + pool_id, 1. / float(pool->get_size())); + return -EINVAL; + } + rbi.pa_weighted = total_w_pa; + + // weighted_prim_affinity_avg + rbi.pa_weighted_avg = rbi_round(rbi.pa_weighted / total_osd_weight); // in [0..1] + // p_rbi->pa_weighted / osd_pa_count; // in [0..1] + + rbi.raw_score = rbi_round((float)max_prims_per_osd / avg_prims_per_osd); // >=1 + if (acting_on_zero_pa) { + rbi.acting_raw_score = rbi_round(max_osd_score); + rbi.err_msg = fmt::format( + "pool {} has acting primaries on OSD(s) with primary affinity 0, read balance score is not accurate", + pool_id); + } else { + rbi.acting_raw_score = rbi_round((float)max_acting_prims_per_osd / avg_prims_per_osd); + } + + if (osd_pa_count != 0) { + // this implies that pa_sum > 0 + rbi.pa_avg = rbi_round(pa_sum / osd_pa_count); // in [0..1] + } else { + rbi.pa_avg = 0.; + } + + if (rbi.pa_avg != 0.) { + int64_t zpg; + if ((zpg = has_zero_pa_pgs(cct, pool_id)) >= 0) { + pg_t pg(zpg, pool_id); + std::stringstream ss; + ss << pg; + ldout(cct, 10) << __func__ << " pool " << pool_id << " has some PGs where all OSDs are with primary_affinity 0 (" << pg << ",...)" << dendl; + rbi.err_msg = fmt::format( + "pool {} has some PGs where all OSDs are with primary_affinity 0 (at least pg {}), read balance score may not be reliable", + pool_id, ss.str()); + return -EINVAL; + } + rbi.optimal_score = rbi_round(float(num_osds) / float(osd_pa_count)); // >= 1 + // adjust the score to the primary affinity setting (if prim affinity is set + // the raw score can't be 1 and the optimal (perfect) score is hifgher than 1) + // When total system primary affinity is too low (average < 1 / pool replica count) + // the score is negative in order to grab the user's attention. + rbi.adjusted_score = rbi_round(rbi.raw_score / rbi.optimal_score); // >= 1 if PA is not low + rbi.acting_adj_score = rbi_round(rbi.acting_raw_score / rbi.optimal_score); // >= 1 if PA is not low + + } else { + // We should never get here - this condition is checked before calling this function - this is just sanity check code. + rbi.err_msg = fmt::format( + "pool {} all OSDs have zero primary affinity, can't calculate a reliable read balance score", + pool_id); + return -EINVAL; + } + + return 0; +} + +int OSDMap::calc_read_balance_score(CephContext *cct, int64_t pool_id, + read_balance_info_t *p_rbi) const +{ + //BUG: wrong score with one PG replica 3 and 4 OSDs + if (cct != nullptr) + ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) << dendl; + + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(*this); + if (p_rbi == nullptr) { + // The only case where error message is not set - this is not tested in the unit test. + if (cct != nullptr) + ldout(cct,30) << __func__ << " p_rbi is nullptr." << dendl; + return -EINVAL; + } + + if (tmp_osd_map.pools.count(pool_id) == 0) { + if (cct != nullptr) + ldout(cct,30) << __func__ << " pool " << pool_id << " not found." 
<< dendl; + zero_rbi(*p_rbi); + p_rbi->err_msg = fmt::format("pool {} not found", pool_id); + return -ENOENT; + } + int rc = 0; + const pg_pool_t* pool = tmp_osd_map.get_pg_pool(pool_id); + auto num_pgs = pool->get_pg_num(); + + map<uint64_t,set<pg_t>> pgs_by_osd; + map<uint64_t,set<pg_t>> prim_pgs_by_osd; + map<uint64_t,set<pg_t>> acting_prims_by_osd; + + pgs_by_osd = tmp_osd_map.get_pgs_by_osd(cct, pool_id, &prim_pgs_by_osd, &acting_prims_by_osd); + + if (cct != nullptr) + ldout(cct,30) << __func__ << " Primaries for pool: " + << prim_pgs_by_osd << dendl; + + if (pgs_by_osd.empty()) { + //p_rbi->err_msg = fmt::format("pool {} has no PGs mapped to OSDs", pool_id); + return -EINVAL; + } + if (cct != nullptr) { + for (auto& [osd,pgs] : prim_pgs_by_osd) { + ldout(cct,20) << __func__ << " Pool " << pool_id << " OSD." << osd + << " has " << pgs.size() << " primary PGs, " + << acting_prims_by_osd[osd].size() << " acting primaries." + << dendl; + } + } + + auto num_osds = pgs_by_osd.size(); + + float avg_prims_per_osd = (float)num_pgs / (float)num_osds; + uint64_t max_prims_per_osd = 0; + uint64_t max_acting_prims_per_osd = 0; + float max_osd_score = 0.; + bool prim_on_zero_pa = false; + bool acting_on_zero_pa = false; + + float prim_affinity_sum = 0.; + float total_osd_weight = 0.; + float total_weighted_pa = 0.; + + map<int,float> osds_crush_weight; + // Set up the OSDMap + int ruleno = tmp_osd_map.pools.at(pool_id).get_crush_rule(); + tmp_osd_map.crush->get_rule_weight_osd_map(ruleno, &osds_crush_weight); + + if (cct != nullptr) { + ldout(cct,20) << __func__ << " pool " << pool_id + << " ruleno " << ruleno + << " weight-map " << osds_crush_weight + << dendl; + } + uint osd_pa_count = 0; + + for (auto [osd, oweight] : osds_crush_weight) { // loop over all OSDs + total_osd_weight += oweight; + float osd_pa = tmp_osd_map.get_primary_affinityf(osd); + total_weighted_pa += oweight * osd_pa; + if (osd_pa != 0.) { + osd_pa_count++; + } + if (prim_pgs_by_osd.count(osd)) { + auto n_prims = prim_pgs_by_osd.at(osd).size(); + max_prims_per_osd = std::max(max_prims_per_osd, n_prims); + if (osd_pa == 0.) { + prim_on_zero_pa = true; + } + } + if (acting_prims_by_osd.count(osd)) { + auto n_aprims = acting_prims_by_osd.at(osd).size(); + max_acting_prims_per_osd = std::max(max_acting_prims_per_osd, n_aprims); + if (osd_pa != 0.) { + max_osd_score = std::max(max_osd_score, float(n_aprims) / osd_pa); + } + else { + acting_on_zero_pa = true; + } + } + + prim_affinity_sum += osd_pa; + if (cct != nullptr) { + auto np = prim_pgs_by_osd.count(osd) ? prim_pgs_by_osd.at(osd).size() : 0; + auto nap = acting_prims_by_osd.count(osd) ? acting_prims_by_osd.at(osd).size() : 0; + auto wt = osds_crush_weight.count(osd) ? osds_crush_weight.at(osd) : 0.; + ldout(cct,30) << __func__ << " OSD." << osd << " info: " + << " num_primaries " << np + << " num_acting_prims " << nap + << " prim_affinity " << tmp_osd_map.get_primary_affinityf(osd) + << " weight " << wt + << dendl; + } + } + if (cct != nullptr) { + ldout(cct,30) << __func__ << " pool " << pool_id + << " total_osd_weight " << total_osd_weight + << " total_weighted_pa " << total_weighted_pa + << dendl; + } + + if (prim_affinity_sum == 0.0) { + if (cct != nullptr) { + ldout(cct, 10) << __func__ << " pool " << pool_id + << " has primary_affinity set to zero on all OSDs" << dendl; + } + zero_rbi(*p_rbi); + p_rbi->err_msg = fmt::format("pool {} has primary_affinity set to zero on all OSDs", pool_id); + + return -ERANGE; // score has a different meaning now. 
+ } + else { + max_osd_score *= prim_affinity_sum / num_osds; + } + + rc = tmp_osd_map.set_rbi(cct, *p_rbi, pool_id, total_weighted_pa, + prim_affinity_sum, num_osds, osd_pa_count, + total_osd_weight, max_prims_per_osd, + max_acting_prims_per_osd, avg_prims_per_osd, + prim_on_zero_pa, acting_on_zero_pa, max_osd_score); + + if (cct != nullptr) { + ldout(cct,30) << __func__ << " pool " << get_pool_name(pool_id) + << " pa_avg " << p_rbi->pa_avg + << " pa_weighted " << p_rbi->pa_weighted + << " pa_weighted_avg " << p_rbi->pa_weighted_avg + << " optimal_score " << p_rbi->optimal_score + << " adjusted_score " << p_rbi->adjusted_score + << " acting_adj_score " << p_rbi->acting_adj_score + << dendl; + ldout(cct,20) << __func__ << " pool " << get_pool_name(pool_id) + << " raw_score: " << p_rbi->raw_score + << " acting_raw_score: " << p_rbi->acting_raw_score + << dendl; + ldout(cct,10) << __func__ << " pool " << get_pool_name(pool_id) + << " wl_score: " << p_rbi->acting_adj_score << dendl; + } + + return rc; +} + +int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const +{ + return crush->get_leaves(name, osds); +} + +// get pools whose crush rules might reference the given osd +void OSDMap::get_pool_ids_by_osd(CephContext *cct, + int osd, + set<int64_t> *pool_ids) const +{ + ceph_assert(pool_ids); + set<int> raw_rules; + int r = crush->get_rules_by_osd(osd, &raw_rules); + if (r < 0) { + lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r) + << dendl; + ceph_assert(r >= 0); + } + set<int> rules; + for (auto &i: raw_rules) { + // exclude any dead rule + if (crush_rule_in_use(i)) { + rules.insert(i); + } + } + for (auto &r: rules) { + get_pool_ids_by_rule(r, pool_ids); + } +} + +template <typename F> +class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> { +public: + typedef CrushTreeDumper::Dumper<F> Parent; + + OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_, + const PGMap& pgmap_, bool tree_, + const string& filter) : + Parent(crush, osdmap_->get_pool_names()), + osdmap(osdmap_), + pgmap(pgmap_), + tree(tree_), + min_var(-1), + max_var(-1), + stddev(0), + sum(0) { + if (osdmap->crush->name_exists(filter)) { + // filter by crush node + auto item_id = osdmap->crush->get_item_id(filter); + allowed.insert(item_id); + osdmap->crush->get_all_children(item_id, &allowed); + } else if (osdmap->crush->class_exists(filter)) { + // filter by device class + class_id = osdmap->crush->get_class_id(filter); + } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter); + pool_id >= 0) { + // filter by pool + auto crush_rule = osdmap->get_pool_crush_rule(pool_id); + set<int> roots; + osdmap->crush->find_takes_by_rule(crush_rule, &roots); + allowed = roots; + for (auto r : roots) + osdmap->crush->get_all_children(r, &allowed); + } + average_util = average_utilization(); + } + +protected: + + bool should_dump(int id) const { + if (!allowed.empty() && !allowed.count(id)) // filter by name + return false; + if (id >= 0 && class_id >= 0) { + auto item_class_id = osdmap->crush->get_item_class_id(id); + if (item_class_id < 0 || // not bound to a class yet + item_class_id != class_id) // or already bound to a different class + return false; + } + return true; + } + + set<int> get_dumped_osds() { + if (allowed.empty() && class_id < 0) { + // old way, all + return {}; + } + return dumped_osds; + } + + void dump_stray(F *f) { + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (osdmap->exists(i) && !this->is_touched(i)) + 
dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + } + + void dump_item(const CrushTreeDumper::Item &qi, F *f) override { + if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id))) + return; + if (!should_dump(qi.id)) + return; + + if (!qi.is_bucket()) + dumped_osds.insert(qi.id); + float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id); + int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0, + kb_used_meta = 0, kb_avail = 0; + double util = 0; + if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data, + &kb_used_omap, &kb_used_meta, &kb_avail)) + if (kb_used && kb) + util = 100.0 * (double)kb_used / (double)kb; + + double var = 1.0; + if (average_util) + var = util / average_util; + + size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id); + + dump_item(qi, reweight, kb, kb_used, + kb_used_data, kb_used_omap, kb_used_meta, + kb_avail, util, var, num_pgs, f); + + if (!qi.is_bucket() && reweight > 0) { + if (min_var < 0 || var < min_var) + min_var = var; + if (max_var < 0 || var > max_var) + max_var = var; + + double dev = util - average_util; + dev *= dev; + stddev += reweight * dev; + sum += reweight; + } + } + + virtual void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + F *f) = 0; + + double dev() { + return sum > 0 ? sqrt(stddev / sum) : 0; + } + + double average_utilization() { + int64_t kb = 0, kb_used = 0; + for (int i = 0; i < osdmap->get_max_osd(); i++) { + if (!osdmap->exists(i) || + osdmap->get_weight(i) == 0 || + !should_dump(i)) + continue; + int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i, + kb_avail_i; + if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i, + &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) { + kb += kb_i; + kb_used += kb_used_i; + } + } + return kb > 0 ? 
100.0 * (double)kb_used / (double)kb : 0; + } + + bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + const osd_stat_t *p = pgmap.get_osd_stat(id); + if (!p) return false; + *kb = p->statfs.kb(); + *kb_used = p->statfs.kb_used_raw(); + *kb_used_data = p->statfs.kb_used_data(); + *kb_used_omap = p->statfs.kb_used_omap(); + *kb_used_meta = p->statfs.kb_used_internal_metadata(); + *kb_avail = p->statfs.kb_avail(); + + return true; + } + + bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used, + int64_t* kb_used_data, + int64_t* kb_used_omap, + int64_t* kb_used_meta, + int64_t* kb_avail) const { + if (id >= 0) { + if (osdmap->is_out(id) || !should_dump(id)) { + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + return true; + } + return get_osd_utilization(id, kb, kb_used, kb_used_data, + kb_used_omap, kb_used_meta, kb_avail); + } + + *kb = 0; + *kb_used = 0; + *kb_used_data = 0; + *kb_used_omap = 0; + *kb_used_meta = 0; + *kb_avail = 0; + + for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) { + int item = osdmap->crush->get_bucket_item(id, k); + int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0, + kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0; + if (!get_bucket_utilization(item, &kb_i, &kb_used_i, + &kb_used_data_i, &kb_used_omap_i, + &kb_used_meta_i, &kb_avail_i)) + return false; + *kb += kb_i; + *kb_used += kb_used_i; + *kb_used_data += kb_used_data_i; + *kb_used_omap += kb_used_omap_i; + *kb_used_meta += kb_used_meta_i; + *kb_avail += kb_avail_i; + } + return true; + } + +protected: + const OSDMap *osdmap; + const PGMap& pgmap; + bool tree; + double average_util; + double min_var; + double max_var; + double stddev; + double sum; + int class_id = -1; + set<int> allowed; + set<int> dumped_osds; +}; + + +class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> { +public: + typedef OSDUtilizationDumper<TextTable> Parent; + + OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& filter) : + Parent(crush, osdmap, pgmap, tree, filter) {} + + void dump(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT); + if (tree) + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + + Parent::dump(tbl); + + dump_stray(tbl); + + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + *tbl << "" + << "" + << "" << "TOTAL" + << byte_u_t(sum.statfs.total) + << byte_u_t(sum.statfs.get_used_raw()) + << byte_u_t(sum.statfs.allocated) + << 
byte_u_t(sum.statfs.omap_allocated) + << byte_u_t(sum.statfs.internal_metadata) + << byte_u_t(sum.statfs.available) + << lowprecision_t(average_util) + << "" + << TextTable::endrow; + } + +protected: + struct lowprecision_t { + float v; + explicit lowprecision_t(float _v) : v(_v) {} + }; + friend std::ostream &operator<<(ostream& out, const lowprecision_t& v); + + using OSDUtilizationDumper<TextTable>::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight) + << weightf_t(reweight) + << byte_u_t(kb << 10) + << byte_u_t(kb_used << 10) + << byte_u_t(kb_used_data << 10) + << byte_u_t(kb_used_omap << 10) + << byte_u_t(kb_used_meta << 10) + << byte_u_t(kb_avail << 10) + << lowprecision_t(util) + << lowprecision_t(var); + + if (qi.is_bucket()) { + *tbl << "-"; + *tbl << ""; + } else { + *tbl << num_pgs; + if (osdmap->is_up(qi.id)) { + *tbl << "up"; + } else if (osdmap->is_destroyed(qi.id)) { + *tbl << "destroyed"; + } else { + *tbl << "down"; + } + } + + if (tree) { + ostringstream name; + for (int k = 0; k < qi.depth; k++) + name << " "; + if (qi.is_bucket()) { + int type = crush->get_bucket_type(qi.id); + name << crush->get_type_name(type) << " " + << crush->get_item_name(qi.id); + } else { + name << "osd." << qi.id; + } + *tbl << name.str(); + } + + *tbl << TextTable::endrow; + } + +public: + string summary() { + ostringstream out; + out << "MIN/MAX VAR: " << lowprecision_t(min_var) + << "/" << lowprecision_t(max_var) << " " + << "STDDEV: " << lowprecision_t(dev()); + return out.str(); + } +}; + +ostream& operator<<(ostream& out, + const OSDUtilizationPlainDumper::lowprecision_t& v) +{ + if (v.v < -0.01) { + return out << "-"; + } else if (v.v < 0.001) { + return out << "0"; + } else { + std::streamsize p = out.precision(); + return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p); + } +} + +class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> { +public: + typedef OSDUtilizationDumper<Formatter> Parent; + + OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap, + const PGMap& pgmap, bool tree, + const string& filter) : + Parent(crush, osdmap, pgmap, tree, filter) {} + + void dump(Formatter *f) { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + + f->open_array_section("stray"); + dump_stray(f); + f->close_section(); + } + +protected: + using OSDUtilizationDumper<Formatter>::dump_item; + void dump_item(const CrushTreeDumper::Item &qi, + float &reweight, + int64_t kb, + int64_t kb_used, + int64_t kb_used_data, + int64_t kb_used_omap, + int64_t kb_used_meta, + int64_t kb_avail, + double& util, + double& var, + const size_t num_pgs, + Formatter *f) override { + f->open_object_section("item"); + CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + f->dump_float("reweight", reweight); + f->dump_int("kb", kb); + f->dump_int("kb_used", kb_used); + f->dump_int("kb_used_data", kb_used_data); + f->dump_int("kb_used_omap", kb_used_omap); + f->dump_int("kb_used_meta", kb_used_meta); + f->dump_int("kb_avail", kb_avail); + f->dump_float("utilization", util); + f->dump_float("var", var); + f->dump_unsigned("pgs", num_pgs); + if (!qi.is_bucket()) { + if 
(osdmap->is_up(qi.id)) { + f->dump_string("status", "up"); + } else if (osdmap->is_destroyed(qi.id)) { + f->dump_string("status", "destroyed"); + } else { + f->dump_string("status", "down"); + } + } + CrushTreeDumper::dump_bucket_children(crush, qi, f); + f->close_section(); + } + +public: + void summary(Formatter *f) { + f->open_object_section("summary"); + auto sum = pgmap.get_osd_sum(get_dumped_osds()); + auto& s = sum.statfs; + + f->dump_int("total_kb", s.kb()); + f->dump_int("total_kb_used", s.kb_used_raw()); + f->dump_int("total_kb_used_data", s.kb_used_data()); + f->dump_int("total_kb_used_omap", s.kb_used_omap()); + f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata()); + f->dump_int("total_kb_avail", s.kb_avail()); + f->dump_float("average_utilization", average_util); + f->dump_float("min_var", min_var); + f->dump_float("max_var", max_var); + f->dump_float("dev", dev()); + f->close_section(); + } +}; + +void print_osd_utilization(const OSDMap& osdmap, + const PGMap& pgmap, + ostream& out, + Formatter *f, + bool tree, + const string& filter) +{ + const CrushWrapper *crush = osdmap.crush.get(); + if (f) { + f->open_object_section("df"); + OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter); + d.dump(f); + d.summary(f); + f->close_section(); + f->flush(out); + } else { + OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter); + TextTable tbl; + d.dump(&tbl); + out << tbl << d.summary() << "\n"; + } +} + +void OSDMap::check_health(CephContext *cct, + health_check_map_t *checks) const +{ + int num_osds = get_num_osds(); + + // OSD_DOWN + // OSD_$subtree_DOWN + // OSD_ORPHAN + if (num_osds >= 0) { + int num_in_osds = 0; + int num_down_in_osds = 0; + set<int> osds; + set<int> down_in_osds; + set<int> up_in_osds; + set<int> subtree_up; + unordered_map<int, set<int> > subtree_type_down; + unordered_map<int, int> num_osds_subtree; + int max_type = crush->get_max_type_id(); + + for (int i = 0; i < get_max_osd(); i++) { + if (!exists(i)) { + if (crush->item_exists(i)) { + osds.insert(i); + } + continue; + } + if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW)) + continue; + ++num_in_osds; + if (down_in_osds.count(i) || up_in_osds.count(i)) + continue; + if (!is_up(i)) { + down_in_osds.insert(i); + int parent_id = 0; + int current = i; + for (int type = 0; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + int r = crush->get_immediate_parent_id(current, &parent_id); + if (r == -ENOENT) + break; + // break early if this parent is already marked as up + if (subtree_up.count(parent_id)) + break; + type = crush->get_bucket_type(parent_id); + if (!subtree_type_is_down( + cct, parent_id, type, + &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) + break; + current = parent_id; + } + } + } + + // calculate the number of down osds in each down subtree and + // store it in num_osds_subtree + for (int type = 1; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + list<int> children; + int num = 0; + int num_children = crush->get_children(*j, &children); + if (num_children == 0) + continue; + for (auto l = children.begin(); l != children.end(); ++l) { + if (*l >= 0) { + ++num; + } else if (num_osds_subtree[*l] > 0) { + num = num + num_osds_subtree[*l]; + } + } + num_osds_subtree[*j] = num; + } + } + num_down_in_osds = down_in_osds.size(); + ceph_assert(num_down_in_osds <= num_in_osds); + if (num_down_in_osds > 0) { + // 
summary of down subtree types and osds + for (int type = max_type; type > 0; type--) { + if (!crush->get_type_name(type)) + continue; + if (subtree_type_down[type].size() > 0) { + ostringstream ss; + ss << subtree_type_down[type].size() << " " + << crush->get_type_name(type); + if (subtree_type_down[type].size() > 1) { + ss << "s"; + } + int sum_down_osds = 0; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + sum_down_osds = sum_down_osds + num_osds_subtree[*j]; + } + ss << " (" << sum_down_osds << " osds) down"; + string err = string("OSD_") + + string(crush->get_type_name(type)) + "_DOWN"; + boost::to_upper(err); + auto& d = checks->add(err, HEALTH_WARN, ss.str(), + subtree_type_down[type].size()); + for (auto j = subtree_type_down[type].rbegin(); + j != subtree_type_down[type].rend(); + ++j) { + ostringstream ss; + ss << crush->get_type_name(type); + ss << " "; + ss << crush->get_item_name(*j); + // at the top level, do not print location + if (type != max_type) { + ss << " ("; + ss << crush->get_full_location_ordered_string(*j); + ss << ")"; + } + int num = num_osds_subtree[*j]; + ss << " (" << num << " osds)"; + ss << " is down"; + d.detail.push_back(ss.str()); + } + } + } + ostringstream ss; + ss << down_in_osds.size() << " osds down"; + auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(), + down_in_osds.size()); + for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { + ostringstream ss; + ss << "osd." << *it << " ("; + ss << crush->get_full_location_ordered_string(*it); + ss << ") is down"; + d.detail.push_back(ss.str()); + } + } + + if (!osds.empty()) { + ostringstream ss; + ss << osds.size() << " osds exist in the crush map but not in the osdmap"; + auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(), + osds.size()); + for (auto osd : osds) { + ostringstream ss; + ss << "osd." << osd << " exists in crush map but not in osdmap"; + d.detail.push_back(ss.str()); + } + } + } + + std::list<std::string> scrub_messages; + bool noscrub = false, nodeepscrub = false; + for (const auto &p : pools) { + if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has noscrub flag"; + scrub_messages.push_back(ss.str()); + noscrub = true; + } + if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) { + ostringstream ss; + ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag"; + scrub_messages.push_back(ss.str()); + nodeepscrub = true; + } + } + if (noscrub || nodeepscrub) { + string out = ""; + out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : ""; + out += nodeepscrub ? "nodeep-scrub" : ""; + auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK, + "Some pool(s) have the " + out + " flag(s) set", 0); + d.detail.splice(d.detail.end(), scrub_messages); + } + + // OSD_OUT_OF_ORDER_FULL + { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = cct->_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = get_full_ratio(); + float br = get_backfillfull_ratio(); + float nr = get_nearfull_ratio(); + + list<string> detail; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. 
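+    // Illustration (hypothetical ratios): with nearfull_ratio = 0.92,
+    // backfillfull_ratio = 0.90, full_ratio = 0.95 and
+    // osd_failsafe_full_ratio = 0.97, only the first branch below fires:
+    // backfillfull_ratio (0.90) < nearfull_ratio (0.92), so it is raised to
+    // 0.92 and OSD_OUT_OF_ORDER_FULL is reported; full_ratio and the
+    // failsafe ratio are already in order and stay unchanged.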
+ if (br < nr) { + ostringstream ss; + ss << "backfillfull_ratio (" << br + << ") < nearfull_ratio (" << nr << "), increased"; + detail.push_back(ss.str()); + br = nr; + } + if (fr < br) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br + << "), increased"; + detail.push_back(ss.str()); + fr = br; + } + if (fsr < fr) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr + << "), increased"; + detail.push_back(ss.str()); + } + if (!detail.empty()) { + auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR, + "full ratio(s) out of order", 0); + d.detail.swap(detail); + } + } + + // OSD_FULL + // OSD_NEARFULL + // OSD_BACKFILLFULL + // OSD_FAILSAFE_FULL + { + set<int> full, backfillfull, nearfull; + get_full_osd_counts(&full, &backfillfull, &nearfull); + if (full.size()) { + ostringstream ss; + ss << full.size() << " full osd(s)"; + auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size()); + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i << " is full"; + d.detail.push_back(ss.str()); + } + } + if (backfillfull.size()) { + ostringstream ss; + ss << backfillfull.size() << " backfillfull osd(s)"; + auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull.size()); + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i << " is backfill full"; + d.detail.push_back(ss.str()); + } + } + if (nearfull.size()) { + ostringstream ss; + ss << nearfull.size() << " nearfull osd(s)"; + auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size()); + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." << i << " is near full"; + d.detail.push_back(ss.str()); + } + } + } + + // OSDMAP_FLAGS + { + // warn about flags + uint64_t warn_flags = + CEPH_OSDMAP_PAUSERD | + CEPH_OSDMAP_PAUSEWR | + CEPH_OSDMAP_PAUSEREC | + CEPH_OSDMAP_NOUP | + CEPH_OSDMAP_NODOWN | + CEPH_OSDMAP_NOIN | + CEPH_OSDMAP_NOOUT | + CEPH_OSDMAP_NOBACKFILL | + CEPH_OSDMAP_NORECOVER | + CEPH_OSDMAP_NOSCRUB | + CEPH_OSDMAP_NODEEP_SCRUB | + CEPH_OSDMAP_NOTIERAGENT | + CEPH_OSDMAP_NOSNAPTRIM | + CEPH_OSDMAP_NOREBALANCE; + if (test_flag(warn_flags)) { + ostringstream ss; + string s = get_flag_string(get_flags() & warn_flags); + ss << s << " flag(s) set"; + checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(), + s.size() /* kludgey but sufficient */); + } + } + + // OSD_FLAGS + { + list<string> detail; + const unsigned flags = + CEPH_OSD_NOUP | + CEPH_OSD_NOIN | + CEPH_OSD_NODOWN | + CEPH_OSD_NOOUT; + for (int i = 0; i < max_osd; ++i) { + if (osd_state[i] & flags) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(osd_state[i] & flags, states); + ss << "osd." << i << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : crush_node_flags) { + if (i.second && crush->item_exists(i.first)) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(i.second, states); + int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first); + const char *tn = crush->get_type_name(t); + ss << (tn ? 
tn : "node") << " " + << crush->get_item_name(i.first) << " has flags " << states; + detail.push_back(ss.str()); + } + } + for (auto& i : device_class_flags) { + const char* class_name = crush->get_class_name(i.first); + if (i.second && class_name) { + ostringstream ss; + set<string> states; + OSDMap::calc_state_set(i.second, states); + ss << "device class '" << class_name << "' has flags " << states; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set"; + auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // OLD_CRUSH_TUNABLES + if (cct->_conf->mon_warn_on_legacy_crush_tunables) { + string min = crush->get_min_required_version(); + if (min < cct->_conf->mon_crush_min_required_version) { + ostringstream ss; + ss << "crush map has legacy tunables (require " << min + << ", min is " << cct->_conf->mon_crush_min_required_version << ")"; + auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0); + d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables"); + } + } + + // OLD_CRUSH_STRAW_CALC_VERSION + if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) { + if (crush->get_straw_calc_version() == 0) { + ostringstream ss; + ss << "crush map has straw_calc_version=0"; + auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0); + d.detail.push_back( + "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables"); + } + } + + // CACHE_POOL_NO_HIT_SET + if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) { + list<string> detail; + for (auto p = pools.cbegin(); p != pools.cend(); ++p) { + const pg_pool_t& info = p->second; + if (info.cache_mode_requires_hit_set() && + info.hit_set_params.get_type() == HitSet::TYPE_NONE) { + ostringstream ss; + ss << "pool '" << get_pool_name(p->first) + << "' with cache_mode " << info.get_cache_mode_name() + << " needs hit_set_type to be set but it is not"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " cache pools are missing hit_sets"; + auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(), + detail.size()); + d.detail.swap(detail); + } + } + + // OSD_NO_SORTBITWISE + if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) { + ostringstream ss; + ss << "'sortbitwise' flag is not set"; + checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0); + } + + // OSD_UPGRADE_FINISHED + if (auto require_release = pending_require_osd_release()) { + ostringstream ss; + ss << "all OSDs are running " << *require_release << " or later but" + << " require_osd_release < " << *require_release; + auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0); + d.detail.push_back(ss.str()); + } + + // POOL_NEARFULL/BACKFILLFULL/FULL + { + list<string> full_detail, backfillfull_detail, nearfull_detail; + for (auto it : get_pools()) { + const pg_pool_t &pool = it.second; + const string& pool_name = get_pool_name(it.first); + if (pool.has_flag(pg_pool_t::FLAG_FULL)) { + stringstream ss; + if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + // may run out of space too, + // but we want EQUOTA taking precedence + ss << "pool '" << pool_name << "' is full (running out of quota)"; + } else { + ss << "pool '" << pool_name << "' is full (no space)"; + } + full_detail.push_back(ss.str()); + } else if 
(pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) { + stringstream ss; + ss << "pool '" << pool_name << "' is backfillfull"; + backfillfull_detail.push_back(ss.str()); + } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) { + stringstream ss; + ss << "pool '" << pool_name << "' is nearfull"; + nearfull_detail.push_back(ss.str()); + } + } + if (!full_detail.empty()) { + ostringstream ss; + ss << full_detail.size() << " pool(s) full"; + auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size()); + d.detail.swap(full_detail); + } + if (!backfillfull_detail.empty()) { + ostringstream ss; + ss << backfillfull_detail.size() << " pool(s) backfillfull"; + auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull_detail.size()); + d.detail.swap(backfillfull_detail); + } + if (!nearfull_detail.empty()) { + ostringstream ss; + ss << nearfull_detail.size() << " pool(s) nearfull"; + auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(), + nearfull_detail.size()); + d.detail.swap(nearfull_detail); + } + } + + // POOL_PG_NUM_NOT_POWER_OF_TWO + if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) { + list<string> detail; + for (auto it : get_pools()) { + if (!std::has_single_bit(it.second.get_pg_num_target())) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' pg_num " << it.second.get_pg_num_target() + << " is not a power of two"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have non-power-of-two pg_num"; + auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN, + ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // POOL_NO_REDUNDANCY + if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy")) + { + list<string> detail; + for (auto it : get_pools()) { + if (it.second.get_size() == 1) { + ostringstream ss; + ss << "pool '" << get_pool_name(it.first) + << "' has no replicas configured"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) have no replicas configured"; + auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN, + ss.str(), detail.size()); + d.detail.swap(detail); + } + } + + // DEGRADED STRETCH MODE + if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) { + if (recovering_stretch_mode) { + stringstream ss; + ss << "We are recovering stretch mode buckets, only requiring " + << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ; + checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN, + ss.str(), 0); + } else if (degraded_stretch_mode) { + stringstream ss; + ss << "We are missing stretch mode buckets, only requiring " + << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ; + checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN, + ss.str(), 0); + } + } + // UNEQUAL_WEIGHT + if (stretch_mode_enabled) { + vector<int> subtrees; + crush->get_subtree_of_type(stretch_mode_bucket, &subtrees); + if (subtrees.size() != 2) { + stringstream ss; + ss << "Stretch mode buckets != 2"; + checks->add("INCORRECT_NUM_BUCKETS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0); + return; + } + int weight1 = crush->get_item_weight(subtrees[0]); + int weight2 = crush->get_item_weight(subtrees[1]); + stringstream ss; + if (weight1 != weight2) { + ss << "Stretch mode buckets have different weights!"; + checks->add("UNEVEN_WEIGHTS_STRETCH_MODE", HEALTH_WARN, ss.str(), 0); + } + } +} + +int 
OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out, + ostream *ss) const +{ + out->clear(); + for (auto i = ls.begin(); i != ls.end(); ++i) { + if (i == ls.begin() && + (*i == "any" || *i == "all" || *i == "*")) { + get_all_osds(*out); + break; + } + long osd = ceph::common::parse_osd_id(i->c_str(), ss); + if (osd < 0) { + *ss << "invalid osd id '" << *i << "'"; + return -EINVAL; + } + out->insert(osd); + } + return 0; +} + +void OSDMap::get_random_up_osds_by_subtree(int n, // whoami + string &subtree, + int limit, // how many + set<int> skip, + set<int> *want) const { + if (limit <= 0) + return; + int subtree_type = crush->get_type_id(subtree); + if (subtree_type < 1) + return; + vector<int> subtrees; + crush->get_subtree_of_type(subtree_type, &subtrees); + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(subtrees.begin(), subtrees.end(), rng); + for (auto s : subtrees) { + if (limit <= 0) + break; + if (crush->subtree_contains(s, n)) + continue; + vector<int> osds; + crush->get_children_of_type(s, 0, &osds); + if (osds.empty()) + continue; + vector<int> up_osds; + for (auto o : osds) { + if (is_up(o) && !skip.count(o)) + up_osds.push_back(o); + } + if (up_osds.empty()) + continue; + auto it = up_osds.begin(); + std::advance(it, (n % up_osds.size())); + want->insert(*it); + --limit; + } +} + +float OSDMap::pool_raw_used_rate(int64_t poolid) const +{ + const pg_pool_t *pool = get_pg_pool(poolid); + assert(pool != nullptr); + + switch (pool->get_type()) { + case pg_pool_t::TYPE_REPLICATED: + return pool->get_size(); + case pg_pool_t::TYPE_ERASURE: + { + auto& ecp = + get_erasure_code_profile(pool->erasure_code_profile); + auto pm = ecp.find("m"); + auto pk = ecp.find("k"); + if (pm != ecp.end() && pk != ecp.end()) { + int k = atoi(pk->second.c_str()); + int m = atoi(pm->second.c_str()); + int mk = m + k; + ceph_assert(mk != 0); + ceph_assert(k != 0); + return (float)mk / k; + } else { + return 0.0; + } + } + break; + default: + ceph_abort_msg("unrecognized pool type"); + } +} + +unsigned OSDMap::get_osd_crush_node_flags(int osd) const +{ + unsigned flags = 0; + if (!crush_node_flags.empty()) { + // the map will contain type -> name + std::map<std::string,std::string> ploc = crush->get_full_location(osd); + for (auto& i : ploc) { + int id = crush->get_item_id(i.second); + auto p = crush_node_flags.find(id); + if (p != crush_node_flags.end()) { + flags |= p->second; + } + } + } + return flags; +} + +unsigned OSDMap::get_crush_node_flags(int id) const +{ + unsigned flags = 0; + auto it = crush_node_flags.find(id); + if (it != crush_node_flags.end()) + flags = it->second; + return flags; +} + +unsigned OSDMap::get_device_class_flags(int id) const +{ + unsigned flags = 0; + auto it = device_class_flags.find(id); + if (it != device_class_flags.end()) + flags = it->second; + return flags; +} + +std::optional<std::string> OSDMap::pending_require_osd_release() const +{ + if (HAVE_FEATURE(get_up_osd_features(), SERVER_QUINCY) && + require_osd_release < ceph_release_t::quincy) { + return "quincy"; + } + if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) && + require_osd_release < ceph_release_t::pacific) { + return "pacific"; + } + if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) && + require_osd_release < ceph_release_t::octopus) { + return "octopus"; + } + if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) && + require_osd_release < ceph_release_t::nautilus) { + return "nautilus"; + } + + return std::nullopt; +} |
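For reference, the raw-used rate returned by pool_raw_used_rate() above is the pool's replica count for replicated pools and (k + m) / k for erasure-coded pools, read from the pool's erasure-code profile. The short standalone sketch below (hypothetical pool parameters, not derived from any particular OSDMap) reproduces that arithmetic: a size-3 replicated pool consumes 3 raw bytes per logical byte, while a k=4, m=2 EC profile consumes 1.5.

#include <cassert>
#include <iostream>

// Same arithmetic as OSDMap::pool_raw_used_rate(): replicated pools amplify
// each logical byte by the pool size, erasure-coded pools by (k + m) / k.
static float replicated_raw_used_rate(int size) {
  return static_cast<float>(size);
}

static float ec_raw_used_rate(int k, int m) {
  assert(k > 0);
  return static_cast<float>(k + m) / static_cast<float>(k);
}

int main() {
  std::cout << replicated_raw_used_rate(3) << "\n";  // prints 3
  std::cout << ec_raw_used_rate(4, 2) << "\n";       // prints 1.5
  return 0;
}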