// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
#include "gtest/gtest.h"
#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"
#include "mon/OSDMonitor.h"
#include "mon/PGMap.h"
#include "global/global_context.h"
#include "global/global_init.h"
#include "common/common_init.h"
#include "common/ceph_argparse.h"
#include "common/ceph_json.h"

#include <iostream>

using namespace std;

int main(int argc, char **argv) {
  map<string,string> defaults = {
    // make sure we have 3 copies, or some tests won't work
    { "osd_pool_default_size", "3" },
    // our map is flat, so just try and split across OSDs, not hosts or whatever
    { "osd_crush_chooseleaf_type", "0" },
  };
  std::vector<const char*> args(argv, argv+argc);
  auto cct = global_init(&defaults, args,
                         CEPH_ENTITY_TYPE_CLIENT,
                         CODE_ENVIRONMENT_UTILITY,
                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
  common_init_finish(g_ceph_context);
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

class OSDMapTest : public testing::Test {
  int num_osds = 6;
public:
  OSDMap osdmap;
  OSDMapMapping mapping;
  const uint64_t my_ec_pool = 1;
  const uint64_t my_rep_pool = 2;

  OSDMapTest() {}

  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
    num_osds = new_num_osds;
    uuid_d fsid;
    osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.fsid = osdmap.get_fsid();
    entity_addrvec_t sample_addrs;
    sample_addrs.v.push_back(entity_addr_t());
    uuid_d sample_uuid;
    for (int i = 0; i < num_osds; ++i) {
      sample_uuid.generate_random();
      sample_addrs.v[0].nonce = i;
      pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
      pending_inc.new_up_client[i] = sample_addrs;
      pending_inc.new_up_cluster[i] = sample_addrs;
      pending_inc.new_hb_back_up[i] = sample_addrs;
      pending_inc.new_hb_front_up[i] = sample_addrs;
      pending_inc.new_weight[i] = CEPH_OSD_IN;
      pending_inc.new_uuid[i] = sample_uuid;
    }
    osdmap.apply_incremental(pending_inc);
    if (no_default_pools) // do not create any default pool(s)
      return;

    // Create an EC ruleset and a pool using it
    int r = osdmap.crush->add_simple_rule(
      "erasure", "default", "osd", "",
      "indep", pg_pool_t::TYPE_ERASURE,
      &cerr);

    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.new_pool_max = osdmap.get_pool_max();
    new_pool_inc.fsid = osdmap.get_fsid();
    pg_pool_t empty;
    // make an ec pool
    uint64_t pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_ec_pool);
    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_ERASURE;
    p->crush_rule = r;
    new_pool_inc.new_pool_names[pool_id] = "ec";
    // and a replicated pool
    pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_rep_pool);
    p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = 0;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    new_pool_inc.new_pool_names[pool_id] = "reppool";
    osdmap.apply_incremental(new_pool_inc);
  }
  unsigned int get_num_osds() { return num_osds; }
  void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
    bufferlist bl;
    tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
    auto p = bl.cbegin();
    newcrush.decode(p);
  }
  int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);
    CrushWrapper newcrush;
    get_crush(tmap, newcrush);
    if (!newcrush.name_exists(name)) {
      return -ENOENT;
    }
    int id = newcrush.get_item_id(name);
    int err;
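    // If the item is not already at the requested CRUSH location, move it:
    // devices (id >= 0) are (re)placed with create_or_move_item(), while
    // buckets (negative ids) are moved with move_bucket(). Note that this
    // helper re-encodes the whole modified crush map into an incremental,
    // so a successful move bumps tmap's epoch.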
    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
      if (id >= 0) {
        err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
      } else {
        err = newcrush.move_bucket(g_ceph_context, id, loc);
      }
      if (err >= 0) {
        OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
        tmap.apply_incremental(pending_inc);
        err = 0;
      }
    } else {
      // already there
      err = 0;
    }
    return err;
  }
  int crush_rule_create_replicated(const string &name,
                                   const string &root,
                                   const string &type) {
    if (osdmap.crush->rule_exists(name)) {
      return osdmap.crush->get_rule_id(name);
    }
    CrushWrapper newcrush;
    get_crush(osdmap, newcrush);
    string device_class;
    stringstream ss;
    int ruleno = newcrush.add_simple_rule(
      name, root, type, device_class,
      "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
    if (ruleno >= 0) {
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }
    return ruleno;
  }
  void test_mappings(int pool,
                     int num,
                     vector<int> *any,
                     vector<int> *first,
                     vector<int> *primary) {
    mapping.update(osdmap);
    for (int i=0; i<num; ++i) {
      vector<int> up, acting;
      int up_primary, acting_primary;
      pg_t pgid(i, pool);
      osdmap.pg_to_up_acting_osds(pgid,
                                  &up, &up_primary, &acting, &acting_primary);
      for (unsigned j=0; j<acting.size(); ++j)
        (*any)[acting[j]]++;
      if (!acting.empty())
        (*first)[acting[0]]++;
      if (acting_primary >= 0)
        (*primary)[acting_primary]++;

      // compare to precalc mapping
      vector<int> up2, acting2;
      int up_primary2, acting_primary2;
      pgid = osdmap.raw_pg_to_pg(pgid);
      mapping.get(pgid, &up2, &up_primary2, &acting2, &acting_primary2);
      ASSERT_EQ(up, up2);
      ASSERT_EQ(up_primary, up_primary2);
      ASSERT_EQ(acting, acting2);
      ASSERT_EQ(acting_primary, acting_primary2);
    }
    cout << "any: " << *any << std::endl;
    cout << "first: " << *first << std::endl;
    cout << "primary: " << *primary << std::endl;
  }
  void clean_pg_upmaps(CephContext *cct,
                       const OSDMap& om,
                       OSDMap::Incremental& pending_inc) {
    int cpu_num = 8;
    int pgs_per_chunk = 256;
    ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num);
    tp.start();
    ParallelPGMapper mapper(cct, &tp);
    vector<pg_t> pgs_to_check;
    om.get_upmap_pgs(&pgs_to_check);
    OSDMonitor::CleanUpmapJob job(cct, om, pending_inc);
    mapper.queue(&job, pgs_per_chunk, pgs_to_check);
    job.wait();
    tp.stop();
  }
};

TEST_F(OSDMapTest, Create) {
  set_up_map();
  ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
  ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
}

TEST_F(OSDMapTest, Features) {
  // with EC pool
  set_up_map();
  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // clients have a slightly different view
  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
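  // With the EC pool gone, the erasure rule alone no longer forces
  // CRUSH_V2; TUNABLES3 stays set below because its feature bit is
  // shared with primary affinity, which we enable here.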
  // remove the EC pool, but leave the rule. add primary affinity.
  {
    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
    new_pool_inc.new_primary_affinity[0] = 0x8000;
    osdmap.apply_incremental(new_pool_inc);
  }
  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // FIXME: test tiering feature bits
}

TEST_F(OSDMapTest, MapPG) {
  set_up_map();

  std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl;

  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds, acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);

  vector<int> old_up_osds, old_acting_osds;
  osdmap.pg_to_up_acting_osds(pgid, old_up_osds, old_acting_osds);
  ASSERT_EQ(old_up_osds, up_osds);
  ASSERT_EQ(old_acting_osds, acting_osds);

  ASSERT_EQ(osdmap.get_pg_pool(my_rep_pool)->get_size(), up_osds.size());
}

TEST_F(OSDMapTest, MapFunctionsMatch) {
  // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
  set_up_map();
  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds, acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);

  vector<int> up_osds_two, acting_osds_two;

  osdmap.pg_to_up_acting_osds(pgid, up_osds_two, acting_osds_two);

  ASSERT_EQ(up_osds, up_osds_two);
  ASSERT_EQ(acting_osds, acting_osds_two);

  int acting_primary_two;
  osdmap.pg_to_acting_osds(pgid, &acting_osds_two, &acting_primary_two);
  EXPECT_EQ(acting_osds, acting_osds_two);
  EXPECT_EQ(acting_primary, acting_primary_two);
  osdmap.pg_to_acting_osds(pgid, acting_osds_two);
  EXPECT_EQ(acting_osds, acting_osds_two);
}

/** This test must be removed or modified appropriately when we allow
 * other ways to specify a primary. */
TEST_F(OSDMapTest, PrimaryIsFirst) {
  set_up_map();

  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds, acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);
  EXPECT_EQ(up_osds[0], up_primary);
  EXPECT_EQ(acting_osds[0], acting_primary);
}
TEST_F(OSDMapTest, PGTempRespected) {
  set_up_map();

  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds, acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);

  // copy and swap first and last element in acting_osds
  vector<int> new_acting_osds(acting_osds);
  int first = new_acting_osds[0];
  new_acting_osds[0] = *new_acting_osds.rbegin();
  *new_acting_osds.rbegin() = first;

  // apply pg_temp to osdmap
  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
    new_acting_osds.begin(), new_acting_osds.end());
  osdmap.apply_incremental(pgtemp_map);

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);
  EXPECT_EQ(new_acting_osds, acting_osds);
}

TEST_F(OSDMapTest, PrimaryTempRespected) {
  set_up_map();

  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds;
  vector<int> acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);

  // make second OSD primary via incremental
  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  pgtemp_map.new_primary_temp[pgid] = acting_osds[1];
  osdmap.apply_incremental(pgtemp_map);

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);
  EXPECT_EQ(acting_primary, acting_osds[1]);
}

TEST_F(OSDMapTest, CleanTemps) {
  set_up_map();

  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 2);
  pg_t pga = osdmap.raw_pg_to_pg(pg_t(0, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pgtemp_map.new_primary_temp[pga] = up_primary;
  }
  pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pending_inc.new_primary_temp[pgb] = up_primary;
  }

  osdmap.apply_incremental(pgtemp_map);

  OSDMap tmpmap;
  tmpmap.deepish_copy_from(osdmap);
  tmpmap.apply_incremental(pending_inc);
  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);

  EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) &&
              pending_inc.new_pg_temp[pga].size() == 0);
  EXPECT_EQ(-1, pending_inc.new_primary_temp[pga]);

  EXPECT_TRUE(!pending_inc.new_pg_temp.count(pgb) &&
              !pending_inc.new_primary_temp.count(pgb));
}
TEST_F(OSDMapTest, KeepsNecessaryTemps) {
  set_up_map();

  pg_t rawpg(0, my_rep_pool);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up_osds, acting_osds;
  int up_primary, acting_primary;

  osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
                              &acting_osds, &acting_primary);

  // find unused OSD and stick it in there
  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  // find an unused osd and put it in place of the first one
  int i = 0;
  for(; i != (int)get_num_osds(); ++i) {
    bool in_use = false;
    for (vector<int>::iterator osd_it = up_osds.begin();
         osd_it != up_osds.end();
         ++osd_it) {
      if (i == *osd_it) {
        in_use = true;
        break;
      }
    }
    if (!in_use) {
      up_osds[1] = i;
      break;
    }
  }
  if (i == (int)get_num_osds())
    FAIL() << "did not find unused OSD for temp mapping";

  pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
    up_osds.begin(), up_osds.end());
  pgtemp_map.new_primary_temp[pgid] = up_osds[1];
  osdmap.apply_incremental(pgtemp_map);

  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);

  OSDMap tmpmap;
  tmpmap.deepish_copy_from(osdmap);
  tmpmap.apply_incremental(pending_inc);
  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
  EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
  EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
}

TEST_F(OSDMapTest, PrimaryAffinity) {
  set_up_map();

  int n = get_num_osds();
  for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
       p != osdmap.get_pools().end();
       ++p) {
    int pool = p->first;
    int expect_primary = 10000 / n;
    cout << "pool " << pool << " size " << (int)p->second.size
         << " expect_primary " << expect_primary << std::endl;
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        ASSERT_LT(0, first[i]);
        ASSERT_LT(0, primary[i]);
      }
    }

    osdmap.set_primary_affinity(0, 0);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else {
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        }
      }
    }

    osdmap.set_primary_affinity(0, 0x8000);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      int expect = (10000 / (n-2)) / 2; // half weight
      cout << "expect " << expect << std::endl;
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else if (i == 1) {
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        } else {
          ASSERT_LT(expect * 2 / 3, primary[0]);
          ASSERT_GT(expect * 4 / 3, primary[0]);
        }
      }
    }

    osdmap.set_primary_affinity(0, 0x10000);
    osdmap.set_primary_affinity(1, 0x10000);
  }
}

TEST_F(OSDMapTest, get_osd_crush_node_flags) {
  set_up_map();

  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
  }

  OSDMap::Incremental inc(osdmap.get_epoch() + 1);
  inc.new_crush_node_flags[-1] = 123u;
  osdmap.apply_incremental(inc);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i));
  }
  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));

  OSDMap::Incremental inc3(osdmap.get_epoch() + 1);
  inc3.new_crush_node_flags[-1] = 456u;
  osdmap.apply_incremental(inc3);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i));
  }
  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));

  OSDMap::Incremental inc2(osdmap.get_epoch() + 1);
  inc2.new_crush_node_flags[-1] = 0;
  osdmap.apply_incremental(inc2);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
  }
}

TEST_F(OSDMapTest, parse_osd_id_list) {
  set_up_map();
  set<int> out;
  set<int> all;
  osdmap.get_all_osds(all);

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, &cout));
  ASSERT_EQ(1u, out.size());
  ASSERT_EQ(0, *out.begin());

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout));
  ASSERT_EQ(1u, out.size());
  ASSERT_EQ(1, *out.begin());

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout));
  ASSERT_EQ(2u, out.size());
  ASSERT_EQ(0, *out.begin());
  ASSERT_EQ(1, *out.rbegin());

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout));
  ASSERT_EQ(2u, out.size());
  ASSERT_EQ(0, *out.begin());
  ASSERT_EQ(1, *out.rbegin());

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"*"}, &out, &cout));
  ASSERT_EQ(all.size(), out.size());
  ASSERT_EQ(all, out);

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"all"}, &out, &cout));
  ASSERT_EQ(all, out);

  ASSERT_EQ(0, osdmap.parse_osd_id_list({"any"}, &out, &cout));
  ASSERT_EQ(all, out);

  ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"foo"}, &out, &cout));
  ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
}
TEST_F(OSDMapTest, CleanPGUpmaps) {
  set_up_map();

  // build a crush rule of type host
  const int expected_host_num = 3;
  int osd_per_host = get_num_osds() / expected_host_num;
  ASSERT_GE(2, osd_per_host);
  int index = 0;
  for (int i = 0; i < (int)get_num_osds(); i++) {
    if (i && i % osd_per_host == 0) {
      ++index;
    }
    stringstream osd_name;
    stringstream host_name;
    vector<string> move_to;
    osd_name << "osd." << i;
    host_name << "host-" << index;
    move_to.push_back("root=default");
    string host_loc = "host=" + host_name.str();
    move_to.push_back(host_loc);
    int r = crush_move(osdmap, osd_name.str(), move_to);
    ASSERT_EQ(0, r);
  }
  const string upmap_rule = "upmap";
  int upmap_rule_no = crush_rule_create_replicated(
    upmap_rule, "default", "host");
  ASSERT_LT(0, upmap_rule_no);

  // create a replicated pool which references the above rule
  OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
  new_pool_inc.new_pool_max = osdmap.get_pool_max();
  new_pool_inc.fsid = osdmap.get_fsid();
  pg_pool_t empty;
  uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
  pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
  p->size = 2;
  p->set_pg_num(64);
  p->set_pgp_num(64);
  p->type = pg_pool_t::TYPE_REPLICATED;
  p->crush_rule = upmap_rule_no;
  p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
  osdmap.apply_incremental(new_pool_inc);

  pg_t rawpg(0, upmap_pool_id);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  vector<int> up;
  int up_primary;
  osdmap.pg_to_raw_up(pgid, &up, &up_primary);
  ASSERT_LT(1U, up.size());
  {
    // validate we won't have two OSDs from a same host
    int parent_0 = osdmap.crush->get_parent_of_type(up[0],
      osdmap.crush->get_type_id("host"));
    int parent_1 = osdmap.crush->get_parent_of_type(up[1],
      osdmap.crush->get_type_id("host"));
    ASSERT_TRUE(parent_0 != parent_1);
  }

  {
    // cancel stale upmaps
    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
    int from = -1;
    for (int i = 0; i < (int)get_num_osds(); i++) {
      if (std::find(up.begin(), up.end(), i) == up.end()) {
        from = i;
        break;
      }
    }
    ASSERT_TRUE(from >= 0);
    int to = -1;
    for (int i = 0; i < (int)get_num_osds(); i++) {
      if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
        to = i;
        break;
      }
    }
    ASSERT_TRUE(to >= 0);
    vector<pair<int32_t,int32_t>> new_pg_upmap_items;
    new_pg_upmap_items.push_back(make_pair(from, to));
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pg_upmap_items[pgid] =
      mempool::osdmap::vector<pair<int32_t,int32_t>>(
        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
    OSDMap nextmap;
    nextmap.deepish_copy_from(osdmap);
    nextmap.apply_incremental(pending_inc);
    ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
    OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
    clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc);
    nextmap.apply_incremental(new_pending_inc);
    ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
  }

  {
    // https://tracker.ceph.com/issues/37493
    pg_t ec_pg(0, my_ec_pool);
    pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
    OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
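    // Scenario for #37493: the upmap's target OSD is marked down (but
    // still "in"). A down OSD may come back, so the upmap is still
    // legitimate and clean_pg_upmaps() must leave it in place; compare
    // with #37501 below, where the target goes "out".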
    int from = -1;
    int to = -1;
    {
      // insert a valid pg_upmap_item
      vector<int> ec_up;
      int ec_up_primary;
      osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
      ASSERT_TRUE(!ec_up.empty());
      from = *(ec_up.begin());
      ASSERT_TRUE(from >= 0);
      for (int i = 0; i < (int)get_num_osds(); i++) {
        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
          to = i;
          break;
        }
      }
      ASSERT_TRUE(to >= 0);
      ASSERT_TRUE(from != to);
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(from, to));
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[ec_pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      tmpmap.deepish_copy_from(osdmap);
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
    }
    {
      // mark one of the target OSDs of the above pg_upmap_item as down
      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
      pending_inc.new_state[to] = CEPH_OSD_UP;
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(!tmpmap.is_up(to));
      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
    }
    {
      // confirm *clean_pg_upmaps* won't do anything bad
      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
    }
  }

  {
    // http://tracker.ceph.com/issues/37501
    pg_t ec_pg(0, my_ec_pool);
    pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
    OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
    int from = -1;
    int to = -1;
    {
      // insert a valid pg_upmap_item
      vector<int> ec_up;
      int ec_up_primary;
      osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
      ASSERT_TRUE(!ec_up.empty());
      from = *(ec_up.begin());
      ASSERT_TRUE(from >= 0);
      for (int i = 0; i < (int)get_num_osds(); i++) {
        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
          to = i;
          break;
        }
      }
      ASSERT_TRUE(to >= 0);
      ASSERT_TRUE(from != to);
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(from, to));
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[ec_pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      tmpmap.deepish_copy_from(osdmap);
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
    }
    {
      // mark one of the target OSDs of the above pg_upmap_item as out
      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
      pending_inc.new_weight[to] = CEPH_OSD_OUT;
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(tmpmap.is_out(to));
      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
    }
    {
      // *clean_pg_upmaps* should be able to remove the above *bad* mapping
      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
      tmpmap.apply_incremental(pending_inc);
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
    }
  }

  {
    // http://tracker.ceph.com/issues/37968
    // build a temporary crush topology of 2 hosts, 3 osds per host
    OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
    tmp.deepish_copy_from(osdmap);
    const int expected_host_num = 2;
    int osd_per_host = get_num_osds() / expected_host_num;
    ASSERT_GE(osd_per_host, 3);
    int index = 0;
    for (int i = 0; i < (int)get_num_osds(); i++) {
      if (i && i % osd_per_host == 0) {
        ++index;
      }
      stringstream osd_name;
      stringstream host_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      host_name << "host-" << index;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name.str();
      move_to.push_back(host_loc);
      auto r = crush_move(tmp, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
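    // #37968 uses an EC rule that picks 2 hosts, then 2 OSDs per host,
    // so two up OSDs sharing a host is expected, not a violation. An
    // upmap that swaps one OSD for another under the *same* host
    // therefore stays rule-compliant and must survive clean_pg_upmaps().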
    // build crush rule
    CrushWrapper crush;
    get_crush(tmp, crush);
    string rule_name = "rule_37968";
    int rule_type = pg_pool_t::TYPE_ERASURE;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    string root_name = "default";
    int root = crush.get_item_id(root_name);
    int min_size = 3;
    int max_size = 4;
    int steps = 6;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host */);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      tmp.apply_incremental(pending_inc);
    }

    // create an erasure-coded pool referencing the above rule
    int64_t pool_37968;
    {
      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
      new_pool_inc.new_pool_max = tmp.get_pool_max();
      new_pool_inc.fsid = tmp.get_fsid();
      pg_pool_t empty;
      pool_37968 = ++new_pool_inc.new_pool_max;
      pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
      p->size = 4;
      p->set_pg_num(8);
      p->set_pgp_num(8);
      p->type = pg_pool_t::TYPE_ERASURE;
      p->crush_rule = rno;
      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
      new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
      tmp.apply_incremental(new_pool_inc);
    }

    pg_t ec_pg(0, pool_37968);
    pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
    int from = -1;
    int to = -1;
    {
      // insert a valid pg_upmap_item
      vector<int> ec_up;
      int ec_up_primary;
      tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
      ASSERT_TRUE(ec_up.size() == 4);
      from = *(ec_up.begin());
      ASSERT_TRUE(from >= 0);
      auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
      ASSERT_TRUE(parent < 0);
      // pick an osd of the same parent with *from*
      for (int i = 0; i < (int)get_num_osds(); i++) {
        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
          auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
          if (p == parent) {
            to = i;
            break;
          }
        }
      }
      ASSERT_TRUE(to >= 0);
      ASSERT_TRUE(from != to);
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(from, to));
      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[ec_pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      tmp.apply_incremental(pending_inc);
      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
    }
    {
      // *clean_pg_upmaps* should not remove the above upmap_item
      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
      tmp.apply_incremental(pending_inc);
      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
    }
  }
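  // Two flavors of upmap are exercised below: pg_upmap replaces a PG's
  // whole raw up set (e.g. 1.0 -> [3,4,5]), while pg_upmap_items is a
  // list of (from, to) swaps applied to the CRUSH result (e.g. osd.0 ->
  // osd.3, leaving the other replicas alone).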
  {
    // TEST pg_upmap
    {
      // STEP-1: enumerate all children of up[0]'s parent,
      // replace up[1] with one of them (other than up[0])
      int parent = osdmap.crush->get_parent_of_type(up[0],
        osdmap.crush->get_type_id("host"));
      set<int> candidates;
      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
        &candidates);
      ASSERT_LT(1U, candidates.size());
      int replaced_by = -1;
      for (auto c: candidates) {
        if (c != up[0]) {
          replaced_by = c;
          break;
        }
      }
      {
        // Check we can handle a negative pg_upmap value
        vector<int32_t> new_pg_upmap;
        new_pg_upmap.push_back(up[0]);
        new_pg_upmap.push_back(-823648512);
        OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        osdmap.apply_incremental(pending_inc);
        vector<int> new_up;
        int new_up_primary;
        // crucial call - _apply_upmap should ignore the negative value
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
      }
      ASSERT_NE(-1, replaced_by);
      // generate a new pg_upmap item and apply
      vector<int32_t> new_pg_upmap;
      new_pg_upmap.push_back(up[0]);
      new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
        new_pg_upmap.begin(), new_pg_upmap.end());
      osdmap.apply_incremental(pending_inc);
      {
        // validate pg_upmap is there
        vector<int> new_up;
        int new_up_primary;
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
        ASSERT_TRUE(up.size() == new_up.size());
        ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
        ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
        // and we shall have two OSDs from a same host now..
        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
          osdmap.crush->get_type_id("host"));
        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
          osdmap.crush->get_type_id("host"));
        ASSERT_TRUE(parent_0 == parent_1);
      }
    }
    {
      // STEP-2: apply cure
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
      osdmap.apply_incremental(pending_inc);
      {
        // validate pg_upmap is gone (reverted)
        vector<int> new_up;
        int new_up_primary;
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
        ASSERT_TRUE(new_up == up);
        ASSERT_TRUE(new_up_primary == up_primary);
      }
    }
  }

  {
    // TEST pg_upmap_items
    // enumerate all used hosts first
    set<int> parents;
    for (auto u: up) {
      int parent = osdmap.crush->get_parent_of_type(u,
        osdmap.crush->get_type_id("host"));
      ASSERT_GT(0, parent);
      parents.insert(parent);
    }
    int candidate_parent = 0;
    set<int> candidate_children;
    vector<int> up_after_out;
    {
      // STEP-1: try mark out up[1] and all other OSDs from the same host
      int parent = osdmap.crush->get_parent_of_type(up[1],
        osdmap.crush->get_type_id("host"));
      set<int> children;
      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
        &children);
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      for (auto c: children) {
        pending_inc.new_weight[c] = CEPH_OSD_OUT;
      }
      OSDMap tmpmap;
      tmpmap.deepish_copy_from(osdmap);
      tmpmap.apply_incremental(pending_inc);
      vector<int> new_up;
      int new_up_primary;
      tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
      // verify that we'll have OSDs from a different host..
      int will_choose = -1;
      for (auto o: new_up) {
        int parent = tmpmap.crush->get_parent_of_type(o,
          osdmap.crush->get_type_id("host"));
        if (!parents.count(parent)) {
          will_choose = o;
          candidate_parent = parent; // record
          break;
        }
      }
      ASSERT_LT(-1, will_choose); // it is an OSD!
      ASSERT_TRUE(candidate_parent != 0);
      osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
        &candidate_children);
      ASSERT_TRUE(candidate_children.count(will_choose));
      candidate_children.erase(will_choose);
      ASSERT_TRUE(!candidate_children.empty());
      up_after_out = new_up; // needed for verification..
    }
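    // As with pg_upmap above, first feed _apply_upmap a garbage
    // (negative) OSD id via pg_upmap_items; it should be ignored
    // rather than crash or corrupt the mapping.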
    {
      // Make sure we can handle a negative pg_upmap_item
      int victim = up[0];
      int replaced_by = -823648512;
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
      // apply
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      vector<int> new_up;
      int new_up_primary;
      // crucial call - _apply_upmap should ignore the negative value
      osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
    }
    {
      // STEP-2: generating a new pg_upmap_items entry by
      // replacing up[0] with one coming from candidate_children
      int victim = up[0];
      int replaced_by = *candidate_children.begin();
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
      // apply
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      {
        // validate pg_upmap_items is there
        vector<int> new_up;
        int new_up_primary;
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
        ASSERT_TRUE(up.size() == new_up.size());
        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
          new_up.end());
        // and up[1] too
        ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
          new_up.end());
      }
    }
    {
      // STEP-3: mark out up[1] and all other OSDs from the same host
      int parent = osdmap.crush->get_parent_of_type(up[1],
        osdmap.crush->get_type_id("host"));
      set<int> children;
      osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
        &children);
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      for (auto c: children) {
        pending_inc.new_weight[c] = CEPH_OSD_OUT;
      }
      osdmap.apply_incremental(pending_inc);
      {
        // validate we have two OSDs from the same host now..
        vector<int> new_up;
        int new_up_primary;
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
        ASSERT_TRUE(up.size() == new_up.size());
        int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
          osdmap.crush->get_type_id("host"));
        int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
          osdmap.crush->get_type_id("host"));
        ASSERT_TRUE(parent_0 == parent_1);
      }
    }
    {
      // STEP-4: apply cure
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
      osdmap.apply_incremental(pending_inc);
      {
        // validate pg_upmap_items is gone (reverted)
        vector<int> new_up;
        int new_up_primary;
        osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
        ASSERT_TRUE(new_up == up_after_out);
      }
    }
  }
}
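// Regression test for tracker #38897: custom rules that always emit
// osd.0 first, plus pre-existing pg_upmap entries that make osd.0 look
// *underfull* and osd.1 *overfull*, gave calc_pg_upmaps() conflicting
// signals. The test ends by running the optimizer over pool1 only with
// a required max deviation of 0 and passes as long as that call
// completes sanely.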
TEST_F(OSDMapTest, BUG_38897) {
  // http://tracker.ceph.com/issues/38897
  // build a fresh map with 12 OSDs, without any default pools
  set_up_map(12, true);
  const string pool_1("pool1");
  const string pool_2("pool2");
  int64_t pool_1_id = -1;

  {
    // build customized crush rule for "pool1"
    string host_name = "host_for_pool_1";
    // build a customized host to capture osd.1~4
    for (int i = 1; i < 5; i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0);
    string rule_name = "rule_for_pool1";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool1"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pool_1_id = pool_id;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(3);
    p->set_pgp_num(3);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_1;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
    {
      for (unsigned i = 0; i < 3; i++) {
        // 1.x -> [1]
        pg_t rawpg(i, pool_id);
        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
        vector<int> up;
        int up_primary;
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 3);
        ASSERT_TRUE(up[0] == 0);

        // insert a new pg_upmap
        vector<int32_t> new_up;
        // and remap 1.x to osd.1 only
        // this way osd.0 is deemed to be *underfull*
        // and osd.1 is deemed to be *overfull*
        new_up.push_back(1);
        {
          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
            new_up.begin(), new_up.end());
          osdmap.apply_incremental(pending_inc);
        }
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 1);
        ASSERT_TRUE(up[0] == 1);
      }
    }
  }
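  // "pool2" gets the same always-osd.0-first rule shape, but its single
  // PG is then remapped *off* osd.0 via pg_upmap_items, giving the
  // optimizer conflicting fullness signals for osd.0.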
  {
    // build customized crush rule for "pool2"
    string host_name = "host_for_pool_2";
    // build a customized host to capture osd.6~11
    for (int i = 6; i < (int)get_num_osds(); i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0);
    string rule_name = "rule_for_pool2";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool2"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    // include a single PG
    p->set_pg_num(1);
    p->set_pgp_num(1);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_2;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
    pg_t rawpg(0, pool_id);
    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
    vector<int> up;
    int up_primary;
    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
    ASSERT_TRUE(up.size() == 3);
    ASSERT_TRUE(up[0] == 0);

    {
      // build a pg_upmap_item that will
      // remap pg out from *underfull* osd.0
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      ASSERT_TRUE(up[0] == 10);
    }
  }
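  // The actual regression check: with the conflicting upmaps above in
  // place, ask calc_pg_upmaps() for a perfect distribution (max
  // deviation 0) restricted to pool1 and make sure it completes sanely.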
  // ready to go
  {
    set<int64_t> only_pools;
    ASSERT_TRUE(pool_1_id >= 0);
    only_pools.insert(pool_1_id);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    // require perfect distribution! (max deviation 0)
    osdmap.calc_pg_upmaps(g_ceph_context,
                          0, // so we can force optimizing
                          100,
                          only_pools,
                          &pending_inc);
    osdmap.apply_incremental(pending_inc);
  }
}

TEST_F(OSDMapTest, BUG_40104) {
  // http://tracker.ceph.com/issues/40104
  int big_osd_num = 5000;
  int big_pg_num = 10000;
  set_up_map(big_osd_num, true);
  int pool_id;
  {
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(big_pg_num);
    p->set_pgp_num(big_pg_num);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = 0;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = "big_pool";
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool");
  }
  {
    // generate pg_upmap_items for each pg
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    for (int i = 0; i < big_pg_num; i++) {
      pg_t rawpg(i, pool_id);
      pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      int victim = up[0];
      int replaced_by = random() % big_osd_num;
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      // note that it might or might not be valid, we don't care
      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
    }
    osdmap.apply_incremental(pending_inc);
  }
  {
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    auto start = mono_clock::now();
    clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
    auto latency = mono_clock::now() - start;
    std::cout << "clean_pg_upmaps (~" << big_pg_num
              << " pg_upmap_items) latency:" << timespan_str(latency)
              << std::endl;
  }
}
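// BUG_42052: a PG carrying *both* a pg_upmap and a pg_upmap_items entry,
// against a rule that hard-codes the up set to osd.0/1/2. Neither
// exception can be honored meaningfully, so after cleaning, no upmaps
// should remain on the PG.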
TEST_F(OSDMapTest, BUG_42052) {
  // https://tracker.ceph.com/issues/42052
  set_up_map(6, true);
  const string pool_name("pool");
  // build customized crush rule for "pool"
  CrushWrapper crush;
  get_crush(osdmap, crush);
  string rule_name = "rule";
  int rule_type = pg_pool_t::TYPE_REPLICATED;
  ASSERT_TRUE(!crush.rule_exists(rule_name));
  int rno;
  for (rno = 0; rno < crush.get_max_rules(); rno++) {
    if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
      break;
  }
  int min_size = 3;
  int max_size = 3;
  int steps = 8;
  crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
  int step = 0;
  crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
  crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
  // always choose osd.0, osd.1, osd.2
  crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
  crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
  crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 1);
  crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
  crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 2);
  crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
  ASSERT_TRUE(step == steps);
  auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
  ASSERT_TRUE(r >= 0);
  crush.set_rule_name(rno, rule_name);
  {
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.crush.clear();
    crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
    osdmap.apply_incremental(pending_inc);
  }

  // create "pool"
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
  pending_inc.new_pool_max = osdmap.get_pool_max();
  auto pool_id = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  auto p = pending_inc.get_new_pool(pool_id, &empty);
  p->size = 3;
  p->min_size = 1;
  p->set_pg_num(1);
  p->set_pgp_num(1);
  p->type = pg_pool_t::TYPE_REPLICATED;
  p->crush_rule = rno;
  p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  pending_inc.new_pool_names[pool_id] = pool_name;
  osdmap.apply_incremental(pending_inc);
  ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
  ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_name);
  pg_t rawpg(0, pool_id);
  pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
  {
    // pg_upmap 1.0 [2,3,5]
    vector<int32_t> new_up{2,3,5};
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
      new_up.begin(), new_up.end());
    osdmap.apply_incremental(pending_inc);
  }
  {
    // pg_upmap_items 1.0 [0,3,4,5]
    vector<pair<int32_t,int32_t>> new_pg_upmap_items;
    new_pg_upmap_items.push_back(make_pair(0, 3));
    new_pg_upmap_items.push_back(make_pair(4, 5));
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pg_upmap_items[pgid] =
      mempool::osdmap::vector<pair<int32_t,int32_t>>(
        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
    osdmap.apply_incremental(pending_inc);
  }
  {
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
    osdmap.apply_incremental(pending_inc);
    ASSERT_FALSE(osdmap.have_pg_upmaps(pgid));
  }
}

TEST(PGTempMap, basic)
{
  PGTempMap m;
  pg_t a(1,1);
  for (auto i=3; i<1000; ++i) {
    pg_t x(i, 1);
    m.set(x, {static_cast<int32_t>(i)});
  }
  pg_t b(2,1);
  m.set(a, {1, 2});
  ASSERT_NE(m.find(a), m.end());
  ASSERT_EQ(m.find(a), m.begin());
  ASSERT_EQ(m.find(b), m.end());
  ASSERT_EQ(998u, m.size());
}
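// BUG_48884 guards the `ceph osd df tree` aggregation when some OSDs
// (here osd.4~7) report all-zero statfs. The expected "localrack" rollup
// below counts only the 8 OSDs with real stats; assuming kb values are
// byte counts shifted right by 10, that is 8 * (500000 >> 10) = 3904 kb
// total, 8 * (450000 >> 10) = 3512 kb used, and 8 * (50000 >> 10) = 384
// kb each for omap, meta, and avail.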
TEST_F(OSDMapTest, BUG_48884)
{
  set_up_map(12);

  unsigned int host_index = 1;
  for (unsigned int x=0; x < get_num_osds();) {
    // Create three hosts with four osds each
    for (unsigned int y=0; y < 4; y++) {
      stringstream osd_name;
      stringstream host_name;
      vector<string> move_to;
      osd_name << "osd." << x;
      host_name << "host-" << host_index;
      move_to.push_back("root=default");
      move_to.push_back("rack=localrack");
      string host_loc = "host=" + host_name.str();
      move_to.push_back(host_loc);
      int r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
      x++;
    }
    host_index++;
  }

  CrushWrapper crush;
  get_crush(osdmap, crush);
  auto host_id = crush.get_item_id("localhost");
  crush.remove_item(g_ceph_context, host_id, false);
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
  pending_inc.crush.clear();
  crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
  osdmap.apply_incremental(pending_inc);

  PGMap pgmap;
  osd_stat_t stats, stats_null;
  stats.statfs.total = 500000;
  stats.statfs.available = 50000;
  stats.statfs.omap_allocated = 50000;
  stats.statfs.internal_metadata = 50000;
  stats_null.statfs.total = 0;
  stats_null.statfs.available = 0;
  stats_null.statfs.omap_allocated = 0;
  stats_null.statfs.internal_metadata = 0;
  for (unsigned int x=0; x < get_num_osds(); x++) {
    if (x > 3 && x < 8) {
      pgmap.osd_stat.insert({x,stats_null});
    } else {
      pgmap.osd_stat.insert({x,stats});
    }
  }
  stringstream ss;
  boost::scoped_ptr<Formatter> f(Formatter::create("json-pretty"));
  print_osd_utilization(osdmap, pgmap, ss, f.get(), true, "", "root");
  JSONParser parser;
  parser.parse(ss.str().c_str(), static_cast<int>(ss.str().size()));
  auto iter = parser.find_first();
  for (const auto& bucket : (*iter)->get_array_elements()) {
    JSONParser parser2;
    parser2.parse(bucket.c_str(), static_cast<int>(bucket.size()));
    auto* obj = parser2.find_obj("name");
    if (obj->get_data_val().str.compare("localrack") == 0) {
      obj = parser2.find_obj("kb");
      ASSERT_EQ(obj->get_data_val().str, "3904");
      obj = parser2.find_obj("kb_used");
      ASSERT_EQ(obj->get_data_val().str, "3512");
      obj = parser2.find_obj("kb_used_omap");
      ASSERT_EQ(obj->get_data_val().str, "384");
      obj = parser2.find_obj("kb_used_meta");
      ASSERT_EQ(obj->get_data_val().str, "384");
      obj = parser2.find_obj("kb_avail");
      ASSERT_EQ(obj->get_data_val().str, "384");
    }
  }
}