diff options
Diffstat (limited to 'src/test/osd/TestOSDMap.cc')
-rw-r--r-- | src/test/osd/TestOSDMap.cc | 1575 |
1 file changed, 1575 insertions, 0 deletions
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc new file mode 100644 index 00000000..d76341eb --- /dev/null +++ b/src/test/osd/TestOSDMap.cc @@ -0,0 +1,1575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +#include "gtest/gtest.h" +#include "osd/OSDMap.h" +#include "osd/OSDMapMapping.h" +#include "mon/OSDMonitor.h" +#include "mon/PGMap.h" + +#include "global/global_context.h" +#include "global/global_init.h" +#include "common/common_init.h" +#include "common/ceph_argparse.h" +#include "common/ceph_json.h" + +#include <iostream> + +using namespace std; + +int main(int argc, char **argv) { + map<string,string> defaults = { + // make sure we have 3 copies, or some tests won't work + { "osd_pool_default_size", "3" }, + // our map is flat, so just try and split across OSDs, not hosts or whatever + { "osd_crush_chooseleaf_type", "0" }, + }; + std::vector<const char*> args(argv, argv+argc); + auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +class OSDMapTest : public testing::Test { + int num_osds = 6; +public: + OSDMap osdmap; + OSDMapMapping mapping; + const uint64_t my_ec_pool = 1; + const uint64_t my_rep_pool = 2; + + + OSDMapTest() {} + + void set_up_map(int new_num_osds = 6, bool no_default_pools = false) { + num_osds = new_num_osds; + uuid_d fsid; + osdmap.build_simple(g_ceph_context, 0, fsid, num_osds); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.fsid = osdmap.get_fsid(); + entity_addrvec_t sample_addrs; + sample_addrs.v.push_back(entity_addr_t()); + uuid_d sample_uuid; + for (int i = 0; i < num_osds; ++i) { + sample_uuid.generate_random(); + sample_addrs.v[0].nonce = i; + pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW; + pending_inc.new_up_client[i] = sample_addrs; + 
pending_inc.new_up_cluster[i] = sample_addrs; + pending_inc.new_hb_back_up[i] = sample_addrs; + pending_inc.new_hb_front_up[i] = sample_addrs; + pending_inc.new_weight[i] = CEPH_OSD_IN; + pending_inc.new_uuid[i] = sample_uuid; + } + osdmap.apply_incremental(pending_inc); + if (no_default_pools) // do not create any default pool(s) + return; + + // Create an EC ruleset and a pool using it + int r = osdmap.crush->add_simple_rule( + "erasure", "default", "osd", "", + "indep", pg_pool_t::TYPE_ERASURE, + &cerr); + + OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1); + new_pool_inc.new_pool_max = osdmap.get_pool_max(); + new_pool_inc.fsid = osdmap.get_fsid(); + pg_pool_t empty; + // make an ec pool + uint64_t pool_id = ++new_pool_inc.new_pool_max; + ceph_assert(pool_id == my_ec_pool); + pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->set_pg_num(64); + p->set_pgp_num(64); + p->type = pg_pool_t::TYPE_ERASURE; + p->crush_rule = r; + new_pool_inc.new_pool_names[pool_id] = "ec"; + // and a replicated pool + pool_id = ++new_pool_inc.new_pool_max; + ceph_assert(pool_id == my_rep_pool); + p = new_pool_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->set_pg_num(64); + p->set_pgp_num(64); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = 0; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[pool_id] = "reppool"; + osdmap.apply_incremental(new_pool_inc); + } + unsigned int get_num_osds() { return num_osds; } + void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) { + bufferlist bl; + tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + auto p = bl.cbegin(); + newcrush.decode(p); + } + int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) { + map<string,string> loc; + CrushWrapper::parse_loc_map(argvec, &loc); + CrushWrapper newcrush; + get_crush(tmap, newcrush); + if (!newcrush.name_exists(name)) { + return -ENOENT; + } + int id = newcrush.get_item_id(name); + int err; + if 
(!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) { + if (id >= 0) { + err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc); + } else { + err = newcrush.move_bucket(g_ceph_context, id, loc); + } + if (err >= 0) { + OSDMap::Incremental pending_inc(tmap.get_epoch() + 1); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + tmap.apply_incremental(pending_inc); + err = 0; + } + } else { + // already there + err = 0; + } + return err; + } + int crush_rule_create_replicated(const string &name, + const string &root, + const string &type) { + if (osdmap.crush->rule_exists(name)) { + return osdmap.crush->get_rule_id(name); + } + CrushWrapper newcrush; + get_crush(osdmap, newcrush); + string device_class; + stringstream ss; + int ruleno = newcrush.add_simple_rule( + name, root, type, device_class, + "firstn", pg_pool_t::TYPE_REPLICATED, &ss); + if (ruleno >= 0) { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + return ruleno; + } + void test_mappings(int pool, + int num, + vector<int> *any, + vector<int> *first, + vector<int> *primary) { + mapping.update(osdmap); + for (int i=0; i<num; ++i) { + vector<int> up, acting; + int up_primary, acting_primary; + pg_t pgid(i, pool); + osdmap.pg_to_up_acting_osds(pgid, + &up, &up_primary, &acting, &acting_primary); + for (unsigned j=0; j<acting.size(); ++j) + (*any)[acting[j]]++; + if (!acting.empty()) + (*first)[acting[0]]++; + if (acting_primary >= 0) + (*primary)[acting_primary]++; + + // compare to precalc mapping + vector<int> up2, acting2; + int up_primary2, acting_primary2; + pgid = osdmap.raw_pg_to_pg(pgid); + mapping.get(pgid, &up2, &up_primary2, &acting2, &acting_primary2); + ASSERT_EQ(up, up2); + ASSERT_EQ(up_primary, up_primary2); + ASSERT_EQ(acting, acting2); + ASSERT_EQ(acting_primary, 
acting_primary2); + } + cout << "any: " << *any << std::endl;; + cout << "first: " << *first << std::endl;; + cout << "primary: " << *primary << std::endl;; + } + void clean_pg_upmaps(CephContext *cct, + const OSDMap& om, + OSDMap::Incremental& pending_inc) { + int cpu_num = 8; + int pgs_per_chunk = 256; + ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num); + tp.start(); + ParallelPGMapper mapper(cct, &tp); + vector<pg_t> pgs_to_check; + om.get_upmap_pgs(&pgs_to_check); + OSDMonitor::CleanUpmapJob job(cct, om, pending_inc); + mapper.queue(&job, pgs_per_chunk, pgs_to_check); + job.wait(); + tp.stop(); + } +}; + +TEST_F(OSDMapTest, Create) { + set_up_map(); + ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd()); + ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds()); +} + +TEST_F(OSDMapTest, Features) { + // with EC pool + set_up_map(); + uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2); + ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL); + ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY); + + // clients have a slightly different view + features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2); + ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL); + ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY); + + // remove teh EC pool, but leave the rule. add primary affinity. 
+ { + OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1); + new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec")); + new_pool_inc.new_primary_affinity[0] = 0x8000; + osdmap.apply_incremental(new_pool_inc); + } + + features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2); + ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity + ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2); + ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL); + ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY); + + // FIXME: test tiering feature bits +} + +TEST_F(OSDMapTest, MapPG) { + set_up_map(); + + std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl; + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + vector<int> old_up_osds, old_acting_osds; + osdmap.pg_to_up_acting_osds(pgid, old_up_osds, old_acting_osds); + ASSERT_EQ(old_up_osds, up_osds); + ASSERT_EQ(old_acting_osds, acting_osds); + + ASSERT_EQ(osdmap.get_pg_pool(my_rep_pool)->get_size(), up_osds.size()); +} + +TEST_F(OSDMapTest, MapFunctionsMatch) { + // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match + set_up_map(); + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + vector<int> up_osds_two, acting_osds_two; + + osdmap.pg_to_up_acting_osds(pgid, up_osds_two, acting_osds_two); + + ASSERT_EQ(up_osds, up_osds_two); + ASSERT_EQ(acting_osds, acting_osds_two); + + int acting_primary_two; + osdmap.pg_to_acting_osds(pgid, &acting_osds_two, &acting_primary_two); + 
EXPECT_EQ(acting_osds, acting_osds_two); + EXPECT_EQ(acting_primary, acting_primary_two); + osdmap.pg_to_acting_osds(pgid, acting_osds_two); + EXPECT_EQ(acting_osds, acting_osds_two); +} + +/** This test must be removed or modified appropriately when we allow + * other ways to specify a primary. */ +TEST_F(OSDMapTest, PrimaryIsFirst) { + set_up_map(); + + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + EXPECT_EQ(up_osds[0], up_primary); + EXPECT_EQ(acting_osds[0], acting_primary); +} + +TEST_F(OSDMapTest, PGTempRespected) { + set_up_map(); + + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + // copy and swap first and last element in acting_osds + vector<int> new_acting_osds(acting_osds); + int first = new_acting_osds[0]; + new_acting_osds[0] = *new_acting_osds.rbegin(); + *new_acting_osds.rbegin() = first; + + // apply pg_temp to osdmap + OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1); + pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>( + new_acting_osds.begin(), new_acting_osds.end()); + osdmap.apply_incremental(pgtemp_map); + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + EXPECT_EQ(new_acting_osds, acting_osds); +} + +TEST_F(OSDMapTest, PrimaryTempRespected) { + set_up_map(); + + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds; + vector<int> acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + // make second OSD primary via incremental + OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 
1); + pgtemp_map.new_primary_temp[pgid] = acting_osds[1]; + osdmap.apply_incremental(pgtemp_map); + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + EXPECT_EQ(acting_primary, acting_osds[1]); +} + +TEST_F(OSDMapTest, CleanTemps) { + set_up_map(); + + OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 2); + pg_t pga = osdmap.raw_pg_to_pg(pg_t(0, my_rep_pool)); + { + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary, + &acting_osds, &acting_primary); + pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>( + up_osds.begin(), up_osds.end()); + pgtemp_map.new_primary_temp[pga] = up_primary; + } + pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, my_rep_pool)); + { + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary, + &acting_osds, &acting_primary); + pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>( + up_osds.begin(), up_osds.end()); + pending_inc.new_primary_temp[pgb] = up_primary; + } + + osdmap.apply_incremental(pgtemp_map); + + OSDMap tmpmap; + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc); + + EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) && + pending_inc.new_pg_temp[pga].size() == 0); + EXPECT_EQ(-1, pending_inc.new_primary_temp[pga]); + + EXPECT_TRUE(!pending_inc.new_pg_temp.count(pgb) && + !pending_inc.new_primary_temp.count(pgb)); +} + +TEST_F(OSDMapTest, KeepsNecessaryTemps) { + set_up_map(); + + pg_t rawpg(0, my_rep_pool); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up_osds, acting_osds; + int up_primary, acting_primary; + + osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary, + &acting_osds, &acting_primary); + + // find unused OSD and stick it in there + OSDMap::Incremental 
pgtemp_map(osdmap.get_epoch() + 1); + // find an unused osd and put it in place of the first one + int i = 0; + for(; i != (int)get_num_osds(); ++i) { + bool in_use = false; + for (vector<int>::iterator osd_it = up_osds.begin(); + osd_it != up_osds.end(); + ++osd_it) { + if (i == *osd_it) { + in_use = true; + break; + } + } + if (!in_use) { + up_osds[1] = i; + break; + } + } + if (i == (int)get_num_osds()) + FAIL() << "did not find unused OSD for temp mapping"; + + pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>( + up_osds.begin(), up_osds.end()); + pgtemp_map.new_primary_temp[pgid] = up_osds[1]; + osdmap.apply_incremental(pgtemp_map); + + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + + OSDMap tmpmap; + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc); + EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid)); + EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid)); +} + +TEST_F(OSDMapTest, PrimaryAffinity) { + set_up_map(); + + int n = get_num_osds(); + for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin(); + p != osdmap.get_pools().end(); + ++p) { + int pool = p->first; + int expect_primary = 10000 / n; + cout << "pool " << pool << " size " << (int)p->second.size + << " expect_primary " << expect_primary << std::endl; + { + vector<int> any(n, 0); + vector<int> first(n, 0); + vector<int> primary(n, 0); + test_mappings(pool, 10000, &any, &first, &primary); + for (int i=0; i<n; ++i) { + ASSERT_LT(0, any[i]); + ASSERT_LT(0, first[i]); + ASSERT_LT(0, primary[i]); + } + } + + osdmap.set_primary_affinity(0, 0); + osdmap.set_primary_affinity(1, 0); + { + vector<int> any(n, 0); + vector<int> first(n, 0); + vector<int> primary(n, 0); + test_mappings(pool, 10000, &any, &first, &primary); + for (int i=0; i<n; ++i) { + ASSERT_LT(0, any[i]); + if (i >= 2) { + ASSERT_LT(0, first[i]); + ASSERT_LT(0, primary[i]); + } else { + if 
(p->second.is_replicated()) { + ASSERT_EQ(0, first[i]); + } + ASSERT_EQ(0, primary[i]); + } + } + } + + osdmap.set_primary_affinity(0, 0x8000); + osdmap.set_primary_affinity(1, 0); + { + vector<int> any(n, 0); + vector<int> first(n, 0); + vector<int> primary(n, 0); + test_mappings(pool, 10000, &any, &first, &primary); + int expect = (10000 / (n-2)) / 2; // half weight + cout << "expect " << expect << std::endl; + for (int i=0; i<n; ++i) { + ASSERT_LT(0, any[i]); + if (i >= 2) { + ASSERT_LT(0, first[i]); + ASSERT_LT(0, primary[i]); + } else if (i == 1) { + if (p->second.is_replicated()) { + ASSERT_EQ(0, first[i]); + } + ASSERT_EQ(0, primary[i]); + } else { + ASSERT_LT(expect *2/3, primary[0]); + ASSERT_GT(expect *4/3, primary[0]); + } + } + } + + osdmap.set_primary_affinity(0, 0x10000); + osdmap.set_primary_affinity(1, 0x10000); + } +} + +TEST_F(OSDMapTest, get_osd_crush_node_flags) { + set_up_map(); + + for (unsigned i=0; i<get_num_osds(); ++i) { + ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i)); + } + + OSDMap::Incremental inc(osdmap.get_epoch() + 1); + inc.new_crush_node_flags[-1] = 123u; + osdmap.apply_incremental(inc); + for (unsigned i=0; i<get_num_osds(); ++i) { + ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i)); + } + ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000)); + + OSDMap::Incremental inc3(osdmap.get_epoch() + 1); + inc3.new_crush_node_flags[-1] = 456u; + osdmap.apply_incremental(inc3); + for (unsigned i=0; i<get_num_osds(); ++i) { + ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i)); + } + ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000)); + + OSDMap::Incremental inc2(osdmap.get_epoch() + 1); + inc2.new_crush_node_flags[-1] = 0; + osdmap.apply_incremental(inc2); + for (unsigned i=0; i<get_num_osds(); ++i) { + ASSERT_EQ(0u, osdmap.get_crush_node_flags(i)); + } +} + +TEST_F(OSDMapTest, parse_osd_id_list) { + set_up_map(); + set<int> out; + set<int> all; + osdmap.get_all_osds(all); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, 
&cout)); + ASSERT_EQ(1u, out.size()); + ASSERT_EQ(0, *out.begin()); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout)); + ASSERT_EQ(1u, out.size()); + ASSERT_EQ(1, *out.begin()); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout)); + ASSERT_EQ(2u, out.size()); + ASSERT_EQ(0, *out.begin()); + ASSERT_EQ(1, *out.rbegin()); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout)); + ASSERT_EQ(2u, out.size()); + ASSERT_EQ(0, *out.begin()); + ASSERT_EQ(1, *out.rbegin()); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"*"}, &out, &cout)); + ASSERT_EQ(all.size(), out.size()); + ASSERT_EQ(all, out); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"all"}, &out, &cout)); + ASSERT_EQ(all, out); + + ASSERT_EQ(0, osdmap.parse_osd_id_list({"any"}, &out, &cout)); + ASSERT_EQ(all, out); + + ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"foo"}, &out, &cout)); + ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout)); +} + +TEST_F(OSDMapTest, CleanPGUpmaps) { + set_up_map(); + + // build a crush rule of type host + const int expected_host_num = 3; + int osd_per_host = get_num_osds() / expected_host_num; + ASSERT_GE(2, osd_per_host); + int index = 0; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (i && i % osd_per_host == 0) { + ++index; + } + stringstream osd_name; + stringstream host_name; + vector<string> move_to; + osd_name << "osd." 
<< i; + host_name << "host-" << index; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + int r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + const string upmap_rule = "upmap"; + int upmap_rule_no = crush_rule_create_replicated( + upmap_rule, "default", "host"); + ASSERT_LT(0, upmap_rule_no); + + // create a replicated pool which references the above rule + OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1); + new_pool_inc.new_pool_max = osdmap.get_pool_max(); + new_pool_inc.fsid = osdmap.get_fsid(); + pg_pool_t empty; + uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max; + pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty); + p->size = 2; + p->set_pg_num(64); + p->set_pgp_num(64); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = upmap_rule_no; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool"; + osdmap.apply_incremental(new_pool_inc); + + pg_t rawpg(0, upmap_pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_LT(1U, up.size()); + { + // validate we won't have two OSDs from a same host + int parent_0 = osdmap.crush->get_parent_of_type(up[0], + osdmap.crush->get_type_id("host")); + int parent_1 = osdmap.crush->get_parent_of_type(up[1], + osdmap.crush->get_type_id("host")); + ASSERT_TRUE(parent_0 != parent_1); + } + + { + // cancel stale upmaps + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + int from = -1; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(up.begin(), up.end(), i) == up.end()) { + from = i; + break; + } + } + ASSERT_TRUE(from >= 0); + int to = -1; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(up.begin(), up.end(), i) == up.end() && i != from) { + to = i; + break; + } + } + ASSERT_TRUE(to >= 0); + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + 
new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + OSDMap nextmap; + nextmap.deepish_copy_from(osdmap); + nextmap.apply_incremental(pending_inc); + ASSERT_TRUE(nextmap.have_pg_upmaps(pgid)); + OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc); + nextmap.apply_incremental(new_pending_inc); + ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid)); + } + + { + // https://tracker.ceph.com/issues/37493 + pg_t ec_pg(0, my_ec_pool); + pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg); + OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map.. + int from = -1; + int to = -1; + { + // insert a valid pg_upmap_item + vector<int> ec_up; + int ec_up_primary; + osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary); + ASSERT_TRUE(!ec_up.empty()); + from = *(ec_up.begin()); + ASSERT_TRUE(from >= 0); + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) { + to = i; + break; + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[ec_pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid)); + } + { + // mark one of the target OSDs of the above pg_upmap_item as down + OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1); + pending_inc.new_state[to] = CEPH_OSD_UP; + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(!tmpmap.is_up(to)); + ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid)); 
+ } + { + // confirm *clean_pg_upmaps* won't do anything bad + OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc); + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid)); + } + } + + { + // http://tracker.ceph.com/issues/37501 + pg_t ec_pg(0, my_ec_pool); + pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg); + OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map.. + int from = -1; + int to = -1; + { + // insert a valid pg_upmap_item + vector<int> ec_up; + int ec_up_primary; + osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary); + ASSERT_TRUE(!ec_up.empty()); + from = *(ec_up.begin()); + ASSERT_TRUE(from >= 0); + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) { + to = i; + break; + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[ec_pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid)); + } + { + // mark one of the target OSDs of the above pg_upmap_item as out + OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1); + pending_inc.new_weight[to] = CEPH_OSD_OUT; + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(tmpmap.is_out(to)); + ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid)); + } + { + // *clean_pg_upmaps* should be able to remove the above *bad* mapping + OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc); + tmpmap.apply_incremental(pending_inc); + ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid)); + } + } + + { + // http://tracker.ceph.com/issues/37968 + + // 
build a temporary crush topology of 2 hosts, 3 osds per host + OSDMap tmp; // use a tmpmap here, so we do not dirty origin map.. + tmp.deepish_copy_from(osdmap); + const int expected_host_num = 2; + int osd_per_host = get_num_osds() / expected_host_num; + ASSERT_GE(osd_per_host, 3); + int index = 0; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (i && i % osd_per_host == 0) { + ++index; + } + stringstream osd_name; + stringstream host_name; + vector<string> move_to; + osd_name << "osd." << i; + host_name << "host-" << index; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + auto r = crush_move(tmp, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + + // build crush rule + CrushWrapper crush; + get_crush(tmp, crush); + string rule_name = "rule_37968"; + int rule_type = pg_pool_t::TYPE_ERASURE; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + string root_name = "default"; + int root = crush.get_item_id(root_name); + int min_size = 3; + int max_size = 4; + int steps = 6; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host*/); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, 
CEPH_FEATURES_SUPPORTED_DEFAULT); + tmp.apply_incremental(pending_inc); + } + + // create an erasure-coded pool referencing the above rule + int64_t pool_37968; + { + OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1); + new_pool_inc.new_pool_max = tmp.get_pool_max(); + new_pool_inc.fsid = tmp.get_fsid(); + pg_pool_t empty; + pool_37968 = ++new_pool_inc.new_pool_max; + pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty); + p->size = 4; + p->set_pg_num(8); + p->set_pgp_num(8); + p->type = pg_pool_t::TYPE_ERASURE; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[pool_37968] = "pool_37968"; + tmp.apply_incremental(new_pool_inc); + } + + pg_t ec_pg(0, pool_37968); + pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg); + int from = -1; + int to = -1; + { + // insert a valid pg_upmap_item + vector<int> ec_up; + int ec_up_primary; + tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary); + ASSERT_TRUE(ec_up.size() == 4); + from = *(ec_up.begin()); + ASSERT_TRUE(from >= 0); + auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno); + ASSERT_TRUE(parent < 0); + // pick an osd of the same parent with *from* + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) { + auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno); + if (p == parent) { + to = i; + break; + } + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.new_pg_upmap_items[ec_pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid)); + } + { + // *clean_pg_upmaps* should not remove the above upmap_item + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + 
clean_pg_upmaps(g_ceph_context, tmp, pending_inc); + tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid)); + } + } + + { + // TEST pg_upmap + { + // STEP-1: enumerate all children of up[0]'s parent, + // replace up[1] with one of them (other than up[0]) + int parent = osdmap.crush->get_parent_of_type(up[0], + osdmap.crush->get_type_id("host")); + set<int> candidates; + osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates); + ASSERT_LT(1U, candidates.size()); + int replaced_by = -1; + for (auto c: candidates) { + if (c != up[0]) { + replaced_by = c; + break; + } + } + { + // Check we can handle a negative pg_upmap value + vector<int32_t> new_pg_upmap; + new_pg_upmap.push_back(up[0]); + new_pg_upmap.push_back(-823648512); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>( + new_pg_upmap.begin(), new_pg_upmap.end()); + osdmap.apply_incremental(pending_inc); + vector<int> new_up; + int new_up_primary; + // crucial call - _apply_upmap should ignore the negative value + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + } + ASSERT_NE(-1, replaced_by); + // generate a new pg_upmap item and apply + vector<int32_t> new_pg_upmap; + new_pg_upmap.push_back(up[0]); + new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>( + new_pg_upmap.begin(), new_pg_upmap.end()); + osdmap.apply_incremental(pending_inc); + { + // validate pg_upmap is there + vector<int> new_up; + int new_up_primary; + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + ASSERT_TRUE(up.size() == new_up.size()); + ASSERT_TRUE(new_up[0] == new_pg_upmap[0]); + ASSERT_TRUE(new_up[1] == new_pg_upmap[1]); + // and we shall have two OSDs from a same host now.. 
+ int parent_0 = osdmap.crush->get_parent_of_type(new_up[0], + osdmap.crush->get_type_id("host")); + int parent_1 = osdmap.crush->get_parent_of_type(new_up[1], + osdmap.crush->get_type_id("host")); + ASSERT_TRUE(parent_0 == parent_1); + } + } + { + // STEP-2: apply cure + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, pending_inc); + osdmap.apply_incremental(pending_inc); + { + // validate pg_upmap is gone (reverted) + vector<int> new_up; + int new_up_primary; + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + ASSERT_TRUE(new_up == up); + ASSERT_TRUE(new_up_primary = up_primary); + } + } + } + + { + // TEST pg_upmap_items + // enumerate all used hosts first + set<int> parents; + for (auto u: up) { + int parent = osdmap.crush->get_parent_of_type(u, + osdmap.crush->get_type_id("host")); + ASSERT_GT(0, parent); + parents.insert(parent); + } + int candidate_parent = 0; + set<int> candidate_children; + vector<int> up_after_out; + { + // STEP-1: try mark out up[1] and all other OSDs from the same host + int parent = osdmap.crush->get_parent_of_type(up[1], + osdmap.crush->get_type_id("host")); + set<int> children; + osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), + &children); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + for (auto c: children) { + pending_inc.new_weight[c] = CEPH_OSD_OUT; + } + OSDMap tmpmap; + tmpmap.deepish_copy_from(osdmap); + tmpmap.apply_incremental(pending_inc); + vector<int> new_up; + int new_up_primary; + tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + // verify that we'll have OSDs from a different host.. + int will_choose = -1; + for (auto o: new_up) { + int parent = tmpmap.crush->get_parent_of_type(o, + osdmap.crush->get_type_id("host")); + if (!parents.count(parent)) { + will_choose = o; + candidate_parent = parent; // record + break; + } + } + ASSERT_LT(-1, will_choose); // it is an OSD! 
+ ASSERT_TRUE(candidate_parent != 0); + osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent), + &candidate_children); + ASSERT_TRUE(candidate_children.count(will_choose)); + candidate_children.erase(will_choose); + ASSERT_TRUE(!candidate_children.empty()); + up_after_out = new_up; // needed for verification.. + } + { + // Make sure we can handle a negative pg_upmap_item + int victim = up[0]; + int replaced_by = -823648512; + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(victim, replaced_by)); + // apply + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + osdmap.apply_incremental(pending_inc); + vector<int> new_up; + int new_up_primary; + // crucial call - _apply_upmap should ignore the negative value + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + } + { + // STEP-2: generating a new pg_upmap_items entry by + // replacing up[0] with one coming from candidate_children + int victim = up[0]; + int replaced_by = *candidate_children.begin(); + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(victim, replaced_by)); + // apply + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + osdmap.apply_incremental(pending_inc); + { + // validate pg_upmap_items is there + vector<int> new_up; + int new_up_primary; + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + ASSERT_TRUE(up.size() == new_up.size()); + ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) != + new_up.end()); + // and up[1] too + ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) != + new_up.end()); + } + } + { + // STEP-3: mark out up[1] and all other OSDs from the same 
host + int parent = osdmap.crush->get_parent_of_type(up[1], + osdmap.crush->get_type_id("host")); + set<int> children; + osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), + &children); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + for (auto c: children) { + pending_inc.new_weight[c] = CEPH_OSD_OUT; + } + osdmap.apply_incremental(pending_inc); + { + // validate we have two OSDs from the same host now.. + vector<int> new_up; + int new_up_primary; + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + ASSERT_TRUE(up.size() == new_up.size()); + int parent_0 = osdmap.crush->get_parent_of_type(new_up[0], + osdmap.crush->get_type_id("host")); + int parent_1 = osdmap.crush->get_parent_of_type(new_up[1], + osdmap.crush->get_type_id("host")); + ASSERT_TRUE(parent_0 == parent_1); + } + } + { + // STEP-4: apply cure + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, pending_inc); + osdmap.apply_incremental(pending_inc); + { + // validate pg_upmap_items is gone (reverted) + vector<int> new_up; + int new_up_primary; + osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary); + ASSERT_TRUE(new_up == up_after_out); + } + } + } +} + +TEST_F(OSDMapTest, BUG_38897) { + // http://tracker.ceph.com/issues/38897 + // build a fresh map with 12 OSDs, without any default pools + set_up_map(12, true); + const string pool_1("pool1"); + const string pool_2("pool2"); + int64_t pool_1_id = -1; + + { + // build customized crush rule for "pool1" + string host_name = "host_for_pool_1"; + // build a customized host to capture osd.1~4 + for (int i = 1; i < 5; i++) { + stringstream osd_name; + vector<string> move_to; + osd_name << "osd." 
<< i; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name; + move_to.push_back(host_loc); + auto r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + CrushWrapper crush; + get_crush(osdmap, crush); + auto host_id = crush.get_item_id(host_name); + ASSERT_TRUE(host_id < 0); + string rule_name = "rule_for_pool1"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + int min_size = 3; + int max_size = 3; + int steps = 7; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + // always choose osd.0 + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + // then pick any other random osds + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + + // create "pool1" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + auto pool_id = ++pending_inc.new_pool_max; + pool_1_id = pool_id; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->min_size = 1; + p->set_pg_num(3); + p->set_pgp_num(3); + p->type = 
pg_pool_t::TYPE_REPLICATED; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = pool_1; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1); + { + for (unsigned i = 0; i < 3; i++) { + // 1.x -> [1] + pg_t rawpg(i, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 0); + + // insert a new pg_upmap + vector<int32_t> new_up; + // and remap 1.x to osd.1 only + // this way osd.0 is deemed to be *underfull* + // and osd.1 is deemed to be *overfull* + new_up.push_back(1); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>( + new_up.begin(), new_up.end()); + osdmap.apply_incremental(pending_inc); + } + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 1); + ASSERT_TRUE(up[0] == 1); + } + } + } + + { + // build customized crush rule for "pool2" + string host_name = "host_for_pool_2"; + // build a customized host to capture osd.6~11 + for (int i = 6; i < (int)get_num_osds(); i++) { + stringstream osd_name; + vector<string> move_to; + osd_name << "osd." 
<< i; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name; + move_to.push_back(host_loc); + auto r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + CrushWrapper crush; + get_crush(osdmap, crush); + auto host_id = crush.get_item_id(host_name); + ASSERT_TRUE(host_id < 0); + string rule_name = "rule_for_pool2"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + int min_size = 3; + int max_size = 3; + int steps = 7; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + // always choose osd.0 + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + // then pick any other random osds + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + + // create "pool2" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + auto pool_id = ++pending_inc.new_pool_max; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + // include a single PG + p->set_pg_num(1); + p->set_pgp_num(1); + p->type = 
pg_pool_t::TYPE_REPLICATED; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = pool_2; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2); + pg_t rawpg(0, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid)); + vector<int> up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 0); + + { + // build a pg_upmap_item that will + // remap pg out from *underfull* osd.0 + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10 + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_upmaps(pgid)); + vector<int> up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 10); + } + } + + // ready to go + { + set<int64_t> only_pools; + ASSERT_TRUE(pool_1_id >= 0); + only_pools.insert(pool_1_id); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + // require perfect distribution! 
(max deviation 0) + osdmap.calc_pg_upmaps(g_ceph_context, + 0, // so we can force optimizing + 100, + only_pools, + &pending_inc); + osdmap.apply_incremental(pending_inc); + } +} + +TEST_F(OSDMapTest, BUG_40104) { + // http://tracker.ceph.com/issues/40104 + int big_osd_num = 5000; + int big_pg_num = 10000; + set_up_map(big_osd_num, true); + int pool_id; + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + pool_id = ++pending_inc.new_pool_max; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->min_size = 1; + p->set_pg_num(big_pg_num); + p->set_pgp_num(big_pg_num); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = 0; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = "big_pool"; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool"); + } + { + // generate pg_upmap_items for each pg + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + for (int i = 0; i < big_pg_num; i++) { + pg_t rawpg(i, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector<int> up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + int victim = up[0]; + int replaced_by = random() % big_osd_num; + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + // note that it might or might not be valid, we don't care + new_pg_upmap_items.push_back(make_pair(victim, replaced_by)); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + } + osdmap.apply_incremental(pending_inc); + } + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + auto start = mono_clock::now(); + clean_pg_upmaps(g_ceph_context, osdmap, pending_inc); + auto latency = mono_clock::now() - start; + std::cout << "clean_pg_upmaps (~" << big_pg_num + 
<< " pg_upmap_items) latency:" << timespan_str(latency) + << std::endl; + } +} + +TEST_F(OSDMapTest, BUG_42052) { + // https://tracker.ceph.com/issues/42052 + set_up_map(6, true); + const string pool_name("pool"); + // build customized crush rule for "pool" + CrushWrapper crush; + get_crush(osdmap, crush); + string rule_name = "rule"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + int min_size = 3; + int max_size = 3; + int steps = 8; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + // always choose osd.0, osd.1, osd.2 + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 1); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 2); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + + // create "pool" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + auto pool_id = ++pending_inc.new_pool_max; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->min_size = 1; + p->set_pg_num(1); + p->set_pgp_num(1); + p->type = pg_pool_t::TYPE_REPLICATED; + 
p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = pool_name; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_name); + pg_t rawpg(0, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + { + // pg_upmap 1.0 [2,3,5] + vector<int32_t> new_up{2,3,5}; + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>( + new_up.begin(), new_up.end()); + osdmap.apply_incremental(pending_inc); + } + { + // pg_upmap_items 1.0 [0,3,4,5] + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(0, 3)); + new_pg_upmap_items.push_back(make_pair(4, 5)); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + osdmap.apply_incremental(pending_inc); + } + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, pending_inc); + osdmap.apply_incremental(pending_inc); + ASSERT_FALSE(osdmap.have_pg_upmaps(pgid)); + } +} + +TEST(PGTempMap, basic) +{ + PGTempMap m; + pg_t a(1,1); + for (auto i=3; i<1000; ++i) { + pg_t x(i, 1); + m.set(x, {static_cast<int>(i)}); + } + pg_t b(2,1); + m.set(a, {1, 2}); + ASSERT_NE(m.find(a), m.end()); + ASSERT_EQ(m.find(a), m.begin()); + ASSERT_EQ(m.find(b), m.end()); + ASSERT_EQ(998u, m.size()); +} + +TEST_F(OSDMapTest, BUG_48884) +{ + + set_up_map(12); + + unsigned int host_index = 1; + for (unsigned int x=0; x < get_num_osds();) { + // Create three hosts with four osds each + for (unsigned int y=0; y < 4; y++) { + stringstream osd_name; + stringstream host_name; + vector<string> move_to; + osd_name << "osd." 
<< x; + host_name << "host-" << host_index; + move_to.push_back("root=default"); + move_to.push_back("rack=localrack"); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + int r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + x++; + } + host_index++; + } + + CrushWrapper crush; + get_crush(osdmap, crush); + auto host_id = crush.get_item_id("localhost"); + crush.remove_item(g_ceph_context, host_id, false); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + + PGMap pgmap; + osd_stat_t stats, stats_null; + stats.statfs.total = 500000; + stats.statfs.available = 50000; + stats.statfs.omap_allocated = 50000; + stats.statfs.internal_metadata = 50000; + stats_null.statfs.total = 0; + stats_null.statfs.available = 0; + stats_null.statfs.omap_allocated = 0; + stats_null.statfs.internal_metadata = 0; + for (unsigned int x=0; x < get_num_osds(); x++) { + if (x > 3 && x < 8) { + pgmap.osd_stat.insert({x,stats_null}); + } else { + pgmap.osd_stat.insert({x,stats}); + } + } + + stringstream ss; + boost::scoped_ptr<Formatter> f(Formatter::create("json-pretty")); + print_osd_utilization(osdmap, pgmap, ss, f.get(), true, "", "root"); + JSONParser parser; + parser.parse(ss.str().c_str(), static_cast<int>(ss.str().size())); + auto iter = parser.find_first(); + for (const auto bucket : (*iter)->get_array_elements()) { + JSONParser parser2; + parser2.parse(bucket.c_str(), static_cast<int>(bucket.size())); + auto* obj = parser2.find_obj("name"); + if (obj->get_data_val().str.compare("localrack") == 0) { + obj = parser2.find_obj("kb"); + ASSERT_EQ(obj->get_data_val().str, "3904"); + obj = parser2.find_obj("kb_used"); + ASSERT_EQ(obj->get_data_val().str, "3512"); + obj = parser2.find_obj("kb_used_omap"); + ASSERT_EQ(obj->get_data_val().str, "384"); + obj = parser2.find_obj("kb_used_meta"); + 
ASSERT_EQ(obj->get_data_val().str, "384"); + obj = parser2.find_obj("kb_avail"); + ASSERT_EQ(obj->get_data_val().str, "384"); + } + } +} |