From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 20:24:20 +0200 Subject: Adding upstream version 14.2.21. Signed-off-by: Daniel Baumann --- src/crush/CMakeLists.txt | 19 + src/crush/CrushCompiler.cc | 1276 +++++++++++++ src/crush/CrushCompiler.h | 92 + src/crush/CrushLocation.cc | 124 ++ src/crush/CrushLocation.h | 35 + src/crush/CrushTester.cc | 802 +++++++++ src/crush/CrushTester.h | 366 ++++ src/crush/CrushTreeDumper.h | 291 +++ src/crush/CrushWrapper.cc | 4185 +++++++++++++++++++++++++++++++++++++++++++ src/crush/CrushWrapper.h | 1657 +++++++++++++++++ src/crush/CrushWrapper.i | 47 + src/crush/builder.c | 1525 ++++++++++++++++ src/crush/builder.h | 344 ++++ src/crush/crush.c | 137 ++ src/crush/crush.h | 549 ++++++ src/crush/crush_compat.h | 39 + src/crush/crush_ln_table.h | 164 ++ src/crush/grammar.h | 191 ++ src/crush/hash.c | 151 ++ src/crush/hash.h | 23 + src/crush/mapper.c | 1105 ++++++++++++ src/crush/mapper.h | 93 + src/crush/old_sample.txt | 82 + src/crush/sample.txt | 47 + src/crush/types.h | 17 + 25 files changed, 13361 insertions(+) create mode 100644 src/crush/CMakeLists.txt create mode 100644 src/crush/CrushCompiler.cc create mode 100644 src/crush/CrushCompiler.h create mode 100644 src/crush/CrushLocation.cc create mode 100644 src/crush/CrushLocation.h create mode 100644 src/crush/CrushTester.cc create mode 100644 src/crush/CrushTester.h create mode 100644 src/crush/CrushTreeDumper.h create mode 100644 src/crush/CrushWrapper.cc create mode 100644 src/crush/CrushWrapper.h create mode 100644 src/crush/CrushWrapper.i create mode 100644 src/crush/builder.c create mode 100644 src/crush/builder.h create mode 100644 src/crush/crush.c create mode 100644 src/crush/crush.h create mode 100644 src/crush/crush_compat.h create mode 100644 src/crush/crush_ln_table.h create mode 100644 src/crush/grammar.h create mode 100644 src/crush/hash.c create mode 100644 src/crush/hash.h create mode 100644 
src/crush/mapper.c create mode 100644 src/crush/mapper.h create mode 100644 src/crush/old_sample.txt create mode 100644 src/crush/sample.txt create mode 100644 src/crush/types.h (limited to 'src/crush') diff --git a/src/crush/CMakeLists.txt b/src/crush/CMakeLists.txt new file mode 100644 index 00000000..ae9b9f47 --- /dev/null +++ b/src/crush/CMakeLists.txt @@ -0,0 +1,19 @@ +set(crush_srcs + builder.c + mapper.c + crush.c + hash.c + CrushWrapper.cc + CrushCompiler.cc + CrushTester.cc + CrushLocation.cc) + +add_library(crush_objs OBJECT ${crush_srcs}) + +if(WITH_SEASTAR) + add_library(crimson-crush OBJECT ${crush_srcs}) + target_compile_definitions(crimson-crush PRIVATE + "WITH_SEASTAR=1") + target_include_directories(crimson-crush PRIVATE + $) +endif() diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc new file mode 100644 index 00000000..52ad0563 --- /dev/null +++ b/src/crush/CrushCompiler.cc @@ -0,0 +1,1276 @@ + +#include "CrushCompiler.h" + +#if defined(_AIX) +#define EBADE ECORRUPT +#endif + +#ifndef EBADE +#define EBADE EFTYPE +#endif +#include +#include "common/errno.h" +#include + +// ------------- + +static void print_type_name(ostream& out, int t, CrushWrapper &crush) +{ + const char *name = crush.get_type_name(t); + if (name) + out << name; + else if (t == 0) + out << "device"; + else + out << "type" << t; +} + +static void print_item_name(ostream& out, int t, CrushWrapper &crush) +{ + const char *name = crush.get_item_name(t); + if (name) + out << name; + else if (t >= 0) + out << "device" << t; + else + out << "bucket" << (-1-t); +} + +static void print_bucket_class_ids(ostream& out, int t, CrushWrapper &crush) +{ + if (crush.class_bucket.count(t) == 0) + return; + auto &class_to_id = crush.class_bucket[t]; + for (auto &i : class_to_id) { + int c = i.first; + int cid = i.second; + const char* class_name = crush.get_class_name(c); + ceph_assert(class_name); + out << "\tid " << cid << " class " << class_name << "\t\t# do not change 
unnecessarily\n"; + } +} + +static void print_item_class(ostream& out, int t, CrushWrapper &crush) +{ + const char *c = crush.get_item_class(t); + if (c) + out << " class " << c; +} + +static void print_class(ostream& out, int t, CrushWrapper &crush) +{ + const char *c = crush.get_class_name(t); + if (c) + out << " class " << c; + else + out << " # unexpected class " << t; +} + +static void print_rule_name(ostream& out, int t, CrushWrapper &crush) +{ + const char *name = crush.get_rule_name(t); + if (name) + out << name; + else + out << "rule" << t; +} + +static void print_fixedpoint(ostream& out, int i) +{ + char s[20]; + snprintf(s, sizeof(s), "%.3f", (float)i / (float)0x10000); + out << s; +} + +int CrushCompiler::decompile_bucket_impl(int i, ostream &out) +{ + const char *name = crush.get_item_name(i); + if (name && !crush.is_valid_crush_name(name)) + return 0; + int type = crush.get_bucket_type(i); + print_type_name(out, type, crush); + out << " "; + print_item_name(out, i, crush); + out << " {\n"; + out << "\tid " << i << "\t\t# do not change unnecessarily\n"; + print_bucket_class_ids(out, i, crush); + + out << "\t# weight "; + print_fixedpoint(out, crush.get_bucket_weight(i)); + out << "\n"; + + int n = crush.get_bucket_size(i); + + int alg = crush.get_bucket_alg(i); + out << "\talg " << crush_bucket_alg_name(alg); + + // notate based on alg type + bool dopos = false; + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + out << "\t# do not change bucket size (" << n << ") unnecessarily"; + dopos = true; + break; + case CRUSH_BUCKET_LIST: + out << "\t# add new items at the end; do not change order unnecessarily"; + break; + case CRUSH_BUCKET_TREE: + out << "\t# do not change pos for existing items unnecessarily"; + dopos = true; + break; + } + out << "\n"; + + int hash = crush.get_bucket_hash(i); + out << "\thash " << hash << "\t# " << crush_hash_name(hash) << "\n"; + + for (int j=0; j& dcb_states, + ostream &out) +{ + if ((cur == 0) || (!crush.bucket_exists(cur))) 
+ return 0; + + std::map::iterator c = dcb_states.find(cur); + if (c == dcb_states.end()) { + // Mark this bucket as "in progress." + std::map::value_type val(cur, DCB_STATE_IN_PROGRESS); + std::pair ::iterator, bool> rval + (dcb_states.insert(val)); + ceph_assert(rval.second); + c = rval.first; + } + else if (c->second == DCB_STATE_DONE) { + // We already did this bucket. + return 0; + } + else if (c->second == DCB_STATE_IN_PROGRESS) { + err << "decompile_crush_bucket: logic error: tried to decompile " + "a bucket that is already being decompiled" << std::endl; + return -EBADE; + } + else { + err << "decompile_crush_bucket: logic error: illegal bucket state! " + << c->second << std::endl; + return -EBADE; + } + + int bsize = crush.get_bucket_size(cur); + for (int i = 0; i < bsize; ++i) { + int item = crush.get_bucket_item(cur, i); + std::map::iterator d = dcb_states.find(item); + if (d == dcb_states.end()) { + int ret = decompile_bucket(item, dcb_states, out); + if (ret) + return ret; + } + else if (d->second == DCB_STATE_IN_PROGRESS) { + err << "decompile_crush_bucket: error: while trying to output bucket " + << cur << ", we found out that it contains one of the buckets that " + << "contain it. This is not allowed. The buckets must form a " + << "directed acyclic graph." 
<< std::endl; + return -EINVAL; + } + else if (d->second != DCB_STATE_DONE) { + err << "decompile_crush_bucket: logic error: illegal bucket state " + << d->second << std::endl; + return -EBADE; + } + } + decompile_bucket_impl(cur, out); + c->second = DCB_STATE_DONE; + return 0; +} + +int CrushCompiler::decompile_weight_set_weights(crush_weight_set weight_set, + ostream &out) +{ + out << " [ "; + for (__u32 i = 0; i < weight_set.size; i++) { + print_fixedpoint(out, weight_set.weights[i]); + out << " "; + } + out << "]\n"; + return 0; +} + +int CrushCompiler::decompile_weight_set(crush_weight_set *weight_set, + __u32 size, + ostream &out) +{ + out << " weight_set [\n"; + for (__u32 i = 0; i < size; i++) { + int r = decompile_weight_set_weights(weight_set[i], out); + if (r < 0) + return r; + } + out << " ]\n"; + return 0; +} + +int CrushCompiler::decompile_ids(__s32 *ids, + __u32 size, + ostream &out) +{ + out << " ids [ "; + for (__u32 i = 0; i < size; i++) + out << ids[i] << " "; + out << "]\n"; + return 0; +} + +int CrushCompiler::decompile_choose_arg(crush_choose_arg *arg, + int bucket_id, + ostream &out) +{ + int r; + out << " {\n"; + out << " bucket_id " << bucket_id << "\n"; + if (arg->weight_set_positions > 0) { + r = decompile_weight_set(arg->weight_set, arg->weight_set_positions, out); + if (r < 0) + return r; + } + if (arg->ids_size > 0) { + r = decompile_ids(arg->ids, arg->ids_size, out); + if (r < 0) + return r; + } + out << " }\n"; + return 0; +} + +int CrushCompiler::decompile_choose_arg_map(crush_choose_arg_map arg_map, + ostream &out) +{ + for (__u32 i = 0; i < arg_map.size; i++) { + if ((arg_map.args[i].ids_size == 0) && + (arg_map.args[i].weight_set_positions == 0)) + continue; + int r = decompile_choose_arg(&arg_map.args[i], -1-i, out); + if (r < 0) + return r; + } + return 0; +} + +int CrushCompiler::decompile_choose_args(const std::pair &i, + ostream &out) +{ + out << "choose_args " << i.first << " {\n"; + int r = 
decompile_choose_arg_map(i.second, out); + if (r < 0) + return r; + out << "}\n"; + return 0; +} + +int CrushCompiler::decompile(ostream &out) +{ + out << "# begin crush map\n"; + + // only dump tunables if they differ from the defaults + if (crush.get_choose_local_tries() != 2) + out << "tunable choose_local_tries " << crush.get_choose_local_tries() << "\n"; + if (crush.get_choose_local_fallback_tries() != 5) + out << "tunable choose_local_fallback_tries " << crush.get_choose_local_fallback_tries() << "\n"; + if (crush.get_choose_total_tries() != 19) + out << "tunable choose_total_tries " << crush.get_choose_total_tries() << "\n"; + if (crush.get_chooseleaf_descend_once() != 0) + out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n"; + if (crush.get_chooseleaf_vary_r() != 0) + out << "tunable chooseleaf_vary_r " << crush.get_chooseleaf_vary_r() << "\n"; + if (crush.get_chooseleaf_stable() != 0) + out << "tunable chooseleaf_stable " << crush.get_chooseleaf_stable() << "\n"; + if (crush.get_straw_calc_version() != 0) + out << "tunable straw_calc_version " << crush.get_straw_calc_version() << "\n"; + if (crush.get_allowed_bucket_algs() != CRUSH_LEGACY_ALLOWED_BUCKET_ALGS) + out << "tunable allowed_bucket_algs " << crush.get_allowed_bucket_algs() + << "\n"; + + out << "\n# devices\n"; + for (int i=0; i dcb_states; + for (int bucket = -1; bucket > -1-crush.get_max_buckets(); --bucket) { + int ret = decompile_bucket(bucket, dcb_states, out); + if (ret) + return ret; + } + + out << "\n# rules\n"; + for (int i=0; i= 0) + step_item = original_item; + print_item_name(out, step_item, crush); + if (c >= 0) + print_class(out, c, crush); + } + out << "\n"; + break; + case CRUSH_RULE_EMIT: + out << "\tstep emit\n"; + break; + case CRUSH_RULE_SET_CHOOSE_TRIES: + out << "\tstep set_choose_tries " << crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + out << "\tstep set_choose_local_tries " << 
crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + out << "\tstep set_choose_local_fallback_tries " << crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + out << "\tstep set_chooseleaf_tries " << crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + out << "\tstep set_chooseleaf_vary_r " << crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + out << "\tstep set_chooseleaf_stable " << crush.get_rule_arg1(i, j) + << "\n"; + break; + case CRUSH_RULE_CHOOSE_FIRSTN: + out << "\tstep choose firstn " + << crush.get_rule_arg1(i, j) + << " type "; + print_type_name(out, crush.get_rule_arg2(i, j), crush); + out << "\n"; + break; + case CRUSH_RULE_CHOOSE_INDEP: + out << "\tstep choose indep " + << crush.get_rule_arg1(i, j) + << " type "; + print_type_name(out, crush.get_rule_arg2(i, j), crush); + out << "\n"; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + out << "\tstep chooseleaf firstn " + << crush.get_rule_arg1(i, j) + << " type "; + print_type_name(out, crush.get_rule_arg2(i, j), crush); + out << "\n"; + break; + case CRUSH_RULE_CHOOSELEAF_INDEP: + out << "\tstep chooseleaf indep " + << crush.get_rule_arg1(i, j) + << " type "; + print_type_name(out, crush.get_rule_arg2(i, j), crush); + out << "\n"; + break; + } + } + out << "}\n"; + } + if (crush.choose_args.size() > 0) { + out << "\n# choose_args\n"; + for (auto i : crush.choose_args) { + int ret = decompile_choose_args(i, out); + if (ret) + return ret; + } + } + out << "\n# end crush map" << std::endl; + return 0; +} + + +// ================================================================ + +string CrushCompiler::string_node(node_t &node) +{ + return boost::trim_copy(string(node.value.begin(), node.value.end())); +} + +int CrushCompiler::int_node(node_t &node) +{ + string str = string_node(node); + return strtol(str.c_str(), 0, 10); +} + +float 
CrushCompiler::float_node(node_t &node) +{ + string s = string_node(node); + return strtof(s.c_str(), 0); +} + +int CrushCompiler::parse_device(iter_t const& i) +{ + int id = int_node(i->children[1]); + + string name = string_node(i->children[2]); + crush.set_item_name(id, name.c_str()); + if (item_id.count(name)) { + err << "item " << name << " defined twice" << std::endl; + return -1; + } + item_id[name] = id; + id_item[id] = name; + + if (verbose) err << "device " << id << " '" << name << "'"; + + if (i->children.size() > 3) { + string c = string_node(i->children[4]); + crush.set_item_class(id, c); + if (verbose) err << " class" << " '" << c << "'" << std::endl; + } else { + if (verbose) err << std::endl; + } + return 0; +} + +int CrushCompiler::parse_tunable(iter_t const& i) +{ + string name = string_node(i->children[1]); + int val = int_node(i->children[2]); + + if (name == "choose_local_tries") + crush.set_choose_local_tries(val); + else if (name == "choose_local_fallback_tries") + crush.set_choose_local_fallback_tries(val); + else if (name == "choose_total_tries") + crush.set_choose_total_tries(val); + else if (name == "chooseleaf_descend_once") + crush.set_chooseleaf_descend_once(val); + else if (name == "chooseleaf_vary_r") + crush.set_chooseleaf_vary_r(val); + else if (name == "chooseleaf_stable") + crush.set_chooseleaf_stable(val); + else if (name == "straw_calc_version") + crush.set_straw_calc_version(val); + else if (name == "allowed_bucket_algs") + crush.set_allowed_bucket_algs(val); + else { + err << "tunable " << name << " not recognized" << std::endl; + return -1; + } + + /* + + current crop of tunables are all now "safe". re-enable this when we + add new ones that are ... new. 
+ + if (!unsafe_tunables) { + err << "tunables are NOT FULLY IMPLEMENTED; enable with --enable-unsafe-tunables to enable this feature" << std::endl; + return -1; + } + */ + + if (verbose) err << "tunable " << name << " " << val << std::endl; + return 0; +} + +int CrushCompiler::parse_bucket_type(iter_t const& i) +{ + int id = int_node(i->children[1]); + string name = string_node(i->children[2]); + if (verbose) err << "type " << id << " '" << name << "'" << std::endl; + type_id[name] = id; + crush.set_type_name(id, name.c_str()); + return 0; +} + +int CrushCompiler::parse_bucket(iter_t const& i) +{ + string tname = string_node(i->children[0]); + if (!type_id.count(tname)) { + err << "bucket type '" << tname << "' is not defined" << std::endl; + return -1; + } + int type = type_id[tname]; + + string name = string_node(i->children[1]); + if (item_id.count(name)) { + err << "bucket or device '" << name << "' is already defined" << std::endl; + return -1; + } + + int id = 0; // none, yet! + int alg = -1; + int hash = 0; + set used_items; + int size = 0; + map class_id; + + for (unsigned p=3; pchildren.size()-1; p++) { + iter_t sub = i->children.begin() + p; + string tag = string_node(sub->children[0]); + //err << "tag " << tag << std::endl; + if (tag == "id") { + int maybe_id = int_node(sub->children[1]); + if (verbose) err << "bucket " << name << " id " << maybe_id; + if (sub->children.size() > 2) { + string class_name = string_node(sub->children[3]); + // note that we do not verify class existence here, + // as this bucket might come from an empty shadow tree + // which currently has no OSDs but is still referenced by a rule! 
+ int cid = crush.get_or_create_class_id(class_name); + if (class_id.count(cid) != 0) { + err << "duplicate device class " << class_name << " for bucket " << name << std::endl; + return -ERANGE; + } + class_id[cid] = maybe_id; + if (verbose) err << " class" << " '" << class_name << "'" << std::endl; + } else { + id = maybe_id; + if (verbose) err << std::endl; + } + } else if (tag == "alg") { + string a = string_node(sub->children[1]); + if (a == "uniform") + alg = CRUSH_BUCKET_UNIFORM; + else if (a == "list") + alg = CRUSH_BUCKET_LIST; + else if (a == "tree") + alg = CRUSH_BUCKET_TREE; + else if (a == "straw") + alg = CRUSH_BUCKET_STRAW; + else if (a == "straw2") + alg = CRUSH_BUCKET_STRAW2; + else { + err << "unknown bucket alg '" << a << "'" << std::endl << std::endl; + return -EINVAL; + } + } + else if (tag == "hash") { + string a = string_node(sub->children[1]); + if (a == "rjenkins1") + hash = CRUSH_HASH_RJENKINS1; + else + hash = atoi(a.c_str()); + } + else if (tag == "item") { + // first, just determine which item pos's are already used + size++; + for (unsigned q = 2; q < sub->children.size(); q++) { + string tag = string_node(sub->children[q++]); + if (tag == "pos") { + int pos = int_node(sub->children[q]); + if (used_items.count(pos)) { + err << "item '" << string_node(sub->children[1]) << "' in bucket '" << name << "' has explicit pos " << pos << ", which is occupied" << std::endl; + return -1; + } + used_items.insert(pos); + } + } + } + else ceph_abort(); + } + + // now do the items. 
+ if (!used_items.empty()) + size = std::max(size, *used_items.rbegin()); + vector items(size); + vector weights(size); + + int curpos = 0; + unsigned bucketweight = 0; + bool have_uniform_weight = false; + unsigned uniform_weight = 0; + for (unsigned p=3; pchildren.size()-1; p++) { + iter_t sub = i->children.begin() + p; + string tag = string_node(sub->children[0]); + if (tag == "item") { + + string iname = string_node(sub->children[1]); + if (!item_id.count(iname)) { + err << "item '" << iname << "' in bucket '" << name << "' is not defined" << std::endl; + return -1; + } + int itemid = item_id[iname]; + + unsigned weight = 0x10000; + if (item_weight.count(itemid)) + weight = item_weight[itemid]; + + int pos = -1; + for (unsigned q = 2; q < sub->children.size(); q++) { + string tag = string_node(sub->children[q++]); + if (tag == "weight") { + weight = float_node(sub->children[q]) * (float)0x10000; + if (weight > CRUSH_MAX_DEVICE_WEIGHT && itemid >= 0) { + err << "device weight limited to " << CRUSH_MAX_DEVICE_WEIGHT / 0x10000 << std::endl; + return -ERANGE; + } + else if (weight > CRUSH_MAX_BUCKET_WEIGHT && itemid < 0) { + err << "bucket weight limited to " << CRUSH_MAX_BUCKET_WEIGHT / 0x10000 + << " to prevent overflow" << std::endl; + return -ERANGE; + } + } + else if (tag == "pos") + pos = int_node(sub->children[q]); + else + ceph_abort(); + + } + if (alg == CRUSH_BUCKET_UNIFORM) { + if (!have_uniform_weight) { + have_uniform_weight = true; + uniform_weight = weight; + } else { + if (uniform_weight != weight) { + err << "item '" << iname << "' in uniform bucket '" << name << "' has weight " << weight + << " but previous item(s) have weight " << (float)uniform_weight/(float)0x10000 + << "; uniform bucket items must all have identical weights." 
<< std::endl; + return -1; + } + } + } + + if (pos >= size) { + err << "item '" << iname << "' in bucket '" << name << "' has pos " << pos << " >= size " << size << std::endl; + return -1; + } + if (pos < 0) { + while (used_items.count(curpos)) curpos++; + pos = curpos++; + } + //err << " item " << iname << " (" << itemid << ") pos " << pos << " weight " << weight << std::endl; + items[pos] = itemid; + weights[pos] = weight; + + if (crush_addition_is_unsafe(bucketweight, weight)) { + err << "oh no! our bucket weights are overflowing all over the place, better lower the item weights" << std::endl; + return -ERANGE; + } + + bucketweight += weight; + } + } + + if (id == 0) { + for (id=-1; id_item.count(id); id--) ; + //err << "assigned id " << id << std::endl; + } + + for (auto &i : class_id) + class_bucket[id][i.first] = i.second; + + if (verbose) err << "bucket " << name << " (" << id << ") " << size << " items and weight " + << (float)bucketweight / (float)0x10000 << std::endl; + id_item[id] = name; + item_id[name] = id; + item_weight[id] = bucketweight; + + ceph_assert(id != 0); + int idout; + int r = crush.add_bucket(id, alg, hash, type, size, + items.data(), weights.data(), &idout); + if (r < 0) { + if (r == -EEXIST) + err << "Duplicate bucket id " << id << std::endl; + else + err << "add_bucket failed " << cpp_strerror(r) << std::endl; + return r; + } + r = crush.set_item_name(id, name.c_str()); + return r; +} + +int CrushCompiler::parse_rule(iter_t const& i) +{ + int start; // rule name is optional! 
+ + string rname = string_node(i->children[1]); + if (rname != "{") { + if (rule_id.count(rname)) { + err << "rule name '" << rname << "' already defined\n" << std::endl; + return -1; + } + start = 4; + } else { + rname = string(); + start = 3; + } + + int ruleno = int_node(i->children[start]); + + string tname = string_node(i->children[start+2]); + int type; + if (tname == "replicated") + type = CEPH_PG_TYPE_REPLICATED; + else if (tname == "erasure") + type = CEPH_PG_TYPE_ERASURE; + else + ceph_abort(); + + int minsize = int_node(i->children[start+4]); + int maxsize = int_node(i->children[start+6]); + + int steps = i->children.size() - start - 8; + //err << "num steps " << steps << std::endl; + + if (crush.rule_exists(ruleno)) { + err << "rule " << ruleno << " already exists" << std::endl; + return -1; + } + int r = crush.add_rule(ruleno, steps, type, minsize, maxsize); + if (r != ruleno) { + err << "unable to add rule id " << ruleno << " for rule '" << rname + << "'" << std::endl; + return -1; + } + if (rname.length()) { + crush.set_rule_name(ruleno, rname.c_str()); + rule_id[rname] = ruleno; + } + + int step = 0; + for (iter_t p = i->children.begin() + start + 7; step < steps; p++) { + iter_t s = p->children.begin() + 1; + int stepid = s->value.id().to_long(); + switch (stepid) { + case crush_grammar::_step_take: + { + string item = string_node(s->children[1]); + if (!item_id.count(item)) { + err << "in rule '" << rname << "' item '" << item << "' not defined" << std::endl; + return -1; + } + int id = item_id[item]; + int c = -1; + string class_name; + if (s->children.size() > 2) { + class_name = string_node(s->children[3]); + c = crush.get_class_id(class_name); + if (c < 0) + return c; + if (crush.class_bucket.count(id) == 0) { + err << "in rule '" << rname << "' step take " << item + << " has no class information" << std::endl; + return -EINVAL; + } + if (crush.class_bucket[id].count(c) == 0) { + err << "in rule '" << rname << "' step take " << item + << " no 
matching bucket for class " << class_name << std::endl; + return -EINVAL; + } + id = crush.class_bucket[id][c]; + } + if (verbose) { + err << "rule " << rname << " take " << item; + if (c < 0) + err << std::endl; + else + err << " remapped to " << crush.get_item_name(id) << std::endl; + } + + crush.set_rule_step_take(ruleno, step++, id); + } + break; + + case crush_grammar::_step_set_choose_tries: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_choose_tries(ruleno, step++, val); + } + break; + + case crush_grammar::_step_set_choose_local_tries: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_choose_local_tries(ruleno, step++, val); + } + break; + + case crush_grammar::_step_set_choose_local_fallback_tries: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_choose_local_fallback_tries(ruleno, step++, val); + } + break; + + case crush_grammar::_step_set_chooseleaf_tries: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_chooseleaf_tries(ruleno, step++, val); + } + break; + + case crush_grammar::_step_set_chooseleaf_vary_r: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_chooseleaf_vary_r(ruleno, step++, val); + } + break; + + case crush_grammar::_step_set_chooseleaf_stable: + { + int val = int_node(s->children[1]); + crush.set_rule_step_set_chooseleaf_stable(ruleno, step++, val); + } + break; + + case crush_grammar::_step_choose: + case crush_grammar::_step_chooseleaf: + { + string type = string_node(s->children[4]); + if (!type_id.count(type)) { + err << "in rule '" << rname << "' type '" << type << "' not defined" << std::endl; + return -1; + } + string choose = string_node(s->children[0]); + string mode = string_node(s->children[1]); + if (choose == "choose") { + if (mode == "firstn") + crush.set_rule_step_choose_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]); + else if (mode == "indep") + crush.set_rule_step_choose_indep(ruleno, step++, 
int_node(s->children[2]), type_id[type]); + else ceph_abort(); + } else if (choose == "chooseleaf") { + if (mode == "firstn") + crush.set_rule_step_choose_leaf_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]); + else if (mode == "indep") + crush.set_rule_step_choose_leaf_indep(ruleno, step++, int_node(s->children[2]), type_id[type]); + else ceph_abort(); + } else ceph_abort(); + } + break; + + case crush_grammar::_step_emit: + crush.set_rule_step_emit(ruleno, step++); + break; + + default: + err << "bad crush step " << stepid << std::endl; + return -1; + } + } + ceph_assert(step == steps); + return 0; +} + +int CrushCompiler::parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set) +{ + // -2 for the enclosing [ ] + __u32 size = i->children.size() - 2; + __u32 bucket_size = crush.get_bucket_size(bucket_id); + if (size != bucket_size) { + err << bucket_id << " needs exactly " << bucket_size + << " weights but got " << size << std::endl; + return -1; + } + weight_set->size = size; + weight_set->weights = (__u32 *)calloc(weight_set->size, sizeof(__u32)); + __u32 pos = 0; + for (iter_t p = i->children.begin() + 1; p != i->children.end(); p++, pos++) + if (pos < size) + weight_set->weights[pos] = float_node(*p) * (float)0x10000; + return 0; +} + +int CrushCompiler::parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg) +{ + // -3 stands for the leading "weight_set" keyword and the enclosing [ ] + arg->weight_set_positions = i->children.size() - 3; + arg->weight_set = (crush_weight_set *)calloc(arg->weight_set_positions, sizeof(crush_weight_set)); + __u32 pos = 0; + for (iter_t p = i->children.begin(); p != i->children.end(); p++) { + int r = 0; + switch((int)p->value.id().to_long()) { + case crush_grammar::_weight_set_weights: + if (pos < arg->weight_set_positions) { + r = parse_weight_set_weights(p, bucket_id, &arg->weight_set[pos]); + pos++; + } else { + err << "invalid weight_set syntax" << std::endl; + r 
= -1; + } + } + if (r < 0) + return r; + } + return 0; +} + +int CrushCompiler::parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *arg) +{ + // -3 for the leading "ids" keyword and the enclosing [ ] + __u32 size = i->children.size() - 3; + __u32 bucket_size = crush.get_bucket_size(bucket_id); + if (size != bucket_size) { + err << bucket_id << " needs exactly " << bucket_size + << " ids but got " << size << std::endl; + return -1; + } + arg->ids_size = size; + arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32)); + __u32 pos = 0; + for (iter_t p = i->children.begin() + 2; pos < size; p++, pos++) + arg->ids[pos] = int_node(*p); + return 0; +} + +int CrushCompiler::parse_choose_arg(iter_t const& i, crush_choose_arg *args) +{ + int bucket_id = int_node(i->children[2]); + if (-1-bucket_id < 0 || -1-bucket_id >= crush.get_max_buckets()) { + err << bucket_id << " is out of range" << std::endl; + return -1; + } + if (!crush.bucket_exists(bucket_id)) { + err << bucket_id << " does not exist" << std::endl; + return -1; + } + crush_choose_arg *arg = &args[-1-bucket_id]; + for (iter_t p = i->children.begin(); p != i->children.end(); p++) { + int r = 0; + switch((int)p->value.id().to_long()) { + case crush_grammar::_weight_set: + r = parse_weight_set(p, bucket_id, arg); + break; + case crush_grammar::_choose_arg_ids: + r = parse_choose_arg_ids(p, bucket_id, arg); + break; + } + if (r < 0) + return r; + } + return 0; +} + +int CrushCompiler::parse_choose_args(iter_t const& i) +{ + int choose_arg_index = int_node(i->children[1]); + if (crush.choose_args.find(choose_arg_index) != crush.choose_args.end()) { + err << choose_arg_index << " duplicated" << std::endl; + return -1; + } + const auto max_buckets = crush.get_max_buckets(); + if (max_buckets < 0) { + err << "get_max_buckets() returned error" << std::endl; + return -1; + } + crush_choose_arg_map arg_map; + arg_map.size = max_buckets; + arg_map.args = (crush_choose_arg *)calloc(arg_map.size, 
sizeof(crush_choose_arg)); + for (iter_t p = i->children.begin() + 2; p != i->children.end(); p++) { + int r = 0; + switch((int)p->value.id().to_long()) { + case crush_grammar::_choose_arg: + r = parse_choose_arg(p, arg_map.args); + break; + } + if (r < 0) { + crush.destroy_choose_args(arg_map); + return r; + } + } + crush.choose_args[choose_arg_index] = arg_map; + return 0; +} + +void CrushCompiler::find_used_bucket_ids(iter_t const& i) +{ + for (iter_t p = i->children.begin(); p != i->children.end(); p++) { + if ((int)p->value.id().to_long() == crush_grammar::_bucket) { + for (iter_t firstline = p->children.begin() + 3; + firstline != p->children.end(); + ++firstline) { + string tag = string_node(firstline->children[0]); + if (tag != "id") { + break; + } + int id = int_node(firstline->children[1]); + //err << "saw bucket id " << id << std::endl; + id_item[id] = string(); + } + } + } +} + +int CrushCompiler::parse_crush(iter_t const& i) +{ + find_used_bucket_ids(i); + bool saw_rule = false; + for (iter_t p = i->children.begin(); p != i->children.end(); p++) { + int r = 0; + switch (p->value.id().to_long()) { + case crush_grammar::_tunable: + r = parse_tunable(p); + break; + case crush_grammar::_device: + r = parse_device(p); + break; + case crush_grammar::_bucket_type: + r = parse_bucket_type(p); + break; + case crush_grammar::_bucket: + if (saw_rule) { + err << "buckets must be defined before rules" << std::endl; + return -1; + } + r = parse_bucket(p); + break; + case crush_grammar::_crushrule: + if (!saw_rule) { + saw_rule = true; + crush.populate_classes(class_bucket); + } + r = parse_rule(p); + break; + case crush_grammar::_choose_args: + r = parse_choose_args(p); + break; + default: + ceph_abort(); + } + if (r < 0) { + return r; + } + } + + //err << "max_devices " << crush.get_max_devices() << std::endl; + crush.finalize(); + + return 0; +} + +// squash runs of whitespace to one space, excepting newlines +string CrushCompiler::consolidate_whitespace(string 
in) +{ + string out; + + bool white = false; + for (unsigned p=0; p 3) + err << " \"" << in << "\" -> \"" << out << "\"" << std::endl; + return out; +} + +void CrushCompiler::dump(iter_t const& i, int ind) +{ + err << "dump"; + for (int j=0; jvalue.id().to_long(); + err << id << "\t"; + err << "'" << string(i->value.begin(), i->value.end()) + << "' " << i->children.size() << " children" << std::endl; + for (unsigned int j = 0; j < i->children.size(); j++) + dump(i->children.begin() + j, ind+1); +} + +/** +* This function fix the problem like below +* rack using_foo { item foo } +* host foo { ... } +* +* if an item being used by a bucket is defined after that bucket. +* CRUSH compiler will create a map by which we can +* not identify that item when selecting in that bucket. +**/ +int CrushCompiler::adjust_bucket_item_place(iter_t const &i) +{ + map > bucket_items; + map bucket_itrer; + vector buckets; + for (iter_t p = i->children.begin(); p != i->children.end(); ++p) { + if ((int)p->value.id().to_long() == crush_grammar::_bucket) { + string name = string_node(p->children[1]); + buckets.push_back(name); + bucket_itrer[name] = p; + //skip non-bucket-item children in the bucket's parse tree + for (unsigned q=3; q < p->children.size()-1; ++q) { + iter_t sub = p->children.begin() + q; + if ((int)sub->value.id().to_long() + == crush_grammar::_bucket_item) { + string iname = string_node(sub->children[1]); + bucket_items[name].insert(iname); + } + } + } + } + + //adjust the bucket + for (unsigned i=0; i < buckets.size(); ++i) { + for (unsigned j=i+1; j < buckets.size(); ++j) { + if (bucket_items[buckets[i]].count(buckets[j])) { + if (bucket_items[buckets[j]].count(buckets[i])) { + err << "bucket '" << buckets[i] << "' and bucket '" + << buckets[j] << "' are included each other" << std::endl; + return -1; + } else { + std::iter_swap(bucket_itrer[buckets[i]], bucket_itrer[buckets[j]]); + } + } + } + } + + return 0; +} + +int CrushCompiler::compile(istream& in, const char 
*infn) +{ + if (!infn) + infn = ""; + + // always start with legacy tunables, so that the compiled result of + // a given crush file is fixed for all time. + crush.set_tunables_legacy(); + + string big; + string str; + int line = 1; + map line_pos; // pos -> line + map line_val; + while (getline(in, str)) { + // remove newline + int l = str.length(); + if (l && str[l - 1] == '\n') + str.erase(l-1, 1); + + line_val[line] = str; + + // strip comment + int n = str.find("#"); + if (n >= 0) + str.erase(n, str.length()-n); + + if (verbose>1) err << line << ": " << str << std::endl; + + // work around spirit crankiness by removing extraneous + // whitespace. there is probably a more elegant solution, but + // this only broke with the latest spirit (with the switchover to + // "classic"), i don't want to spend too much time figuring it + // out. + string stripped = consolidate_whitespace(str); + if (stripped.length() && big.length() && big[big.length()-1] != ' ') big += " "; + + line_pos[big.length()] = line; + line++; + big += stripped; + } + + if (verbose > 2) err << "whole file is: \"" << big << "\"" << std::endl; + + crush_grammar crushg; + const char *start = big.c_str(); + //tree_parse_info info = ast_parse(start, crushg, space_p); + tree_parse_info<> info = ast_parse(start, crushg, space_p); + + // parse error? 
+ if (!info.full) { + int cpos = info.stop - start; + //out << "cpos " << cpos << std::endl; + //out << " linemap " << line_pos << std::endl; + ceph_assert(!line_pos.empty()); + map::iterator p = line_pos.upper_bound(cpos); + if (p != line_pos.begin()) + --p; + int line = p->second; + int pos = cpos - p->first; + err << infn << ":" << line //<< ":" << (pos+1) + << " error: parse error at '" << line_val[line].substr(pos) << "'" << std::endl; + return -1; + } + + int r = adjust_bucket_item_place(info.trees.begin()); + if (r < 0) { + return r; + } + //out << "parsing succeeded\n"; + //dump(info.trees.begin()); + return parse_crush(info.trees.begin()); +} diff --git a/src/crush/CrushCompiler.h b/src/crush/CrushCompiler.h new file mode 100644 index 00000000..f035085e --- /dev/null +++ b/src/crush/CrushCompiler.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CRUSH_COMPILER_H +#define CEPH_CRUSH_COMPILER_H + +#include "crush/CrushWrapper.h" +#include "crush/grammar.h" + +#include +#include + +class CrushCompiler { + CrushWrapper& crush; + ostream& err; + int verbose; + bool unsafe_tunables; + + // decompile + enum dcb_state_t { + DCB_STATE_IN_PROGRESS = 0, + DCB_STATE_DONE + }; + + int decompile_weight_set_weights(crush_weight_set weight_set, + ostream &out); + int decompile_weight_set(crush_weight_set *weight_set, + __u32 size, + ostream &out); + int decompile_choose_arg(crush_choose_arg *arg, + int bucket_id, + ostream &out); + int decompile_ids(int *ids, + __u32 size, + ostream &out); + int decompile_choose_arg_map(crush_choose_arg_map arg_map, + ostream &out); + int decompile_choose_args(const std::pair &i, + ostream &out); + int decompile_bucket_impl(int i, ostream &out); + int decompile_bucket(int cur, + std::map& dcb_states, + ostream &out); + + // compile + typedef char const* iterator_t; + typedef tree_match parse_tree_match_t; + typedef parse_tree_match_t::tree_iterator 
iter_t; + typedef parse_tree_match_t::node_t node_t; + + map item_id; + map id_item; + map item_weight; + map type_id; + map rule_id; + std::map > class_bucket; // bucket id -> class id -> shadow bucket id + + string string_node(node_t &node); + int int_node(node_t &node); + float float_node(node_t &node); + + int parse_tunable(iter_t const& i); + int parse_device(iter_t const& i); + int parse_bucket_type(iter_t const& i); + int parse_bucket(iter_t const& i); + int parse_rule(iter_t const& i); + int parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set); + int parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg); + int parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *args); + int parse_choose_arg(iter_t const& i, crush_choose_arg *args); + int parse_choose_args(iter_t const& i); + void find_used_bucket_ids(iter_t const& i); + int parse_crush(iter_t const& i); + void dump(iter_t const& i, int ind=1); + string consolidate_whitespace(string in); + int adjust_bucket_item_place(iter_t const &i); + +public: + CrushCompiler(CrushWrapper& c, ostream& eo, int verbosity=0) + : crush(c), err(eo), verbose(verbosity), + unsafe_tunables(false) {} + ~CrushCompiler() {} + + void enable_unsafe_tunables() { + unsafe_tunables = true; + } + + int decompile(ostream& out); + int compile(istream& in, const char *infn=0); +}; + +#endif diff --git a/src/crush/CrushLocation.cc b/src/crush/CrushLocation.cc new file mode 100644 index 00000000..2032bf71 --- /dev/null +++ b/src/crush/CrushLocation.cc @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/compat.h" +#include "CrushLocation.h" +#include "CrushWrapper.h" +#include "common/ceph_context.h" +#include "common/config.h" +#include "include/str_list.h" +#include "common/debug.h" +#include "common/errno.h" +#include "include/compat.h" + +#include "common/SubProcess.h" + +#include + 
+int CrushLocation::update_from_conf() +{ + if (cct->_conf->crush_location.length()) + return _parse(cct->_conf->crush_location); + return 0; +} + +int CrushLocation::_parse(const std::string& s) +{ + std::multimap new_crush_location; + std::vector lvec; + get_str_vec(s, ";, \t", lvec); + int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location); + if (r < 0) { + lderr(cct) << "warning: crush_location '" << cct->_conf->crush_location + << "' does not parse, keeping original crush_location " + << loc << dendl; + return -EINVAL; + } + std::lock_guard l(lock); + loc.swap(new_crush_location); + lgeneric_dout(cct, 10) << "crush_location is " << loc << dendl; + return 0; +} + +int CrushLocation::update_from_hook() +{ + if (cct->_conf->crush_location_hook.length() == 0) + return 0; + + if (0 != access(cct->_conf->crush_location_hook.c_str(), R_OK)) { + lderr(cct) << "the user define crush location hook: " << cct->_conf->crush_location_hook + << " may not exist or can not access it" << dendl; + return errno; + } + + SubProcessTimed hook( + cct->_conf->crush_location_hook.c_str(), + SubProcess::CLOSE, SubProcess::PIPE, SubProcess::PIPE, + cct->_conf->crush_location_hook_timeout); + hook.add_cmd_args( + "--cluster", cct->_conf->cluster.c_str(), + "--id", cct->_conf->name.get_id().c_str(), + "--type", cct->_conf->name.get_type_str(), + NULL); + int ret = hook.spawn(); + if (ret != 0) { + lderr(cct) << "error: failed run " << cct->_conf->crush_location_hook << ": " + << hook.err() << dendl; + return ret; + } + + bufferlist bl; + ret = bl.read_fd(hook.get_stdout(), 100 * 1024); + if (ret < 0) { + lderr(cct) << "error: failed read stdout from " + << cct->_conf->crush_location_hook + << ": " << cpp_strerror(-ret) << dendl; + bufferlist err; + err.read_fd(hook.get_stderr(), 100 * 1024); + lderr(cct) << "stderr:\n"; + err.hexdump(*_dout); + *_dout << dendl; + } + + if (hook.join() != 0) { + lderr(cct) << "error: failed to join: " << hook.err() << dendl; + return -EINVAL; 
+ } + + if (ret < 0) + return ret; + + std::string out; + bl.copy(0, bl.length(), out); + out.erase(out.find_last_not_of(" \n\r\t")+1); + return _parse(out); +} + +int CrushLocation::init_on_startup() +{ + if (cct->_conf->crush_location.length()) { + return update_from_conf(); + } + if (cct->_conf->crush_location_hook.length()) { + return update_from_hook(); + } + + // start with a sane default + char hostname[HOST_NAME_MAX + 1]; + int r = gethostname(hostname, sizeof(hostname)); + if (r < 0) + strcpy(hostname, "unknown_host"); + // use short hostname + for (unsigned i=0; hostname[i]; ++i) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + break; + } + } + std::lock_guard l(lock); + loc.clear(); + loc.insert(make_pair("host", hostname)); + loc.insert(make_pair("root", "default")); + lgeneric_dout(cct, 10) << "crush_location is (default) " << loc << dendl; + return 0; +} diff --git a/src/crush/CrushLocation.h b/src/crush/CrushLocation.h new file mode 100644 index 00000000..6a099689 --- /dev/null +++ b/src/crush/CrushLocation.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CRUSH_LOCATION_H +#define CEPH_CRUSH_LOCATION_H + +#include +#include +#include + +class CephContext; + +class CrushLocation { + CephContext *cct; + std::multimap loc; + std::mutex lock; + + int _parse(const std::string& s); + +public: + explicit CrushLocation(CephContext *c) : cct(c) { + init_on_startup(); + } + + int update_from_conf(); ///< refresh from config + int update_from_hook(); ///< call hook, if present + int init_on_startup(); + + std::multimap get_location() { + std::lock_guard l(lock); + return loc; + } +}; + +#endif diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc new file mode 100644 index 00000000..86f91ef3 --- /dev/null +++ b/src/crush/CrushTester.cc @@ -0,0 +1,802 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + 
+#include "include/stringify.h" +#include "CrushTester.h" +#include "CrushTreeDumper.h" +#include "include/ceph_features.h" + +#include +#include +#include +// to workaround https://svn.boost.org/trac/boost/ticket/9501 +#ifdef _LIBCPP_VERSION +#include +#if BOOST_VERSION < 105600 +#define ICL_USE_BOOST_MOVE_IMPLEMENTATION +#endif +#endif +#include +#include +#include "common/SubProcess.h" +#include "common/fork_function.h" + +void CrushTester::set_device_weight(int dev, float f) +{ + int w = (int)(f * 0x10000); + if (w < 0) + w = 0; + if (w > 0x10000) + w = 0x10000; + device_weight[dev] = w; +} + +int CrushTester::get_maximum_affected_by_rule(int ruleno) +{ + // get the number of steps in RULENO + int rule_size = crush.get_rule_len(ruleno); + vector affected_types; + map replications_by_type; + + for (int i = 0; i < rule_size; i++){ + // get what operation is done by the current step + int rule_operation = crush.get_rule_op(ruleno, i); + + // if the operation specifies choosing a device type, store it + if (rule_operation >= 2 && rule_operation != 4){ + int desired_replication = crush.get_rule_arg1(ruleno,i); + int affected_type = crush.get_rule_arg2(ruleno,i); + affected_types.push_back(affected_type); + replications_by_type[affected_type] = desired_replication; + } + } + + /* + * now for each of the affected bucket types, see what is the + * maximum we are (a) requesting or (b) have + */ + + map max_devices_of_type; + + // loop through the vector of affected types + for (vector::iterator it = affected_types.begin(); it != affected_types.end(); ++it){ + // loop through the number of buckets looking for affected types + for (map::iterator p = crush.name_map.begin(); p != crush.name_map.end(); ++p){ + int bucket_type = crush.get_bucket_type(p->first); + if ( bucket_type == *it) + max_devices_of_type[*it]++; + } + } + + for(std::vector::iterator it = affected_types.begin(); it != affected_types.end(); ++it){ + if ( replications_by_type[*it] > 0 && 
replications_by_type[*it] < max_devices_of_type[*it] ) + max_devices_of_type[*it] = replications_by_type[*it]; + } + + /* + * get the smallest number of buckets available of any type as this is our upper bound on + * the number of replicas we can place + */ + int max_affected = max( crush.get_max_buckets(), crush.get_max_devices() ); + + for(std::vector::iterator it = affected_types.begin(); it != affected_types.end(); ++it){ + if (max_devices_of_type[*it] > 0 && max_devices_of_type[*it] < max_affected ) + max_affected = max_devices_of_type[*it]; + } + + return max_affected; +} + + +map CrushTester::get_collapsed_mapping() +{ + int num_to_check = crush.get_max_devices(); + int next_id = 0; + map collapse_mask; + + for (int i = 0; i < num_to_check; i++){ + if (crush.check_item_present(i)){ + collapse_mask[i] = next_id; + next_id++; + } + } + + return collapse_mask; +} + +void CrushTester::adjust_weights(vector<__u32>& weight) +{ + + if (mark_down_device_ratio > 0) { + // active buckets + vector bucket_ids; + for (int i = 0; i < crush.get_max_buckets(); i++) { + int id = -1 - i; + if (crush.get_bucket_weight(id) > 0) { + bucket_ids.push_back(id); + } + } + + // get buckets that are one level above a device + vector buckets_above_devices; + for (unsigned i = 0; i < bucket_ids.size(); i++) { + // grab the first child object of a bucket and check if it's ID is less than 0 + int id = bucket_ids[i]; + if (crush.get_bucket_size(id) == 0) + continue; + int first_child = crush.get_bucket_item(id, 0); // returns the ID of the bucket or device + if (first_child >= 0) { + buckets_above_devices.push_back(id); + } + } + + // permute bucket list + for (unsigned i = 0; i < buckets_above_devices.size(); i++) { + unsigned j = lrand48() % (buckets_above_devices.size() - 1); + std::swap(buckets_above_devices[i], buckets_above_devices[j]); + } + + // calculate how many buckets and devices we need to reap... 
+ int num_buckets_to_visit = (int) (mark_down_bucket_ratio * buckets_above_devices.size()); + + for (int i = 0; i < num_buckets_to_visit; i++) { + int id = buckets_above_devices[i]; + int size = crush.get_bucket_size(id); + vector items; + for (int o = 0; o < size; o++) + items.push_back(crush.get_bucket_item(id, o)); + + // permute items + for (int o = 0; o < size; o++) { + int j = lrand48() % (crush.get_bucket_size(id) - 1); + std::swap(items[o], items[j]); + } + + int local_devices_to_visit = (int) (mark_down_device_ratio*size); + for (int o = 0; o < local_devices_to_visit; o++){ + int item = crush.get_bucket_item(id, o); + weight[item] = 0; + } + } + } +} + +bool CrushTester::check_valid_placement(int ruleno, vector in, const vector<__u32>& weight) +{ + + bool valid_placement = true; + vector included_devices; + map seen_devices; + + // first do the easy check that all devices are "up" + for (vector::iterator it = in.begin(); it != in.end(); ++it) { + if (weight[(*it)] == 0) { + valid_placement = false; + break; + } else if (weight[(*it)] > 0) { + included_devices.push_back( (*it) ); + } + } + + /* + * now do the harder test of checking that the CRUSH rule r is not violated + * we could test that none of the devices mentioned in out are unique, + * but this is a special case of this test + */ + + // get the number of steps in RULENO + int rule_size = crush.get_rule_len(ruleno); + vector affected_types; + + // get the smallest type id, and name + int min_map_type = crush.get_num_type_names(); + for (map::iterator it = crush.type_map.begin(); it != crush.type_map.end(); ++it ) { + if ( (*it).first < min_map_type ) { + min_map_type = (*it).first; + } + } + + string min_map_type_name = crush.type_map[min_map_type]; + + // get the types of devices affected by RULENO + for (int i = 0; i < rule_size; i++) { + // get what operation is done by the current step + int rule_operation = crush.get_rule_op(ruleno, i); + + // if the operation specifies choosing a device type, 
store it + if (rule_operation >= 2 && rule_operation != 4) { + int affected_type = crush.get_rule_arg2(ruleno,i); + affected_types.push_back( crush.get_type_name(affected_type)); + } + } + + // find in if we are only dealing with osd's + bool only_osd_affected = false; + if (affected_types.size() == 1) { + if ((affected_types.back() == min_map_type_name) && (min_map_type_name == "osd")) { + only_osd_affected = true; + } + } + + // check that we don't have any duplicate id's + for (vector::iterator it = included_devices.begin(); it != included_devices.end(); ++it) { + int num_copies = std::count(included_devices.begin(), included_devices.end(), (*it) ); + if (num_copies > 1) { + valid_placement = false; + } + } + + // if we have more than just osd's affected we need to do a lot more work + if (!only_osd_affected) { + // loop through the devices that are "in/up" + for (vector::iterator it = included_devices.begin(); it != included_devices.end(); ++it) { + if (valid_placement == false) + break; + + // create a temporary map of the form (device type, device name in map) + map device_location_hierarchy = crush.get_full_location(*it); + + // loop over the types affected by RULENO looking for duplicate bucket assignments + for (vector::iterator t = affected_types.begin(); t != affected_types.end(); ++t) { + if (seen_devices.count( device_location_hierarchy[*t])) { + valid_placement = false; + break; + } else { + // store the devices we have seen in the form of (device name, device type) + seen_devices[ device_location_hierarchy[*t] ] = *t; + } + } + } + } + + return valid_placement; +} + +int CrushTester::random_placement(int ruleno, vector& out, int maxout, vector<__u32>& weight) +{ + // get the total weight of the system + int total_weight = 0; + for (unsigned i = 0; i < weight.size(); i++) + total_weight += weight[i]; + + if (total_weight == 0 || + crush.get_max_devices() == 0) + return -EINVAL; + + // determine the real maximum number of devices to return + int 
devices_requested = min(maxout, get_maximum_affected_by_rule(ruleno)); + bool accept_placement = false; + + vector trial_placement(devices_requested); + int attempted_tries = 0; + int max_tries = 100; + do { + // create a vector to hold our trial mappings + int temp_array[devices_requested]; + for (int i = 0; i < devices_requested; i++){ + temp_array[i] = lrand48() % (crush.get_max_devices()); + } + + trial_placement.assign(temp_array, temp_array + devices_requested); + accept_placement = check_valid_placement(ruleno, trial_placement, weight); + attempted_tries++; + } while (accept_placement == false && attempted_tries < max_tries); + + // save our random placement to the out vector + if (accept_placement) + out.assign(trial_placement.begin(), trial_placement.end()); + + // or don't.... + else if (attempted_tries == max_tries) + return -EINVAL; + + return 0; +} + +void CrushTester::write_integer_indexed_vector_data_string(vector &dst, int index, vector vector_data) +{ + stringstream data_buffer (stringstream::in | stringstream::out); + unsigned input_size = vector_data.size(); + + // pass the indexing variable to the data buffer + data_buffer << index; + + // pass the rest of the input data to the buffer + for (unsigned i = 0; i < input_size; i++) { + data_buffer << ',' << vector_data[i]; + } + + data_buffer << std::endl; + + // write the data buffer to the destination + dst.push_back( data_buffer.str() ); +} + +void CrushTester::write_integer_indexed_vector_data_string(vector &dst, int index, vector vector_data) +{ + stringstream data_buffer (stringstream::in | stringstream::out); + unsigned input_size = vector_data.size(); + + // pass the indexing variable to the data buffer + data_buffer << index; + + // pass the rest of the input data to the buffer + for (unsigned i = 0; i < input_size; i++) { + data_buffer << ',' << vector_data[i]; + } + + data_buffer << std::endl; + + // write the data buffer to the destination + dst.push_back( data_buffer.str() ); +} + +void 
CrushTester::write_integer_indexed_scalar_data_string(vector &dst, int index, int scalar_data) +{ + stringstream data_buffer (stringstream::in | stringstream::out); + + // pass the indexing variable to the data buffer + data_buffer << index; + + // pass the input data to the buffer + data_buffer << ',' << scalar_data; + data_buffer << std::endl; + + // write the data buffer to the destination + dst.push_back( data_buffer.str() ); +} +void CrushTester::write_integer_indexed_scalar_data_string(vector &dst, int index, float scalar_data) +{ + stringstream data_buffer (stringstream::in | stringstream::out); + + // pass the indexing variable to the data buffer + data_buffer << index; + + // pass the input data to the buffer + data_buffer << ',' << scalar_data; + data_buffer << std::endl; + + // write the data buffer to the destination + dst.push_back( data_buffer.str() ); +} + +int CrushTester::test_with_fork(int timeout) +{ + ostringstream sink; + int r = fork_function(timeout, sink, [&]() { + return test(); + }); + if (r == -ETIMEDOUT) { + err << "timed out during smoke test (" << timeout << " seconds)"; + } + return r; +} + +namespace { + class BadCrushMap : public std::runtime_error { + public: + int item; + BadCrushMap(const char* msg, int id) + : std::runtime_error(msg), item(id) {} + }; + // throws if any node in the crush fail to print + class CrushWalker : public CrushTreeDumper::Dumper { + typedef void DumbFormatter; + typedef CrushTreeDumper::Dumper Parent; + int max_id; + public: + CrushWalker(const CrushWrapper *crush, unsigned max_id) + : Parent(crush, CrushTreeDumper::name_map_t()), max_id(max_id) {} + void dump_item(const CrushTreeDumper::Item &qi, DumbFormatter *) override { + int type = -1; + if (qi.is_bucket()) { + if (!crush->get_item_name(qi.id)) { + throw BadCrushMap("unknown item name", qi.id); + } + type = crush->get_bucket_type(qi.id); + } else { + if (max_id > 0 && qi.id >= max_id) { + throw BadCrushMap("item id too large", qi.id); + } + type = 
0; + } + if (!crush->get_type_name(type)) { + throw BadCrushMap("unknown type name", qi.id); + } + } + }; +} + +bool CrushTester::check_name_maps(unsigned max_id) const +{ + CrushWalker crush_walker(&crush, max_id); + try { + // walk through the crush, to see if its self-contained + crush_walker.dump(NULL); + // and see if the maps is also able to handle straying OSDs, whose id >= 0. + // "ceph osd tree" will try to print them, even they are not listed in the + // crush map. + crush_walker.dump_item(CrushTreeDumper::Item(0, 0, 0, 0), NULL); + } catch (const BadCrushMap& e) { + err << e.what() << ": item#" << e.item << std::endl; + return false; + } + return true; +} + +static string get_rule_name(CrushWrapper& crush, int rule) +{ + if (crush.get_rule_name(rule)) + return crush.get_rule_name(rule); + else + return string("rule") + std::to_string(rule); +} + +void CrushTester::check_overlapped_rules() const +{ + namespace icl = boost::icl; + typedef std::set RuleNames; + typedef icl::interval_map Rules; + // => interval_map + typedef std::map, Rules> RuleSets; + using interval = icl::interval; + + // mimic the logic of crush_find_rule(), but it only return the first matched + // one, but I am collecting all of them by the overlapped sizes. + RuleSets rulesets; + for (int rule = 0; rule < crush.get_max_rules(); rule++) { + if (!crush.rule_exists(rule)) { + continue; + } + Rules& rules = rulesets[{crush.get_rule_mask_ruleset(rule), + crush.get_rule_mask_type(rule)}]; + rules += make_pair(interval::closed(crush.get_rule_mask_min_size(rule), + crush.get_rule_mask_max_size(rule)), + RuleNames{get_rule_name(crush, rule)}); + } + for (auto i : rulesets) { + auto ruleset_type = i.first; + const Rules& rules = i.second; + for (auto r : rules) { + const RuleNames& names = r.second; + // if there are more than one rules covering the same size range, + // print them out. 
+ if (names.size() > 1) { + err << "overlapped rules in ruleset " << ruleset_type.first << ": " + << boost::join(names, ", ") << "\n"; + } + } + } +} + +int CrushTester::test() +{ + if (min_rule < 0 || max_rule < 0) { + min_rule = 0; + max_rule = crush.get_max_rules() - 1; + } + if (min_x < 0 || max_x < 0) { + min_x = 0; + max_x = 1023; + } + + // initial osd weights + vector<__u32> weight; + + /* + * note device weight is set by crushtool + * (likely due to a given a command line option) + */ + for (int o = 0; o < crush.get_max_devices(); o++) { + if (device_weight.count(o)) { + weight.push_back(device_weight[o]); + } else if (crush.check_item_present(o)) { + weight.push_back(0x10000); + } else { + weight.push_back(0); + } + } + + if (output_utilization_all) + err << "devices weights (hex): " << hex << weight << dec << std::endl; + + // make adjustments + adjust_weights(weight); + + + int num_devices_active = 0; + for (vector<__u32>::iterator p = weight.begin(); p != weight.end(); ++p) + if (*p > 0) + num_devices_active++; + + if (output_choose_tries) + crush.start_choose_profile(); + + for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) { + if (!crush.rule_exists(r)) { + if (output_statistics) + err << "rule " << r << " dne" << std::endl; + continue; + } + if (ruleset >= 0 && + crush.get_rule_mask_ruleset(r) != ruleset) { + continue; + } + int minr = min_rep, maxr = max_rep; + if (min_rep < 0 || max_rep < 0) { + minr = crush.get_rule_mask_min_size(r); + maxr = crush.get_rule_mask_max_size(r); + } + + if (output_statistics) + err << "rule " << r << " (" << crush.get_rule_name(r) + << "), x = " << min_x << ".." << max_x + << ", numrep = " << minr << ".." 
<< maxr + << std::endl; + + for (int nr = minr; nr <= maxr; nr++) { + vector per(crush.get_max_devices()); + map sizes; + + int num_objects = ((max_x - min_x) + 1); + float num_devices = (float) per.size(); // get the total number of devices, better to cast as a float here + + // create a structure to hold data for post-processing + tester_data_set tester_data; + vector vector_data_buffer_f; + + // create a map to hold batch-level placement information + map > batch_per; + int objects_per_batch = num_objects / num_batches; + int batch_min = min_x; + int batch_max = min_x + objects_per_batch - 1; + + // get the total weight of the system + int total_weight = 0; + for (unsigned i = 0; i < per.size(); i++) + total_weight += weight[i]; + + if (total_weight == 0) + continue; + + // compute the expected number of objects stored per device in the absence of weighting + float expected_objects = min(nr, get_maximum_affected_by_rule(r)) * num_objects; + + // compute each device's proportional weight + vector proportional_weights( per.size() ); + + for (unsigned i = 0; i < per.size(); i++) + proportional_weights[i] = (float) weight[i] / (float) total_weight; + + if (output_data_file) { + // stage the absolute weight information for post-processing + for (unsigned i = 0; i < per.size(); i++) { + tester_data.absolute_weights[i] = (float) weight[i] / (float)0x10000; + } + + // stage the proportional weight information for post-processing + for (unsigned i = 0; i < per.size(); i++) { + if (proportional_weights[i] > 0 ) + tester_data.proportional_weights[i] = proportional_weights[i]; + + tester_data.proportional_weights_all[i] = proportional_weights[i]; + } + + } + // compute the expected number of objects stored per device when a device's weight is considered + vector num_objects_expected(num_devices); + + for (unsigned i = 0; i < num_devices; i++) + num_objects_expected[i] = (proportional_weights[i]*expected_objects); + + for (int current_batch = 0; current_batch < num_batches; 
current_batch++) { + if (current_batch == (num_batches - 1)) { + batch_max = max_x; + objects_per_batch = (batch_max - batch_min + 1); + } + + float batch_expected_objects = min(nr, get_maximum_affected_by_rule(r)) * objects_per_batch; + vector batch_num_objects_expected( per.size() ); + + for (unsigned i = 0; i < per.size() ; i++) + batch_num_objects_expected[i] = (proportional_weights[i]*batch_expected_objects); + + // create a vector to hold placement results temporarily + vector temporary_per ( per.size() ); + + for (int x = batch_min; x <= batch_max; x++) { + // create a vector to hold the results of a CRUSH placement or RNG simulation + vector out; + + if (use_crush) { + if (output_mappings) + err << "CRUSH"; // prepend CRUSH to placement output + uint32_t real_x = x; + if (pool_id != -1) { + real_x = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, (uint32_t)pool_id); + } + crush.do_rule(r, real_x, out, nr, weight, 0); + } else { + if (output_mappings) + err << "RNG"; // prepend RNG to placement output to denote simulation + // test our new monte carlo placement generator + random_placement(r, out, nr, weight); + } + + if (output_mappings) + err << " rule " << r << " x " << x << " " << out << std::endl; + + if (output_data_file) + write_integer_indexed_vector_data_string(tester_data.placement_information, x, out); + + bool has_item_none = false; + for (unsigned i = 0; i < out.size(); i++) { + if (out[i] != CRUSH_ITEM_NONE) { + per[out[i]]++; + temporary_per[out[i]]++; + } else { + has_item_none = true; + } + } + + batch_per[current_batch] = temporary_per; + sizes[out.size()]++; + if (output_bad_mappings && + (out.size() != (unsigned)nr || + has_item_none)) { + err << "bad mapping rule " << r << " x " << x << " num_rep " << nr << " result " << out << std::endl; + } + } + + batch_min = batch_max + 1; + batch_max = batch_min + objects_per_batch - 1; + } + + for (unsigned i = 0; i < per.size(); i++) + if (output_utilization && !output_statistics) + err << " device " << i 
+ << ":\t" << per[i] << std::endl; + + for (map::iterator p = sizes.begin(); p != sizes.end(); ++p) + if (output_statistics) + err << "rule " << r << " (" << crush.get_rule_name(r) << ") num_rep " << nr + << " result size == " << p->first << ":\t" + << p->second << "/" << (max_x-min_x+1) << std::endl; + + if (output_statistics) + for (unsigned i = 0; i < per.size(); i++) { + if (output_utilization) { + if (num_objects_expected[i] > 0 && per[i] > 0) { + err << " device " << i << ":\t" + << "\t" << " stored " << ": " << per[i] + << "\t" << " expected " << ": " << num_objects_expected[i] + << std::endl; + } + } else if (output_utilization_all) { + err << " device " << i << ":\t" + << "\t" << " stored " << ": " << per[i] + << "\t" << " expected " << ": " << num_objects_expected[i] + << std::endl; + } + } + + if (output_data_file) + for (unsigned i = 0; i < per.size(); i++) { + vector_data_buffer_f.clear(); + vector_data_buffer_f.push_back( (float) per[i]); + vector_data_buffer_f.push_back( (float) num_objects_expected[i]); + + write_integer_indexed_vector_data_string(tester_data.device_utilization_all, i, vector_data_buffer_f); + + if (num_objects_expected[i] > 0 && per[i] > 0) + write_integer_indexed_vector_data_string(tester_data.device_utilization, i, vector_data_buffer_f); + } + + if (output_data_file && num_batches > 1) { + // stage batch utilization information for post-processing + for (int i = 0; i < num_batches; i++) { + write_integer_indexed_vector_data_string(tester_data.batch_device_utilization_all, i, batch_per[i]); + write_integer_indexed_vector_data_string(tester_data.batch_device_expected_utilization_all, i, batch_per[i]); + } + } + + string rule_tag = crush.get_rule_name(r); + + if (output_csv) + write_data_set_to_csv(output_data_file_name+rule_tag,tester_data); + } + } + + if (output_choose_tries) { + __u32 *v = 0; + int n = crush.get_choose_profile(&v); + for (int i=0; i weight; + + /* + * note device weight is set by crushtool + * (likely due to a 
given a command line option) + */ + for (int o = 0; o < crush.get_max_devices(); o++) { + if (device_weight.count(o)) { + weight.push_back(device_weight[o]); + } else if (crush.check_item_present(o)) { + weight.push_back(0x10000); + } else { + weight.push_back(0); + } + } + + // make adjustments + adjust_weights(weight); + + map bad_by_rule; + + int ret = 0; + for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) { + if (!crush.rule_exists(r)) { + if (output_statistics) + err << "rule " << r << " dne" << std::endl; + continue; + } + if (ruleset >= 0 && + crush.get_rule_mask_ruleset(r) != ruleset) { + continue; + } + int minr = min_rep, maxr = max_rep; + if (min_rep < 0 || max_rep < 0) { + minr = crush.get_rule_mask_min_size(r); + maxr = crush.get_rule_mask_max_size(r); + } + int bad = 0; + for (int nr = minr; nr <= maxr; nr++) { + for (int x = min_x; x <= max_x; ++x) { + vector out; + crush.do_rule(r, x, out, nr, weight, 0); + vector out2; + crush2.do_rule(r, x, out2, nr, weight, 0); + if (out != out2) { + ++bad; + } + } + } + if (bad) { + ret = -1; + } + int max = (maxr - minr + 1) * (max_x - min_x + 1); + double ratio = (double)bad / (double)max; + cout << "rule " << r << " had " << bad << "/" << max + << " mismatched mappings (" << ratio << ")" << std::endl; + } + if (ret) { + cerr << "warning: maps are NOT equivalent" << std::endl; + } else { + cout << "maps appear equivalent" << std::endl; + } + return ret; +} diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h new file mode 100644 index 00000000..c4257b63 --- /dev/null +++ b/src/crush/CrushTester.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CRUSH_TESTER_H +#define CEPH_CRUSH_TESTER_H + +#include "crush/CrushWrapper.h" + +#include + +class CrushTester { + CrushWrapper& crush; + ostream& err; + + map device_weight; + int min_rule, max_rule; + int ruleset; + int min_x, max_x; + int min_rep, 
max_rep; + int64_t pool_id; + + int num_batches; + bool use_crush; + + float mark_down_device_ratio; + float mark_down_bucket_ratio; + + bool output_utilization; + bool output_utilization_all; + bool output_statistics; + bool output_mappings; + bool output_bad_mappings; + bool output_choose_tries; + + bool output_data_file; + bool output_csv; + + string output_data_file_name; + +/* + * mark a ratio of devices down, can be used to simulate placement distributions + * under degrated cluster conditions + */ + void adjust_weights(vector<__u32>& weight); + + /* + * Get the maximum number of devices that could be selected to satisfy ruleno. + */ + int get_maximum_affected_by_rule(int ruleno); + + /* + * for maps where in devices have non-sequential id numbers, return a mapping of device id + * to a sequential id number. For example, if we have devices with id's 0 1 4 5 6 return a map + * where: + * 0 = 0 + * 1 = 1 + * 4 = 2 + * 5 = 3 + * 6 = 4 + * + * which can help make post-processing easier + */ + map get_collapsed_mapping(); + + /* + * Essentially a re-implementation of CRUSH. Given a vector of devices + * check that the vector represents a valid placement for a given ruleno. + */ + bool check_valid_placement(int ruleno, vector in, const vector<__u32>& weight); + + /* + * Generate a random selection of devices which satisfies ruleno. 
Essentially a + * monte-carlo simulator for CRUSH placements which can be used to compare the + * statistical distribution of the CRUSH algorithm to a random number generator + */ + int random_placement(int ruleno, vector& out, int maxout, vector<__u32>& weight); + + // scaffolding to store data for off-line processing + struct tester_data_set { + vector device_utilization; + vector device_utilization_all; + vector placement_information; + vector batch_device_utilization_all; + vector batch_device_expected_utilization_all; + map proportional_weights; + map proportional_weights_all; + map absolute_weights; + } ; + + void write_to_csv(ofstream& csv_file, vector& payload) + { + if (csv_file.good()) + for (vector::iterator it = payload.begin(); it != payload.end(); ++it) + csv_file << (*it); + } + + void write_to_csv(ofstream& csv_file, map& payload) + { + if (csv_file.good()) + for (map::iterator it = payload.begin(); it != payload.end(); ++it) + csv_file << (*it).first << ',' << (*it).second << std::endl; + } + + void write_data_set_to_csv(string user_tag, tester_data_set& tester_data) + { + + ofstream device_utilization_file ((user_tag + (string)"-device_utilization.csv").c_str()); + ofstream device_utilization_all_file ((user_tag + (string)"-device_utilization_all.csv").c_str()); + ofstream placement_information_file ((user_tag + (string)"-placement_information.csv").c_str()); + ofstream proportional_weights_file ((user_tag + (string)"-proportional_weights.csv").c_str()); + ofstream proportional_weights_all_file ((user_tag + (string)"-proportional_weights_all.csv").c_str()); + ofstream absolute_weights_file ((user_tag + (string)"-absolute_weights.csv").c_str()); + + // write the headers + device_utilization_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl; + device_utilization_all_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl; + proportional_weights_file << "Device ID, Proportional 
Weight" << std::endl; + proportional_weights_all_file << "Device ID, Proportional Weight" << std::endl; + absolute_weights_file << "Device ID, Absolute Weight" << std::endl; + + placement_information_file << "Input"; + for (int i = 0; i < max_rep; i++) { + placement_information_file << ", OSD" << i; + } + placement_information_file << std::endl; + + write_to_csv(device_utilization_file, tester_data.device_utilization); + write_to_csv(device_utilization_all_file, tester_data.device_utilization_all); + write_to_csv(placement_information_file, tester_data.placement_information); + write_to_csv(proportional_weights_file, tester_data.proportional_weights); + write_to_csv(proportional_weights_all_file, tester_data.proportional_weights_all); + write_to_csv(absolute_weights_file, tester_data.absolute_weights); + + device_utilization_file.close(); + device_utilization_all_file.close(); + placement_information_file.close(); + proportional_weights_file.close(); + absolute_weights_file.close(); + + if (num_batches > 1) { + ofstream batch_device_utilization_all_file ((user_tag + (string)"-batch_device_utilization_all.csv").c_str()); + ofstream batch_device_expected_utilization_all_file ((user_tag + (string)"-batch_device_expected_utilization_all.csv").c_str()); + + batch_device_utilization_all_file << "Batch Round"; + for (unsigned i = 0; i < tester_data.device_utilization.size(); i++) { + batch_device_utilization_all_file << ", Objects Stored on OSD" << i; + } + batch_device_utilization_all_file << std::endl; + + batch_device_expected_utilization_all_file << "Batch Round"; + for (unsigned i = 0; i < tester_data.device_utilization.size(); i++) { + batch_device_expected_utilization_all_file << ", Objects Expected on OSD" << i; + } + batch_device_expected_utilization_all_file << std::endl; + + write_to_csv(batch_device_utilization_all_file, tester_data.batch_device_utilization_all); + write_to_csv(batch_device_expected_utilization_all_file, 
tester_data.batch_device_expected_utilization_all); + batch_device_expected_utilization_all_file.close(); + batch_device_utilization_all_file.close(); + } + } + + void write_integer_indexed_vector_data_string(vector &dst, int index, vector vector_data); + void write_integer_indexed_vector_data_string(vector &dst, int index, vector vector_data); + void write_integer_indexed_scalar_data_string(vector &dst, int index, int scalar_data); + void write_integer_indexed_scalar_data_string(vector &dst, int index, float scalar_data); + +public: + CrushTester(CrushWrapper& c, ostream& eo) + : crush(c), err(eo), + min_rule(-1), max_rule(-1), + ruleset(-1), + min_x(-1), max_x(-1), + min_rep(-1), max_rep(-1), + pool_id(-1), + num_batches(1), + use_crush(true), + mark_down_device_ratio(0.0), + mark_down_bucket_ratio(1.0), + output_utilization(false), + output_utilization_all(false), + output_statistics(false), + output_mappings(false), + output_bad_mappings(false), + output_choose_tries(false), + output_data_file(false), + output_csv(false), + output_data_file_name("") + + { } + + void set_output_data_file_name(string name) { + output_data_file_name = name; + } + string get_output_data_file_name() const { + return output_data_file_name; + } + + void set_output_data_file(bool b) { + output_data_file = b; + } + bool get_output_data_file() const { + return output_data_file; + } + + void set_output_csv(bool b) { + output_csv = b; + } + bool get_output_csv() const { + return output_csv; + } + + void set_output_utilization(bool b) { + output_utilization = b; + } + bool get_output_utilization() const { + return output_utilization; + } + + void set_output_utilization_all(bool b) { + output_utilization_all = b; + } + bool get_output_utilization_all() const { + return output_utilization_all; + } + + void set_output_statistics(bool b) { + output_statistics = b; + } + bool get_output_statistics() const { + return output_statistics; + } + + void set_output_mappings(bool b) { + output_mappings 
= b; + } + bool get_output_mappings() const { + return output_mappings; + } + + void set_output_bad_mappings(bool b) { + output_bad_mappings = b; + } + bool get_output_bad_mappings() const { + return output_bad_mappings; + } + + void set_output_choose_tries(bool b) { + output_choose_tries = b; + } + bool get_output_choose_tries() const { + return output_choose_tries; + } + + void set_batches(int b) { + num_batches = b; + } + int get_batches() const { + return num_batches; + } + + void set_random_placement() { + use_crush = false; + } + bool get_random_placement() const { + return use_crush == false; + } + + void set_bucket_down_ratio(float bucket_ratio) { + mark_down_bucket_ratio = bucket_ratio; + } + float get_bucket_down_ratio() const { + return mark_down_bucket_ratio; + } + + void set_device_down_ratio(float device_ratio) { + mark_down_device_ratio = device_ratio; + } + float set_device_down_ratio() const { + return mark_down_device_ratio; + } + + void set_device_weight(int dev, float f); + + void set_min_rep(int r) { + min_rep = r; + } + int get_min_rep() const { + return min_rep; + } + + void set_max_rep(int r) { + max_rep = r; + } + int get_max_rep() const { + return max_rep; + } + + void set_num_rep(int r) { + min_rep = max_rep = r; + } + + void set_min_x(int x) { + min_x = x; + } + + void set_pool_id(int64_t x){ + pool_id = x; + } + + int get_min_x() const { + return min_x; + } + + void set_max_x(int x) { + max_x = x; + } + int get_max_x() const { + return max_x; + } + + void set_x(int x) { + min_x = max_x = x; + } + + void set_min_rule(int rule) { + min_rule = rule; + } + int get_min_rule() const { + return min_rule; + } + + void set_max_rule(int rule) { + max_rule = rule; + } + int get_max_rule() const { + return max_rule; + } + + void set_rule(int rule) { + min_rule = max_rule = rule; + } + + void set_ruleset(int rs) { + ruleset = rs; + } + + /** + * check if any bucket/nodes is referencing an unknown name or type + * @param max_id rejects any non-bucket 
items with id less than this number, + * pass 0 to disable this check + * @return false if an dangling name/type is referenced or an item id is too + * large, true otherwise + */ + bool check_name_maps(unsigned max_id = 0) const; + /** + * print out overlapped crush rules belonging to the same ruleset + */ + void check_overlapped_rules() const; + int test(); + int test_with_fork(int timeout); + + int compare(CrushWrapper& other); +}; + +#endif diff --git a/src/crush/CrushTreeDumper.h b/src/crush/CrushTreeDumper.h new file mode 100644 index 00000000..5c0430c2 --- /dev/null +++ b/src/crush/CrushTreeDumper.h @@ -0,0 +1,291 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph distributed storage system + * + * Copyright (C) 2015 Mirantis Inc + * + * Author: Mykola Golub + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CRUSH_TREE_DUMPER_H +#define CRUSH_TREE_DUMPER_H + +#include "CrushWrapper.h" +#include "include/stringify.h" + +/** + * CrushTreeDumper: + * A helper class and functions to dump a crush tree. + * + * Example: + * + * class SimpleDumper : public CrushTreeDumper::Dumper { + * public: + * SimpleDumper(const CrushWrapper *crush) : + * CrushTreeDumper::Dumper(crush) {} + * protected: + * virtual void dump_item(const CrushTreeDumper::Item &qi, ostream *out) { + * *out << qi.id; + * for (int k = 0; k < qi.depth; k++) + * *out << "-"; + * if (qi.is_bucket()) + * *out << crush->get_item_name(qi.id) + * else + * *out << "osd." 
<< qi.id; + * *out << "\n"; + * } + * }; + * + * SimpleDumper(crush).dump(out); + * + */ + +namespace CrushTreeDumper { + + struct Item { + int id; + int parent; + int depth; + float weight; + list children; + + Item() : id(0), parent(0), depth(0), weight(0) {} + Item(int i, int p, int d, float w) : id(i), parent(p), depth(d), weight(w) {} + + bool is_bucket() const { return id < 0; } + }; + + template + class Dumper : public list { + public: + explicit Dumper(const CrushWrapper *crush_, + const name_map_t& weight_set_names_) + : crush(crush_), weight_set_names(weight_set_names_) { + crush->find_nonshadow_roots(&roots); + root = roots.begin(); + } + explicit Dumper(const CrushWrapper *crush_, + const name_map_t& weight_set_names_, + bool show_shadow) + : crush(crush_), weight_set_names(weight_set_names_) { + if (show_shadow) { + crush->find_roots(&roots); + } else { + crush->find_nonshadow_roots(&roots); + } + root = roots.begin(); + } + + virtual ~Dumper() {} + + virtual void reset() { + root = roots.begin(); + touched.clear(); + clear(); + } + + virtual bool should_dump_leaf(int i) const { + return true; + } + virtual bool should_dump_empty_bucket() const { + return true; + } + + bool should_dump(int id) { + if (id >= 0) + return should_dump_leaf(id); + if (should_dump_empty_bucket()) + return true; + int s = crush->get_bucket_size(id); + for (int k = s - 1; k >= 0; k--) { + int c = crush->get_bucket_item(id, k); + if (should_dump(c)) + return true; + } + return false; + } + + bool next(Item &qi) { + if (empty()) { + while (root != roots.end() && !should_dump(*root)) + ++root; + if (root == roots.end()) + return false; + push_back(Item(*root, 0, 0, crush->get_bucket_weightf(*root))); + ++root; + } + + qi = front(); + pop_front(); + touched.insert(qi.id); + + if (qi.is_bucket()) { + // queue bucket contents, sorted by (class, name) + int s = crush->get_bucket_size(qi.id); + map> sorted; + for (int k = s - 1; k >= 0; k--) { + int id = crush->get_bucket_item(qi.id, 
k); + if (should_dump(id)) { + string sort_by; + if (id >= 0) { + const char *c = crush->get_item_class(id); + sort_by = c ? c : ""; + sort_by += "_"; + char nn[80]; + snprintf(nn, sizeof(nn), "osd.%08d", id); + sort_by += nn; + } else { + sort_by = "_"; + sort_by += crush->get_item_name(id); + } + sorted[sort_by] = make_pair( + id, crush->get_bucket_item_weightf(qi.id, k)); + } + } + for (auto p = sorted.rbegin(); p != sorted.rend(); ++p) { + qi.children.push_back(p->second.first); + push_front(Item(p->second.first, qi.id, qi.depth + 1, + p->second.second)); + } + } + return true; + } + + void dump(F *f) { + reset(); + Item qi; + while (next(qi)) + dump_item(qi, f); + } + + bool is_touched(int id) const { return touched.count(id) > 0; } + + void set_root(const string& bucket) { + roots.clear(); + if (crush->name_exists(bucket)) { + int i = crush->get_item_id(bucket); + roots.insert(i); + } + } + + protected: + virtual void dump_item(const Item &qi, F *f) = 0; + + protected: + const CrushWrapper *crush; + const name_map_t &weight_set_names; + + private: + set touched; + set roots; + set::iterator root; + }; + + inline void dump_item_fields(const CrushWrapper *crush, + const name_map_t& weight_set_names, + const Item &qi, Formatter *f) { + f->dump_int("id", qi.id); + const char *c = crush->get_item_class(qi.id); + if (c) + f->dump_string("device_class", c); + if (qi.is_bucket()) { + int type = crush->get_bucket_type(qi.id); + f->dump_string("name", crush->get_item_name(qi.id)); + f->dump_string("type", crush->get_type_name(type)); + f->dump_int("type_id", type); + } else { + f->dump_stream("name") << "osd." 
<< qi.id; + f->dump_string("type", crush->get_type_name(0)); + f->dump_int("type_id", 0); + f->dump_float("crush_weight", qi.weight); + f->dump_unsigned("depth", qi.depth); + } + if (qi.parent < 0) { + f->open_object_section("pool_weights"); + for (auto& p : crush->choose_args) { + const crush_choose_arg_map& cmap = p.second; + int bidx = -1 - qi.parent; + const crush_bucket *b = crush->get_bucket(qi.parent); + if (b && + bidx < (int)cmap.size && + cmap.args[bidx].weight_set && + cmap.args[bidx].weight_set_positions >= 1) { + int bpos; + for (bpos = 0; + bpos < (int)cmap.args[bidx].weight_set[0].size && + b->items[bpos] != qi.id; + ++bpos) ; + string name; + if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) { + name = "(compat)"; + } else { + auto q = weight_set_names.find(p.first); + name = q != weight_set_names.end() ? q->second : + stringify(p.first); + } + f->open_array_section(name.c_str()); + for (unsigned opos = 0; + opos < cmap.args[bidx].weight_set_positions; + ++opos) { + float w = (float)cmap.args[bidx].weight_set[opos].weights[bpos] / + (float)0x10000; + f->dump_float("weight", w); + } + f->close_section(); + } + } + f->close_section(); + } + } + + inline void dump_bucket_children(const CrushWrapper *crush, + const Item &qi, Formatter *f) { + if (!qi.is_bucket()) + return; + + f->open_array_section("children"); + for (list::const_iterator i = qi.children.begin(); + i != qi.children.end(); + ++i) { + f->dump_int("child", *i); + } + f->close_section(); + } + + class FormattingDumper : public Dumper { + public: + explicit FormattingDumper(const CrushWrapper *crush, + const name_map_t& weight_set_names) + : Dumper(crush, weight_set_names) {} + explicit FormattingDumper(const CrushWrapper *crush, + const name_map_t& weight_set_names, + bool show_shadow) + : Dumper(crush, weight_set_names, show_shadow) {} + + protected: + void dump_item(const Item &qi, Formatter *f) override { + f->open_object_section("item"); + dump_item_fields(qi, f); + 
dump_bucket_children(qi, f); + f->close_section(); + } + + virtual void dump_item_fields(const Item &qi, Formatter *f) { + CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + } + + virtual void dump_bucket_children(const Item &qi, Formatter *f) { + CrushTreeDumper::dump_bucket_children(crush, qi, f); + } + }; + +} + +#endif diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc new file mode 100644 index 00000000..2b11ce9e --- /dev/null +++ b/src/crush/CrushWrapper.cc @@ -0,0 +1,4185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd/osd_types.h" +#include "common/debug.h" +#include "common/Formatter.h" +#include "common/errno.h" +#include "common/TextTable.h" +#include "include/stringify.h" + +#include "CrushWrapper.h" +#include "CrushTreeDumper.h" + +#define dout_subsys ceph_subsys_crush + +bool CrushWrapper::has_legacy_rule_ids() const +{ + for (unsigned i=0; imax_rules; i++) { + crush_rule *r = crush->rules[i]; + if (r && + r->mask.ruleset != i) { + return true; + } + } + return false; +} + +std::map CrushWrapper::renumber_rules() +{ + std::map result; + for (unsigned i=0; imax_rules; i++) { + crush_rule *r = crush->rules[i]; + if (r && r->mask.ruleset != i) { + result[r->mask.ruleset] = i; + r->mask.ruleset = i; + } + } + return result; +} + +bool CrushWrapper::has_non_straw2_buckets() const +{ + for (int i=0; imax_buckets; ++i) { + crush_bucket *b = crush->buckets[i]; + if (!b) + continue; + if (b->alg != CRUSH_BUCKET_STRAW2) + return true; + } + return false; +} + +bool CrushWrapper::has_v2_rules() const +{ + for (unsigned i=0; imax_rules; i++) { + if (is_v2_rule(i)) { + return true; + } + } + return false; +} + +bool CrushWrapper::is_v2_rule(unsigned ruleid) const +{ + // check rule for use of indep or new SET_* rule steps + if (ruleid >= crush->max_rules) + return false; + crush_rule *r = crush->rules[ruleid]; + if (!r) + return false; + for (unsigned j=0; 
jlen; j++) { + if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP || + r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP || + r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES || + r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) { + return true; + } + } + return false; +} + +bool CrushWrapper::has_v3_rules() const +{ + for (unsigned i=0; imax_rules; i++) { + if (is_v3_rule(i)) { + return true; + } + } + return false; +} + +bool CrushWrapper::is_v3_rule(unsigned ruleid) const +{ + // check rule for use of SET_CHOOSELEAF_VARY_R step + if (ruleid >= crush->max_rules) + return false; + crush_rule *r = crush->rules[ruleid]; + if (!r) + return false; + for (unsigned j=0; jlen; j++) { + if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) { + return true; + } + } + return false; +} + +bool CrushWrapper::has_v4_buckets() const +{ + for (int i=0; imax_buckets; ++i) { + crush_bucket *b = crush->buckets[i]; + if (!b) + continue; + if (b->alg == CRUSH_BUCKET_STRAW2) + return true; + } + return false; +} + +bool CrushWrapper::has_v5_rules() const +{ + for (unsigned i=0; imax_rules; i++) { + if (is_v5_rule(i)) { + return true; + } + } + return false; +} + +bool CrushWrapper::is_v5_rule(unsigned ruleid) const +{ + // check rule for use of SET_CHOOSELEAF_STABLE step + if (ruleid >= crush->max_rules) + return false; + crush_rule *r = crush->rules[ruleid]; + if (!r) + return false; + for (unsigned j=0; jlen; j++) { + if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE) { + return true; + } + } + return false; +} + +bool CrushWrapper::has_choose_args() const +{ + return !choose_args.empty(); +} + +bool CrushWrapper::has_incompat_choose_args() const +{ + if (choose_args.empty()) + return false; + if (choose_args.size() > 1) + return true; + if (choose_args.begin()->first != DEFAULT_CHOOSE_ARGS) + return true; + crush_choose_arg_map arg_map = choose_args.begin()->second; + for (__u32 i = 0; i < arg_map.size; i++) { + crush_choose_arg *arg = &arg_map.args[i]; + if (arg->weight_set_positions 
== 0 && + arg->ids_size == 0) + continue; + if (arg->weight_set_positions != 1) + return true; + if (arg->ids_size != 0) + return true; + } + return false; +} + +int CrushWrapper::split_id_class(int i, int *idout, int *classout) const +{ + if (!item_exists(i)) + return -EINVAL; + string name = get_item_name(i); + size_t pos = name.find("~"); + if (pos == string::npos) { + *idout = i; + *classout = -1; + return 0; + } + string name_no_class = name.substr(0, pos); + if (!name_exists(name_no_class)) + return -ENOENT; + string class_name = name.substr(pos + 1); + if (!class_exists(class_name)) + return -ENOENT; + *idout = get_item_id(name_no_class); + *classout = get_class_id(class_name); + return 0; +} + +int CrushWrapper::can_rename_item(const string& srcname, + const string& dstname, + ostream *ss) const +{ + if (name_exists(srcname)) { + if (name_exists(dstname)) { + *ss << "dstname = '" << dstname << "' already exists"; + return -EEXIST; + } + if (is_valid_crush_name(dstname)) { + return 0; + } else { + *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+"; + return -EINVAL; + } + } else { + if (name_exists(dstname)) { + *ss << "srcname = '" << srcname << "' does not exist " + << "and dstname = '" << dstname << "' already exists"; + return -EALREADY; + } else { + *ss << "srcname = '" << srcname << "' does not exist"; + return -ENOENT; + } + } +} + +int CrushWrapper::rename_item(const string& srcname, + const string& dstname, + ostream *ss) +{ + int ret = can_rename_item(srcname, dstname, ss); + if (ret < 0) + return ret; + int oldid = get_item_id(srcname); + return set_item_name(oldid, dstname); +} + +int CrushWrapper::can_rename_bucket(const string& srcname, + const string& dstname, + ostream *ss) const +{ + int ret = can_rename_item(srcname, dstname, ss); + if (ret) + return ret; + int srcid = get_item_id(srcname); + if (srcid >= 0) { + *ss << "srcname = '" << srcname << "' is not a bucket " + << "because its id = " << srcid << " is >= 0"; + 
return -ENOTDIR; + } + return 0; +} + +int CrushWrapper::rename_bucket(const string& srcname, + const string& dstname, + ostream *ss) +{ + int ret = can_rename_bucket(srcname, dstname, ss); + if (ret < 0) + return ret; + int oldid = get_item_id(srcname); + return set_item_name(oldid, dstname); +} + +int CrushWrapper::rename_rule(const string& srcname, + const string& dstname, + ostream *ss) +{ + if (!rule_exists(srcname)) { + if (ss) { + *ss << "source rule name '" << srcname << "' does not exist"; + } + return -ENOENT; + } + if (rule_exists(dstname)) { + if (ss) { + *ss << "destination rule name '" << dstname << "' already exists"; + } + return -EEXIST; + } + int rule_id = get_rule_id(srcname); + auto it = rule_name_map.find(rule_id); + ceph_assert(it != rule_name_map.end()); + it->second = dstname; + if (have_rmaps) { + rule_name_rmap.erase(srcname); + rule_name_rmap[dstname] = rule_id; + } + return 0; +} + +void CrushWrapper::find_takes(set *roots) const +{ + for (unsigned i=0; imax_rules; i++) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j=0; jlen; j++) { + if (r->steps[j].op == CRUSH_RULE_TAKE) + roots->insert(r->steps[j].arg1); + } + } +} + +void CrushWrapper::find_takes_by_rule(int rule, set *roots) const +{ + if (rule < 0 || rule >= (int)crush->max_rules) + return; + crush_rule *r = crush->rules[rule]; + if (!r) + return; + for (unsigned i = 0; i < r->len; i++) { + if (r->steps[i].op == CRUSH_RULE_TAKE) + roots->insert(r->steps[i].arg1); + } +} + +void CrushWrapper::find_roots(set *roots) const +{ + for (int i = 0; i < crush->max_buckets; i++) { + if (!crush->buckets[i]) + continue; + crush_bucket *b = crush->buckets[i]; + if (!_search_item_exists(b->id)) + roots->insert(b->id); + } +} + +bool CrushWrapper::subtree_contains(int root, int item) const +{ + if (root == item) + return true; + + if (root >= 0) + return false; // root is a leaf + + const crush_bucket *b = get_bucket(root); + if (IS_ERR(b)) + return false; + + for 
(unsigned j=0; jsize; j++) { + if (subtree_contains(b->items[j], item)) + return true; + } + return false; +} + +bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only) +{ + // last instance? + if (_search_item_exists(item)) { + return false; + } + if (item < 0 && _bucket_is_in_use(item)) { + return false; + } + + if (item < 0 && !unlink_only) { + crush_bucket *t = get_bucket(item); + ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl; + crush_remove_bucket(crush, t); + if (class_bucket.count(item) != 0) + class_bucket.erase(item); + class_remove_item(item); + update_choose_args(cct); + } + if ((item >= 0 || !unlink_only) && name_map.count(item)) { + ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl; + name_map.erase(item); + have_rmaps = false; + if (item >= 0 && !unlink_only) { + class_remove_item(item); + } + } + rebuild_roots_with_classes(cct); + return true; +} + +int CrushWrapper::remove_root(CephContext *cct, int item) +{ + crush_bucket *b = get_bucket(item); + if (IS_ERR(b)) { + // should be idempotent + // e.g.: we use 'crush link' to link same host into + // different roots, which as a result can cause different + // shadow trees reference same hosts too. This means + // we may need to destory the same buckets(hosts, racks, etc.) + // multiple times during rebuilding all shadow trees. 
+ return 0; + } + + for (unsigned n = 0; n < b->size; n++) { + if (b->items[n] >= 0) + continue; + int r = remove_root(cct, b->items[n]); + if (r < 0) + return r; + } + + crush_remove_bucket(crush, b); + if (name_map.count(item) != 0) { + name_map.erase(item); + have_rmaps = false; + } + if (class_bucket.count(item) != 0) + class_bucket.erase(item); + class_remove_item(item); + update_choose_args(cct); + return 0; +} + +void CrushWrapper::update_choose_args(CephContext *cct) +{ + for (auto& i : choose_args) { + crush_choose_arg_map &arg_map = i.second; + assert(arg_map.size == (unsigned)crush->max_buckets); + unsigned positions = get_choose_args_positions(arg_map); + for (int j = 0; j < crush->max_buckets; ++j) { + crush_bucket *b = crush->buckets[j]; + assert(j < (int)arg_map.size); + auto& carg = arg_map.args[j]; + // strip out choose_args for any buckets that no longer exist + if (!b || b->alg != CRUSH_BUCKET_STRAW2) { + if (carg.ids) { + if (cct) + ldout(cct,10) << __func__ << " removing " << i.first << " bucket " + << (-1-j) << " ids" << dendl; + free(carg.ids); + carg.ids = 0; + carg.ids_size = 0; + } + if (carg.weight_set) { + if (cct) + ldout(cct,10) << __func__ << " removing " << i.first << " bucket " + << (-1-j) << " weight_sets" << dendl; + for (unsigned p = 0; p < carg.weight_set_positions; ++p) { + free(carg.weight_set[p].weights); + } + free(carg.weight_set); + carg.weight_set = 0; + carg.weight_set_positions = 0; + } + continue; + } + if (carg.weight_set_positions == 0) { + continue; // skip it + } + if (carg.weight_set_positions != positions) { + if (cct) + lderr(cct) << __func__ << " " << i.first << " bucket " + << (-1-j) << " positions " << carg.weight_set_positions + << " -> " << positions << dendl; + continue; // wth... skip! + } + // mis-sized weight_sets? this shouldn't ever happen. 
+ for (unsigned p = 0; p < positions; ++p) { + if (carg.weight_set[p].size != b->size) { + if (cct) + lderr(cct) << __func__ << " fixing " << i.first << " bucket " + << (-1-j) << " position " << p + << " size " << carg.weight_set[p].size << " -> " + << b->size << dendl; + auto old_ws = carg.weight_set[p]; + carg.weight_set[p].size = b->size; + carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32)); + auto max = std::min(old_ws.size, b->size); + for (unsigned k = 0; k < max; ++k) { + carg.weight_set[p].weights[k] = old_ws.weights[k]; + } + free(old_ws.weights); + } + } + } + } +} + +int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only) +{ + ldout(cct, 5) << "remove_item " << item + << (unlink_only ? " unlink_only":"") << dendl; + + int ret = -ENOENT; + + if (item < 0 && !unlink_only) { + crush_bucket *t = get_bucket(item); + if (IS_ERR(t)) { + ldout(cct, 1) << "remove_item bucket " << item << " does not exist" + << dendl; + return -ENOENT; + } + + if (t->size) { + ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size + << " items, not empty" << dendl; + return -ENOTEMPTY; + } + if (_bucket_is_in_use(item)) { + return -EBUSY; + } + } + + for (int i = 0; i < crush->max_buckets; i++) { + if (!crush->buckets[i]) + continue; + crush_bucket *b = crush->buckets[i]; + + for (unsigned i=0; isize; ++i) { + int id = b->items[i]; + if (id == item) { + ldout(cct, 5) << "remove_item removing item " << item + << " from bucket " << b->id << dendl; + adjust_item_weight_in_bucket(cct, item, 0, b->id, true); + bucket_remove_item(b, item); + ret = 0; + } + } + } + + if (_maybe_remove_last_instance(cct, item, unlink_only)) + ret = 0; + + return ret; +} + +bool CrushWrapper::_search_item_exists(int item) const +{ + for (int i = 0; i < crush->max_buckets; i++) { + if (!crush->buckets[i]) + continue; + crush_bucket *b = crush->buckets[i]; + for (unsigned j=0; jsize; ++j) { + if (b->items[j] == item) + return true; + } + } + return false; +} 
+ +bool CrushWrapper::_bucket_is_in_use(int item) +{ + for (auto &i : class_bucket) + for (auto &j : i.second) + if (j.second == item) + return true; + for (unsigned i = 0; i < crush->max_rules; ++i) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j = 0; j < r->len; ++j) { + if (r->steps[j].op == CRUSH_RULE_TAKE) { + int step_item = r->steps[j].arg1; + int original_item; + int c; + int res = split_id_class(step_item, &original_item, &c); + if (res < 0) + return false; + if (step_item == item || original_item == item) + return true; + } + } + } + return false; +} + +int CrushWrapper::_remove_item_under( + CephContext *cct, int item, int ancestor, bool unlink_only) +{ + ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor + << (unlink_only ? " unlink_only":"") << dendl; + + if (ancestor >= 0) { + return -EINVAL; + } + + if (!bucket_exists(ancestor)) + return -EINVAL; + + int ret = -ENOENT; + + crush_bucket *b = get_bucket(ancestor); + for (unsigned i=0; isize; ++i) { + int id = b->items[i]; + if (id == item) { + ldout(cct, 5) << "_remove_item_under removing item " << item + << " from bucket " << b->id << dendl; + adjust_item_weight_in_bucket(cct, item, 0, b->id, true); + bucket_remove_item(b, item); + ret = 0; + } else if (id < 0) { + int r = remove_item_under(cct, item, id, unlink_only); + if (r == 0) + ret = 0; + } + } + return ret; +} + +int CrushWrapper::remove_item_under( + CephContext *cct, int item, int ancestor, bool unlink_only) +{ + ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor + << (unlink_only ? 
" unlink_only":"") << dendl; + + if (!unlink_only && _bucket_is_in_use(item)) { + return -EBUSY; + } + + int ret = _remove_item_under(cct, item, ancestor, unlink_only); + if (ret < 0) + return ret; + + if (item < 0 && !unlink_only) { + crush_bucket *t = get_bucket(item); + if (IS_ERR(t)) { + ldout(cct, 1) << "remove_item_under bucket " << item + << " does not exist" << dendl; + return -ENOENT; + } + + if (t->size) { + ldout(cct, 1) << "remove_item_under bucket " << item << " has " << t->size + << " items, not empty" << dendl; + return -ENOTEMPTY; + } + } + + if (_maybe_remove_last_instance(cct, item, unlink_only)) + ret = 0; + + return ret; +} + +int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id, + const std::multimap& loc) const +{ + ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl; + if (!item_exists(id)) + return -ENOENT; + map id_loc = get_full_location(id); + ldout(cct, 20) << " id is at " << id_loc << dendl; + + for (map::const_iterator p = type_map.begin(); + p != type_map.end(); + ++p) { + map::iterator ip = id_loc.find(p->second); + if (ip == id_loc.end()) + continue; + for (std::multimap::const_iterator q = loc.find(p->second); + q != loc.end(); + ++q) { + if (q->first != p->second) + break; + if (q->second == ip->second) + return p->first; + } + } + return -ERANGE; +} + +int CrushWrapper::parse_loc_map(const std::vector& args, + std::map *ploc) +{ + ploc->clear(); + for (unsigned i = 0; i < args.size(); ++i) { + const char *s = args[i].c_str(); + const char *pos = strchr(s, '='); + if (!pos) + return -EINVAL; + string key(s, 0, pos-s); + string value(pos+1); + if (value.length()) + (*ploc)[key] = value; + else + return -EINVAL; + } + return 0; +} + +int CrushWrapper::parse_loc_multimap(const std::vector& args, + std::multimap *ploc) +{ + ploc->clear(); + for (unsigned i = 0; i < args.size(); ++i) { + const char *s = args[i].c_str(); + const char *pos = strchr(s, '='); + if (!pos) + return -EINVAL; + string key(s, 0, 
pos-s); + string value(pos+1); + if (value.length()) + ploc->insert(make_pair(key, value)); + else + return -EINVAL; + } + return 0; +} + +bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map& loc, + int *weight) +{ + ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl; + + for (map::const_iterator p = type_map.begin(); p != type_map.end(); ++p) { + // ignore device + if (p->first == 0) + continue; + + // ignore types that aren't specified in loc + map::const_iterator q = loc.find(p->second); + if (q == loc.end()) { + ldout(cct, 2) << "warning: did not specify location for '" << p->second << "' level (levels are " + << type_map << ")" << dendl; + continue; + } + + if (!name_exists(q->second)) { + ldout(cct, 5) << "check_item_loc bucket " << q->second << " dne" << dendl; + return false; + } + + int id = get_item_id(q->second); + if (id >= 0) { + ldout(cct, 5) << "check_item_loc requested " << q->second << " for type " << p->second + << " is a device, not bucket" << dendl; + return false; + } + + ceph_assert(bucket_exists(id)); + crush_bucket *b = get_bucket(id); + + // see if item exists in this bucket + for (unsigned j=0; jsize; j++) { + if (b->items[j] == item) { + ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl; + if (weight) + *weight = crush_get_bucket_item_weight(b, j); + return true; + } + } + return false; + } + + ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl; + return false; +} + +map CrushWrapper::get_full_location(int id) const +{ + vector > full_location_ordered; + map full_location; + + get_full_location_ordered(id, full_location_ordered); + + std::copy(full_location_ordered.begin(), + full_location_ordered.end(), + std::inserter(full_location, full_location.begin())); + + return full_location; +} + +int CrushWrapper::get_full_location(const string& name, + map *ploc) +{ + build_rmaps(); + auto p = name_rmap.find(name); + if (p == name_rmap.end()) { 
+ return -ENOENT; + } + *ploc = get_full_location(p->second); + return 0; +} + +int CrushWrapper::get_full_location_ordered(int id, vector >& path) const +{ + if (!item_exists(id)) + return -ENOENT; + int cur = id; + int ret; + while (true) { + pair parent_coord = get_immediate_parent(cur, &ret); + if (ret != 0) + break; + path.push_back(parent_coord); + cur = get_item_id(parent_coord.second); + } + return 0; +} + +string CrushWrapper::get_full_location_ordered_string(int id) const +{ + vector > full_location_ordered; + string full_location; + get_full_location_ordered(id, full_location_ordered); + reverse(begin(full_location_ordered), end(full_location_ordered)); + for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) { + full_location = full_location + i->first + "=" + i->second; + if (i != full_location_ordered.end() - 1) { + full_location = full_location + ","; + } + } + return full_location; +} + +map CrushWrapper::get_parent_hierarchy(int id) const +{ + map parent_hierarchy; + pair parent_coord = get_immediate_parent(id); + int parent_id; + + // get the integer type for id and create a counter from there + int type_counter = get_bucket_type(id); + + // if we get a negative type then we can assume that we have an OSD + // change behavior in get_item_type FIXME + if (type_counter < 0) + type_counter = 0; + + // read the type map and get the name of the type with the largest ID + int high_type = 0; + if (!type_map.empty()) + high_type = type_map.rbegin()->first; + + parent_id = get_item_id(parent_coord.second); + + while (type_counter < high_type) { + type_counter++; + parent_hierarchy[ type_counter ] = parent_coord.first; + + if (type_counter < high_type){ + // get the coordinate information for the next parent + parent_coord = get_immediate_parent(parent_id); + parent_id = get_item_id(parent_coord.second); + } + } + + return parent_hierarchy; +} + +int CrushWrapper::get_children(int id, list *children) const +{ + // leaf? 
+ if (id >= 0) { + return 0; + } + + auto *b = get_bucket(id); + if (IS_ERR(b)) { + return -ENOENT; + } + + for (unsigned n=0; nsize; n++) { + children->push_back(b->items[n]); + } + return b->size; +} + +int CrushWrapper::get_all_children(int id, set *children) const +{ + // leaf? + if (id >= 0) { + return 0; + } + + auto *b = get_bucket(id); + if (IS_ERR(b)) { + return -ENOENT; + } + + int c = 0; + for (unsigned n = 0; n < b->size; n++) { + children->insert(b->items[n]); + c++; + auto r = get_all_children(b->items[n], children); + if (r < 0) + return r; + c += r; + } + return c; +} + +void CrushWrapper::get_children_of_type(int id, + int type, + vector *children, + bool exclude_shadow) const +{ + if (id >= 0) { + if (type == 0) { + // want leaf? + children->push_back(id); + } + return; + } + auto b = get_bucket(id); + if (IS_ERR(b)) { + return; + } + if (b->type < type) { + // give up + return; + } else if (b->type == type) { + if (!is_shadow_item(b->id) || !exclude_shadow) { + children->push_back(b->id); + } + return; + } + for (unsigned n = 0; n < b->size; n++) { + get_children_of_type(b->items[n], type, children, exclude_shadow); + } +} + +int CrushWrapper::verify_upmap(CephContext *cct, + int rule_id, + int pool_size, + const vector& up) +{ + auto rule = get_rule(rule_id); + if (IS_ERR(rule) || !rule) { + lderr(cct) << __func__ << " rule " << rule_id << " does not exist" + << dendl; + return -ENOENT; + } + for (unsigned step = 0; step < rule->len; ++step) { + auto curstep = &rule->steps[step]; + ldout(cct, 10) << __func__ << " step " << step << dendl; + switch (curstep->op) { + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSELEAF_INDEP: + { + int type = curstep->arg2; + if (type == 0) // osd + break; + map> osds_by_parent; // parent_of_desired_type -> osds + for (auto osd : up) { + auto parent = get_parent_of_type(osd, type, rule_id); + if (parent < 0) { + osds_by_parent[parent].insert(osd); + } else { + ldout(cct, 1) << __func__ << " unable to get 
parent of osd." << osd + << ", skipping for now" + << dendl; + } + } + for (auto i : osds_by_parent) { + if (i.second.size() > 1) { + lderr(cct) << __func__ << " multiple osds " << i.second + << " come from same failure domain " << i.first + << dendl; + return -EINVAL; + } + } + } + break; + + case CRUSH_RULE_CHOOSE_FIRSTN: + case CRUSH_RULE_CHOOSE_INDEP: + { + int numrep = curstep->arg1; + int type = curstep->arg2; + if (type == 0) // osd + break; + if (numrep <= 0) + numrep += pool_size; + set parents_of_type; + for (auto osd : up) { + auto parent = get_parent_of_type(osd, type, rule_id); + if (parent < 0) { + parents_of_type.insert(parent); + } else { + ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd + << ", skipping for now" + << dendl; + } + } + if ((int)parents_of_type.size() > numrep) { + lderr(cct) << __func__ << " number of buckets " + << parents_of_type.size() << " exceeds desired " << numrep + << dendl; + return -EINVAL; + } + } + break; + + default: + // ignore + break; + } + } + return 0; +} + +int CrushWrapper::_get_leaves(int id, list *leaves) const +{ + ceph_assert(leaves); + + // Already leaf? 
+ if (id >= 0) { + leaves->push_back(id); + return 0; + } + + auto b = get_bucket(id); + if (IS_ERR(b)) { + return -ENOENT; + } + + for (unsigned n = 0; n < b->size; n++) { + if (b->items[n] >= 0) { + leaves->push_back(b->items[n]); + } else { + // is a bucket, do recursive call + int r = _get_leaves(b->items[n], leaves); + if (r < 0) { + return r; + } + } + } + + return 0; // all is well +} + +int CrushWrapper::get_leaves(const string &name, set *leaves) const +{ + ceph_assert(leaves); + leaves->clear(); + + if (!name_exists(name)) { + return -ENOENT; + } + + int id = get_item_id(name); + if (id >= 0) { + // already leaf + leaves->insert(id); + return 0; + } + + list unordered; + int r = _get_leaves(id, &unordered); + if (r < 0) { + return r; + } + + for (auto &p : unordered) { + leaves->insert(p); + } + + return 0; +} + +int CrushWrapper::insert_item( + CephContext *cct, int item, float weight, string name, + const map& loc, // typename -> bucketname + bool init_weight_sets) +{ + ldout(cct, 5) << "insert_item item " << item << " weight " << weight + << " name " << name << " loc " << loc << dendl; + + if (!is_valid_crush_name(name)) + return -EINVAL; + + if (!is_valid_crush_loc(cct, loc)) + return -EINVAL; + + int r = validate_weightf(weight); + if (r < 0) { + return r; + } + + if (name_exists(name)) { + if (get_item_id(name) != item) { + ldout(cct, 10) << "device name '" << name << "' already exists as id " + << get_item_id(name) << dendl; + return -EEXIST; + } + } else { + set_item_name(item, name); + } + + int cur = item; + + // create locations if locations don't exist and add child in + // location with 0 weight the more detail in the insert_item method + // declaration in CrushWrapper.h + for (auto p = type_map.begin(); p != type_map.end(); ++p) { + // ignore device type + if (p->first == 0) + continue; + + // skip types that are unspecified + map::const_iterator q = loc.find(p->second); + if (q == loc.end()) { + ldout(cct, 2) << "warning: did not specify 
location for '" + << p->second << "' level (levels are " + << type_map << ")" << dendl; + continue; + } + + if (!name_exists(q->second)) { + ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl; + int empty = 0, newid; + int r = add_bucket(0, 0, + CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid); + if (r < 0) { + ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r) + << dendl; + return r; + } + set_item_name(newid, q->second); + + cur = newid; + continue; + } + + // add to an existing bucket + int id = get_item_id(q->second); + if (!bucket_exists(id)) { + ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl; + return -EINVAL; + } + + // check that we aren't creating a cycle. + if (subtree_contains(id, cur)) { + ldout(cct, 1) << "insert_item item " << cur << " already exists beneath " + << id << dendl; + return -EINVAL; + } + + // we have done sanity check above + crush_bucket *b = get_bucket(id); + + if (p->first != b->type) { + ldout(cct, 1) << "insert_item existing bucket has type " + << "'" << type_map[b->type] << "' != " + << "'" << type_map[p->first] << "'" << dendl; + return -EINVAL; + } + + // are we forming a loop? 
+ if (subtree_contains(cur, b->id)) { + ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id + << "; cannot form loop" << dendl; + return -ELOOP; + } + + ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight + << " to bucket " << id << dendl; + [[maybe_unused]] int r = bucket_add_item(b, cur, 0); + ceph_assert(!r); + break; + } + + // adjust the item's weight in location + if (adjust_item_weightf_in_loc(cct, item, weight, loc, + item >= 0 && init_weight_sets) > 0) { + if (item >= crush->max_devices) { + crush->max_devices = item + 1; + ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices + << dendl; + } + r = rebuild_roots_with_classes(cct); + if (r < 0) { + ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: " + << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + + ldout(cct, 1) << "error: didn't find anywhere to add item " << item + << " in " << loc << dendl; + return -EINVAL; +} + + +int CrushWrapper::move_bucket( + CephContext *cct, int id, const map& loc) +{ + // sorry this only works for buckets + if (id >= 0) + return -EINVAL; + + if (!item_exists(id)) + return -ENOENT; + + // get the name of the bucket we are trying to move for later + string id_name = get_item_name(id); + + // detach the bucket + int bucket_weight = detach_bucket(cct, id); + + // insert the bucket back into the hierarchy + return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc, + false); +} + +int CrushWrapper::detach_bucket(CephContext *cct, int item) +{ + if (!crush) + return (-EINVAL); + + if (item >= 0) + return (-EINVAL); + + // check that the bucket that we want to detach exists + ceph_assert(bucket_exists(item)); + + // get the bucket's weight + crush_bucket *b = get_bucket(item); + unsigned bucket_weight = b->weight; + + // get where the bucket is located + pair bucket_location = get_immediate_parent(item); + + // get the id of the parent bucket + int parent_id = 
get_item_id(bucket_location.second); + + // get the parent bucket + crush_bucket *parent_bucket = get_bucket(parent_id); + + if (!IS_ERR(parent_bucket)) { + // zero out the bucket weight + adjust_item_weight_in_bucket(cct, item, 0, parent_bucket->id, true); + + // remove the bucket from the parent + bucket_remove_item(parent_bucket, item); + } else if (PTR_ERR(parent_bucket) != -ENOENT) { + return PTR_ERR(parent_bucket); + } + + // check that we're happy + int test_weight = 0; + map test_location; + test_location[ bucket_location.first ] = (bucket_location.second); + + bool successful_detach = !(check_item_loc(cct, item, test_location, + &test_weight)); + ceph_assert(successful_detach); + ceph_assert(test_weight == 0); + + return bucket_weight; +} + +bool CrushWrapper::is_parent_of(int child, int p) const +{ + int parent = 0; + while (!get_immediate_parent_id(child, &parent)) { + if (parent == p) { + return true; + } + child = parent; + } + return false; +} + +int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst) +{ + if (src >= 0 || dst >= 0) + return -EINVAL; + if (!item_exists(src) || !item_exists(dst)) + return -EINVAL; + crush_bucket *a = get_bucket(src); + crush_bucket *b = get_bucket(dst); + if (is_parent_of(a->id, b->id) || is_parent_of(b->id, a->id)) { + return -EINVAL; + } + unsigned aw = a->weight; + unsigned bw = b->weight; + + // swap weights + adjust_item_weight(cct, a->id, bw); + adjust_item_weight(cct, b->id, aw); + + // swap items + map tmp; + unsigned as = a->size; + unsigned bs = b->size; + for (unsigned i = 0; i < as; ++i) { + int item = a->items[0]; + int itemw = crush_get_bucket_item_weight(a, 0); + tmp[item] = itemw; + bucket_remove_item(a, item); + } + ceph_assert(a->size == 0); + ceph_assert(b->size == bs); + for (unsigned i = 0; i < bs; ++i) { + int item = b->items[0]; + int itemw = crush_get_bucket_item_weight(b, 0); + bucket_remove_item(b, item); + bucket_add_item(a, item, itemw); + } + ceph_assert(a->size == bs); + 
ceph_assert(b->size == 0); + for (auto t : tmp) { + bucket_add_item(b, t.first, t.second); + } + ceph_assert(a->size == bs); + ceph_assert(b->size == as); + + // swap names + swap_names(src, dst); + return rebuild_roots_with_classes(cct); +} + +int CrushWrapper::link_bucket( + CephContext *cct, int id, const map& loc) +{ + // sorry this only works for buckets + if (id >= 0) + return -EINVAL; + + if (!item_exists(id)) + return -ENOENT; + + // get the name of the bucket we are trying to move for later + string id_name = get_item_name(id); + + crush_bucket *b = get_bucket(id); + unsigned bucket_weight = b->weight; + + return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc); +} + +int CrushWrapper::create_or_move_item( + CephContext *cct, int item, float weight, string name, + const map& loc, // typename -> bucketname + bool init_weight_sets) +{ + int ret = 0; + int old_iweight; + + if (!is_valid_crush_name(name)) + return -EINVAL; + + if (check_item_loc(cct, item, loc, &old_iweight)) { + ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc + << dendl; + } else { + if (_search_item_exists(item)) { + weight = get_item_weightf(item); + ldout(cct, 10) << "create_or_move_item " << item + << " exists with weight " << weight << dendl; + remove_item(cct, item, true); + } + ldout(cct, 5) << "create_or_move_item adding " << item + << " weight " << weight + << " at " << loc << dendl; + ret = insert_item(cct, item, weight, name, loc, + item >= 0 && init_weight_sets); + if (ret == 0) + ret = 1; // changed + } + return ret; +} + +int CrushWrapper::update_item( + CephContext *cct, int item, float weight, string name, + const map& loc) // typename -> bucketname +{ + ldout(cct, 5) << "update_item item " << item << " weight " << weight + << " name " << name << " loc " << loc << dendl; + int ret = 0; + + if (!is_valid_crush_name(name)) + return -EINVAL; + + if (!is_valid_crush_loc(cct, loc)) + return -EINVAL; + + ret = validate_weightf(weight); + if 
(ret < 0) { + return ret; + } + + // compare quantized (fixed-point integer) weights! + int iweight = (int)(weight * (float)0x10000); + int old_iweight; + if (check_item_loc(cct, item, loc, &old_iweight)) { + ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl; + if (old_iweight != iweight) { + ldout(cct, 5) << "update_item " << item << " adjusting weight " + << ((float)old_iweight/(float)0x10000) << " -> " << weight + << dendl; + adjust_item_weight_in_loc(cct, item, iweight, loc); + ret = 1; + } + if (get_item_name(item) != name) { + ldout(cct, 5) << "update_item setting " << item << " name to " << name + << dendl; + set_item_name(item, name); + ret = 1; + } + } else { + if (item_exists(item)) { + remove_item(cct, item, true); + } + ldout(cct, 5) << "update_item adding " << item << " weight " << weight + << " at " << loc << dendl; + ret = insert_item(cct, item, weight, name, loc); + if (ret == 0) + ret = 1; // changed + } + return ret; +} + +int CrushWrapper::get_item_weight(int id) const +{ + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == NULL) + continue; + if (b->id == id) + return b->weight; + for (unsigned i = 0; i < b->size; i++) + if (b->items[i] == id) + return crush_get_bucket_item_weight(b, i); + } + return -ENOENT; +} + +int CrushWrapper::get_item_weight_in_loc(int id, const map &loc) +{ + for (map::const_iterator l = loc.begin(); l != loc.end(); ++l) { + + int bid = get_item_id(l->second); + if (!bucket_exists(bid)) + continue; + crush_bucket *b = get_bucket(bid); + for (unsigned int i = 0; i < b->size; i++) { + if (b->items[i] == id) { + return crush_get_bucket_item_weight(b, i); + } + } + } + return -ENOENT; +} + +int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight, + bool update_weight_sets) +{ + ldout(cct, 5) << __func__ << " " << id << " weight " << weight + << " update_weight_sets=" << (int)update_weight_sets + << dendl; + int changed = 0; + 
for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + if (!crush->buckets[bidx]) { + continue; + } + int r = adjust_item_weight_in_bucket(cct, id, weight, -1-bidx, + update_weight_sets); + if (r > 0) { + ++changed; + } + } + if (!changed) { + return -ENOENT; + } + return changed; +} + +int CrushWrapper::adjust_item_weight_in_bucket( + CephContext *cct, int id, int weight, + int bucket_id, + bool update_weight_sets) +{ + ldout(cct, 5) << __func__ << " " << id << " weight " << weight + << " in bucket " << bucket_id + << " update_weight_sets=" << (int)update_weight_sets + << dendl; + int changed = 0; + if (!bucket_exists(bucket_id)) { + return -ENOENT; + } + crush_bucket *b = get_bucket(bucket_id); + for (unsigned int i = 0; i < b->size; i++) { + if (b->items[i] == id) { + int diff = bucket_adjust_item_weight(cct, b, id, weight, + update_weight_sets); + ldout(cct, 5) << __func__ << " " << id << " diff " << diff + << " in bucket " << bucket_id << dendl; + adjust_item_weight(cct, bucket_id, b->weight, false); + changed++; + } + } + // update weight-sets so they continue to sum + for (auto& p : choose_args) { + auto &cmap = p.second; + if (!cmap.args) { + continue; + } + crush_choose_arg *arg = &cmap.args[-1 - bucket_id]; + if (!arg->weight_set) { + continue; + } + ceph_assert(arg->weight_set_positions > 0); + vector w(arg->weight_set_positions); + for (unsigned i = 0; i < b->size; ++i) { + for (unsigned j = 0; j < arg->weight_set_positions; ++j) { + crush_weight_set *weight_set = &arg->weight_set[j]; + w[j] += weight_set->weights[i]; + } + } + ldout(cct,5) << __func__ << " adjusting bucket " << bucket_id + << " cmap " << p.first << " weights to " << w << dendl; + ostringstream ss; + choose_args_adjust_item_weight(cct, cmap, bucket_id, w, &ss); + } + if (!changed) { + return -ENOENT; + } + return changed; +} + +int CrushWrapper::adjust_item_weight_in_loc( + CephContext *cct, int id, int weight, + const map& loc, + bool update_weight_sets) +{ + ldout(cct, 5) << 
"adjust_item_weight_in_loc " << id << " weight " << weight + << " in " << loc + << " update_weight_sets=" << (int)update_weight_sets + << dendl; + int changed = 0; + for (auto l = loc.begin(); l != loc.end(); ++l) { + int bid = get_item_id(l->second); + if (!bucket_exists(bid)) + continue; + int r = adjust_item_weight_in_bucket(cct, id, weight, bid, + update_weight_sets); + if (r > 0) { + ++changed; + } + } + if (!changed) { + return -ENOENT; + } + return changed; +} + +int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight, + bool update_weight_sets) +{ + ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl; + crush_bucket *b = get_bucket(id); + if (IS_ERR(b)) + return PTR_ERR(b); + int changed = 0; + list q; + q.push_back(b); + while (!q.empty()) { + b = q.front(); + q.pop_front(); + int local_changed = 0; + for (unsigned i=0; isize; ++i) { + int n = b->items[i]; + if (n >= 0) { + adjust_item_weight_in_bucket(cct, n, weight, b->id, update_weight_sets); + ++changed; + ++local_changed; + } else { + crush_bucket *sub = get_bucket(n); + if (IS_ERR(sub)) + continue; + q.push_back(sub); + } + } + } + return changed; +} + +bool CrushWrapper::check_item_present(int id) const +{ + bool found = false; + + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == 0) + continue; + for (unsigned i = 0; i < b->size; i++) + if (b->items[i] == id) + found = true; + } + return found; +} + + +pair CrushWrapper::get_immediate_parent(int id, int *_ret) const +{ + + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == 0) + continue; + if (is_shadow_item(b->id)) + continue; + for (unsigned i = 0; i < b->size; i++) + if (b->items[i] == id) { + string parent_id = name_map.at(b->id); + string parent_bucket_type = type_map.at(b->type); + if (_ret) + *_ret = 0; + return make_pair(parent_bucket_type, parent_id); + } + } + + if (_ret) + *_ret = 
-ENOENT; + + return pair(); +} + +int CrushWrapper::get_immediate_parent_id(int id, int *parent) const +{ + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == 0) + continue; + if (is_shadow_item(b->id)) + continue; + for (unsigned i = 0; i < b->size; i++) { + if (b->items[i] == id) { + *parent = b->id; + return 0; + } + } + } + return -ENOENT; +} + +int CrushWrapper::get_parent_of_type(int item, int type, int rule) const +{ + if (rule < 0) { + // no rule specified + do { + int r = get_immediate_parent_id(item, &item); + if (r < 0) { + return 0; + } + } while (get_bucket_type(item) != type); + return item; + } + set roots; + find_takes_by_rule(rule, &roots); + for (auto root : roots) { + vector candidates; + get_children_of_type(root, type, &candidates, false); + for (auto candidate : candidates) { + if (subtree_contains(candidate, item)) { + // note that here we assure that no two different buckets + // from a single crush rule will share a same device, + // which should generally be true. 
+ return candidate; + } + } + } + return 0; // not found +} + +void CrushWrapper::get_subtree_of_type(int type, vector *subtrees) +{ + set roots; + find_roots(&roots); + for (auto r: roots) { + crush_bucket *b = get_bucket(r); + if (IS_ERR(b)) + continue; + get_children_of_type(b->id, type, subtrees); + } +} + +bool CrushWrapper::class_is_in_use(int class_id, ostream *ss) +{ + list rules; + for (unsigned i = 0; i < crush->max_rules; ++i) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j = 0; j < r->len; ++j) { + if (r->steps[j].op == CRUSH_RULE_TAKE) { + int root = r->steps[j].arg1; + for (auto &p : class_bucket) { + auto& q = p.second; + if (q.count(class_id) && q[class_id] == root) { + rules.push_back(i); + } + } + } + } + } + if (rules.empty()) { + return false; + } + if (ss) { + ostringstream os; + for (auto &p: rules) { + os << "'" << get_rule_name(p) <<"',"; + } + string out(os.str()); + out.resize(out.size() - 1); // drop last ',' + *ss << "still referenced by crush_rule(s): " << out; + } + return true; +} + +int CrushWrapper::rename_class(const string& srcname, const string& dstname) +{ + auto i = class_rname.find(srcname); + if (i == class_rname.end()) + return -ENOENT; + auto j = class_rname.find(dstname); + if (j != class_rname.end()) + return -EEXIST; + + int class_id = i->second; + ceph_assert(class_name.count(class_id)); + // rename any shadow buckets of old class name + for (auto &it: class_map) { + if (it.first < 0 && it.second == class_id) { + string old_name = get_item_name(it.first); + size_t pos = old_name.find("~"); + ceph_assert(pos != string::npos); + string name_no_class = old_name.substr(0, pos); + string old_class_name = old_name.substr(pos + 1); + ceph_assert(old_class_name == srcname); + string new_name = name_no_class + "~" + dstname; + // we do not use set_item_name + // because the name is intentionally invalid + name_map[it.first] = new_name; + have_rmaps = false; + } + } + + // rename class + 
class_rname.erase(srcname); + class_name.erase(class_id); + class_rname[dstname] = class_id; + class_name[class_id] = dstname; + return 0; +} + +int CrushWrapper::populate_classes( + const std::map>& old_class_bucket) +{ + // build set of previous used shadow ids + set used_ids; + for (auto& p : old_class_bucket) { + for (auto& q : p.second) { + used_ids.insert(q.second); + } + } + // accumulate weight values for each carg and bucket as we go. because it is + // depth first, we will have the nested bucket weights we need when we + // finish constructing the containing buckets. + map>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position] + set roots; + find_nonshadow_roots(&roots); + for (auto &r : roots) { + if (r >= 0) + continue; + for (auto &c : class_name) { + int clone; + int res = device_class_clone(r, c.first, old_class_bucket, used_ids, + &clone, &cmap_item_weight); + if (res < 0) + return res; + } + } + return 0; +} + +int CrushWrapper::trim_roots_with_class(CephContext *cct) +{ + set roots; + find_shadow_roots(&roots); + for (auto &r : roots) { + if (r >= 0) + continue; + int res = remove_root(cct, r); + if (res) + return res; + } + // there is no need to reweight because we only remove from the + // root and down + return 0; +} + +int32_t CrushWrapper::_alloc_class_id() const { + if (class_name.empty()) { + return 0; + } + int32_t class_id = class_name.rbegin()->first + 1; + if (class_id >= 0) { + return class_id; + } + // wrapped, pick a random start and do exhaustive search + uint32_t upperlimit = numeric_limits::max(); + upperlimit++; + class_id = rand() % upperlimit; + const auto start = class_id; + do { + if (!class_name.count(class_id)) { + return class_id; + } else { + class_id++; + if (class_id < 0) { + class_id = 0; + } + } + } while (class_id != start); + ceph_abort_msg("no available class id"); +} + +int CrushWrapper::set_subtree_class( + const string& subtree, + const string& new_class) +{ + if (!name_exists(subtree)) { + 
return -ENOENT; + } + + int new_class_id = get_or_create_class_id(new_class); + int id = get_item_id(subtree); + list q = { id }; + while (!q.empty()) { + int id = q.front(); + q.pop_front(); + crush_bucket *b = get_bucket(id); + if (IS_ERR(b)) { + return PTR_ERR(b); + } + for (unsigned i = 0; i < b->size; ++i) { + int item = b->items[i]; + if (item >= 0) { + class_map[item] = new_class_id; + } else { + q.push_back(item); + } + } + } + return 0; +} + +int CrushWrapper::reclassify( + CephContext *cct, + ostream& out, + const map& classify_root, + const map>& classify_bucket + ) +{ + map reclassified_bucket; // orig_id -> class + + // classify_root + for (auto& i : classify_root) { + string root = i.first; + if (!name_exists(root)) { + out << "root " << root << " does not exist" << std::endl; + return -EINVAL; + } + int root_id = get_item_id(root); + string new_class = i.second; + int new_class_id = get_or_create_class_id(new_class); + out << "classify_root " << root << " (" << root_id + << ") as " << new_class << std::endl; + + // validate rules + for (unsigned j = 0; j < crush->max_rules; j++) { + if (crush->rules[j]) { + auto rule = crush->rules[j]; + for (unsigned k = 0; k < rule->len; ++k) { + if (rule->steps[k].op == CRUSH_RULE_TAKE) { + int step_item = get_rule_arg1(j, k); + int original_item; + int c; + int res = split_id_class(step_item, &original_item, &c); + if (res < 0) + return res; + if (c >= 0) { + if (original_item == root_id) { + out << " rule " << j << " includes take on root " + << root << " class " << c << std::endl; + return -EINVAL; + } + } + } + } + } + } + + // rebuild new buckets for root + //cout << "before class_bucket: " << class_bucket << std::endl; + map renumber; + list q; + q.push_back(root_id); + while (!q.empty()) { + int id = q.front(); + q.pop_front(); + crush_bucket *bucket = get_bucket(id); + if (IS_ERR(bucket)) { + out << "cannot find bucket " << id + << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl; + return 
PTR_ERR(bucket); + } + + // move bucket + int new_id = get_new_bucket_id(); + out << " renumbering bucket " << id << " -> " << new_id << std::endl; + renumber[id] = new_id; + crush->buckets[-1-new_id] = bucket; + bucket->id = new_id; + crush->buckets[-1-id] = crush_make_bucket(crush, + bucket->alg, + bucket->hash, + bucket->type, + 0, NULL, NULL); + crush->buckets[-1-id]->id = id; + for (auto& i : choose_args) { + i.second.args[-1-new_id] = i.second.args[-1-id]; + memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0])); + } + class_bucket.erase(id); + class_bucket[new_id][new_class_id] = id; + name_map[new_id] = string(get_item_name(id)); + name_map[id] = string(get_item_name(id)) + "~" + new_class; + + for (unsigned j = 0; j < bucket->size; ++j) { + if (bucket->items[j] < 0) { + q.push_front(bucket->items[j]); + } else { + // we don't reclassify the device here; if the users wants that, + // they can pass --set-subtree-class separately. + } + } + } + //cout << "mid class_bucket: " << class_bucket << std::endl; + + for (int i = 0; i < crush->max_buckets; ++i) { + crush_bucket *b = crush->buckets[i]; + if (!b) { + continue; + } + for (unsigned j = 0; j < b->size; ++j) { + if (renumber.count(b->items[j])) { + b->items[j] = renumber[b->items[j]]; + } + } + } + + int r = rebuild_roots_with_classes(cct); + if (r < 0) { + out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r) + << std::endl; + return r; + } + //cout << "final class_bucket: " << class_bucket << std::endl; + } + + // classify_bucket + map send_to; // source bucket -> dest bucket + map> new_class_bucket; + map new_bucket_names; + map> new_buckets; + map new_bucket_by_name; + for (auto& i : classify_bucket) { + const string& match = i.first; // prefix% or %suffix + const string& new_class = i.second.first; + const string& default_parent = i.second.second; + if (!name_exists(default_parent)) { + out << "default parent " << default_parent << " does not exist" + << std::endl; + return -EINVAL; + 
} + int default_parent_id = get_item_id(default_parent); + crush_bucket *default_parent_bucket = get_bucket(default_parent_id); + assert(default_parent_bucket); + string default_parent_type_name = get_type_name(default_parent_bucket->type); + + out << "classify_bucket " << match << " as " << new_class + << " default bucket " << default_parent + << " (" << default_parent_type_name << ")" << std::endl; + + int new_class_id = get_or_create_class_id(new_class); + for (int j = 0; j < crush->max_buckets; ++j) { + crush_bucket *b = crush->buckets[j]; + if (!b || is_shadow_item(b->id)) { + continue; + } + string name = get_item_name(b->id); + if (name.length() < match.length()) { + continue; + } + string basename; + if (match[0] == '%') { + if (match.substr(1) != name.substr(name.size() - match.size() + 1)) { + continue; + } + basename = name.substr(0, name.size() - match.size() + 1); + } else if (match[match.size() - 1] == '%') { + if (match.substr(0, match.size() - 1) != + name.substr(0, match.size() - 1)) { + continue; + } + basename = name.substr(match.size() - 1); + } else if (match == name) { + basename = default_parent; + } else { + continue; + } + cout << "match " << match << " to " << name << " basename " << basename + << std::endl; + // look up or create basename bucket + int base_id; + if (name_exists(basename)) { + base_id = get_item_id(basename); + cout << " have base " << base_id << std::endl; + } else if (new_bucket_by_name.count(basename)) { + base_id = new_bucket_by_name[basename]; + cout << " already creating base " << base_id << std::endl; + } else { + base_id = get_new_bucket_id(); + crush->buckets[-1-base_id] = crush_make_bucket(crush, + b->alg, + b->hash, + b->type, + 0, NULL, NULL); + crush->buckets[-1-base_id]->id = base_id; + name_map[base_id] = basename; + new_bucket_by_name[basename] = base_id; + cout << " created base " << base_id << std::endl; + + new_buckets[base_id][default_parent_type_name] = default_parent; + } + send_to[b->id] = base_id; + 
new_class_bucket[base_id][new_class_id] = b->id; + new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id); + + // make sure devices are classified + for (unsigned i = 0; i < b->size; ++i) { + int item = b->items[i]; + if (item >= 0) { + class_map[item] = new_class_id; + } + } + } + } + + // no name_exists() works below, + have_rmaps = false; + + // copy items around + //cout << "send_to " << send_to << std::endl; + set roots; + find_roots(&roots); + for (auto& i : send_to) { + crush_bucket *from = get_bucket(i.first); + crush_bucket *to = get_bucket(i.second); + cout << "moving items from " << from->id << " (" << get_item_name(from->id) + << ") to " << to->id << " (" << get_item_name(to->id) << ")" + << std::endl; + for (unsigned j = 0; j < from->size; ++j) { + int item = from->items[j]; + int r; + map to_loc; + to_loc[get_type_name(to->type)] = get_item_name(to->id); + if (item >= 0) { + if (subtree_contains(to->id, item)) { + continue; + } + map from_loc; + from_loc[get_type_name(from->type)] = get_item_name(from->id); + auto w = get_item_weightf_in_loc(item, from_loc); + r = insert_item(cct, item, + w, + get_item_name(item), + to_loc); + } else { + if (!send_to.count(item)) { + lderr(cct) << "item " << item << " in bucket " << from->id + << " is not also a reclassified bucket" << dendl; + return -EINVAL; + } + int newitem = send_to[item]; + if (subtree_contains(to->id, newitem)) { + continue; + } + r = link_bucket(cct, newitem, to_loc); + } + if (r != 0) { + cout << __func__ << " err from insert_item: " << cpp_strerror(r) + << std::endl; + return r; + } + } + } + + // make sure new buckets have parents + for (auto& i : new_buckets) { + int parent; + if (get_immediate_parent_id(i.first, &parent) < 0) { + cout << "new bucket " << i.first << " missing parent, adding at " + << i.second << std::endl; + int r = link_bucket(cct, i.first, i.second); + if (r != 0) { + cout << __func__ << " err from insert_item: " << cpp_strerror(r) + << std::endl; + 
return r; + } + } + } + + // set class mappings + //cout << "pre class_bucket: " << class_bucket << std::endl; + for (auto& i : new_class_bucket) { + for (auto& j : i.second) { + class_bucket[i.first][j.first] = j.second; + } + + } + //cout << "post class_bucket: " << class_bucket << std::endl; + for (auto& i : new_bucket_names) { + name_map[i.first] = i.second; + } + + int r = rebuild_roots_with_classes(cct); + if (r < 0) { + out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r) + << std::endl; + return r; + } + //cout << "final class_bucket: " << class_bucket << std::endl; + + return 0; +} + +// Return the first unused (negative) bucket id, scanning from -1 downward. +// If every existing slot is taken, grow crush->buckets and each choose_args +// array by one slot and return the id of that new slot. +int CrushWrapper::get_new_bucket_id() +{ + int id = -1; + // test the bound first so buckets[] is never indexed past max_buckets + while (-1-id < crush->max_buckets && + crush->buckets[-1-id]) { + id--; + } + if (-1-id == crush->max_buckets) { + ++crush->max_buckets; + crush->buckets = (struct crush_bucket**)realloc( + crush->buckets, + sizeof(crush->buckets[0]) * crush->max_buckets); + // realloc leaves the new slot uninitialized; mark it empty + crush->buckets[crush->max_buckets - 1] = NULL; + for (auto& i : choose_args) { + assert(i.second.size == (__u32)crush->max_buckets - 1); + ++i.second.size; + i.second.args = (struct crush_choose_arg*)realloc( + i.second.args, + sizeof(i.second.args[0]) * i.second.size); + // likewise zero the new choose_arg so it carries no weight_set/ids + memset(&i.second.args[i.second.size - 1], 0, + sizeof(i.second.args[0])); + } + } + return id; +} + +void CrushWrapper::reweight(CephContext *cct) +{ + set roots; + find_nonshadow_roots(&roots); + for (auto id : roots) { + if (id >= 0) + continue; + crush_bucket *b = get_bucket(id); + ldout(cct, 5) << "reweight root bucket " << id << dendl; + int r = crush_reweight_bucket(crush, b); + ceph_assert(r == 0); + + for (auto& i : choose_args) { + //cout << "carg " << i.first << std::endl; + vector w; // discard top-level weights + reweight_bucket(b, i.second, &w); + } + } + int r = rebuild_roots_with_classes(cct); + ceph_assert(r == 0); +} + +void CrushWrapper::reweight_bucket( + crush_bucket *b, + crush_choose_arg_map& arg_map, + vector *weightv) +{ + int idx = -1 - b->id; + unsigned npos = arg_map.args[idx].weight_set_positions; + //cout << __func__ << " " << b->id << " npos " << npos << std::endl; + 
weightv->resize(npos); + for (unsigned i = 0; i < b->size; ++i) { + int item = b->items[i]; + if (item >= 0) { + for (unsigned pos = 0; pos < npos; ++pos) { + // each position has its own weight vector; index it by pos + (*weightv)[pos] += arg_map.args[idx].weight_set[pos].weights[i]; + } + } else { + vector subw(npos); + crush_bucket *sub = get_bucket(item); + assert(sub); + reweight_bucket(sub, arg_map, &subw); + for (unsigned pos = 0; pos < npos; ++pos) { + (*weightv)[pos] += subw[pos]; + // stash the real bucket weight as the weight for this reference, + // separately for every position + arg_map.args[idx].weight_set[pos].weights[i] = subw[pos]; + } + } + } + //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl; +} + +int CrushWrapper::add_simple_rule_at( + string name, string root_name, + string failure_domain_name, + string device_class, + string mode, int rule_type, + int rno, + ostream *err) +{ + if (rule_exists(name)) { + if (err) + *err << "rule " << name << " exists"; + return -EEXIST; + } + if (rno >= 0) { + if (rule_exists(rno)) { + if (err) + *err << "rule with ruleno " << rno << " exists"; + return -EEXIST; + } + if (ruleset_exists(rno)) { + if (err) + *err << "ruleset " << rno << " exists"; + return -EEXIST; + } + } else { + for (rno = 0; rno < get_max_rules(); rno++) { + if (!rule_exists(rno) && !ruleset_exists(rno)) + break; + } + } + if (!name_exists(root_name)) { + if (err) + *err << "root item " << root_name << " does not exist"; + return -ENOENT; + } + int root = get_item_id(root_name); + int type = 0; + if (failure_domain_name.length()) { + type = get_type_id(failure_domain_name); + if (type < 0) { + if (err) + *err << "unknown type " << failure_domain_name; + return -EINVAL; + } + } + if (device_class.size()) { + if (!class_exists(device_class)) { + if (err) + *err << "device class " << device_class << " does not exist"; + return -EINVAL; + } + int c = get_class_id(device_class); + if (class_bucket.count(root) == 0 || + class_bucket[root].count(c) == 0) { + if (err) + *err << "root " << root_name << " has no devices with 
class " + << device_class; + return -EINVAL; + } + root = class_bucket[root][c]; + } + if (mode != "firstn" && mode != "indep") { + if (err) + *err << "unknown mode " << mode; + return -EINVAL; + } + + int steps = 3; + if (mode == "indep") + steps = 5; + int min_rep = mode == "firstn" ? 1 : 3; + int max_rep = mode == "firstn" ? 10 : 20; + //set the ruleset the same as rule_id(rno) + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep); + ceph_assert(rule); + int step = 0; + if (mode == "indep") { + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + } + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0); + if (type) + crush_rule_set_step(rule, step++, + mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN : + CRUSH_RULE_CHOOSELEAF_INDEP, + CRUSH_CHOOSE_N, + type); + else + crush_rule_set_step(rule, step++, + mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN : + CRUSH_RULE_CHOOSE_INDEP, + CRUSH_CHOOSE_N, + 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + + int ret = crush_add_rule(crush, rule, rno); + if(ret < 0) { + // err is optional (every other use above is guarded); never deref null + if (err) + *err << "failed to add rule " << rno << " because " << cpp_strerror(ret); + return ret; + } + set_rule_name(rno, name); + have_rmaps = false; + return rno; +} + +int CrushWrapper::add_simple_rule( + string name, string root_name, + string failure_domain_name, + string device_class, + string mode, int rule_type, + ostream *err) +{ + return add_simple_rule_at(name, root_name, failure_domain_name, device_class, + mode, + rule_type, -1, err); +} + +float CrushWrapper::_get_take_weight_osd_map(int root, + map *pmap) const +{ + float sum = 0.0; + list q; + q.push_back(root); + //breadth first iterate the OSD tree + while (!q.empty()) { + int bno = q.front(); + q.pop_front(); + crush_bucket *b = crush->buckets[-1-bno]; + ceph_assert(b); + for (unsigned j=0; jsize; ++j) { + int item_id = b->items[j]; + if (item_id >= 0) { 
//it's an OSD + float w = crush_get_bucket_item_weight(b, j); + (*pmap)[item_id] = w; + sum += w; + } else { //not an OSD, expand the child later + q.push_back(item_id); + } + } + } + return sum; +} + +void CrushWrapper::_normalize_weight_map(float sum, + const map& m, + map *pmap) const +{ + for (auto& p : m) { + map::iterator q = pmap->find(p.first); + if (q == pmap->end()) { + (*pmap)[p.first] = p.second / sum; + } else { + q->second += p.second / sum; + } + } +} + +int CrushWrapper::get_take_weight_osd_map(int root, map *pmap) const +{ + map m; + float sum = _get_take_weight_osd_map(root, &m); + _normalize_weight_map(sum, m, pmap); + return 0; +} + +int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, + map *pmap) const +{ + if (ruleno >= crush->max_rules) + return -ENOENT; + if (crush->rules[ruleno] == NULL) + return -ENOENT; + crush_rule *rule = crush->rules[ruleno]; + + // build a weight map for each TAKE in the rule, and then merge them + + // FIXME: if there are multiple takes that place a different number of + // objects we do not take that into account. (Also, note that doing this + // right is also a function of the pool, since the crush rule + // might choose 2 + choose 2 but pool size may only be 3.) 
+ for (unsigned i=0; ilen; ++i) { + map m; + float sum = 0; + if (rule->steps[i].op == CRUSH_RULE_TAKE) { + int n = rule->steps[i].arg1; + if (n >= 0) { + m[n] = 1.0; + sum = 1.0; + } else { + sum += _get_take_weight_osd_map(n, &m); + } + } + _normalize_weight_map(sum, m, pmap); + } + + return 0; +} + +// Destroy rule `ruleno`, drop its name, and rebuild the per-class shadow roots. +// Returns -ENOENT if the rule number is out of range or the slot is empty. +int CrushWrapper::remove_rule(int ruleno) +{ + // reject negative numbers too; they would index before crush->rules[] + if (ruleno < 0 || ruleno >= (int)crush->max_rules) + return -ENOENT; + if (crush->rules[ruleno] == NULL) + return -ENOENT; + crush_destroy_rule(crush->rules[ruleno]); + crush->rules[ruleno] = NULL; + rule_name_map.erase(ruleno); + have_rmaps = false; + return rebuild_roots_with_classes(nullptr); +} + +int CrushWrapper::bucket_adjust_item_weight( + CephContext *cct, crush_bucket *bucket, int item, int weight, + bool adjust_weight_sets) +{ + if (adjust_weight_sets) { + unsigned position; + for (position = 0; position < bucket->size; position++) + if (bucket->items[position] == item) + break; + ceph_assert(position != bucket->size); + for (auto &w : choose_args) { + crush_choose_arg_map &arg_map = w.second; + crush_choose_arg *arg = &arg_map.args[-1-bucket->id]; + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + crush_weight_set *weight_set = &arg->weight_set[j]; + weight_set->weights[position] = weight; + } + } + } + return crush_bucket_adjust_item_weight(crush, bucket, item, weight); +} + +int CrushWrapper::add_bucket( + int bucketno, int alg, int hash, int type, int size, + int *items, int *weights, int *idout) +{ + if (alg == 0) { + alg = get_default_bucket_alg(); + if (alg == 0) + return -EINVAL; + } + crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items, + weights); + ceph_assert(b); + ceph_assert(idout); + int r = crush_add_bucket(crush, bucketno, b, idout); + int pos = -1 - *idout; + for (auto& p : choose_args) { + crush_choose_arg_map& cmap = p.second; + unsigned new_size = crush->max_buckets; + if (cmap.args) { + if ((int)cmap.size < crush->max_buckets) { + cmap.args = static_cast(realloc( + 
cmap.args, + sizeof(crush_choose_arg) * new_size)); + ceph_assert(cmap.args); + memset(&cmap.args[cmap.size], 0, + sizeof(crush_choose_arg) * (new_size - cmap.size)); + cmap.size = new_size; + } + } else { + cmap.args = static_cast(calloc(sizeof(crush_choose_arg), + new_size)); + ceph_assert(cmap.args); + cmap.size = new_size; + } + if (size > 0) { + int positions = get_choose_args_positions(cmap); + crush_choose_arg& carg = cmap.args[pos]; + // weight_set is indexed by position below, so it needs `positions` + // entries (not `size`, which is the per-position item count) + carg.weight_set = static_cast(calloc(positions, + sizeof(crush_weight_set))); + carg.weight_set_positions = positions; + for (int ppos = 0; ppos < positions; ++ppos) { + carg.weight_set[ppos].weights = (__u32*)calloc(sizeof(__u32), size); + carg.weight_set[ppos].size = size; + for (int bpos = 0; bpos < size; ++bpos) { + carg.weight_set[ppos].weights[bpos] = weights[bpos]; + } + } + } + assert(crush->max_buckets == (int)cmap.size); + } + return r; +} + +int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight) +{ + __u32 new_size = bucket->size + 1; + int r = crush_bucket_add_item(crush, bucket, item, weight); + if (r < 0) { + return r; + } + for (auto &w : choose_args) { + crush_choose_arg_map &arg_map = w.second; + crush_choose_arg *arg = &arg_map.args[-1-bucket->id]; + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + crush_weight_set *weight_set = &arg->weight_set[j]; + weight_set->weights = (__u32*)realloc(weight_set->weights, + new_size * sizeof(__u32)); + ceph_assert(weight_set->size + 1 == new_size); + weight_set->weights[weight_set->size] = weight; + weight_set->size = new_size; + } + if (arg->ids_size) { + arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32)); + ceph_assert(arg->ids_size + 1 == new_size); + arg->ids[arg->ids_size] = item; + arg->ids_size = new_size; + } + } + return 0; +} + +int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item) +{ + __u32 new_size = bucket->size - 1; + unsigned position; + for (position = 0; position < bucket->size; position++) + if 
(bucket->items[position] == item) + break; + ceph_assert(position != bucket->size); + int r = crush_bucket_remove_item(crush, bucket, item); + if (r < 0) { + return r; + } + for (auto &w : choose_args) { + crush_choose_arg_map &arg_map = w.second; + crush_choose_arg *arg = &arg_map.args[-1-bucket->id]; + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + crush_weight_set *weight_set = &arg->weight_set[j]; + ceph_assert(weight_set->size - 1 == new_size); + for (__u32 k = position; k < new_size; k++) + weight_set->weights[k] = weight_set->weights[k+1]; + if (new_size) { + weight_set->weights = (__u32*)realloc(weight_set->weights, + new_size * sizeof(__u32)); + } else { + free(weight_set->weights); + weight_set->weights = NULL; + } + weight_set->size = new_size; + } + if (arg->ids_size) { + ceph_assert(arg->ids_size - 1 == new_size); + for (__u32 k = position; k < new_size; k++) + arg->ids[k] = arg->ids[k+1]; + if (new_size) { + arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32)); + } else { + free(arg->ids); + arg->ids = NULL; + } + arg->ids_size = new_size; + } + } + return 0; +} + +int CrushWrapper::bucket_set_alg(int bid, int alg) +{ + crush_bucket *b = get_bucket(bid); + if (!b) { + return -ENOENT; + } + b->alg = alg; + return 0; +} + +int CrushWrapper::update_device_class(int id, + const string& class_name, + const string& name, + ostream *ss) +{ + ceph_assert(item_exists(id)); + auto old_class_name = get_item_class(id); + if (old_class_name && old_class_name != class_name) { + *ss << "osd." 
<< id << " has already bound to class '" << old_class_name + << "', can not reset class to '" << class_name << "'; " + << "use 'ceph osd crush rm-device-class ' to " + << "remove old class first"; + return -EBUSY; + } + + int class_id = get_or_create_class_id(class_name); + if (id < 0) { + *ss << name << " id " << id << " is negative"; + return -EINVAL; + } + + if (class_map.count(id) != 0 && class_map[id] == class_id) { + *ss << name << " already set to class " << class_name << ". "; + return 0; + } + + set_item_class(id, class_id); + + int r = rebuild_roots_with_classes(nullptr); + if (r < 0) + return r; + return 1; +} + +int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss) +{ + ceph_assert(ss); + const char *name = get_item_name(id); + if (!name) { + *ss << "osd." << id << " does not have a name"; + return -ENOENT; + } + + const char *class_name = get_item_class(id); + if (!class_name) { + *ss << "osd." << id << " has not been bound to a specific class yet"; + return 0; + } + class_remove_item(id); + + int r = rebuild_roots_with_classes(cct); + if (r < 0) { + *ss << "unable to rebuild roots with class '" << class_name << "' " + << "of osd." 
<< id << ": " << cpp_strerror(r); + return r; + } + return 0; +} + +int CrushWrapper::device_class_clone( + int original_id, int device_class, + const std::map>& old_class_bucket, + const std::set& used_ids, + int *clone, + map>> *cmap_item_weight) +{ + const char *item_name = get_item_name(original_id); + if (item_name == NULL) + return -ECHILD; + const char *class_name = get_class_name(device_class); + if (class_name == NULL) + return -EBADF; + string copy_name = item_name + string("~") + class_name; + if (name_exists(copy_name)) { + *clone = get_item_id(copy_name); + return 0; + } + + crush_bucket *original = get_bucket(original_id); + ceph_assert(!IS_ERR(original)); + crush_bucket *copy = crush_make_bucket(crush, + original->alg, + original->hash, + original->type, + 0, NULL, NULL); + ceph_assert(copy); + + vector item_orig_pos; // new item pos -> orig item pos + for (unsigned i = 0; i < original->size; i++) { + int item = original->items[i]; + int weight = crush_get_bucket_item_weight(original, i); + if (item >= 0) { + if (class_map.count(item) != 0 && class_map[item] == device_class) { + int res = crush_bucket_add_item(crush, copy, item, weight); + if (res) + return res; + } else { + continue; + } + } else { + int child_copy_id; + int res = device_class_clone(item, device_class, old_class_bucket, + used_ids, &child_copy_id, + cmap_item_weight); + if (res < 0) + return res; + crush_bucket *child_copy = get_bucket(child_copy_id); + ceph_assert(!IS_ERR(child_copy)); + res = crush_bucket_add_item(crush, copy, child_copy_id, + child_copy->weight); + if (res) + return res; + } + item_orig_pos.push_back(i); + } + ceph_assert(item_orig_pos.size() == copy->size); + + int bno = 0; + if (old_class_bucket.count(original_id) && + old_class_bucket.at(original_id).count(device_class)) { + bno = old_class_bucket.at(original_id).at(device_class); + } else { + // pick a new shadow bucket id that is not used by the current map + // *or* any previous shadow buckets. 
+ bno = -1; + while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) || + used_ids.count(bno)) { + --bno; + } + } + int res = crush_add_bucket(crush, bno, copy, clone); + if (res) + return res; + ceph_assert(!bno || bno == *clone); + + res = set_item_class(*clone, device_class); + if (res < 0) + return res; + + // we do not use set_item_name because the name is intentionally invalid + name_map[*clone] = copy_name; + if (have_rmaps) + name_rmap[copy_name] = *clone; + class_bucket[original_id][device_class] = *clone; + + // set up choose_args for the new bucket. + for (auto& w : choose_args) { + crush_choose_arg_map& cmap = w.second; + if (crush->max_buckets > (int)cmap.size) { + unsigned new_size = crush->max_buckets; + cmap.args = static_cast(realloc(cmap.args, + new_size * sizeof(cmap.args[0]))); + ceph_assert(cmap.args); + memset(cmap.args + cmap.size, 0, + (new_size - cmap.size) * sizeof(cmap.args[0])); + cmap.size = new_size; + } + auto& o = cmap.args[-1-original_id]; + auto& n = cmap.args[-1-bno]; + n.ids_size = 0; // FIXME: implement me someday + n.weight_set_positions = o.weight_set_positions; + n.weight_set = static_cast(calloc( + n.weight_set_positions, sizeof(crush_weight_set))); + for (size_t s = 0; s < n.weight_set_positions; ++s) { + n.weight_set[s].size = copy->size; + n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32)); + } + // one running total per position; this must live outside the position + // loop, otherwise only the last position's total survives in + // cmap_item_weight and parents read 0 for every other position + vector bucket_weights(n.weight_set_positions); + for (size_t s = 0; s < n.weight_set_positions; ++s) { + for (size_t i = 0; i < copy->size; ++i) { + int item = copy->items[i]; + if (item >= 0) { + n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]]; + } else if ((*cmap_item_weight)[w.first].count(item)) { + n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s]; + } else { + n.weight_set[s].weights[i] = 0; + } + bucket_weights[s] += n.weight_set[s].weights[i]; + } + } + // as before, record nothing for a bucket that has no weight-set positions + if (!bucket_weights.empty()) { + (*cmap_item_weight)[w.first][bno] = bucket_weights; + } + } + return 0; +} + +int 
CrushWrapper::get_rules_by_class(const string &class_name, set *rules) +{ + ceph_assert(rules); + rules->clear(); + if (!class_exists(class_name)) { + return -ENOENT; + } + int class_id = get_class_id(class_name); + for (unsigned i = 0; i < crush->max_rules; ++i) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j = 0; j < r->len; ++j) { + if (r->steps[j].op == CRUSH_RULE_TAKE) { + int step_item = r->steps[j].arg1; + int original_item; + int c; + int res = split_id_class(step_item, &original_item, &c); + if (res < 0) { + return res; + } + if (c != -1 && c == class_id) { + rules->insert(i); + break; + } + } + } + } + return 0; +} + +// return rules that might reference the given osd +int CrushWrapper::get_rules_by_osd(int osd, set *rules) +{ + ceph_assert(rules); + rules->clear(); + if (osd < 0) { + return -EINVAL; + } + for (unsigned i = 0; i < crush->max_rules; ++i) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j = 0; j < r->len; ++j) { + if (r->steps[j].op == CRUSH_RULE_TAKE) { + int step_item = r->steps[j].arg1; + list unordered; + int rc = _get_leaves(step_item, &unordered); + if (rc < 0) { + return rc; // propagate fatal errors! 
+ } + bool match = false; + for (auto &o: unordered) { + ceph_assert(o >= 0); + if (o == osd) { + match = true; + break; + } + } + if (match) { + rules->insert(i); + break; + } + } + } + } + return 0; +} + +bool CrushWrapper::_class_is_dead(int class_id) +{ + for (auto &p: class_map) { + if (p.first >= 0 && p.second == class_id) { + return false; + } + } + for (unsigned i = 0; i < crush->max_rules; ++i) { + crush_rule *r = crush->rules[i]; + if (!r) + continue; + for (unsigned j = 0; j < r->len; ++j) { + if (r->steps[j].op == CRUSH_RULE_TAKE) { + int root = r->steps[j].arg1; + for (auto &p : class_bucket) { + auto& q = p.second; + if (q.count(class_id) && q[class_id] == root) { + return false; + } + } + } + } + } + // no more referenced by any devices or crush rules + return true; +} + +void CrushWrapper::cleanup_dead_classes() +{ + auto p = class_name.begin(); + while (p != class_name.end()) { + if (_class_is_dead(p->first)) { + string n = p->second; + ++p; + remove_class_name(n); + } else { + ++p; + } + } +} + +int CrushWrapper::rebuild_roots_with_classes(CephContext *cct) +{ + std::map > old_class_bucket = class_bucket; + cleanup_dead_classes(); + int r = trim_roots_with_class(cct); + if (r < 0) + return r; + class_bucket.clear(); + return populate_classes(old_class_bucket); +} + +void CrushWrapper::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + ceph_assert(crush); + + __u32 magic = CRUSH_MAGIC; + encode(magic, bl); + + encode(crush->max_buckets, bl); + encode(crush->max_rules, bl); + encode(crush->max_devices, bl); + + bool encode_compat_choose_args = false; + crush_choose_arg_map arg_map; + memset(&arg_map, '\0', sizeof(arg_map)); + if (has_choose_args() && + !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) { + ceph_assert(!has_incompat_choose_args()); + encode_compat_choose_args = true; + arg_map = choose_args.begin()->second; + } + + // buckets + for (int i=0; imax_buckets; i++) { + __u32 alg = 0; + if (crush->buckets[i]) alg = 
crush->buckets[i]->alg; + encode(alg, bl); + if (!alg) + continue; + + encode(crush->buckets[i]->id, bl); + encode(crush->buckets[i]->type, bl); + encode(crush->buckets[i]->alg, bl); + encode(crush->buckets[i]->hash, bl); + encode(crush->buckets[i]->weight, bl); + encode(crush->buckets[i]->size, bl); + for (unsigned j=0; jbuckets[i]->size; j++) + encode(crush->buckets[i]->items[j], bl); + + switch (crush->buckets[i]->alg) { + case CRUSH_BUCKET_UNIFORM: + encode((reinterpret_cast(crush->buckets[i]))->item_weight, bl); + break; + + case CRUSH_BUCKET_LIST: + for (unsigned j=0; jbuckets[i]->size; j++) { + encode((reinterpret_cast(crush->buckets[i]))->item_weights[j], bl); + encode((reinterpret_cast(crush->buckets[i]))->sum_weights[j], bl); + } + break; + + case CRUSH_BUCKET_TREE: + encode((reinterpret_cast(crush->buckets[i]))->num_nodes, bl); + for (unsigned j=0; j<(reinterpret_cast(crush->buckets[i]))->num_nodes; j++) + encode((reinterpret_cast(crush->buckets[i]))->node_weights[j], bl); + break; + + case CRUSH_BUCKET_STRAW: + for (unsigned j=0; jbuckets[i]->size; j++) { + encode((reinterpret_cast(crush->buckets[i]))->item_weights[j], bl); + encode((reinterpret_cast(crush->buckets[i]))->straws[j], bl); + } + break; + + case CRUSH_BUCKET_STRAW2: + { + __u32 *weights; + if (encode_compat_choose_args && + arg_map.args[i].weight_set_positions > 0) { + weights = arg_map.args[i].weight_set[0].weights; + } else { + weights = (reinterpret_cast(crush->buckets[i]))->item_weights; + } + for (unsigned j=0; jbuckets[i]->size; j++) { + encode(weights[j], bl); + } + } + break; + + default: + ceph_abort(); + break; + } + } + + // rules + for (unsigned i=0; imax_rules; i++) { + __u32 yes = crush->rules[i] ? 
1:0; + encode(yes, bl); + if (!yes) + continue; + + encode(crush->rules[i]->len, bl); + encode(crush->rules[i]->mask, bl); + for (unsigned j=0; jrules[i]->len; j++) + encode(crush->rules[i]->steps[j], bl); + } + + // name info + encode(type_map, bl); + encode(name_map, bl); + encode(rule_name_map, bl); + + // tunables + encode(crush->choose_local_tries, bl); + encode(crush->choose_local_fallback_tries, bl); + encode(crush->choose_total_tries, bl); + encode(crush->chooseleaf_descend_once, bl); + encode(crush->chooseleaf_vary_r, bl); + encode(crush->straw_calc_version, bl); + encode(crush->allowed_bucket_algs, bl); + if (features & CEPH_FEATURE_CRUSH_TUNABLES5) { + encode(crush->chooseleaf_stable, bl); + } + + if (HAVE_FEATURE(features, SERVER_LUMINOUS)) { + // device classes + encode(class_map, bl); + encode(class_name, bl); + encode(class_bucket, bl); + + // choose args + __u32 size = (__u32)choose_args.size(); + encode(size, bl); + for (auto c : choose_args) { + encode(c.first, bl); + crush_choose_arg_map arg_map = c.second; + size = 0; + for (__u32 i = 0; i < arg_map.size; i++) { + crush_choose_arg *arg = &arg_map.args[i]; + if (arg->weight_set_positions == 0 && + arg->ids_size == 0) + continue; + size++; + } + encode(size, bl); + for (__u32 i = 0; i < arg_map.size; i++) { + crush_choose_arg *arg = &arg_map.args[i]; + if (arg->weight_set_positions == 0 && + arg->ids_size == 0) + continue; + encode(i, bl); + encode(arg->weight_set_positions, bl); + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + crush_weight_set *weight_set = &arg->weight_set[j]; + encode(weight_set->size, bl); + for (__u32 k = 0; k < weight_set->size; k++) + encode(weight_set->weights[k], bl); + } + encode(arg->ids_size, bl); + for (__u32 j = 0; j < arg->ids_size; j++) + encode(arg->ids[j], bl); + } + } + } +} + +static void decode_32_or_64_string_map(map& m, bufferlist::const_iterator& blp) +{ + m.clear(); + __u32 n; + decode(n, blp); + while (n--) { + __s32 key; + decode(key, blp); + 
+ __u32 strlen; + decode(strlen, blp); + if (strlen == 0) { + // der, key was actually 64-bits! + decode(strlen, blp); + } + decode_nohead(strlen, m[key], blp); + } +} + +void CrushWrapper::decode(bufferlist::const_iterator& blp) +{ + using ceph::decode; + create(); + + __u32 magic; + decode(magic, blp); + if (magic != CRUSH_MAGIC) + throw buffer::malformed_input("bad magic number"); + + decode(crush->max_buckets, blp); + decode(crush->max_rules, blp); + decode(crush->max_devices, blp); + + // legacy tunables, unless we decode something newer + set_tunables_legacy(); + + try { + // buckets + crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*)); + for (int i=0; imax_buckets; i++) { + decode_crush_bucket(&crush->buckets[i], blp); + } + + // rules + crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*)); + for (unsigned i = 0; i < crush->max_rules; ++i) { + __u32 yes; + decode(yes, blp); + if (!yes) { + crush->rules[i] = NULL; + continue; + } + + __u32 len; + decode(len, blp); + crush->rules[i] = reinterpret_cast(calloc(1, crush_rule_size(len))); + crush->rules[i]->len = len; + decode(crush->rules[i]->mask, blp); + for (unsigned j=0; jrules[i]->len; j++) + decode(crush->rules[i]->steps[j], blp); + } + + // name info + // NOTE: we had a bug where we were incoding int instead of int32, which means the + // 'key' field for these maps may be either 32 or 64 bits, depending. tolerate + // both by assuming the string is always non-empty. 
+ decode_32_or_64_string_map(type_map, blp); + decode_32_or_64_string_map(name_map, blp); + decode_32_or_64_string_map(rule_name_map, blp); + + // tunables + if (!blp.end()) { + decode(crush->choose_local_tries, blp); + decode(crush->choose_local_fallback_tries, blp); + decode(crush->choose_total_tries, blp); + } + if (!blp.end()) { + decode(crush->chooseleaf_descend_once, blp); + } + if (!blp.end()) { + decode(crush->chooseleaf_vary_r, blp); + } + if (!blp.end()) { + decode(crush->straw_calc_version, blp); + } + if (!blp.end()) { + decode(crush->allowed_bucket_algs, blp); + } + if (!blp.end()) { + decode(crush->chooseleaf_stable, blp); + } + if (!blp.end()) { + decode(class_map, blp); + decode(class_name, blp); + for (auto &c : class_name) + class_rname[c.second] = c.first; + decode(class_bucket, blp); + } + if (!blp.end()) { + __u32 choose_args_size; + decode(choose_args_size, blp); + for (__u32 i = 0; i < choose_args_size; i++) { + typename decltype(choose_args)::key_type choose_args_index; + decode(choose_args_index, blp); + crush_choose_arg_map arg_map; + arg_map.size = crush->max_buckets; + arg_map.args = static_cast(calloc( + arg_map.size, sizeof(crush_choose_arg))); + __u32 size; + decode(size, blp); + for (__u32 j = 0; j < size; j++) { + __u32 bucket_index; + decode(bucket_index, blp); + ceph_assert(bucket_index < arg_map.size); + crush_choose_arg *arg = &arg_map.args[bucket_index]; + decode(arg->weight_set_positions, blp); + if (arg->weight_set_positions) { + arg->weight_set = static_cast(calloc( + arg->weight_set_positions, sizeof(crush_weight_set))); + for (__u32 k = 0; k < arg->weight_set_positions; k++) { + crush_weight_set *weight_set = &arg->weight_set[k]; + decode(weight_set->size, blp); + weight_set->weights = (__u32*)calloc( + weight_set->size, sizeof(__u32)); + for (__u32 l = 0; l < weight_set->size; l++) + decode(weight_set->weights[l], blp); + } + } + decode(arg->ids_size, blp); + if (arg->ids_size) { + ceph_assert(arg->ids_size == 
crush->buckets[bucket_index]->size); + arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32)); + for (__u32 k = 0; k < arg->ids_size; k++) + decode(arg->ids[k], blp); + } + } + choose_args[choose_args_index] = arg_map; + } + } + update_choose_args(nullptr); // in case we decode a legacy "corrupted" map + finalize(); + } + catch (...) { + crush_destroy(crush); + throw; + } +} + +void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp) +{ + using ceph::decode; + __u32 alg; + decode(alg, blp); + if (!alg) { + *bptr = NULL; + return; + } + + int size = 0; + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(crush_bucket_straw); + break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(crush_bucket_straw2); + break; + default: + { + char str[128]; + snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg); + throw buffer::malformed_input(str); + } + } + crush_bucket *bucket = reinterpret_cast(calloc(1, size)); + *bptr = bucket; + + decode(bucket->id, blp); + decode(bucket->type, blp); + decode(bucket->alg, blp); + decode(bucket->hash, blp); + decode(bucket->weight, blp); + decode(bucket->size, blp); + + bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32)); + for (unsigned j = 0; j < bucket->size; ++j) { + decode(bucket->items[j], blp); + } + + switch (bucket->alg) { + case CRUSH_BUCKET_UNIFORM: + decode((reinterpret_cast(bucket))->item_weight, blp); + break; + + case CRUSH_BUCKET_LIST: { + crush_bucket_list* cbl = reinterpret_cast(bucket); + cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32)); + cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32)); + + for (unsigned j = 0; j < bucket->size; ++j) { + decode(cbl->item_weights[j], blp); + decode(cbl->sum_weights[j], 
blp); + } + break; + } + + case CRUSH_BUCKET_TREE: { + crush_bucket_tree* cbt = reinterpret_cast(bucket); + decode(cbt->num_nodes, blp); + cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32)); + for (unsigned j=0; jnum_nodes; j++) { + decode(cbt->node_weights[j], blp); + } + break; + } + + case CRUSH_BUCKET_STRAW: { + crush_bucket_straw* cbs = reinterpret_cast(bucket); + cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32)); + cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32)); + for (unsigned j = 0; j < bucket->size; ++j) { + decode(cbs->item_weights[j], blp); + decode(cbs->straws[j], blp); + } + break; + } + + case CRUSH_BUCKET_STRAW2: { + crush_bucket_straw2* cbs = reinterpret_cast(bucket); + cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32)); + for (unsigned j = 0; j < bucket->size; ++j) { + decode(cbs->item_weights[j], blp); + } + break; + } + + default: + // We should have handled this case in the first switch statement + ceph_abort(); + break; + } +} + + +void CrushWrapper::dump(Formatter *f) const +{ + f->open_array_section("devices"); + for (int i=0; iopen_object_section("device"); + f->dump_int("id", i); + const char *n = get_item_name(i); + if (n) { + f->dump_string("name", n); + } else { + char name[20]; + sprintf(name, "device%d", i); + f->dump_string("name", name); + } + const char *device_class = get_item_class(i); + if (device_class != NULL) + f->dump_string("class", device_class); + f->close_section(); + } + f->close_section(); + + f->open_array_section("types"); + int n = get_num_type_names(); + for (int i=0; n; i++) { + const char *name = get_type_name(i); + if (!name) { + if (i == 0) { + f->open_object_section("type"); + f->dump_int("type_id", 0); + f->dump_string("name", "device"); + f->close_section(); + } + continue; + } + n--; + f->open_object_section("type"); + f->dump_int("type_id", i); + f->dump_string("name", name); + f->close_section(); + } + f->close_section(); + + 
f->open_array_section("buckets"); + for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) { + if (!bucket_exists(bucket)) + continue; + f->open_object_section("bucket"); + f->dump_int("id", bucket); + if (get_item_name(bucket)) + f->dump_string("name", get_item_name(bucket)); + f->dump_int("type_id", get_bucket_type(bucket)); + if (get_type_name(get_bucket_type(bucket))) + f->dump_string("type_name", get_type_name(get_bucket_type(bucket))); + f->dump_int("weight", get_bucket_weight(bucket)); + f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket))); + f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket))); + f->open_array_section("items"); + for (int j=0; jopen_object_section("item"); + f->dump_int("id", get_bucket_item(bucket, j)); + f->dump_int("weight", get_bucket_item_weight(bucket, j)); + f->dump_int("pos", j); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + + f->open_array_section("rules"); + dump_rules(f); + f->close_section(); + + f->open_object_section("tunables"); + dump_tunables(f); + f->close_section(); + + dump_choose_args(f); +} + +namespace { + // depth first walker + class TreeDumper { + typedef CrushTreeDumper::Item Item; + const CrushWrapper *crush; + const CrushTreeDumper::name_map_t& weight_set_names; + public: + explicit TreeDumper(const CrushWrapper *crush, + const CrushTreeDumper::name_map_t& wsnames) + : crush(crush), weight_set_names(wsnames) {} + + void dump(Formatter *f) { + set roots; + crush->find_roots(&roots); + for (set::iterator root = roots.begin(); root != roots.end(); ++root) { + dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f); + } + } + + private: + void dump_item(const Item& qi, Formatter* f) { + if (qi.is_bucket()) { + f->open_object_section("bucket"); + CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + dump_bucket_children(qi, f); + f->close_section(); + } else { + f->open_object_section("device"); + 
CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f); + f->close_section(); + } + } + + void dump_bucket_children(const Item& parent, Formatter* f) { + f->open_array_section("items"); + const int max_pos = crush->get_bucket_size(parent.id); + for (int pos = 0; pos < max_pos; pos++) { + int id = crush->get_bucket_item(parent.id, pos); + float weight = crush->get_bucket_item_weightf(parent.id, pos); + dump_item(Item(id, parent.id, parent.depth + 1, weight), f); + } + f->close_section(); + } + }; +} + +void CrushWrapper::dump_tree( + Formatter *f, + const CrushTreeDumper::name_map_t& weight_set_names) const +{ + ceph_assert(f); + TreeDumper(this, weight_set_names).dump(f); +} + +void CrushWrapper::dump_tunables(Formatter *f) const +{ + f->dump_int("choose_local_tries", get_choose_local_tries()); + f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries()); + f->dump_int("choose_total_tries", get_choose_total_tries()); + f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once()); + f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r()); + f->dump_int("chooseleaf_stable", get_chooseleaf_stable()); + f->dump_int("straw_calc_version", get_straw_calc_version()); + f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs()); + + // be helpful about it + if (has_jewel_tunables()) + f->dump_string("profile", "jewel"); + else if (has_hammer_tunables()) + f->dump_string("profile", "hammer"); + else if (has_firefly_tunables()) + f->dump_string("profile", "firefly"); + else if (has_bobtail_tunables()) + f->dump_string("profile", "bobtail"); + else if (has_argonaut_tunables()) + f->dump_string("profile", "argonaut"); + else + f->dump_string("profile", "unknown"); + f->dump_int("optimal_tunables", (int)has_optimal_tunables()); + f->dump_int("legacy_tunables", (int)has_legacy_tunables()); + + // be helpful about minimum version required + f->dump_string("minimum_required_version", get_min_required_version()); + + 
f->dump_int("require_feature_tunables", (int)has_nondefault_tunables()); + f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2()); + f->dump_int("has_v2_rules", (int)has_v2_rules()); + f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3()); + f->dump_int("has_v3_rules", (int)has_v3_rules()); + f->dump_int("has_v4_buckets", (int)has_v4_buckets()); + f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5()); + f->dump_int("has_v5_rules", (int)has_v5_rules()); +} + +void CrushWrapper::dump_choose_args(Formatter *f) const +{ + f->open_object_section("choose_args"); + for (auto c : choose_args) { + crush_choose_arg_map arg_map = c.second; + f->open_array_section(stringify(c.first).c_str()); + for (__u32 i = 0; i < arg_map.size; i++) { + crush_choose_arg *arg = &arg_map.args[i]; + if (arg->weight_set_positions == 0 && + arg->ids_size == 0) + continue; + f->open_object_section("choose_args"); + int bucket_index = i; + f->dump_int("bucket_id", -1-bucket_index); + if (arg->weight_set_positions > 0) { + f->open_array_section("weight_set"); + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + f->open_array_section("weights"); + __u32 *weights = arg->weight_set[j].weights; + __u32 size = arg->weight_set[j].size; + for (__u32 k = 0; k < size; k++) { + f->dump_float("weight", (float)weights[k]/(float)0x10000); + } + f->close_section(); + } + f->close_section(); + } + if (arg->ids_size > 0) { + f->open_array_section("ids"); + for (__u32 j = 0; j < arg->ids_size; j++) + f->dump_int("id", arg->ids[j]); + f->close_section(); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void CrushWrapper::dump_rules(Formatter *f) const +{ + for (int i=0; iopen_object_section("rule"); + f->dump_int("rule_id", ruleset); + if (get_rule_name(ruleset)) + f->dump_string("rule_name", get_rule_name(ruleset)); + f->dump_int("ruleset", get_rule_mask_ruleset(ruleset)); + f->dump_int("type", 
get_rule_mask_type(ruleset)); + f->dump_int("min_size", get_rule_mask_min_size(ruleset)); + f->dump_int("max_size", get_rule_mask_max_size(ruleset)); + f->open_array_section("steps"); + for (int j=0; jopen_object_section("step"); + switch (get_rule_op(ruleset, j)) { + case CRUSH_RULE_NOOP: + f->dump_string("op", "noop"); + break; + case CRUSH_RULE_TAKE: + f->dump_string("op", "take"); + { + int item = get_rule_arg1(ruleset, j); + f->dump_int("item", item); + + const char *name = get_item_name(item); + f->dump_string("item_name", name ? name : ""); + } + break; + case CRUSH_RULE_EMIT: + f->dump_string("op", "emit"); + break; + case CRUSH_RULE_CHOOSE_FIRSTN: + f->dump_string("op", "choose_firstn"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j))); + break; + case CRUSH_RULE_CHOOSE_INDEP: + f->dump_string("op", "choose_indep"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j))); + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + f->dump_string("op", "chooseleaf_firstn"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j))); + break; + case CRUSH_RULE_CHOOSELEAF_INDEP: + f->dump_string("op", "chooseleaf_indep"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + f->dump_string("type", get_type_name(get_rule_arg2(ruleset, j))); + break; + case CRUSH_RULE_SET_CHOOSE_TRIES: + f->dump_string("op", "set_choose_tries"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + break; + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + f->dump_string("op", "set_chooseleaf_tries"); + f->dump_int("num", get_rule_arg1(ruleset, j)); + break; + default: + f->dump_int("opcode", get_rule_op(ruleset, j)); + f->dump_int("arg1", get_rule_arg1(ruleset, j)); + f->dump_int("arg2", get_rule_arg2(ruleset, j)); + } + f->close_section(); + } + f->close_section(); + f->close_section(); +} + +void 
CrushWrapper::list_rules(Formatter *f) const +{ + for (int rule = 0; rule < get_max_rules(); rule++) { + if (!rule_exists(rule)) + continue; + f->dump_string("name", get_rule_name(rule)); + } +} + +void CrushWrapper::list_rules(ostream *ss) const +{ + for (int rule = 0; rule < get_max_rules(); rule++) { + if (!rule_exists(rule)) + continue; + *ss << get_rule_name(rule) << "\n"; + } +} + +class CrushTreePlainDumper : public CrushTreeDumper::Dumper { +public: + typedef CrushTreeDumper::Dumper Parent; + + explicit CrushTreePlainDumper(const CrushWrapper *crush, + const CrushTreeDumper::name_map_t& wsnames) + : Parent(crush, wsnames) {} + explicit CrushTreePlainDumper(const CrushWrapper *crush, + const CrushTreeDumper::name_map_t& wsnames, + bool show_shadow) + : Parent(crush, wsnames, show_shadow) {} + + + void dump(TextTable *tbl) { + tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT); + tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT); + for (auto& p : crush->choose_args) { + if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) { + tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT); + } else { + string name; + auto q = weight_set_names.find(p.first); + name = q != weight_set_names.end() ? 
q->second : + stringify(p.first); + tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT); + } + } + tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT); + Parent::dump(tbl); + } + +protected: + void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override { + const char *c = crush->get_item_class(qi.id); + if (!c) + c = ""; + *tbl << qi.id + << c + << weightf_t(qi.weight); + for (auto& p : crush->choose_args) { + if (qi.parent < 0) { + const crush_choose_arg_map cmap = crush->choose_args_get(p.first); + int bidx = -1 - qi.parent; + const crush_bucket *b = crush->get_bucket(qi.parent); + if (b && + bidx < (int)cmap.size && + cmap.args[bidx].weight_set && + cmap.args[bidx].weight_set_positions >= 1) { + int pos; + for (pos = 0; + pos < (int)cmap.args[bidx].weight_set[0].size && + b->items[pos] != qi.id; + ++pos) ; + *tbl << weightf_t((float)cmap.args[bidx].weight_set[0].weights[pos] / + (float)0x10000); + continue; + } + } + *tbl << ""; + } + ostringstream ss; + for (int k=0; k < qi.depth; k++) { + ss << " "; + } + if (qi.is_bucket()) { + ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " " + << crush->get_item_name(qi.id); + } else { + ss << "osd." << qi.id; + } + *tbl << ss.str(); + *tbl << TextTable::endrow; + } +}; + + +class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper { +public: + typedef CrushTreeDumper::FormattingDumper Parent; + + explicit CrushTreeFormattingDumper( + const CrushWrapper *crush, + const CrushTreeDumper::name_map_t& wsnames) + : Parent(crush, wsnames) {} + + explicit CrushTreeFormattingDumper( + const CrushWrapper *crush, + const CrushTreeDumper::name_map_t& wsnames, + bool show_shadow) + : Parent(crush, wsnames, show_shadow) {} + + void dump(Formatter *f) { + f->open_array_section("nodes"); + Parent::dump(f); + f->close_section(); + + // There is no stray bucket whose id is a negative number, so just get + // the max_id and iterate from 0 to max_id to dump stray osds. 
+ f->open_array_section("stray"); + int32_t max_id = -1; + if (!crush->name_map.empty()) { + max_id = crush->name_map.rbegin()->first; + } + for (int32_t i = 0; i <= max_id; i++) { + if (crush->item_exists(i) && !is_touched(i) && should_dump(i)) { + dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f); + } + } + f->close_section(); + } +}; + + +void CrushWrapper::dump_tree( + ostream *out, + Formatter *f, + const CrushTreeDumper::name_map_t& weight_set_names, + bool show_shadow) const +{ + if (out) { + TextTable tbl; + CrushTreePlainDumper(this, weight_set_names, show_shadow).dump(&tbl); + *out << tbl; + } + if (f) { + CrushTreeFormattingDumper(this, weight_set_names, show_shadow).dump(f); + } +} + +void CrushWrapper::generate_test_instances(list& o) +{ + o.push_back(new CrushWrapper); + // fixme +} + +/** + * Determine the default CRUSH ruleset ID to be used with + * newly created replicated pools. + * + * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found + */ +int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct) +{ + int crush_ruleset = cct->_conf.get_val("osd_pool_default_crush_rule"); + if (crush_ruleset < 0) { + crush_ruleset = find_first_ruleset(pg_pool_t::TYPE_REPLICATED); + } else if (!ruleset_exists(crush_ruleset)) { + crush_ruleset = -1; // match find_first_ruleset() retval + } + return crush_ruleset; +} + +bool CrushWrapper::is_valid_crush_name(const string& s) +{ + if (s.empty()) + return false; + for (string::const_iterator p = s.begin(); p != s.end(); ++p) { + if (!(*p == '-') && + !(*p == '_') && + !(*p == '.') && + !(*p >= '0' && *p <= '9') && + !(*p >= 'A' && *p <= 'Z') && + !(*p >= 'a' && *p <= 'z')) + return false; + } + return true; +} + +bool CrushWrapper::is_valid_crush_loc(CephContext *cct, + const map& loc) +{ + for (map::const_iterator l = loc.begin(); l != loc.end(); ++l) { + if (!is_valid_crush_name(l->first) || + !is_valid_crush_name(l->second)) { + ldout(cct, 1) << "loc[" + << l->first << "] = '" + 
<< l->second << "' not a valid crush name ([A-Za-z0-9_-.]+)" + << dendl; + return false; + } + } + return true; +} + +int CrushWrapper::_choose_type_stack( + CephContext *cct, + const vector>& stack, + const set& overfull, + const vector& underfull, + const vector& more_underfull, + const vector& orig, + vector::const_iterator& i, + set& used, + vector *pw, + int root_bucket, + int rule) const +{ + vector w = *pw; + vector o; + + ldout(cct, 10) << __func__ << " stack " << stack + << " orig " << orig + << " at " << *i + << " pw " << *pw + << dendl; + ceph_assert(root_bucket < 0); + vector cumulative_fanout(stack.size()); + int f = 1; + for (int j = (int)stack.size() - 1; j >= 0; --j) { + cumulative_fanout[j] = f; + f *= stack[j].second; + } + ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout + << dendl; + + // identify underfull targets for each intermediate level. + // this serves two purposes: + // 1. we can tell when we are selecting a bucket that does not have any underfull + // devices beneath it. that means that if the current input includes an overfull + // device, we won't be able to find an underfull device with this parent to + // swap for it. + // 2. when we decide we should reject a bucket due to the above, this list gives us + // a list of peers to consider that *do* have underfull devices available.. (we + // are careful to pick one that has the same parent.) 
+ vector> underfull_buckets; // level -> set of buckets with >0 underfull item(s) + underfull_buckets.resize(stack.size() - 1); + for (auto osd : underfull) { + int item = osd; + for (int j = (int)stack.size() - 2; j >= 0; --j) { + int type = stack[j].first; + item = get_parent_of_type(item, type, rule); + ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type + << " is " << item << dendl; + if (!subtree_contains(root_bucket, item)) { + ldout(cct, 20) << __func__ << " not in root subtree " << root_bucket << dendl; + continue; + } + underfull_buckets[j].insert(item); + } + } + ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl; + + for (unsigned j = 0; j < stack.size(); ++j) { + int type = stack[j].first; + int fanout = stack[j].second; + int cum_fanout = cumulative_fanout[j]; + ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout + << " cumulative " << cum_fanout + << " w " << w << dendl; + vector o; + auto tmpi = i; + if (i == orig.end()) { + ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl; + break; + } + for (auto from : w) { + ldout(cct, 10) << " from " << from << dendl; + // identify leaves under each choice. we use this to check whether any of these + // leaves are overfull. (if so, we need to make sure there are underfull candidates + // to swap for them.) 
+ vector> leaves; + leaves.resize(fanout); + for (int pos = 0; pos < fanout; ++pos) { + if (type > 0) { + // non-leaf + int item = get_parent_of_type(*tmpi, type, rule); + o.push_back(item); + int n = cum_fanout; + while (n-- && tmpi != orig.end()) { + leaves[pos].insert(*tmpi++); + } + ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item + << " of type " << type << " over leaves " << leaves[pos] << dendl; + } else { + // leaf + bool replaced = false; + if (overfull.count(*i)) { + for (auto item : underfull) { + ldout(cct, 10) << __func__ << " pos " << pos + << " was " << *i << " considering " << item + << dendl; + if (used.count(item)) { + ldout(cct, 20) << __func__ << " in used " << used << dendl; + continue; + } + if (!subtree_contains(from, item)) { + ldout(cct, 20) << __func__ << " not in subtree " << from << dendl; + continue; + } + if (std::find(orig.begin(), orig.end(), item) != orig.end()) { + ldout(cct, 20) << __func__ << " in orig " << orig << dendl; + continue; + } + o.push_back(item); + used.insert(item); + ldout(cct, 10) << __func__ << " pos " << pos << " replace " + << *i << " -> " << item << dendl; + replaced = true; + ceph_assert(i != orig.end()); + ++i; + break; + } + if (!replaced) { + for (auto item : more_underfull) { + ldout(cct, 10) << __func__ << " more underfull pos " << pos + << " was " << *i << " considering " << item + << dendl; + if (used.count(item)) { + ldout(cct, 20) << __func__ << " in used " << used << dendl; + continue; + } + if (!subtree_contains(from, item)) { + ldout(cct, 20) << __func__ << " not in subtree " << from << dendl; + continue; + } + if (std::find(orig.begin(), orig.end(), item) != orig.end()) { + ldout(cct, 20) << __func__ << " in orig " << orig << dendl; + continue; + } + o.push_back(item); + used.insert(item); + ldout(cct, 10) << __func__ << " pos " << pos << " replace " + << *i << " -> " << item << dendl; + replaced = true; + assert(i != orig.end()); + ++i; + break; + } + } + } + if (!replaced) { + 
ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i + << dendl; + ceph_assert(i != orig.end()); + o.push_back(*i); + ++i; + } + if (i == orig.end()) { + ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl; + break; + } + } + } + if (j + 1 < stack.size()) { + // check if any buckets have overfull leaves but no underfull candidates + for (int pos = 0; pos < fanout; ++pos) { + if (underfull_buckets[j].count(o[pos]) == 0) { + // are any leaves overfull? + bool any_overfull = false; + for (auto osd : leaves[pos]) { + if (overfull.count(osd)) { + any_overfull = true; + break; + } + } + if (any_overfull) { + ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and " + << ">0 leaves " << leaves[pos] << " is overfull; alts " + << underfull_buckets[j] + << dendl; + for (auto alt : underfull_buckets[j]) { + if (std::find(o.begin(), o.end(), alt) == o.end()) { + // see if alt has the same parent + if (j == 0 || + get_parent_of_type(o[pos], stack[j-1].first, rule) == + get_parent_of_type(alt, stack[j-1].first, rule)) { + if (j) + ldout(cct, 10) << " replacing " << o[pos] + << " (which has no underfull leaves) with " << alt + << " (same parent " + << get_parent_of_type(alt, stack[j-1].first, rule) << " type " + << type << ")" << dendl; + else + ldout(cct, 10) << " replacing " << o[pos] + << " (which has no underfull leaves) with " << alt + << " (first level)" << dendl; + o[pos] = alt; + break; + } else { + ldout(cct, 30) << " alt " << alt << " for " << o[pos] + << " has different parent, skipping" << dendl; + } + } + } + } + } + } + } + if (i == orig.end()) { + ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl; + break; + } + } + ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl; + w.swap(o); + } + *pw = w; + return 0; +} + +int CrushWrapper::try_remap_rule( + CephContext *cct, + int ruleno, + int maxout, + const set& overfull, + const vector& underfull, + const vector& more_underfull, + const vector& orig, + 
vector *out) const +{ + const crush_map *map = crush; + const crush_rule *rule = get_rule(ruleno); + ceph_assert(rule); + + ldout(cct, 10) << __func__ << " ruleno " << ruleno + << " numrep " << maxout << " overfull " << overfull + << " underfull " << underfull + << " more_underfull " << more_underfull + << " orig " << orig + << dendl; + vector w; // working set + out->clear(); + + auto i = orig.begin(); + set used; + + vector> type_stack; // (type, fan-out) + int root_bucket = 0; + for (unsigned step = 0; step < rule->len; ++step) { + const crush_rule_step *curstep = &rule->steps[step]; + ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl; + switch (curstep->op) { + case CRUSH_RULE_TAKE: + if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || + (-1-curstep->arg1 >= 0 && -1-curstep->arg1 < map->max_buckets && + map->buckets[-1-curstep->arg1])) { + w.clear(); + w.push_back(curstep->arg1); + root_bucket = curstep->arg1; + ldout(cct, 10) << __func__ << " take " << w << dendl; + } else { + ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl; + } + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSELEAF_INDEP: + { + int numrep = curstep->arg1; + int type = curstep->arg2; + if (numrep <= 0) + numrep += maxout; + type_stack.push_back(make_pair(type, numrep)); + if (type > 0) + type_stack.push_back(make_pair(0, 1)); + int r = _choose_type_stack(cct, type_stack, overfull, underfull, more_underfull, orig, + i, used, &w, root_bucket, ruleno); + if (r < 0) + return r; + type_stack.clear(); + } + break; + + case CRUSH_RULE_CHOOSE_FIRSTN: + case CRUSH_RULE_CHOOSE_INDEP: + { + int numrep = curstep->arg1; + int type = curstep->arg2; + if (numrep <= 0) + numrep += maxout; + type_stack.push_back(make_pair(type, numrep)); + } + break; + + case CRUSH_RULE_EMIT: + ldout(cct, 10) << " emit " << w << dendl; + if (!type_stack.empty()) { + int r = _choose_type_stack(cct, type_stack, overfull, underfull, more_underfull, orig, + i, 
used, &w, root_bucket, ruleno); + if (r < 0) + return r; + type_stack.clear(); + } + for (auto item : w) { + out->push_back(item); + } + w.clear(); + break; + + default: + // ignore + break; + } + } + + return 0; +} + + +int CrushWrapper::_choose_args_adjust_item_weight_in_bucket( + CephContext *cct, + crush_choose_arg_map cmap, + int bucketid, + int id, + const vector& weight, + ostream *ss) +{ + int changed = 0; + int bidx = -1 - bucketid; + crush_bucket *b = crush->buckets[bidx]; + if (bidx >= (int)cmap.size) { + if (ss) + *ss << "no weight-set for bucket " << b->id; + ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id + << dendl; + return 0; + } + crush_choose_arg *carg = &cmap.args[bidx]; + if (carg->weight_set == NULL) { + // create a weight-set for this bucket and populate it with the + // bucket weights + unsigned positions = get_choose_args_positions(cmap); + carg->weight_set_positions = positions; + carg->weight_set = static_cast( + calloc(sizeof(crush_weight_set), positions)); + for (unsigned p = 0; p < positions; ++p) { + carg->weight_set[p].size = b->size; + carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32)); + for (unsigned i = 0; i < b->size; ++i) { + carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i); + } + } + changed++; + } + if (carg->weight_set_positions != weight.size()) { + if (ss) + *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id; + ldout(cct, 10) << __func__ << " weight_set_positions != " << weight.size() + << " for bucket " << b->id << dendl; + return 0; + } + for (unsigned i = 0; i < b->size; i++) { + if (b->items[i] == id) { + for (unsigned j = 0; j < weight.size(); ++j) { + carg->weight_set[j].weights[i] = weight[j]; + } + ldout(cct, 5) << __func__ << " set " << id << " to " << weight + << " in bucket " << b->id << dendl; + changed++; + } + } + if (changed) { + vector bucket_weight(weight.size(), 0); + for (unsigned i = 0; i < b->size; i++) { + for 
(unsigned j = 0; j < weight.size(); ++j) { + bucket_weight[j] += carg->weight_set[j].weights[i]; + } + } + choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr); + } + return changed; +} + +int CrushWrapper::choose_args_adjust_item_weight( + CephContext *cct, + crush_choose_arg_map cmap, + int id, + const vector& weight, + ostream *ss) +{ + ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl; + int changed = 0; + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == nullptr) { + continue; + } + changed += _choose_args_adjust_item_weight_in_bucket( + cct, cmap, b->id, id, weight, ss); + } + if (!changed) { + if (ss) + *ss << "item " << id << " not found in crush map"; + return -ENOENT; + } + return changed; +} diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h new file mode 100644 index 00000000..136ad538 --- /dev/null +++ b/src/crush/CrushWrapper.h @@ -0,0 +1,1657 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_CRUSH_WRAPPER_H +#define CEPH_CRUSH_WRAPPER_H + +#include +#include +#include +#include + +#include + +#include "include/types.h" + +extern "C" { +#include "crush.h" +#include "hash.h" +#include "mapper.h" +#include "builder.h" +} + +#include "include/ceph_assert.h" +#include "include/err.h" +#include "include/encoding.h" +#include "include/mempool.h" + +#include "common/Mutex.h" + +namespace ceph { + class Formatter; +} + +namespace CrushTreeDumper { + typedef mempool::osdmap::map name_map_t; +} + +WRITE_RAW_ENCODER(crush_rule_mask) // it's all u8's + +inline void encode(const crush_rule_step &s, bufferlist &bl) +{ + using ceph::encode; + encode(s.op, bl); + encode(s.arg1, bl); + encode(s.arg2, bl); +} +inline void decode(crush_rule_step &s, bufferlist::const_iterator &p) +{ + using ceph::decode; + decode(s.op, p); + decode(s.arg1, p); + decode(s.arg2, p); +} + +class CrushWrapper 
{ +public: + // magic value used by OSDMap for a "default" fallback choose_args, used if + // the choose_arg_map passed to do_rule does not exist. if this also + // doesn't exist, fall back to canonical weights. + enum { + DEFAULT_CHOOSE_ARGS = -1 + }; + + std::map type_map; /* bucket/device type names */ + std::map name_map; /* bucket/device names */ + std::map rule_name_map; + + std::map class_map; /* item id -> class id */ + std::map class_name; /* class id -> class name */ + std::map class_rname; /* class name -> class id */ + std::map > class_bucket; /* bucket[id][class] == id */ + std::map choose_args; + +private: + struct crush_map *crush = nullptr; + + bool have_uniform_rules = false; + + /* reverse maps */ + mutable bool have_rmaps = false; + mutable std::map type_rmap, name_rmap, rule_name_rmap; + void build_rmaps() const { + if (have_rmaps) return; + build_rmap(type_map, type_rmap); + build_rmap(name_map, name_rmap); + build_rmap(rule_name_map, rule_name_rmap); + have_rmaps = true; + } + void build_rmap(const map &f, std::map &r) const { + r.clear(); + for (std::map::const_iterator p = f.begin(); p != f.end(); ++p) + r[p->second] = p->first; + } + +public: + CrushWrapper(const CrushWrapper& other); + const CrushWrapper& operator=(const CrushWrapper& other); + + CrushWrapper() { + create(); + } + ~CrushWrapper() { + if (crush) + crush_destroy(crush); + choose_args_clear(); + } + + crush_map *get_crush_map() { return crush; } + + /* building */ + void create() { + if (crush) + crush_destroy(crush); + crush = crush_create(); + choose_args_clear(); + ceph_assert(crush); + have_rmaps = false; + + set_tunables_default(); + } + + /** + * true if any rule has a rule id != its position in the array + * + * These indicate "ruleset" IDs that were created by older versions + * of Ceph. They are cleaned up in renumber_rules so that eventually + * we can remove the code for handling them. 
+ */ + bool has_legacy_rule_ids() const; + + /** + * fix rules whose ruleid != ruleset + * + * These rules were created in older versions of Ceph. The concept + * of a ruleset no longer exists. + * + * Return a map of old ID -> new ID. Caller must update OSDMap + * to use new IDs. + */ + std::map renumber_rules(); + + /// true if any buckets that aren't straw2 + bool has_non_straw2_buckets() const; + + // tunables + void set_tunables_argonaut() { + crush->choose_local_tries = 2; + crush->choose_local_fallback_tries = 5; + crush->choose_total_tries = 19; + crush->chooseleaf_descend_once = 0; + crush->chooseleaf_vary_r = 0; + crush->chooseleaf_stable = 0; + crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; + } + void set_tunables_bobtail() { + crush->choose_local_tries = 0; + crush->choose_local_fallback_tries = 0; + crush->choose_total_tries = 50; + crush->chooseleaf_descend_once = 1; + crush->chooseleaf_vary_r = 0; + crush->chooseleaf_stable = 0; + crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; + } + void set_tunables_firefly() { + crush->choose_local_tries = 0; + crush->choose_local_fallback_tries = 0; + crush->choose_total_tries = 50; + crush->chooseleaf_descend_once = 1; + crush->chooseleaf_vary_r = 1; + crush->chooseleaf_stable = 0; + crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; + } + void set_tunables_hammer() { + crush->choose_local_tries = 0; + crush->choose_local_fallback_tries = 0; + crush->choose_total_tries = 50; + crush->chooseleaf_descend_once = 1; + crush->chooseleaf_vary_r = 1; + crush->chooseleaf_stable = 0; + crush->allowed_bucket_algs = + (1 << CRUSH_BUCKET_UNIFORM) | + (1 << CRUSH_BUCKET_LIST) | + (1 << CRUSH_BUCKET_STRAW) | + (1 << CRUSH_BUCKET_STRAW2); + } + void set_tunables_jewel() { + crush->choose_local_tries = 0; + crush->choose_local_fallback_tries = 0; + crush->choose_total_tries = 50; + crush->chooseleaf_descend_once = 1; + crush->chooseleaf_vary_r = 1; + crush->chooseleaf_stable = 1; + 
crush->allowed_bucket_algs = + (1 << CRUSH_BUCKET_UNIFORM) | + (1 << CRUSH_BUCKET_LIST) | + (1 << CRUSH_BUCKET_STRAW) | + (1 << CRUSH_BUCKET_STRAW2); + } + + void set_tunables_legacy() { + set_tunables_argonaut(); + crush->straw_calc_version = 0; + } + void set_tunables_optimal() { + set_tunables_jewel(); + crush->straw_calc_version = 1; + } + void set_tunables_default() { + set_tunables_jewel(); + crush->straw_calc_version = 1; + } + + int get_choose_local_tries() const { + return crush->choose_local_tries; + } + void set_choose_local_tries(int n) { + crush->choose_local_tries = n; + } + + int get_choose_local_fallback_tries() const { + return crush->choose_local_fallback_tries; + } + void set_choose_local_fallback_tries(int n) { + crush->choose_local_fallback_tries = n; + } + + int get_choose_total_tries() const { + return crush->choose_total_tries; + } + void set_choose_total_tries(int n) { + crush->choose_total_tries = n; + } + + int get_chooseleaf_descend_once() const { + return crush->chooseleaf_descend_once; + } + void set_chooseleaf_descend_once(int n) { + crush->chooseleaf_descend_once = !!n; + } + + int get_chooseleaf_vary_r() const { + return crush->chooseleaf_vary_r; + } + void set_chooseleaf_vary_r(int n) { + crush->chooseleaf_vary_r = n; + } + + int get_chooseleaf_stable() const { + return crush->chooseleaf_stable; + } + void set_chooseleaf_stable(int n) { + crush->chooseleaf_stable = n; + } + + int get_straw_calc_version() const { + return crush->straw_calc_version; + } + void set_straw_calc_version(int n) { + crush->straw_calc_version = n; + } + + unsigned get_allowed_bucket_algs() const { + return crush->allowed_bucket_algs; + } + void set_allowed_bucket_algs(unsigned n) { + crush->allowed_bucket_algs = n; + } + + bool has_argonaut_tunables() const { + return + crush->choose_local_tries == 2 && + crush->choose_local_fallback_tries == 5 && + crush->choose_total_tries == 19 && + crush->chooseleaf_descend_once == 0 && + crush->chooseleaf_vary_r == 0 
&&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  // The predicates below compare the seven profile fields against fixed
+  // values; straw_calc_version is not part of any comparison.
+  bool has_bobtail_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 0 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  bool has_firefly_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 1 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  bool has_hammer_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 1 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+                                      (1 << CRUSH_BUCKET_LIST) |
+                                      (1 << CRUSH_BUCKET_STRAW) |
+                                      (1 << CRUSH_BUCKET_STRAW2));
+  }
+  bool has_jewel_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 1 &&
+      crush->chooseleaf_stable == 1 &&
+      crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+                                      (1 << CRUSH_BUCKET_LIST) |
+                                      (1 << CRUSH_BUCKET_STRAW) |
+                                      (1 << CRUSH_BUCKET_STRAW2));
+  }
+
+  /// "optimal" is an alias for the jewel profile
+  bool has_optimal_tunables() const {
+    return has_jewel_tunables();
+  }
+  /// "legacy" is an alias for the argonaut profile
+  bool has_legacy_tunables() const {
+    return has_argonaut_tunables();
+  }
+
+  /// true if any of the three retry counts differs from the argonaut
+  /// values (2 / 5 / 19)
+  bool has_nondefault_tunables() const {
+    return
+      (crush->choose_local_tries != 2 ||
+       crush->choose_local_fallback_tries != 5 ||
+       crush->choose_total_tries != 19);
+  }
+  bool
has_nondefault_tunables2() const {
+    // non-default means chooseleaf_descend_once was switched on
+    return crush->chooseleaf_descend_once != 0;
+  }
+  bool has_nondefault_tunables3() const {
+    return crush->chooseleaf_vary_r != 0;
+  }
+  bool has_nondefault_tunables5() const {
+    return crush->chooseleaf_stable != 0;
+  }
+
+  bool has_v2_rules() const;
+  bool has_v3_rules() const;
+  bool has_v4_buckets() const;
+  bool has_v5_rules() const;
+  bool has_choose_args() const;                 // any choose_args
+  bool has_incompat_choose_args() const;        // choose_args that can't be made compat
+
+  bool is_v2_rule(unsigned ruleid) const;
+  bool is_v3_rule(unsigned ruleid) const;
+  bool is_v5_rule(unsigned ruleid) const;
+
+  /**
+   * name of the oldest release able to handle this map
+   *
+   * Checks run from the newest feature to the oldest, so the first
+   * match wins.
+   */
+  string get_min_required_version() const {
+    if (has_v5_rules() || has_nondefault_tunables5())
+      return "jewel";
+    if (has_v4_buckets())
+      return "hammer";
+    if (has_nondefault_tunables3())
+      return "firefly";
+    if (has_nondefault_tunables2() || has_nondefault_tunables())
+      return "bobtail";
+    return "argonaut";
+  }
+
+  // default bucket types
+  /// first allowed algorithm in order of preference, or 0 if none is allowed
+  unsigned get_default_bucket_alg() const {
+    const unsigned preference[] = {
+      CRUSH_BUCKET_STRAW2,
+      CRUSH_BUCKET_STRAW,
+      CRUSH_BUCKET_TREE,
+      CRUSH_BUCKET_LIST,
+      CRUSH_BUCKET_UNIFORM,
+    };
+    for (unsigned alg : preference) {
+      if (crush->allowed_bucket_algs & (1 << alg))
+        return alg;
+    }
+    return 0;
+  }
+
+  // bucket types
+  int get_num_type_names() const {
+    return type_map.size();
+  }
+  /// largest key in type_map, or 0 when no types are defined
+  int get_max_type_id() const {
+    return type_map.empty() ? 0 : type_map.rbegin()->first;
+  }
+  /// id registered for a type name, or -1 if the name is unknown
+  int get_type_id(const string& name) const {
+    build_rmaps();
+    auto hit = type_rmap.find(name);
+    if (hit == type_rmap.end())
+      return -1;
+    return hit->second;
+  }
+  const char *get_type_name(int t) const {
+    std::map::const_iterator p = type_map.find(t);
+    if (p !=
type_map.end())
+      return p->second.c_str();
+    return 0;
+  }
+  /**
+   * set (or replace) the name of type i
+   *
+   * When the reverse maps are live, the reverse entry of the name being
+   * replaced is dropped first; otherwise the old name would keep
+   * resolving to i until the reverse maps are rebuilt.
+   */
+  void set_type_name(int i, const string& name) {
+    if (have_rmaps) {
+      auto old = type_map.find(i);
+      if (old != type_map.end()) {
+        auto stale = type_rmap.find(old->second);
+        if (stale != type_rmap.end() && stale->second == i)
+          type_rmap.erase(stale);
+      }
+    }
+    type_map[i] = name;
+    if (have_rmaps)
+      type_rmap[name] = i;
+  }
+
+  // item/bucket names
+  bool name_exists(const string& name) const {
+    build_rmaps();
+    return name_rmap.count(name);
+  }
+  bool item_exists(int i) const {
+    return name_map.count(i);
+  }
+  /**
+   * id registered for an item name
+   *
+   * Returns 0 when the name is unknown.  0 is also a valid device id, so
+   * callers that need to tell the two apart must check name_exists().
+   */
+  int get_item_id(const string& name) const {
+    build_rmaps();
+    auto hit = name_rmap.find(name);  // one lookup instead of count() + []
+    if (hit != name_rmap.end())
+      return hit->second;
+    return 0;  /* hrm */
+  }
+  const char *get_item_name(int t) const {
+    std::map::const_iterator p = name_map.find(t);
+    if (p != name_map.end())
+      return p->second.c_str();
+    return 0;
+  }
+  /**
+   * set (or replace) the name of item i
+   *
+   * @return 0 on success, -EINVAL if name is not a valid crush name
+   */
+  int set_item_name(int i, const string& name) {
+    if (!is_valid_crush_name(name))
+      return -EINVAL;
+    if (have_rmaps) {
+      // forget the previous name of i so name_exists()/get_item_id() stop
+      // resolving it; leave the entry alone if it now belongs to another id
+      auto old = name_map.find(i);
+      if (old != name_map.end()) {
+        auto stale = name_rmap.find(old->second);
+        if (stale != name_rmap.end() && stale->second == i)
+          name_rmap.erase(stale);
+      }
+    }
+    name_map[i] = name;
+    if (have_rmaps)
+      name_rmap[name] = i;
+    return 0;
+  }
+  /// exchange the names of items a and b (forward and reverse maps)
+  void swap_names(int a, int b) {
+    string an = name_map[a];
+    string bn = name_map[b];
+    name_map[a] = bn;
+    name_map[b] = an;
+    if (have_rmaps) {
+      name_rmap[an] = b;
+      name_rmap[bn] = a;
+    }
+  }
+  int split_id_class(int i, int *idout, int *classout) const;
+
+  bool class_exists(const string& name) const {
+    return class_rname.count(name);
+  }
+  const char *get_class_name(int i) const {
+    auto p = class_name.find(i);
+    if (p != class_name.end())
+      return p->second.c_str();
+    return 0;
+  }
+  /// class id for a class name, or -EINVAL if the name is unknown
+  int get_class_id(const string& name) const {
+    auto p = class_rname.find(name);
+    if (p != class_rname.end())
+      return p->second;
+    else
+      return -EINVAL;
+  }
+  /// remove a class from both name maps; -ENOENT if either side is missing
+  int remove_class_name(const string& name) {
+    auto p = class_rname.find(name);
+    if (p == class_rname.end())
+      return -ENOENT;
+    int class_id = p->second;
+    auto q = class_name.find(class_id);
+    if (q == class_name.end())
+      return -ENOENT;
+    class_rname.erase(name);
+    class_name.erase(class_id);
+    return 0;
+  }
+
+  int32_t _alloc_class_id() const;
+
+  int get_or_create_class_id(const string& name) {
+    int c = get_class_id(name);
+    if (c < 0) {
+      int i = _alloc_class_id(); +
class_name[i] = name;
+      class_rname[name] = i;
+      return i;
+    } else {
+      return c;
+    }
+  }
+
+  /// class name of item t, or NULL if the item has no class
+  const char *get_item_class(int t) const {
+    auto entry = class_map.find(t);
+    return entry == class_map.end() ? nullptr : get_class_name(entry->second);
+  }
+  /// class id of item t, or -ENOENT if the item has no class
+  int get_item_class_id(int t) const {
+    auto entry = class_map.find(t);
+    if (entry != class_map.end())
+      return entry->second;
+    return -ENOENT;
+  }
+  /// assign item i to the named class, creating the class id on demand
+  int set_item_class(int i, const string& name) {
+    if (is_valid_crush_name(name)) {
+      class_map[i] = get_or_create_class_id(name);
+      return 0;
+    }
+    return -EINVAL;
+  }
+  /// assign item i to class id c; returns c
+  int set_item_class(int i, int c) {
+    class_map[i] = c;
+    return c;
+  }
+  /// replace *devices with the ids (>= 0) of all items in the named class
+  void get_devices_by_class(const string &name, set *devices) const {
+    ceph_assert(devices);
+    devices->clear();
+    if (class_exists(name)) {
+      const auto wanted = get_class_id(name);
+      for (auto& entry : class_map) {
+        // negative ids are buckets, not devices
+        if (entry.first < 0 || entry.second != wanted)
+          continue;
+        devices->insert(entry.first);
+      }
+    }
+  }
+  /// drop the class assignment of item i, if it has one
+  void class_remove_item(int i) {
+    class_map.erase(i);
+  }
+  int can_rename_item(const string& srcname,
+                      const string& dstname,
+                      ostream *ss) const;
+  int rename_item(const string& srcname,
+                  const string& dstname,
+                  ostream *ss);
+  int can_rename_bucket(const string& srcname,
+                        const string& dstname,
+                        ostream *ss) const;
+  int rename_bucket(const string& srcname,
+                    const string& dstname,
+                    ostream *ss);
+
+  // rule names
+  int rename_rule(const string& srcname,
+                  const string& dstname,
+                  ostream *ss);
+  bool rule_exists(string name) const {
+    build_rmaps();
+    return rule_name_rmap.count(name);
+  }
+  /// rule id for a rule name, or -ENOENT if the name is unknown
+  int get_rule_id(string name) const {
+    build_rmaps();
+    auto hit = rule_name_rmap.find(name);
+    if (hit == rule_name_rmap.end())
+      return -ENOENT;
+    return hit->second;
+  }
+  /// name of rule t, or NULL if it has none
+  const char *get_rule_name(int t) const {
+    auto hit = rule_name_map.find(t);
+    return hit == rule_name_map.end() ? nullptr : hit->second.c_str();
+  }
+  void set_rule_name(int i, const string& name) { +
rule_name_map[i] = name;
+    if (have_rmaps)
+      rule_name_rmap[name] = i;
+  }
+  /// an item counts as "shadow" when it has a name and that name does not
+  /// pass is_valid_crush_name(); unnamed items are never shadow
+  bool is_shadow_item(int id) const {
+    const char *name = get_item_name(id);
+    return name && !is_valid_crush_name(name);
+  }
+
+
+  /**
+   * find tree nodes referenced by rules by a 'take' command
+   *
+   * Note that these may not be parentless roots.
+   */
+  void find_takes(set *roots) const;
+  void find_takes_by_rule(int rule, set *roots) const;
+
+  /**
+   * find tree roots
+   *
+   * These are parentless nodes in the map.
+   */
+  void find_roots(set *roots) const;
+
+
+  /**
+   * find tree roots that contain shadow (device class) items only
+   *
+   * The test is applied to the root itself (is_shadow_item()), not to
+   * its contents.  Matches are added to *roots; the set is not cleared
+   * first.
+   */
+  void find_shadow_roots(set *roots) const {
+    set all;
+    find_roots(&all);
+    for (auto& p: all) {
+      if (is_shadow_item(p)) {
+        roots->insert(p);
+      }
+    }
+  }
+
+  /**
+   * find tree roots that are not shadow (device class) items
+   *
+   * These are parentless nodes in the map that are not shadow
+   * items for device classes.  Matches are added to *roots; the set is
+   * not cleared first.
+   */
+  void find_nonshadow_roots(set *roots) const {
+    set all;
+    find_roots(&all);
+    for (auto& p: all) {
+      if (!is_shadow_item(p)) {
+        roots->insert(p);
+      }
+    }
+  }
+
+  /**
+   * see if an item is contained within a subtree
+   *
+   * @param root haystack
+   * @param item needle
+   * @return true if the item is located beneath the given node
+   */
+  bool subtree_contains(int root, int item) const;
+
+private:
+  /**
+   * search for an item in any bucket
+   *
+   * @param i item
+   * @return true if present
+   */
+  bool _search_item_exists(int i) const;
+  bool is_parent_of(int child, int p) const;
+public:
+
+  /**
+   * see if item is located where we think it is
+   *
+   * This verifies that the given item is located at a particular
+   * location in the hierarchy.  However, that check is imprecise; we
+   * are actually verifying that the most specific location key/value
+   * is correct.  For example, if loc specifies that rack=foo and
+   * host=bar, it will verify that host=bar is correct; any placement
+   * above that level in the hierarchy is ignored.
This matches the
+   * semantics for insert_item().
+   *
+   * @param cct cct
+   * @param item item id
+   * @param loc location to check (map of type to bucket names)
+   * @param weight optional pointer to weight of item at that location
+   * @return true if item is at specified location
+   */
+  bool check_item_loc(CephContext *cct, int item, const map& loc, int *iweight);
+  /**
+   * float flavour of check_item_loc(): the 16.16 fixed-point weight
+   * reported by the int* overload is divided by 0x10000.
+   *
+   * @param weight optional; written on every call when non-NULL
+   */
+  bool check_item_loc(CephContext *cct, int item, const map& loc, float *weight) {
+    // The int* overload is defined elsewhere and may return without
+    // storing a weight; start from 0 so *weight never receives an
+    // indeterminate value.
+    int iweight = 0;
+    bool ret = check_item_loc(cct, item, loc, &iweight);
+    if (weight)
+      *weight = (float)iweight / (float)0x10000;
+    return ret;
+  }
+
+
+  /**
+   * returns the (type, name) of the parent bucket of id
+   *
+   * FIXME: ambiguous for items that occur multiple times in the map
+   */
+  pair get_immediate_parent(int id, int *ret = NULL) const;
+
+  int get_immediate_parent_id(int id, int *parent) const;
+
+  /**
+   * return ancestor of the given type, or 0 if none
+   * can pass in a specific crush **rule** to return ancestor from that rule only
+   * (parent is always a bucket and thus <0)
+   */
+  int get_parent_of_type(int id, int type, int rule = -1) const;
+
+  /**
+   * get the fully qualified location of a device by successively finding
+   * parents beginning at ID and ending at highest type number specified in
+   * the CRUSH map which assumes that if device foo is under device bar, the
+   * type_id of foo < bar where type_id is the integer specified in the CRUSH map
+   *
+   * returns the location in the form of (type=foo) where type is a type of bucket
+   * specified in the CRUSH map and foo is a name specified in the CRUSH map
+   */
+  map get_full_location(int id) const;
+
+  /**
+   * return location map for an item, by name
+   */
+  int get_full_location(
+    const string& name,
+    std::map *ploc);
+
+  /*
+   * identical to get_full_location(int id) although it returns the type/name
+   * pairs in the order they occur in the hierarchy.
+   *
+   * returns -ENOENT if id is not found.
+   */
+  int get_full_location_ordered(int id, vector >& path) const;
+
+  /*
+   * identical to get_full_location_ordered(int id, vector >& path),
+   * although it returns a concatenated string with the type/name pairs in descending
+   * hierarchical order with format key1=val1,key2=val2.
+   *
+   * returns the location in descending hierarchy as a string.
+   */
+  string get_full_location_ordered_string(int id) const;
+
+  /**
+   * returns (type_id, type) of all parent buckets between id and
+   * default, can be used to check for anomalous CRUSH maps
+   */
+  map get_parent_hierarchy(int id) const;
+
+  /**
+   * enumerate immediate children of given node
+   *
+   * @param id parent bucket or device id
+   * @return number of items, or error
+   */
+  int get_children(int id, list *children) const;
+  /**
+   * enumerate all children of given node
+   *
+   * @param id parent bucket or device id
+   * @return number of items, or error
+   */
+  int get_all_children(int id, set *children) const;
+  void get_children_of_type(int id,
+                            int type,
+                            vector *children,
+                            bool exclude_shadow = true) const;
+  /**
+   * enumerate all subtrees by type
+   */
+  void get_subtree_of_type(int type, vector *subtrees);
+
+
+  /**
+   * verify upmapping results.
+   * return 0 on success or a negative errno on error.
+   */
+  int verify_upmap(CephContext *cct,
+                   int rule_id,
+                   int pool_size,
+                   const vector& up);
+
+  /**
+   * enumerate leaves (devices) of given node
+   *
+   * @param name parent bucket name
+   * @return 0 on success or a negative errno on error.
+   */
+  int get_leaves(const string &name, set *leaves) const;
+
+private:
+  int _get_leaves(int id, list *leaves) const; // worker
+
+public:
+  /**
+   * insert an item into the map at a specific position
+   *
+   * Add an item at a specific location of the hierarchy.
+   * Specifically, we look for the most specific location constraint
+   * for which a bucket already exists, and then create intervening
+   * buckets beneath that in order to place the item.
+   *
+   * Note that any location specifiers *above* the most specific match
+   * are ignored.  For example, if we specify that osd.12 goes in
+   * host=foo, rack=bar, and row=baz, and rack=bar is the most
+   * specific match, we will create host=foo beneath that point and
+   * put osd.12 inside it.  However, we will not verify that rack=bar
+   * is beneath row=baz or move it.
+   *
+   * In short, we will build out a hierarchy, and move leaves around,
+   * but not adjust the hierarchy's internal structure.  Yet.
+   *
+   * If the item is already present in the map, we will return EEXIST.
+   * If the location key/value pairs are nonsensical
+   * (rack=nameofdevice), or the location specifiers do not attach us
+   * to any existing part of the hierarchy, we will return EINVAL.
+   *
+   * @param cct cct
+   * @param id item id
+   * @param weight item weight
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @param init_weight_sets initialize weight-set weights to weight (vs 0)
+   * @return 0 for success, negative on error
+   */
+  int insert_item(CephContext *cct, int id, float weight, string name,
+                  const map& loc,
+                  bool init_weight_sets=true);
+
+  /**
+   * move a bucket in the hierarchy to the given location
+   *
+   * This has the same location and ancestor creation behavior as
+   * insert_item(), but will relocate the specified existing bucket.
+   *
+   * @param cct cct
+   * @param id bucket id
+   * @param loc location (map of type to bucket names)
+   * @return 0 for success, negative on error
+   */
+  int move_bucket(CephContext *cct, int id, const map& loc);
+
+  /**
+   * swap bucket contents of two buckets without touching bucket ids
+   *
+   * @param cct cct
+   * @param src bucket a
+   * @param dst bucket b
+   * @return 0 for success, negative on error
+   */
+  int swap_bucket(CephContext *cct, int src, int dst);
+
+  /**
+   * add a link to an existing bucket in the hierarchy to the new location
+   *
+   * This has the same location and ancestor creation behavior as
+   * insert_item(), but will add a new link to the specified existing
+   * bucket.
+   *
+   * @param cct cct
+   * @param id bucket id
+   * @param loc location (map of type to bucket names)
+   * @return 0 for success, negative on error
+   */
+  int link_bucket(CephContext *cct, int id, const map& loc);
+
+  /**
+   * add or update an item's position in the map
+   *
+   * This is analogous to insert_item, except we will move an item if
+   * it is already present.
+   *
+   * @param cct cct
+   * @param id item id
+   * @param weight item weight
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @return 0 for no change, 1 for successful change, negative on error
+   */
+  int update_item(CephContext *cct, int id, float weight, string name, const map& loc);
+
+  /**
+   * create or move an item, but do not adjust its weight if it already exists
+   *
+   * @param cct cct
+   * @param item item id
+   * @param weight initial item weight (if we need to create it)
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @param init_weight_sets initialize weight-set values to weight (vs 0)
+   * @return 0 for no change, 1 for successful change, negative on error
+   */
+  int create_or_move_item(CephContext *cct, int item, float weight, string name,
+                          const map& loc,
+                          bool init_weight_sets=true);
+
+  /**
+   * remove all instances of an item from the map
+   *
+   * @param cct cct
+   * @param id item id to remove
+   * @param unlink_only unlink but do not remove bucket (useful if multiple links or not empty)
+   * @return 0 on success, negative on error
+   */
+  int remove_item(CephContext *cct, int id, bool unlink_only);
+
+  /**
+   * recursively remove buckets starting at item and stop removing
+   * when a bucket is in use.
+   *
+   * @param cct cct
+   * @param item id to remove
+   * @return 0 on success, negative on error
+   */
+  int remove_root(CephContext *cct, int item);
+
+  /**
+   * remove all instances of an item nested beneath a certain point from the map
+   *
+   * (This documents remove_item_under(), which is declared after the
+   * private helpers that follow.)
+   *
+   * @param cct cct
+   * @param id item id to remove
+   * @param ancestor ancestor item id under which to search for id
+   * @param unlink_only unlink but do not remove bucket (useful if bucket has multiple links or is not empty)
+   * @return 0 on success, negative on error
+   */
+private:
+  bool _maybe_remove_last_instance(CephContext *cct, int id, bool unlink_only);
+  int _remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+  bool _bucket_is_in_use(int id);
+public:
+  int remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+
+  /**
+   * calculate the locality/distance from a given id to a crush location map
+   *
+   * Specifically, we look for the lowest-valued type for which the
+   * location of id matches that described in loc.
+   *
+   * @param cct cct
+   * @param id the existing id in the map
+   * @param loc a set of key=value pairs describing a location in the hierarchy
+   */
+  int get_common_ancestor_distance(CephContext *cct, int id,
+                                   const std::multimap& loc) const;
+
+  /**
+   * parse a set of key/value pairs out of a string vector
+   *
+   * These are used to describe a location in the CRUSH hierarchy.
+   * parse_loc_map() fills a map, parse_loc_multimap() a multimap.
+   *
+   * @param args list of strings (each key= or key=value)
+   * @param ploc pointer to a resulting location map or multimap
+   */
+  static int parse_loc_map(const std::vector& args,
+                           std::map *ploc);
+  static int parse_loc_multimap(const std::vector& args,
+                                std::multimap *ploc);
+
+
+  /**
+   * get an item's weight
+   *
+   * Will return the weight for the first instance it finds.
+   *
+   * @param id item id to check
+   * @return weight of item
+   */
+  int get_item_weight(int id) const;
+  /// get_item_weight() converted from 16.16 fixed point to float
+  float get_item_weightf(int id) const {
+    return (float)get_item_weight(id) / (float)0x10000;
+  }
+  int get_item_weight_in_loc(int id, const map &loc);
+  float get_item_weightf_in_loc(int id, const map &loc) {
+    return (float)get_item_weight_in_loc(id, loc) / (float)0x10000;
+  }
+
+  /**
+   * check that a float weight fits the int 16.16 fixed-point encoding
+   *
+   * The range test is done in floating point.  Converting a negative,
+   * NaN or too-large float to an integer type is undefined behaviour, so
+   * the value has to be vetted before any cast takes place.
+   *
+   * @param weight weight as a float
+   * @return 0 if (int)(weight * 0x10000) is representable,
+   *         -EOVERFLOW otherwise (NaN included)
+   */
+  int validate_weightf(float weight) {
+    const double scaled = (double)weight * 0x10000;
+    const double limit = (double)std::numeric_limits<int>::max() + 1.0;
+    // values in (-1, 0) truncate to 0 and stay acceptable
+    if (!(scaled > -1.0 && scaled < limit)) {
+      return -EOVERFLOW;
+    }
+    return 0;
+  }
+  int adjust_item_weight(CephContext *cct, int id, int weight,
+                         bool update_weight_sets=true);
+  /// float flavour of adjust_item_weight(); rejects weights that fail
+  /// validate_weightf()
+  int adjust_item_weightf(CephContext *cct, int id, float weight,
+                          bool update_weight_sets=true) {
+    int r = validate_weightf(weight);
+    if (r < 0) {
+      return r;
+    }
+    return adjust_item_weight(cct, id, (int)(weight * (float)0x10000),
+                              update_weight_sets);
+  }
+  int adjust_item_weight_in_bucket(CephContext *cct, int id, int weight,
+                                   int bucket_id,
+                                   bool update_weight_sets);
+  int adjust_item_weight_in_loc(CephContext *cct, int id, int weight,
+                                const map& loc,
+                                bool update_weight_sets=true);
+  /// float flavour of adjust_item_weight_in_loc()
+  int adjust_item_weightf_in_loc(CephContext *cct, int id, float weight,
+                                 const map& loc,
+                                 bool update_weight_sets=true) {
+    int r = validate_weightf(weight);
+    if (r < 0) {
+      return r;
+    }
+    return adjust_item_weight_in_loc(cct, id, (int)(weight * (float)0x10000),
+                                     loc, update_weight_sets);
+  }
+  void reweight(CephContext *cct);
+  void reweight_bucket(crush_bucket *b,
+                       crush_choose_arg_map& arg_map,
+                       vector *weightv);
+
+  int adjust_subtree_weight(CephContext *cct, int id, int weight,
+                            bool update_weight_sets=true);
+  /// float flavour of adjust_subtree_weight()
+  int adjust_subtree_weightf(CephContext *cct, int id, float weight,
+                             bool update_weight_sets=true) {
+    int r = validate_weightf(weight);
+    if (r < 0) {
+      return r;
+    }
+    return adjust_subtree_weight(cct, id, (int)(weight * (float)0x10000),
+                                 update_weight_sets);
+  }
+
+  /// check if item id is present in the map
hierarchy
+  bool check_item_present(int id) const;
+
+
+  /*** devices ***/
+  int get_max_devices() const {
+    if (!crush) return 0;
+    return crush->max_devices;
+  }
+
+
+  /*** rules ***/
+private:
+  /**
+   * look up a rule by number
+   *
+   * Three outcomes, which callers must tell apart:
+   *  - an ERR_PTR (-ENOENT) when no crush map is attached
+   *  - NULL when ruleno is out of range or the slot is empty
+   *  - the rule otherwise
+   * IS_ERR() is false for NULL, so an IS_ERR() check alone is not enough.
+   */
+  crush_rule *get_rule(unsigned ruleno) const {
+    if (!crush) return (crush_rule *)(-ENOENT);
+    if (ruleno >= crush->max_rules)
+      return 0;
+    return crush->rules[ruleno];
+  }
+  /// step `step` of rule `ruleno`, or an ERR_PTR (-EINVAL) if the rule or
+  /// the step does not exist
+  crush_rule_step *get_rule_step(unsigned ruleno, unsigned step) const {
+    crush_rule *n = get_rule(ruleno);
+    if (!n || IS_ERR(n)) return (crush_rule_step *)(-EINVAL);
+    if (step >= n->len) return (crush_rule_step *)(-EINVAL);
+    return &n->steps[step];
+  }
+
+public:
+  /* accessors */
+  int get_max_rules() const {
+    if (!crush) return 0;
+    return crush->max_rules;
+  }
+  bool rule_exists(unsigned ruleno) const {
+    if (!crush) return false;
+    if (ruleno < crush->max_rules &&
+        crush->rules[ruleno] != NULL)
+      return true;
+    return false;
+  }
+  /// true if the rule exists and one of its steps is "take <take>"
+  bool rule_has_take(unsigned ruleno, int take) const {
+    if (!crush) return false;
+    crush_rule *rule = get_rule(ruleno);
+    if (!rule || IS_ERR(rule))
+      return false;  // a missing rule has no steps
+    for (unsigned i = 0; i < rule->len; ++i) {
+      if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+          rule->steps[i].arg1 == take) {
+        return true;
+      }
+    }
+    return false;
+  }
+  /// number of steps in the rule, or a negative errno if it does not exist
+  int get_rule_len(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (!r) return -ENOENT;
+    if (IS_ERR(r)) return PTR_ERR(r);
+    return r->len;
+  }
+  // The mask getters return -1 when the rule does not exist.
+  int get_rule_mask_ruleset(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.ruleset;
+  }
+  int get_rule_mask_type(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.type;
+  }
+  int get_rule_mask_min_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.min_size;
+  }
+  int get_rule_mask_max_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size;
+  }
+  int get_rule_op(unsigned ruleno, unsigned step) const {
+    crush_rule_step
*s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->op;
+  }
+  /// first argument of a rule step, or a negative errno
+  int get_rule_arg1(unsigned ruleno, unsigned step) const {
+    crush_rule_step *s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->arg1;
+  }
+  /// second argument of a rule step, or a negative errno
+  int get_rule_arg2(unsigned ruleno, unsigned step) const {
+    crush_rule_step *s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->arg2;
+  }
+
+private:
+  float _get_take_weight_osd_map(int root, map *pmap) const;
+  void _normalize_weight_map(float sum, const map& m,
+                             map *pmap) const;
+
+public:
+  /**
+   * calculate a map of osds to weights for a given rule
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given rule.
+   *
+   * @param ruleno [in] rule id
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_rule_weight_osd_map(unsigned ruleno, map *pmap) const;
+
+  /**
+   * calculate a map of osds to weights for a given starting root
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given starting root
+   *
+   * @param root node
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_take_weight_osd_map(int root, map *pmap) const;
+
+  /* modifiers */
+
+  /// create a rule with room for `len` steps and register it under ruleno;
+  /// returns the value of crush_add_rule(), or -ENOENT without a crush map
+  int add_rule(int ruleno, int len, int type, int minsize, int maxsize) {
+    if (!crush) return -ENOENT;
+    crush_rule *n = crush_make_rule(len, ruleno, type, minsize, maxsize);
+    ceph_assert(n);
+    ruleno = crush_add_rule(crush, n, ruleno);
+    return ruleno;
+  }
+  /// store a new mask.max_size; returns the stored value, or -1 if the
+  /// rule does not exist
+  int set_rule_mask_max_size(unsigned ruleno, int max_size) {
+    crush_rule *r = get_rule(ruleno);
+    // get_rule() reports a missing rule as NULL, which IS_ERR() does not catch
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size = max_size;
+  }
+  /// overwrite one step of an existing rule; -ENOENT without a crush map,
+  /// -1 if the rule does not exist
+  int set_rule_step(unsigned ruleno, unsigned step, int op, int arg1, int arg2) {
+    if (!crush) return -ENOENT;
+    crush_rule *n = get_rule(ruleno);
+    if (!n) return -1;
+    crush_rule_set_step(n, step, op, arg1, arg2);
+    return 0;
+  }
+  int
set_rule_step_take(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_TAKE, val, 0);
+  }
+  // Every set_rule_step_*() helper below forwards to set_rule_step() with a
+  // fixed opcode and returns its result.  The one-argument steps pass val as
+  // arg1 and 0 as arg2.
+  int set_rule_step_set_choose_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_TRIES, val, 0);
+  }
+  int set_rule_step_set_choose_local_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES, val, 0);
+  }
+  int set_rule_step_set_choose_local_fallback_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES, val, 0);
+  }
+  int set_rule_step_set_chooseleaf_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_TRIES, val, 0);
+  }
+  int set_rule_step_set_chooseleaf_vary_r(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_VARY_R, val, 0);
+  }
+  int set_rule_step_set_chooseleaf_stable(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_STABLE, val, 0);
+  }
+  // The choose/chooseleaf steps pass val as arg1 and type as arg2.
+  int set_rule_step_choose_firstn(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_FIRSTN, val, type);
+  }
+  int set_rule_step_choose_indep(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_INDEP, val, type);
+  }
+  int set_rule_step_choose_leaf_firstn(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_FIRSTN, val, type);
+  }
+  int set_rule_step_choose_leaf_indep(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_INDEP, val, type);
+  }
+  /// emit takes no arguments; both are passed as 0
+  int set_rule_step_emit(unsigned ruleno, unsigned step) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
+  }
+
+  int add_simple_rule(
+    string name,
string root_name, string failure_domain_type,
+    string device_class,
+    string mode, int rule_type, ostream *err = 0);
+
+  /**
+   * @param rno rule[set] id to use, -1 to pick the lowest available
+   */
+  int add_simple_rule_at(
+    string name, string root_name,
+    string failure_domain_type, string device_class, string mode,
+    int rule_type, int rno, ostream *err = 0);
+
+  int remove_rule(int ruleno);
+
+
+  /** buckets **/
+  /**
+   * look up a bucket by id
+   *
+   * Bucket id `id` lives in slot (-1 - id).  Never returns NULL: failures
+   * are ERR_PTR values (-EINVAL without a crush map, -ENOENT for an
+   * out-of-range or empty slot), to be tested with IS_ERR().
+   */
+  const crush_bucket *get_bucket(int id) const {
+    if (!crush)
+      return (crush_bucket *)(-EINVAL);
+    const unsigned int slot = (unsigned int)(-1 - id);
+    const unsigned int limit = crush->max_buckets;
+    if (slot >= limit)
+      return (crush_bucket *)(-ENOENT);
+    crush_bucket *found = crush->buckets[slot];
+    return found ? found : (crush_bucket *)(-ENOENT);
+  }
+private:
+  /// mutable flavour of get_bucket(); same lookup and error convention
+  crush_bucket *get_bucket(int id) {
+    if (!crush)
+      return (crush_bucket *)(-EINVAL);
+    const unsigned int slot = (unsigned int)(-1 - id);
+    const unsigned int limit = crush->max_buckets;
+    if (slot >= limit)
+      return (crush_bucket *)(-ENOENT);
+    crush_bucket *found = crush->buckets[slot];
+    return found ? found : (crush_bucket *)(-ENOENT);
+  }
+  /**
+   * detach a bucket from its parent and adjust the parent weight
+   *
+   * returns the weight of the detached bucket
+   **/
+  int detach_bucket(CephContext *cct, int item);
+
+  int get_new_bucket_id();
+
+public:
+  /// size of the bucket table, or -EINVAL without a crush map
+  int get_max_buckets() const {
+    return crush ? crush->max_buckets : -EINVAL;
+  }
+  int get_next_bucket_id() const {
+    if (crush)
+      return crush_get_next_bucket_id(crush);
+    return -EINVAL;
+  }
+  bool bucket_exists(int id) const {
+    return !IS_ERR(get_bucket(id));
+  }
+  /// 16.16 fixed-point weight of the bucket, or a negative errno
+  int get_bucket_weight(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (!IS_ERR(b))
+      return b->weight;
+    return PTR_ERR(b);
+  }
+  float get_bucket_weightf(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return 0;
+    return b->weight /
(float)0x10000;
+  }
+  // The getters below return the bucket field of the same name, or the
+  // negative errno encoded by get_bucket() when the bucket does not exist.
+  int get_bucket_type(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->type;
+  }
+  int get_bucket_alg(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->alg;
+  }
+  int get_bucket_hash(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->hash;
+  }
+  int get_bucket_size(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->size;
+  }
+  /**
+   * item stored at position pos of bucket id
+   *
+   * @return the item id, the errno from get_bucket() if the bucket does
+   *         not exist, or -EINVAL if pos is outside [0, size).  Item ids
+   *         may themselves be negative (buckets), so callers should
+   *         validate pos against get_bucket_size() rather than rely on
+   *         the sign of the result.
+   */
+  int get_bucket_item(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    if ((__u32)pos >= b->size)
+      return -EINVAL;  // b is a valid pointer here; PTR_ERR(b) would be garbage
+    return b->items[pos];
+  }
+  int get_bucket_item_weight(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return crush_get_bucket_item_weight(b, pos);
+  }
+  /// float flavour of get_bucket_item_weight(); 0 if the bucket is missing
+  float get_bucket_item_weightf(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return 0;
+    return (float)crush_get_bucket_item_weight(b, pos) / (float)0x10000;
+  }
+
+  /* modifiers */
+  int add_bucket(int bucketno, int alg, int hash, int type, int size,
+                 int *items, int *weights, int *idout);
+  int bucket_add_item(crush_bucket *bucket, int item, int weight);
+  int bucket_remove_item(struct crush_bucket *bucket, int item);
+  int bucket_adjust_item_weight(
+    CephContext *cct, struct crush_bucket *bucket, int item, int weight,
+    bool adjust_weight_sets);
+
+  /**
+   * finish building the map
+   *
+   * Runs crush_finalize(), raises max_devices so it covers the largest
+   * id in name_map, refreshes have_uniform_rules and rebuilds the
+   * reverse name maps.
+   */
+  void finalize() {
+    ceph_assert(crush);
+    crush_finalize(crush);
+    if (!name_map.empty() &&
+        name_map.rbegin()->first >= crush->max_devices) {
+      crush->max_devices = name_map.rbegin()->first + 1;
+    }
+    have_uniform_rules = !has_legacy_rule_ids();
+    build_rmaps();
+  }
+  int bucket_set_alg(int id, int alg);
+
+  int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
+  int remove_device_class(CephContext
*cct, int id, ostream *ss);
+  int device_class_clone(
+    int original, int device_class,
+    const std::map>& old_class_bucket,
+    const std::set& used_ids,
+    int *clone,
+    map>> *cmap_item_weight);
+  bool class_is_in_use(int class_id, ostream *ss = nullptr);
+  int rename_class(const string& srcname, const string& dstname);
+  int populate_classes(
+    const std::map>& old_class_bucket);
+  int get_rules_by_class(const string &class_name, set *rules);
+  int get_rules_by_osd(int osd, set *rules);
+  bool _class_is_dead(int class_id);
+  void cleanup_dead_classes();
+  int rebuild_roots_with_classes(CephContext *cct);
+  /* remove unused roots generated for class devices */
+  int trim_roots_with_class(CephContext *cct);
+
+  int reclassify(
+    CephContext *cct,
+    ostream& out,
+    const map& classify_root,
+    const map>& classify_bucket
+    );
+
+  int set_subtree_class(const string& name, const string& class_name);
+
+  /// (re)allocate a zeroed choose_tries histogram, discarding any old one
+  void start_choose_profile() {
+    free(crush->choose_tries);
+    /*
+     * the original choose_total_tries value was off by one (it
+     * counted "retries" and not "tries").  add one to alloc.
+     */
+    // calloc(count, size) already zero-fills, so no memset is needed
+    crush->choose_tries = (__u32 *)calloc(crush->choose_total_tries + 1,
+                                          sizeof(*crush->choose_tries));
+  }
+  /// free the histogram and mark profiling as off
+  void stop_choose_profile() {
+    free(crush->choose_tries);
+    crush->choose_tries = 0;
+  }
+
+  /// hand out the histogram; returns choose_total_tries, or 0 (leaving
+  /// *vec untouched) when no profile is active
+  int get_choose_profile(__u32 **vec) {
+    if (crush->choose_tries) {
+      *vec = crush->choose_tries;
+      return crush->choose_total_tries;
+    }
+    return 0;
+  }
+
+
+  void set_max_devices(int m) {
+    crush->max_devices = m;
+  }
+
+  /**
+   * find the rule to use for (ruleset, type, size)
+   *
+   * Fast path: when rule ids are known to equal ruleset ids, the rule at
+   * index `ruleset` is returned directly if its mask matches.  Anything
+   * else is left to crush_find_rule().
+   *
+   * @return rule id, or -1 if no crush map is attached; otherwise
+   *         whatever crush_find_rule() returns
+   */
+  int find_rule(int ruleset, int type, int size) const {
+    if (!crush) return -1;
+    if (have_uniform_rules &&
+        ruleset >= 0 &&  // a negative ruleset must not index rules[]
+        ruleset < (int)crush->max_rules &&
+        crush->rules[ruleset] &&
+        crush->rules[ruleset]->mask.type == type &&
+        crush->rules[ruleset]->mask.min_size <= size &&
+        crush->rules[ruleset]->mask.max_size >= size) {
+      return ruleset;
+    }
+    return crush_find_rule(crush, ruleset, type, size);
+  }
+
+  /// true if any existing rule carries mask.ruleset == ruleset
+  bool ruleset_exists(const int ruleset) const {
+    for (size_t i = 0; i < crush->max_rules; ++i) {
+      if (rule_exists(i) && crush->rules[i]->mask.ruleset == ruleset) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Return the lowest numbered ruleset of type `type`
+   *
+   * @returns a ruleset ID, or -1 if no matching rules found.
+ */ + int find_first_ruleset(int type) const { + int result = -1; + + for (size_t i = 0; i < crush->max_rules; ++i) { + if (crush->rules[i] + && crush->rules[i]->mask.type == type + && (crush->rules[i]->mask.ruleset < result || result == -1)) { + result = crush->rules[i]->mask.ruleset; + } + } + + return result; + } + + bool have_choose_args(int64_t choose_args_index) const { + return choose_args.count(choose_args_index); + } + + crush_choose_arg_map choose_args_get_with_fallback( + int64_t choose_args_index) const { + auto i = choose_args.find(choose_args_index); + if (i == choose_args.end()) { + i = choose_args.find(DEFAULT_CHOOSE_ARGS); + } + if (i == choose_args.end()) { + crush_choose_arg_map arg_map; + arg_map.args = NULL; + arg_map.size = 0; + return arg_map; + } else { + return i->second; + } + } + crush_choose_arg_map choose_args_get(int64_t choose_args_index) const { + auto i = choose_args.find(choose_args_index); + if (i == choose_args.end()) { + crush_choose_arg_map arg_map; + arg_map.args = NULL; + arg_map.size = 0; + return arg_map; + } else { + return i->second; + } + } + + void destroy_choose_args(crush_choose_arg_map arg_map) { + for (__u32 i = 0; i < arg_map.size; i++) { + crush_choose_arg *arg = &arg_map.args[i]; + for (__u32 j = 0; j < arg->weight_set_positions; j++) { + crush_weight_set *weight_set = &arg->weight_set[j]; + free(weight_set->weights); + } + if (arg->weight_set) + free(arg->weight_set); + if (arg->ids) + free(arg->ids); + } + free(arg_map.args); + } + + bool create_choose_args(int64_t id, int positions) { + if (choose_args.count(id)) + return false; + ceph_assert(positions); + auto &cmap = choose_args[id]; + cmap.args = static_cast(calloc(sizeof(crush_choose_arg), + crush->max_buckets)); + cmap.size = crush->max_buckets; + for (int bidx=0; bidx < crush->max_buckets; ++bidx) { + crush_bucket *b = crush->buckets[bidx]; + auto &carg = cmap.args[bidx]; + carg.ids = NULL; + carg.ids_size = 0; + if (b && b->alg == CRUSH_BUCKET_STRAW2) 
{ + crush_bucket_straw2 *sb = reinterpret_cast(b); + carg.weight_set_positions = positions; + carg.weight_set = static_cast(calloc(sizeof(crush_weight_set), + carg.weight_set_positions)); + // initialize with canonical weights + for (int pos = 0; pos < positions; ++pos) { + carg.weight_set[pos].size = b->size; + carg.weight_set[pos].weights = (__u32*)calloc(4, b->size); + for (unsigned i = 0; i < b->size; ++i) { + carg.weight_set[pos].weights[i] = sb->item_weights[i]; + } + } + } else { + carg.weight_set = NULL; + carg.weight_set_positions = 0; + } + } + return true; + } + + void rm_choose_args(int64_t id) { + auto p = choose_args.find(id); + if (p != choose_args.end()) { + destroy_choose_args(p->second); + choose_args.erase(p); + } + } + + void choose_args_clear() { + for (auto w : choose_args) + destroy_choose_args(w.second); + choose_args.clear(); + } + + // remove choose_args for buckets that no longer exist, create them for new buckets + void update_choose_args(CephContext *cct); + + // adjust choose_args_map weight, preserving the hierarchical summation + // property. used by callers optimizing layouts by tweaking weights. 
+ int _choose_args_adjust_item_weight_in_bucket( + CephContext *cct, + crush_choose_arg_map cmap, + int bucketid, + int id, + const vector& weight, + ostream *ss); + int choose_args_adjust_item_weight( + CephContext *cct, + crush_choose_arg_map cmap, + int id, const vector& weight, + ostream *ss); + int choose_args_adjust_item_weightf( + CephContext *cct, + crush_choose_arg_map cmap, + int id, const vector& weightf, + ostream *ss) { + vector weight(weightf.size()); + for (unsigned i = 0; i < weightf.size(); ++i) { + weight[i] = (int)(weightf[i] * (double)0x10000); + } + return choose_args_adjust_item_weight(cct, cmap, id, weight, ss); + } + + int get_choose_args_positions(crush_choose_arg_map cmap) { + // infer positions from other buckets + for (unsigned j = 0; j < cmap.size; ++j) { + if (cmap.args[j].weight_set_positions) { + return cmap.args[j].weight_set_positions; + } + } + return 1; + } + + template + void do_rule(int rule, int x, vector& out, int maxout, + const WeightVector& weight, + uint64_t choose_args_index) const { + int rawout[maxout]; + char work[crush_work_size(crush, maxout)]; + crush_init_workspace(crush, work); + crush_choose_arg_map arg_map = choose_args_get_with_fallback( + choose_args_index); + int numrep = crush_do_rule(crush, rule, x, rawout, maxout, + std::data(weight), std::size(weight), + work, arg_map.args); + if (numrep < 0) + numrep = 0; + out.resize(numrep); + for (int i=0; i>& stack, + const set& overfull, + const vector& underfull, + const vector& more_underfull, + const vector& orig, + vector::const_iterator& i, + set& used, + vector *pw, + int root_bucket, + int rule) const; + + int try_remap_rule( + CephContext *cct, + int rule, + int maxout, + const set& overfull, + const vector& underfull, + const vector& more_underfull, + const vector& orig, + vector *out) const; + + bool check_crush_rule(int ruleset, int type, int size, ostream& ss) { + ceph_assert(crush); + + __u32 i; + for (i = 0; i < crush->max_rules; i++) { + if 
(crush->rules[i] && + crush->rules[i]->mask.ruleset == ruleset && + crush->rules[i]->mask.type == type) { + + if (crush->rules[i]->mask.min_size <= size && + crush->rules[i]->mask.max_size >= size) { + return true; + } else if (size < crush->rules[i]->mask.min_size) { + ss << "pool size is smaller than the crush rule min size"; + return false; + } else { + ss << "pool size is bigger than the crush rule max size"; + return false; + } + } + } + + return false; + } + + void encode(bufferlist &bl, uint64_t features) const; + void decode(bufferlist::const_iterator &blp); + void decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp); + void dump(Formatter *f) const; + void dump_rules(Formatter *f) const; + void dump_rule(int ruleset, Formatter *f) const; + void dump_tunables(Formatter *f) const; + void dump_choose_args(Formatter *f) const; + void list_rules(Formatter *f) const; + void list_rules(ostream *ss) const; + void dump_tree(ostream *out, + Formatter *f, + const CrushTreeDumper::name_map_t& ws, + bool show_shadow = false) const; + void dump_tree(ostream *out, Formatter *f) { + dump_tree(out, f, CrushTreeDumper::name_map_t()); + } + void dump_tree(Formatter *f, + const CrushTreeDumper::name_map_t& ws) const; + static void generate_test_instances(list& o); + + int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct); + + static bool is_valid_crush_name(const string& s); + static bool is_valid_crush_loc(CephContext *cct, + const map& loc); +}; +WRITE_CLASS_ENCODER_FEATURES(CrushWrapper) + +#endif diff --git a/src/crush/CrushWrapper.i b/src/crush/CrushWrapper.i new file mode 100644 index 00000000..76340611 --- /dev/null +++ b/src/crush/CrushWrapper.i @@ -0,0 +1,47 @@ +/* File : CrushWrapper.i */ +%module CrushWrapper +%{ +#include "CrushWrapper.h" +%} + +%include typemaps.i + +// This tells SWIG to treat 'int *data' as a special case +%typemap(in) int *items { + AV *tempav; + I32 len; + int i; + SV **tv; +// int view; + + + 
//printf("typemap\n"); + + if (!SvROK($input)) + croak("$input is not a reference."); + if (SvTYPE(SvRV($input)) != SVt_PVAV) + croak("$input is not an array."); + + tempav = (AV*)SvRV($input); + len = av_len(tempav); + //printf("typemap len: %i\n",len); + $1 = (int *) malloc((len+1)*sizeof(int)); + for (i = 0; i <= len; i++) { + tv = av_fetch(tempav, i, 0); + $1[i] = (int) SvIV(*tv); + + /* + view = SvIV(*tv); + printf("view: %d",view); + printf("\n"); + */ + } +} + +%apply int *items { int *weights }; +%apply double *OUTPUT { double *min, double *max, double *avg }; + +/* Let's just grab the original header file here */ +%include "CrushWrapper.h" + +%clear double *min, double *max, double *avg; diff --git a/src/crush/builder.c b/src/crush/builder.c new file mode 100644 index 00000000..68dfcb69 --- /dev/null +++ b/src/crush/builder.c @@ -0,0 +1,1525 @@ +#include +#include +#include +#include +#include +#include + +#include "crush/crush.h" +#include "builder.h" + +#define dprintk(args...) /* printf(args) */ + +#define BUG_ON(x) assert(!(x)) + +struct crush_map *crush_create() +{ + struct crush_map *m; + m = malloc(sizeof(*m)); + if (!m) + return NULL; + memset(m, 0, sizeof(*m)); + + set_optimal_crush_map(m); + return m; +} + +/* + * finalize should be called _after_ all buckets are added to the map. + */ +void crush_finalize(struct crush_map *map) +{ + int b; + __u32 i; + + /* Calculate the needed working space while we do other + finalization tasks. 
*/ + map->working_size = sizeof(struct crush_work); + /* Space for the array of pointers to per-bucket workspace */ + map->working_size += map->max_buckets * + sizeof(struct crush_work_bucket *); + + /* calc max_devices */ + map->max_devices = 0; + for (b=0; bmax_buckets; b++) { + if (map->buckets[b] == 0) + continue; + for (i=0; ibuckets[b]->size; i++) + if (map->buckets[b]->items[i] >= map->max_devices) + map->max_devices = map->buckets[b]->items[i] + 1; + + switch (map->buckets[b]->alg) { + default: + /* The base case, permutation variables and + the pointer to the permutation array. */ + map->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. */ + map->working_size += map->buckets[b]->size * sizeof(__u32); + } +} + + + +/** rules **/ + +int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno) +{ + __u32 r; + + if (ruleno < 0) { + for (r=0; r < map->max_rules; r++) + if (map->rules[r] == 0) + break; + assert(r < CRUSH_MAX_RULES); + } + else + r = ruleno; + + if (r >= map->max_rules) { + /* expand array */ + int oldsize; + void *_realloc = NULL; + if (map->max_rules +1 > CRUSH_MAX_RULES) + return -ENOSPC; + oldsize = map->max_rules; + map->max_rules = r+1; + if ((_realloc = realloc(map->rules, map->max_rules * sizeof(map->rules[0]))) == NULL) { + return -ENOMEM; + } else { + map->rules = _realloc; + } + memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0])); + } + + /* add it */ + map->rules[r] = rule; + return r; +} + +struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize) +{ + struct crush_rule *rule; + rule = malloc(crush_rule_size(len)); + if (!rule) + return NULL; + rule->len = len; + rule->mask.ruleset = ruleset; + rule->mask.type = type; + rule->mask.min_size = minsize; + rule->mask.max_size = maxsize; + return rule; +} + +/* + * be careful; this doesn't verify that the buffer you allocated is big enough! 
+ */ +void crush_rule_set_step(struct crush_rule *rule, int n, int op, int arg1, int arg2) +{ + assert((__u32)n < rule->len); + rule->steps[n].op = op; + rule->steps[n].arg1 = arg1; + rule->steps[n].arg2 = arg2; +} + + +/** buckets **/ +int crush_get_next_bucket_id(struct crush_map *map) +{ + int pos; + for (pos=0; pos < map->max_buckets; pos++) + if (map->buckets[pos] == 0) + break; + return -1 - pos; +} + + +int crush_add_bucket(struct crush_map *map, + int id, + struct crush_bucket *bucket, + int *idout) +{ + int pos; + + /* find a bucket id */ + if (id == 0) + id = crush_get_next_bucket_id(map); + pos = -1 - id; + + while (pos >= map->max_buckets) { + /* expand array */ + int oldsize = map->max_buckets; + if (map->max_buckets) + map->max_buckets *= 2; + else + map->max_buckets = 8; + void *_realloc = NULL; + if ((_realloc = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]))) == NULL) { + return -ENOMEM; + } else { + map->buckets = _realloc; + } + memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0])); + } + + if (map->buckets[pos] != 0) { + return -EEXIST; + } + + /* add it */ + bucket->id = id; + map->buckets[pos] = bucket; + + if (idout) *idout = id; + return 0; +} + +int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket) +{ + int pos = -1 - bucket->id; + assert(pos < map->max_buckets); + map->buckets[pos] = NULL; + crush_destroy_bucket(bucket); + return 0; +} + + +/* uniform bucket */ + +struct crush_bucket_uniform * +crush_make_uniform_bucket(int hash, int type, int size, + int *items, + int item_weight) +{ + int i; + struct crush_bucket_uniform *bucket; + + bucket = malloc(sizeof(*bucket)); + if (!bucket) + return NULL; + memset(bucket, 0, sizeof(*bucket)); + bucket->h.alg = CRUSH_BUCKET_UNIFORM; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + if (crush_multiplication_is_unsafe(size, item_weight)) + goto err; + + bucket->h.weight = size * item_weight; + 
bucket->item_weight = item_weight; + bucket->h.items = malloc(sizeof(__s32)*size); + + if (!bucket->h.items) + goto err; + + for (i=0; ih.items[i] = items[i]; + + return bucket; +err: + free(bucket->h.items); + free(bucket); + return NULL; +} + + +/* list bucket */ + +struct crush_bucket_list* +crush_make_list_bucket(int hash, int type, int size, + int *items, + int *weights) +{ + int i; + int w; + struct crush_bucket_list *bucket; + + bucket = malloc(sizeof(*bucket)); + if (!bucket) + return NULL; + memset(bucket, 0, sizeof(*bucket)); + bucket->h.alg = CRUSH_BUCKET_LIST; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + bucket->h.items = malloc(sizeof(__s32)*size); + if (!bucket->h.items) + goto err; + + + bucket->item_weights = malloc(sizeof(__u32)*size); + if (!bucket->item_weights) + goto err; + bucket->sum_weights = malloc(sizeof(__u32)*size); + if (!bucket->sum_weights) + goto err; + w = 0; + for (i=0; ih.items[i] = items[i]; + bucket->item_weights[i] = weights[i]; + + if (crush_addition_is_unsafe(w, weights[i])) + goto err; + + w += weights[i]; + bucket->sum_weights[i] = w; + /*dprintk("pos %d item %d weight %d sum %d\n", + i, items[i], weights[i], bucket->sum_weights[i]);*/ + } + + bucket->h.weight = w; + + return bucket; +err: + free(bucket->sum_weights); + free(bucket->item_weights); + free(bucket->h.items); + free(bucket); + return NULL; +} + + +/* tree bucket */ + +static int height(int n) { + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} +static int on_right(int n, int h) { + return n & (1 << (h+1)); +} +static int parent(int n) +{ + int h = height(n); + if (on_right(n, h)) + return n - (1<> 1; + depth++; + } + return depth; +} + +struct crush_bucket_tree* +crush_make_tree_bucket(int hash, int type, int size, + int *items, /* in leaf order */ + int *weights) +{ + struct crush_bucket_tree *bucket; + int depth; + int node; + int i, j; + + bucket = malloc(sizeof(*bucket)); + if (!bucket) + return 
NULL; + memset(bucket, 0, sizeof(*bucket)); + bucket->h.alg = CRUSH_BUCKET_TREE; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + if (size == 0) { + bucket->h.items = NULL; + bucket->h.weight = 0; + bucket->node_weights = NULL; + bucket->num_nodes = 0; + /* printf("size 0 depth 0 nodes 0\n"); */ + return bucket; + } + + bucket->h.items = malloc(sizeof(__s32)*size); + if (!bucket->h.items) + goto err; + + /* calc tree depth */ + depth = calc_depth(size); + bucket->num_nodes = 1 << depth; + dprintk("size %d depth %d nodes %d\n", size, depth, bucket->num_nodes); + + bucket->node_weights = malloc(sizeof(__u32)*bucket->num_nodes); + if (!bucket->node_weights) + goto err; + + memset(bucket->h.items, 0, sizeof(__s32)*bucket->h.size); + memset(bucket->node_weights, 0, sizeof(__u32)*bucket->num_nodes); + + for (i=0; ih.items[i] = items[i]; + node = crush_calc_tree_node(i); + dprintk("item %d node %d weight %d\n", i, node, weights[i]); + bucket->node_weights[node] = weights[i]; + + if (crush_addition_is_unsafe(bucket->h.weight, weights[i])) + goto err; + + bucket->h.weight += weights[i]; + for (j=1; jnode_weights[node], weights[i])) + goto err; + + bucket->node_weights[node] += weights[i]; + dprintk(" node %d weight %d\n", node, bucket->node_weights[node]); + } + } + BUG_ON(bucket->node_weights[bucket->num_nodes/2] != bucket->h.weight); + + return bucket; +err: + free(bucket->node_weights); + free(bucket->h.items); + free(bucket); + return NULL; +} + + + +/* straw bucket */ + +/* + * this code was written 8 years ago. i have a vague recollection of + * drawing boxes underneath bars of different lengths, where the bar + * length represented the probability/weight, and that there was some + * trial and error involved in arriving at this implementation. + * however, reading the code now after all this time, the intuition + * that motivated is lost on me. lame. 
my only excuse is that I now + * know that the approach is fundamentally flawed and am not + * particularly motivated to reconstruct the flawed reasoning. + * + * as best as i can remember, the idea is: sort the weights, and start + * with the smallest. arbitrarily scale it at 1.0 (16-bit fixed + * point). look at the next larger weight, and calculate the scaling + * factor for that straw based on the relative difference in weight so + * far. what's not clear to me now is why we are looking at wnext + * (the delta to the next bigger weight) for all remaining weights, + * and slicing things horizontally instead of considering just the + * next item or set of items. or why pow() is used the way it is. + * + * note that the original version 1 of this function made special + * accommodation for the case where straw lengths were identical. this + * is also flawed in a non-obvious way; version 2 drops the special + * handling and appears to work just as well. + * + * moral of the story: if you do something clever, write down why it + * works. + */ +int crush_calc_straw(struct crush_map *map, struct crush_bucket_straw *bucket) +{ + int *reverse; + int i, j, k; + double straw, wbelow, lastw, wnext, pbelow; + int numleft; + int size = bucket->h.size; + __u32 *weights = bucket->item_weights; + + /* reverse sort by weight (simple insertion sort) */ + reverse = malloc(sizeof(int) * size); + if (!reverse) + return -ENOMEM; + if (size) + reverse[0] = 0; + for (i=1; ij; k--) + reverse[k] = reverse[k-1]; + reverse[j] = i; + break; + } + } + if (j == i) + reverse[i] = i; + } + + numleft = size; + straw = 1.0; + wbelow = 0; + lastw = 0; + + i=0; + while (i < size) { + if (map->straw_calc_version == 0) { + /* zero weight items get 0 length straws! 
*/ + if (weights[reverse[i]] == 0) { + bucket->straws[reverse[i]] = 0; + i++; + continue; + } + + /* set this item's straw */ + bucket->straws[reverse[i]] = straw * 0x10000; + dprintk("item %d at %d weight %d straw %d (%lf)\n", + bucket->h.items[reverse[i]], + reverse[i], weights[reverse[i]], + bucket->straws[reverse[i]], straw); + i++; + if (i == size) + break; + + /* same weight as previous? */ + if (weights[reverse[i]] == weights[reverse[i-1]]) { + dprintk("same as previous\n"); + continue; + } + + /* adjust straw for next guy */ + wbelow += ((double)weights[reverse[i-1]] - lastw) * + numleft; + for (j=i; jstraw_calc_version >= 1) { + /* zero weight items get 0 length straws! */ + if (weights[reverse[i]] == 0) { + bucket->straws[reverse[i]] = 0; + i++; + numleft--; + continue; + } + + /* set this item's straw */ + bucket->straws[reverse[i]] = straw * 0x10000; + dprintk("item %d at %d weight %d straw %d (%lf)\n", + bucket->h.items[reverse[i]], + reverse[i], weights[reverse[i]], + bucket->straws[reverse[i]], straw); + i++; + if (i == size) + break; + + /* adjust straw for next guy */ + wbelow += ((double)weights[reverse[i-1]] - lastw) * + numleft; + numleft--; + wnext = numleft * (weights[reverse[i]] - + weights[reverse[i-1]]); + pbelow = wbelow / (wbelow + wnext); + dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n", + wbelow, wnext, pbelow, numleft); + + straw *= pow((double)1.0 / pbelow, (double)1.0 / + (double)numleft); + + lastw = weights[reverse[i-1]]; + } + } + + free(reverse); + return 0; +} + +struct crush_bucket_straw * +crush_make_straw_bucket(struct crush_map *map, + int hash, + int type, + int size, + int *items, + int *weights) +{ + struct crush_bucket_straw *bucket; + int i; + + bucket = malloc(sizeof(*bucket)); + if (!bucket) + return NULL; + memset(bucket, 0, sizeof(*bucket)); + bucket->h.alg = CRUSH_BUCKET_STRAW; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + bucket->h.items = malloc(sizeof(__s32)*size); + if 
(!bucket->h.items) + goto err; + bucket->item_weights = malloc(sizeof(__u32)*size); + if (!bucket->item_weights) + goto err; + bucket->straws = malloc(sizeof(__u32)*size); + if (!bucket->straws) + goto err; + + bucket->h.weight = 0; + for (i=0; ih.items[i] = items[i]; + bucket->h.weight += weights[i]; + bucket->item_weights[i] = weights[i]; + } + + if (crush_calc_straw(map, bucket) < 0) + goto err; + + return bucket; +err: + free(bucket->straws); + free(bucket->item_weights); + free(bucket->h.items); + free(bucket); + return NULL; +} + +struct crush_bucket_straw2 * +crush_make_straw2_bucket(struct crush_map *map, + int hash, + int type, + int size, + int *items, + int *weights) +{ + struct crush_bucket_straw2 *bucket; + int i; + + bucket = malloc(sizeof(*bucket)); + if (!bucket) + return NULL; + memset(bucket, 0, sizeof(*bucket)); + bucket->h.alg = CRUSH_BUCKET_STRAW2; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + bucket->h.items = malloc(sizeof(__s32)*size); + if (!bucket->h.items) + goto err; + bucket->item_weights = malloc(sizeof(__u32)*size); + if (!bucket->item_weights) + goto err; + + bucket->h.weight = 0; + for (i=0; ih.items[i] = items[i]; + bucket->h.weight += weights[i]; + bucket->item_weights[i] = weights[i]; + } + + return bucket; +err: + free(bucket->item_weights); + free(bucket->h.items); + free(bucket); + return NULL; +} + + + +struct crush_bucket* +crush_make_bucket(struct crush_map *map, + int alg, int hash, int type, int size, + int *items, + int *weights) +{ + int item_weight; + + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + if (size && weights) + item_weight = weights[0]; + else + item_weight = 0; + return (struct crush_bucket *)crush_make_uniform_bucket(hash, type, size, items, item_weight); + + case CRUSH_BUCKET_LIST: + return (struct crush_bucket *)crush_make_list_bucket(hash, type, size, items, weights); + + case CRUSH_BUCKET_TREE: + return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, 
items, weights); + + case CRUSH_BUCKET_STRAW: + return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights); + case CRUSH_BUCKET_STRAW2: + return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights); + } + return 0; +} + + +/************************************************/ + +int crush_add_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item, int weight) +{ + int newsize = bucket->h.size + 1; + void *_realloc = NULL; + + /* In such situation 'CRUSH_BUCKET_UNIFORM', the weight + provided for the item should be the same as + bucket->item_weight defined with 'crush_make_bucket'. This + assumption is enforced by the return value which is always + 0. */ + if (bucket->item_weight != weight) { + return -EINVAL; + } + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + + bucket->h.items[newsize-1] = item; + + if (crush_addition_is_unsafe(bucket->h.weight, weight)) + return -ERANGE; + + bucket->h.weight += weight; + bucket->h.size++; + + return 0; +} + +int crush_add_list_bucket_item(struct crush_bucket_list *bucket, int item, int weight) +{ + int newsize = bucket->h.size + 1; + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + if ((_realloc = realloc(bucket->sum_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->sum_weights = _realloc; + } + + bucket->h.items[newsize-1] = item; + bucket->item_weights[newsize-1] = weight; + if (newsize > 1) { + + if (crush_addition_is_unsafe(bucket->sum_weights[newsize-2], weight)) + return -ERANGE; + + bucket->sum_weights[newsize-1] = bucket->sum_weights[newsize-2] + weight; 
+ } + + else { + bucket->sum_weights[newsize-1] = weight; + } + + bucket->h.weight += weight; + bucket->h.size++; + return 0; +} + +int crush_add_tree_bucket_item(struct crush_bucket_tree *bucket, int item, int weight) +{ + int newsize = bucket->h.size + 1; + int depth = calc_depth(newsize);; + int node; + int j; + void *_realloc = NULL; + + bucket->num_nodes = 1 << depth; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->node_weights, sizeof(__u32)*bucket->num_nodes)) == NULL) { + return -ENOMEM; + } else { + bucket->node_weights = _realloc; + } + + node = crush_calc_tree_node(newsize-1); + bucket->node_weights[node] = weight; + + /* if the depth increase, we need to initialize the new root node's weight before add bucket item */ + int root = bucket->num_nodes/2; + if (depth >= 2 && (node - 1) == root) { + /* if the new item is the first node in right sub tree, so + * the root node initial weight is left sub tree's weight + */ + bucket->node_weights[root] = bucket->node_weights[root/2]; + } + + for (j=1; jnode_weights[node], weight)) + return -ERANGE; + + bucket->node_weights[node] += weight; + dprintk(" node %d weight %d\n", node, bucket->node_weights[node]); + } + + + if (crush_addition_is_unsafe(bucket->h.weight, weight)) + return -ERANGE; + + bucket->h.items[newsize-1] = item; + bucket->h.weight += weight; + bucket->h.size++; + + return 0; +} + +int crush_add_straw_bucket_item(struct crush_map *map, + struct crush_bucket_straw *bucket, + int item, int weight) +{ + int newsize = bucket->h.size + 1; + + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + if ((_realloc = 
realloc(bucket->straws, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->straws = _realloc; + } + + bucket->h.items[newsize-1] = item; + bucket->item_weights[newsize-1] = weight; + + if (crush_addition_is_unsafe(bucket->h.weight, weight)) + return -ERANGE; + + bucket->h.weight += weight; + bucket->h.size++; + + return crush_calc_straw(map, bucket); +} + +int crush_add_straw2_bucket_item(struct crush_map *map, + struct crush_bucket_straw2 *bucket, + int item, int weight) +{ + int newsize = bucket->h.size + 1; + + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + + bucket->h.items[newsize-1] = item; + bucket->item_weights[newsize-1] = weight; + + if (crush_addition_is_unsafe(bucket->h.weight, weight)) + return -ERANGE; + + bucket->h.weight += weight; + bucket->h.size++; + + return 0; +} + +int crush_bucket_add_item(struct crush_map *map, + struct crush_bucket *b, int item, int weight) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight); + case CRUSH_BUCKET_LIST: + return crush_add_list_bucket_item((struct crush_bucket_list *)b, item, weight); + case CRUSH_BUCKET_TREE: + return crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight); + case CRUSH_BUCKET_STRAW: + return crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight); + case CRUSH_BUCKET_STRAW2: + return crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight); + default: + return -1; + } +} + +/************************************************/ + +int crush_remove_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item) +{ + unsigned i, j; + int newsize; + void 
*_realloc = NULL; + + for (i = 0; i < bucket->h.size; i++) + if (bucket->h.items[i] == item) + break; + if (i == bucket->h.size) + return -ENOENT; + + for (j = i; j < bucket->h.size; j++) + bucket->h.items[j] = bucket->h.items[j+1]; + newsize = --bucket->h.size; + if (bucket->item_weight < bucket->h.weight) + bucket->h.weight -= bucket->item_weight; + else + bucket->h.weight = 0; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + return 0; +} + +int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item) +{ + unsigned i, j; + int newsize; + unsigned weight; + + for (i = 0; i < bucket->h.size; i++) + if (bucket->h.items[i] == item) + break; + if (i == bucket->h.size) + return -ENOENT; + + weight = bucket->item_weights[i]; + for (j = i; j < bucket->h.size; j++) { + bucket->h.items[j] = bucket->h.items[j+1]; + bucket->item_weights[j] = bucket->item_weights[j+1]; + bucket->sum_weights[j] = bucket->sum_weights[j+1] - weight; + } + if (weight < bucket->h.weight) + bucket->h.weight -= weight; + else + bucket->h.weight = 0; + newsize = --bucket->h.size; + + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + if ((_realloc = realloc(bucket->sum_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->sum_weights = _realloc; + } + return 0; +} + +int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item) +{ + unsigned i; + unsigned newsize; + + for (i = 0; i < bucket->h.size; i++) { + int node; + unsigned weight; + int j; + int depth = calc_depth(bucket->h.size); + + if (bucket->h.items[i] != item) + continue; + + bucket->h.items[i] = 0; + node = 
crush_calc_tree_node(i); + weight = bucket->node_weights[node]; + bucket->node_weights[node] = 0; + + for (j = 1; j < depth; j++) { + node = parent(node); + bucket->node_weights[node] -= weight; + dprintk(" node %d weight %d\n", node, bucket->node_weights[node]); + } + if (weight < bucket->h.weight) + bucket->h.weight -= weight; + else + bucket->h.weight = 0; + break; + } + if (i == bucket->h.size) + return -ENOENT; + + newsize = bucket->h.size; + while (newsize > 0) { + int node = crush_calc_tree_node(newsize - 1); + if (bucket->node_weights[node]) + break; + --newsize; + } + + if (newsize != bucket->h.size) { + int olddepth, newdepth; + + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + + olddepth = calc_depth(bucket->h.size); + newdepth = calc_depth(newsize); + if (olddepth != newdepth) { + bucket->num_nodes = 1 << newdepth; + if ((_realloc = realloc(bucket->node_weights, + sizeof(__u32)*bucket->num_nodes)) == NULL) { + return -ENOMEM; + } else { + bucket->node_weights = _realloc; + } + } + + bucket->h.size = newsize; + } + return 0; +} + +int crush_remove_straw_bucket_item(struct crush_map *map, + struct crush_bucket_straw *bucket, int item) +{ + int newsize = bucket->h.size - 1; + unsigned i, j; + + for (i = 0; i < bucket->h.size; i++) { + if (bucket->h.items[i] == item) { + if (bucket->item_weights[i] < bucket->h.weight) + bucket->h.weight -= bucket->item_weights[i]; + else + bucket->h.weight = 0; + for (j = i; j < bucket->h.size - 1; j++) { + bucket->h.items[j] = bucket->h.items[j+1]; + bucket->item_weights[j] = bucket->item_weights[j+1]; + } + break; + } + } + if (i == bucket->h.size) + return -ENOENT; + bucket->h.size--; + if (bucket->h.size == 0) { + /* don't bother reallocating */ + return 0; + } + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + 
bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + if ((_realloc = realloc(bucket->straws, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->straws = _realloc; + } + + return crush_calc_straw(map, bucket); +} + +int crush_remove_straw2_bucket_item(struct crush_map *map, + struct crush_bucket_straw2 *bucket, int item) +{ + int newsize = bucket->h.size - 1; + unsigned i, j; + + for (i = 0; i < bucket->h.size; i++) { + if (bucket->h.items[i] == item) { + if (bucket->item_weights[i] < bucket->h.weight) + bucket->h.weight -= bucket->item_weights[i]; + else + bucket->h.weight = 0; + for (j = i; j < bucket->h.size - 1; j++) { + bucket->h.items[j] = bucket->h.items[j+1]; + bucket->item_weights[j] = bucket->item_weights[j+1]; + } + break; + } + } + if (i == bucket->h.size) + return -ENOENT; + + bucket->h.size--; + if (!newsize) { + /* don't bother reallocating a 0-length array. 
*/ + return 0; + } + + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->item_weights = _realloc; + } + + return 0; +} + +int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *b, int item) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return crush_remove_uniform_bucket_item((struct crush_bucket_uniform *)b, item); + case CRUSH_BUCKET_LIST: + return crush_remove_list_bucket_item((struct crush_bucket_list *)b, item); + case CRUSH_BUCKET_TREE: + return crush_remove_tree_bucket_item((struct crush_bucket_tree *)b, item); + case CRUSH_BUCKET_STRAW: + return crush_remove_straw_bucket_item(map, (struct crush_bucket_straw *)b, item); + case CRUSH_BUCKET_STRAW2: + return crush_remove_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item); + default: + return -1; + } +} + + +/************************************************/ + +int crush_adjust_uniform_bucket_item_weight(struct crush_bucket_uniform *bucket, int item, int weight) +{ + int diff = (weight - bucket->item_weight) * bucket->h.size; + + bucket->item_weight = weight; + bucket->h.weight = bucket->item_weight * bucket->h.size; + + return diff; +} + +int crush_adjust_list_bucket_item_weight(struct crush_bucket_list *bucket, int item, int weight) +{ + int diff; + unsigned i, j; + + for (i = 0; i < bucket->h.size; i++) { + if (bucket->h.items[i] == item) + break; + } + if (i == bucket->h.size) + return 0; + + diff = weight - bucket->item_weights[i]; + bucket->item_weights[i] = weight; + bucket->h.weight += diff; + + for (j = i; j < bucket->h.size; j++) + bucket->sum_weights[j] += diff; + + return diff; +} + +int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree *bucket, int item, int weight) +{ + int diff; + int node; + unsigned i, j; + 
unsigned depth = calc_depth(bucket->h.size); + + for (i = 0; i < bucket->h.size; i++) { + if (bucket->h.items[i] == item) + break; + } + if (i == bucket->h.size) + return 0; + + node = crush_calc_tree_node(i); + diff = weight - bucket->node_weights[node]; + bucket->node_weights[node] = weight; + bucket->h.weight += diff; + + /* apply the same weight change to each ancestor of the item's node */ + for (j=1; j<depth; j++) { + node = parent(node); + bucket->node_weights[node] += diff; + } + + return diff; +} + +int crush_adjust_straw_bucket_item_weight(struct crush_map *map, + struct crush_bucket_straw *bucket, + int item, int weight) +{ + unsigned idx; + int diff; + int r; + + for (idx = 0; idx < bucket->h.size; idx++) + if (bucket->h.items[idx] == item) + break; + if (idx == bucket->h.size) + return 0; + + diff = weight - bucket->item_weights[idx]; + bucket->item_weights[idx] = weight; + bucket->h.weight += diff; + + r = crush_calc_straw(map, bucket); + if (r < 0) + return r; + + return diff; +} + +int crush_adjust_straw2_bucket_item_weight(struct crush_map *map, + struct crush_bucket_straw2 *bucket, + int item, int weight) +{ + unsigned idx; + int diff; + + for (idx = 0; idx < bucket->h.size; idx++) + if (bucket->h.items[idx] == item) + break; + if (idx == bucket->h.size) + return 0; + + diff = weight - bucket->item_weights[idx]; + bucket->item_weights[idx] = weight; + bucket->h.weight += diff; + + return diff; +} + +int crush_bucket_adjust_item_weight(struct crush_map *map, + struct crush_bucket *b, + int item, int weight) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return crush_adjust_uniform_bucket_item_weight((struct crush_bucket_uniform *)b, + item, weight); + case CRUSH_BUCKET_LIST: + return crush_adjust_list_bucket_item_weight((struct crush_bucket_list *)b, + item, weight); + case CRUSH_BUCKET_TREE: + return crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree *)b, + item, weight); + case CRUSH_BUCKET_STRAW: + return crush_adjust_straw_bucket_item_weight(map, + (struct crush_bucket_straw *)b, + item, weight); + case CRUSH_BUCKET_STRAW2: + return 
crush_adjust_straw2_bucket_item_weight(map, + (struct crush_bucket_straw2 *)b, + item, weight); + default: + return -1; + } +} + +/************************************************/ + +static int crush_reweight_uniform_bucket(struct crush_map *map, struct crush_bucket_uniform *bucket) +{ + unsigned i; + unsigned sum = 0, n = 0, leaves = 0; + + for (i = 0; i < bucket->h.size; i++) { + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = map->buckets[-1-id]; + crush_reweight_bucket(map, c); + + if (crush_addition_is_unsafe(sum, c->weight)) + return -ERANGE; + + sum += c->weight; + n++; + } else { + leaves++; + } + } + + if (n > leaves) + bucket->item_weight = sum / n; // more bucket children than leaves, average! + bucket->h.weight = bucket->item_weight * bucket->h.size; + + return 0; +} + +static int crush_reweight_list_bucket(struct crush_map *map, struct crush_bucket_list *bucket) +{ + unsigned i; + + bucket->h.weight = 0; + for (i = 0; i < bucket->h.size; i++) { + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = map->buckets[-1-id]; + crush_reweight_bucket(map, c); + bucket->item_weights[i] = c->weight; + } + + if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i])) + return -ERANGE; + + bucket->h.weight += bucket->item_weights[i]; + } + + return 0; +} + +static int crush_reweight_tree_bucket(struct crush_map *map, struct crush_bucket_tree *bucket) +{ + unsigned i; + + bucket->h.weight = 0; + for (i = 0; i < bucket->h.size; i++) { + int node = crush_calc_tree_node(i); + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = map->buckets[-1-id]; + crush_reweight_bucket(map, c); + bucket->node_weights[node] = c->weight; + } + + if (crush_addition_is_unsafe(bucket->h.weight, bucket->node_weights[node])) + return -ERANGE; + + bucket->h.weight += bucket->node_weights[node]; + + + } + + return 0; +} + +static int crush_reweight_straw_bucket(struct crush_map *map, struct crush_bucket_straw 
*bucket) +{ + unsigned i; + + bucket->h.weight = 0; + for (i = 0; i < bucket->h.size; i++) { + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = map->buckets[-1-id]; + crush_reweight_bucket(map, c); + bucket->item_weights[i] = c->weight; + } + + if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i])) + return -ERANGE; + + bucket->h.weight += bucket->item_weights[i]; + } + crush_calc_straw(map, bucket); + + return 0; +} + +static int crush_reweight_straw2_bucket(struct crush_map *map, struct crush_bucket_straw2 *bucket) +{ + unsigned i; + + bucket->h.weight = 0; + for (i = 0; i < bucket->h.size; i++) { + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = map->buckets[-1-id]; + crush_reweight_bucket(map, c); + bucket->item_weights[i] = c->weight; + } + + if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i])) + return -ERANGE; + + bucket->h.weight += bucket->item_weights[i]; + } + + return 0; +} + +int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *b) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return crush_reweight_uniform_bucket(map, (struct crush_bucket_uniform *)b); + case CRUSH_BUCKET_LIST: + return crush_reweight_list_bucket(map, (struct crush_bucket_list *)b); + case CRUSH_BUCKET_TREE: + return crush_reweight_tree_bucket(map, (struct crush_bucket_tree *)b); + case CRUSH_BUCKET_STRAW: + return crush_reweight_straw_bucket(map, (struct crush_bucket_straw *)b); + case CRUSH_BUCKET_STRAW2: + return crush_reweight_straw2_bucket(map, (struct crush_bucket_straw2 *)b); + default: + return -1; + } +} + +struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions) +{ + int b; + int sum_bucket_size = 0; + int bucket_count = 0; + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == 0) + continue; + sum_bucket_size += map->buckets[b]->size; + bucket_count++; + } + dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n", + 
sum_bucket_size, map->max_buckets, bucket_count); + /* one allocation holds, in order: the per-bucket args, the weight sets, the weights and the ids */ + int size = (sizeof(struct crush_choose_arg) * map->max_buckets + + sizeof(struct crush_weight_set) * bucket_count * num_positions + + sizeof(__u32) * sum_bucket_size * num_positions + // weights + sizeof(__s32) * sum_bucket_size); // ids + char *space = malloc(size); + if (space == NULL) + return NULL; /* allocation failed: nothing to lay out, and the code below would write through NULL */ + struct crush_choose_arg *arg = (struct crush_choose_arg *)space; + struct crush_weight_set *weight_set = (struct crush_weight_set *)(arg + map->max_buckets); + __u32 *weights = (__u32 *)(weight_set + bucket_count * num_positions); + char *weight_set_ends __attribute__((unused)) = (char*)weights; + __s32 *ids = (__s32 *)(weights + sum_bucket_size * num_positions); + char *weights_end __attribute__((unused)) = (char *)ids; + char *ids_end __attribute__((unused)) = (char *)(ids + sum_bucket_size); + BUG_ON(space + size != ids_end); + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == 0) { + memset(&arg[b], '\0', sizeof(struct crush_choose_arg)); + continue; + } + struct crush_bucket_straw2 *bucket = (struct crush_bucket_straw2 *)map->buckets[b]; + + int position; + for (position = 0; position < num_positions; position++) { + memcpy(weights, bucket->item_weights, sizeof(__u32) * bucket->h.size); + weight_set[position].weights = weights; + weight_set[position].size = bucket->h.size; + dprintk("moving weight %d bytes forward\n", (int)((weights + bucket->h.size) - weights)); + weights += bucket->h.size; + } + arg[b].weight_set = weight_set; + arg[b].weight_set_positions = num_positions; + weight_set += position; + + memcpy(ids, bucket->h.items, sizeof(__s32) * bucket->h.size); + arg[b].ids = ids; + arg[b].ids_size = bucket->h.size; + ids += bucket->h.size; + } + BUG_ON((char*)weight_set_ends != (char*)weight_set); + BUG_ON((char*)weights_end != (char*)weights); + BUG_ON((char*)ids != (char*)ids_end); + return arg; +} + +void crush_destroy_choose_args(struct crush_choose_arg *args) +{ + free(args); +} + +/***************************/ + +/* 
methods to check for safe arithmetic operations */ + +int crush_addition_is_unsafe(__u32 a, __u32 b) +{ + if ((((__u32)(-1)) - b) < a) + return 1; + else + return 0; +} + +int crush_multiplication_is_unsafe(__u32 a, __u32 b) +{ + /* prevent division by zero */ + if (!a) + return 0; + if (!b) + return 1; + if ((((__u32)(-1)) / b) < a) + return 1; + else + return 0; +} + +/***************************/ + +/* methods to configure crush_map */ + +void set_legacy_crush_map(struct crush_map *map) { + /* initialize legacy tunable values */ + map->choose_local_tries = 2; + map->choose_local_fallback_tries = 5; + map->choose_total_tries = 19; + map->chooseleaf_descend_once = 0; + map->chooseleaf_vary_r = 0; + map->chooseleaf_stable = 0; + map->straw_calc_version = 0; + + // by default, use legacy types, and also exclude tree, + // since it was buggy. + map->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; +} + +void set_optimal_crush_map(struct crush_map *map) { + map->choose_local_tries = 0; + map->choose_local_fallback_tries = 0; + map->choose_total_tries = 50; + map->chooseleaf_descend_once = 1; + map->chooseleaf_vary_r = 1; + map->chooseleaf_stable = 1; + map->allowed_bucket_algs = ( + (1 << CRUSH_BUCKET_UNIFORM) | + (1 << CRUSH_BUCKET_LIST) | + (1 << CRUSH_BUCKET_STRAW) | + (1 << CRUSH_BUCKET_STRAW2)); +} diff --git a/src/crush/builder.h b/src/crush/builder.h new file mode 100644 index 00000000..bdf0a4b9 --- /dev/null +++ b/src/crush/builder.h @@ -0,0 +1,344 @@ +#ifndef CEPH_CRUSH_BUILDER_H +#define CEPH_CRUSH_BUILDER_H + +#include "include/int_types.h" + +struct crush_bucket; +struct crush_choose_arg; +struct crush_map; +struct crush_rule; + +/** @ingroup API + * + * Allocate a crush_map with __malloc(3)__ and initialize it. The + * caller is responsible for deallocating the crush_map with + * crush_destroy(). + * + * The content of the allocated crush_map is set with + * set_optimal_crush_map(). 
The caller is responsible for setting each + * tunable in the __crush_map__ for backward compatibility or mapping + * stability. + * + * @returns a pointer to the newly created crush_map or NULL + */ +extern struct crush_map *crush_create(); +/** @ingroup API + * + * Analyze the content of __map__ and set the internal values required + * before it can be used to map values with crush_do_rule(). The caller + * must make sure it is run before crush_do_rule() and after any + * function that modifies the __map__ (crush_add_bucket(), etc.). + * + * @param map the crush_map + */ +extern void crush_finalize(struct crush_map *map); + +/* rules */ +/** @ingroup API + * + * Allocate an empty crush_rule structure large enough to store __len__ steps. + * Steps can be added to a rule via crush_rule_set_step(). The __ruleset__ + * is a user defined integer, not used by __libcrush__ and stored in + * the allocated rule at __rule->mask.ruleset__. + * + * The rule is designed to allow crush_do_rule() to get at least __minsize__ items + * and at most __maxsize__ items. + * + * The __type__ is defined by the caller and will be used by + * crush_find_rule() when looking for a rule and by + * __CRUSH_RULE_CHOOSE*__ steps when looking for items. + * + * The caller is responsible for deallocating the returned pointer via + * crush_destroy_rule(). + * + * If __malloc(3)__ fails, return NULL. + * + * @param len number of steps in the rule + * @param ruleset user defined value + * @param type user defined value + * @param minsize minimum number of items the rule can map + * @param maxsize maximum number of items the rule can map + * + * @returns a pointer to the newly created rule or NULL + */ +extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize); +/** @ingroup API + * + * Set the __pos__ step of the __rule__ to an operand and up to two arguments. 
+ * The value of the operand __op__ determines if the arguments are used and how: + * + * - __CRUSH_RULE_NOOP__ do nothing. + * - __CRUSH_RULE_TAKE__ select the __arg1__ item + * - __CRUSH_RULE_EMIT__ append the selection to the results and clear + * the selection + * + * - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__ + * recursively explore each bucket currently selected, looking for + * __arg1__ items of type __arg2__ and select them. + * - __CRUSH_RULE_CHOOSELEAF_FIRSTN__ and __CRUSH_RULE_CHOOSELEAF_INDEP__ + * recursively explore each bucket currently selected, looking for + * __arg1__ leaves within all the buckets of type __arg2__ and + * select them. + * + * In all __CHOOSE__ steps, if __arg1__ is less than or equal to zero, + * the number of items to select is equal to the __max_result__ argument + * of crush_do_rule() minus the absolute value of __arg1__ (that is, + * __max_result__ + __arg1__). It is common to set __arg1__ to zero + * to select as many items as requested by __max_result__. + * + * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ + * + * The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of + * a given type, randomly selecting them. If they are unlucky and + * find the same bucket twice, they will try N+1 times (N being the + * value of the choose_total_tries tunable). If there is a previous + * SET_CHOOSE_TRIES step in the same rule, they will try C times + * instead (C being the value of the argument of the + * SET_CHOOSE_TRIES step). + * + * Note: the __choose_total_tries__ tunable defined in crush_map is + * the number of retries, not the number of tries. The number of tries + * is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the + * number of tries and does not need the + 1. This confusing + * difference is inherited from an off-by-one bug from years ago. 
+ * + * The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule step do the same + * as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore + * each bucket found, looking for a single device. The same device + * may be found in two different buckets because the crush map is + * not a strict hierarchy, it is a DAG. When such a collision + * happens, they will try again. The number of times they try to + * find a non colliding device is: + * + * - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule + * step: try N + 1 times (N being the value of the + * __choose_total_tries__ tunable defined in crush_map) + * + * - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule + * step: try P times (P being the value of the argument of the + * SET_CHOOSELEAF_TRIES rule step) + * + * - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule + * step: try 1 time. + * + * - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try + * P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES + * rule step) + * + * @param rule the rule in which the step is inserted + * @param pos the zero based step index + * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__ + * @param arg1 first argument for __op__ + * @param arg2 second argument for __op__ + */ +extern void crush_rule_set_step(struct crush_rule *rule, int pos, int op, int arg1, int arg2); +/** @ingroup API + * + * Add the __rule__ into the crush __map__ and assign it the + * __ruleno__ unique identifier. If __ruleno__ is -1, the function will + * assign the lowest available identifier. The __ruleno__ value must be + * a positive integer lower than __CRUSH_MAX_RULES__. 
+ * + * - return -ENOSPC if the rule identifier is >= __CRUSH_MAX_RULES__ + * - return -ENOMEM if __realloc(3)__ fails to expand the array of + * rules in the __map__ + * + * @param map the crush_map + * @param rule the rule to add to the __map__ + * @param ruleno a positive integer < __CRUSH_MAX_RULES__ or -1 + * + * @returns the rule unique identifier on success, < 0 on error + */ +extern int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno); + +/* buckets */ +extern int crush_get_next_bucket_id(struct crush_map *map); +/** @ingroup API + * + * Add __bucket__ into the crush __map__ and assign it the + * __bucketno__ unique identifier. If __bucketno__ is 0, the function + * will assign the lowest available identifier. The bucket identifier + * must be a negative integer. The bucket identifier is returned via + * __idout__. + * + * - return -ENOMEM if __realloc(3)__ fails to expand the array of + * buckets in the __map__ + * - return -EEXIST if the __bucketno__ identifier is already assigned + * to another bucket. + * + * @param[in] map the crush_map + * @param[in] bucketno the bucket unique identifier or 0 + * @param[in] bucket the bucket to add to the __map__ + * @param[out] idout a pointer to the bucket identifier + * + * @returns 0 on success, < 0 on error + */ +extern int crush_add_bucket(struct crush_map *map, + int bucketno, + struct crush_bucket *bucket, int *idout); +/** @ingroup API + * + * Allocate a crush_bucket with __malloc(3)__ and initialize it. The + * content of the bucket is filled with __size__ items from + * __items__. The item selection is set to use __alg__ which is one of + * ::CRUSH_BUCKET_UNIFORM , ::CRUSH_BUCKET_LIST or + * ::CRUSH_BUCKET_STRAW2. The initial __items__ are assigned a + * weight from the __weights__ array, depending on the value of + * __alg__. 
If __alg__ is ::CRUSH_BUCKET_UNIFORM, all items are set + * to have a weight equal to __weights[0]__, otherwise the weight of + * __items[x]__ is set to be the value of __weights[x]__. + * + * The caller is responsible for deallocating the returned pointer via + * crush_destroy_bucket(). + * + * @param map __unused__ + * @param alg algorithm for item selection + * @param hash always set to CRUSH_HASH_RJENKINS1 + * @param type user defined bucket type + * @param size of the __items__ array + * @param items array of __size__ items + * @param weights the weight of each item in __items__, depending on __alg__ + * + * @returns a pointer to the newly created bucket or NULL + */ +struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights); +extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions); +extern void crush_destroy_choose_args(struct crush_choose_arg *args); +/** @ingroup API + * + * Add __item__ to __bucket__ with __weight__. The weight of the new + * item is added to the weight of the bucket so that it reflects + * the total weight of all items. + * + * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM, the value of __weight__ must be equal to + * __(struct crush_bucket_uniform *)bucket->item_weight__. + * + * - return -ENOMEM if the __bucket__ cannot be resized with __realloc(3)__. + * - return -ERANGE if adding __weight__ to the weight of the bucket overflows. + * - return -EINVAL if __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM and + * the __weight__ is not equal to __(struct crush_bucket_uniform *)bucket->item_weight__. + * - return -1 if the value of __bucket->alg__ is unknown. 
+ * + * @returns 0 on success, < 0 on error + */ +extern int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *bucket, int item, int weight); +/** @ingroup API + * + * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM, + * __(struct crush_bucket_uniform *)bucket->item_weight__ is set to __weight__ and the + * weight of the bucket is set to be the number of items in the bucket times the weight. + * The return value is the difference between the new bucket weight and the former + * bucket weight. The __item__ argument is ignored. + * + * If __bucket->alg__ is different from ::CRUSH_BUCKET_UNIFORM, + * set the __weight__ of __item__ in __bucket__. The former weight of the + * item is subtracted from the weight of the bucket and the new weight is added. + * The return value is the difference between the new item weight and the former + * item weight. + * + * @returns the difference between the new weight and the former weight + */ +extern int crush_bucket_adjust_item_weight(struct crush_map *map, struct crush_bucket *bucket, int item, int weight); +/** @ingroup API + * + * Recursively update the weight of __bucket__ and its children, deep + * first. The __bucket__ weight is set to the sum of the weight of the + * items it contains. + * + * - return -ERANGE if the sum of the weight of the items in __bucket__ overflows. + * - return -1 if the value of __bucket->alg__ is unknown. + * + * @param map a crush_map containing __bucket__ + * @param bucket the root of the tree to reweight + * @returns 0 on success, < 0 on error + */ +extern int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *bucket); +/** @ingroup API + * + * Remove __bucket__ from __map__ and deallocate it via crush_destroy_bucket(). + * __assert(3)__ that __bucket__ is in __map__. The caller is responsible for + * making sure the bucket is not the child of any other bucket in the __map__. 
+ * + * @param map a crush_map containing __bucket__ + * @param bucket the bucket to remove from __map__ + * @returns 0 + */ +extern int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket); +/** @ingroup API + * + * Remove __item__ from __bucket__ and subtract the item weight from + * the bucket weight. If the weight of the item is greater than the + * weight of the bucket, silently set the bucket weight to zero. + * + * - return -ENOMEM if the __bucket__ cannot be sized down with __realloc(3)__. + * - return -1 if the value of __bucket->alg__ is unknown. + * + * @param map __unused__ + * @param bucket the bucket from which __item__ is removed + * @param item the item to remove from __bucket__ + * @returns 0 on success, < 0 on error + */ +extern int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *bucket, int item); + +struct crush_bucket_uniform * +crush_make_uniform_bucket(int hash, int type, int size, + int *items, + int item_weight); +struct crush_bucket_list* +crush_make_list_bucket(int hash, int type, int size, + int *items, + int *weights); +struct crush_bucket_tree* +crush_make_tree_bucket(int hash, int type, int size, + int *items, /* in leaf order */ + int *weights); +struct crush_bucket_straw * +crush_make_straw_bucket(struct crush_map *map, + int hash, int type, int size, + int *items, + int *weights); + +extern int crush_addition_is_unsafe(__u32 a, __u32 b); +extern int crush_multiplication_is_unsafe(__u32 a, __u32 b); + +/** @ingroup API + * + * Set the __map__ tunables to implement the most ancient behavior, + * for backward compatibility purposes only. + * + * - choose_local_tries == 2 + * - choose_local_fallback_tries == 5 + * - choose_total_tries == 19 + * - chooseleaf_descend_once == 0 + * - chooseleaf_vary_r == 0 + * - straw_calc_version == 0 + * - chooseleaf_stable = 0 + * + * See the __crush_map__ documentation for more information about + * each tunable. 
+ * + * @param map a crush_map + */ +extern void set_legacy_crush_map(struct crush_map *map); +/** @ingroup API + * + * Set the __map__ tunables to implement the optimal behavior. These + * are the values set by crush_create(). It does not guarantee a + * stable mapping after an upgrade. + * + * For instance when a bug is fixed it may significantly change the + * mapping. In that case a new tunable (say tunable_new) is added so + * the caller can control when the bug fix is activated. The + * set_optimal_crush_map() function will always set all tunables, + * including tunable_new, to fix all bugs even if it means changing + * the mapping. If the caller needs fine grained control on the + * tunables to upgrade to a new version without changing the mapping, + * it needs to set the __crush_map__ tunables individually. + * + * See the __crush_map__ documentation for more information about + * each tunable. + * + * @param map a crush_map + */ +extern void set_optimal_crush_map(struct crush_map *map); + +#endif diff --git a/src/crush/crush.c b/src/crush/crush.c new file mode 100644 index 00000000..5bf94c04 --- /dev/null +++ b/src/crush/crush.c @@ -0,0 +1,137 @@ +#ifdef __KERNEL__ +# include +# include +#else +# include "crush_compat.h" +# include "crush.h" +#endif + +const char *crush_bucket_alg_name(int alg) +{ + switch (alg) { + case CRUSH_BUCKET_UNIFORM: return "uniform"; + case CRUSH_BUCKET_LIST: return "list"; + case CRUSH_BUCKET_TREE: return "tree"; + case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_STRAW2: return "straw2"; + default: return "unknown"; + } +} + +/** + * crush_get_bucket_item_weight - Get weight of an item in given bucket + * @b: bucket pointer + * @p: item index in bucket + */ +int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) +{ + if ((__u32)p >= b->size) + return 0; + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return ((struct crush_bucket_uniform *)b)->item_weight; + case CRUSH_BUCKET_LIST: + return 
((struct crush_bucket_list *)b)->item_weights[p]; + case CRUSH_BUCKET_TREE: + return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; + case CRUSH_BUCKET_STRAW: + return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_STRAW2: + return ((struct crush_bucket_straw2 *)b)->item_weights[p]; + } + return 0; +} + +void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) +{ + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_list(struct crush_bucket_list *b) +{ + kfree(b->item_weights); + kfree(b->sum_weights); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_tree(struct crush_bucket_tree *b) +{ + kfree(b->h.items); + kfree(b->node_weights); + kfree(b); +} + +void crush_destroy_bucket_straw(struct crush_bucket_straw *b) +{ + kfree(b->straws); + kfree(b->item_weights); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) +{ + kfree(b->item_weights); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket(struct crush_bucket *b) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); + break; + case CRUSH_BUCKET_LIST: + crush_destroy_bucket_list((struct crush_bucket_list *)b); + break; + case CRUSH_BUCKET_TREE: + crush_destroy_bucket_tree((struct crush_bucket_tree *)b); + break; + case CRUSH_BUCKET_STRAW: + crush_destroy_bucket_straw((struct crush_bucket_straw *)b); + break; + case CRUSH_BUCKET_STRAW2: + crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); + break; + } +} + +/** + * crush_destroy - Destroy a crush_map + * @map: crush_map pointer + */ +void crush_destroy(struct crush_map *map) +{ + /* buckets */ + if (map->buckets) { + __s32 b; + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + crush_destroy_bucket(map->buckets[b]); + } + kfree(map->buckets); + } + + /* rules */ + if (map->rules) { + __u32 b; + for (b = 0; b < 
map->max_rules; b++) + crush_destroy_rule(map->rules[b]); + kfree(map->rules); + } + +#ifndef __KERNEL__ + kfree(map->choose_tries); +#endif + kfree(map); +} + +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} diff --git a/src/crush/crush.h b/src/crush/crush.h new file mode 100644 index 00000000..dd08aa7b --- /dev/null +++ b/src/crush/crush.h @@ -0,0 +1,549 @@ +#ifndef CEPH_CRUSH_CRUSH_H +#define CEPH_CRUSH_CRUSH_H + +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif + +/* + * CRUSH is a pseudo-random data distribution algorithm that + * efficiently distributes input values (typically, data objects) + * across a heterogeneous, structured storage cluster. + * + * The algorithm was originally described in detail in this paper + * (although the algorithm has evolved somewhat since then): + * + * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf + * + * LGPL2.1 + */ + + +#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ + +#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */ +#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */ + +#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u) +#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u) + +#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ +/** @ingroup API + * The equivalent of NULL for an item, i.e. the absence of an item. + */ +#define CRUSH_ITEM_NONE 0x7fffffff + +/* + * CRUSH uses user-defined "rules" to describe how inputs should be + * mapped to devices. A rule consists of sequence of steps to perform + * to generate the set of output devices. + */ +struct crush_rule_step { + __u32 op; + __s32 arg1; + __s32 arg2; +}; + +/** @ingroup API + */ +enum crush_opcodes { + /*! 
do nothing + */ + CRUSH_RULE_NOOP = 0, + CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ + CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ + /* arg2 = type */ + CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ + CRUSH_RULE_EMIT = 4, /* no args */ + CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, + CRUSH_RULE_CHOOSELEAF_INDEP = 7, + + CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ + CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ + CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, + CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, + CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 +}; + +/* + * for specifying choose num (arg1) relative to the max parameter + * passed to do_rule + */ +#define CRUSH_CHOOSE_N 0 +#define CRUSH_CHOOSE_N_MINUS(x) (-(x)) + +/* + * The rule mask is used to describe what the rule is intended for. + * Given a ruleset and size of output set, we search through the + * rule list for a matching rule_mask. + */ +struct crush_rule_mask { + __u8 ruleset; + __u8 type; + __u8 min_size; + __u8 max_size; +}; + +struct crush_rule { + __u32 len; + struct crush_rule_mask mask; + struct crush_rule_step steps[0]; +}; + +#define crush_rule_size(len) (sizeof(struct crush_rule) + \ + (len)*sizeof(struct crush_rule_step)) + + + +/* + * A bucket is a named container of other items (either devices or + * other buckets). + */ + +/** @ingroup API + * + * Items within a bucket are chosen with crush_do_rule() using one of + * three algorithms representing a tradeoff between performance and + * reorganization efficiency. If you are unsure of which bucket type + * to use, we recommend using ::CRUSH_BUCKET_STRAW2. + * + * The table summarizes how the speed of each option measures up + * against mapping stability when items are added or removed. 
+ * + * Bucket Alg Speed Additions Removals + * ------------------------------------------------ + * uniform O(1) poor poor + * list O(n) optimal poor + * straw2 O(n) optimal optimal + */ +enum crush_algorithm { + /*! + * Devices are rarely added individually in a large system. + * Instead, new storage is typically deployed in blocks of identical + * devices, often as an additional shelf in a server rack or perhaps + * an entire cabinet. Devices reaching their end of life are often + * similarly decommissioned as a set (individual failures aside), + * making it natural to treat them as a unit. CRUSH uniform buckets + * are used to represent an identical set of devices in such + * circumstances. The key advantage in doing so is performance + * related: CRUSH can map replicas into uniform buckets in constant + * time. In cases where the uniformity restrictions are not + * appropriate, other bucket types can be used. If the size of a + * uniform bucket changes, there is a complete reshuffling of data + * between devices, much like conventional hash-based distribution + * strategies. + */ + CRUSH_BUCKET_UNIFORM = 1, + /*! + * List buckets structure their contents as a linked list, and + * can contain items with arbitrary weights. To place a + * replica, CRUSH begins at the head of the list with the most + * recently added item and compares its weight to the sum of + * all remaining items' weights. Depending on the value of + * hash( x , r , item), either the current item is chosen with + * the appropriate probability, or the process continues + * recursively down the list. This is a natural and intuitive + * choice for an expanding cluster: either an object is + * relocated to the newest device with some appropriate + * probability, or it remains on the older devices as before. + * The result is optimal data migration when items are added + * to the bucket. 
Items removed from the middle or tail of the + * list, however, can result in a significant amount of + * unnecessary movement, making list buckets most suitable for + * circumstances in which they never (or very rarely) shrink. + */ + CRUSH_BUCKET_LIST = 2, + /*! @cond INTERNAL */ + CRUSH_BUCKET_TREE = 3, + CRUSH_BUCKET_STRAW = 4, + /*! @endcond */ + /*! + * List and tree buckets are structured such that a limited + * number of hash values need to be calculated and compared to + * weights in order to select a bucket item. In doing so, + * they divide and conquer in a way that either gives certain + * items precedence (e.g., those at the beginning of a list) + * or obviates the need to consider entire subtrees of items + * at all. That improves the performance of the replica + * placement process, but can also introduce suboptimal + * reorganization behavior when the contents of a bucket + * change due to an addition, removal, or re-weighting of an + * item. + * + * The straw2 bucket type allows all items to fairly "compete" + * against each other for replica placement through a process + * analogous to a draw of straws. To place a replica, a straw + * of random length is drawn for each item in the bucket. The + * item with the longest straw wins. The length of each straw + * is initially a value in a fixed range. Each straw length + * is scaled by a factor based on the item's weight so that + * heavily weighted items are more likely to win the draw. + * Although this process is almost twice as slow (on average) + * as a list bucket and even slower than a tree bucket + * (which scales logarithmically), straw2 buckets result in + * optimal data movement between nested items when modified. + */ + CRUSH_BUCKET_STRAW2 = 5, +}; +extern const char *crush_bucket_alg_name(int alg); + +/* + * although tree was a legacy algorithm, it has been buggy, so + * exclude it. 
+ */ +#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \ + (1 << CRUSH_BUCKET_UNIFORM) | \ + (1 << CRUSH_BUCKET_LIST) | \ + (1 << CRUSH_BUCKET_STRAW)) + +/** @ingroup API + * + * A bucket contains __size__ __items__ which are either positive + * numbers or negative numbers that reference other buckets and is + * uniquely identified with __id__ which is a negative number. The + * __weight__ of a bucket is the cumulative weight of all its + * children. A bucket is assigned a ::crush_algorithm that is used by + * crush_do_rule() to draw an item depending on its weight. A bucket + * can be assigned a strictly positive (> 0) __type__ defined by the + * caller. The __type__ can be used by crush_do_rule(), when it is + * given as an argument of a rule step. + * + * A pointer to crush_bucket can safely be cast into the following + * structure, depending on the value of __alg__: + * + * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform + * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list + * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2 + * + * The weight of each item depends on the algorithm and the + * information about it is available in the corresponding structure + * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2). + * + * See crush_map for more information on how __id__ is used + * to reference the bucket. + */ +struct crush_bucket { + __s32 id; /*!< bucket identifier, < 0 and unique within a crush_map */ + __u16 type; /*!< > 0 bucket type, defined by the caller */ + __u8 alg; /*!< the item selection ::crush_algorithm */ + /*! @cond INTERNAL */ + __u8 hash; /* which hash function to use, CRUSH_HASH_* */ + /*! @endcond */ + __u32 weight; /*!< 16.16 fixed point cumulated children weight */ + __u32 size; /*!< size of the __items__ array */ + __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */ +}; + +/** @ingroup API + * + * Replacement weights for each item in a bucket. 
The size of the + * array must be exactly the size of the straw2 bucket, just as the + * item_weights array. + * + */ +struct crush_weight_set { + __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */ + __u32 size; /*!< size of the __weights__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for a given straw2 bucket, for + * placement purposes. + * + * When crush_do_rule() chooses the Nth item from a straw2 bucket, the + * replacement weights found at __weight_set[N]__ are used instead of + * the weights from __item_weights__. If __N__ is greater than or equal to + * __weight_set_positions__, the weights found at __weight_set_positions-1__ are + * used instead. For instance if __weight_set__ is: + * + * [ [ 0x10000, 0x20000 ], // position 0 + * [ 0x20000, 0x40000 ] ] // position 1 + * + * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ] + * choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ] + * choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ] + * etc. + * + */ +struct crush_choose_arg { + __s32 *ids; /*!< values to use instead of items */ + __u32 ids_size; /*!< size of the __ids__ array */ + struct crush_weight_set *weight_set; /*!< weight replacements for a given position */ + __u32 weight_set_positions; /*!< size of the __weight_set__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for each bucket in the crushmap. The + * __size__ of the __args__ array must be exactly the same as the + * __map->max_buckets__. + * + * The __crush_choose_arg__ at index N will be used when choosing + * an item from the __map->buckets[N]__ bucket, provided it + * is a straw2 bucket. 
+ * + */ +struct crush_choose_arg_map { + struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */ + __u32 size; /*!< size of the __args__ array */ +}; + +/** @ingroup API + * The weight of each item in the bucket when + * __h.alg__ == ::CRUSH_BUCKET_UNIFORM. + */ +struct crush_bucket_uniform { + struct crush_bucket h; /*!< generic bucket information */ + __u32 item_weight; /*!< 16.16 fixed point weight for each item */ +}; + +/** @ingroup API + * The weight of each item in the bucket when + * __h.alg__ == ::CRUSH_BUCKET_LIST. + * + * The weight of __h.items[i]__ is __item_weights[i]__ for i in + * [0,__h.size__[. The __sum_weights[i]__ is the sum of the __item_weights[j]__ + * for j in [0,i[. + * + */ +struct crush_bucket_list { + struct crush_bucket h; /*!< generic bucket information */ + __u32 *item_weights; /*!< 16.16 fixed point weight for each item */ + __u32 *sum_weights; /*!< 16.16 fixed point sum of the weights */ +}; + +struct crush_bucket_tree { + struct crush_bucket h; /* note: h.size is _tree_ size, not number of + actual items */ + __u8 num_nodes; + __u32 *node_weights; +}; + +struct crush_bucket_straw { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *straws; /* 16-bit fixed point */ +}; + +/** @ingroup API + * The weight of each item in the bucket when + * __h.alg__ == ::CRUSH_BUCKET_STRAW2. + * + * The weight of __h.items[i]__ is __item_weights[i]__ for i in + * [0,__h.size__[. + */ +struct crush_bucket_straw2 { + struct crush_bucket h; /*!< generic bucket information */ + __u32 *item_weights; /*!< 16.16 fixed point weight for each item */ +}; + + + +/** @ingroup API + * + * A crush map defines a hierarchy of crush_bucket that ends with leaves + * (buckets and leaves are called items) and a set of crush_rule to + * map an integer to items with the crush_do_rule() function. + * + */ +struct crush_map { + /*! An array of crush_bucket pointers of size __max_buckets__. 
+ * An element of the array may be NULL if the bucket was removed with + * crush_remove_bucket(). The buckets must be added with crush_add_bucket(). + * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i. + */ + struct crush_bucket **buckets; + /*! An array of crush_rule pointers of size __max_rules__. + * An element of the array may be NULL if the rule was removed (there is + * no API to do so but there may be one in the future). The rules must be added + * with crush_add_rule(). + */ + struct crush_rule **rules; + __s32 max_buckets; /*!< the size of __buckets__ */ + __u32 max_rules; /*!< the size of __rules__ */ + /*! The value of the highest item stored in the crush_map + 1 + */ + __s32 max_devices; + + /*! Backward compatibility tunable. It implements a bad solution + * and must always be set to 0 except for backward compatibility + * purposes + */ + __u32 choose_local_tries; + /*! Backward compatibility tunable. It implements a bad solution + * and must always be set to 0 except for backward compatibility + * purposes + */ + __u32 choose_local_fallback_tries; + /*! Tunable. The default value when the CHOOSE_TRIES or + * CHOOSELEAF_TRIES steps are omitted in a rule. See the + * documentation for crush_rule_set_step() for more + * information + */ + __u32 choose_total_tries; + /*! Backward compatibility tunable. It should always be set + * to 1 except for backward compatibility. Implemented in 2012, + * it was generalized in late 2013 and is mostly unused except + * in one border case, which is why it must be set to 1. + * + * Attempt chooseleaf inner descent once for firstn mode; on + * reject retry outer descent. Note that this does *not* + * apply to a collision: in that case we will retry as we + * used to. + */ + __u32 chooseleaf_descend_once; + /*! Backward compatibility tunable. It is a fix for bad + * mappings implemented in 2014 at + * https://github.com/ceph/ceph/pull/1185. 
It should always + * be set to 1 except for backward compatibility. + * + * If non-zero, feed r into chooseleaf, bit-shifted right by + * (r-1) bits. A value of 1 is best for new clusters. For + * legacy clusters that want to limit reshuffling, a value of + * 3 or 4 will make the mappings line up a bit better with + * previous mappings. + */ + __u8 chooseleaf_vary_r; + + /*! Backward compatibility tunable. It is an improvement that + * avoids unnecessary mapping changes, implemented at + * https://github.com/ceph/ceph/pull/6572 and explained in + * this post: "chooseleaf may cause some unnecessary pg + * migrations" in October 2015 + * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html + * It should always be set to 1 except for backward compatibility. + */ + __u8 chooseleaf_stable; + + /*! @cond INTERNAL */ + /* This value is calculated after decode or construction by + the builder. It is exposed here (rather than having a + 'build CRUSH working space' function) so that callers can + reserve a static buffer, allocate space on the stack, or + otherwise avoid calling into the heap allocator if they + want to. The size of the working space depends on the map, + while the size of the scratch vector passed to the mapper + depends on the size of the desired result set. + + Nothing stops the caller from allocating both in one swell + foop and passing in two pointers, though. */ + size_t working_size; + +#ifndef __KERNEL__ + /*! @endcond */ + /*! Backward compatibility tunable. It is a fix for the straw + * scaler values for the straw algorithm which is deprecated + * (straw2 replaces it) implemented at + * https://github.com/ceph/ceph/pull/3057. It should always + * be set to 1 except for backward compatibility. + * + */ + __u8 straw_calc_version; + + /*! @cond INTERNAL */ + /* + * allowed bucket algs is a bitmask, here the bit positions + * are CRUSH_BUCKET_*. 
note that these are *bits* and + * CRUSH_BUCKET_* values are not, so we need to or together (1 + * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to + * minimize confusion (bucket type values start at 1). + */ + __u32 allowed_bucket_algs; + + __u32 *choose_tries; +#endif + /*! @endcond */ +}; + + +/* crush.c */ +/** @ingroup API + * + * Return the 16.16 fixed point weight of the item at __pos__ (zero + * based index) within the bucket __b__. If __pos__ is negative or + * greater or equal to the number of items in the bucket, return 0. + * + * @param b the bucket containing items + * @param pos the zero based index of the item + * + * @returns the 16.16 fixed point item weight + */ +extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos); +extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); +extern void crush_destroy_bucket_list(struct crush_bucket_list *b); +extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); +extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b); +/** @ingroup API + * + * Deallocate a bucket created via crush_add_bucket(). + * + * @param b the bucket to deallocate + */ +extern void crush_destroy_bucket(struct crush_bucket *b); +/** @ingroup API + * + * Deallocate a rule created via crush_add_rule(). + * + * @param r the rule to deallocate + */ +extern void crush_destroy_rule(struct crush_rule *r); +/** @ingroup API + * + * Deallocate the __map__, previously allocated with crush_create. 
+ * + * @param map the crush map + */ +extern void crush_destroy(struct crush_map *map); + +static inline int crush_calc_tree_node(int i) +{ + return ((i+1) << 1)-1; +} + +static inline const char *crush_alg_name(int alg) +{ + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + return "uniform"; + case CRUSH_BUCKET_LIST: + return "list"; + case CRUSH_BUCKET_TREE: + return "tree"; + case CRUSH_BUCKET_STRAW: + return "straw"; + case CRUSH_BUCKET_STRAW2: + return "straw2"; + default: + return "unknown"; + } +} + +/* --------------------------------------------------------------------- + Private + --------------------------------------------------------------------- */ + +/* These data structures are private to the CRUSH implementation. They + are exposed in this header file because builder needs their + definitions to calculate the total working size. + + Moving this out of the crush map allows us to treat the CRUSH map as + immutable within the mapper and removes the requirement for a CRUSH + map lock. 
*/ + +struct crush_work_bucket { + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; /* Permutation of the bucket's items */ +}; + +struct crush_work { + struct crush_work_bucket **work; /* Per-bucket working store */ +}; + +#endif diff --git a/src/crush/crush_compat.h b/src/crush/crush_compat.h new file mode 100644 index 00000000..08eb4eab --- /dev/null +++ b/src/crush/crush_compat.h @@ -0,0 +1,39 @@ +#ifndef CEPH_CRUSH_COMPAT_H +#define CEPH_CRUSH_COMPAT_H + +#include "include/int_types.h" + +#include +#include +#include +#include + +/* asm-generic/bug.h */ + +#define BUG_ON(x) assert(!(x)) + +/* linux/kernel.h */ + +#define U8_MAX ((__u8)~0U) +#define S8_MAX ((__s8)(U8_MAX>>1)) +#define S8_MIN ((__s8)(-S8_MAX - 1)) +#define U16_MAX ((__u16)~0U) +#define S16_MAX ((__s16)(U16_MAX>>1)) +#define S16_MIN ((__s16)(-S16_MAX - 1)) +#define U32_MAX ((__u32)~0U) +#define S32_MAX ((__s32)(U32_MAX>>1)) +#define S32_MIN ((__s32)(-S32_MAX - 1)) +#define U64_MAX ((__u64)~0ULL) +#define S64_MAX ((__s64)(U64_MAX>>1)) +#define S64_MIN ((__s64)(-S64_MAX - 1)) + +/* linux/math64.h */ + +#define div64_s64(dividend, divisor) ((dividend) / (divisor)) + +/* linux/slab.h */ + +#define kmalloc(size, flags) malloc(size) +#define kfree(x) do { if (x) free(x); } while (0) + +#endif /* CEPH_CRUSH_COMPAT_H */ diff --git a/src/crush/crush_ln_table.h b/src/crush/crush_ln_table.h new file mode 100644 index 00000000..aae534c9 --- /dev/null +++ b/src/crush/crush_ln_table.h @@ -0,0 +1,164 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_CRUSH_LN_H +#define CEPH_CRUSH_LN_H + +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif + +/* + * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) + * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + */ +static __s64 __RH_LH_tbl[128*2+2] = { + 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, + 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, + 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, + 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll, + 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll, + 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll, + 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll, + 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell, + 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll, + 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll, + 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll, + 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll, + 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll, + 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll, + 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all, + 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll, + 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all, + 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell, + 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll, + 
0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll, + 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll, + 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll, + 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll, + 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll, + 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll, + 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll, + 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell, + 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll, + 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll, + 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll, + 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll, + 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll, + 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll, + 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll, + 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll, + 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll, + 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll, + 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll, + 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll, + 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll, + 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll, + 
0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll, + 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll, + 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll, + 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll, + 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll, + 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll, + 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll, + 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll, + 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll, + 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll, + 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll, + 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll, + 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell, + 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell, + 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll, + 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell, + 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll, + 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll, + 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll, + 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll, + 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll, + 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, + 
0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, + 0x0000800000000000ll, 0x0000ffff00000000ll, +}; + +/* + * LL_tbl[k] = 2^48*log2(1.0+k/2^15) + */ +static __s64 __LL_tbl[256] = { + 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, + 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, + 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, + 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull, + 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull, + 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull, + 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull, + 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull, + 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull, + 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull, + 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull, + 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull, + 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull, + 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull, + 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull, + 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull, + 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull, + 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull, + 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 
0x000000d96f9fbbdbull, + 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull, + 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull, + 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull, + 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull, + 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull, + 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull, + 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull, + 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull, + 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull, + 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull, + 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull, + 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull, + 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull, + 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull, + 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull, + 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull, + 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull, + 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull, + 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull, + 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull, + 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull, + 
0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull, + 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull, + 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull, + 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull, + 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull, + 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull, + 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull, + 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull, + 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull, + 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull, + 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull, + 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull, + 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull, + 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull, + 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull, + 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull, + 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull, + 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull, + 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull, + 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull, + 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull, + 0x000002bd6d4901ccull, 
0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull, + 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull, + 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, +}; + +#endif diff --git a/src/crush/grammar.h b/src/crush/grammar.h new file mode 100644 index 00000000..42a6068b --- /dev/null +++ b/src/crush/grammar.h @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2008 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_CRUSH_GRAMMAR_H +#define CEPH_CRUSH_GRAMMAR_H + +//#define BOOST_SPIRIT_DEBUG + +#ifdef USE_BOOST_SPIRIT_OLD_HDR +#include +#include +#include +#else +#define BOOST_SPIRIT_USE_OLD_NAMESPACE +#include +#include +#include +#endif +using namespace boost::spirit; + +struct crush_grammar : public grammar +{ + enum { + _int = 1, + _posint, + _negint, + _name, + _device, + _bucket_type, + _bucket_id, + _bucket_alg, + _bucket_hash, + _bucket_item, + _bucket, + _step_take, + _step_set_chooseleaf_tries, + _step_set_chooseleaf_vary_r, + _step_set_chooseleaf_stable, + _step_set_choose_tries, + _step_set_choose_local_tries, + _step_set_choose_local_fallback_tries, + _step_choose, + _step_chooseleaf, + _step_emit, + _step, + _crushrule, + _weight_set_weights, + _weight_set, + _choose_arg_ids, + _choose_arg, + _choose_args, + _crushmap, + _tunable, + }; + + template + struct definition + { + rule, parser_tag<_int> > integer; + rule, parser_tag<_posint> > posint; + rule, parser_tag<_negint> > negint; + rule, parser_tag<_name> > name; + + rule, parser_tag<_tunable> > tunable; + + rule, parser_tag<_device> > device; + + rule, 
parser_tag<_bucket_type> > bucket_type; + + rule, parser_tag<_bucket_id> > bucket_id; + rule, parser_tag<_bucket_alg> > bucket_alg; + rule, parser_tag<_bucket_hash> > bucket_hash; + rule, parser_tag<_bucket_item> > bucket_item; + rule, parser_tag<_bucket> > bucket; + + rule, parser_tag<_step_take> > step_take; + rule, parser_tag<_step_set_choose_tries> > step_set_choose_tries; + rule, parser_tag<_step_set_choose_local_tries> > step_set_choose_local_tries; + rule, parser_tag<_step_set_choose_local_fallback_tries> > step_set_choose_local_fallback_tries; + rule, parser_tag<_step_set_chooseleaf_tries> > step_set_chooseleaf_tries; + rule, parser_tag<_step_set_chooseleaf_vary_r> > step_set_chooseleaf_vary_r; + rule, parser_tag<_step_set_chooseleaf_stable> > step_set_chooseleaf_stable; + rule, parser_tag<_step_choose> > step_choose; + rule, parser_tag<_step_chooseleaf> > step_chooseleaf; + rule, parser_tag<_step_emit> > step_emit; + rule, parser_tag<_step> > step; + rule, parser_tag<_crushrule> > crushrule; + rule, parser_tag<_weight_set_weights> > weight_set_weights; + rule, parser_tag<_weight_set> > weight_set; + rule, parser_tag<_choose_arg_ids> > choose_arg_ids; + rule, parser_tag<_choose_arg> > choose_arg; + rule, parser_tag<_choose_args> > choose_args; + + rule, parser_tag<_crushmap> > crushmap; + + definition(crush_grammar const& /*self*/) + { + // base types + integer = leaf_node_d[ lexeme_d[ + (!ch_p('-') >> +digit_p) + ] ]; + posint = leaf_node_d[ lexeme_d[ +digit_p ] ]; + negint = leaf_node_d[ lexeme_d[ ch_p('-') >> +digit_p ] ]; + name = leaf_node_d[ lexeme_d[ +( alnum_p || ch_p('-') || ch_p('_') || ch_p('.')) ] ]; + + // tunables + tunable = str_p("tunable") >> name >> posint; + + // devices + device = str_p("device") >> posint >> name >> !( str_p("class") >> name ); + + // bucket types + bucket_type = str_p("type") >> posint >> name; + + // buckets + bucket_id = str_p("id") >> negint >> !( str_p("class") >> name ); + bucket_alg = str_p("alg") >> name; + 
bucket_hash = str_p("hash") >> ( integer | + str_p("rjenkins1") ); + bucket_item = str_p("item") >> name + >> !( str_p("weight") >> real_p ) + >> !( str_p("pos") >> posint ); + bucket = name >> name >> '{' >> *bucket_id >> bucket_alg >> *bucket_hash >> *bucket_item >> '}'; + + // rules + step_take = str_p("take") >> name >> !( str_p("class") >> name ); + step_set_choose_tries = str_p("set_choose_tries") >> posint; + step_set_choose_local_tries = str_p("set_choose_local_tries") >> posint; + step_set_choose_local_fallback_tries = str_p("set_choose_local_fallback_tries") >> posint; + step_set_chooseleaf_tries = str_p("set_chooseleaf_tries") >> posint; + step_set_chooseleaf_vary_r = str_p("set_chooseleaf_vary_r") >> posint; + step_set_chooseleaf_stable = str_p("set_chooseleaf_stable") >> posint; + step_choose = str_p("choose") + >> ( str_p("indep") | str_p("firstn") ) + >> integer + >> str_p("type") >> name; + step_chooseleaf = str_p("chooseleaf") + >> ( str_p("indep") | str_p("firstn") ) + >> integer + >> str_p("type") >> name; + step_emit = str_p("emit"); + step = str_p("step") >> ( step_take | + step_set_choose_tries | + step_set_choose_local_tries | + step_set_choose_local_fallback_tries | + step_set_chooseleaf_tries | + step_set_chooseleaf_vary_r | + step_set_chooseleaf_stable | + step_choose | + step_chooseleaf | + step_emit ); + crushrule = str_p("rule") >> !name >> '{' + >> (str_p("id") | str_p("ruleset")) >> posint + >> str_p("type") >> ( str_p("replicated") | str_p("erasure") ) + >> str_p("min_size") >> posint + >> str_p("max_size") >> posint + >> +step + >> '}'; + + weight_set_weights = str_p("[") >> *real_p >> str_p("]"); + weight_set = str_p("weight_set") >> str_p("[") + >> *weight_set_weights + >> str_p("]"); + choose_arg_ids = str_p("ids") >> str_p("[") >> *integer >> str_p("]"); + choose_arg = str_p("{") >> str_p("bucket_id") >> negint + >> !weight_set + >> !choose_arg_ids + >> str_p("}"); + choose_args = str_p("choose_args") >> posint >> str_p("{") >> 
*choose_arg >> str_p("}"); + + // the whole crush map + crushmap = *(tunable | device | bucket_type) >> *(bucket | crushrule) >> *choose_args; + } + + rule, parser_tag<_crushmap> > const& + start() const { return crushmap; } + }; +}; + +#endif diff --git a/src/crush/hash.c b/src/crush/hash.c new file mode 100644 index 00000000..ed123af4 --- /dev/null +++ b/src/crush/hash.c @@ -0,0 +1,151 @@ +#ifdef __KERNEL__ +# include +#else +# include "hash.h" +#endif + +/* + * Robert Jenkins' function for mixing 32-bit values + * http://burtleburtle.net/bob/hash/evahash.html + * a, b = random bits, c = input and output + */ +#define crush_hashmix(a, b, c) do { \ + a = a-b; a = a-c; a = a^(c>>13); \ + b = b-c; b = b-a; b = b^(a<<8); \ + c = c-a; c = c-b; c = c^(b>>13); \ + a = a-b; a = a-c; a = a^(c>>12); \ + b = b-c; b = b-a; b = b^(a<<16); \ + c = c-a; c = c-b; c = c^(b>>5); \ + a = a-b; a = a-c; a = a^(c>>3); \ + b = b-c; b = b-a; b = b^(a<<10); \ + c = c-a; c = c-b; c = c^(b>>15); \ + } while (0) + +#define crush_hash_seed 1315423911 + +static __u32 crush_hash32_rjenkins1(__u32 a) +{ + __u32 hash = crush_hash_seed ^ a; + __u32 b = a; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(b, x, hash); + crush_hashmix(y, a, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) +{ + __u32 hash = crush_hash_seed ^ a ^ b; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(x, a, hash); + crush_hashmix(b, y, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; + __u32 x = 231232; + __u32 y = 1232; + 
crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(a, x, hash); + crush_hashmix(y, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, d, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, + __u32 e) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(e, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + crush_hashmix(d, x, hash); + crush_hashmix(y, e, hash); + return hash; +} + + +__u32 crush_hash32(int type, __u32 a) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1(a); + default: + return 0; + } +} + +__u32 crush_hash32_2(int type, __u32 a, __u32 b) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_2(a, b); + default: + return 0; + } +} + +__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_3(a, b, c); + default: + return 0; + } +} + +__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_4(a, b, c, d); + default: + return 0; + } +} + +__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_5(a, b, c, d, e); + default: + return 0; + } +} + +const char *crush_hash_name(int type) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return "rjenkins1"; + default: + return "unknown"; + } +} diff --git a/src/crush/hash.h b/src/crush/hash.h new file mode 100644 index 00000000..d1d90258 --- /dev/null +++ b/src/crush/hash.h @@ -0,0 +1,23 @@ +#ifndef CEPH_CRUSH_HASH_H +#define CEPH_CRUSH_HASH_H + +#ifdef __KERNEL__ +# include +#else +# include "crush_compat.h" +#endif + +#define 
CRUSH_HASH_RJENKINS1 0 + +#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 + +extern const char *crush_hash_name(int type); + +extern __u32 crush_hash32(int type, __u32 a); +extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); +extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); +extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); +extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, + __u32 e); + +#endif diff --git a/src/crush/mapper.c b/src/crush/mapper.c new file mode 100644 index 00000000..73f92a77 --- /dev/null +++ b/src/crush/mapper.c @@ -0,0 +1,1105 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +#else +# include "crush_compat.h" +# include "crush.h" +# include "hash.h" +#endif +#include "crush_ln_table.h" +#include "mapper.h" + +#define dprintk(args...) /* printf(args) */ + +/* + * Implement the core CRUSH mapping algorithm. + */ + +/** + * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. 
+ * @map: the crush_map + * @ruleset: the storage ruleset id (user defined) + * @type: storage ruleset type (user defined) + * @size: output set size + */ +int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) +{ + __u32 i; + + for (i = 0; i < map->max_rules; i++) { + if (map->rules[i] && + map->rules[i]->mask.ruleset == ruleset && + map->rules[i]->mask.type == type && + map->rules[i]->mask.min_size <= size && + map->rules[i]->mask.max_size >= size) + return i; + } + return -1; +} + +/* + * bucket choose methods + * + * For each bucket algorithm, we have a "choose" method that, given a + * crush input @x and replica position (usually, position in output set) @r, + * will produce an item in the bucket. + */ + +/* + * Choose based on a random permutation of the bucket. + * + * We used to use some prime number arithmetic to do this, but it + * wasn't very random, and had some other bad behaviors. Instead, we + * calculate an actual random permutation of the bucket members. + * Since this is expensive, we optimize for the r=0 case, which + * captures the vast majority of calls. 
 *
 * The permutation is built lazily in work->perm and cached per input:
 * work->perm_x remembers the @x it was built for and work->perm_n how
 * many leading entries are final.  perm_n == 0xffff is a marker meaning
 * "only perm[0] has been filled in" (left behind by the r=0 shortcut).
 */
static int bucket_perm_choose(const struct crush_bucket *bucket,
			      struct crush_work_bucket *work,
			      int x, int r)
{
	/* slot of the permutation we need; @r is reduced modulo the size */
	unsigned int pr = r % bucket->size;
	unsigned int i, s;

	/* start a new permutation if @x has changed */
	if (work->perm_x != (__u32)x || work->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		work->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			work->perm[0] = s;
			work->perm_n = 0xffff;  /* magic value, see below */
			goto out;
		}

		/* start from the identity permutation, nothing finalized yet */
		for (i = 0; i < bucket->size; i++)
			work->perm[i] = i;
		work->perm_n = 0;
	} else if (work->perm_n == 0xffff) {
		/* clean up after the r=0 case above */
		for (i = 1; i < bucket->size; i++)
			work->perm[i] = i;
		/*
		 * perm[0] already holds the shortcut's pick; put 0 into the
		 * slot that pick came from so the array is again a
		 * permutation with its first entry finalized.
		 */
		work->perm[work->perm[0]] = 0;
		work->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < work->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
	while (work->perm_n <= pr) {
		unsigned int p = work->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			/* pick one of the not-yet-finalized entries [p, size) */
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned int t = work->perm[p + i];
				work->perm[p + i] = work->perm[p];
				work->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		work->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose %d: %d\n", i, work->perm[i]);

	s = work->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}

/* uniform */
/* A uniform bucket simply picks from the random permutation of its items. */
static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
				 struct crush_work_bucket *work, int x, int r)
{
	return bucket_perm_choose(&bucket->h, work, x, r);
}

/* list */
/*
 * Walk the items from the last one down to the first.  For each item a
 * 16-bit hash value is scaled by sum_weights[i]; the item is returned
 * as soon as the scaled value is below its own item_weights[i].
 */
static int bucket_list_choose(const struct crush_bucket_list *bucket,
			      int x, int r)
{
	int i;

	for (i = bucket->h.size-1; i >= 0; i--) {
		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
					 r, bucket->h.id);
		w &= 0xffff;
		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
			"sw %x rand %llx",
			i, x, r, bucket->h.items[i], bucket->item_weights[i],
			bucket->sum_weights[i], w);
		/* scale the 16-bit value into [0, sum_weights[i]) */
		w *= bucket->sum_weights[i];
		w = w >> 16;
		/*dprintk(" scaled %llx\n", w);*/
		if (w < bucket->item_weights[i]) {
			return bucket->h.items[i];
		}
	}

	/* nothing matched; fall back to the first item */
	dprintk("bad list sums for bucket %d\n", bucket->h.id);
	return bucket->h.items[0];
}


/* (binary) tree */
/*
 * Count the trailing zero bits of @n.  @n must be non-zero, otherwise
 * the loop never terminates.
 */
static int height(int n)
{
	int h = 0;
	while ((n & 1) == 0) {
		h++;
		n = n >> 1;
	}
	return h;
}

/* Node number of the left child of @x (@x must not be a terminal node). */
static int left(int x)
{
	int h = height(x);
	return x - (1 << (h-1));
}

/* Node number of the right child of @x (@x must not be a terminal node). */
static int right(int x)
{
	int h = height(x);
	return x + (1 << (h-1));
}

/* Non-zero when @x is odd; odd node numbers end the descent below. */
static int terminal(int x)
{
	return x & 1;
}

/*
 * Descend from node num_nodes/2 until an odd (terminal) node is reached,
 * going left when the hash scaled to the current node's weight falls
 * below the left child's weight and right otherwise.  Terminal node n
 * maps to items[n >> 1].
 */
static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
			      int x, int r)
{
	int n;
	__u32 w;
	__u64 t;

	/* start at root */
	n = bucket->num_nodes >> 1;

	while (!terminal(n)) {
		int l;
		/* pick point in [0, w) */
		w = bucket->node_weights[n];
		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
					  bucket->h.id) * (__u64)w;
		t = t >> 32;

		/* descend to the left or right? */
		l = left(n);
		if (t < bucket->node_weights[l])
			n = l;
		else
			n = right(n);
	}

	return bucket->h.items[n >> 1];
}


/* straw */

/*
 * Every item draws a 16-bit hash value multiplied by its straws[i]
 * factor; the item with the largest draw wins.  On equal draws the
 * earlier item is kept because the comparison is strict.
 */
static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
			       int x, int r)
{
	__u32 i;
	int high = 0;
	__u64 high_draw = 0;
	__u64 draw;

	for (i = 0; i < bucket->h.size; i++) {
		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
		draw &= 0xffff;
		draw *= bucket->straws[i];
		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}
	return bucket->h.items[high];
}

/* compute 2^44*log2(input+1) */
/*
 * Table-driven fixed-point computation using __RH_LH_tbl and __LL_tbl.
 * NOTE(review): the only caller visible here masks its argument to
 * 16 bits; the normalization below appears to rely on that range.
 */
static __u64 crush_ln(unsigned int xin)
{
	unsigned int x = xin;
	int iexpon, index1, index2;
	__u64 RH, LH, LL, xl64, result;

	x++;

	/* normalize input */
	iexpon = 15;

	// figure out number of bits we need to shift and
	// do it in one step instead of iteratively
	if (!(x & 0x18000)) {
		int bits = __builtin_clz(x & 0x1FFFF) - 16;
		x <<= bits;
		iexpon = 15 - bits;
	}

	/* the tables store RH/LH pairs, hence the index is doubled */
	index1 = (x >> 8) << 1;
	/* RH ~ 2^56/index1 */
	RH = __RH_LH_tbl[index1 - 256];
	/* LH ~ 2^48 * log2(index1/256) */
	LH = __RH_LH_tbl[index1 + 1 - 256];

	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
	xl64 = (__s64)x * RH;
	xl64 >>= 48;

	/* integer part of the logarithm, placed above the 44 fraction bits */
	result = iexpon;
	result <<= (12 + 32);

	index2 = xl64 & 0xff;
	/* LL ~ 2^48*log2(1.0+index2/2^15) */
	LL = __LL_tbl[index2];

	LH = LH + LL;

	/* reduce the 2^48-scaled fraction to 2^44 scale */
	LH >>= (48 - 12 - 32);
	result += LH;

	return result;
}


/*
 * straw2
 *
 * Suppose we have two osds: osd.0 and osd.1, with weight 8 and 4 respectively. It means:
 *   a). For osd.0, the time interval between io requests follows an exponential distribution
 *       with lambda equal to 8
 *   b). For osd.1, the time interval between io requests follows an exponential distribution
 *       with lambda equal to 4
 *   c). If we apply this to each osd's exponential random variable, then the total pgs on each osd
 *       is proportional to its weight.
+ * + * for reference, see: + * + * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + */ + +static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg, + int position) +{ + if ((arg == NULL) || (arg->weight_set == NULL)) + return bucket->item_weights; + if (position >= arg->weight_set_positions) + position = arg->weight_set_positions - 1; + return arg->weight_set[position].weights; +} + +static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg) +{ + if ((arg == NULL) || (arg->ids == NULL)) + return bucket->h.items; + return arg->ids; +} + +/* + * Compute exponential random variable using inversion method. + * + * for reference, see the exponential distribution example at: + * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples + */ +static inline __s64 generate_exponential_distribution(int type, int x, int y, int z, + int weight) +{ + unsigned int u = crush_hash32_3(type, x, y, z); + u &= 0xffff; + + /* + * for some reason slightly less than 0x10000 produces + * a slightly more accurate distribution... probably a + * rounding effect. + * + * the natural log lookup table maps [0,0xffff] + * (corresponding to real numbers [1/0x10000, 1] to + * [0, 0xffffffffffff] (corresponding to real numbers + * [-11.090355,0]). + */ + __s64 ln = crush_ln(u) - 0x1000000000000ll; + + /* + * divide by 16.16 fixed-point weight. note + * that the ln value is negative, so a larger + * weight means a larger (less negative) value + * for draw. 
+ */ + return div64_s64(ln, weight); +} + +static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, + int x, int r, const struct crush_choose_arg *arg, + int position) +{ + unsigned int i, high = 0; + __s64 draw, high_draw = 0; + __u32 *weights = get_choose_arg_weights(bucket, arg, position); + __s32 *ids = get_choose_arg_ids(bucket, arg); + for (i = 0; i < bucket->h.size; i++) { + dprintk("weight 0x%x item %d\n", weights[i], ids[i]); + if (weights[i]) { + draw = generate_exponential_distribution(bucket->h.hash, x, ids[i], r, weights[i]); + } else { + draw = S64_MIN; + } + + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + + return bucket->h.items[high]; +} + + +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r, + const struct crush_choose_arg *arg, + int position) +{ + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + BUG_ON(in->size == 0); + switch (in->alg) { + case CRUSH_BUCKET_UNIFORM: + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); + case CRUSH_BUCKET_LIST: + return bucket_list_choose((const struct crush_bucket_list *)in, + x, r); + case CRUSH_BUCKET_TREE: + return bucket_tree_choose((const struct crush_bucket_tree *)in, + x, r); + case CRUSH_BUCKET_STRAW: + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); + case CRUSH_BUCKET_STRAW2: + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r, arg, position); + default: + dprintk("unknown bucket %d alg %d\n", in->id, in->alg); + return in->items[0]; + } +} + +/* + * true if device is marked "out" (failed, fully offloaded) + * of the cluster + */ +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) +{ + if (item >= weight_max) + return 1; + if (weight[item] >= 0x10000) + return 0; + if (weight[item] == 0) + return 1; + if 
((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) + < weight[item]) + return 0; + return 1; +} + +/** + * crush_choose_firstn - choose numrep distinct items of given type + * @map: the crush_map + * @bucket: the bucket we are choose an item from + * @x: crush input value + * @numrep: the number of items to choose + * @type: the type of item to choose + * @out: pointer to output vector + * @outpos: our position in that vector + * @out_size: size of the out vector + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas + * @vary_r: pass r to recursive calls + * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent + */ +static int crush_choose_firstn(const struct crush_map *map, + struct crush_work *work, + const struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + int out_size, + unsigned int tries, + unsigned int recurse_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, + int recurse_to_leaf, + unsigned int vary_r, + unsigned int stable, + int *out2, + int parent_r, + const struct crush_choose_arg *choose_args) +{ + int rep; + unsigned int ftotal, flocal; + int retry_descent, retry_bucket, skip_rep; + const struct crush_bucket *in = bucket; + int r; + int i; + int item = 0; + int itemtype; + int collide, reject; + int count = out_size; + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \ +recurse_tries %d local_retries %d local_fallback_retries %d \ +parent_r %d stable %d\n", + recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r, stable); + + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { + /* keep trying until we get a non-out, non-colliding item */ + ftotal = 0; + skip_rep = 0; + do { + retry_descent = 0; + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + flocal = 0; + do { + collide = 0; + retry_bucket = 0; + r = rep + parent_r; + /* r' = r + f_total */ + r += ftotal; + + /* bucket choose */ + if (in->size == 0) { + reject = 1; + goto reject; + } + if (local_fallback_retries > 0 && + flocal >= (in->size>>1) && + flocal > local_fallback_retries) + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); + else + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r, + (choose_args ? &choose_args[-1-in->id] : 0), + outpos); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + skip_rep = 1; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + skip_rep = 1; + break; + } + in = map->buckets[-1-item]; + retry_bucket = 1; + continue; + } + + /* collision? */ + for (i = 0; i < outpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + + reject = 0; + if (!collide && recurse_to_leaf) { + if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r, + choose_args) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! 
*/ + out2[outpos] = item; + } + } + + if (!reject && !collide) { + /* out? */ + if (itemtype == 0) + reject = is_out(map, weight, + weight_max, + item, x); + } + +reject: + if (reject || collide) { + ftotal++; + flocal++; + + if (collide && flocal <= local_retries) + /* retry locally a few times */ + retry_bucket = 1; + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) + /* exhaustive bucket search */ + retry_bucket = 1; + else if (ftotal < tries) + /* then retry descent */ + retry_descent = 1; + else + /* else give up */ + skip_rep = 1; + dprintk(" reject %d collide %d " + "ftotal %u flocal %u\n", + reject, collide, ftotal, + flocal); + } + } while (retry_bucket); + } while (retry_descent); + + if (skip_rep) { + dprintk("skip rep\n"); + continue; + } + + dprintk("CHOOSE got %d\n", item); + out[outpos] = item; + outpos++; + count--; +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif + } + + dprintk("CHOOSE returns %d\n", outpos); + return outpos; +} + + +/** + * crush_choose_indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_work *work, + const struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int left, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + int recurse_to_leaf, + int *out2, + int parent_r, + const struct crush_choose_arg *choose_args) +{ + const struct crush_bucket *in = bucket; + int endpos = outpos + left; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < endpos; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { +#ifdef DEBUG_INDEP + if (out2 && ftotal) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + /* note: we base the choice on the position + * even in the nested call. that means that + * if the first layer chooses the same bucket + * in a different position, we will tend to + * choose a different item in that bucket. + * this will involve more devices in data + * movement and tend to distribute the load. + */ + r = rep + parent_r; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r, + (choose_args ? &choose_args[-1-in->id] : 0), + outpos); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? 
*/ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? */ + collide = 0; + for (i = outpos; i < endpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r, choose_args); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! */ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif +#ifdef DEBUG_INDEP + if (out2) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif +} + + +/* This takes a chunk of memory and sets it up to be a shiny new + working area for a CRUSH placement computation. It must be called + on any newly allocated memory before passing it in to + crush_do_rule. It may be used repeatedly after that, so long as the + map has not changed. If the map /has/ changed, you must make sure + the working size is no smaller than what was allocated and re-run + crush_init_workspace. 
+ + If you do retain the working space between calls to crush, make it + thread-local. If you reinstitute the locking I've spent so much + time getting rid of, I will be very unhappy with you. */ + +void crush_init_workspace(const struct crush_map *m, void *v) { + /* We work by moving through the available space and setting + values and pointers as we go. + + It's a bit like Forth's use of the 'allot' word since we + set the pointer first and then reserve the space for it to + point to by incrementing the point. */ + struct crush_work *w = (struct crush_work *)v; + char *point = (char *)v; + __s32 b; + point += sizeof(struct crush_work); + w->work = (struct crush_work_bucket **)point; + point += m->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < m->max_buckets; ++b) { + if (m->buckets[b] == 0) + continue; + + w->work[b] = (struct crush_work_bucket *) point; + switch (m->buckets[b]->alg) { + default: + point += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = (__u32 *)point; + point += m->buckets[b]->size * sizeof(__u32); + } + BUG_ON((char *)point - (char *)w != m->working_size); +} + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @cwin: Pointer to at least map->working_size bytes of memory or NULL. 
+ */ +int crush_do_rule(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args) +{ + int result_len; + struct crush_work *cw = cwin; + int *a = (int *)((char *)cw + map->working_size); + int *b = a + result_max; + int *c = b + result_max; + int *w = a; + int *o = b; + int recurse_to_leaf; + int wsize = 0; + int osize; + int *tmp; + const struct crush_rule *rule; + __u32 step; + int i, j; + int numrep; + int out_size; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. + */ + int choose_tries = map->choose_total_tries + 1; + int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; + + int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; + + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } + + rule = map->rules[ruleno]; + result_len = 0; + + for (step = 0; step < rule->len; step++) { + int firstn = 0; + const struct crush_rule_step *curstep = &rule->steps[step]; + + switch (curstep->op) { + case CRUSH_RULE_TAKE: + if ((curstep->arg1 >= 0 && + curstep->arg1 < map->max_devices) || + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && + map->buckets[-1-curstep->arg1])) { + w[0] = curstep->arg1; + wsize = 1; + } else { + dprintk(" bad take value %d\n", curstep->arg1); + } + break; + + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 >= 0) + choose_local_retries = curstep->arg1; + break; + + case 
CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 >= 0) + choose_local_fallback_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSE_FIRSTN: + firstn = 1; + /* fall through */ + case CRUSH_RULE_CHOOSELEAF_INDEP: + case CRUSH_RULE_CHOOSE_INDEP: + if (wsize == 0) + break; + + recurse_to_leaf = + curstep->op == + CRUSH_RULE_CHOOSELEAF_FIRSTN || + curstep->op == + CRUSH_RULE_CHOOSELEAF_INDEP; + + /* reset output */ + osize = 0; + + for (i = 0; i < wsize; i++) { + int bno; + numrep = curstep->arg1; + if (numrep <= 0) { + numrep += result_max; + if (numrep <= 0) + continue; + } + j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + // w[i] is probably CRUSH_ITEM_NONE + dprintk(" bad w[i] %d\n", w[i]); + continue; + } + if (firstn) { + int recurse_tries; + if (choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; + osize += crush_choose_firstn( + map, + cw, + map->buckets[bno], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + result_max-osize, + choose_tries, + recurse_tries, + choose_local_retries, + choose_local_fallback_retries, + recurse_to_leaf, + vary_r, + stable, + c+osize, + 0, + choose_args); + } else { + out_size = ((numrep < (result_max-osize)) ? + numrep : (result_max-osize)); + crush_choose_indep( + map, + cw, + map->buckets[bno], + weight, weight_max, + x, out_size, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + choose_leaf_tries ? 
+ choose_leaf_tries : 1, + recurse_to_leaf, + c+osize, + 0, + choose_args); + osize += out_size; + } + } + + if (recurse_to_leaf) + /* copy final _leaf_ values to output set */ + memcpy(o, c, osize*sizeof(*o)); + + /* swap o and w arrays */ + tmp = o; + o = w; + w = tmp; + wsize = osize; + break; + + + case CRUSH_RULE_EMIT: + for (i = 0; i < wsize && result_len < result_max; i++) { + result[result_len] = w[i]; + result_len++; + } + wsize = 0; + break; + + default: + dprintk(" unknown op %d at step %d\n", + curstep->op, step); + break; + } + } + + return result_len; +} diff --git a/src/crush/mapper.h b/src/crush/mapper.h new file mode 100644 index 00000000..e76be767 --- /dev/null +++ b/src/crush/mapper.h @@ -0,0 +1,93 @@ +#ifndef CEPH_CRUSH_MAPPER_H +#define CEPH_CRUSH_MAPPER_H + +/* + * CRUSH functions for find rules and then mapping an input to an + * output set. + * + * LGPL2.1 + */ + +#include "crush.h" + +extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); +/** @ingroup API + * + * Map __x__ to __result_max__ items and store them in the __result__ + * array. The mapping is done by following each step of the rule + * __ruleno__. See crush_make_rule(), crush_rule_set_step() and + * crush_add_rule() for more information on how the rules are created, + * populated and added to the crush __map__. + * + * The return value is the the number of items in the __result__ + * array. If the caller asked for __result_max__ items and the return + * value is X where X < __result_max__, the content of __result[0,X[__ + * is defined but the content of __result[X,result_max[__ is + * undefined. For example: + * + * crush_do_rule(map, ruleno=1, x=1, result, result_max=3,...) == 1 + * result[0] is set + * result[1] is undefined + * result[2] is undefined + * + * An entry in the __result__ array is either an item in the crush + * __map__ or ::CRUSH_ITEM_NONE if no item was found. 
For example: + * + * crush_do_rule(map, ruleno=1, x=1, result, result_max=4,...) == 2 + * result[0] is CRUSH_ITEM_NONE + * result[1] is item number 5 + * result[2] is undefined + * result[3] is undefined + * + * The __weight__ array contains the probabilities that a leaf is + * kept once it is selected (otherwise it is ignored). It is a 16.16 fixed point + * number in the range [0x00000,0x10000]. The lower the value, the + * more often the leaf is ignored. For instance: + * + * - weight[leaf] == 0x00000 == 0.0 always ignore + * - weight[leaf] == 0x10000 == 1.0 never ignore + * - weight[leaf] == 0x08000 == 0.5 ignore 50% of the time + * - weight[leaf] == 0x04000 == 0.25 ignore 75% of the time + * - etc. + * + * During mapping, each leaf is checked against the __weight__ array, + * using the leaf as an index. If there is no entry in __weight__ for + * the leaf, it is ignored. If there is an entry, the leaf will be + * ignored some of the time, depending on the probability. + * + * The __cwin__ argument must be set as follows: + * + * char __cwin__[crush_work_size(__map__, __result_max__)]; + * crush_init_workspace(__map__, __cwin__); + * + * @param map the crush_map + * @param ruleno a non-negative integer < __CRUSH_MAX_RULES__ + * @param x the value to map to __result_max__ items + * @param result an array of items of size __result_max__ + * @param result_max the size of the __result__ array + * @param weights an array of weights of size __weight_max__ + * @param weight_max the size of the __weights__ array + * @param cwin must be a char array initialized by crush_init_workspace + * @param choose_args weights and ids for each known bucket + * + * @return 0 on error or the size of __result__ on success + */ +extern int crush_do_rule(const struct crush_map *map, + int ruleno, + int x, int *result, int result_max, + const __u32 *weights, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args); + +/* Returns the exact amount of workspace that will need to be used + for a
given combination of crush_map and result_max. The caller can + then allocate this much on its own, either on the stack, in a + per-thread long-lived buffer, or however it likes. */ + +static inline size_t crush_work_size(const struct crush_map *map, + int result_max) { + return map->working_size + result_max * 3 * sizeof(__u32); +} + +extern void crush_init_workspace(const struct crush_map *m, void *v); + +#endif diff --git a/src/crush/old_sample.txt b/src/crush/old_sample.txt new file mode 100644 index 00000000..54cf06a7 --- /dev/null +++ b/src/crush/old_sample.txt @@ -0,0 +1,82 @@ + +# first define our types + + + type_id = 0 + + + type_id = 2 + + + type_id = 3 + + + type_id = 10 + + + +# hierarchy + + + id 1 + weight 500 + + + id 2 + weight 500 + + + id 3 + weight 500 + + + id 4 + weight 500 + + + id 5 + weight 500 + + + + + + alg straw + id -12 + + + + + weight 600 + + + +# +# +# weight 1.0 +# +# +# weight 3.0 +# +# + + + + + id 5 + weight 500 + + + +# rules + + + pool 0 + type replicated + min_size 1 + mix_size 4 + step take root + step choose_indep 0 osd + step emit + + diff --git a/src/crush/sample.txt b/src/crush/sample.txt new file mode 100644 index 00000000..f7e0ac39 --- /dev/null +++ b/src/crush/sample.txt @@ -0,0 +1,47 @@ + +# devices +device 1 osd001 +device 2 osd002 +device 3 osd003 down # same as offload 1.0 +device 4 osd004 offload 0 # 0.0 -> normal, 1.0 -> failed +device 5 osd005 offload 0.1 +device 6 osd006 offload 0.1 + +# hierarchy +type 0 osd # 'device' is actually the default for 0 +type 2 cab +type 3 row +type 10 pool + +cab root { + id -1 # optional + alg tree # required + item osd001 + item osd002 weight 600 pos 1 + item osd003 weight 600 pos 0 + item osd004 weight 600 pos 3 + item osd005 weight 600 pos 4 +} + +# rules +rule normal { + # these are required. + pool 0 + type replicated + min_size 1 + max_size 4 + # need 1 or more of these. 
+ step take root + step choose firstn 0 type osd + step emit +} + +rule { + pool 1 + type erasure + min_size 3 + max_size 6 + step take root + step choose indep 0 type osd + step emit +} diff --git a/src/crush/types.h b/src/crush/types.h new file mode 100644 index 00000000..919eed25 --- /dev/null +++ b/src/crush/types.h @@ -0,0 +1,17 @@ +#ifndef CEPH_CRUSH_TYPES_H +#define CEPH_CRUSH_TYPES_H + +#ifdef KERNEL +# define free(x) kfree(x) +#else +# include +#endif + + +#include /* just for int types */ + +#ifndef BUG_ON +# define BUG_ON(x) ceph_assert(!(x)) +#endif + +#endif -- cgit v1.2.3