summary refs log tree commit diff stats
path: root/src/crush
diff options
context:
space:
mode:
Diffstat (limited to 'src/crush')
-rw-r--r--src/crush/CMakeLists.txt19
-rw-r--r--src/crush/CrushCompiler.cc1276
-rw-r--r--src/crush/CrushCompiler.h92
-rw-r--r--src/crush/CrushLocation.cc124
-rw-r--r--src/crush/CrushLocation.h35
-rw-r--r--src/crush/CrushTester.cc802
-rw-r--r--src/crush/CrushTester.h366
-rw-r--r--src/crush/CrushTreeDumper.h291
-rw-r--r--src/crush/CrushWrapper.cc4185
-rw-r--r--src/crush/CrushWrapper.h1657
-rw-r--r--src/crush/CrushWrapper.i47
-rw-r--r--src/crush/builder.c1525
-rw-r--r--src/crush/builder.h344
-rw-r--r--src/crush/crush.c137
-rw-r--r--src/crush/crush.h549
-rw-r--r--src/crush/crush_compat.h39
-rw-r--r--src/crush/crush_ln_table.h164
-rw-r--r--src/crush/grammar.h191
-rw-r--r--src/crush/hash.c151
-rw-r--r--src/crush/hash.h23
-rw-r--r--src/crush/mapper.c1105
-rw-r--r--src/crush/mapper.h93
-rw-r--r--src/crush/old_sample.txt82
-rw-r--r--src/crush/sample.txt47
-rw-r--r--src/crush/types.h17
25 files changed, 13361 insertions, 0 deletions
diff --git a/src/crush/CMakeLists.txt b/src/crush/CMakeLists.txt
new file mode 100644
index 00000000..ae9b9f47
--- /dev/null
+++ b/src/crush/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Source files shared by every CRUSH library target defined in this file.
+set(crush_srcs
+ builder.c
+ mapper.c
+ crush.c
+ hash.c
+ CrushWrapper.cc
+ CrushCompiler.cc
+ CrushTester.cc
+ CrushLocation.cc)
+
+# Object library built from the CRUSH sources.
+add_library(crush_objs OBJECT ${crush_srcs})
+
+# When WITH_SEASTAR is set, build the same sources a second time as
+# crimson-crush, compiled with WITH_SEASTAR=1 and with Seastar's interface
+# include directories on the include path.
+if(WITH_SEASTAR)
+ add_library(crimson-crush OBJECT ${crush_srcs})
+ target_compile_definitions(crimson-crush PRIVATE
+ "WITH_SEASTAR=1")
+ target_include_directories(crimson-crush PRIVATE
+ $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>)
+endif()
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
new file mode 100644
index 00000000..52ad0563
--- /dev/null
+++ b/src/crush/CrushCompiler.cc
@@ -0,0 +1,1276 @@
+
+#include "CrushCompiler.h"
+
+#if defined(_AIX)
+#define EBADE ECORRUPT
+#endif
+
+#ifndef EBADE
+#define EBADE EFTYPE
+#endif
+#include <string>
+#include "common/errno.h"
+#include <boost/algorithm/string.hpp>
+
+// -------------
+
+// Write the name of bucket type `t` to `out`.  Unnamed types fall back to
+// "device" for type 0 and "type<N>" for anything else.
+static void print_type_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *type_name = crush.get_type_name(t)) {
+    out << type_name;
+    return;
+  }
+  if (t == 0) {
+    out << "device";
+    return;
+  }
+  out << "type" << t;
+}
+
+// Write the name of item `t` to `out`.  Unnamed items are rendered as
+// "device<N>" for non-negative ids and "bucket<-1-N>" for negative ids.
+static void print_item_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *item_name = crush.get_item_name(t)) {
+    out << item_name;
+    return;
+  }
+  if (t < 0) {
+    out << "bucket" << (-1-t);
+    return;
+  }
+  out << "device" << t;
+}
+
+// Emit one "id <shadow id> class <class name>" line for every entry that
+// crush.class_bucket holds for bucket `t`.  Nothing is written when the
+// bucket has no entry.  Asserts that every class id has a name.
+static void print_bucket_class_ids(ostream& out, int t, CrushWrapper &crush)
+{
+  auto found = crush.class_bucket.find(t);
+  if (found == crush.class_bucket.end())
+    return;
+  for (auto &entry : found->second) {
+    const int class_id = entry.first;
+    const int shadow_id = entry.second;
+    const char *class_name = crush.get_class_name(class_id);
+    ceph_assert(class_name);
+    out << "\tid " << shadow_id << " class " << class_name
+        << "\t\t# do not change unnecessarily\n";
+  }
+}
+
+// Append " class <name>" to `out` when item `t` has a class; otherwise
+// write nothing.
+static void print_item_class(ostream& out, int t, CrushWrapper &crush)
+{
+  const char *class_name = crush.get_item_class(t);
+  if (!class_name)
+    return;
+  out << " class " << class_name;
+}
+
+// Append " class <name>" for class id `t`, or a comment flagging the id
+// when no name is known for it.
+static void print_class(ostream& out, int t, CrushWrapper &crush)
+{
+  const char *class_name = crush.get_class_name(t);
+  if (!class_name) {
+    out << " # unexpected class " << t;
+    return;
+  }
+  out << " class " << class_name;
+}
+
+// Write the name of rule `t`, or "rule<N>" when the rule has no name.
+static void print_rule_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *rule_name = crush.get_rule_name(t)) {
+    out << rule_name;
+    return;
+  }
+  out << "rule" << t;
+}
+
+// Print `i`, a value scaled by 0x10000, as a decimal number with three
+// digits after the decimal point.
+static void print_fixedpoint(ostream& out, int i)
+{
+  const float scaled = (float)i / (float)0x10000;
+  char buf[20];
+  snprintf(buf, sizeof(buf), "%.3f", scaled);
+  out << buf;
+}
+
+/**
+ * Write the textual definition of bucket `i` to `out`.
+ *
+ * A bucket that has a name rejected by is_valid_crush_name() is skipped
+ * without writing anything.  Always returns 0.
+ */
+int CrushCompiler::decompile_bucket_impl(int i, ostream &out)
+{
+  const char *bucket_name = crush.get_item_name(i);
+  if (bucket_name && !crush.is_valid_crush_name(bucket_name))
+    return 0;
+
+  // header line: "<type> <name> {", followed by the id lines
+  const int bucket_type = crush.get_bucket_type(i);
+  print_type_name(out, bucket_type, crush);
+  out << " ";
+  print_item_name(out, i, crush);
+  out << " {\n";
+  out << "\tid " << i << "\t\t# do not change unnecessarily\n";
+  print_bucket_class_ids(out, i, crush);
+
+  // the total weight is informational only, hence emitted as a comment
+  out << "\t# weight ";
+  print_fixedpoint(out, crush.get_bucket_weight(i));
+  out << "\n";
+
+  const int item_count = crush.get_bucket_size(i);
+  const int alg = crush.get_bucket_alg(i);
+  out << "\talg " << crush_bucket_alg_name(alg);
+
+  // annotate the alg line; uniform and tree buckets also get explicit
+  // "pos" fields on their items
+  bool show_pos = false;
+  if (alg == CRUSH_BUCKET_UNIFORM) {
+    out << "\t# do not change bucket size (" << item_count << ") unnecessarily";
+    show_pos = true;
+  } else if (alg == CRUSH_BUCKET_LIST) {
+    out << "\t# add new items at the end; do not change order unnecessarily";
+  } else if (alg == CRUSH_BUCKET_TREE) {
+    out << "\t# do not change pos for existing items unnecessarily";
+    show_pos = true;
+  }
+  out << "\n";
+
+  const int hash = crush.get_bucket_hash(i);
+  out << "\thash " << hash << "\t# " << crush_hash_name(hash) << "\n";
+
+  for (int pos = 0; pos < item_count; ++pos) {
+    const int member = crush.get_bucket_item(i, pos);
+    const int member_weight = crush.get_bucket_item_weight(i, pos);
+    out << "\titem ";
+    print_item_name(out, member, crush);
+    out << " weight ";
+    print_fixedpoint(out, member_weight);
+    if (show_pos)
+      out << " pos " << pos;
+
+    out << "\n";
+  }
+  out << "}\n";
+  return 0;
+}
+
+/* Depth-first output of bucket `cur` and everything below it.
+ *
+ * Every bucket contained in `cur` is written before `cur` itself.  The
+ * buckets are expected to form a directed acyclic graph that is not
+ * necessarily a tree, so `dcb_states` records which buckets are already
+ * written (nothing is output twice) and which are still in progress (so a
+ * cycle, which can arise through user error, is detected).
+ *
+ * Returns 0 on success, or when `cur` is 0 or not an existing bucket;
+ * -EINVAL when a cycle is found; -EBADE on an inconsistent state.
+ */
+int CrushCompiler::decompile_bucket(int cur,
+                                    std::map<int, dcb_state_t>& dcb_states,
+                                    ostream &out)
+{
+  if (cur == 0 || !crush.bucket_exists(cur))
+    return 0;
+
+  auto self = dcb_states.find(cur);
+  if (self != dcb_states.end()) {
+    if (self->second == DCB_STATE_DONE) {
+      // We already did this bucket.
+      return 0;
+    }
+    if (self->second == DCB_STATE_IN_PROGRESS) {
+      err << "decompile_crush_bucket: logic error: tried to decompile "
+        "a bucket that is already being decompiled" << std::endl;
+      return -EBADE;
+    }
+    err << "decompile_crush_bucket: logic error: illegal bucket state! "
+        << self->second << std::endl;
+    return -EBADE;
+  }
+
+  // First visit: mark this bucket as "in progress."
+  auto inserted = dcb_states.insert(
+    std::map<int, dcb_state_t>::value_type(cur, DCB_STATE_IN_PROGRESS));
+  ceph_assert(inserted.second);
+  self = inserted.first;
+
+  const int item_count = crush.get_bucket_size(cur);
+  for (int idx = 0; idx < item_count; ++idx) {
+    const int child = crush.get_bucket_item(cur, idx);
+    auto seen = dcb_states.find(child);
+    if (seen == dcb_states.end()) {
+      // not visited yet: descend into it
+      const int ret = decompile_bucket(child, dcb_states, out);
+      if (ret)
+        return ret;
+      continue;
+    }
+    if (seen->second == DCB_STATE_IN_PROGRESS) {
+      err << "decompile_crush_bucket: error: while trying to output bucket "
+          << cur << ", we found out that it contains one of the buckets that "
+          << "contain it. This is not allowed. The buckets must form a "
+          << "directed acyclic graph." << std::endl;
+      return -EINVAL;
+    }
+    if (seen->second != DCB_STATE_DONE) {
+      err << "decompile_crush_bucket: logic error: illegal bucket state "
+          << seen->second << std::endl;
+      return -EBADE;
+    }
+  }
+
+  decompile_bucket_impl(cur, out);
+  self->second = DCB_STATE_DONE;
+  return 0;
+}
+
+// Write one bracketed row holding every weight of `weight_set`.
+// Always returns 0.
+int CrushCompiler::decompile_weight_set_weights(crush_weight_set weight_set,
+                                                ostream &out)
+{
+  out << "      [ ";
+  __u32 idx = 0;
+  while (idx < weight_set.size) {
+    print_fixedpoint(out, weight_set.weights[idx]);
+    out << " ";
+    ++idx;
+  }
+  out << "]\n";
+  return 0;
+}
+
+// Write a "weight_set [ ... ]" section containing one row for each of the
+// `size` entries of `weight_set`.  Returns 0, or the first negative value
+// returned while writing a row.
+int CrushCompiler::decompile_weight_set(crush_weight_set *weight_set,
+                                        __u32 size,
+                                        ostream &out)
+{
+  out << "    weight_set [\n";
+  for (__u32 row = 0; row < size; ++row) {
+    const int ret = decompile_weight_set_weights(weight_set[row], out);
+    if (ret < 0)
+      return ret;
+  }
+  out << "    ]\n";
+  return 0;
+}
+
+// Write an "ids [ ... ]" line listing the `size` entries of `ids`.
+// Always returns 0.
+int CrushCompiler::decompile_ids(__s32 *ids,
+                                 __u32 size,
+                                 ostream &out)
+{
+  out << "    ids [ ";
+  for (__u32 idx = 0; idx < size; ++idx) {
+    out << ids[idx] << " ";
+  }
+  out << "]\n";
+  return 0;
+}
+
+// Write the "{ bucket_id ... }" section for one choose_arg.  The weight_set
+// and ids sub-sections are included only when they are non-empty.  Returns
+// 0, or the first negative value returned by a sub-section writer.
+int CrushCompiler::decompile_choose_arg(crush_choose_arg *arg,
+                                        int bucket_id,
+                                        ostream &out)
+{
+  out << "  {\n";
+  out << "    bucket_id " << bucket_id << "\n";
+  if (arg->weight_set_positions > 0) {
+    const int ret =
+      decompile_weight_set(arg->weight_set, arg->weight_set_positions, out);
+    if (ret < 0)
+      return ret;
+  }
+  if (arg->ids_size > 0) {
+    const int ret = decompile_ids(arg->ids, arg->ids_size, out);
+    if (ret < 0)
+      return ret;
+  }
+  out << "  }\n";
+  return 0;
+}
+
+// Write every entry of `arg_map` that carries ids or weight-set positions;
+// entries with neither are skipped.  The entry at index N is written with
+// bucket id -1-N.  Returns 0, or the first negative value from a writer.
+int CrushCompiler::decompile_choose_arg_map(crush_choose_arg_map arg_map,
+                                            ostream &out)
+{
+  for (__u32 slot = 0; slot < arg_map.size; ++slot) {
+    crush_choose_arg *entry = &arg_map.args[slot];
+    const bool is_empty =
+      (entry->ids_size == 0) && (entry->weight_set_positions == 0);
+    if (is_empty)
+      continue;
+    const int ret = decompile_choose_arg(entry, -1-slot, out);
+    if (ret < 0)
+      return ret;
+  }
+  return 0;
+}
+
+// Write one "choose_args <key> { ... }" section for the map entry `i`.
+// Returns 0, or the negative value returned while writing the body, in
+// which case the closing brace is not written.
+int CrushCompiler::decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
+                                         ostream &out)
+{
+  out << "choose_args " << i.first << " {\n";
+  const int ret = decompile_choose_arg_map(i.second, out);
+  if (ret < 0) {
+    return ret;
+  }
+  out << "}\n";
+  return 0;
+}
+
+/**
+ * Write the whole crush map to `out` in textual form.
+ *
+ * Sections are emitted in this order: tunables, devices, types, buckets,
+ * rules and, when present, choose_args.
+ *
+ * @return 0 on success, otherwise the first non-zero value returned while
+ *         writing a bucket, a "take" rule step or a choose_args section.
+ */
+int CrushCompiler::decompile(ostream &out)
+{
+ out << "# begin crush map\n";
+
+ // only dump tunables if they differ from the defaults
+ if (crush.get_choose_local_tries() != 2)
+ out << "tunable choose_local_tries " << crush.get_choose_local_tries() << "\n";
+ if (crush.get_choose_local_fallback_tries() != 5)
+ out << "tunable choose_local_fallback_tries " << crush.get_choose_local_fallback_tries() << "\n";
+ if (crush.get_choose_total_tries() != 19)
+ out << "tunable choose_total_tries " << crush.get_choose_total_tries() << "\n";
+ if (crush.get_chooseleaf_descend_once() != 0)
+ out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n";
+ if (crush.get_chooseleaf_vary_r() != 0)
+ out << "tunable chooseleaf_vary_r " << crush.get_chooseleaf_vary_r() << "\n";
+ if (crush.get_chooseleaf_stable() != 0)
+ out << "tunable chooseleaf_stable " << crush.get_chooseleaf_stable() << "\n";
+ if (crush.get_straw_calc_version() != 0)
+ out << "tunable straw_calc_version " << crush.get_straw_calc_version() << "\n";
+ if (crush.get_allowed_bucket_algs() != CRUSH_LEGACY_ALLOWED_BUCKET_ALGS)
+ out << "tunable allowed_bucket_algs " << crush.get_allowed_bucket_algs()
+ << "\n";
+
+ // devices: only ids that have a name are listed
+ out << "\n# devices\n";
+ for (int i=0; i<crush.get_max_devices(); i++) {
+ const char *name = crush.get_item_name(i);
+ if (name) {
+ out << "device " << i << " " << name;
+ print_item_class(out, i, crush);
+ out << "\n";
+ }
+ }
+
+ // types: n counts the named types still to be written.  Ids without a
+ // name are skipped, except id 0, which is written as "osd".
+ out << "\n# types\n";
+ int n = crush.get_num_type_names();
+ for (int i=0; n; i++) {
+ const char *name = crush.get_type_name(i);
+ if (!name) {
+ if (i == 0) out << "type 0 osd\n";
+ continue;
+ }
+ n--;
+ out << "type " << i << " " << name << "\n";
+ }
+
+ // buckets: ids run from -1 downwards; dcb_states is shared across the
+ // calls so that each bucket is written once
+ out << "\n# buckets\n";
+ std::map<int, dcb_state_t> dcb_states;
+ for (int bucket = -1; bucket > -1-crush.get_max_buckets(); --bucket) {
+ int ret = decompile_bucket(bucket, dcb_states, out);
+ if (ret)
+ return ret;
+ }
+
+ out << "\n# rules\n";
+ for (int i=0; i<crush.get_max_rules(); i++) {
+ if (!crush.rule_exists(i))
+ continue;
+ out << "rule ";
+ if (crush.get_rule_name(i))
+ print_rule_name(out, i, crush);
+ out << " {\n";
+ out << "\tid " << i << "\n";
+ if (i != crush.get_rule_mask_ruleset(i)) {
+ out << "\t# WARNING: ruleset " << crush.get_rule_mask_ruleset(i) << " != id " << i << "; this will not recompile to the same map\n";
+ }
+
+ switch (crush.get_rule_mask_type(i)) {
+ case CEPH_PG_TYPE_REPLICATED:
+ out << "\ttype replicated\n";
+ break;
+ case CEPH_PG_TYPE_ERASURE:
+ out << "\ttype erasure\n";
+ break;
+ default:
+ out << "\ttype " << crush.get_rule_mask_type(i) << "\n";
+ }
+
+ out << "\tmin_size " << crush.get_rule_mask_min_size(i) << "\n";
+ out << "\tmax_size " << crush.get_rule_mask_max_size(i) << "\n";
+
+ // one "step ..." line per rule step; ops not listed here write nothing
+ for (int j=0; j<crush.get_rule_len(i); j++) {
+ switch (crush.get_rule_op(i, j)) {
+ case CRUSH_RULE_NOOP:
+ out << "\tstep noop\n";
+ break;
+ case CRUSH_RULE_TAKE:
+ out << "\tstep take ";
+ {
+ // when split_id_class() reports a class (c >= 0), print the original
+ // item followed by " class <name>" instead of the stored id
+ int step_item = crush.get_rule_arg1(i, j);
+ int original_item;
+ int c;
+ int res = crush.split_id_class(step_item, &original_item, &c);
+ if (res < 0)
+ return res;
+ if (c >= 0)
+ step_item = original_item;
+ print_item_name(out, step_item, crush);
+ if (c >= 0)
+ print_class(out, c, crush);
+ }
+ out << "\n";
+ break;
+ case CRUSH_RULE_EMIT:
+ out << "\tstep emit\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ out << "\tstep set_choose_tries " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ out << "\tstep set_choose_local_tries " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ out << "\tstep set_choose_local_fallback_tries " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ out << "\tstep set_chooseleaf_tries " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ out << "\tstep set_chooseleaf_vary_r " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+ out << "\tstep set_chooseleaf_stable " << crush.get_rule_arg1(i, j)
+ << "\n";
+ break;
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ out << "\tstep choose firstn "
+ << crush.get_rule_arg1(i, j)
+ << " type ";
+ print_type_name(out, crush.get_rule_arg2(i, j), crush);
+ out << "\n";
+ break;
+ case CRUSH_RULE_CHOOSE_INDEP:
+ out << "\tstep choose indep "
+ << crush.get_rule_arg1(i, j)
+ << " type ";
+ print_type_name(out, crush.get_rule_arg2(i, j), crush);
+ out << "\n";
+ break;
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+ out << "\tstep chooseleaf firstn "
+ << crush.get_rule_arg1(i, j)
+ << " type ";
+ print_type_name(out, crush.get_rule_arg2(i, j), crush);
+ out << "\n";
+ break;
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
+ out << "\tstep chooseleaf indep "
+ << crush.get_rule_arg1(i, j)
+ << " type ";
+ print_type_name(out, crush.get_rule_arg2(i, j), crush);
+ out << "\n";
+ break;
+ }
+ }
+ out << "}\n";
+ }
+ if (crush.choose_args.size() > 0) {
+ out << "\n# choose_args\n";
+ // bind by reference: there is no need to copy each map entry
+ for (const auto& i : crush.choose_args) {
+ int ret = decompile_choose_args(i, out);
+ if (ret)
+ return ret;
+ }
+ }
+ out << "\n# end crush map" << std::endl;
+ return 0;
+}
+
+
+// ================================================================
+
+// Return the text covered by `node` with surrounding whitespace trimmed.
+string CrushCompiler::string_node(node_t &node)
+{
+  string raw(node.value.begin(), node.value.end());
+  return boost::trim_copy(raw);
+}
+
+int CrushCompiler::int_node(node_t &node)
+{
+ string str = string_node(node);
+ return strtol(str.c_str(), 0, 10);
+}
+
+float CrushCompiler::float_node(node_t &node)
+{
+ string s = string_node(node);
+ return strtof(s.c_str(), 0);
+}
+
+/**
+ * Handle one "device <id> <name> [class <class>]" statement.
+ *
+ * The duplicate-name check runs before anything is written to `crush`, so
+ * a rejected statement leaves the map untouched.  A failure reported by
+ * set_item_name() is propagated instead of being ignored.
+ *
+ * @return 0 on success, -1 if the name is already defined, or the negative
+ *         value returned by set_item_name().
+ */
+int CrushCompiler::parse_device(iter_t const& i)
+{
+  int id = int_node(i->children[1]);
+  string name = string_node(i->children[2]);
+
+  if (item_id.count(name)) {
+    err << "item " << name << " defined twice" << std::endl;
+    return -1;
+  }
+
+  int r = crush.set_item_name(id, name.c_str());
+  if (r < 0) {
+    err << "unable to set name '" << name << "' for device " << id << std::endl;
+    return r;
+  }
+  item_id[name] = id;
+  id_item[id] = name;
+
+  if (verbose) err << "device " << id << " '" << name << "'";
+
+  // children[3] is the "class" keyword, children[4] the class name
+  if (i->children.size() > 3) {
+    string c = string_node(i->children[4]);
+    crush.set_item_class(id, c);
+    if (verbose) err << " class" << " '" << c << "'" << std::endl;
+  } else {
+    if (verbose) err << std::endl;
+  }
+  return 0;
+}
+
+/**
+ * Handle one "tunable <name> <value>" statement by forwarding the value to
+ * the matching setter on `crush`.
+ *
+ * @return 0 on success, -1 when the tunable name is not recognized.
+ */
+int CrushCompiler::parse_tunable(iter_t const& i)
+{
+  const string key = string_node(i->children[1]);
+  const int value = int_node(i->children[2]);
+
+  if (key == "choose_local_tries") {
+    crush.set_choose_local_tries(value);
+  } else if (key == "choose_local_fallback_tries") {
+    crush.set_choose_local_fallback_tries(value);
+  } else if (key == "choose_total_tries") {
+    crush.set_choose_total_tries(value);
+  } else if (key == "chooseleaf_descend_once") {
+    crush.set_chooseleaf_descend_once(value);
+  } else if (key == "chooseleaf_vary_r") {
+    crush.set_chooseleaf_vary_r(value);
+  } else if (key == "chooseleaf_stable") {
+    crush.set_chooseleaf_stable(value);
+  } else if (key == "straw_calc_version") {
+    crush.set_straw_calc_version(value);
+  } else if (key == "allowed_bucket_algs") {
+    crush.set_allowed_bucket_algs(value);
+  } else {
+    err << "tunable " << key << " not recognized" << std::endl;
+    return -1;
+  }
+
+  /*
+   * The current crop of tunables are all "safe" now.  Re-enable a guard
+   * like the one below when new, not yet safe, tunables are added:
+   *
+   *   if (!unsafe_tunables) {
+   *     err << "tunables are NOT FULLY IMPLEMENTED; enable with --enable-unsafe-tunables to enable this feature" << std::endl;
+   *     return -1;
+   *   }
+   */
+
+  if (verbose)
+    err << "tunable " << key << " " << value << std::endl;
+  return 0;
+}
+
+// Handle one "type <id> <name>" statement: remember the name -> id mapping
+// locally and register the name with `crush`.  Always returns 0.
+int CrushCompiler::parse_bucket_type(iter_t const& i)
+{
+  const int type_number = int_node(i->children[1]);
+  const string type_name = string_node(i->children[2]);
+  if (verbose) {
+    err << "type " << type_number << " '" << type_name << "'" << std::endl;
+  }
+  type_id[type_name] = type_number;
+  crush.set_type_name(type_number, type_name.c_str());
+  return 0;
+}
+
+/**
+ * Handle one bucket definition and add the resulting bucket to `crush`.
+ *
+ * The body is walked twice: the first pass reads id, alg and hash and
+ * records which item positions are claimed explicitly; the second pass
+ * resolves every item, places it and accumulates the bucket weight.
+ *
+ * @return the result of the final set_item_name() call on success; -1,
+ *         -EINVAL or -ERANGE for invalid input; or the negative value
+ *         returned by add_bucket().
+ */
+int CrushCompiler::parse_bucket(iter_t const& i)
+{
+ string tname = string_node(i->children[0]);
+ if (!type_id.count(tname)) {
+ err << "bucket type '" << tname << "' is not defined" << std::endl;
+ return -1;
+ }
+ int type = type_id[tname];
+
+ string name = string_node(i->children[1]);
+ if (item_id.count(name)) {
+ err << "bucket or device '" << name << "' is already defined" << std::endl;
+ return -1;
+ }
+
+ int id = 0; // none, yet!
+ int alg = -1;
+ int hash = 0;
+ set<int> used_items;
+ int size = 0;
+ map<int32_t, int32_t> class_id;
+
+ // first pass over the body; the loop skips the first three children and
+ // the last one
+ for (unsigned p=3; p<i->children.size()-1; p++) {
+ iter_t sub = i->children.begin() + p;
+ string tag = string_node(sub->children[0]);
+ //err << "tag " << tag << std::endl;
+ if (tag == "id") {
+ int maybe_id = int_node(sub->children[1]);
+ if (verbose) err << "bucket " << name << " id " << maybe_id;
+ if (sub->children.size() > 2) {
+ string class_name = string_node(sub->children[3]);
+ // note that we do not verify class existence here,
+ // as this bucket might come from an empty shadow tree
+ // which currently has no OSDs but is still referenced by a rule!
+ int cid = crush.get_or_create_class_id(class_name);
+ if (class_id.count(cid) != 0) {
+ err << "duplicate device class " << class_name << " for bucket " << name << std::endl;
+ return -ERANGE;
+ }
+ class_id[cid] = maybe_id;
+ if (verbose) err << " class" << " '" << class_name << "'" << std::endl;
+ } else {
+ id = maybe_id;
+ if (verbose) err << std::endl;
+ }
+ } else if (tag == "alg") {
+ string a = string_node(sub->children[1]);
+ if (a == "uniform")
+ alg = CRUSH_BUCKET_UNIFORM;
+ else if (a == "list")
+ alg = CRUSH_BUCKET_LIST;
+ else if (a == "tree")
+ alg = CRUSH_BUCKET_TREE;
+ else if (a == "straw")
+ alg = CRUSH_BUCKET_STRAW;
+ else if (a == "straw2")
+ alg = CRUSH_BUCKET_STRAW2;
+ else {
+ err << "unknown bucket alg '" << a << "'" << std::endl << std::endl;
+ return -EINVAL;
+ }
+ }
+ else if (tag == "hash") {
+ string a = string_node(sub->children[1]);
+ if (a == "rjenkins1")
+ hash = CRUSH_HASH_RJENKINS1;
+ else
+ hash = atoi(a.c_str());
+ }
+ else if (tag == "item") {
+ // first, just determine which item pos's are already used
+ size++;
+ for (unsigned q = 2; q < sub->children.size(); q++) {
+ string tag = string_node(sub->children[q++]);
+ if (tag == "pos") {
+ int pos = int_node(sub->children[q]);
+ if (used_items.count(pos)) {
+ err << "item '" << string_node(sub->children[1]) << "' in bucket '" << name << "' has explicit pos " << pos << ", which is occupied" << std::endl;
+ return -1;
+ }
+ used_items.insert(pos);
+ }
+ }
+ }
+ else ceph_abort();
+ }
+
+ // now do the items.
+ if (!used_items.empty())
+ size = std::max(size, *used_items.rbegin());
+ vector<int> items(size);
+ vector<int> weights(size);
+
+ int curpos = 0;
+ unsigned bucketweight = 0;
+ bool have_uniform_weight = false;
+ unsigned uniform_weight = 0;
+ for (unsigned p=3; p<i->children.size()-1; p++) {
+ iter_t sub = i->children.begin() + p;
+ string tag = string_node(sub->children[0]);
+ if (tag == "item") {
+
+ string iname = string_node(sub->children[1]);
+ if (!item_id.count(iname)) {
+ err << "item '" << iname << "' in bucket '" << name << "' is not defined" << std::endl;
+ return -1;
+ }
+ int itemid = item_id[iname];
+
+ // default weight: 0x10000, or the weight recorded for this item id
+ unsigned weight = 0x10000;
+ if (item_weight.count(itemid))
+ weight = item_weight[itemid];
+
+ int pos = -1;
+ for (unsigned q = 2; q < sub->children.size(); q++) {
+ string tag = string_node(sub->children[q++]);
+ if (tag == "weight") {
+ weight = float_node(sub->children[q]) * (float)0x10000;
+ if (weight > CRUSH_MAX_DEVICE_WEIGHT && itemid >= 0) {
+ err << "device weight limited to " << CRUSH_MAX_DEVICE_WEIGHT / 0x10000 << std::endl;
+ return -ERANGE;
+ }
+ else if (weight > CRUSH_MAX_BUCKET_WEIGHT && itemid < 0) {
+ err << "bucket weight limited to " << CRUSH_MAX_BUCKET_WEIGHT / 0x10000
+ << " to prevent overflow" << std::endl;
+ return -ERANGE;
+ }
+ }
+ else if (tag == "pos")
+ pos = int_node(sub->children[q]);
+ else
+ ceph_abort();
+
+ }
+ if (alg == CRUSH_BUCKET_UNIFORM) {
+ if (!have_uniform_weight) {
+ have_uniform_weight = true;
+ uniform_weight = weight;
+ } else {
+ if (uniform_weight != weight) {
+ // report both weights in the same unit (scaled down by 0x10000)
+ err << "item '" << iname << "' in uniform bucket '" << name << "' has weight " << (float)weight/(float)0x10000
+ << " but previous item(s) have weight " << (float)uniform_weight/(float)0x10000
+ << "; uniform bucket items must all have identical weights." << std::endl;
+ return -1;
+ }
+ }
+ }
+
+ if (pos >= size) {
+ err << "item '" << iname << "' in bucket '" << name << "' has pos " << pos << " >= size " << size << std::endl;
+ return -1;
+ }
+ // no explicit pos: take the next slot not claimed explicitly
+ if (pos < 0) {
+ while (used_items.count(curpos)) curpos++;
+ pos = curpos++;
+ }
+ //err << " item " << iname << " (" << itemid << ") pos " << pos << " weight " << weight << std::endl;
+ items[pos] = itemid;
+ weights[pos] = weight;
+
+ if (crush_addition_is_unsafe(bucketweight, weight)) {
+ err << "oh no! our bucket weights are overflowing all over the place, better lower the item weights" << std::endl;
+ return -ERANGE;
+ }
+
+ bucketweight += weight;
+ }
+ }
+
+ // no explicit id: pick the first negative id not present in id_item
+ if (id == 0) {
+ for (id=-1; id_item.count(id); id--) ;
+ //err << "assigned id " << id << std::endl;
+ }
+
+ for (auto &i : class_id)
+ class_bucket[id][i.first] = i.second;
+
+ if (verbose) err << "bucket " << name << " (" << id << ") " << size << " items and weight "
+ << (float)bucketweight / (float)0x10000 << std::endl;
+ id_item[id] = name;
+ item_id[name] = id;
+ item_weight[id] = bucketweight;
+
+ ceph_assert(id != 0);
+ int idout;
+ int r = crush.add_bucket(id, alg, hash, type, size,
+ items.data(), weights.data(), &idout);
+ if (r < 0) {
+ if (r == -EEXIST)
+ err << "Duplicate bucket id " << id << std::endl;
+ else
+ err << "add_bucket failed " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ r = crush.set_item_name(id, name.c_str());
+ return r;
+}
+
+/**
+ * Handle one rule definition: create the rule in `crush` and fill in its
+ * steps.
+ *
+ * All fields are read at offsets relative to `start`, which is 4 when the
+ * rule has a name and 3 when it does not.
+ *
+ * @return 0 on success; -1, -EINVAL or a negative class id lookup result
+ *         on invalid input.  Aborts on an unexpected rule type or on an
+ *         unexpected choose keyword or mode.
+ */
+int CrushCompiler::parse_rule(iter_t const& i)
+{
+ int start; // rule name is optional!
+
+ // children[1] is either the rule name or, for an unnamed rule, "{"
+ string rname = string_node(i->children[1]);
+ if (rname != "{") {
+ if (rule_id.count(rname)) {
+ err << "rule name '" << rname << "' already defined\n" << std::endl;
+ return -1;
+ }
+ start = 4;
+ } else {
+ rname = string();
+ start = 3;
+ }
+
+ int ruleno = int_node(i->children[start]);
+
+ string tname = string_node(i->children[start+2]);
+ int type;
+ if (tname == "replicated")
+ type = CEPH_PG_TYPE_REPLICATED;
+ else if (tname == "erasure")
+ type = CEPH_PG_TYPE_ERASURE;
+ else
+ ceph_abort();
+
+ int minsize = int_node(i->children[start+4]);
+ int maxsize = int_node(i->children[start+6]);
+
+ // every child that is not part of the fixed rule header/footer is a step
+ int steps = i->children.size() - start - 8;
+ //err << "num steps " << steps << std::endl;
+
+ if (crush.rule_exists(ruleno)) {
+ err << "rule " << ruleno << " already exists" << std::endl;
+ return -1;
+ }
+ // add_rule() is expected to hand back the id that was asked for
+ int r = crush.add_rule(ruleno, steps, type, minsize, maxsize);
+ if (r != ruleno) {
+ err << "unable to add rule id " << ruleno << " for rule '" << rname
+ << "'" << std::endl;
+ return -1;
+ }
+ if (rname.length()) {
+ crush.set_rule_name(ruleno, rname.c_str());
+ rule_id[rname] = ruleno;
+ }
+
+ // walk the step nodes; `step` is the index of the next step to fill in
+ int step = 0;
+ for (iter_t p = i->children.begin() + start + 7; step < steps; p++) {
+ iter_t s = p->children.begin() + 1;
+ int stepid = s->value.id().to_long();
+ switch (stepid) {
+ case crush_grammar::_step_take:
+ {
+ string item = string_node(s->children[1]);
+ if (!item_id.count(item)) {
+ err << "in rule '" << rname << "' item '" << item << "' not defined" << std::endl;
+ return -1;
+ }
+ int id = item_id[item];
+ int c = -1;
+ string class_name;
+ // "take <item> class <name>": replace the item id with the id that
+ // crush.class_bucket records for that item and class
+ if (s->children.size() > 2) {
+ class_name = string_node(s->children[3]);
+ c = crush.get_class_id(class_name);
+ if (c < 0)
+ return c;
+ if (crush.class_bucket.count(id) == 0) {
+ err << "in rule '" << rname << "' step take " << item
+ << " has no class information" << std::endl;
+ return -EINVAL;
+ }
+ if (crush.class_bucket[id].count(c) == 0) {
+ err << "in rule '" << rname << "' step take " << item
+ << " no matching bucket for class " << class_name << std::endl;
+ return -EINVAL;
+ }
+ id = crush.class_bucket[id][c];
+ }
+ if (verbose) {
+ err << "rule " << rname << " take " << item;
+ if (c < 0)
+ err << std::endl;
+ else
+ err << " remapped to " << crush.get_item_name(id) << std::endl;
+ }
+
+ crush.set_rule_step_take(ruleno, step++, id);
+ }
+ break;
+
+ case crush_grammar::_step_set_choose_tries:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_choose_tries(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_set_choose_local_tries:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_choose_local_tries(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_set_choose_local_fallback_tries:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_choose_local_fallback_tries(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_set_chooseleaf_tries:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_chooseleaf_tries(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_set_chooseleaf_vary_r:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_chooseleaf_vary_r(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_set_chooseleaf_stable:
+ {
+ int val = int_node(s->children[1]);
+ crush.set_rule_step_set_chooseleaf_stable(ruleno, step++, val);
+ }
+ break;
+
+ case crush_grammar::_step_choose:
+ case crush_grammar::_step_chooseleaf:
+ {
+ // children: [0] choose|chooseleaf, [1] firstn|indep, [2] count,
+ // [4] type name
+ string type = string_node(s->children[4]);
+ if (!type_id.count(type)) {
+ err << "in rule '" << rname << "' type '" << type << "' not defined" << std::endl;
+ return -1;
+ }
+ string choose = string_node(s->children[0]);
+ string mode = string_node(s->children[1]);
+ if (choose == "choose") {
+ if (mode == "firstn")
+ crush.set_rule_step_choose_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]);
+ else if (mode == "indep")
+ crush.set_rule_step_choose_indep(ruleno, step++, int_node(s->children[2]), type_id[type]);
+ else ceph_abort();
+ } else if (choose == "chooseleaf") {
+ if (mode == "firstn")
+ crush.set_rule_step_choose_leaf_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]);
+ else if (mode == "indep")
+ crush.set_rule_step_choose_leaf_indep(ruleno, step++, int_node(s->children[2]), type_id[type]);
+ else ceph_abort();
+ } else ceph_abort();
+ }
+ break;
+
+ case crush_grammar::_step_emit:
+ crush.set_rule_step_emit(ruleno, step++);
+ break;
+
+ default:
+ err << "bad crush step " << stepid << std::endl;
+ return -1;
+ }
+ }
+ ceph_assert(step == steps);
+ return 0;
+}
+
+/**
+ * Parse one bracketed row of weights into `weight_set`.
+ *
+ * The row must hold exactly as many weights as bucket `bucket_id` has
+ * items.  `weight_set` is only written once the allocation has succeeded,
+ * so it never ends up with a non-zero size and a null array.
+ *
+ * @return 0 on success, -1 on a size mismatch, -ENOMEM if the weights
+ *         array cannot be allocated.
+ */
+int CrushCompiler::parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set)
+{
+  // -2 for the enclosing [ ]
+  __u32 size = i->children.size() - 2;
+  __u32 bucket_size = crush.get_bucket_size(bucket_id);
+  if (size != bucket_size) {
+    err << bucket_id << " needs exactly " << bucket_size
+        << " weights but got " << size << std::endl;
+    return -1;
+  }
+  __u32 *weights = (__u32 *)calloc(size, sizeof(__u32));
+  // calloc() may legitimately return a null pointer for a zero-sized request
+  if (size > 0 && !weights) {
+    err << "unable to allocate " << size << " weights for bucket "
+        << bucket_id << std::endl;
+    return -ENOMEM;
+  }
+  weight_set->size = size;
+  weight_set->weights = weights;
+  // start after the opening bracket; the `pos < size` guard keeps the
+  // closing bracket from being stored as a weight
+  __u32 pos = 0;
+  for (iter_t p = i->children.begin() + 1; p != i->children.end(); p++, pos++)
+    if (pos < size)
+      weight_set->weights[pos] = float_node(*p) * (float)0x10000;
+  return 0;
+}
+
+/**
+ * Parse a "weight_set [ ... ]" section into `arg`.
+ *
+ * The array is attached to `arg` before the rows are parsed, so rows that
+ * were already filled in remain reachable through `arg` if a later row
+ * fails.
+ *
+ * @return 0 on success, -1 on malformed input, -ENOMEM if the array cannot
+ *         be allocated, or the negative value returned while parsing a row.
+ */
+int CrushCompiler::parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg)
+{
+  // -3 stands for the leading "weight_set" keyword and the enclosing [ ]
+  __u32 positions = i->children.size() - 3;
+  crush_weight_set *sets =
+    (crush_weight_set *)calloc(positions, sizeof(crush_weight_set));
+  // calloc() may legitimately return a null pointer for a zero-sized request
+  if (positions > 0 && !sets) {
+    err << "unable to allocate " << positions << " weight_set positions for bucket "
+        << bucket_id << std::endl;
+    return -ENOMEM;
+  }
+  arg->weight_set_positions = positions;
+  arg->weight_set = sets;
+  __u32 pos = 0;
+  for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
+    int r = 0;
+    switch((int)p->value.id().to_long()) {
+    case crush_grammar::_weight_set_weights:
+      if (pos < arg->weight_set_positions) {
+        r = parse_weight_set_weights(p, bucket_id, &arg->weight_set[pos]);
+        pos++;
+      } else {
+        err << "invalid weight_set syntax" << std::endl;
+        r = -1;
+      }
+      break;
+    default:
+      // keyword and bracket nodes carry nothing to parse
+      break;
+    }
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+/**
+ * Parse an "ids [ ... ]" section into `arg`.
+ *
+ * The section must list exactly as many ids as bucket `bucket_id` has
+ * items.  `arg` is only written once the allocation has succeeded.
+ *
+ * @return 0 on success, -1 on a size mismatch, -ENOMEM if the ids array
+ *         cannot be allocated.
+ */
+int CrushCompiler::parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *arg)
+{
+  // -3 for the leading "ids" keyword and the enclosing [ ]
+  __u32 size = i->children.size() - 3;
+  __u32 bucket_size = crush.get_bucket_size(bucket_id);
+  if (size != bucket_size) {
+    err << bucket_id << " needs exactly " << bucket_size
+        << " ids but got " << size << std::endl;
+    return -1;
+  }
+  __s32 *ids = (__s32 *)calloc(size, sizeof(__s32));
+  // calloc() may legitimately return a null pointer for a zero-sized request
+  if (size > 0 && !ids) {
+    err << "unable to allocate " << size << " ids for bucket "
+        << bucket_id << std::endl;
+    return -ENOMEM;
+  }
+  arg->ids_size = size;
+  arg->ids = ids;
+  // the first two children are the keyword and the opening bracket
+  __u32 pos = 0;
+  for (iter_t p = i->children.begin() + 2; pos < size; p++, pos++)
+    arg->ids[pos] = int_node(*p);
+  return 0;
+}
+
+/**
+ * Parse one choose_arg section into the slot of `args` that belongs to the
+ * bucket it names (bucket id B uses slot -1-B).
+ *
+ * @return 0 on success, -1 when the bucket id is out of range or does not
+ *         exist, or the negative value returned by a sub-section parser.
+ */
+int CrushCompiler::parse_choose_arg(iter_t const& i, crush_choose_arg *args)
+{
+  const int bucket_id = int_node(i->children[2]);
+  const int slot = -1 - bucket_id;
+  if (slot < 0 || slot >= crush.get_max_buckets()) {
+    err << bucket_id << " is out of range" << std::endl;
+    return -1;
+  }
+  if (!crush.bucket_exists(bucket_id)) {
+    err << bucket_id << " does not exist" << std::endl;
+    return -1;
+  }
+
+  crush_choose_arg *target = &args[slot];
+  for (iter_t child = i->children.begin(); child != i->children.end(); child++) {
+    const int kind = (int)child->value.id().to_long();
+    int ret = 0;
+    if (kind == crush_grammar::_weight_set) {
+      ret = parse_weight_set(child, bucket_id, target);
+    } else if (kind == crush_grammar::_choose_arg_ids) {
+      ret = parse_choose_arg_ids(child, bucket_id, target);
+    }
+    // any other child is not a sub-section and is ignored
+    if (ret < 0)
+      return ret;
+  }
+  return 0;
+}
+
+/**
+ * Parse one "choose_args <index> { ... }" section and store the result in
+ * crush.choose_args under that index.
+ *
+ * The map gets one slot per possible bucket.  If a contained choose_arg
+ * fails to parse, the partially filled map is released through
+ * destroy_choose_args().
+ *
+ * @return 0 on success, -1 for a duplicated index or an invalid bucket
+ *         count, -ENOMEM if the slot array cannot be allocated, or the
+ *         negative value returned while parsing a choose_arg.
+ */
+int CrushCompiler::parse_choose_args(iter_t const& i)
+{
+  int choose_arg_index = int_node(i->children[1]);
+  if (crush.choose_args.find(choose_arg_index) != crush.choose_args.end()) {
+    err << choose_arg_index << " duplicated" << std::endl;
+    return -1;
+  }
+  const auto max_buckets = crush.get_max_buckets();
+  if (max_buckets < 0) {
+    err << "get_max_buckets() returned error" << std::endl;
+    return -1;
+  }
+  crush_choose_arg_map arg_map;
+  arg_map.size = max_buckets;
+  arg_map.args = (crush_choose_arg *)calloc(arg_map.size, sizeof(crush_choose_arg));
+  // calloc() may legitimately return a null pointer for a zero-sized
+  // request; for any other size a null array would be dereferenced by
+  // parse_choose_arg() below
+  if (arg_map.size > 0 && !arg_map.args) {
+    err << "unable to allocate choose_args " << choose_arg_index << std::endl;
+    return -ENOMEM;
+  }
+  for (iter_t p = i->children.begin() + 2; p != i->children.end(); p++) {
+    int r = 0;
+    switch((int)p->value.id().to_long()) {
+    case crush_grammar::_choose_arg:
+      r = parse_choose_arg(p, arg_map.args);
+      break;
+    }
+    if (r < 0) {
+      crush.destroy_choose_args(arg_map);
+      return r;
+    }
+  }
+  crush.choose_args[choose_arg_index] = arg_map;
+  return 0;
+}
+
+// Scan every bucket node below `i` and record, with an empty name, each id
+// listed on the leading run of "id" lines of its body.  Scanning of a
+// bucket stops at its first line that is not an "id" line.
+void CrushCompiler::find_used_bucket_ids(iter_t const& i)
+{
+  for (iter_t node = i->children.begin(); node != i->children.end(); node++) {
+    if ((int)node->value.id().to_long() != crush_grammar::_bucket)
+      continue;
+
+    // the bucket body is scanned from its fourth child onwards
+    iter_t line = node->children.begin() + 3;
+    while (line != node->children.end()) {
+      const string keyword = string_node(line->children[0]);
+      if (keyword != "id")
+        break;
+      const int used_id = int_node(line->children[1]);
+      //err << "saw bucket id " << used_id << std::endl;
+      id_item[used_id] = string();
+      ++line;
+    }
+  }
+}
+
+/**
+ * Walk the top-level statements of a parsed crush map and hand each one to
+ * its parser, then finalize the map.
+ *
+ * Buckets must all appear before the first rule; classes are populated
+ * from class_bucket right before that first rule is parsed.
+ *
+ * @return 0 on success, -1 if a bucket follows a rule, or the first
+ *         negative value returned by a statement parser.  Aborts on an
+ *         unknown statement kind.
+ */
+int CrushCompiler::parse_crush(iter_t const& i)
+{
+  find_used_bucket_ids(i);
+
+  bool rules_started = false;
+  for (iter_t stmt = i->children.begin(); stmt != i->children.end(); stmt++) {
+    int ret = 0;
+    switch (stmt->value.id().to_long()) {
+    case crush_grammar::_tunable:
+      ret = parse_tunable(stmt);
+      break;
+    case crush_grammar::_device:
+      ret = parse_device(stmt);
+      break;
+    case crush_grammar::_bucket_type:
+      ret = parse_bucket_type(stmt);
+      break;
+    case crush_grammar::_bucket:
+      if (rules_started) {
+        err << "buckets must be defined before rules" << std::endl;
+        return -1;
+      }
+      ret = parse_bucket(stmt);
+      break;
+    case crush_grammar::_crushrule:
+      if (!rules_started) {
+        // first rule seen
+        rules_started = true;
+        crush.populate_classes(class_bucket);
+      }
+      ret = parse_rule(stmt);
+      break;
+    case crush_grammar::_choose_args:
+      ret = parse_choose_args(stmt);
+      break;
+    default:
+      ceph_abort();
+    }
+    if (ret < 0)
+      return ret;
+  }
+
+  //err << "max_devices " << crush.get_max_devices() << std::endl;
+  crush.finalize();
+
+  return 0;
+}
+// squash runs of whitespace to one space, excepting newlines; leading and
+// trailing whitespace (again excepting newlines) is dropped entirely
+string CrushCompiler::consolidate_whitespace(string in)
+{
+  string out;
+  // true while inside a run of non-newline whitespace whose separator has not been emitted yet
+  bool white = false;
+  for (unsigned p=0; p<in.length(); p++) {
+    // isspace() is undefined for negative arguments and plain char may be signed, so go through unsigned char
+    const unsigned char c = static_cast<unsigned char>(in[p]);
+    if (isspace(c) && c != '\n') {
+      white = true;
+      continue;
+    }
+    if (white) {
+      if (out.length()) out += " ";   // pending separator, but never as the first output character
+      white = false;
+    }
+    out += in[p];
+  }
+  if (verbose > 3)
+    err << " \"" << in << "\" -> \"" << out << "\"" << std::endl;
+  return out;
+}
+// Debug helper: print the parse (sub)tree rooted at i to err, one node per line, with ind tabs of indentation that grow by one per tree level.
+void CrushCompiler::dump(iter_t const& i, int ind)
+{
+  err << "dump";
+  for (int j=0; j<ind; j++)
+    err << "\t";   // was cout, which split one logical line across two streams
+  long id = i->value.id().to_long();
+  err << id << "\t";
+  err << "'" << string(i->value.begin(), i->value.end())
+      << "' " << i->children.size() << " children" << std::endl;
+  for (unsigned int j = 0; j < i->children.size(); j++)
+    dump(i->children.begin() + j, ind+1);
+}
+
+/**
+* This function fixes forward references between buckets, such as
+* rack using_foo { item foo }
+* host foo { ... }
+*
+* If an item used by a bucket is defined after that bucket, the CRUSH
+* compiler would create a map in which that item cannot be identified when
+* selecting within the bucket.  Returns -1 on mutual inclusion, 0 otherwise.
+**/
+int CrushCompiler::adjust_bucket_item_place(iter_t const &i)
+{
+ map<string,set<string> > bucket_items; // bucket name -> names of the items it lists
+ map<string,iter_t> bucket_itrer; // bucket name -> its node in the parse tree
+ vector<string> buckets; // bucket names in order of appearance
+ for (iter_t p = i->children.begin(); p != i->children.end(); ++p) {
+ if ((int)p->value.id().to_long() == crush_grammar::_bucket) {
+ string name = string_node(p->children[1]);
+ buckets.push_back(name);
+ bucket_itrer[name] = p;
+ //skip non-bucket-item children in the bucket's parse tree
+ for (unsigned q=3; q < p->children.size()-1; ++q) { // NOTE(review): size()-1 is unsigned; assumes a bucket node always has children -- confirm against the grammar
+ iter_t sub = p->children.begin() + q;
+ if ((int)sub->value.id().to_long()
+ == crush_grammar::_bucket_item) {
+ string iname = string_node(sub->children[1]);
+ bucket_items[name].insert(iname);
+ }
+ }
+ }
+ }
+ // NOTE(review): iter_swap below exchanges tree nodes, but buckets / bucket_itrer keep their original order and mapping -- confirm that chains of forward references are still handled
+ //adjust the bucket
+ for (unsigned i=0; i < buckets.size(); ++i) { // this i shadows the parameter i
+ for (unsigned j=i+1; j < buckets.size(); ++j) {
+ if (bucket_items[buckets[i]].count(buckets[j])) { // an earlier bucket lists a later one as an item
+ if (bucket_items[buckets[j]].count(buckets[i])) { // ... and the later one lists the earlier one too
+ err << "bucket '" << buckets[i] << "' and bucket '"
+ << buckets[j] << "' are included each other" << std::endl;
+ return -1;
+ } else {
+ std::iter_swap(bucket_itrer[buckets[i]], bucket_itrer[buckets[j]]); // exchange the two parse-tree nodes
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+// Compile the textual crush map read from in into crush.  infn only labels error messages (default "<input>").  Returns 0 on success and a negative value on a parse or semantic error.
+int CrushCompiler::compile(istream& in, const char *infn)
+{
+ if (!infn)
+ infn = "<input>";
+
+ // always start with legacy tunables, so that the compiled result of
+ // a given crush file is fixed for all time.
+ crush.set_tunables_legacy();
+
+ string big; // the whole input, comment-stripped and whitespace-consolidated, joined into one string
+ string str;
+ int line = 1;
+ map<int,int> line_pos; // offset in big at which a line starts -> line number
+ map<int,string> line_val; // line number -> original text of that line (comment included)
+ while (getline(in, str)) {
+ // remove newline
+ int l = str.length();
+ if (l && str[l - 1] == '\n') // NOTE(review): std::getline() already drops the '\n' delimiter, so this looks like a no-op -- confirm
+ str.erase(l-1, 1);
+
+ line_val[line] = str;
+
+ // strip comment
+ int n = str.find("#"); // relies on npos converting to a negative int when there is no '#'
+ if (n >= 0)
+ str.erase(n, str.length()-n);
+
+ if (verbose>1) err << line << ": " << str << std::endl;
+
+ // work around spirit crankiness by removing extraneous
+ // whitespace. there is probably a more elegant solution, but
+ // this only broke with the latest spirit (with the switchover to
+ // "classic"), i don't want to spend too much time figuring it
+ // out.
+ string stripped = consolidate_whitespace(str);
+ if (stripped.length() && big.length() && big[big.length()-1] != ' ') big += " "; // separate this line from the previous text
+
+ line_pos[big.length()] = line; // lines with no text share an offset; the last one wins
+ line++;
+ big += stripped;
+ }
+
+ if (verbose > 2) err << "whole file is: \"" << big << "\"" << std::endl;
+
+ crush_grammar crushg;
+ const char *start = big.c_str();
+ //tree_parse_info<const char *> info = ast_parse(start, crushg, space_p);
+ tree_parse_info<> info = ast_parse(start, crushg, space_p);
+
+ // parse error?
+ if (!info.full) {
+ int cpos = info.stop - start; // offset in big at which parsing stopped
+ //out << "cpos " << cpos << std::endl;
+ //out << " linemap " << line_pos << std::endl;
+ ceph_assert(!line_pos.empty());
+ map<int,int>::iterator p = line_pos.upper_bound(cpos);
+ if (p != line_pos.begin())
+ --p; // last line that starts at or before cpos
+ int line = p->second;
+ int pos = cpos - p->first; // offset within the consolidated line, applied below to the original text
+ err << infn << ":" << line //<< ":" << (pos+1)
+ << " error: parse error at '" << line_val[line].substr(pos) << "'" << std::endl;
+ return -1;
+ }
+ // reorder forward-referenced buckets in the tree before building the map
+ int r = adjust_bucket_item_place(info.trees.begin());
+ if (r < 0) {
+ return r;
+ }
+ //out << "parsing succeeded\n";
+ //dump(info.trees.begin());
+ return parse_crush(info.trees.begin());
+}
diff --git a/src/crush/CrushCompiler.h b/src/crush/CrushCompiler.h
new file mode 100644
index 00000000..f035085e
--- /dev/null
+++ b/src/crush/CrushCompiler.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_COMPILER_H
+#define CEPH_CRUSH_COMPILER_H
+
+#include "crush/CrushWrapper.h"
+#include "crush/grammar.h"
+
+#include <map>
+#include <iostream>
+// Front end for the textual crush map format: compile() parses text into the wrapped CrushWrapper; decompile() is declared here and defined outside this header.  Diagnostics are written to err.
+class CrushCompiler {
+ CrushWrapper& crush; // the map that compile() populates
+ ostream& err; // destination of diagnostics and debug output
+ int verbose; // verbosity level given to the constructor
+ bool unsafe_tunables; // false until enable_unsafe_tunables() is called
+
+ // decompile
+ enum dcb_state_t {
+ DCB_STATE_IN_PROGRESS = 0,
+ DCB_STATE_DONE
+ };
+
+ int decompile_weight_set_weights(crush_weight_set weight_set,
+ ostream &out);
+ int decompile_weight_set(crush_weight_set *weight_set,
+ __u32 size,
+ ostream &out);
+ int decompile_choose_arg(crush_choose_arg *arg,
+ int bucket_id,
+ ostream &out);
+ int decompile_ids(int *ids,
+ __u32 size,
+ ostream &out);
+ int decompile_choose_arg_map(crush_choose_arg_map arg_map,
+ ostream &out);
+ int decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
+ ostream &out);
+ int decompile_bucket_impl(int i, ostream &out);
+ int decompile_bucket(int cur,
+ std::map<int, dcb_state_t>& dcb_states,
+ ostream &out);
+
+ // compile
+ typedef char const* iterator_t;
+ typedef tree_match<iterator_t> parse_tree_match_t;
+ typedef parse_tree_match_t::tree_iterator iter_t; // iterator over parse-tree nodes
+ typedef parse_tree_match_t::node_t node_t; // a single parse-tree node
+ // lookup tables built while compiling
+ map<string, int> item_id; // presumably item name -> id -- confirm in the .cc
+ map<int, string> id_item; // id -> item name; find_used_bucket_ids() pre-seeds used bucket ids with an empty name
+ map<int, unsigned> item_weight;
+ map<string, int> type_id;
+ map<string, int> rule_id;
+ std::map<int32_t, map<int32_t, int32_t> > class_bucket; // bucket id -> class id -> shadow bucket id
+ // helpers that extract a typed value from a parse-tree node
+ string string_node(node_t &node);
+ int int_node(node_t &node);
+ float float_node(node_t &node);
+ // one parse_*() routine per kind of parse-tree node; each returns a negative value on error
+ int parse_tunable(iter_t const& i);
+ int parse_device(iter_t const& i);
+ int parse_bucket_type(iter_t const& i);
+ int parse_bucket(iter_t const& i);
+ int parse_rule(iter_t const& i);
+ int parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set);
+ int parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg);
+ int parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *args);
+ int parse_choose_arg(iter_t const& i, crush_choose_arg *args);
+ int parse_choose_args(iter_t const& i);
+ void find_used_bucket_ids(iter_t const& i);
+ int parse_crush(iter_t const& i);
+ void dump(iter_t const& i, int ind=1);
+ string consolidate_whitespace(string in);
+ int adjust_bucket_item_place(iter_t const &i);
+
+public:
+ CrushCompiler(CrushWrapper& c, ostream& eo, int verbosity=0)
+ : crush(c), err(eo), verbose(verbosity),
+ unsafe_tunables(false) {}
+ ~CrushCompiler() {}
+
+ void enable_unsafe_tunables() {
+ unsafe_tunables = true;
+ }
+
+ int decompile(ostream& out);
+ int compile(istream& in, const char *infn=0); // infn only labels error messages
+};
+
+#endif
diff --git a/src/crush/CrushLocation.cc b/src/crush/CrushLocation.cc
new file mode 100644
index 00000000..2032bf71
--- /dev/null
+++ b/src/crush/CrushLocation.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "CrushLocation.h"
+#include "CrushWrapper.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "include/str_list.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+
+#include "common/SubProcess.h"
+
+#include <vector>
+// Refresh loc from the crush_location config value; an empty value is left alone and reported as success (0).
+int CrushLocation::update_from_conf()
+{
+  if (cct->_conf->crush_location.length() == 0)
+    return 0;   // nothing configured
+  return _parse(cct->_conf->crush_location);
+}
+// Parse s (key=value tokens separated by ';', ',', space or tab) and, on success, replace loc with the result.  Returns 0 on success and -EINVAL when s does not parse, in which case loc is kept.
+int CrushLocation::_parse(const std::string& s)
+{
+  std::multimap<std::string,std::string> new_crush_location;
+  std::vector<std::string> lvec;
+  get_str_vec(s, ";, \t", lvec);
+  int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location);
+  std::lock_guard<std::mutex> l(lock);   // taken before the error path too, because the message reads loc
+  if (r < 0) {
+    lderr(cct) << "warning: crush_location '" << s   // report the text that failed; it may come from the hook rather than the config
+               << "' does not parse, keeping original crush_location "
+               << loc << dendl;
+    return -EINVAL;
+  }
+  loc.swap(new_crush_location);
+  lgeneric_dout(cct, 10) << "crush_location is " << loc << dendl;
+  return 0;
+}
+// Run the configured crush_location_hook, if any, and parse its stdout as the new location.  Returns 0 on success or when no hook is configured, and non-zero on failure (a negative errno value for the errors detected here).
+int CrushLocation::update_from_hook()
+{
+  if (cct->_conf->crush_location_hook.length() == 0)
+    return 0;
+
+  if (0 != access(cct->_conf->crush_location_hook.c_str(), R_OK)) {
+    const int access_errno = errno;   // saved first: the logging below may overwrite errno
+    lderr(cct) << "the user define crush location hook: " << cct->_conf->crush_location_hook
+               << " may not exist or can not access it" << dendl;
+    return -access_errno;   // negative, like every other error returned from here
+  }
+  SubProcessTimed hook(
+    cct->_conf->crush_location_hook.c_str(),
+    SubProcess::CLOSE, SubProcess::PIPE, SubProcess::PIPE,
+    cct->_conf->crush_location_hook_timeout);
+  hook.add_cmd_args(
+    "--cluster", cct->_conf->cluster.c_str(),
+    "--id", cct->_conf->name.get_id().c_str(),
+    "--type", cct->_conf->name.get_type_str(),
+    NULL);
+  int ret = hook.spawn();
+  if (ret != 0) {
+    lderr(cct) << "error: failed run " << cct->_conf->crush_location_hook << ": "
+               << hook.err() << dendl;
+    return ret;
+  }
+  // read up to 100 KiB of the hook's stdout; on failure log its stderr as well
+  bufferlist bl;
+  ret = bl.read_fd(hook.get_stdout(), 100 * 1024);
+  if (ret < 0) {
+    lderr(cct) << "error: failed read stdout from "
+               << cct->_conf->crush_location_hook
+               << ": " << cpp_strerror(-ret) << dendl;
+    bufferlist err;
+    err.read_fd(hook.get_stderr(), 100 * 1024);
+    lderr(cct) << "stderr:\n";
+    err.hexdump(*_dout);
+    *_dout << dendl;
+  }
+  // join the child even when reading failed; the read error is returned afterwards
+  if (hook.join() != 0) {
+    lderr(cct) << "error: failed to join: " << hook.err() << dendl;
+    return -EINVAL;
+  }
+
+  if (ret < 0)
+    return ret;
+  // drop trailing whitespace from the output, then parse it
+  std::string out;
+  bl.copy(0, bl.length(), out);
+  out.erase(out.find_last_not_of(" \n\r\t")+1);
+  return _parse(out);
+}
+// Pick the initial location: the crush_location config value if set, otherwise the hook's
+// output if a hook is set, otherwise the default host=<short hostname>, root=default.
+int CrushLocation::init_on_startup()
+{
+  // an explicitly configured location takes precedence over the hook
+  if (cct->_conf->crush_location.length())
+    return update_from_conf();
+  if (cct->_conf->crush_location_hook.length())
+    return update_from_hook();
+
+  // start with a sane default
+  char hostname[HOST_NAME_MAX + 1];
+  if (gethostname(hostname, sizeof(hostname)) < 0)
+    strcpy(hostname, "unknown_host");
+
+  // use short hostname: cut at the first '.', if there is one
+  char *first_dot = strchr(hostname, '.');
+  if (first_dot)
+    *first_dot = '\0';
+  const std::string short_host(hostname);
+
+  // replace loc under the lock that guards it
+  std::lock_guard<std::mutex> l(lock);
+  loc.clear();
+  loc.insert(std::make_pair(std::string("host"), short_host));
+  loc.insert(std::make_pair(std::string("root"), std::string("default")));
+  lgeneric_dout(cct, 10) << "crush_location is (default) " << loc << dendl;
+  return 0;
+}
diff --git a/src/crush/CrushLocation.h b/src/crush/CrushLocation.h
new file mode 100644
index 00000000..6a099689
--- /dev/null
+++ b/src/crush/CrushLocation.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_LOCATION_H
+#define CEPH_CRUSH_LOCATION_H
+
+#include <map>
+#include <mutex>
+#include <string>
+
+class CephContext;
+// Holds a crush location as a multimap of key/value pairs, guarded by a mutex.
+class CrushLocation {
+ CephContext *cct;
+ std::multimap<std::string,std::string> loc; // the current location; get_location() copies it under lock
+ std::mutex lock; // taken by get_location() before loc is copied
+
+ int _parse(const std::string& s);
+
+public:
+ explicit CrushLocation(CephContext *c) : cct(c) {
+ init_on_startup(); // the return value is not checked here
+ }
+
+ int update_from_conf(); ///< refresh from config
+ int update_from_hook(); ///< call hook, if present
+ int init_on_startup();
+ // returns a copy of the location, taken while holding lock
+ std::multimap<std::string,std::string> get_location() {
+ std::lock_guard<std::mutex> l(lock);
+ return loc;
+ }
+};
+
+#endif
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
new file mode 100644
index 00000000..86f91ef3
--- /dev/null
+++ b/src/crush/CrushTester.cc
@@ -0,0 +1,802 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/stringify.h"
+#include "CrushTester.h"
+#include "CrushTreeDumper.h"
+#include "include/ceph_features.h"
+
+#include <algorithm>
+#include <stdlib.h>
+#include <boost/lexical_cast.hpp>
+// to workaround https://svn.boost.org/trac/boost/ticket/9501
+#ifdef _LIBCPP_VERSION
+#include <boost/version.hpp>
+#if BOOST_VERSION < 105600
+#define ICL_USE_BOOST_MOVE_IMPLEMENTATION
+#endif
+#endif
+#include <boost/icl/interval_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+#include "common/SubProcess.h"
+#include "common/fork_function.h"
+
+void CrushTester::set_device_weight(int dev, float f)
+{
+ int w = (int)(f * 0x10000);
+ if (w < 0)
+ w = 0;
+ if (w > 0x10000)
+ w = 0x10000;
+ device_weight[dev] = w;
+}
+// Estimate an upper bound on the number of items rule ruleno can place: per type named by the rule's steps, count the map entries of that type, cap the count by the step's requested number, and return the smallest positive result.
+int CrushTester::get_maximum_affected_by_rule(int ruleno)
+{
+ // get the number of steps in RULENO
+ int rule_size = crush.get_rule_len(ruleno);
+ vector<int> affected_types;
+ map<int,int> replications_by_type; // type -> arg1 of the last step that named it
+
+ for (int i = 0; i < rule_size; i++){
+ // get what operation is done by the current step
+ int rule_operation = crush.get_rule_op(ruleno, i);
+
+ // if the operation specifies choosing a device type, store it
+ if (rule_operation >= 2 && rule_operation != 4){ // NOTE(review): raw opcode numbers; presumably 4 is the emit step -- confirm against crush.h
+ int desired_replication = crush.get_rule_arg1(ruleno,i);
+ int affected_type = crush.get_rule_arg2(ruleno,i);
+ affected_types.push_back(affected_type);
+ replications_by_type[affected_type] = desired_replication;
+ }
+ }
+
+ /*
+ * now for each of the affected bucket types, see what is the
+ * maximum we are (a) requesting or (b) have
+ */
+
+ map<int,int> max_devices_of_type;
+
+ // loop through the vector of affected types
+ for (vector<int>::iterator it = affected_types.begin(); it != affected_types.end(); ++it){
+ // loop through the number of buckets looking for affected types
+ for (map<int,string>::iterator p = crush.name_map.begin(); p != crush.name_map.end(); ++p){
+ int bucket_type = crush.get_bucket_type(p->first);
+ if ( bucket_type == *it)
+ max_devices_of_type[*it]++; // a type listed twice in affected_types is counted twice
+ }
+ }
+ // cap each count by the number the rule asks for, when that number is positive
+ for(std::vector<int>::iterator it = affected_types.begin(); it != affected_types.end(); ++it){
+ if ( replications_by_type[*it] > 0 && replications_by_type[*it] < max_devices_of_type[*it] )
+ max_devices_of_type[*it] = replications_by_type[*it];
+ }
+
+ /*
+ * get the smallest number of buckets available of any type as this is our upper bound on
+ * the number of replicas we can place
+ */
+ int max_affected = max( crush.get_max_buckets(), crush.get_max_devices() );
+
+ for(std::vector<int>::iterator it = affected_types.begin(); it != affected_types.end(); ++it){
+ if (max_devices_of_type[*it] > 0 && max_devices_of_type[*it] < max_affected )
+ max_affected = max_devices_of_type[*it];
+ }
+
+ return max_affected;
+}
+
+// Map every device id that is present in the crush map to a dense index (0, 1, 2, ... in ascending id order).
+map<int,int> CrushTester::get_collapsed_mapping()
+{
+  map<int, int> dense_ids;
+  const int device_limit = crush.get_max_devices();
+
+  for (int dev = 0; dev < device_limit; ++dev) {
+    if (!crush.check_item_present(dev))
+      continue;
+    // the entries inserted so far number exactly the next free dense index
+    const int next_index = dense_ids.size();
+    dense_ids[dev] = next_index;
+  }
+
+  return dense_ids;
+}
+// Simulate failures: among the weighted buckets whose first item is a device, visit mark_down_bucket_ratio of them in random order and zero the weight of mark_down_device_ratio of each one's items.  Does nothing unless mark_down_device_ratio > 0.
+void CrushTester::adjust_weights(vector<__u32>& weight)
+{
+  if (!(mark_down_device_ratio > 0))
+    return;
+
+  // active buckets
+  vector<int> bucket_ids;
+  for (int i = 0; i < crush.get_max_buckets(); i++) {
+    int id = -1 - i;
+    if (crush.get_bucket_weight(id) > 0)
+      bucket_ids.push_back(id);
+  }
+
+  // get buckets that are one level above a device
+  vector<int> buckets_above_devices;
+  for (unsigned i = 0; i < bucket_ids.size(); i++) {
+    // grab the first child object of a bucket and check that its ID is not negative
+    int id = bucket_ids[i];
+    if (crush.get_bucket_size(id) == 0)
+      continue;
+    int first_child = crush.get_bucket_item(id, 0); // returns the ID of the bucket or device
+    if (first_child >= 0)
+      buckets_above_devices.push_back(id);
+  }
+
+  // permute bucket list; skipped for fewer than two entries, where the modulus below would be zero
+  const unsigned num_above = buckets_above_devices.size();
+  for (unsigned i = 0; num_above > 1 && i < num_above; i++) {
+    unsigned j = lrand48() % (num_above - 1);
+    std::swap(buckets_above_devices[i], buckets_above_devices[j]);
+  }
+
+  // calculate how many buckets we need to reap, never more than we have
+  int num_buckets_to_visit = (int) (mark_down_bucket_ratio * buckets_above_devices.size());
+  for (int i = 0; i < num_buckets_to_visit && i < (int)num_above; i++) {
+    int id = buckets_above_devices[i];
+    int size = crush.get_bucket_size(id);
+    vector<int> items;
+    for (int o = 0; o < size; o++)
+      items.push_back(crush.get_bucket_item(id, o));
+
+    // permute items (again only when the modulus is non-zero)
+    for (int o = 0; size > 1 && o < size; o++) {
+      int j = lrand48() % (size - 1);
+      std::swap(items[o], items[j]);
+    }
+
+    // mark down the leading entries of the permuted list; the original read the bucket again and so ignored the permutation
+    int local_devices_to_visit = (int) (mark_down_device_ratio*size);
+    for (int o = 0; o < local_devices_to_visit && o < size; o++) {
+      int item = items[o];
+      if (item >= 0 && (unsigned)item < weight.size())   // only device ids index weight
+        weight[item] = 0;
+    }
+  }
+}
+// Decide whether the device list in is acceptable for rule ruleno: every device must have non-zero weight, no device may repeat, and (unless the rule only names the "osd" type) no two devices may share a location for any type the rule names.
+bool CrushTester::check_valid_placement(int ruleno, vector<int> in, const vector<__u32>& weight)
+{
+
+ bool valid_placement = true;
+ vector<int> included_devices;
+ map<string,string> seen_devices; // location name -> type name, across all devices checked so far
+
+ // first do the easy check that all devices are "up"
+ for (vector<int>::iterator it = in.begin(); it != in.end(); ++it) {
+ if (weight[(*it)] == 0) { // weight is indexed by device id without a range check
+ valid_placement = false;
+ break;
+ } else if (weight[(*it)] > 0) {
+ included_devices.push_back( (*it) );
+ }
+ }
+
+ /*
+ * now do the harder test of checking that the CRUSH rule r is not violated
+ * we could test that none of the devices mentioned in `in` are duplicated,
+ * but this is a special case of this test
+ */
+
+ // get the number of steps in RULENO
+ int rule_size = crush.get_rule_len(ruleno);
+ vector<string> affected_types;
+
+ // get the smallest type id, and name
+ int min_map_type = crush.get_num_type_names();
+ for (map<int,string>::iterator it = crush.type_map.begin(); it != crush.type_map.end(); ++it ) {
+ if ( (*it).first < min_map_type ) {
+ min_map_type = (*it).first;
+ }
+ }
+
+ string min_map_type_name = crush.type_map[min_map_type];
+
+ // get the types of devices affected by RULENO
+ for (int i = 0; i < rule_size; i++) {
+ // get what operation is done by the current step
+ int rule_operation = crush.get_rule_op(ruleno, i);
+
+ // if the operation specifies choosing a device type, store it
+ if (rule_operation >= 2 && rule_operation != 4) { // NOTE(review): raw opcode numbers; presumably 4 is the emit step -- confirm against crush.h
+ int affected_type = crush.get_rule_arg2(ruleno,i);
+ affected_types.push_back( crush.get_type_name(affected_type));
+ }
+ }
+
+ // find out if we are only dealing with osd's
+ bool only_osd_affected = false;
+ if (affected_types.size() == 1) {
+ if ((affected_types.back() == min_map_type_name) && (min_map_type_name == "osd")) {
+ only_osd_affected = true;
+ }
+ }
+
+ // check that we don't have any duplicate id's
+ for (vector<int>::iterator it = included_devices.begin(); it != included_devices.end(); ++it) {
+ int num_copies = std::count(included_devices.begin(), included_devices.end(), (*it) );
+ if (num_copies > 1) {
+ valid_placement = false;
+ }
+ }
+
+ // if we have more than just osd's affected we need to do a lot more work
+ if (!only_osd_affected) {
+ // loop through the devices that are "in/up"
+ for (vector<int>::iterator it = included_devices.begin(); it != included_devices.end(); ++it) {
+ if (valid_placement == false)
+ break;
+
+ // create a temporary map of the form (device type, device name in map)
+ map<string,string> device_location_hierarchy = crush.get_full_location(*it);
+
+ // loop over the types affected by RULENO looking for duplicate bucket assignments
+ for (vector<string>::iterator t = affected_types.begin(); t != affected_types.end(); ++t) {
+ if (seen_devices.count( device_location_hierarchy[*t])) { // operator[] inserts an empty name if the type is missing
+ valid_placement = false;
+ break;
+ } else {
+ // store the devices we have seen in the form of (device name, device type)
+ seen_devices[ device_location_hierarchy[*t] ] = *t;
+ }
+ }
+ }
+ }
+
+ return valid_placement;
+}
+// Monte Carlo stand-in for a CRUSH placement: draw up to 100 random device lists and store the first one that check_valid_placement() accepts in out.  Returns 0 on success and -EINVAL when there is no weight, no device, a negative request or no valid draw.
+int CrushTester::random_placement(int ruleno, vector<int>& out, int maxout, vector<__u32>& weight)
+{
+  // get the total weight of the system (64-bit: many 0x10000 weights would overflow an int)
+  unsigned long long total_weight = 0;
+  for (unsigned i = 0; i < weight.size(); i++)
+    total_weight += weight[i];
+
+  if (total_weight == 0 ||
+      crush.get_max_devices() == 0)
+    return -EINVAL;
+
+  // determine the real maximum number of devices to return
+  int devices_requested = min(maxout, get_maximum_affected_by_rule(ruleno));
+  if (devices_requested < 0)   // a negative count cannot size the trial vector
+    return -EINVAL;
+  bool accept_placement = false;
+
+  vector<int> trial_placement(devices_requested);
+  int attempted_tries = 0;
+  int max_tries = 100;
+  do {
+    // draw a fresh trial mapping straight into the vector (no variable-length array)
+    for (int i = 0; i < devices_requested; i++){
+      trial_placement[i] = lrand48() % (crush.get_max_devices());
+    }
+
+    accept_placement = check_valid_placement(ruleno, trial_placement, weight);
+    attempted_tries++;
+  } while (accept_placement == false && attempted_tries < max_tries);
+
+  // save our random placement to the out vector
+  if (accept_placement)
+    out.assign(trial_placement.begin(), trial_placement.end());
+
+  // or don't....
+  else if (attempted_tries == max_tries)
+    return -EINVAL;
+
+  return 0;
+}
+// Append the record "<index>,<v0>,<v1>,..." followed by a newline to dst (int values).
+void CrushTester::write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<int> vector_data)
+{
+  stringstream record (stringstream::in | stringstream::out);
+
+  // the record starts with its index ...
+  record << index;
+
+  // ... followed by every value, each preceded by a comma
+  for (vector<int>::const_iterator v = vector_data.begin();
+       v != vector_data.end(); ++v) {
+    record << ',' << *v;
+  }
+
+  // std::endl supplies the terminating newline
+  record << std::endl;
+
+  dst.push_back( record.str() );
+}
+// Append the record "<index>,<v0>,<v1>,..." followed by a newline to dst (float values).
+void CrushTester::write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<float> vector_data)
+{
+  stringstream record (stringstream::in | stringstream::out);
+
+  // the record starts with its index ...
+  record << index;
+
+  // ... followed by every value, each preceded by a comma
+  for (vector<float>::const_iterator v = vector_data.begin();
+       v != vector_data.end(); ++v) {
+    record << ',' << *v;
+  }
+
+  // std::endl supplies the terminating newline
+  record << std::endl;
+
+  dst.push_back( record.str() );
+}
+// Append the record "<index>,<scalar_data>" followed by a newline to dst (int value).
+void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst, int index, int scalar_data)
+{
+  stringstream record (stringstream::in | stringstream::out);
+
+  // index first, then a comma and the value; std::endl
+  // supplies the terminating newline
+  record << index;
+  record << ',' << scalar_data << std::endl;
+
+  // copy the finished text out of the stream and append it
+  // to the caller's list of lines
+  const string line = record.str();
+  dst.push_back( line );
+}
+void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst, int index, float scalar_data)
+{
+  // Append the record "<index>,<scalar_data>" followed by a newline to dst (float value).
+  stringstream record (stringstream::in | stringstream::out);
+
+  // index first, then a comma and the value; std::endl
+  // supplies the terminating newline
+  record << index;
+  record << ',' << scalar_data << std::endl;
+
+  // append the finished text to the caller's list of lines
+  const string line = record.str();
+  dst.push_back( line );
+}
+// Run test() through fork_function() with the given timeout and return its result; a timeout is additionally reported on err.
+int CrushTester::test_with_fork(int timeout)
+{
+  // sink is handed to fork_function() and never read back here
+  ostringstream sink;
+  const int r = fork_function(timeout, sink, [&]() { return test(); });
+  if (r != -ETIMEDOUT)
+    return r;
+  // only the timeout case gets a message of its own
+  err << "timed out during smoke test (" << timeout << " seconds)";
+  return r;
+}
+// File-local helpers for check_name_maps(): an exception type and a tree walker that throws it when an item fails one of its checks.
+namespace {
+ class BadCrushMap : public std::runtime_error {
+ public:
+ int item; // id of the offending item
+ BadCrushMap(const char* msg, int id)
+ : std::runtime_error(msg), item(id) {}
+ };
+ // throws if any node in the crush fail to print
+ class CrushWalker : public CrushTreeDumper::Dumper<void> {
+ typedef void DumbFormatter; // no formatter is used; dump_item() ignores that argument
+ typedef CrushTreeDumper::Dumper<DumbFormatter> Parent;
+ int max_id; // exclusive upper bound for non-bucket ids; the bound is not enforced unless it is positive
+ public:
+ CrushWalker(const CrushWrapper *crush, unsigned max_id)
+ : Parent(crush, CrushTreeDumper::name_map_t()), max_id(max_id) {} // the unsigned argument is stored in an int member
+ void dump_item(const CrushTreeDumper::Item &qi, DumbFormatter *) override {
+ int type = -1;
+ if (qi.is_bucket()) {
+ if (!crush->get_item_name(qi.id)) { // a bucket must have a name
+ throw BadCrushMap("unknown item name", qi.id);
+ }
+ type = crush->get_bucket_type(qi.id);
+ } else {
+ if (max_id > 0 && qi.id >= max_id) { // a non-bucket id must stay below max_id when a bound is set
+ throw BadCrushMap("item id too large", qi.id);
+ }
+ type = 0; // non-bucket items are checked as type 0
+ }
+ if (!crush->get_type_name(type)) { // the item's type must have a name
+ throw BadCrushMap("unknown type name", qi.id);
+ }
+ }
+ };
+}
+// Check that every item of the crush map, plus a stray OSD with id 0, passes CrushWalker's name and type checks (max_id bounds device ids when non-zero).  Reports the first offender on err and returns false; returns true otherwise.
+bool CrushTester::check_name_maps(unsigned max_id) const
+{
+  CrushWalker walker(&crush, max_id);
+  try {
+    // first pass: walk through the crush, to see if it is self-contained
+    walker.dump(NULL);
+    // second pass: a stray OSD (id >= 0 that the map does not list) must be
+    // printable too, since "ceph osd tree" prints those as well
+    const CrushTreeDumper::Item stray_osd(0, 0, 0, 0);
+    walker.dump_item(stray_osd, NULL);
+    return true;
+  } catch (const BadCrushMap& bad) {
+    err << bad.what() << ": item#" << bad.item << std::endl;
+    return false;
+  }
+}
+// Name of the given rule as stored in the crush map, or "rule<N>" when the rule has no name.
+static string get_rule_name(CrushWrapper& crush, int rule)
+{
+  const auto name = crush.get_rule_name(rule);   // looked up once
+  if (!name)
+    return string("rule") + std::to_string(rule);
+  return name;
+}
+// Report on err every group of rules that share a ruleset and type and whose [min_size, max_size] ranges overlap.
+void CrushTester::check_overlapped_rules() const
+{
+  namespace icl = boost::icl;
+  typedef std::set<string> RuleNames;
+  typedef icl::interval_map<int, RuleNames> Rules;
+  // <ruleset, type> => interval_map<size, {names}>
+  typedef std::map<std::pair<int, int>, Rules> RuleSets;
+  using interval = icl::interval<int>;
+
+  // mimic the logic of crush_find_rule(), but it only return the first matched
+  // one, but I am collecting all of them by the overlapped sizes.
+  RuleSets rulesets;
+  for (int rule = 0; rule < crush.get_max_rules(); rule++) {
+    if (!crush.rule_exists(rule)) {
+      continue;
+    }
+    Rules& rules = rulesets[{crush.get_rule_mask_ruleset(rule),
+                             crush.get_rule_mask_type(rule)}];
+    rules += make_pair(interval::closed(crush.get_rule_mask_min_size(rule),
+                                        crush.get_rule_mask_max_size(rule)),
+                       RuleNames{get_rule_name(crush, rule)});
+  }
+  for (const auto& i : rulesets) {   // by reference: a copy would duplicate the whole interval_map
+    const auto& ruleset_type = i.first;
+    const Rules& rules = i.second;
+    for (const auto& r : rules) {   // by reference: a copy would duplicate the set of names
+      const RuleNames& names = r.second;
+      // if there are more than one rules covering the same size range,
+      // print them out.
+      if (names.size() > 1) {
+        err << "overlapped rules in ruleset " << ruleset_type.first << ": "
+            << boost::join(names, ", ") << "\n";
+      }
+    }
+  }
+}
+
+int CrushTester::test()
+{
+ if (min_rule < 0 || max_rule < 0) {
+ min_rule = 0;
+ max_rule = crush.get_max_rules() - 1;
+ }
+ if (min_x < 0 || max_x < 0) {
+ min_x = 0;
+ max_x = 1023;
+ }
+
+ // initial osd weights
+ vector<__u32> weight;
+
+ /*
+ * note device weight is set by crushtool
+ * (likely due to a given a command line option)
+ */
+ for (int o = 0; o < crush.get_max_devices(); o++) {
+ if (device_weight.count(o)) {
+ weight.push_back(device_weight[o]);
+ } else if (crush.check_item_present(o)) {
+ weight.push_back(0x10000);
+ } else {
+ weight.push_back(0);
+ }
+ }
+
+ if (output_utilization_all)
+ err << "devices weights (hex): " << hex << weight << dec << std::endl;
+
+ // make adjustments
+ adjust_weights(weight);
+
+
+ int num_devices_active = 0;
+ for (vector<__u32>::iterator p = weight.begin(); p != weight.end(); ++p)
+ if (*p > 0)
+ num_devices_active++;
+
+ if (output_choose_tries)
+ crush.start_choose_profile();
+
+ for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+ if (!crush.rule_exists(r)) {
+ if (output_statistics)
+ err << "rule " << r << " dne" << std::endl;
+ continue;
+ }
+ if (ruleset >= 0 &&
+ crush.get_rule_mask_ruleset(r) != ruleset) {
+ continue;
+ }
+ int minr = min_rep, maxr = max_rep;
+ if (min_rep < 0 || max_rep < 0) {
+ minr = crush.get_rule_mask_min_size(r);
+ maxr = crush.get_rule_mask_max_size(r);
+ }
+
+ if (output_statistics)
+ err << "rule " << r << " (" << crush.get_rule_name(r)
+ << "), x = " << min_x << ".." << max_x
+ << ", numrep = " << minr << ".." << maxr
+ << std::endl;
+
+ for (int nr = minr; nr <= maxr; nr++) {
+ vector<int> per(crush.get_max_devices());
+ map<int,int> sizes;
+
+ int num_objects = ((max_x - min_x) + 1);
+ float num_devices = (float) per.size(); // get the total number of devices, better to cast as a float here
+
+ // create a structure to hold data for post-processing
+ tester_data_set tester_data;
+ vector<float> vector_data_buffer_f;
+
+ // create a map to hold batch-level placement information
+ map<int, vector<int> > batch_per;
+ int objects_per_batch = num_objects / num_batches;
+ int batch_min = min_x;
+ int batch_max = min_x + objects_per_batch - 1;
+
+ // get the total weight of the system
+ int total_weight = 0;
+ for (unsigned i = 0; i < per.size(); i++)
+ total_weight += weight[i];
+
+ if (total_weight == 0)
+ continue;
+
+ // compute the expected number of objects stored per device in the absence of weighting
+ float expected_objects = min(nr, get_maximum_affected_by_rule(r)) * num_objects;
+
+ // compute each device's proportional weight
+ vector<float> proportional_weights( per.size() );
+
+ for (unsigned i = 0; i < per.size(); i++)
+ proportional_weights[i] = (float) weight[i] / (float) total_weight;
+
+ if (output_data_file) {
+ // stage the absolute weight information for post-processing
+ for (unsigned i = 0; i < per.size(); i++) {
+ tester_data.absolute_weights[i] = (float) weight[i] / (float)0x10000;
+ }
+
+ // stage the proportional weight information for post-processing
+ for (unsigned i = 0; i < per.size(); i++) {
+ if (proportional_weights[i] > 0 )
+ tester_data.proportional_weights[i] = proportional_weights[i];
+
+ tester_data.proportional_weights_all[i] = proportional_weights[i];
+ }
+
+ }
+ // compute the expected number of objects stored per device when a device's weight is considered
+ vector<float> num_objects_expected(num_devices);
+
+ for (unsigned i = 0; i < num_devices; i++)
+ num_objects_expected[i] = (proportional_weights[i]*expected_objects);
+
+ for (int current_batch = 0; current_batch < num_batches; current_batch++) {
+ if (current_batch == (num_batches - 1)) {
+ batch_max = max_x;
+ objects_per_batch = (batch_max - batch_min + 1);
+ }
+
+ float batch_expected_objects = min(nr, get_maximum_affected_by_rule(r)) * objects_per_batch;
+ vector<float> batch_num_objects_expected( per.size() );
+
+ for (unsigned i = 0; i < per.size() ; i++)
+ batch_num_objects_expected[i] = (proportional_weights[i]*batch_expected_objects);
+
+ // create a vector to hold placement results temporarily
+ vector<int> temporary_per ( per.size() );
+
+ for (int x = batch_min; x <= batch_max; x++) {
+ // create a vector to hold the results of a CRUSH placement or RNG simulation
+ vector<int> out;
+
+ if (use_crush) {
+ if (output_mappings)
+ err << "CRUSH"; // prepend CRUSH to placement output
+ uint32_t real_x = x;
+ if (pool_id != -1) {
+ real_x = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, (uint32_t)pool_id);
+ }
+ crush.do_rule(r, real_x, out, nr, weight, 0);
+ } else {
+ if (output_mappings)
+ err << "RNG"; // prepend RNG to placement output to denote simulation
+ // test our new monte carlo placement generator
+ random_placement(r, out, nr, weight);
+ }
+
+ if (output_mappings)
+ err << " rule " << r << " x " << x << " " << out << std::endl;
+
+ if (output_data_file)
+ write_integer_indexed_vector_data_string(tester_data.placement_information, x, out);
+
+ bool has_item_none = false;
+ for (unsigned i = 0; i < out.size(); i++) {
+ if (out[i] != CRUSH_ITEM_NONE) {
+ per[out[i]]++;
+ temporary_per[out[i]]++;
+ } else {
+ has_item_none = true;
+ }
+ }
+
+ batch_per[current_batch] = temporary_per;
+ sizes[out.size()]++;
+ if (output_bad_mappings &&
+ (out.size() != (unsigned)nr ||
+ has_item_none)) {
+ err << "bad mapping rule " << r << " x " << x << " num_rep " << nr << " result " << out << std::endl;
+ }
+ }
+
+ batch_min = batch_max + 1;
+ batch_max = batch_min + objects_per_batch - 1;
+ }
+
+ for (unsigned i = 0; i < per.size(); i++)
+ if (output_utilization && !output_statistics)
+ err << " device " << i
+ << ":\t" << per[i] << std::endl;
+
+ for (map<int,int>::iterator p = sizes.begin(); p != sizes.end(); ++p)
+ if (output_statistics)
+ err << "rule " << r << " (" << crush.get_rule_name(r) << ") num_rep " << nr
+ << " result size == " << p->first << ":\t"
+ << p->second << "/" << (max_x-min_x+1) << std::endl;
+
+ if (output_statistics)
+ for (unsigned i = 0; i < per.size(); i++) {
+ if (output_utilization) {
+ if (num_objects_expected[i] > 0 && per[i] > 0) {
+ err << " device " << i << ":\t"
+ << "\t" << " stored " << ": " << per[i]
+ << "\t" << " expected " << ": " << num_objects_expected[i]
+ << std::endl;
+ }
+ } else if (output_utilization_all) {
+ err << " device " << i << ":\t"
+ << "\t" << " stored " << ": " << per[i]
+ << "\t" << " expected " << ": " << num_objects_expected[i]
+ << std::endl;
+ }
+ }
+
+ if (output_data_file)
+ for (unsigned i = 0; i < per.size(); i++) {
+ vector_data_buffer_f.clear();
+ vector_data_buffer_f.push_back( (float) per[i]);
+ vector_data_buffer_f.push_back( (float) num_objects_expected[i]);
+
+ write_integer_indexed_vector_data_string(tester_data.device_utilization_all, i, vector_data_buffer_f);
+
+ if (num_objects_expected[i] > 0 && per[i] > 0)
+ write_integer_indexed_vector_data_string(tester_data.device_utilization, i, vector_data_buffer_f);
+ }
+
+ if (output_data_file && num_batches > 1) {
+ // stage batch utilization information for post-processing
+ for (int i = 0; i < num_batches; i++) {
+ write_integer_indexed_vector_data_string(tester_data.batch_device_utilization_all, i, batch_per[i]);
+ write_integer_indexed_vector_data_string(tester_data.batch_device_expected_utilization_all, i, batch_per[i]);
+ }
+ }
+
+ string rule_tag = crush.get_rule_name(r);
+
+ if (output_csv)
+ write_data_set_to_csv(output_data_file_name+rule_tag,tester_data);
+ }
+ }
+
+ if (output_choose_tries) {
+ __u32 *v = 0;
+ int n = crush.get_choose_profile(&v);
+ for (int i=0; i<n; i++) {
+ cout.setf(std::ios::right);
+ cout << std::setw(2)
+ << i << ": " << std::setw(9) << v[i];
+ cout.unsetf(std::ios::right);
+ cout << std::endl;
+ }
+
+ crush.stop_choose_profile();
+ }
+
+ return 0;
+}
+
+/*
+ * Map every selected (rule, replica count, x) input through both this
+ * tester's crush map and crush2 and count the inputs whose results differ.
+ * A per-rule summary goes to cout.
+ *
+ * Returns 0 when no mismatch was seen, -1 otherwise.
+ */
+int CrushTester::compare(CrushWrapper& crush2)
+{
+  // no explicit rule range configured: cover every rule slot
+  if (min_rule < 0 || max_rule < 0) {
+    min_rule = 0;
+    max_rule = crush.get_max_rules() - 1;
+  }
+  // no explicit input range configured: use x = 0..1023
+  if (min_x < 0 || max_x < 0) {
+    min_x = 0;
+    max_x = 1023;
+  }
+
+  /*
+   * initial osd weights: an explicit device weight (set by crushtool,
+   * likely from a command line option) wins; otherwise devices present
+   * in the map get 0x10000 and absent ones get 0
+   */
+  vector<__u32> weight;
+  for (int osd = 0; osd < crush.get_max_devices(); osd++) {
+    __u32 w = 0;
+    if (device_weight.count(osd))
+      w = device_weight[osd];
+    else if (crush.check_item_present(osd))
+      w = 0x10000;
+    weight.push_back(w);
+  }
+
+  // make adjustments
+  adjust_weights(weight);
+
+  map<int,int> bad_by_rule;
+
+  int ret = 0;
+  for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+    if (!crush.rule_exists(r)) {
+      if (output_statistics)
+        err << "rule " << r << " dne" << std::endl;
+      continue;
+    }
+    if (ruleset >= 0 && crush.get_rule_mask_ruleset(r) != ruleset)
+      continue;
+
+    // replica range: the configured one, or the rule's own mask when unset
+    int minr = min_rep;
+    int maxr = max_rep;
+    if (min_rep < 0 || max_rep < 0) {
+      minr = crush.get_rule_mask_min_size(r);
+      maxr = crush.get_rule_mask_max_size(r);
+    }
+
+    int bad = 0;
+    for (int nr = minr; nr <= maxr; nr++) {
+      for (int x = min_x; x <= max_x; ++x) {
+        vector<int> out;
+        crush.do_rule(r, x, out, nr, weight, 0);
+        vector<int> out2;
+        crush2.do_rule(r, x, out2, nr, weight, 0);
+        if (out != out2)
+          ++bad;
+      }
+    }
+    if (bad)
+      ret = -1;
+
+    const int total = (maxr - minr + 1) * (max_x - min_x + 1);
+    const double ratio = (double)bad / (double)total;
+    cout << "rule " << r << " had " << bad << "/" << total
+         << " mismatched mappings (" << ratio << ")" << std::endl;
+  }
+
+  if (ret)
+    cerr << "warning: maps are NOT equivalent" << std::endl;
+  else
+    cout << "maps appear equivalent" << std::endl;
+  return ret;
+}
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
new file mode 100644
index 00000000..c4257b63
--- /dev/null
+++ b/src/crush/CrushTester.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_TESTER_H
+#define CEPH_CRUSH_TESTER_H
+
+#include "crush/CrushWrapper.h"
+
+#include <fstream>
+
+/**
+ * Test harness around a CrushWrapper: holds the rule / input / replica
+ * ranges and output switches used by test() and compare(), plus the
+ * scaffolding that stages results and writes them out as CSV files.
+ */
+class CrushTester {
+  CrushWrapper& crush;  // map under test
+  ostream& err;         // stream that mappings, statistics and diagnostics are written to
+
+  map<int, int> device_weight;  // device id -> explicit weight override
+  int min_rule, max_rule;       // rule range; negative means "not set"
+  int ruleset;                  // only test rules of this ruleset when >= 0
+  int min_x, max_x;             // input range; negative means "not set"
+  int min_rep, max_rep;         // replica-count range; negative means "use the rule mask"
+  int64_t pool_id;              // when != -1, x is hashed together with this id before mapping
+
+  int num_batches;
+  bool use_crush;               // false selects the random placement simulator
+
+  float mark_down_device_ratio;
+  float mark_down_bucket_ratio;
+
+  bool output_utilization;
+  bool output_utilization_all;
+  bool output_statistics;
+  bool output_mappings;
+  bool output_bad_mappings;
+  bool output_choose_tries;
+
+  bool output_data_file;
+  bool output_csv;
+
+  string output_data_file_name;  // prefix for the generated CSV file names
+
+  /*
+   * mark a ratio of devices down, can be used to simulate placement distributions
+   * under degraded cluster conditions
+   */
+  void adjust_weights(vector<__u32>& weight);
+
+  /*
+   * Get the maximum number of devices that could be selected to satisfy ruleno.
+   */
+  int get_maximum_affected_by_rule(int ruleno);
+
+  /*
+   * for maps where devices have non-sequential id numbers, return a mapping of device id
+   * to a sequential id number. For example, if we have devices with ids 0 1 4 5 6 return a map
+   * where:
+   *     0 = 0
+   *     1 = 1
+   *     4 = 2
+   *     5 = 3
+   *     6 = 4
+   *
+   * which can help make post-processing easier
+   */
+  map<int,int> get_collapsed_mapping();
+
+  /*
+   * Essentially a re-implementation of CRUSH. Given a vector of devices
+   * check that the vector represents a valid placement for a given ruleno.
+   */
+  bool check_valid_placement(int ruleno, vector<int> in, const vector<__u32>& weight);
+
+  /*
+   * Generate a random selection of devices which satisfies ruleno. Essentially a
+   * monte-carlo simulator for CRUSH placements which can be used to compare the
+   * statistical distribution of the CRUSH algorithm to a random number generator
+   */
+  int random_placement(int ruleno, vector<int>& out, int maxout, vector<__u32>& weight);
+
+  // scaffolding to store data for off-line processing
+  struct tester_data_set {
+    vector <string> device_utilization;
+    vector <string> device_utilization_all;
+    vector <string> placement_information;
+    vector <string> batch_device_utilization_all;
+    vector <string> batch_device_expected_utilization_all;
+    map<int, float> proportional_weights;
+    map<int, float> proportional_weights_all;
+    map<int, float> absolute_weights;
+  } ;
+
+  // Append every pre-formatted string of payload to csv_file, verbatim.
+  // Nothing is written if the stream is not in a good state.
+  void write_to_csv(ofstream& csv_file, vector<string>& payload)
+  {
+    if (csv_file.good())
+      for (vector<string>::iterator it = payload.begin(); it != payload.end(); ++it)
+        csv_file << (*it);
+  }
+
+  // Write payload as one "key,value" line per entry.
+  // Nothing is written if the stream is not in a good state.
+  void write_to_csv(ofstream& csv_file, map<int, float>& payload)
+  {
+    if (csv_file.good())
+      for (map<int, float>::iterator it = payload.begin(); it != payload.end(); ++it)
+        csv_file << (*it).first << ',' << (*it).second << std::endl;
+  }
+
+  /*
+   * Dump a staged data set into a family of CSV files whose names all
+   * start with user_tag. The batch files are only produced when more than
+   * one batch was requested.
+   */
+  void write_data_set_to_csv(string user_tag, tester_data_set& tester_data)
+  {
+
+    ofstream device_utilization_file ((user_tag + (string)"-device_utilization.csv").c_str());
+    ofstream device_utilization_all_file ((user_tag + (string)"-device_utilization_all.csv").c_str());
+    ofstream placement_information_file ((user_tag + (string)"-placement_information.csv").c_str());
+    ofstream proportional_weights_file ((user_tag + (string)"-proportional_weights.csv").c_str());
+    ofstream proportional_weights_all_file ((user_tag + (string)"-proportional_weights_all.csv").c_str());
+    ofstream absolute_weights_file ((user_tag + (string)"-absolute_weights.csv").c_str());
+
+    // write the headers
+    device_utilization_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl;
+    device_utilization_all_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl;
+    proportional_weights_file << "Device ID, Proportional Weight" << std::endl;
+    proportional_weights_all_file << "Device ID, Proportional Weight" << std::endl;
+    absolute_weights_file << "Device ID, Absolute Weight" << std::endl;
+
+    placement_information_file << "Input";
+    for (int i = 0; i < max_rep; i++) {
+      placement_information_file << ", OSD" << i;
+    }
+    placement_information_file << std::endl;
+
+    write_to_csv(device_utilization_file, tester_data.device_utilization);
+    write_to_csv(device_utilization_all_file, tester_data.device_utilization_all);
+    write_to_csv(placement_information_file, tester_data.placement_information);
+    write_to_csv(proportional_weights_file, tester_data.proportional_weights);
+    write_to_csv(proportional_weights_all_file, tester_data.proportional_weights_all);
+    write_to_csv(absolute_weights_file, tester_data.absolute_weights);
+
+    device_utilization_file.close();
+    device_utilization_all_file.close();
+    placement_information_file.close();
+    proportional_weights_file.close();
+    proportional_weights_all_file.close();
+    absolute_weights_file.close();
+
+    if (num_batches > 1) {
+      ofstream batch_device_utilization_all_file ((user_tag + (string)"-batch_device_utilization_all.csv").c_str());
+      ofstream batch_device_expected_utilization_all_file ((user_tag + (string)"-batch_device_expected_utilization_all.csv").c_str());
+
+      // Each batch row carries one value per device, so size the header
+      // from the unfiltered per-device list; device_utilization only
+      // holds the devices that actually stored objects.
+      // NOTE(review): assumes device_utilization_all holds exactly one
+      // entry per device for this data set -- confirm against test().
+      const unsigned num_columns = tester_data.device_utilization_all.size();
+
+      batch_device_utilization_all_file << "Batch Round";
+      for (unsigned i = 0; i < num_columns; i++) {
+        batch_device_utilization_all_file << ", Objects Stored on OSD" << i;
+      }
+      batch_device_utilization_all_file << std::endl;
+
+      batch_device_expected_utilization_all_file << "Batch Round";
+      for (unsigned i = 0; i < num_columns; i++) {
+        batch_device_expected_utilization_all_file << ", Objects Expected on OSD" << i;
+      }
+      batch_device_expected_utilization_all_file << std::endl;
+
+      write_to_csv(batch_device_utilization_all_file, tester_data.batch_device_utilization_all);
+      write_to_csv(batch_device_expected_utilization_all_file, tester_data.batch_device_expected_utilization_all);
+      batch_device_expected_utilization_all_file.close();
+      batch_device_utilization_all_file.close();
+    }
+  }
+
+  // helpers that format an (index, data) record and stage it in dst
+  void write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<int> vector_data);
+  void write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<float> vector_data);
+  void write_integer_indexed_scalar_data_string(vector<string> &dst, int index, int scalar_data);
+  void write_integer_indexed_scalar_data_string(vector<string> &dst, int index, float scalar_data);
+
+public:
+  /**
+   * @param c  crush map to exercise
+   * @param eo stream that receives the textual output
+   *
+   * All ranges start out unset (-1), a single batch is used, CRUSH (not
+   * the random simulator) does the placement and every output switch is off.
+   */
+  CrushTester(CrushWrapper& c, ostream& eo)
+    : crush(c), err(eo),
+      min_rule(-1), max_rule(-1),
+      ruleset(-1),
+      min_x(-1), max_x(-1),
+      min_rep(-1), max_rep(-1),
+      pool_id(-1),
+      num_batches(1),
+      use_crush(true),
+      mark_down_device_ratio(0.0),
+      mark_down_bucket_ratio(1.0),
+      output_utilization(false),
+      output_utilization_all(false),
+      output_statistics(false),
+      output_mappings(false),
+      output_bad_mappings(false),
+      output_choose_tries(false),
+      output_data_file(false),
+      output_csv(false),
+      output_data_file_name("")
+
+  { }
+
+  void set_output_data_file_name(string name) {
+    output_data_file_name = name;
+  }
+  string get_output_data_file_name() const {
+    return output_data_file_name;
+  }
+
+  void set_output_data_file(bool b) {
+    output_data_file = b;
+  }
+  bool get_output_data_file() const {
+    return output_data_file;
+  }
+
+  void set_output_csv(bool b) {
+    output_csv = b;
+  }
+  bool get_output_csv() const {
+    return output_csv;
+  }
+
+  void set_output_utilization(bool b) {
+    output_utilization = b;
+  }
+  bool get_output_utilization() const {
+    return output_utilization;
+  }
+
+  void set_output_utilization_all(bool b) {
+    output_utilization_all = b;
+  }
+  bool get_output_utilization_all() const {
+    return output_utilization_all;
+  }
+
+  void set_output_statistics(bool b) {
+    output_statistics = b;
+  }
+  bool get_output_statistics() const {
+    return output_statistics;
+  }
+
+  void set_output_mappings(bool b) {
+    output_mappings = b;
+  }
+  bool get_output_mappings() const {
+    return output_mappings;
+  }
+
+  void set_output_bad_mappings(bool b) {
+    output_bad_mappings = b;
+  }
+  bool get_output_bad_mappings() const {
+    return output_bad_mappings;
+  }
+
+  void set_output_choose_tries(bool b) {
+    output_choose_tries = b;
+  }
+  bool get_output_choose_tries() const {
+    return output_choose_tries;
+  }
+
+  void set_batches(int b) {
+    num_batches = b;
+  }
+  int get_batches() const {
+    return num_batches;
+  }
+
+  // switch from CRUSH to the random placement simulator
+  void set_random_placement() {
+    use_crush = false;
+  }
+  bool get_random_placement() const {
+    return use_crush == false;
+  }
+
+  void set_bucket_down_ratio(float bucket_ratio) {
+    mark_down_bucket_ratio = bucket_ratio;
+  }
+  float get_bucket_down_ratio() const {
+    return mark_down_bucket_ratio;
+  }
+
+  void set_device_down_ratio(float device_ratio) {
+    mark_down_device_ratio = device_ratio;
+  }
+  float get_device_down_ratio() const {
+    return mark_down_device_ratio;
+  }
+  // Historical, misnamed getter; kept so existing callers keep compiling.
+  // Prefer get_device_down_ratio().
+  float set_device_down_ratio() const {
+    return get_device_down_ratio();
+  }
+
+  void set_device_weight(int dev, float f);
+
+  void set_min_rep(int r) {
+    min_rep = r;
+  }
+  int get_min_rep() const {
+    return min_rep;
+  }
+
+  void set_max_rep(int r) {
+    max_rep = r;
+  }
+  int get_max_rep() const {
+    return max_rep;
+  }
+
+  // test exactly r replicas
+  void set_num_rep(int r) {
+    min_rep = max_rep = r;
+  }
+
+  void set_min_x(int x) {
+    min_x = x;
+  }
+
+  void set_pool_id(int64_t x){
+    pool_id = x;
+  }
+
+  int get_min_x() const {
+    return min_x;
+  }
+
+  void set_max_x(int x) {
+    max_x = x;
+  }
+  int get_max_x() const {
+    return max_x;
+  }
+
+  // test the single input x
+  void set_x(int x) {
+    min_x = max_x = x;
+  }
+
+  void set_min_rule(int rule) {
+    min_rule = rule;
+  }
+  int get_min_rule() const {
+    return min_rule;
+  }
+
+  void set_max_rule(int rule) {
+    max_rule = rule;
+  }
+  int get_max_rule() const {
+    return max_rule;
+  }
+
+  // test the single rule `rule`
+  void set_rule(int rule) {
+    min_rule = max_rule = rule;
+  }
+
+  void set_ruleset(int rs) {
+    ruleset = rs;
+  }
+
+  /**
+   * check if any bucket/nodes is referencing an unknown name or type
+   * @param max_id rejects any non-bucket items with id less than this number,
+   *               pass 0 to disable this check
+   * @return false if a dangling name/type is referenced or an item id is too
+   *         large, true otherwise
+   */
+  bool check_name_maps(unsigned max_id = 0) const;
+  /**
+   * print out overlapped crush rules belonging to the same ruleset
+   */
+  void check_overlapped_rules() const;
+  int test();
+  int test_with_fork(int timeout);
+
+  /**
+   * Map the configured inputs through both the wrapped map and `other`
+   * and report mismatches.
+   * @return 0 if every mapping matched, -1 otherwise
+   */
+  int compare(CrushWrapper& other);
+};
+
+#endif
diff --git a/src/crush/CrushTreeDumper.h b/src/crush/CrushTreeDumper.h
new file mode 100644
index 00000000..5c0430c2
--- /dev/null
+++ b/src/crush/CrushTreeDumper.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRUSH_TREE_DUMPER_H
+#define CRUSH_TREE_DUMPER_H
+
+#include "CrushWrapper.h"
+#include "include/stringify.h"
+
+/**
+ * CrushTreeDumper:
+ * A helper class and functions to dump a crush tree.
+ *
+ * Example:
+ *
+ * class SimpleDumper : public CrushTreeDumper::Dumper<ostream> {
+ * public:
+ * SimpleDumper(const CrushWrapper *crush) :
+ * CrushTreeDumper::Dumper<ostream>(crush) {}
+ * protected:
+ * virtual void dump_item(const CrushTreeDumper::Item &qi, ostream *out) {
+ * *out << qi.id;
+ * for (int k = 0; k < qi.depth; k++)
+ * *out << "-";
+ * if (qi.is_bucket())
+ * *out << crush->get_item_name(qi.id);
+ * else
+ * *out << "osd." << qi.id;
+ * *out << "\n";
+ * }
+ * };
+ *
+ * SimpleDumper(crush).dump(out);
+ *
+ */
+
+namespace CrushTreeDumper {
+
+ /**
+  * One node of the tree as queued by the dumper: its id, the id of the
+  * bucket it was reached through, its depth below the root, its weight
+  * and (for buckets) the ids of the children that were queued.
+  */
+ struct Item {
+   int id;
+   int parent;
+   int depth;
+   float weight;
+   list<int> children;
+
+   // default item: everything zeroed, no children
+   Item() : id(0), parent(0), depth(0), weight(0) {}
+
+   Item(int item_id, int parent_id, int item_depth, float item_weight)
+     : id(item_id),
+       parent(parent_id),
+       depth(item_depth),
+       weight(item_weight) {}
+
+   // buckets carry negative ids
+   bool is_bucket() const {
+     return id < 0;
+   }
+ };
+
+ /**
+  * Walks the crush tree and hands every visited Item to dump_item().
+  *
+  * The object itself is the work queue (a list<Item>): next() pops the
+  * front item and, for buckets, pushes the children it wants dumped onto
+  * the front again, so the tree is visited depth-first, one root at a time.
+  *
+  * @tparam F type of the sink passed through dump() to dump_item()
+  */
+ template <typename F>
+ class Dumper : public list<Item> {
+ public:
+   // Start from the non-shadow roots of the map.
+   explicit Dumper(const CrushWrapper *crush_,
+                   const name_map_t& weight_set_names_)
+     : crush(crush_), weight_set_names(weight_set_names_) {
+     crush->find_nonshadow_roots(&roots);
+     root = roots.begin();
+   }
+   // Start from every root (show_shadow) or only the non-shadow ones.
+   explicit Dumper(const CrushWrapper *crush_,
+                   const name_map_t& weight_set_names_,
+                   bool show_shadow)
+     : crush(crush_), weight_set_names(weight_set_names_) {
+     if (show_shadow) {
+       crush->find_roots(&roots);
+     } else {
+       crush->find_nonshadow_roots(&roots);
+     }
+     root = roots.begin();
+   }
+
+   virtual ~Dumper() {}
+
+   // Rewind to the first root and forget everything visited or queued.
+   virtual void reset() {
+     root = roots.begin();
+     touched.clear();
+     clear();
+   }
+
+   // Filter hooks; the defaults dump everything.
+   virtual bool should_dump_leaf(int i) const {
+     return true;
+   }
+   virtual bool should_dump_empty_bucket() const {
+     return true;
+   }
+
+   // A leaf is dumped if should_dump_leaf() says so. A bucket is dumped if
+   // empty buckets are allowed or if anything below it should be dumped.
+   bool should_dump(int id) {
+     if (id >= 0)
+       return should_dump_leaf(id);
+     if (should_dump_empty_bucket())
+       return true;
+     int s = crush->get_bucket_size(id);
+     for (int k = s - 1; k >= 0; k--) {
+       int c = crush->get_bucket_item(id, k);
+       if (should_dump(c))
+         return true;
+     }
+     return false;
+   }
+
+   /**
+    * Fetch the next item of the walk into qi.
+    * @return false once every root has been exhausted
+    */
+   bool next(Item &qi) {
+     if (empty()) {
+       // queue drained: move on to the next root worth dumping
+       while (root != roots.end() && !should_dump(*root))
+         ++root;
+       if (root == roots.end())
+         return false;
+       push_back(Item(*root, 0, 0, crush->get_bucket_weightf(*root)));
+       ++root;
+     }
+
+     qi = front();
+     pop_front();
+     touched.insert(qi.id);
+
+     if (qi.is_bucket()) {
+       // queue bucket contents, sorted by (class, name)
+       int s = crush->get_bucket_size(qi.id);
+       map<string,pair<int,float>> sorted;
+       for (int k = s - 1; k >= 0; k--) {
+         int id = crush->get_bucket_item(qi.id, k);
+         if (should_dump(id)) {
+           string sort_by;
+           if (id >= 0) {
+             const char *c = crush->get_item_class(id);
+             sort_by = c ? c : "";
+             sort_by += "_";
+             // zero-padded so that the lexical order matches the numeric one
+             char nn[80];
+             snprintf(nn, sizeof(nn), "osd.%08d", id);
+             sort_by += nn;
+           } else {
+             sort_by = "_";
+             sort_by += crush->get_item_name(id);
+           }
+           sorted[sort_by] = make_pair(
+             id, crush->get_bucket_item_weightf(qi.id, k));
+         }
+       }
+       // push in reverse so the smallest key ends up at the queue front
+       for (auto p = sorted.rbegin(); p != sorted.rend(); ++p) {
+         qi.children.push_back(p->second.first);
+         push_front(Item(p->second.first, qi.id, qi.depth + 1,
+                         p->second.second));
+       }
+     }
+     return true;
+   }
+
+   // Reset, then feed every item of the walk to dump_item().
+   void dump(F *f) {
+     reset();
+     Item qi;
+     while (next(qi))
+       dump_item(qi, f);
+   }
+
+   // Has id been returned by next() since the last reset()?
+   bool is_touched(int id) const { return touched.count(id) > 0; }
+
+   // Restrict the walk to the named bucket; an unknown name leaves no
+   // roots at all.
+   void set_root(const string& bucket) {
+     roots.clear();
+     if (crush->name_exists(bucket)) {
+       int i = crush->get_item_id(bucket);
+       roots.insert(i);
+     }
+     // clear() invalidated the cursor; re-seat it so that next() is safe
+     // even when the caller does not go through reset()/dump().
+     root = roots.begin();
+   }
+
+ protected:
+   // Emit one item to the sink; supplied by the concrete dumper.
+   virtual void dump_item(const Item &qi, F *f) = 0;
+
+ protected:
+   const CrushWrapper *crush;
+   const name_map_t &weight_set_names;
+
+ private:
+   set<int> touched;             // ids already handed out by next()
+   set<int> roots;               // roots still to be walked
+   set<int>::iterator root;      // cursor into roots
+ };
+
+ /**
+  * Dump the descriptive fields of one tree item to the formatter: id,
+  * device class (if any), name, type and type id; leaves additionally get
+  * crush_weight and depth.
+  *
+  * For items that hang below a bucket, a "pool_weights" section lists the
+  * item's weight in every choose_args weight set that covers the parent
+  * bucket.
+  */
+ inline void dump_item_fields(const CrushWrapper *crush,
+                              const name_map_t& weight_set_names,
+                              const Item &qi, Formatter *f) {
+   f->dump_int("id", qi.id);
+   const char *c = crush->get_item_class(qi.id);
+   if (c)
+     f->dump_string("device_class", c);
+   if (qi.is_bucket()) {
+     int type = crush->get_bucket_type(qi.id);
+     f->dump_string("name", crush->get_item_name(qi.id));
+     f->dump_string("type", crush->get_type_name(type));
+     f->dump_int("type_id", type);
+   } else {
+     f->dump_stream("name") << "osd." << qi.id;
+     f->dump_string("type", crush->get_type_name(0));
+     f->dump_int("type_id", 0);
+     f->dump_float("crush_weight", qi.weight);
+     f->dump_unsigned("depth", qi.depth);
+   }
+   if (qi.parent < 0) {
+     f->open_object_section("pool_weights");
+     for (auto& p : crush->choose_args) {
+       const crush_choose_arg_map& cmap = p.second;
+       int bidx = -1 - qi.parent;
+       // NOTE(review): other callers test get_bucket() results with
+       // IS_ERR(); confirm whether a plain null check is sufficient here.
+       const crush_bucket *b = crush->get_bucket(qi.parent);
+       if (!b ||
+           bidx >= (int)cmap.size ||
+           !cmap.args[bidx].weight_set ||
+           cmap.args[bidx].weight_set_positions < 1)
+         continue;
+
+       const crush_choose_arg& carg = cmap.args[bidx];
+
+       // locate this item inside the parent bucket, staying within both
+       // the bucket and the weight set
+       const int limit = (int)carg.weight_set[0].size < (int)b->size ?
+         (int)carg.weight_set[0].size : (int)b->size;
+       int bpos = 0;
+       while (bpos < limit && b->items[bpos] != qi.id)
+         ++bpos;
+       if (bpos >= limit)
+         continue;  // item not covered by this weight set: nothing to report
+
+       string name;
+       if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+         name = "(compat)";
+       } else {
+         auto q = weight_set_names.find(p.first);
+         name = q != weight_set_names.end() ? q->second :
+           stringify(p.first);
+       }
+       f->open_array_section(name.c_str());
+       for (unsigned opos = 0;
+            opos < carg.weight_set_positions;
+            ++opos) {
+         // weights are 16.16 fixed point
+         float w = (float)carg.weight_set[opos].weights[bpos] /
+           (float)0x10000;
+         f->dump_float("weight", w);
+       }
+       f->close_section();
+     }
+     f->close_section();
+   }
+ }
+
+ /**
+  * For a bucket item, emit a "children" array holding the ids recorded in
+  * qi.children, in list order. Leaves produce no output.
+  */
+ inline void dump_bucket_children(const CrushWrapper *crush,
+                                  const Item &qi, Formatter *f) {
+   if (qi.is_bucket()) {
+     f->open_array_section("children");
+     for (int child_id : qi.children)
+       f->dump_int("child", child_id);
+     f->close_section();
+   }
+ }
+
+ /**
+  * Dumper that writes each item as an "item" object section on a
+  * Formatter. The two virtual hooks forward to the free functions of the
+  * same name and can be overridden.
+  */
+ class FormattingDumper : public Dumper<Formatter> {
+ public:
+   explicit FormattingDumper(const CrushWrapper *crush,
+                             const name_map_t& weight_set_names)
+     : Dumper<Formatter>(crush, weight_set_names)
+   {}
+
+   explicit FormattingDumper(const CrushWrapper *crush,
+                             const name_map_t& weight_set_names,
+                             bool show_shadow)
+     : Dumper<Formatter>(crush, weight_set_names, show_shadow)
+   {}
+
+ protected:
+   // one object section per item: fields first, then the children array
+   void dump_item(const Item &qi, Formatter *f) override {
+     f->open_object_section("item");
+     this->dump_item_fields(qi, f);
+     this->dump_bucket_children(qi, f);
+     f->close_section();
+   }
+
+   virtual void dump_item_fields(const Item &qi, Formatter *f) {
+     CrushTreeDumper::dump_item_fields(this->crush, this->weight_set_names,
+                                       qi, f);
+   }
+
+   virtual void dump_bucket_children(const Item &qi, Formatter *f) {
+     CrushTreeDumper::dump_bucket_children(this->crush, qi, f);
+   }
+ };
+
+}
+
+#endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
new file mode 100644
index 00000000..2b11ce9e
--- /dev/null
+++ b/src/crush/CrushWrapper.cc
@@ -0,0 +1,4185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/osd_types.h"
+#include "common/debug.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "common/TextTable.h"
+#include "include/stringify.h"
+
+#include "CrushWrapper.h"
+#include "CrushTreeDumper.h"
+
+#define dout_subsys ceph_subsys_crush
+
+// True if some existing rule sits in a slot whose index differs from the
+// rule's mask.ruleset.
+bool CrushWrapper::has_legacy_rule_ids() const
+{
+  for (unsigned slot = 0; slot < crush->max_rules; slot++) {
+    const crush_rule *rule = crush->rules[slot];
+    if (!rule)
+      continue;
+    if (rule->mask.ruleset != slot)
+      return true;
+  }
+  return false;
+}
+
+// Force every rule's mask.ruleset to equal its slot index.
+// Returns the renumbering that was applied, as old ruleset -> new ruleset.
+std::map<int, int> CrushWrapper::renumber_rules()
+{
+  std::map<int, int> old_to_new;
+  for (unsigned slot = 0; slot < crush->max_rules; slot++) {
+    crush_rule *rule = crush->rules[slot];
+    if (!rule || rule->mask.ruleset == slot)
+      continue;
+    old_to_new[rule->mask.ruleset] = slot;
+    rule->mask.ruleset = slot;
+  }
+  return old_to_new;
+}
+
+// True if at least one existing bucket uses an algorithm other than straw2.
+bool CrushWrapper::has_non_straw2_buckets() const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (bucket && bucket->alg != CRUSH_BUCKET_STRAW2)
+      return true;
+  }
+  return false;
+}
+
+// True if any rule slot holds a rule for which is_v2_rule() is true.
+bool CrushWrapper::has_v2_rules() const
+{
+  unsigned ruleid = 0;
+  while (ruleid < crush->max_rules) {
+    if (is_v2_rule(ruleid))
+      return true;
+    ++ruleid;
+  }
+  return false;
+}
+
+// A rule counts as "v2" when it uses an indep choose step or one of the
+// SET_CHOOSE*_TRIES steps. Out-of-range ids and empty slots yield false.
+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
+{
+  if (ruleid >= crush->max_rules)
+    return false;
+  const crush_rule *rule = crush->rules[ruleid];
+  if (!rule)
+    return false;
+  for (unsigned step = 0; step < rule->len; step++) {
+    switch (rule->steps[step].op) {
+    case CRUSH_RULE_CHOOSE_INDEP:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+    case CRUSH_RULE_SET_CHOOSE_TRIES:
+    case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// True if any rule slot holds a rule for which is_v3_rule() is true.
+bool CrushWrapper::has_v3_rules() const
+{
+  unsigned ruleid = 0;
+  while (ruleid < crush->max_rules) {
+    if (is_v3_rule(ruleid))
+      return true;
+    ++ruleid;
+  }
+  return false;
+}
+
+// A rule counts as "v3" when it contains a SET_CHOOSELEAF_VARY_R step.
+// Out-of-range ids and empty slots yield false.
+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
+{
+  const crush_rule *rule =
+    ruleid < crush->max_rules ? crush->rules[ruleid] : nullptr;
+  if (!rule)
+    return false;
+  for (unsigned step = 0; step < rule->len; step++) {
+    if (rule->steps[step].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
+      return true;
+  }
+  return false;
+}
+
+// True if at least one existing bucket uses the straw2 algorithm.
+bool CrushWrapper::has_v4_buckets() const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (bucket && bucket->alg == CRUSH_BUCKET_STRAW2)
+      return true;
+  }
+  return false;
+}
+
+// True if any rule slot holds a rule for which is_v5_rule() is true.
+bool CrushWrapper::has_v5_rules() const
+{
+  unsigned ruleid = 0;
+  while (ruleid < crush->max_rules) {
+    if (is_v5_rule(ruleid))
+      return true;
+    ++ruleid;
+  }
+  return false;
+}
+
+// A rule counts as "v5" when it contains a SET_CHOOSELEAF_STABLE step.
+// Out-of-range ids and empty slots yield false.
+bool CrushWrapper::is_v5_rule(unsigned ruleid) const
+{
+  const crush_rule *rule =
+    ruleid < crush->max_rules ? crush->rules[ruleid] : nullptr;
+  if (!rule)
+    return false;
+  for (unsigned step = 0; step < rule->len; step++) {
+    if (rule->steps[step].op == CRUSH_RULE_SET_CHOOSELEAF_STABLE)
+      return true;
+  }
+  return false;
+}
+
+// True if at least one choose_args map is defined.
+bool CrushWrapper::has_choose_args() const
+{
+  const bool none_defined = choose_args.empty();
+  return !none_defined;
+}
+
+// Returns false only when choose_args is empty, or consists of the single
+// DEFAULT_CHOOSE_ARGS entry in which every used bucket argument has exactly
+// one weight-set position and no id remapping. Anything else yields true.
+bool CrushWrapper::has_incompat_choose_args() const
+{
+  if (choose_args.empty())
+    return false;
+  if (choose_args.size() > 1)
+    return true;
+
+  const auto only = choose_args.begin();
+  if (only->first != DEFAULT_CHOOSE_ARGS)
+    return true;
+
+  const crush_choose_arg_map& arg_map = only->second;
+  for (__u32 idx = 0; idx < arg_map.size; idx++) {
+    const crush_choose_arg& arg = arg_map.args[idx];
+    const bool unused = arg.weight_set_positions == 0 && arg.ids_size == 0;
+    if (unused)
+      continue;
+    if (arg.weight_set_positions != 1 || arg.ids_size != 0)
+      return true;
+  }
+  return false;
+}
+
+// Split an item whose name has the form "<name>~<class>" into the id of
+// <name> and the id of <class>. A name without '~' maps to (i, -1).
+//
+// Returns 0 on success, -EINVAL if item i does not exist, and -ENOENT if
+// either the base name or the class is unknown.
+int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
+{
+  if (!item_exists(i))
+    return -EINVAL;
+
+  const string full_name = get_item_name(i);
+  const size_t tilde = full_name.find("~");
+  if (tilde == string::npos) {
+    // no class suffix
+    *idout = i;
+    *classout = -1;
+    return 0;
+  }
+
+  const string base_name = full_name.substr(0, tilde);
+  if (!name_exists(base_name))
+    return -ENOENT;
+
+  const string class_name = full_name.substr(tilde + 1);
+  if (!class_exists(class_name))
+    return -ENOENT;
+
+  *idout = get_item_id(base_name);
+  *classout = get_class_id(class_name);
+  return 0;
+}
+
+/**
+ * Check whether srcname could be renamed to dstname.
+ *
+ * @param ss optional stream that receives a human readable reason on
+ *           failure; may be null, in which case no message is produced
+ * @return 0 if the rename is possible,
+ *         -EEXIST   if both names exist,
+ *         -EINVAL   if dstname is not a valid crush name,
+ *         -EALREADY if srcname is missing but dstname exists,
+ *         -ENOENT   if neither name exists
+ */
+int CrushWrapper::can_rename_item(const string& srcname,
+                                  const string& dstname,
+                                  ostream *ss) const
+{
+  if (name_exists(srcname)) {
+    if (name_exists(dstname)) {
+      if (ss)
+        *ss << "dstname = '" << dstname << "' already exists";
+      return -EEXIST;
+    }
+    if (is_valid_crush_name(dstname)) {
+      return 0;
+    } else {
+      if (ss)
+        *ss << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
+      return -EINVAL;
+    }
+  } else {
+    if (name_exists(dstname)) {
+      if (ss)
+        *ss << "srcname = '" << srcname << "' does not exist "
+            << "and dstname = '" << dstname << "' already exists";
+      return -EALREADY;
+    } else {
+      if (ss)
+        *ss << "srcname = '" << srcname << "' does not exist";
+      return -ENOENT;
+    }
+  }
+}
+
+// Rename the item called srcname to dstname. Any negative result of
+// can_rename_item() is returned unchanged; otherwise the result of
+// set_item_name() is returned.
+int CrushWrapper::rename_item(const string& srcname,
+                              const string& dstname,
+                              ostream *ss)
+{
+  const int rc = can_rename_item(srcname, dstname, ss);
+  if (rc < 0)
+    return rc;
+  const int item_id = get_item_id(srcname);
+  return set_item_name(item_id, dstname);
+}
+
+/**
+ * Like can_rename_item(), but additionally requires srcname to be a
+ * bucket (negative id).
+ *
+ * @param ss optional stream that receives a human readable reason on
+ *           failure; may be null
+ * @return 0 if the rename is possible, the error from can_rename_item(),
+ *         or -ENOTDIR if srcname names a non-bucket item
+ */
+int CrushWrapper::can_rename_bucket(const string& srcname,
+                                    const string& dstname,
+                                    ostream *ss) const
+{
+  int ret = can_rename_item(srcname, dstname, ss);
+  if (ret)
+    return ret;
+  int srcid = get_item_id(srcname);
+  if (srcid >= 0) {
+    if (ss)
+      *ss << "srcname = '" << srcname << "' is not a bucket "
+          << "because its id = " << srcid << " is >= 0";
+    return -ENOTDIR;
+  }
+  return 0;
+}
+
+// Rename the bucket called srcname to dstname. Any negative result of
+// can_rename_bucket() is returned unchanged; otherwise the result of
+// set_item_name() is returned.
+int CrushWrapper::rename_bucket(const string& srcname,
+                                const string& dstname,
+                                ostream *ss)
+{
+  const int rc = can_rename_bucket(srcname, dstname, ss);
+  if (rc < 0)
+    return rc;
+  const int bucket_id = get_item_id(srcname);
+  return set_item_name(bucket_id, dstname);
+}
+
+// Rename rule srcname to dstname.
+//
+// Returns 0 on success, -ENOENT if srcname is unknown, and -EEXIST if
+// dstname is already taken. When ss is non-null it receives the reason
+// for a failure.
+int CrushWrapper::rename_rule(const string& srcname,
+                              const string& dstname,
+                              ostream *ss)
+{
+  if (!rule_exists(srcname)) {
+    if (ss)
+      *ss << "source rule name '" << srcname << "' does not exist";
+    return -ENOENT;
+  }
+  if (rule_exists(dstname)) {
+    if (ss)
+      *ss << "destination rule name '" << dstname << "' already exists";
+    return -EEXIST;
+  }
+
+  const int id = get_rule_id(srcname);
+  auto entry = rule_name_map.find(id);
+  ceph_assert(entry != rule_name_map.end());
+  entry->second = dstname;
+
+  // keep the reverse map in step, but only if it is currently valid
+  if (have_rmaps) {
+    rule_name_rmap.erase(srcname);
+    rule_name_rmap[dstname] = id;
+  }
+  return 0;
+}
+
+// Collect into *roots the arg1 of every TAKE step of every existing rule.
+void CrushWrapper::find_takes(set<int> *roots) const
+{
+  for (unsigned slot = 0; slot < crush->max_rules; slot++) {
+    const crush_rule *rule = crush->rules[slot];
+    if (rule) {
+      for (unsigned s = 0; s < rule->len; s++) {
+        const auto& step = rule->steps[s];
+        if (step.op == CRUSH_RULE_TAKE)
+          roots->insert(step.arg1);
+      }
+    }
+  }
+}
+
+// Collect into *roots the arg1 of every TAKE step of the given rule.
+// Out-of-range rule ids and empty rule slots leave *roots untouched.
+void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
+{
+  const bool in_range = rule >= 0 && rule < (int)crush->max_rules;
+  if (!in_range)
+    return;
+  const crush_rule *target = crush->rules[rule];
+  if (!target)
+    return;
+  for (unsigned s = 0; s < target->len; s++) {
+    const auto& step = target->steps[s];
+    if (step.op == CRUSH_RULE_TAKE)
+      roots->insert(step.arg1);
+  }
+}
+
+// Collect into *roots the id of every existing bucket for which
+// _search_item_exists() reports false.
+void CrushWrapper::find_roots(set<int> *roots) const
+{
+  for (int slot = 0; slot < crush->max_buckets; slot++) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (bucket && !_search_item_exists(bucket->id))
+      roots->insert(bucket->id);
+  }
+}
+
+// True if item is root itself or appears anywhere in the subtree below
+// root.
+bool CrushWrapper::subtree_contains(int root, int item) const
+{
+  if (root == item)
+    return true;
+
+  // a non-negative root is a leaf and has nothing below it
+  if (root >= 0)
+    return false;
+
+  const crush_bucket *bucket = get_bucket(root);
+  if (IS_ERR(bucket))
+    return false;
+
+  for (unsigned pos = 0; pos < bucket->size; pos++) {
+    const int child = bucket->items[pos];
+    if (subtree_contains(child, item))
+      return true;
+  }
+  return false;
+}
+
+// Clean up after `item` once nothing references it any more.
+//
+// Returns false (and changes nothing) if _search_item_exists() still finds
+// the item, or if it is a bucket that _bucket_is_in_use() reports as used.
+// Otherwise returns true after:
+//   - removing the bucket itself plus its class bookkeeping (buckets only,
+//     and only when !unlink_only),
+//   - dropping the item's name (devices always, buckets only when
+//     !unlink_only) and invalidating the reverse name maps,
+//   - rebuilding the per-class shadow roots.
+bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
+{
+ // last instance?
+ if (_search_item_exists(item)) {
+ return false;
+ }
+ if (item < 0 && _bucket_is_in_use(item)) {
+ return false;
+ }
+
+ if (item < 0 && !unlink_only) {
+ // NOTE(review): the result of get_bucket() is not checked with IS_ERR()
+ // here, unlike in remove_root() -- confirm the bucket is guaranteed to
+ // exist at this point.
+ crush_bucket *t = get_bucket(item);
+ ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
+ crush_remove_bucket(crush, t);
+ if (class_bucket.count(item) != 0)
+ class_bucket.erase(item);
+ class_remove_item(item);
+ update_choose_args(cct);
+ }
+ if ((item >= 0 || !unlink_only) && name_map.count(item)) {
+ ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
+ name_map.erase(item);
+ // the reverse maps are now stale
+ have_rmaps = false;
+ if (item >= 0 && !unlink_only) {
+ class_remove_item(item);
+ }
+ }
+ rebuild_roots_with_classes(cct);
+ return true;
+}
+
+// Recursively remove the bucket `item` together with every bucket below it,
+// including names, class bookkeeping and choose_args.
+//
+// Returns 0 on success (also when the bucket is already gone), or the
+// first negative result from removing a child bucket.
+int CrushWrapper::remove_root(CephContext *cct, int item)
+{
+  crush_bucket *bucket = get_bucket(item);
+  if (IS_ERR(bucket)) {
+    // Must be idempotent: 'crush link' can link the same host into
+    // different roots, so different shadow trees may reference the same
+    // hosts too. While rebuilding all shadow trees we may therefore be
+    // asked to destroy the same buckets (hosts, racks, etc.) several times.
+    return 0;
+  }
+
+  // remove child buckets first; devices (ids >= 0) are left alone
+  for (unsigned pos = 0; pos < bucket->size; pos++) {
+    const int child = bucket->items[pos];
+    if (child < 0) {
+      const int rc = remove_root(cct, child);
+      if (rc < 0)
+        return rc;
+    }
+  }
+
+  crush_remove_bucket(crush, bucket);
+
+  if (name_map.count(item) != 0) {
+    name_map.erase(item);
+    have_rmaps = false;  // the reverse maps are now stale
+  }
+  if (class_bucket.count(item) != 0)
+    class_bucket.erase(item);
+  class_remove_item(item);
+  update_choose_args(cct);
+  return 0;
+}
+
+// Bring every choose_args map back in line with the current buckets.
+// For each bucket slot of each map:
+//  - if the bucket is gone or is not a straw2 bucket, its ids and
+//    weight_set arrays are freed and the related counters reset;
+//  - if the weight_set has no positions, or a position count different
+//    from get_choose_args_positions() for the map, it is left untouched
+//    (the mismatch is logged);
+//  - otherwise any per-position weight array whose size differs from the
+//    bucket size is reallocated to the bucket size, keeping the
+//    overlapping prefix of the old weights.
+// cct may be null, in which case nothing is logged.
+void CrushWrapper::update_choose_args(CephContext *cct)
+{
+ for (auto& i : choose_args) {
+ crush_choose_arg_map &arg_map = i.second;
+ assert(arg_map.size == (unsigned)crush->max_buckets);
+ unsigned positions = get_choose_args_positions(arg_map);
+ for (int j = 0; j < crush->max_buckets; ++j) {
+ crush_bucket *b = crush->buckets[j];
+ assert(j < (int)arg_map.size);
+ auto& carg = arg_map.args[j];
+ // strip out choose_args for any buckets that no longer exist
+ // (or that are not straw2 buckets)
+ if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
+ if (carg.ids) {
+ if (cct)
+ ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
+ << (-1-j) << " ids" << dendl;
+ free(carg.ids);
+ carg.ids = 0;
+ carg.ids_size = 0;
+ }
+ if (carg.weight_set) {
+ if (cct)
+ ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
+ << (-1-j) << " weight_sets" << dendl;
+ // each position owns its own weights array
+ for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
+ free(carg.weight_set[p].weights);
+ }
+ free(carg.weight_set);
+ carg.weight_set = 0;
+ carg.weight_set_positions = 0;
+ }
+ continue;
+ }
+ if (carg.weight_set_positions == 0) {
+ continue; // skip it
+ }
+ if (carg.weight_set_positions != positions) {
+ if (cct)
+ lderr(cct) << __func__ << " " << i.first << " bucket "
+ << (-1-j) << " positions " << carg.weight_set_positions
+ << " -> " << positions << dendl;
+ continue; // wth... skip!
+ }
+ // mis-sized weight_sets? this shouldn't ever happen.
+ for (unsigned p = 0; p < positions; ++p) {
+ if (carg.weight_set[p].size != b->size) {
+ if (cct)
+ lderr(cct) << __func__ << " fixing " << i.first << " bucket "
+ << (-1-j) << " position " << p
+ << " size " << carg.weight_set[p].size << " -> "
+ << b->size << dendl;
+ // keep a copy of the old descriptor so its weights can be copied
+ // over and freed after the replacement array is in place
+ auto old_ws = carg.weight_set[p];
+ carg.weight_set[p].size = b->size;
+ // calloc zero-fills, so entries beyond the old size start at 0
+ carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+ auto max = std::min<unsigned>(old_ws.size, b->size);
+ for (unsigned k = 0; k < max; ++k) {
+ carg.weight_set[p].weights[k] = old_ws.weights[k];
+ }
+ free(old_ws.weights);
+ }
+ }
+ }
+ }
+}
+
+// Remove item from every bucket that lists it and then give
+// _maybe_remove_last_instance() a chance to clean up.
+//
+// When item is a bucket and unlink_only is false, the bucket must exist
+// (-ENOENT otherwise), be empty (-ENOTEMPTY) and not be in use (-EBUSY).
+// Returns 0 if the item was removed from at least one bucket or the
+// last-instance cleanup ran, -ENOENT otherwise.
+int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
+{
+  ldout(cct, 5) << "remove_item " << item
+                << (unlink_only ? " unlink_only":"") << dendl;
+
+  if (item < 0 && !unlink_only) {
+    crush_bucket *target = get_bucket(item);
+    if (IS_ERR(target)) {
+      ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
+                    << dendl;
+      return -ENOENT;
+    }
+    if (target->size) {
+      ldout(cct, 1) << "remove_item bucket " << item << " has " << target->size
+                    << " items, not empty" << dendl;
+      return -ENOTEMPTY;
+    }
+    if (_bucket_is_in_use(item)) {
+      return -EBUSY;
+    }
+  }
+
+  int result = -ENOENT;
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *holder = crush->buckets[slot];
+    if (!holder) {
+      continue;
+    }
+    // holder->size is re-read on every pass because the bucket is
+    // modified inside the loop
+    for (unsigned pos = 0; pos < holder->size; ++pos) {
+      if (holder->items[pos] != item) {
+        continue;
+      }
+      ldout(cct, 5) << "remove_item removing item " << item
+                    << " from bucket " << holder->id << dendl;
+      adjust_item_weight_in_bucket(cct, item, 0, holder->id, true);
+      bucket_remove_item(holder, item);
+      result = 0;
+    }
+  }
+
+  if (_maybe_remove_last_instance(cct, item, unlink_only)) {
+    result = 0;
+  }
+  return result;
+}
+
+// Return true if any existing bucket lists item among its items.
+bool CrushWrapper::_search_item_exists(int item) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == item) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Return true if item is still referenced, i.e. if
+//  - it is recorded as a per-class bucket id in class_bucket, or
+//  - some rule has a TAKE step whose argument is item itself or an id
+//    that split_id_class() resolves to item.
+//
+// A TAKE step that split_id_class() cannot resolve is skipped.  The scan
+// used to stop and report "not in use" at the first such step, which hid
+// any reference to item in later steps or later rules.
+bool CrushWrapper::_bucket_is_in_use(int item)
+{
+  for (const auto& per_bucket : class_bucket) {
+    for (const auto& per_class : per_bucket.second) {
+      if (per_class.second == item) {
+        return true;
+      }
+    }
+  }
+
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    const crush_rule *rule = crush->rules[ruleno];
+    if (!rule) {
+      continue;
+    }
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE) {
+        continue;
+      }
+      const int step_item = rule->steps[s].arg1;
+      int original_item = 0;
+      int class_id = 0;
+      if (split_id_class(step_item, &original_item, &class_id) < 0) {
+        // unresolvable step: ignore it, but keep checking the others
+        continue;
+      }
+      if (step_item == item || original_item == item) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Remove item from bucket `ancestor` and, through remove_item_under(),
+// from the buckets nested inside it.
+// Returns -EINVAL if ancestor is not an existing bucket, 0 if at least one
+// occurrence was removed, and -ENOENT otherwise.
+int CrushWrapper::_remove_item_under(
+  CephContext *cct, int item, int ancestor, bool unlink_only)
+{
+  ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
+                << (unlink_only ? " unlink_only":"") << dendl;
+
+  // the ancestor has to be a bucket (negative id) that actually exists
+  if (ancestor >= 0 || !bucket_exists(ancestor)) {
+    return -EINVAL;
+  }
+
+  int result = -ENOENT;
+  crush_bucket *parent = get_bucket(ancestor);
+  for (unsigned pos = 0; pos < parent->size; ++pos) {
+    const int child = parent->items[pos];
+    if (child == item) {
+      ldout(cct, 5) << "_remove_item_under removing item " << item
+                    << " from bucket " << parent->id << dendl;
+      adjust_item_weight_in_bucket(cct, item, 0, parent->id, true);
+      bucket_remove_item(parent, item);
+      result = 0;
+    } else if (child < 0) {
+      // nested bucket: look for the item below it as well
+      if (remove_item_under(cct, item, child, unlink_only) == 0) {
+        result = 0;
+      }
+    }
+  }
+  return result;
+}
+
+// Remove item from the subtree rooted at bucket `ancestor`, then let
+// _maybe_remove_last_instance() clean up if nothing references it any more.
+//
+// Returns -EBUSY if !unlink_only and the item is in use, any negative
+// result of _remove_item_under(), -ENOENT / -ENOTEMPTY if a bucket item
+// that is to be fully removed does not exist / still has items, and
+// otherwise 0.
+int CrushWrapper::remove_item_under(
+  CephContext *cct, int item, int ancestor, bool unlink_only)
+{
+  ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
+                << (unlink_only ? " unlink_only":"") << dendl;
+
+  if (!unlink_only && _bucket_is_in_use(item)) {
+    return -EBUSY;
+  }
+
+  const int rc = _remove_item_under(cct, item, ancestor, unlink_only);
+  if (rc < 0) {
+    return rc;
+  }
+
+  if (item < 0 && !unlink_only) {
+    crush_bucket *target = get_bucket(item);
+    if (IS_ERR(target)) {
+      ldout(cct, 1) << "remove_item_under bucket " << item
+                    << " does not exist" << dendl;
+      return -ENOENT;
+    }
+    if (target->size) {
+      ldout(cct, 1) << "remove_item_under bucket " << item << " has " << target->size
+                    << " items, not empty" << dendl;
+      return -ENOTEMPTY;
+    }
+  }
+
+  return _maybe_remove_last_instance(cct, item, unlink_only) ? 0 : rc;
+}
+
+// Find the lowest-numbered type at which item `id` and the location `loc`
+// share a bucket.
+//
+// type_map is walked in key order; for each type that appears in id's full
+// location, every value loc holds for that type name is compared with id's
+// bucket of that type.  The first match yields that type's id.
+// Returns -ENOENT if id does not exist and -ERANGE if nothing matches.
+//
+// equal_range() is used to enumerate loc's values for a type:
+// multimap::find() may return any of several equal-keyed elements, so a
+// forward scan starting from find() is not guaranteed to see all of them.
+int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
+                                               const std::multimap<string,string>& loc) const
+{
+  ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
+  if (!item_exists(id))
+    return -ENOENT;
+  map<string,string> id_loc = get_full_location(id);
+  ldout(cct, 20) << " id is at " << id_loc << dendl;
+
+  for (const auto& type_entry : type_map) {
+    const string& type_name = type_entry.second;
+    const auto mine = id_loc.find(type_name);
+    if (mine == id_loc.end())
+      continue;
+    const auto candidates = loc.equal_range(type_name);
+    for (auto q = candidates.first; q != candidates.second; ++q) {
+      if (q->second == mine->second)
+        return type_entry.first;
+    }
+  }
+  return -ERANGE;
+}
+
+// Parse "key=value" strings into *ploc (cleared first).  A later entry
+// with the same key overwrites an earlier one.
+// Returns -EINVAL as soon as an argument has no '=' or an empty value;
+// entries parsed before that point remain in *ploc.
+int CrushWrapper::parse_loc_map(const std::vector<string>& args,
+                                std::map<string,string> *ploc)
+{
+  ploc->clear();
+  for (const auto& arg : args) {
+    const char *text = arg.c_str();
+    const char *eq = strchr(text, '=');
+    if (!eq) {
+      return -EINVAL;
+    }
+    string value(eq + 1);
+    if (value.empty()) {
+      return -EINVAL;
+    }
+    // the key is everything before the first '='
+    (*ploc)[string(text, eq - text)] = value;
+  }
+  return 0;
+}
+
+// Parse "key=value" strings into *ploc (cleared first), keeping every
+// value given for a repeated key.
+// Returns -EINVAL as soon as an argument has no '=' or an empty value;
+// entries parsed before that point remain in *ploc.
+int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
+                                     std::multimap<string,string> *ploc)
+{
+  ploc->clear();
+  for (const auto& arg : args) {
+    const char *text = arg.c_str();
+    const char *eq = strchr(text, '=');
+    if (!eq) {
+      return -EINVAL;
+    }
+    string key(text, eq - text);  // everything before the first '='
+    string value(eq + 1);
+    if (value.empty()) {
+      return -EINVAL;
+    }
+    ploc->insert(make_pair(key, value));
+  }
+  return 0;
+}
+
+// Check whether item sits directly inside the bucket that loc names for
+// the first (lowest type id, skipping type 0) type it specifies.
+//
+// Only that first specified level is examined: the result is true if the
+// named bucket exists and lists item (storing the item's weight in *weight
+// when weight is non-null), and false otherwise, including when loc
+// specifies no known type at all.
+bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
+                                  int *weight)
+{
+  ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
+
+  for (const auto& level : type_map) {
+    // ignore device
+    if (level.first == 0) {
+      continue;
+    }
+    const string& type_name = level.second;
+
+    // ignore types that aren't specified in loc
+    const auto where = loc.find(type_name);
+    if (where == loc.end()) {
+      ldout(cct, 2) << "warning: did not specify location for '" << type_name << "' level (levels are "
+                    << type_map << ")" << dendl;
+      continue;
+    }
+    const string& bucket_name = where->second;
+
+    if (!name_exists(bucket_name)) {
+      ldout(cct, 5) << "check_item_loc bucket " << bucket_name << " dne" << dendl;
+      return false;
+    }
+
+    const int id = get_item_id(bucket_name);
+    if (id >= 0) {
+      ldout(cct, 5) << "check_item_loc requested " << bucket_name << " for type " << type_name
+                    << " is a device, not bucket" << dendl;
+      return false;
+    }
+
+    ceph_assert(bucket_exists(id));
+    crush_bucket *b = get_bucket(id);
+
+    // see if item exists in this bucket
+    for (unsigned pos = 0; pos < b->size; ++pos) {
+      if (b->items[pos] != item) {
+        continue;
+      }
+      ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
+      if (weight) {
+        *weight = crush_get_bucket_item_weight(b, pos);
+      }
+      return true;
+    }
+    return false;
+  }
+
+  ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
+  return false;
+}
+
+// Return the ancestors of id as a map of type name -> bucket name, built
+// from get_full_location_ordered().  If a type name occurs more than once,
+// the first occurrence (the one nearest to id) is kept.
+map<string, string> CrushWrapper::get_full_location(int id) const
+{
+  vector<pair<string, string> > ordered;
+  get_full_location_ordered(id, ordered);
+
+  map<string,string> by_type;
+  for (const auto& coord : ordered) {
+    by_type.emplace(coord.first, coord.second);
+  }
+  return by_type;
+}
+
+// Look up the item called name and store its full location (type name ->
+// bucket name) in *ploc.  Returns 0 on success, -ENOENT if no item has
+// that name (in which case *ploc is not touched).
+int CrushWrapper::get_full_location(const string& name,
+                                    map<string,string> *ploc)
+{
+  build_rmaps();
+  const auto found = name_rmap.find(name);
+  if (found != name_rmap.end()) {
+    *ploc = get_full_location(found->second);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// Append to path the (type name, bucket name) pair of every ancestor of
+// id, nearest parent first, by following get_immediate_parent() until it
+// reports an error.  Returns -ENOENT if id does not exist, 0 otherwise.
+int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path) const
+{
+  if (!item_exists(id)) {
+    return -ENOENT;
+  }
+  for (int cur = id;;) {
+    int rc = 0;
+    pair<string, string> parent_coord = get_immediate_parent(cur, &rc);
+    if (rc != 0) {
+      break;
+    }
+    path.push_back(parent_coord);
+    // continue the climb from the parent we just recorded
+    cur = get_item_id(parent_coord.second);
+  }
+  return 0;
+}
+
+// Render the ancestors of id as "type=name,type=name,...", listing the
+// most distant ancestor first and the immediate parent last.
+string CrushWrapper::get_full_location_ordered_string(int id) const
+{
+  vector<pair<string, string> > coords;
+  get_full_location_ordered(id, coords);
+
+  string joined;
+  // coords holds the nearest parent first, so walk it backwards
+  for (auto it = coords.rbegin(); it != coords.rend(); ++it) {
+    if (it != coords.rbegin()) {
+      joined += ",";
+    }
+    joined += it->first + "=" + it->second;
+  }
+  return joined;
+}
+
+// Build a map from type id to the type name reported by
+// get_immediate_parent() while climbing from id.
+//
+// Starting one above id's own bucket type (treated as 0 when
+// get_bucket_type() is negative), one entry is written for every type id
+// up to the largest key of type_map, taking one step up the hierarchy per
+// entry.
+map<int, string> CrushWrapper::get_parent_hierarchy(int id) const
+{
+  map<int,string> hierarchy;
+  pair<string, string> coord = get_immediate_parent(id);
+
+  // A negative type is taken to mean that id is an OSD.
+  // FIXME: change behavior in get_item_type
+  int level = get_bucket_type(id);
+  if (level < 0) {
+    level = 0;
+  }
+
+  // the largest type id known to the type map (0 when the map is empty)
+  const int top = type_map.empty() ? 0 : type_map.rbegin()->first;
+
+  int parent_id = get_item_id(coord.second);
+  while (level < top) {
+    ++level;
+    hierarchy[level] = coord.first;
+
+    if (level < top) {
+      // fetch the coordinates of the next parent up
+      coord = get_immediate_parent(parent_id);
+      parent_id = get_item_id(coord.second);
+    }
+  }
+
+  return hierarchy;
+}
+
+// Append the direct items of bucket id to *children.
+// Returns the number of items appended, 0 for a leaf (id >= 0), or
+// -ENOENT if the bucket cannot be found.
+int CrushWrapper::get_children(int id, list<int> *children) const
+{
+  if (id >= 0) {
+    // leaves have no children
+    return 0;
+  }
+
+  auto *bucket = get_bucket(id);
+  if (IS_ERR(bucket)) {
+    return -ENOENT;
+  }
+
+  for (unsigned idx = 0; idx < bucket->size; ++idx) {
+    children->push_back(bucket->items[idx]);
+  }
+  return bucket->size;
+}
+
+// Insert every item found anywhere below bucket id into *children.
+// Returns the number of items visited (an item reachable through several
+// paths is counted each time), 0 for a leaf, or a negative error if id or
+// a nested bucket cannot be found.
+int CrushWrapper::get_all_children(int id, set<int> *children) const
+{
+  if (id >= 0) {
+    // leaves have no children
+    return 0;
+  }
+
+  auto *bucket = get_bucket(id);
+  if (IS_ERR(bucket)) {
+    return -ENOENT;
+  }
+
+  int visited = 0;
+  for (unsigned idx = 0; idx < bucket->size; ++idx) {
+    const int child = bucket->items[idx];
+    children->insert(child);
+    const int below = get_all_children(child, children);
+    if (below < 0) {
+      return below;
+    }
+    visited += 1 + below;
+  }
+  return visited;
+}
+
+// Append to *children the items of the given type found at or below id.
+//
+// A leaf (id >= 0) is appended only when type is 0.  A bucket of exactly
+// the requested type is appended (unless it is a shadow item and
+// exclude_shadow is set) and not descended into.  A bucket whose type is
+// lower than the requested one is not searched; any other bucket is
+// searched recursively.
+void CrushWrapper::get_children_of_type(int id,
+                                        int type,
+                                        vector<int> *children,
+                                        bool exclude_shadow) const
+{
+  if (id >= 0) {
+    // reached a leaf; only wanted when leaves were asked for
+    if (type == 0) {
+      children->push_back(id);
+    }
+    return;
+  }
+
+  auto bucket = get_bucket(id);
+  if (IS_ERR(bucket)) {
+    return;
+  }
+
+  if (bucket->type < type) {
+    // give up
+    return;
+  }
+  if (bucket->type == type) {
+    if (!is_shadow_item(bucket->id) || !exclude_shadow) {
+      children->push_back(bucket->id);
+    }
+    return;
+  }
+
+  for (unsigned idx = 0; idx < bucket->size; ++idx) {
+    get_children_of_type(bucket->items[idx], type, children, exclude_shadow);
+  }
+}
+
+// Check that the OSD set `up` is compatible with the failure-domain
+// constraints expressed by the steps of rule rule_id.
+//
+//  - chooseleaf steps with a non-zero type: no two OSDs of `up` may share
+//    the same parent bucket of that type;
+//  - choose steps with a non-zero type: the OSDs of `up` may span at most
+//    numrep distinct buckets of that type, where a numrep <= 0 is taken
+//    relative to pool_size;
+//  - all other steps are ignored.
+//
+// OSDs whose parent of the wanted type cannot be determined are logged and
+// skipped.  Returns 0 if all checks pass, -ENOENT if the rule does not
+// exist and -EINVAL if a constraint is violated.
+int CrushWrapper::verify_upmap(CephContext *cct,
+                               int rule_id,
+                               int pool_size,
+                               const vector<int>& up)
+{
+  auto rule = get_rule(rule_id);
+  if (IS_ERR(rule) || !rule) {
+    lderr(cct) << __func__ << " rule " << rule_id << " does not exist"
+               << dendl;
+    return -ENOENT;
+  }
+  for (unsigned step = 0; step < rule->len; ++step) {
+    auto curstep = &rule->steps[step];
+    ldout(cct, 10) << __func__ << " step " << step << dendl;
+    switch (curstep->op) {
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      {
+        int type = curstep->arg2;
+        if (type == 0) // osd
+          break;
+        map<int, set<int>> osds_by_parent; // parent_of_desired_type -> osds
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            osds_by_parent[parent].insert(osd);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        // iterate by reference: each entry carries a whole set<int>
+        for (const auto& i : osds_by_parent) {
+          if (i.second.size() > 1) {
+            lderr(cct) << __func__ << " multiple osds " << i.second
+                       << " come from same failure domain " << i.first
+                       << dendl;
+            return -EINVAL;
+          }
+        }
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      {
+        int numrep = curstep->arg1;
+        int type = curstep->arg2;
+        if (type == 0) // osd
+          break;
+        if (numrep <= 0)
+          numrep += pool_size;
+        set<int> parents_of_type;
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            parents_of_type.insert(parent);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        if ((int)parents_of_type.size() > numrep) {
+          lderr(cct) << __func__ << " number of buckets "
+                     << parents_of_type.size() << " exceeds desired " << numrep
+                     << dendl;
+          return -EINVAL;
+        }
+      }
+      break;
+
+    default:
+      // ignore
+      break;
+    }
+  }
+  return 0;
+}
+
+// Append to *leaves every leaf (id >= 0) found at or below id, in
+// traversal order; a leaf reachable through several paths is appended each
+// time.  Returns 0 on success or -ENOENT if a bucket cannot be found.
+int CrushWrapper::_get_leaves(int id, list<int> *leaves) const
+{
+  ceph_assert(leaves);
+
+  if (id >= 0) {
+    // id is itself a leaf
+    leaves->push_back(id);
+    return 0;
+  }
+
+  auto bucket = get_bucket(id);
+  if (IS_ERR(bucket)) {
+    return -ENOENT;
+  }
+
+  for (unsigned idx = 0; idx < bucket->size; ++idx) {
+    const int child = bucket->items[idx];
+    if (child >= 0) {
+      leaves->push_back(child);
+      continue;
+    }
+    // nested bucket: collect its leaves recursively
+    const int rc = _get_leaves(child, leaves);
+    if (rc < 0) {
+      return rc;
+    }
+  }
+
+  return 0;
+}
+
+// Replace *leaves with the set of leaf ids found at or below the item
+// called name.  Returns 0 on success, -ENOENT if the name is unknown, or a
+// negative error from _get_leaves() (in which case *leaves stays empty).
+int CrushWrapper::get_leaves(const string &name, set<int> *leaves) const
+{
+  ceph_assert(leaves);
+  leaves->clear();
+
+  if (!name_exists(name)) {
+    return -ENOENT;
+  }
+
+  const int id = get_item_id(name);
+  if (id >= 0) {
+    // the name refers to a leaf
+    leaves->insert(id);
+    return 0;
+  }
+
+  list<int> collected;
+  const int rc = _get_leaves(id, &collected);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // the set drops duplicates and orders the ids
+  leaves->insert(collected.begin(), collected.end());
+  return 0;
+}
+
+// Insert `item`, called `name`, with `weight` at the position described by
+// `loc` (type name -> bucket name).
+//
+// type_map is walked in key order, skipping type 0 and types that loc does
+// not mention.  A bucket named in loc that does not exist yet is created
+// holding the current item and then becomes the current item for the next
+// level.  The walk stops at the first named bucket that already exists:
+// the current item is added to it with weight 0.  The requested weight is
+// then applied through adjust_item_weightf_in_loc().
+//
+// Returns 0 on success.  Errors visible here: -EINVAL (invalid name or
+// loc, loc names something that is not a bucket, the item is already
+// beneath the target bucket, the bucket type does not match, or no place
+// to add the item was found), -EEXIST (name already belongs to another
+// id), -ELOOP (the item already contains the target bucket), plus whatever
+// validate_weightf(), add_bucket() or rebuild_roots_with_classes() return.
+// Note that names set and buckets created before a failure are not undone
+// in this function.
+int CrushWrapper::insert_item(
+ CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc, // typename -> bucketname
+ bool init_weight_sets)
+{
+ ldout(cct, 5) << "insert_item item " << item << " weight " << weight
+ << " name " << name << " loc " << loc << dendl;
+
+ if (!is_valid_crush_name(name))
+ return -EINVAL;
+
+ if (!is_valid_crush_loc(cct, loc))
+ return -EINVAL;
+
+ int r = validate_weightf(weight);
+ if (r < 0) {
+ return r;
+ }
+
+ // the name must either be unused or already refer to this very item
+ if (name_exists(name)) {
+ if (get_item_id(name) != item) {
+ ldout(cct, 10) << "device name '" << name << "' already exists as id "
+ << get_item_id(name) << dendl;
+ return -EEXIST;
+ }
+ } else {
+ set_item_name(item, name);
+ }
+
+ // the item that has to be placed at the next level; starts out as the
+ // item being inserted and becomes each newly created bucket in turn
+ int cur = item;
+
+ // Create the buckets named in loc that don't exist yet and add the
+ // child to its location with weight 0; see the insert_item declaration
+ // in CrushWrapper.h for more detail.
+ for (auto p = type_map.begin(); p != type_map.end(); ++p) {
+ // ignore device type
+ if (p->first == 0)
+ continue;
+
+ // skip types that are unspecified
+ map<string,string>::const_iterator q = loc.find(p->second);
+ if (q == loc.end()) {
+ ldout(cct, 2) << "warning: did not specify location for '"
+ << p->second << "' level (levels are "
+ << type_map << ")" << dendl;
+ continue;
+ }
+
+ if (!name_exists(q->second)) {
+ ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
+ int empty = 0, newid;
+ // new bucket of type p->first holding the single item cur, with
+ // &empty (0) passed as its weight; this r shadows the outer r
+ int r = add_bucket(0, 0,
+ CRUSH_HASH_DEFAULT, p->first, 1, &cur, &empty, &newid);
+ if (r < 0) {
+ ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ set_item_name(newid, q->second);
+
+ // the new bucket is what gets placed at the next level up
+ cur = newid;
+ continue;
+ }
+
+ // add to an existing bucket
+ int id = get_item_id(q->second);
+ if (!bucket_exists(id)) {
+ ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
+ return -EINVAL;
+ }
+
+ // check that we aren't creating a cycle.
+ if (subtree_contains(id, cur)) {
+ ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
+ << id << dendl;
+ return -EINVAL;
+ }
+
+ // we have done sanity check above
+ crush_bucket *b = get_bucket(id);
+
+ if (p->first != b->type) {
+ ldout(cct, 1) << "insert_item existing bucket has type "
+ << "'" << type_map[b->type] << "' != "
+ << "'" << type_map[p->first] << "'" << dendl;
+ return -EINVAL;
+ }
+
+ // are we forming a loop?
+ if (subtree_contains(cur, b->id)) {
+ ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
+ << "; cannot form loop" << dendl;
+ return -ELOOP;
+ }
+
+ ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
+ << " to bucket " << id << dendl;
+ // added with weight 0 here; the real weight is applied below
+ [[maybe_unused]] int r = bucket_add_item(b, cur, 0);
+ ceph_assert(!r);
+ // an existing bucket was reached, so higher levels are left alone
+ break;
+ }
+
+ // adjust the item's weight in location
+ // (weight sets are only initialized for devices, i.e. item >= 0)
+ if (adjust_item_weightf_in_loc(cct, item, weight, loc,
+ item >= 0 && init_weight_sets) > 0) {
+ if (item >= crush->max_devices) {
+ crush->max_devices = item + 1;
+ ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
+ << dendl;
+ }
+ r = rebuild_roots_with_classes(cct);
+ if (r < 0) {
+ ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+ }
+
+ ldout(cct, 1) << "error: didn't find anywhere to add item " << item
+ << " in " << loc << dendl;
+ return -EINVAL;
+}
+
+
+// Move bucket id to loc by detaching it from its current parent and
+// inserting it again under its own name with the weight reported by
+// detach_bucket().  Returns -EINVAL for a non-bucket id, -ENOENT if the
+// bucket does not exist, otherwise the result of insert_item().
+int CrushWrapper::move_bucket(
+  CephContext *cct, int id, const map<string,string>& loc)
+{
+  // only buckets (negative ids) can be moved
+  if (id >= 0) {
+    return -EINVAL;
+  }
+  if (!item_exists(id)) {
+    return -ENOENT;
+  }
+
+  // remember the name now; it is needed for the re-insert
+  string bucket_name = get_item_name(id);
+
+  // take the bucket out of the hierarchy ...
+  const int detached_weight = detach_bucket(cct, id);
+
+  // ... and put it back at the new location (weights are 16.16 fixed point)
+  const float weightf = detached_weight / (float)0x10000;
+  return insert_item(cct, id, weightf, bucket_name, loc, false);
+}
+
+// Unhook bucket item from its immediate parent, if it has one, and return
+// the bucket's weight as it was before detaching.
+// Returns -EINVAL when there is no crush map or item is not a bucket id,
+// or the error carried by the parent lookup when it is not -ENOENT.
+// Asserts that the bucket exists and that the detach took effect.
+int CrushWrapper::detach_bucket(CephContext *cct, int item)
+{
+  if (!crush || item >= 0) {
+    return (-EINVAL);
+  }
+
+  // the bucket we are asked to detach has to exist
+  ceph_assert(bucket_exists(item));
+
+  // remember the weight before anything is changed
+  const unsigned bucket_weight = get_bucket(item)->weight;
+
+  // locate the parent: (type name, bucket name) -> id -> bucket
+  pair<string, string> bucket_location = get_immediate_parent(item);
+  const int parent_id = get_item_id(bucket_location.second);
+  crush_bucket *parent_bucket = get_bucket(parent_id);
+
+  if (IS_ERR(parent_bucket)) {
+    // having no parent at all is fine; anything else is an error
+    if (PTR_ERR(parent_bucket) != -ENOENT) {
+      return PTR_ERR(parent_bucket);
+    }
+  } else {
+    // zero the bucket's weight in the parent, then unlink it
+    adjust_item_weight_in_bucket(cct, item, 0, parent_bucket->id, true);
+    bucket_remove_item(parent_bucket, item);
+  }
+
+  // double-check that the bucket is no longer at its old location
+  int test_weight = 0;
+  map<string,string> test_location;
+  test_location[ bucket_location.first ] = (bucket_location.second);
+
+  bool successful_detach = !(check_item_loc(cct, item, test_location,
+                                            &test_weight));
+  ceph_assert(successful_detach);
+  ceph_assert(test_weight == 0);
+
+  return bucket_weight;
+}
+
+// Return true if p is found while following get_immediate_parent_id()
+// upwards from child, i.e. if p is an ancestor of child.
+bool CrushWrapper::is_parent_of(int child, int p) const
+{
+  int parent = 0;
+  for (int cur = child;
+       get_immediate_parent_id(cur, &parent) == 0;
+       cur = parent) {
+    if (parent == p) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Exchange the contents, weights and names of buckets src and dst.
+// Returns -EINVAL if either id is not an existing bucket or if one bucket
+// is an ancestor of the other; otherwise the result of
+// rebuild_roots_with_classes().
+int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
+{
+  if (src >= 0 || dst >= 0)
+    return -EINVAL;
+  if (!item_exists(src) || !item_exists(dst))
+    return -EINVAL;
+
+  crush_bucket *first = get_bucket(src);
+  crush_bucket *second = get_bucket(dst);
+  if (is_parent_of(first->id, second->id) ||
+      is_parent_of(second->id, first->id)) {
+    return -EINVAL;
+  }
+
+  // swap weights
+  const unsigned first_weight = first->weight;
+  const unsigned second_weight = second->weight;
+  adjust_item_weight(cct, first->id, second_weight);
+  adjust_item_weight(cct, second->id, first_weight);
+
+  // swap items: park first's items, move second's items across, then
+  // give the parked items to second
+  const unsigned first_count = first->size;
+  const unsigned second_count = second->size;
+  map<int,unsigned> parked;  // item -> weight
+  for (unsigned n = 0; n < first_count; ++n) {
+    // always take the head item; the bucket shrinks as we go
+    int item = first->items[0];
+    int itemw = crush_get_bucket_item_weight(first, 0);
+    parked[item] = itemw;
+    bucket_remove_item(first, item);
+  }
+  ceph_assert(first->size == 0);
+  ceph_assert(second->size == second_count);
+  for (unsigned n = 0; n < second_count; ++n) {
+    int item = second->items[0];
+    int itemw = crush_get_bucket_item_weight(second, 0);
+    bucket_remove_item(second, item);
+    bucket_add_item(first, item, itemw);
+  }
+  ceph_assert(first->size == second_count);
+  ceph_assert(second->size == 0);
+  for (auto entry : parked) {
+    bucket_add_item(second, entry.first, entry.second);
+  }
+  ceph_assert(first->size == second_count);
+  ceph_assert(second->size == first_count);
+
+  // swap names
+  swap_names(src, dst);
+  return rebuild_roots_with_classes(cct);
+}
+
+// Insert the existing bucket id at loc under its own name and with its
+// current weight; unlike move_bucket() nothing is detached first.
+// Returns -EINVAL for a non-bucket id, -ENOENT if the bucket does not
+// exist, otherwise the result of insert_item().
+int CrushWrapper::link_bucket(
+  CephContext *cct, int id, const map<string,string>& loc)
+{
+  // only buckets (negative ids) can be linked
+  if (id >= 0) {
+    return -EINVAL;
+  }
+  if (!item_exists(id)) {
+    return -ENOENT;
+  }
+
+  // name and weight of the bucket that is being linked
+  string bucket_name = get_item_name(id);
+  const unsigned bucket_weight = get_bucket(id)->weight;
+
+  return insert_item(cct, id, bucket_weight / (float)0x10000, bucket_name, loc);
+}
+
+// Make sure item is located at loc, creating or moving it as needed.
+//
+// If check_item_loc() says the item is already there, nothing changes and
+// 0 is returned.  Otherwise an item that is still listed in some bucket is
+// unlinked first and keeps its existing weight (the weight argument is
+// only used for items not found in any bucket), and the item is inserted
+// at loc.
+// Returns 1 if the map changed, -EINVAL for an invalid name, or a negative
+// error from insert_item().
+int CrushWrapper::create_or_move_item(
+  CephContext *cct, int item, float weight, string name,
+  const map<string,string>& loc,  // typename -> bucketname
+  bool init_weight_sets)
+{
+  if (!is_valid_crush_name(name))
+    return -EINVAL;
+
+  int current_iweight;  // filled in by check_item_loc; not used here
+  if (check_item_loc(cct, item, loc, &current_iweight)) {
+    ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
+                  << dendl;
+    return 0;
+  }
+
+  if (_search_item_exists(item)) {
+    weight = get_item_weightf(item);
+    ldout(cct, 10) << "create_or_move_item " << item
+                   << " exists with weight " << weight << dendl;
+    remove_item(cct, item, true);
+  }
+  ldout(cct, 5) << "create_or_move_item adding " << item
+                << " weight " << weight
+                << " at " << loc << dendl;
+  const int rc = insert_item(cct, item, weight, name, loc,
+                             item >= 0 && init_weight_sets);
+  return rc == 0 ? 1 /* changed */ : rc;
+}
+
+// Make item match the given weight, name and location.
+//
+// If the item is already at loc, only its weight (compared as 16.16
+// fixed-point integers) and name are updated where they differ.
+// Otherwise an existing item is unlinked and the item is inserted at loc.
+// Returns 1 if anything changed, the (non-negative) result of
+// validate_weightf() if nothing changed, -EINVAL for an invalid name or
+// loc, or a negative error from validate_weightf() / insert_item().
+int CrushWrapper::update_item(
+  CephContext *cct, int item, float weight, string name,
+  const map<string,string>& loc)  // typename -> bucketname
+{
+  ldout(cct, 5) << "update_item item " << item << " weight " << weight
+                << " name " << name << " loc " << loc << dendl;
+
+  if (!is_valid_crush_name(name))
+    return -EINVAL;
+
+  if (!is_valid_crush_loc(cct, loc))
+    return -EINVAL;
+
+  int ret = validate_weightf(weight);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // compare quantized (fixed-point integer) weights!
+  const int iweight = (int)(weight * (float)0x10000);
+  int old_iweight;
+  if (!check_item_loc(cct, item, loc, &old_iweight)) {
+    // not there yet: unlink whatever exists and insert afresh
+    if (item_exists(item)) {
+      remove_item(cct, item, true);
+    }
+    ldout(cct, 5) << "update_item adding " << item << " weight " << weight
+                  << " at " << loc << dendl;
+    ret = insert_item(cct, item, weight, name, loc);
+    return ret == 0 ? 1 /* changed */ : ret;
+  }
+
+  ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
+  if (old_iweight != iweight) {
+    ldout(cct, 5) << "update_item " << item << " adjusting weight "
+                  << ((float)old_iweight/(float)0x10000) << " -> " << weight
+                  << dendl;
+    adjust_item_weight_in_loc(cct, item, iweight, loc);
+    ret = 1;
+  }
+  if (get_item_name(item) != name) {
+    ldout(cct, 5) << "update_item setting " << item << " name to " << name
+                  << dendl;
+    set_item_name(item, name);
+    ret = 1;
+  }
+  return ret;
+}
+
+// Return the weight of id from the first bucket (in slot order) that
+// either is id itself (the bucket's own weight) or lists id as an item
+// (the item's weight in that bucket).  Returns -ENOENT if there is none.
+int CrushWrapper::get_item_weight(int id) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket) {
+      continue;
+    }
+    if (bucket->id == id) {
+      return bucket->weight;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == id) {
+        return crush_get_bucket_item_weight(bucket, pos);
+      }
+    }
+  }
+  return -ENOENT;
+}
+
+// Return the weight id has in the first bucket named by loc (iterating loc
+// in key order) that lists it; entries that do not name an existing bucket
+// are skipped.  Returns -ENOENT if no such bucket lists id.
+int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
+{
+  for (const auto& entry : loc) {
+    const int bid = get_item_id(entry.second);
+    if (!bucket_exists(bid)) {
+      continue;
+    }
+    crush_bucket *bucket = get_bucket(bid);
+    for (unsigned int pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == id) {
+        return crush_get_bucket_item_weight(bucket, pos);
+      }
+    }
+  }
+  return -ENOENT;
+}
+
+// Set the weight of id to weight in every bucket that lists it, by calling
+// adjust_item_weight_in_bucket() on each existing bucket.
+// Returns the number of buckets in which something changed, or -ENOENT if
+// there was none.
+int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight,
+                                     bool update_weight_sets)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight
+                << " update_weight_sets=" << (int)update_weight_sets
+                << dendl;
+  int touched = 0;
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    if (!crush->buckets[slot]) {
+      continue;
+    }
+    // slot n holds the bucket with id -1-n
+    if (adjust_item_weight_in_bucket(cct, id, weight, -1-slot,
+                                     update_weight_sets) > 0) {
+      ++touched;
+    }
+  }
+  return touched ? touched : -ENOENT;
+}
+
+// Set the weight of every occurrence of `id` inside bucket `bucket_id` to
+// `weight`.
+//
+// After each adjusted occurrence the bucket's own (new) weight is pushed
+// to wherever the bucket itself is listed, via adjust_item_weight() with
+// update_weight_sets=false.  Then, for every choose_args map that has a
+// weight_set for this bucket, the per-position sums of the bucket's
+// weight_set entries are handed to choose_args_adjust_item_weight(); this
+// happens whether or not any occurrence was adjusted.
+//
+// Returns the number of occurrences adjusted, or -ENOENT if the bucket
+// does not exist or does not list id.
+int CrushWrapper::adjust_item_weight_in_bucket(
+ CephContext *cct, int id, int weight,
+ int bucket_id,
+ bool update_weight_sets)
+{
+ ldout(cct, 5) << __func__ << " " << id << " weight " << weight
+ << " in bucket " << bucket_id
+ << " update_weight_sets=" << (int)update_weight_sets
+ << dendl;
+ int changed = 0;
+ if (!bucket_exists(bucket_id)) {
+ return -ENOENT;
+ }
+ crush_bucket *b = get_bucket(bucket_id);
+ for (unsigned int i = 0; i < b->size; i++) {
+ if (b->items[i] == id) {
+ int diff = bucket_adjust_item_weight(cct, b, id, weight,
+ update_weight_sets);
+ ldout(cct, 5) << __func__ << " " << id << " diff " << diff
+ << " in bucket " << bucket_id << dendl;
+ // propagate this bucket's weight to the buckets that list it
+ adjust_item_weight(cct, bucket_id, b->weight, false);
+ changed++;
+ }
+ }
+ // update weight-sets so they continue to sum
+ for (auto& p : choose_args) {
+ auto &cmap = p.second;
+ if (!cmap.args) {
+ continue;
+ }
+ // args is indexed by bucket slot, i.e. -1 - bucket id
+ crush_choose_arg *arg = &cmap.args[-1 - bucket_id];
+ if (!arg->weight_set) {
+ continue;
+ }
+ ceph_assert(arg->weight_set_positions > 0);
+ // w[j] accumulates the sum of this bucket's item weights at position j
+ vector<int> w(arg->weight_set_positions);
+ for (unsigned i = 0; i < b->size; ++i) {
+ for (unsigned j = 0; j < arg->weight_set_positions; ++j) {
+ crush_weight_set *weight_set = &arg->weight_set[j];
+ w[j] += weight_set->weights[i];
+ }
+ }
+ ldout(cct,5) << __func__ << " adjusting bucket " << bucket_id
+ << " cmap " << p.first << " weights to " << w << dendl;
+ // ss only receives messages from the callee; it is discarded here
+ ostringstream ss;
+ choose_args_adjust_item_weight(cct, cmap, bucket_id, w, &ss);
+ }
+ if (!changed) {
+ return -ENOENT;
+ }
+ return changed;
+}
+
+// Set the weight of id to weight in each bucket named by loc, skipping
+// entries that do not name an existing bucket.
+// Returns the number of buckets in which something changed, or -ENOENT if
+// there was none.
+int CrushWrapper::adjust_item_weight_in_loc(
+  CephContext *cct, int id, int weight,
+  const map<string,string>& loc,
+  bool update_weight_sets)
+{
+  ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
+                << " in " << loc
+                << " update_weight_sets=" << (int)update_weight_sets
+                << dendl;
+  int touched = 0;
+  for (const auto& entry : loc) {
+    const int bid = get_item_id(entry.second);
+    if (!bucket_exists(bid)) {
+      continue;
+    }
+    if (adjust_item_weight_in_bucket(cct, id, weight, bid,
+                                     update_weight_sets) > 0) {
+      ++touched;
+    }
+  }
+  return touched ? touched : -ENOENT;
+}
+
+// Set the weight of every leaf (id >= 0) found below bucket id to weight.
+//
+// The subtree is walked breadth-first; nested buckets that cannot be
+// looked up are skipped.  Returns the number of leaf entries visited (the
+// result of the individual adjustments is not inspected), or the error
+// carried by the bucket lookup if id is not a valid bucket.
+int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight,
+                                        bool update_weight_sets)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
+  crush_bucket *b = get_bucket(id);
+  if (IS_ERR(b))
+    return PTR_ERR(b);
+  int changed = 0;
+  list<crush_bucket*> q;  // buckets still to be visited
+  q.push_back(b);
+  while (!q.empty()) {
+    b = q.front();
+    q.pop_front();
+    for (unsigned i=0; i<b->size; ++i) {
+      int n = b->items[i];
+      if (n >= 0) {
+        adjust_item_weight_in_bucket(cct, n, weight, b->id, update_weight_sets);
+        ++changed;
+      } else {
+        crush_bucket *sub = get_bucket(n);
+        if (IS_ERR(sub))
+          continue;
+        q.push_back(sub);
+      }
+    }
+  }
+  return changed;
+}
+
+// Return true if any existing bucket lists id among its items.
+bool CrushWrapper::check_item_present(int id) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == id) {
+        // one hit is enough; nothing else depends on the scan
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+// Find the first non-shadow bucket (in slot order) that lists id and
+// return it as a (type name, bucket name) pair.
+// If _ret is non-null it receives 0 on success and -ENOENT when no such
+// bucket exists, in which case an empty pair is returned.
+pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket || is_shadow_item(bucket->id)) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] != id) {
+        continue;
+      }
+      // .at() throws if the bucket has no name or its type is unknown
+      string parent_name = name_map.at(bucket->id);
+      string parent_type = type_map.at(bucket->type);
+      if (_ret) {
+        *_ret = 0;
+      }
+      return make_pair(parent_type, parent_name);
+    }
+  }
+
+  if (_ret) {
+    *_ret = -ENOENT;
+  }
+  return pair<string, string>();
+}
+
+// Store in *parent the id of the first non-shadow bucket (in slot order)
+// that lists id.  Returns 0 on success; returns -ENOENT and leaves *parent
+// untouched if there is no such bucket.
+int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    const crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket || is_shadow_item(bucket->id)) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == id) {
+        *parent = bucket->id;
+        return 0;
+      }
+    }
+  }
+  return -ENOENT;
+}
+
+// Return the id of the ancestor of item whose bucket type is `type`, or 0
+// if none is found.
+//
+// With rule < 0 the immediate-parent chain is followed upwards from item.
+// Otherwise the search is limited to the trees taken by the rule: for each
+// TAKE root, the buckets of the wanted type below it are collected and the
+// first one containing item is returned.
+int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
+{
+  if (rule < 0) {
+    // no rule specified: climb until a bucket of the wanted type shows up
+    int cur = item;
+    do {
+      if (get_immediate_parent_id(cur, &cur) < 0) {
+        return 0;
+      }
+    } while (get_bucket_type(cur) != type);
+    return cur;
+  }
+
+  set<int> takes;
+  find_takes_by_rule(rule, &takes);
+  for (int take_root : takes) {
+    vector<int> of_type;
+    get_children_of_type(take_root, type, &of_type, false);
+    for (int bucket_id : of_type) {
+      // This relies on no two different buckets from a single crush rule
+      // sharing the same device, which should generally be true.
+      if (subtree_contains(bucket_id, item)) {
+        return bucket_id;
+      }
+    }
+  }
+  return 0;  // not found
+}
+
+// Append to *subtrees the buckets of the given type found below every root
+// reported by find_roots(); roots that cannot be looked up are skipped.
+void CrushWrapper::get_subtree_of_type(int type, vector<int> *subtrees)
+{
+  set<int> all_roots;
+  find_roots(&all_roots);
+  for (int root_id : all_roots) {
+    crush_bucket *root = get_bucket(root_id);
+    if (!IS_ERR(root)) {
+      get_children_of_type(root->id, type, subtrees);
+    }
+  }
+}
+
+// Return true if some rule has a TAKE step whose argument is the bucket id
+// that class_bucket records for class_id under some bucket.
+// When that is the case and ss is non-null, a message naming the
+// referencing rules is written to *ss.
+bool CrushWrapper::class_is_in_use(int class_id, ostream *ss)
+{
+  list<unsigned> referencing;
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule) {
+      continue;
+    }
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE) {
+        continue;
+      }
+      const int root = rule->steps[s].arg1;
+      for (auto &entry : class_bucket) {
+        auto& by_class = entry.second;
+        const auto hit = by_class.find(class_id);
+        if (hit != by_class.end() && hit->second == root) {
+          referencing.push_back(ruleno);
+        }
+      }
+    }
+  }
+
+  if (referencing.empty()) {
+    return false;
+  }
+  if (ss) {
+    ostringstream os;
+    for (auto &ruleno : referencing) {
+      os << "'" << get_rule_name(ruleno) <<"',";
+    }
+    string out(os.str());
+    out.resize(out.size() - 1); // drop last ','
+    *ss << "still referenced by crush_rule(s): " << out;
+  }
+  return true;
+}
+
+// Rename device class `srcname` to `dstname`.
+//
+// Returns -ENOENT when the source class is unknown and -EEXIST when the
+// destination name is already taken.  Otherwise every negative-id entry
+// of class_map bound to the class has the part after '~' in its name
+// replaced, both class name maps are updated, and 0 is returned.
+int CrushWrapper::rename_class(const string& srcname, const string& dstname)
+{
+  auto src = class_rname.find(srcname);
+  if (src == class_rname.end())
+    return -ENOENT;
+  if (class_rname.find(dstname) != class_rname.end())
+    return -EEXIST;
+
+  const int class_id = src->second;
+  ceph_assert(class_name.count(class_id));
+
+  // rename any shadow buckets of old class name
+  for (auto &it: class_map) {
+    if (it.first >= 0 || it.second != class_id)
+      continue;
+    const string old_name = get_item_name(it.first);
+    const size_t pos = old_name.find("~");
+    ceph_assert(pos != string::npos);
+    const string name_no_class = old_name.substr(0, pos);
+    const string old_class_name = old_name.substr(pos + 1);
+    ceph_assert(old_class_name == srcname);
+    // name_map is written directly rather than through set_item_name
+    // because the name is intentionally invalid
+    name_map[it.first] = name_no_class + "~" + dstname;
+    have_rmaps = false;
+  }
+
+  // rename class
+  class_rname.erase(srcname);
+  class_name.erase(class_id);
+  class_rname[dstname] = class_id;
+  class_name[class_id] = dstname;
+  return 0;
+}
+
+// Call device_class_clone() for every (negative-id non-shadow root,
+// class in class_name) pair.
+//
+// `old_class_bucket` supplies the shadow ids used previously; they are
+// collected and handed to device_class_clone() together with the map
+// itself.  Returns the first negative result from device_class_clone(),
+// or 0.
+int CrushWrapper::populate_classes(
+  const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
+{
+  // build set of previous used shadow ids
+  set<int32_t> used_ids;
+  for (auto& per_bucket : old_class_bucket)
+    for (auto& per_class : per_bucket.second)
+      used_ids.insert(per_class.second);
+
+  // accumulate weight values for each carg and bucket as we go.  because
+  // the cloning is depth first, the nested bucket weights are available
+  // by the time the containing buckets are finished.
+  map<int,map<int,vector<int>>> cmap_item_weight; // cargs -> bno -> [bucket weight for each position]
+
+  set<int> roots;
+  find_nonshadow_roots(&roots);
+  for (auto &root : roots) {
+    if (root >= 0)
+      continue;
+    for (auto &cls : class_name) {
+      int shadow_id;
+      const int rc = device_class_clone(root, cls.first, old_class_bucket,
+                                        used_ids, &shadow_id,
+                                        &cmap_item_weight);
+      if (rc < 0)
+        return rc;
+    }
+  }
+  return 0;
+}
+
+// Call remove_root() on every negative id reported by
+// find_shadow_roots().  Stops at, and returns, the first non-zero
+// result; returns 0 otherwise.
+int CrushWrapper::trim_roots_with_class(CephContext *cct)
+{
+  set<int> shadow_roots;
+  find_shadow_roots(&shadow_roots);
+  for (auto &root : shadow_roots) {
+    if (root >= 0)
+      continue;
+    const int rc = remove_root(cct, root);
+    if (rc != 0)
+      return rc;
+  }
+  // no reweight is needed: removal only happens from the root downwards
+  return 0;
+}
+
+// Pick an id that is not currently a key of class_name.
+//
+// Normally this is (largest id in use) + 1.  When that value would not
+// be a valid non-negative int32_t, a random starting point is chosen
+// and the non-negative range is scanned for a free id; aborts if every
+// id is taken.
+//
+// The arithmetic is written with explicit comparisons against INT32_MAX
+// instead of incrementing and testing for a negative result, because
+// signed overflow is undefined behaviour.
+int32_t CrushWrapper::_alloc_class_id() const {
+  if (class_name.empty()) {
+    return 0;
+  }
+  const int32_t max_id = numeric_limits<int32_t>::max();
+  const int32_t highest = class_name.rbegin()->first;
+  // highest + 1 is representable and non-negative exactly in this range
+  if (highest >= -1 && highest < max_id) {
+    return highest + 1;
+  }
+  // wrapped, pick a random start and do exhaustive search
+  const uint32_t upperlimit = static_cast<uint32_t>(max_id) + 1;
+  const int32_t start = static_cast<int32_t>(rand() % upperlimit);
+  int32_t class_id = start;
+  do {
+    if (!class_name.count(class_id)) {
+      return class_id;
+    }
+    // advance, wrapping from INT32_MAX back to 0
+    class_id = (class_id == max_id) ? 0 : class_id + 1;
+  } while (class_id != start);
+  ceph_abort_msg("no available class id");
+}
+
+// Bind every device found below the bucket named `subtree` to
+// `new_class` (looked up via get_or_create_class_id()).
+//
+// Returns -ENOENT when the name is unknown, the error encoded by
+// get_bucket() when a bucket on the way cannot be fetched, and 0 on
+// success.
+int CrushWrapper::set_subtree_class(
+  const string& subtree,
+  const string& new_class)
+{
+  if (!name_exists(subtree))
+    return -ENOENT;
+
+  const int new_class_id = get_or_create_class_id(new_class);
+
+  // worklist of buckets still to be visited
+  list<int> pending;
+  pending.push_back(get_item_id(subtree));
+  while (!pending.empty()) {
+    const int bucket_id = pending.front();
+    pending.pop_front();
+    crush_bucket *bucket = get_bucket(bucket_id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      const int child = bucket->items[pos];
+      if (child < 0)
+        pending.push_back(child);      // nested bucket: visit later
+      else
+        class_map[child] = new_class_id;
+    }
+  }
+  return 0;
+}
+
+// Rewrite parts of the hierarchy so that they are expressed through
+// device classes.
+//
+// classify_root:   root name -> class.  Every bucket below the root is
+//   given a new id; the old id is reused for an empty bucket named
+//   "<name>~<class>" and recorded in class_bucket as the class shadow of
+//   the renumbered bucket.  Fails with -EINVAL if a rule already takes
+//   a class of that root.
+// classify_bucket: pattern -> (class, default parent).  A pattern is
+//   "%suffix", "prefix%" or an exact name.  The items of each matching
+//   bucket are moved into a base bucket (looked up by name or created),
+//   the devices directly in the matching bucket are bound to the class,
+//   and the matching bucket becomes the class shadow of the base bucket.
+//
+// Progress and most errors are reported on `out` (some diagnostics go
+// to stdout).  Returns 0 on success or a negative errno.
+int CrushWrapper::reclassify(
+  CephContext *cct,
+  ostream& out,
+  const map<string,string>& classify_root,
+  const map<string,pair<string,string>>& classify_bucket
+  )
+{
+  // classify_root
+  for (auto& i : classify_root) {
+    string root = i.first;
+    if (!name_exists(root)) {
+      out << "root " << root << " does not exist" << std::endl;
+      return -EINVAL;
+    }
+    int root_id = get_item_id(root);
+    string new_class = i.second;
+    int new_class_id = get_or_create_class_id(new_class);
+    out << "classify_root " << root << " (" << root_id
+        << ") as " << new_class << std::endl;
+
+    // validate rules: no rule may already take a class of this root
+    for (unsigned j = 0; j < crush->max_rules; j++) {
+      if (crush->rules[j]) {
+        auto rule = crush->rules[j];
+        for (unsigned k = 0; k < rule->len; ++k) {
+          if (rule->steps[k].op == CRUSH_RULE_TAKE) {
+            int step_item = get_rule_arg1(j, k);
+            int original_item;
+            int c;
+            int res = split_id_class(step_item, &original_item, &c);
+            if (res < 0)
+              return res;
+            if (c >= 0) {
+              if (original_item == root_id) {
+                out << " rule " << j << " includes take on root "
+                    << root << " class " << c << std::endl;
+                return -EINVAL;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // rebuild new buckets for root
+    map<int,int> renumber;
+    list<int> q;
+    q.push_back(root_id);
+    while (!q.empty()) {
+      int id = q.front();
+      q.pop_front();
+      crush_bucket *bucket = get_bucket(id);
+      if (IS_ERR(bucket)) {
+        out << "cannot find bucket " << id
+            << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
+        return PTR_ERR(bucket);
+      }
+
+      // move bucket: it gets a fresh id, and an empty bucket takes over
+      // the old id
+      int new_id = get_new_bucket_id();
+      out << " renumbering bucket " << id << " -> " << new_id << std::endl;
+      renumber[id] = new_id;
+      crush->buckets[-1-new_id] = bucket;
+      bucket->id = new_id;
+      crush->buckets[-1-id] = crush_make_bucket(crush,
+                                                bucket->alg,
+                                                bucket->hash,
+                                                bucket->type,
+                                                0, NULL, NULL);
+      crush->buckets[-1-id]->id = id;
+      // the choose_args entry follows the bucket to its new slot
+      for (auto& i : choose_args) {
+        i.second.args[-1-new_id] = i.second.args[-1-id];
+        memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0]));
+      }
+      class_bucket.erase(id);
+      class_bucket[new_id][new_class_id] = id;
+      name_map[new_id] = string(get_item_name(id));
+      name_map[id] = string(get_item_name(id)) + "~" + new_class;
+
+      for (unsigned j = 0; j < bucket->size; ++j) {
+        if (bucket->items[j] < 0) {
+          q.push_front(bucket->items[j]);
+        } else {
+          // we don't reclassify the device here; if the users wants that,
+          // they can pass --set-subtree-class separately.
+        }
+      }
+    }
+
+    // point every reference to a renumbered bucket at its new id
+    for (int i = 0; i < crush->max_buckets; ++i) {
+      crush_bucket *b = crush->buckets[i];
+      if (!b) {
+        continue;
+      }
+      for (unsigned j = 0; j < b->size; ++j) {
+        if (renumber.count(b->items[j])) {
+          b->items[j] = renumber[b->items[j]];
+        }
+      }
+    }
+
+    int r = rebuild_roots_with_classes(cct);
+    if (r < 0) {
+      out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+          << std::endl;
+      return r;
+    }
+  }
+
+  // classify_bucket
+  map<int,int> send_to; // source bucket -> dest bucket
+  map<int,map<int,int>> new_class_bucket;
+  map<int,string> new_bucket_names;
+  map<int,map<string,string>> new_buckets;
+  map<string,int> new_bucket_by_name;
+  for (auto& i : classify_bucket) {
+    const string& match = i.first; // prefix% or %suffix
+    const string& new_class = i.second.first;
+    const string& default_parent = i.second.second;
+    if (!name_exists(default_parent)) {
+      out << "default parent " << default_parent << " does not exist"
+          << std::endl;
+      return -EINVAL;
+    }
+    int default_parent_id = get_item_id(default_parent);
+    crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
+    // the file checks get_bucket() failures with IS_ERR, so a plain
+    // null check would not catch a default parent that is not a bucket
+    if (IS_ERR(default_parent_bucket)) {
+      out << "default parent " << default_parent << " is not a bucket: "
+          << cpp_strerror(PTR_ERR(default_parent_bucket)) << std::endl;
+      return -EINVAL;
+    }
+    string default_parent_type_name = get_type_name(default_parent_bucket->type);
+
+    out << "classify_bucket " << match << " as " << new_class
+        << " default bucket " << default_parent
+        << " (" << default_parent_type_name << ")" << std::endl;
+
+    int new_class_id = get_or_create_class_id(new_class);
+    for (int j = 0; j < crush->max_buckets; ++j) {
+      crush_bucket *b = crush->buckets[j];
+      if (!b || is_shadow_item(b->id)) {
+        continue;
+      }
+      string name = get_item_name(b->id);
+      if (name.length() < match.length()) {
+        continue;
+      }
+      string basename;
+      if (match[0] == '%') {
+        // "%suffix": basename is the name with the suffix removed
+        if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
+          continue;
+        }
+        basename = name.substr(0, name.size() - match.size() + 1);
+      } else if (!match.empty() && match[match.size() - 1] == '%') {
+        // "prefix%": basename is the name with the prefix removed
+        if (match.substr(0, match.size() - 1) !=
+            name.substr(0, match.size() - 1)) {
+          continue;
+        }
+        basename = name.substr(match.size() - 1);
+      } else if (match == name) {
+        basename = default_parent;
+      } else {
+        continue;
+      }
+      cout << "match " << match << " to " << name << " basename " << basename
+           << std::endl;
+      // look up or create basename bucket
+      int base_id;
+      if (name_exists(basename)) {
+        base_id = get_item_id(basename);
+        cout << " have base " << base_id << std::endl;
+      } else if (new_bucket_by_name.count(basename)) {
+        base_id = new_bucket_by_name[basename];
+        cout << " already creating base " << base_id << std::endl;
+      } else {
+        base_id = get_new_bucket_id();
+        crush->buckets[-1-base_id] = crush_make_bucket(crush,
+                                                       b->alg,
+                                                       b->hash,
+                                                       b->type,
+                                                       0, NULL, NULL);
+        crush->buckets[-1-base_id]->id = base_id;
+        name_map[base_id] = basename;
+        new_bucket_by_name[basename] = base_id;
+        cout << " created base " << base_id << std::endl;
+
+        new_buckets[base_id][default_parent_type_name] = default_parent;
+      }
+      send_to[b->id] = base_id;
+      new_class_bucket[base_id][new_class_id] = b->id;
+      new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
+
+      // make sure devices are classified
+      for (unsigned pos = 0; pos < b->size; ++pos) {
+        int item = b->items[pos];
+        if (item >= 0) {
+          class_map[item] = new_class_id;
+        }
+      }
+    }
+  }
+
+  // no name_exists() works below,
+  have_rmaps = false;
+
+  // copy items around
+  set<int> roots;
+  find_roots(&roots);
+  for (auto& i : send_to) {
+    crush_bucket *from = get_bucket(i.first);
+    crush_bucket *to = get_bucket(i.second);
+    // the destination id came from a name lookup and may not be a bucket
+    if (IS_ERR(from) || IS_ERR(to)) {
+      out << "cannot find bucket " << (IS_ERR(from) ? i.first : i.second)
+          << std::endl;
+      return -EINVAL;
+    }
+    cout << "moving items from " << from->id << " (" << get_item_name(from->id)
+         << ") to " << to->id << " (" << get_item_name(to->id) << ")"
+         << std::endl;
+    for (unsigned j = 0; j < from->size; ++j) {
+      int item = from->items[j];
+      int r;
+      map<string,string> to_loc;
+      to_loc[get_type_name(to->type)] = get_item_name(to->id);
+      if (item >= 0) {
+        if (subtree_contains(to->id, item)) {
+          continue;
+        }
+        map<string,string> from_loc;
+        from_loc[get_type_name(from->type)] = get_item_name(from->id);
+        auto w = get_item_weightf_in_loc(item, from_loc);
+        r = insert_item(cct, item,
+                        w,
+                        get_item_name(item),
+                        to_loc);
+      } else {
+        if (!send_to.count(item)) {
+          lderr(cct) << "item " << item << " in bucket " << from->id
+                     << " is not also a reclassified bucket" << dendl;
+          return -EINVAL;
+        }
+        int newitem = send_to[item];
+        if (subtree_contains(to->id, newitem)) {
+          continue;
+        }
+        r = link_bucket(cct, newitem, to_loc);
+      }
+      if (r != 0) {
+        cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+             << std::endl;
+        return r;
+      }
+    }
+  }
+
+  // make sure new buckets have parents
+  for (auto& i : new_buckets) {
+    int parent;
+    if (get_immediate_parent_id(i.first, &parent) < 0) {
+      cout << "new bucket " << i.first << " missing parent, adding at "
+           << i.second << std::endl;
+      int r = link_bucket(cct, i.first, i.second);
+      if (r != 0) {
+        cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+             << std::endl;
+        return r;
+      }
+    }
+  }
+
+  // set class mappings
+  for (auto& i : new_class_bucket) {
+    for (auto& j : i.second) {
+      class_bucket[i.first][j.first] = j.second;
+    }
+  }
+  for (auto& i : new_bucket_names) {
+    name_map[i.first] = i.second;
+  }
+
+  int r = rebuild_roots_with_classes(cct);
+  if (r < 0) {
+    out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+        << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Return the id (negative) of the first unused bucket slot, growing
+// crush->buckets and every choose_args array by one entry when all
+// existing slots are occupied.
+//
+// A newly added bucket slot is set to null and newly added choose_args
+// entries are zeroed, so callers never see uninitialized memory.  The
+// caller is expected to store a bucket in the returned slot.
+int CrushWrapper::get_new_bucket_id()
+{
+  int id = -1;
+  // bounds check comes first so the array is never read past its end
+  while (-1-id < crush->max_buckets &&
+         crush->buckets[-1-id]) {
+    id--;
+  }
+  if (-1-id == crush->max_buckets) {
+    ++crush->max_buckets;
+    crush->buckets = (struct crush_bucket**)realloc(
+      crush->buckets,
+      sizeof(crush->buckets[0]) * crush->max_buckets);
+    ceph_assert(crush->buckets);
+    crush->buckets[-1-id] = NULL;
+    for (auto& i : choose_args) {
+      assert(i.second.size == (__u32)crush->max_buckets - 1);
+      ++i.second.size;
+      i.second.args = (struct crush_choose_arg*)realloc(
+        i.second.args,
+        sizeof(i.second.args[0]) * i.second.size);
+      ceph_assert(i.second.args);
+      // realloc does not initialize the added tail entry
+      memset(&i.second.args[i.second.size - 1], 0,
+             sizeof(i.second.args[0]));
+    }
+  }
+  return id;
+}
+
+// Recompute weights for every negative-id non-shadow root: first the
+// bucket weights via crush_reweight_bucket(), then each choose_args
+// weight set via reweight_bucket().  Finishes by calling
+// rebuild_roots_with_classes().  Asserts that both steps succeed.
+void CrushWrapper::reweight(CephContext *cct)
+{
+  set<int> roots;
+  find_nonshadow_roots(&roots);
+  for (auto id : roots) {
+    if (id < 0) {
+      crush_bucket *root = get_bucket(id);
+      ldout(cct, 5) << "reweight root bucket " << id << dendl;
+      int r = crush_reweight_bucket(crush, root);
+      ceph_assert(r == 0);
+
+      for (auto& carg : choose_args) {
+        vector<uint32_t> top_level; // discard top-level weights
+        reweight_bucket(root, carg.second, &top_level);
+      }
+    }
+  }
+  int r = rebuild_roots_with_classes(cct);
+  ceph_assert(r == 0);
+}
+
+// Recursively recompute the choose_args weight-set totals of bucket `b`.
+//
+// On return (*weightv)[pos] holds, for each weight-set position of `b`,
+// the sum of the weights of everything below `b`.  For every child that
+// is itself a bucket, the child's total for a position is also written
+// back into `b`'s weight set at that position.
+//
+// Each position reads and writes its own weight set
+// (weight_set[pos]); indexing weight_set[0] for all positions would
+// mix the positions together.
+void CrushWrapper::reweight_bucket(
+  crush_bucket *b,
+  crush_choose_arg_map& arg_map,
+  vector<uint32_t> *weightv)
+{
+  int idx = -1 - b->id;
+  unsigned npos = arg_map.args[idx].weight_set_positions;
+  weightv->resize(npos);
+  for (unsigned i = 0; i < b->size; ++i) {
+    int item = b->items[i];
+    if (item >= 0) {
+      for (unsigned pos = 0; pos < npos; ++pos) {
+        (*weightv)[pos] += arg_map.args[idx].weight_set[pos].weights[i];
+      }
+    } else {
+      vector<uint32_t> subw(npos);
+      crush_bucket *sub = get_bucket(item);
+      // get_bucket() failures are checked with IS_ERR elsewhere in this
+      // file, so a null test would not catch them
+      ceph_assert(!IS_ERR(sub));
+      reweight_bucket(sub, arg_map, &subw);
+      // the recursive call resizes subw to the child's position count;
+      // make sure it can be indexed up to npos again
+      subw.resize(npos);
+      for (unsigned pos = 0; pos < npos; ++pos) {
+        (*weightv)[pos] += subw[pos];
+        // stash the real bucket weight as the weight for this reference
+        arg_map.args[idx].weight_set[pos].weights[i] = subw[pos];
+      }
+    }
+  }
+}
+
+// Create a rule of the form "take <root>; choose[leaf] <mode> N [type
+// <failure domain>]; emit" and register it under `name`.
+//
+// name                 rule name; must not exist yet
+// root_name            item the rule takes from; must exist
+// failure_domain_name  bucket type to choose leaves across; when empty
+//                      (type 0) a plain CHOOSE step is used
+// device_class         optional; when set the rule takes from the
+//                      class_bucket shadow of the root for that class
+// mode                 "firstn" or "indep"
+// rule_type            passed through to crush_make_rule()
+// rno                  rule number to use, or negative to pick the first
+//                      number with no rule and no ruleset
+// err                  optional stream receiving a description of the
+//                      failure
+//
+// Returns the rule number on success, or a negative errno (-EEXIST,
+// -ENOENT, -EINVAL, or the crush_add_rule() error).
+int CrushWrapper::add_simple_rule_at(
+  string name, string root_name,
+  string failure_domain_name,
+  string device_class,
+  string mode, int rule_type,
+  int rno,
+  ostream *err)
+{
+  if (rule_exists(name)) {
+    if (err)
+      *err << "rule " << name << " exists";
+    return -EEXIST;
+  }
+  if (rno >= 0) {
+    if (rule_exists(rno)) {
+      if (err)
+        *err << "rule with ruleno " << rno << " exists";
+      return -EEXIST;
+    }
+    if (ruleset_exists(rno)) {
+      if (err)
+        *err << "ruleset " << rno << " exists";
+      return -EEXIST;
+    }
+  } else {
+    for (rno = 0; rno < get_max_rules(); rno++) {
+      if (!rule_exists(rno) && !ruleset_exists(rno))
+        break;
+    }
+  }
+  if (!name_exists(root_name)) {
+    if (err)
+      *err << "root item " << root_name << " does not exist";
+    return -ENOENT;
+  }
+  int root = get_item_id(root_name);
+  int type = 0;
+  if (failure_domain_name.length()) {
+    type = get_type_id(failure_domain_name);
+    if (type < 0) {
+      if (err)
+        *err << "unknown type " << failure_domain_name;
+      return -EINVAL;
+    }
+  }
+  if (device_class.size()) {
+    if (!class_exists(device_class)) {
+      if (err)
+        *err << "device class " << device_class << " does not exist";
+      return -EINVAL;
+    }
+    int c = get_class_id(device_class);
+    if (class_bucket.count(root) == 0 ||
+        class_bucket[root].count(c) == 0) {
+      if (err)
+        *err << "root " << root_name << " has no devices with class "
+             << device_class;
+      return -EINVAL;
+    }
+    // take from the per-class shadow of the root instead
+    root = class_bucket[root][c];
+  }
+  if (mode != "firstn" && mode != "indep") {
+    if (err)
+      *err << "unknown mode " << mode;
+    return -EINVAL;
+  }
+
+  // indep rules carry two extra "set tries" steps
+  int steps = 3;
+  if (mode == "indep")
+    steps = 5;
+  int min_rep = mode == "firstn" ? 1 : 3;
+  int max_rep = mode == "firstn" ? 10 : 20;
+  //set the ruleset the same as rule_id(rno)
+  crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
+  ceph_assert(rule);
+  int step = 0;
+  if (mode == "indep") {
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+  }
+  crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
+  if (type)
+    crush_rule_set_step(rule, step++,
+                        mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
+                        CRUSH_RULE_CHOOSELEAF_INDEP,
+                        CRUSH_CHOOSE_N,
+                        type);
+  else
+    crush_rule_set_step(rule, step++,
+                        mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
+                        CRUSH_RULE_CHOOSE_INDEP,
+                        CRUSH_CHOOSE_N,
+                        0);
+  crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+
+  int ret = crush_add_rule(crush, rule, rno);
+  if (ret < 0) {
+    // err is optional on this path too
+    if (err)
+      *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
+    // the map did not take ownership of the rule; release it
+    crush_destroy_rule(rule);
+    return ret;
+  }
+  set_rule_name(rno, name);
+  have_rmaps = false;
+  return rno;
+}
+
+// Convenience form of add_simple_rule_at() that passes -1 as the rule
+// number.  All other arguments and the return value are forwarded
+// unchanged.
+int CrushWrapper::add_simple_rule(
+  string name, string root_name,
+  string failure_domain_name,
+  string device_class,
+  string mode, int rule_type,
+  ostream *err)
+{
+  const int unspecified_rno = -1;
+  return add_simple_rule_at(name, root_name, failure_domain_name,
+                            device_class, mode, rule_type,
+                            unspecified_rno, err);
+}
+
+// Walk the hierarchy below bucket `root` breadth first, storing each
+// device's weight (as reported by crush_get_bucket_item_weight() for
+// its position in the containing bucket) in *pmap.
+//
+// Returns the sum of all device weights visited.  Asserts that every
+// bucket id encountered has a bucket in crush->buckets.
+float CrushWrapper::_get_take_weight_osd_map(int root,
+                                             map<int,float> *pmap) const
+{
+  float total = 0.0;
+  list<int> pending;
+  pending.push_back(root);
+  while (!pending.empty()) {
+    const int bno = pending.front();
+    pending.pop_front();
+    crush_bucket *b = crush->buckets[-1-bno];
+    ceph_assert(b);
+    for (unsigned pos = 0; pos < b->size; ++pos) {
+      const int child = b->items[pos];
+      if (child < 0) {
+        // nested bucket: expand it later
+        pending.push_back(child);
+        continue;
+      }
+      // non-negative ids are devices
+      const float w = crush_get_bucket_item_weight(b, pos);
+      (*pmap)[child] = w;
+      total += w;
+    }
+  }
+  return total;
+}
+
+// Add each entry of `m`, divided by `sum`, into *pmap: keys missing
+// from *pmap are created with the quotient, existing keys have it
+// added to their value.
+void CrushWrapper::_normalize_weight_map(float sum,
+                                         const map<int,float>& m,
+                                         map<int,float> *pmap) const
+{
+  for (auto& entry : m) {
+    const float share = entry.second / sum;
+    // insert() leaves an existing value alone and reports that through
+    // .second, in which case the share is accumulated instead
+    auto res = pmap->insert(make_pair(entry.first, share));
+    if (!res.second)
+      res.first->second += share;
+  }
+}
+
+// Collect the device weights below `root`, divide them by their sum and
+// merge the result into *pmap.  Always returns 0.
+int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
+{
+  map<int,float> raw_weights;
+  const float total = _get_take_weight_osd_map(root, &raw_weights);
+  _normalize_weight_map(total, raw_weights, pmap);
+  return 0;
+}
+
+// Build a normalized weight map for every TAKE step of rule `ruleno`
+// and merge them all into *pmap.  A TAKE of a device contributes that
+// device with weight 1.0; a TAKE of a bucket contributes the devices
+// below it.
+//
+// Returns -ENOENT when the rule does not exist, 0 otherwise.
+//
+// FIXME: takes that place a different number of objects are not
+// weighted differently.  (Doing this right is also a function of the
+// pool, since the crush rule might choose 2 + choose 2 but the pool
+// size may only be 3.)
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
+                                          map<int,float> *pmap) const
+{
+  if (ruleno >= crush->max_rules || crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_rule *rule = crush->rules[ruleno];
+
+  for (unsigned step = 0; step < rule->len; ++step) {
+    // other steps contribute nothing to the map
+    if (rule->steps[step].op != CRUSH_RULE_TAKE)
+      continue;
+    map<int,float> step_weights;
+    float sum = 0;
+    const int taken = rule->steps[step].arg1;
+    if (taken >= 0) {
+      step_weights[taken] = 1.0;
+      sum = 1.0;
+    } else {
+      sum += _get_take_weight_osd_map(taken, &step_weights);
+    }
+    _normalize_weight_map(sum, step_weights, pmap);
+  }
+
+  return 0;
+}
+
+// Destroy rule `ruleno`, drop its name and call
+// rebuild_roots_with_classes().
+//
+// Returns -ENOENT when `ruleno` is negative, beyond max_rules, or names
+// an empty slot; otherwise the result of rebuild_roots_with_classes().
+int CrushWrapper::remove_rule(int ruleno)
+{
+  // a negative number would index before the start of crush->rules
+  if (ruleno < 0 || ruleno >= (int)crush->max_rules)
+    return -ENOENT;
+  if (crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_destroy_rule(crush->rules[ruleno]);
+  crush->rules[ruleno] = NULL;
+  rule_name_map.erase(ruleno);
+  have_rmaps = false;
+  return rebuild_roots_with_classes(nullptr);
+}
+
+// Set the weight of `item` inside `bucket`.
+//
+// When `adjust_weight_sets` is true, the item's entry in every
+// choose_args weight set of the bucket is overwritten with `weight` as
+// well; in that case the item must be present in the bucket (asserted).
+// Returns the result of crush_bucket_adjust_item_weight().
+int CrushWrapper::bucket_adjust_item_weight(
+  CephContext *cct, crush_bucket *bucket, int item, int weight,
+  bool adjust_weight_sets)
+{
+  if (!adjust_weight_sets)
+    return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
+
+  // locate the item's index within the bucket
+  unsigned position = 0;
+  while (position < bucket->size && bucket->items[position] != item)
+    ++position;
+  ceph_assert(position != bucket->size);
+
+  for (auto &w : choose_args) {
+    crush_choose_arg &bucket_arg = w.second.args[-1-bucket->id];
+    for (__u32 j = 0; j < bucket_arg.weight_set_positions; j++)
+      bucket_arg.weight_set[j].weights[position] = weight;
+  }
+  return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
+}
+
+// Create a bucket and add it to the map.
+//
+// bucketno  id to request from crush_add_bucket()
+// alg       bucket algorithm; 0 selects get_default_bucket_alg()
+// hash/type passed through to crush_make_bucket()
+// size      number of entries in `items` / `weights`
+// idout     required; receives the id of the new bucket
+//
+// Every choose_args map is grown to crush->max_buckets entries, and for
+// a non-empty bucket each map gets one weight set per position,
+// initialized from `weights`.
+//
+// Returns -EINVAL when no algorithm can be determined, a negative
+// crush_add_bucket() error, or the crush_add_bucket() result.
+int CrushWrapper::add_bucket(
+  int bucketno, int alg, int hash, int type, int size,
+  int *items, int *weights, int *idout)
+{
+  if (alg == 0) {
+    alg = get_default_bucket_alg();
+    if (alg == 0)
+      return -EINVAL;
+  }
+  crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
+                                      weights);
+  ceph_assert(b);
+  ceph_assert(idout);
+  int r = crush_add_bucket(crush, bucketno, b, idout);
+  if (r < 0) {
+    // *idout cannot be trusted after a failure, so do not derive an
+    // array index from it.
+    // NOTE(review): b is not released on this path -- confirm ownership.
+    return r;
+  }
+  int pos = -1 - *idout;
+  for (auto& p : choose_args) {
+    crush_choose_arg_map& cmap = p.second;
+    unsigned new_size = crush->max_buckets;
+    if (cmap.args) {
+      if ((int)cmap.size < crush->max_buckets) {
+        cmap.args = static_cast<crush_choose_arg*>(realloc(
+          cmap.args,
+          sizeof(crush_choose_arg) * new_size));
+        ceph_assert(cmap.args);
+        // zero only the entries that were just added
+        memset(&cmap.args[cmap.size], 0,
+               sizeof(crush_choose_arg) * (new_size - cmap.size));
+        cmap.size = new_size;
+      }
+    } else {
+      cmap.args = static_cast<crush_choose_arg*>(calloc(new_size,
+                                                 sizeof(crush_choose_arg)));
+      ceph_assert(cmap.args);
+      cmap.size = new_size;
+    }
+    if (size > 0) {
+      int positions = get_choose_args_positions(cmap);
+      crush_choose_arg& carg = cmap.args[pos];
+      // one weight set per position; sizing this array by the number of
+      // items would overflow whenever positions > size
+      carg.weight_set = static_cast<crush_weight_set*>(calloc(positions,
+                                                       sizeof(crush_weight_set)));
+      carg.weight_set_positions = positions;
+      for (int ppos = 0; ppos < positions; ++ppos) {
+        carg.weight_set[ppos].weights = (__u32*)calloc(size, sizeof(__u32));
+        carg.weight_set[ppos].size = size;
+        for (int bpos = 0; bpos < size; ++bpos) {
+          carg.weight_set[ppos].weights[bpos] = weights[bpos];
+        }
+      }
+    }
+    assert(crush->max_buckets == (int)cmap.size);
+  }
+  return r;
+}
+
+// Append `item` with `weight` to `bucket` and extend the bucket's entry
+// in every choose_args map to match: each weight set gains `weight` at
+// the end, and a non-empty ids array gains `item`.
+//
+// Returns the negative error from crush_bucket_add_item(), or 0.
+int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
+{
+  // size the bucket has once the item is in; taken before the call
+  const __u32 new_size = bucket->size + 1;
+  const int r = crush_bucket_add_item(crush, bucket, item, weight);
+  if (r < 0)
+    return r;
+
+  for (auto &w : choose_args) {
+    crush_choose_arg *arg = &w.second.args[-1-bucket->id];
+    for (__u32 j = 0; j < arg->weight_set_positions; j++) {
+      crush_weight_set *weight_set = &arg->weight_set[j];
+      weight_set->weights = (__u32*)realloc(weight_set->weights,
+                                            new_size * sizeof(__u32));
+      ceph_assert(weight_set->size + 1 == new_size);
+      weight_set->weights[weight_set->size] = weight;
+      weight_set->size = new_size;
+    }
+    // an empty ids array is left empty
+    if (!arg->ids_size)
+      continue;
+    arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
+    ceph_assert(arg->ids_size + 1 == new_size);
+    arg->ids[arg->ids_size] = item;
+    arg->ids_size = new_size;
+  }
+  return 0;
+}
+
+// Remove `item` from `bucket` and drop the matching slot from the
+// bucket's entry in every choose_args map (each weight set, and the ids
+// array when it is non-empty).  Arrays that become empty are freed and
+// set to NULL.
+//
+// The item must be present in the bucket (asserted).  Returns the
+// negative error from crush_bucket_remove_item(), or 0.
+int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
+{
+  const __u32 new_size = bucket->size - 1;
+
+  // remember where the item sat before it is removed
+  unsigned position = 0;
+  while (position < bucket->size && bucket->items[position] != item)
+    ++position;
+  ceph_assert(position != bucket->size);
+
+  const int r = crush_bucket_remove_item(crush, bucket, item);
+  if (r < 0)
+    return r;
+
+  for (auto &w : choose_args) {
+    crush_choose_arg *arg = &w.second.args[-1-bucket->id];
+    for (__u32 j = 0; j < arg->weight_set_positions; j++) {
+      crush_weight_set *weight_set = &arg->weight_set[j];
+      ceph_assert(weight_set->size - 1 == new_size);
+      // close the gap left by the removed slot
+      for (__u32 slot = position; slot < new_size; slot++)
+        weight_set->weights[slot] = weight_set->weights[slot+1];
+      if (!new_size) {
+        free(weight_set->weights);
+        weight_set->weights = NULL;
+      } else {
+        weight_set->weights = (__u32*)realloc(weight_set->weights,
+                                              new_size * sizeof(__u32));
+      }
+      weight_set->size = new_size;
+    }
+    if (!arg->ids_size)
+      continue;
+    ceph_assert(arg->ids_size - 1 == new_size);
+    for (__u32 slot = position; slot < new_size; slot++)
+      arg->ids[slot] = arg->ids[slot+1];
+    if (!new_size) {
+      free(arg->ids);
+      arg->ids = NULL;
+    } else {
+      arg->ids = (__s32 *)realloc(arg->ids, new_size * sizeof(__s32));
+    }
+    arg->ids_size = new_size;
+  }
+  return 0;
+}
+
+// Change the algorithm of bucket `bid` to `alg`.
+//
+// Returns 0 on success, or the negative error encoded by get_bucket()
+// when the bucket cannot be fetched.  The failure is detected with
+// IS_ERR, as in the other get_bucket() callers in this file; a plain
+// null test would let an error pointer be dereferenced.
+int CrushWrapper::bucket_set_alg(int bid, int alg)
+{
+  crush_bucket *b = get_bucket(bid);
+  if (IS_ERR(b)) {
+    return PTR_ERR(b);
+  }
+  b->alg = alg;
+  return 0;
+}
+
+// Bind item `id` to device class `class_name`.
+//
+// id          item to classify; must exist (asserted) and be >= 0
+// class_name  class to bind to (looked up via get_or_create_class_id())
+// name        label used for the item in messages
+// ss          stream receiving explanatory messages
+//
+// Returns -EBUSY when the item is already bound to a different class,
+// -EINVAL for a negative id, 0 when the item already has the class, a
+// negative error from rebuild_roots_with_classes(), and 1 when the
+// class was newly set.
+int CrushWrapper::update_device_class(int id,
+                                      const string& class_name,
+                                      const string& name,
+                                      ostream *ss)
+{
+  ceph_assert(item_exists(id));
+  const auto bound_class = get_item_class(id);
+  if (bound_class && bound_class != class_name) {
+    *ss << "osd." << id << " has already bound to class '" << bound_class
+        << "', can not reset class to '" << class_name << "'; "
+        << "use 'ceph osd crush rm-device-class <id>' to "
+        << "remove old class first";
+    return -EBUSY;
+  }
+
+  const int class_id = get_or_create_class_id(class_name);
+  if (id < 0) {
+    *ss << name << " id " << id << " is negative";
+    return -EINVAL;
+  }
+
+  // nothing to do when the binding is already in place
+  const auto existing = class_map.find(id);
+  if (existing != class_map.end() && existing->second == class_id) {
+    *ss << name << " already set to class " << class_name << ". ";
+    return 0;
+  }
+
+  set_item_class(id, class_id);
+
+  const int r = rebuild_roots_with_classes(nullptr);
+  return r < 0 ? r : 1;
+}
+
+// Remove the device class binding of item `id`.
+//
+// `ss` is required (asserted) and receives explanatory messages.
+// Returns -ENOENT when the item has no name, 0 when it has no class or
+// the class was removed, or the negative error from
+// rebuild_roots_with_classes().
+int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
+{
+  ceph_assert(ss);
+  if (!get_item_name(id)) {
+    *ss << "osd." << id << " does not have a name";
+    return -ENOENT;
+  }
+
+  const char *bound_class = get_item_class(id);
+  if (!bound_class) {
+    *ss << "osd." << id << " has not been bound to a specific class yet";
+    return 0;
+  }
+  class_remove_item(id);
+
+  const int r = rebuild_roots_with_classes(cct);
+  if (r >= 0)
+    return 0;
+  *ss << "unable to rebuild roots with class '" << bound_class << "' "
+      << "of osd." << id << ": " << cpp_strerror(r);
+  return r;
+}
+
+// Build the shadow copy of bucket `original_id` for `device_class`:
+// a bucket named "<name>~<class>" holding only the devices of that
+// class plus the shadow copies of the nested buckets (cloned
+// recursively).
+//
+// original_id       bucket to clone
+// device_class      class id to filter devices by
+// old_class_bucket  shadow ids used before; reused when an entry exists
+//                   for (original_id, device_class)
+// used_ids          ids that must not be handed out as new shadow ids
+// clone             out: id of the shadow bucket
+// cmap_item_weight  choose_args id -> shadow bucket id -> total weight
+//                   per weight-set position; filled in as buckets are
+//                   cloned so that a parent can pick up its children's
+//                   totals
+//
+// Returns 0 on success (also when the shadow already exists by name),
+// -ECHILD when the original has no name, -EBADF when the class has no
+// name, or an error from the crush_* helpers / set_item_class().
+int CrushWrapper::device_class_clone(
+  int original_id, int device_class,
+  const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
+  const std::set<int32_t>& used_ids,
+  int *clone,
+  map<int,map<int,vector<int>>> *cmap_item_weight)
+{
+  const char *item_name = get_item_name(original_id);
+  if (item_name == NULL)
+    return -ECHILD;
+  const char *class_name = get_class_name(device_class);
+  if (class_name == NULL)
+    return -EBADF;
+  string copy_name = item_name + string("~") + class_name;
+  if (name_exists(copy_name)) {
+    *clone = get_item_id(copy_name);
+    return 0;
+  }
+
+  crush_bucket *original = get_bucket(original_id);
+  ceph_assert(!IS_ERR(original));
+  crush_bucket *copy = crush_make_bucket(crush,
+                                         original->alg,
+                                         original->hash,
+                                         original->type,
+                                         0, NULL, NULL);
+  ceph_assert(copy);
+
+  vector<unsigned> item_orig_pos;  // new item pos -> orig item pos
+  for (unsigned i = 0; i < original->size; i++) {
+    int item = original->items[i];
+    int weight = crush_get_bucket_item_weight(original, i);
+    if (item >= 0) {
+      // devices are copied only when they belong to the class
+      if (class_map.count(item) != 0 && class_map[item] == device_class) {
+        int res = crush_bucket_add_item(crush, copy, item, weight);
+        if (res)
+          return res;
+      } else {
+        continue;
+      }
+    } else {
+      // nested buckets are replaced by their own shadow copy
+      int child_copy_id;
+      int res = device_class_clone(item, device_class, old_class_bucket,
+                                   used_ids, &child_copy_id,
+                                   cmap_item_weight);
+      if (res < 0)
+        return res;
+      crush_bucket *child_copy = get_bucket(child_copy_id);
+      ceph_assert(!IS_ERR(child_copy));
+      res = crush_bucket_add_item(crush, copy, child_copy_id,
+                                  child_copy->weight);
+      if (res)
+        return res;
+    }
+    item_orig_pos.push_back(i);
+  }
+  ceph_assert(item_orig_pos.size() == copy->size);
+
+  int bno = 0;
+  if (old_class_bucket.count(original_id) &&
+      old_class_bucket.at(original_id).count(device_class)) {
+    bno = old_class_bucket.at(original_id).at(device_class);
+  } else {
+    // pick a new shadow bucket id that is not used by the current map
+    // *or* any previous shadow buckets.
+    bno = -1;
+    while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
+           used_ids.count(bno)) {
+      --bno;
+    }
+  }
+  int res = crush_add_bucket(crush, bno, copy, clone);
+  if (res)
+    return res;
+  ceph_assert(!bno || bno == *clone);
+
+  res = set_item_class(*clone, device_class);
+  if (res < 0)
+    return res;
+
+  // we do not use set_item_name because the name is intentionally invalid
+  name_map[*clone] = copy_name;
+  if (have_rmaps)
+    name_rmap[copy_name] = *clone;
+  class_bucket[original_id][device_class] = *clone;
+
+  // set up choose_args for the new bucket.
+  for (auto& w : choose_args) {
+    crush_choose_arg_map& cmap = w.second;
+    if (crush->max_buckets > (int)cmap.size) {
+      unsigned new_size = crush->max_buckets;
+      cmap.args = static_cast<crush_choose_arg*>(realloc(cmap.args,
+                                                 new_size * sizeof(cmap.args[0])));
+      ceph_assert(cmap.args);
+      memset(cmap.args + cmap.size, 0,
+             (new_size - cmap.size) * sizeof(cmap.args[0]));
+      cmap.size = new_size;
+    }
+    auto& o = cmap.args[-1-original_id];
+    auto& n = cmap.args[-1-bno];
+    n.ids_size = 0; // FIXME: implement me someday
+    n.weight_set_positions = o.weight_set_positions;
+    n.weight_set = static_cast<crush_weight_set*>(calloc(
+      n.weight_set_positions, sizeof(crush_weight_set)));
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
+      n.weight_set[s].size = copy->size;
+      n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
+    }
+    // per-position totals of the new bucket.  This lives outside the
+    // position loop so the totals of earlier positions are not reset
+    // before the vector is recorded.
+    vector<int> bucket_weights(n.weight_set_positions);
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
+      for (size_t i = 0; i < copy->size; ++i) {
+        int item = copy->items[i];
+        if (item >= 0) {
+          // device: carry over the weight from the original's position
+          n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
+        } else if ((*cmap_item_weight)[w.first].count(item)) {
+          // shadow child: use the total recorded when it was cloned
+          n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
+        } else {
+          n.weight_set[s].weights[i] = 0;
+        }
+        bucket_weights[s] += n.weight_set[s].weights[i];
+      }
+      (*cmap_item_weight)[w.first][bno] = bucket_weights;
+    }
+  }
+  return 0;
+}
+
+// Fill *rules (cleared first) with the numbers of all rules that have a
+// TAKE step whose item split_id_class() attributes to class
+// `class_name`.
+//
+// Returns -ENOENT when the class does not exist, a negative error from
+// split_id_class(), or 0.
+int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
+{
+  ceph_assert(rules);
+  rules->clear();
+  if (!class_exists(class_name))
+    return -ENOENT;
+
+  const int class_id = get_class_id(class_name);
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule)
+      continue;
+    for (unsigned step = 0; step < rule->len; ++step) {
+      if (rule->steps[step].op != CRUSH_RULE_TAKE)
+        continue;
+      int original_item;
+      int step_class;
+      const int res = split_id_class(rule->steps[step].arg1,
+                                     &original_item, &step_class);
+      if (res < 0)
+        return res;
+      if (step_class != -1 && step_class == class_id) {
+        rules->insert(ruleno);
+        break;  // one match is enough for this rule
+      }
+    }
+  }
+  return 0;
+}
+
+// Fill *rules (cleared first) with the numbers of all rules that might
+// reference device `osd`, i.e. rules with a TAKE step whose leaves, as
+// reported by _get_leaves(), include it.
+//
+// Returns -EINVAL for a negative osd id, a negative error from
+// _get_leaves(), or 0.
+int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
+{
+  ceph_assert(rules);
+  rules->clear();
+  if (osd < 0)
+    return -EINVAL;
+
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule)
+      continue;
+    for (unsigned step = 0; step < rule->len; ++step) {
+      if (rule->steps[step].op != CRUSH_RULE_TAKE)
+        continue;
+      list<int> leaves;
+      const int rc = _get_leaves(rule->steps[step].arg1, &leaves);
+      if (rc < 0)
+        return rc; // propagate fatal errors!
+
+      bool referenced = false;
+      for (auto &o: leaves) {
+        ceph_assert(o >= 0);
+        if (o == osd) {
+          referenced = true;
+          break;
+        }
+      }
+      if (referenced) {
+        rules->insert(ruleno);
+        break;  // no need to look at this rule's remaining steps
+      }
+    }
+  }
+  return 0;
+}
+
+// A class is dead when no device (non-negative id in class_map) is
+// bound to it and no rule has a TAKE step on a bucket that class_bucket
+// records as a shadow for it.
+bool CrushWrapper::_class_is_dead(int class_id)
+{
+  // still bound to a device?
+  for (auto &binding: class_map) {
+    if (binding.first >= 0 && binding.second == class_id)
+      return false;
+  }
+
+  // still taken by a rule?
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule)
+      continue;
+    for (unsigned step = 0; step < rule->len; ++step) {
+      if (rule->steps[step].op != CRUSH_RULE_TAKE)
+        continue;
+      const int taken = rule->steps[step].arg1;
+      for (auto &entry : class_bucket) {
+        auto& shadows = entry.second;
+        auto hit = shadows.find(class_id);
+        if (hit != shadows.end() && hit->second == taken)
+          return false;
+      }
+    }
+  }
+  // no more referenced by any devices or crush rules
+  return true;
+}
+
+// Call remove_class_name() for every class in class_name that
+// _class_is_dead() reports as dead.
+void CrushWrapper::cleanup_dead_classes()
+{
+  for (auto p = class_name.begin(); p != class_name.end(); ) {
+    // step past the entry before it can be removed, and copy the name
+    // so it does not refer into the entry being removed
+    const auto current = p++;
+    if (_class_is_dead(current->first)) {
+      string n = current->second;
+      remove_class_name(n);
+    }
+  }
+}
+
+// Drop dead classes, trim the existing per-class roots, then repopulate the
+// class buckets from a snapshot of class_bucket taken before the cleanup.
+// Returns the negative error from trim_roots_with_class(), otherwise the
+// result of populate_classes().
+int CrushWrapper::rebuild_roots_with_classes(CephContext *cct)
+{
+  std::map<int32_t, map<int32_t, int32_t> > snapshot(class_bucket);
+  cleanup_dead_classes();
+  int rc = trim_roots_with_class(cct);
+  if (rc >= 0) {
+    class_bucket.clear();
+    rc = populate_classes(snapshot);
+  }
+  return rc;
+}
+
+// Serialize the crush map into bl: magic, limits, buckets, rules, name maps
+// and tunables, followed (when the peer has SERVER_LUMINOUS) by the device
+// class maps and the choose_args.  When choose_args exist but the peer lacks
+// CRUSH_CHOOSE_ARGS, straw2 buckets are written with the weights of the first
+// choose_args entry's first weight set instead of their own item weights.
+void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
+{
+  using ceph::encode;
+  ceph_assert(crush);
+
+  __u32 magic = CRUSH_MAGIC;
+  encode(magic, bl);
+
+  encode(crush->max_buckets, bl);
+  encode(crush->max_rules, bl);
+  encode(crush->max_devices, bl);
+
+  crush_choose_arg_map compat_args;
+  memset(&compat_args, '\0', sizeof(compat_args));
+  const bool use_compat_weights =
+    has_choose_args() && !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS);
+  if (use_compat_weights) {
+    ceph_assert(!has_incompat_choose_args());
+    compat_args = choose_args.begin()->second;
+  }
+
+  // buckets
+  for (int i = 0; i < crush->max_buckets; i++) {
+    crush_bucket *b = crush->buckets[i];
+    // a zero alg marks an empty slot
+    __u32 alg = b ? b->alg : 0;
+    encode(alg, bl);
+    if (!alg)
+      continue;
+
+    encode(b->id, bl);
+    encode(b->type, bl);
+    encode(b->alg, bl);
+    encode(b->hash, bl);
+    encode(b->weight, bl);
+    encode(b->size, bl);
+    for (unsigned j = 0; j < b->size; j++)
+      encode(b->items[j], bl);
+
+    switch (b->alg) {
+    case CRUSH_BUCKET_UNIFORM:
+      encode(reinterpret_cast<crush_bucket_uniform*>(b)->item_weight, bl);
+      break;
+
+    case CRUSH_BUCKET_LIST:
+      {
+        crush_bucket_list *lb = reinterpret_cast<crush_bucket_list*>(b);
+        for (unsigned j = 0; j < b->size; j++) {
+          encode(lb->item_weights[j], bl);
+          encode(lb->sum_weights[j], bl);
+        }
+      }
+      break;
+
+    case CRUSH_BUCKET_TREE:
+      {
+        crush_bucket_tree *tb = reinterpret_cast<crush_bucket_tree*>(b);
+        encode(tb->num_nodes, bl);
+        for (unsigned j = 0; j < tb->num_nodes; j++)
+          encode(tb->node_weights[j], bl);
+      }
+      break;
+
+    case CRUSH_BUCKET_STRAW:
+      {
+        crush_bucket_straw *sb = reinterpret_cast<crush_bucket_straw*>(b);
+        for (unsigned j = 0; j < b->size; j++) {
+          encode(sb->item_weights[j], bl);
+          encode(sb->straws[j], bl);
+        }
+      }
+      break;
+
+    case CRUSH_BUCKET_STRAW2:
+      {
+        __u32 *weights;
+        if (use_compat_weights &&
+            compat_args.args[i].weight_set_positions > 0) {
+          weights = compat_args.args[i].weight_set[0].weights;
+        } else {
+          weights = reinterpret_cast<crush_bucket_straw2*>(b)->item_weights;
+        }
+        for (unsigned j = 0; j < b->size; j++) {
+          encode(weights[j], bl);
+        }
+      }
+      break;
+
+    default:
+      ceph_abort();
+      break;
+    }
+  }
+
+  // rules
+  for (unsigned i = 0; i < crush->max_rules; i++) {
+    crush_rule *r = crush->rules[i];
+    __u32 yes = r ? 1 : 0;
+    encode(yes, bl);
+    if (!yes)
+      continue;
+
+    encode(r->len, bl);
+    encode(r->mask, bl);
+    for (unsigned j = 0; j < r->len; j++)
+      encode(r->steps[j], bl);
+  }
+
+  // name info
+  encode(type_map, bl);
+  encode(name_map, bl);
+  encode(rule_name_map, bl);
+
+  // tunables
+  encode(crush->choose_local_tries, bl);
+  encode(crush->choose_local_fallback_tries, bl);
+  encode(crush->choose_total_tries, bl);
+  encode(crush->chooseleaf_descend_once, bl);
+  encode(crush->chooseleaf_vary_r, bl);
+  encode(crush->straw_calc_version, bl);
+  encode(crush->allowed_bucket_algs, bl);
+  if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
+    encode(crush->chooseleaf_stable, bl);
+  }
+
+  // everything below is only understood by luminous and later
+  if (!HAVE_FEATURE(features, SERVER_LUMINOUS))
+    return;
+
+  // device classes
+  encode(class_map, bl);
+  encode(class_name, bl);
+  encode(class_bucket, bl);
+
+  // choose args
+  __u32 num_maps = (__u32)choose_args.size();
+  encode(num_maps, bl);
+  for (const auto &c : choose_args) {
+    encode(c.first, bl);
+    const crush_choose_arg_map &amap = c.second;
+
+    // only args carrying a weight set or ids are written, so count them first
+    __u32 populated = 0;
+    for (__u32 i = 0; i < amap.size; i++) {
+      crush_choose_arg *arg = &amap.args[i];
+      if (arg->weight_set_positions != 0 || arg->ids_size != 0)
+        populated++;
+    }
+    encode(populated, bl);
+
+    for (__u32 i = 0; i < amap.size; i++) {
+      crush_choose_arg *arg = &amap.args[i];
+      if (arg->weight_set_positions == 0 &&
+          arg->ids_size == 0)
+        continue;
+      encode(i, bl);
+      encode(arg->weight_set_positions, bl);
+      for (__u32 j = 0; j < arg->weight_set_positions; j++) {
+        crush_weight_set *ws = &arg->weight_set[j];
+        encode(ws->size, bl);
+        for (__u32 k = 0; k < ws->size; k++)
+          encode(ws->weights[k], bl);
+      }
+      encode(arg->ids_size, bl);
+      for (__u32 j = 0; j < arg->ids_size; j++)
+        encode(arg->ids[j], bl);
+    }
+  }
+}
+
+// Decode a map<int32_t,string> whose keys were written either as 32-bit or
+// as 64-bit integers.  A zero string length right after the key is taken to
+// mean the key was 64 bits wide, in which case the real length follows; this
+// relies on the strings never being empty.
+static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::const_iterator& blp)
+{
+  m.clear();
+  __u32 count;
+  decode(count, blp);
+  for (; count > 0; --count) {
+    __s32 key;
+    decode(key, blp);
+
+    __u32 len;
+    decode(len, blp);
+    if (len == 0) {
+      // der, key was actually 64-bits!
+      decode(len, blp);
+    }
+    decode_nohead(len, m[key], blp);
+  }
+}
+
+// Rebuild this wrapper from an encoded crush map.  Starts from a fresh map
+// (create()), throws buffer::malformed_input on a bad magic number, and reads
+// the sections in the order encode() writes them.  Sections after the name
+// maps are optional: each is read only if input remains.  If anything throws
+// while the map is being filled in, the crush map is destroyed and the
+// exception is rethrown.
+void CrushWrapper::decode(bufferlist::const_iterator& blp)
+{
+  using ceph::decode;
+  create();
+
+  __u32 magic;
+  decode(magic, blp);
+  if (magic != CRUSH_MAGIC)
+    throw buffer::malformed_input("bad magic number");
+
+  decode(crush->max_buckets, blp);
+  decode(crush->max_rules, blp);
+  decode(crush->max_devices, blp);
+
+  // legacy tunables, unless we decode something newer
+  set_tunables_legacy();
+
+  try {
+    // buckets
+    crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
+    for (int b = 0; b < crush->max_buckets; b++) {
+      decode_crush_bucket(&crush->buckets[b], blp);
+    }
+
+    // rules
+    crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
+    for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+      __u32 present;
+      decode(present, blp);
+      if (!present) {
+        crush->rules[ruleno] = NULL;
+        continue;
+      }
+
+      __u32 len;
+      decode(len, blp);
+      crush_rule *rule = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
+      // publish the rule before decoding into it
+      crush->rules[ruleno] = rule;
+      rule->len = len;
+      decode(rule->mask, blp);
+      for (unsigned s = 0; s < rule->len; s++)
+        decode(rule->steps[s], blp);
+    }
+
+    // name info
+    // NOTE: we had a bug where we were encoding int instead of int32, which
+    // means the 'key' field for these maps may be either 32 or 64 bits,
+    // depending.  tolerate both by assuming the string is always non-empty.
+    decode_32_or_64_string_map(type_map, blp);
+    decode_32_or_64_string_map(name_map, blp);
+    decode_32_or_64_string_map(rule_name_map, blp);
+
+    // tunables
+    if (!blp.end()) {
+      decode(crush->choose_local_tries, blp);
+      decode(crush->choose_local_fallback_tries, blp);
+      decode(crush->choose_total_tries, blp);
+    }
+    if (!blp.end())
+      decode(crush->chooseleaf_descend_once, blp);
+    if (!blp.end())
+      decode(crush->chooseleaf_vary_r, blp);
+    if (!blp.end())
+      decode(crush->straw_calc_version, blp);
+    if (!blp.end())
+      decode(crush->allowed_bucket_algs, blp);
+    if (!blp.end())
+      decode(crush->chooseleaf_stable, blp);
+
+    // device classes
+    if (!blp.end()) {
+      decode(class_map, blp);
+      decode(class_name, blp);
+      // rebuild the name -> id reverse index
+      for (auto &named : class_name)
+        class_rname[named.second] = named.first;
+      decode(class_bucket, blp);
+    }
+
+    // choose args
+    if (!blp.end()) {
+      __u32 num_maps;
+      decode(num_maps, blp);
+      for (__u32 m = 0; m < num_maps; m++) {
+        typename decltype(choose_args)::key_type map_id;
+        decode(map_id, blp);
+        // one (zeroed) slot per bucket; only the encoded ones get filled in
+        crush_choose_arg_map amap;
+        amap.size = crush->max_buckets;
+        amap.args = static_cast<crush_choose_arg*>(calloc(
+          amap.size, sizeof(crush_choose_arg)));
+        __u32 num_args;
+        decode(num_args, blp);
+        for (__u32 a = 0; a < num_args; a++) {
+          __u32 bucket_index;
+          decode(bucket_index, blp);
+          ceph_assert(bucket_index < amap.size);
+          crush_choose_arg *arg = &amap.args[bucket_index];
+          decode(arg->weight_set_positions, blp);
+          if (arg->weight_set_positions) {
+            arg->weight_set = static_cast<crush_weight_set*>(calloc(
+              arg->weight_set_positions, sizeof(crush_weight_set)));
+            for (__u32 p = 0; p < arg->weight_set_positions; p++) {
+              crush_weight_set *ws = &arg->weight_set[p];
+              decode(ws->size, blp);
+              ws->weights = (__u32*)calloc(
+                ws->size, sizeof(__u32));
+              for (__u32 n = 0; n < ws->size; n++)
+                decode(ws->weights[n], blp);
+            }
+          }
+          decode(arg->ids_size, blp);
+          if (arg->ids_size) {
+            ceph_assert(arg->ids_size == crush->buckets[bucket_index]->size);
+            arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
+            for (__u32 k = 0; k < arg->ids_size; k++)
+              decode(arg->ids[k], blp);
+          }
+        }
+        choose_args[map_id] = amap;
+      }
+    }
+    update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
+    finalize();
+  }
+  catch (...) {
+    crush_destroy(crush);
+    throw;
+  }
+}
+
+// Decode one bucket slot from blp into *bptr.
+//
+// The slot starts with a 32-bit algorithm id; zero means "no bucket" and
+// leaves *bptr NULL.  Otherwise a zeroed struct of the matching bucket type
+// is allocated, stored in *bptr right away, and filled from the common
+// header, the item list and the algorithm-specific arrays.
+//
+// Throws buffer::malformed_input for an unknown algorithm id, or when the
+// algorithm repeated inside the bucket header differs from the leading id
+// (the struct was sized for the latter, so decoding as another type would
+// write past the allocation).
+void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp)
+{
+  using ceph::decode;
+  __u32 alg;
+  decode(alg, blp);
+  if (!alg) {
+    *bptr = NULL;
+    return;
+  }
+
+  int size = 0;
+  switch (alg) {
+  case CRUSH_BUCKET_UNIFORM:
+    size = sizeof(crush_bucket_uniform);
+    break;
+  case CRUSH_BUCKET_LIST:
+    size = sizeof(crush_bucket_list);
+    break;
+  case CRUSH_BUCKET_TREE:
+    size = sizeof(crush_bucket_tree);
+    break;
+  case CRUSH_BUCKET_STRAW:
+    size = sizeof(crush_bucket_straw);
+    break;
+  case CRUSH_BUCKET_STRAW2:
+    size = sizeof(crush_bucket_straw2);
+    break;
+  default:
+    {
+      char str[128];
+      // alg is unsigned, so print it as such
+      snprintf(str, sizeof(str), "unsupported bucket algorithm: %u", alg);
+      throw buffer::malformed_input(str);
+    }
+  }
+  crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
+  *bptr = bucket;
+
+  decode(bucket->id, blp);
+  decode(bucket->type, blp);
+  decode(bucket->alg, blp);
+  if (bucket->alg != alg) {
+    // The allocation above fits `alg` only.  Release it here rather than
+    // leaving a bucket with an inconsistent type behind for the caller's
+    // cleanup to interpret.
+    *bptr = NULL;
+    free(bucket);
+    throw buffer::malformed_input("bucket algorithm mismatch");
+  }
+  decode(bucket->hash, blp);
+  decode(bucket->weight, blp);
+  decode(bucket->size, blp);
+
+  bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
+  for (unsigned j = 0; j < bucket->size; ++j) {
+    decode(bucket->items[j], blp);
+  }
+
+  switch (bucket->alg) {
+  case CRUSH_BUCKET_UNIFORM:
+    decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
+    break;
+
+  case CRUSH_BUCKET_LIST: {
+    crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
+    cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+
+    // the two arrays are interleaved on the wire
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbl->item_weights[j], blp);
+      decode(cbl->sum_weights[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_TREE: {
+    crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
+    decode(cbt->num_nodes, blp);
+    cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
+    for (unsigned j=0; j<cbt->num_nodes; j++) {
+      decode(cbt->node_weights[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_STRAW: {
+    crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
+    cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    // weight first, then straw, for each item
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbs->item_weights[j], blp);
+      decode(cbs->straws[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_STRAW2: {
+    crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
+    cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbs->item_weights[j], blp);
+    }
+    break;
+  }
+
+  default:
+    // We should have handled this case in the first switch statement
+    ceph_abort();
+    break;
+  }
+}
+
+
+// Dump the whole map to a Formatter: the "devices", "types", "buckets" and
+// "rules" arrays, the "tunables" object and finally the choose_args.
+void CrushWrapper::dump(Formatter *f) const
+{
+  f->open_array_section("devices");
+  for (int dev = 0; dev < get_max_devices(); dev++) {
+    f->open_object_section("device");
+    f->dump_int("id", dev);
+    const char *dev_name = get_item_name(dev);
+    if (!dev_name) {
+      // unnamed devices get a synthetic "device<N>" name
+      char fallback[20];
+      sprintf(fallback, "device%d", dev);
+      f->dump_string("name", fallback);
+    } else {
+      f->dump_string("name", dev_name);
+    }
+    const char *device_class = get_item_class(dev);
+    if (device_class != NULL)
+      f->dump_string("class", device_class);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("types");
+  // walk type ids upward until every named type has been emitted
+  int remaining = get_num_type_names();
+  for (int t = 0; remaining; t++) {
+    const char *type_name = get_type_name(t);
+    if (type_name) {
+      remaining--;
+      f->open_object_section("type");
+      f->dump_int("type_id", t);
+      f->dump_string("name", type_name);
+      f->close_section();
+    } else if (t == 0) {
+      // type 0 without a name is reported as "device"
+      f->open_object_section("type");
+      f->dump_int("type_id", 0);
+      f->dump_string("name", "device");
+      f->close_section();
+    }
+  }
+  f->close_section();
+
+  f->open_array_section("buckets");
+  for (int bucket = -1; bucket > -1-get_max_buckets(); --bucket) {
+    if (!bucket_exists(bucket))
+      continue;
+    f->open_object_section("bucket");
+    f->dump_int("id", bucket);
+    const char *bucket_name = get_item_name(bucket);
+    if (bucket_name)
+      f->dump_string("name", bucket_name);
+    const int bucket_type = get_bucket_type(bucket);
+    f->dump_int("type_id", bucket_type);
+    const char *bucket_type_name = get_type_name(bucket_type);
+    if (bucket_type_name)
+      f->dump_string("type_name", bucket_type_name);
+    f->dump_int("weight", get_bucket_weight(bucket));
+    f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
+    f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
+    f->open_array_section("items");
+    for (int pos = 0; pos < get_bucket_size(bucket); pos++) {
+      f->open_object_section("item");
+      f->dump_int("id", get_bucket_item(bucket, pos));
+      f->dump_int("weight", get_bucket_item_weight(bucket, pos));
+      f->dump_int("pos", pos);
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("rules");
+  dump_rules(f);
+  f->close_section();
+
+  f->open_object_section("tunables");
+  dump_tunables(f);
+  f->close_section();
+
+  dump_choose_args(f);
+}
+
+namespace {
+  // Depth-first walker that writes the crush hierarchy to a Formatter as
+  // nested objects: every bucket carries an "items" array with its children.
+  class TreeDumper {
+    typedef CrushTreeDumper::Item Item;
+    const CrushWrapper *crush;
+    const CrushTreeDumper::name_map_t& weight_set_names;
+  public:
+    explicit TreeDumper(const CrushWrapper *crush,
+                        const CrushTreeDumper::name_map_t& wsnames)
+      : crush(crush), weight_set_names(wsnames) {}
+
+    // Dump one tree per root reported by find_roots().
+    void dump(Formatter *f) {
+      set<int> roots;
+      crush->find_roots(&roots);
+      for (int root : roots) {
+        dump_item(Item(root, 0, 0, crush->get_bucket_weightf(root)), f);
+      }
+    }
+
+  private:
+    // Emit a "bucket" (with children) or a "device" object for qi.
+    void dump_item(const Item& qi, Formatter* f) {
+      const bool bucket = qi.is_bucket();
+      f->open_object_section(bucket ? "bucket" : "device");
+      CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
+      if (bucket)
+        dump_bucket_children(qi, f);
+      f->close_section();
+    }
+
+    // Recurse into every position of the parent bucket, one level deeper.
+    void dump_bucket_children(const Item& parent, Formatter* f) {
+      f->open_array_section("items");
+      const int count = crush->get_bucket_size(parent.id);
+      for (int pos = 0; pos < count; pos++) {
+        int child = crush->get_bucket_item(parent.id, pos);
+        float child_weight = crush->get_bucket_item_weightf(parent.id, pos);
+        dump_item(Item(child, parent.id, parent.depth + 1, child_weight), f);
+      }
+      f->close_section();
+    }
+  };
+}
+
+// Dump the hierarchy as nested objects to f, which must not be null.
+void CrushWrapper::dump_tree(
+  Formatter *f,
+  const CrushTreeDumper::name_map_t& weight_set_names) const
+{
+  ceph_assert(f);
+  TreeDumper walker(this, weight_set_names);
+  walker.dump(f);
+}
+
+// Dump the tunable values, the name of the first matching tunables profile
+// (newest checked first, "unknown" if none matches), the minimum required
+// version and the feature-requirement flags.
+void CrushWrapper::dump_tunables(Formatter *f) const
+{
+  f->dump_int("choose_local_tries", get_choose_local_tries());
+  f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
+  f->dump_int("choose_total_tries", get_choose_total_tries());
+  f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
+  f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
+  f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
+  f->dump_int("straw_calc_version", get_straw_calc_version());
+  f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
+
+  // be helpful about it
+  const char *profile = "unknown";
+  if (has_jewel_tunables()) {
+    profile = "jewel";
+  } else if (has_hammer_tunables()) {
+    profile = "hammer";
+  } else if (has_firefly_tunables()) {
+    profile = "firefly";
+  } else if (has_bobtail_tunables()) {
+    profile = "bobtail";
+  } else if (has_argonaut_tunables()) {
+    profile = "argonaut";
+  }
+  f->dump_string("profile", profile);
+  f->dump_int("optimal_tunables", (int)has_optimal_tunables());
+  f->dump_int("legacy_tunables", (int)has_legacy_tunables());
+
+  // be helpful about minimum version required
+  f->dump_string("minimum_required_version", get_min_required_version());
+
+  f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
+  f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
+  f->dump_int("has_v2_rules", (int)has_v2_rules());
+  f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
+  f->dump_int("has_v3_rules", (int)has_v3_rules());
+  f->dump_int("has_v4_buckets", (int)has_v4_buckets());
+  f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
+  f->dump_int("has_v5_rules", (int)has_v5_rules());
+}
+
+// Dump every choose_args map as an array keyed by its id.  Only buckets that
+// have a weight set or ids are listed; weights are printed as value/0x10000.
+void CrushWrapper::dump_choose_args(Formatter *f) const
+{
+  f->open_object_section("choose_args");
+  for (const auto &entry : choose_args) {
+    const crush_choose_arg_map &amap = entry.second;
+    f->open_array_section(stringify(entry.first).c_str());
+    for (__u32 b = 0; b < amap.size; b++) {
+      crush_choose_arg *arg = &amap.args[b];
+      if (arg->weight_set_positions == 0 &&
+          arg->ids_size == 0)
+        continue;
+      f->open_object_section("choose_args");
+      // slot b of the args array belongs to bucket id -1-b
+      int bucket_index = b;
+      f->dump_int("bucket_id", -1-bucket_index);
+      if (arg->weight_set_positions > 0) {
+        f->open_array_section("weight_set");
+        for (__u32 pos = 0; pos < arg->weight_set_positions; pos++) {
+          const crush_weight_set &ws = arg->weight_set[pos];
+          f->open_array_section("weights");
+          for (__u32 k = 0; k < ws.size; k++) {
+            f->dump_float("weight", (float)ws.weights[k]/(float)0x10000);
+          }
+          f->close_section();
+        }
+        f->close_section();
+      }
+      if (arg->ids_size > 0) {
+        f->open_array_section("ids");
+        for (__u32 k = 0; k < arg->ids_size; k++)
+          f->dump_int("id", arg->ids[k]);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump every existing rule, in id order.
+void CrushWrapper::dump_rules(Formatter *f) const
+{
+  for (int ruleno = 0; ruleno < get_max_rules(); ruleno++) {
+    if (rule_exists(ruleno)) {
+      dump_rule(ruleno, f);
+    }
+  }
+}
+
+// Dump one rule as a "rule" object: id, optional name, mask fields and the
+// list of steps.  Known ops are written by name with their arguments;
+// unknown ops are written as raw opcode/arg1/arg2.
+//
+// A choose step whose type id has no registered name is dumped with an empty
+// "type" string instead of handing a null pointer to dump_string(), matching
+// how the take step treats a missing item name.
+void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
+{
+  // shared body of the four choose/chooseleaf ops
+  auto dump_choose_step = [&](const char *op, int step) {
+    f->dump_string("op", op);
+    f->dump_int("num", get_rule_arg1(ruleset, step));
+    const char *type_name = get_type_name(get_rule_arg2(ruleset, step));
+    f->dump_string("type", type_name ? type_name : "");
+  };
+
+  f->open_object_section("rule");
+  f->dump_int("rule_id", ruleset);
+  const char *rule_name = get_rule_name(ruleset);
+  if (rule_name)
+    f->dump_string("rule_name", rule_name);
+  f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
+  f->dump_int("type", get_rule_mask_type(ruleset));
+  f->dump_int("min_size", get_rule_mask_min_size(ruleset));
+  f->dump_int("max_size", get_rule_mask_max_size(ruleset));
+  f->open_array_section("steps");
+  for (int j=0; j<get_rule_len(ruleset); j++) {
+    f->open_object_section("step");
+    switch (get_rule_op(ruleset, j)) {
+    case CRUSH_RULE_NOOP:
+      f->dump_string("op", "noop");
+      break;
+    case CRUSH_RULE_TAKE:
+      f->dump_string("op", "take");
+      {
+        int item = get_rule_arg1(ruleset, j);
+        f->dump_int("item", item);
+
+        const char *name = get_item_name(item);
+        f->dump_string("item_name", name ? name : "");
+      }
+      break;
+    case CRUSH_RULE_EMIT:
+      f->dump_string("op", "emit");
+      break;
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+      dump_choose_step("choose_firstn", j);
+      break;
+    case CRUSH_RULE_CHOOSE_INDEP:
+      dump_choose_step("choose_indep", j);
+      break;
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+      dump_choose_step("chooseleaf_firstn", j);
+      break;
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      dump_choose_step("chooseleaf_indep", j);
+      break;
+    case CRUSH_RULE_SET_CHOOSE_TRIES:
+      f->dump_string("op", "set_choose_tries");
+      f->dump_int("num", get_rule_arg1(ruleset, j));
+      break;
+    case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+      f->dump_string("op", "set_chooseleaf_tries");
+      f->dump_int("num", get_rule_arg1(ruleset, j));
+      break;
+    default:
+      f->dump_int("opcode", get_rule_op(ruleset, j));
+      f->dump_int("arg1", get_rule_arg1(ruleset, j));
+      f->dump_int("arg2", get_rule_arg2(ruleset, j));
+    }
+    f->close_section();
+  }
+  f->close_section();
+  f->close_section();
+}
+
+// Emit a "name" string for every existing rule, in id order.  A rule with no
+// registered name is emitted as an empty string rather than passing a null
+// pointer to dump_string().
+void CrushWrapper::list_rules(Formatter *f) const
+{
+  for (int rule = 0; rule < get_max_rules(); rule++) {
+    if (!rule_exists(rule))
+      continue;
+    const char *name = get_rule_name(rule);
+    f->dump_string("name", name ? name : "");
+  }
+}
+
+// Write the name of every existing rule to *ss, one per line, in id order.
+// A rule with no registered name yields an empty line; inserting a null
+// const char* into the stream is avoided.
+void CrushWrapper::list_rules(ostream *ss) const
+{
+  for (int rule = 0; rule < get_max_rules(); rule++) {
+    if (!rule_exists(rule))
+      continue;
+    const char *name = get_rule_name(rule);
+    *ss << (name ? name : "") << "\n";
+  }
+}
+
+// Renders the crush tree into a TextTable: ID, CLASS, WEIGHT, one column per
+// choose_args map, then the indented "TYPE NAME" column.
+class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
+public:
+  typedef CrushTreeDumper::Dumper<TextTable> Parent;
+
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+                                const CrushTreeDumper::name_map_t& wsnames)
+    : Parent(crush, wsnames) {}
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+                                const CrushTreeDumper::name_map_t& wsnames,
+                                bool show_shadow)
+    : Parent(crush, wsnames, show_shadow) {}
+
+
+  // Define the table columns, then let the base class walk the tree.
+  void dump(TextTable *tbl) {
+    tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+    for (auto& p : crush->choose_args) {
+      if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+        tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
+      } else {
+        // use the weight-set name when one is known, else the numeric id
+        string name;
+        auto q = weight_set_names.find(p.first);
+        name = q != weight_set_names.end() ? q->second :
+          stringify(p.first);
+        tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
+      }
+    }
+    tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+    Parent::dump(tbl);
+  }
+
+protected:
+  // Emit one table row for qi.
+  void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
+    const char *c = crush->get_item_class(qi.id);
+    if (!c)
+      c = "";
+    *tbl << qi.id
+         << c
+         << weightf_t(qi.weight);
+    for (auto& p : crush->choose_args) {
+      if (qi.parent < 0) {
+        const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
+        int bidx = -1 - qi.parent;
+        const crush_bucket *b = crush->get_bucket(qi.parent);
+        if (b &&
+            bidx < (int)cmap.size &&
+            cmap.args[bidx].weight_set &&
+            cmap.args[bidx].weight_set_positions >= 1) {
+          const crush_weight_set &ws = cmap.args[bidx].weight_set[0];
+          // Find this item's position in its parent.  The search stays inside
+          // both the bucket's item array and the weight set, and the weight is
+          // read only when the item was actually found.
+          int pos = 0;
+          while (pos < (int)ws.size &&
+                 pos < (int)b->size &&
+                 b->items[pos] != qi.id)
+            ++pos;
+          if (pos < (int)ws.size && pos < (int)b->size) {
+            *tbl << weightf_t((float)ws.weights[pos] /
+                              (float)0x10000);
+            continue;
+          }
+        }
+      }
+      // no weight-set value for this item in this choose_args map
+      *tbl << "";
+    }
+    ostringstream ss;
+    for (int k=0; k < qi.depth; k++) {
+      ss << " ";
+    }
+    if (qi.is_bucket()) {
+      ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
+         << crush->get_item_name(qi.id);
+    } else {
+      ss << "osd." << qi.id;
+    }
+    *tbl << ss.str();
+    *tbl << TextTable::endrow;
+  }
+};
+
+
+// Formatter flavour of the tree dump: a "nodes" array produced by the base
+// class, followed by a "stray" array of items the walk did not touch.
+class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
+public:
+  typedef CrushTreeDumper::FormattingDumper Parent;
+
+  explicit CrushTreeFormattingDumper(
+    const CrushWrapper *crush,
+    const CrushTreeDumper::name_map_t& wsnames)
+    : Parent(crush, wsnames) {}
+
+  explicit CrushTreeFormattingDumper(
+    const CrushWrapper *crush,
+    const CrushTreeDumper::name_map_t& wsnames,
+    bool show_shadow)
+    : Parent(crush, wsnames, show_shadow) {}
+
+  void dump(Formatter *f) {
+    f->open_array_section("nodes");
+    Parent::dump(f);
+    f->close_section();
+
+    // There is no stray bucket whose id is a negative number, so just get
+    // the max_id and iterate from 0 to max_id to dump stray osds.
+    f->open_array_section("stray");
+    const int32_t max_id =
+      crush->name_map.empty() ? -1 : crush->name_map.rbegin()->first;
+    for (int32_t id = 0; id <= max_id; id++) {
+      if (!crush->item_exists(id))
+        continue;
+      if (is_touched(id))
+        continue;
+      if (!should_dump(id))
+        continue;
+      dump_item(CrushTreeDumper::Item(id, 0, 0, 0), f);
+    }
+    f->close_section();
+  }
+};
+
+
+// Dump the tree as a text table to *out and/or as structured output to f;
+// either destination may be null and is then skipped.
+void CrushWrapper::dump_tree(
+  ostream *out,
+  Formatter *f,
+  const CrushTreeDumper::name_map_t& weight_set_names,
+  bool show_shadow) const
+{
+  if (out) {
+    TextTable tbl;
+    CrushTreePlainDumper plain(this, weight_set_names, show_shadow);
+    plain.dump(&tbl);
+    *out << tbl;
+  }
+  if (f) {
+    CrushTreeFormattingDumper structured(this, weight_set_names, show_shadow);
+    structured.dump(f);
+  }
+}
+
+// Append test instances to o; the caller receives ownership of the raw
+// pointers.
+void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
+{
+  // fixme: only a default-constructed wrapper is generated so far
+  CrushWrapper *blank = new CrushWrapper;
+  o.push_back(blank);
+}
+
+/**
+ * Pick the CRUSH ruleset ID that newly created replicated pools should use.
+ *
+ * A non-negative "osd_pool_default_crush_rule" setting is used if that
+ * ruleset exists; a negative setting falls back to the first replicated
+ * ruleset found.
+ *
+ * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
+ */
+int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
+{
+  int configured = cct->_conf.get_val<int64_t>("osd_pool_default_crush_rule");
+  if (configured < 0) {
+    return find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
+  }
+  if (!ruleset_exists(configured)) {
+    return -1; // match find_first_ruleset() retval
+  }
+  return configured;
+}
+
+// A crush name is valid when it is non-empty and consists only of ASCII
+// letters, digits, '-', '_' and '.'.
+bool CrushWrapper::is_valid_crush_name(const string& s)
+{
+  if (s.empty())
+    return false;
+  for (char ch : s) {
+    const bool allowed =
+      ch == '-' ||
+      ch == '_' ||
+      ch == '.' ||
+      (ch >= '0' && ch <= '9') ||
+      (ch >= 'A' && ch <= 'Z') ||
+      (ch >= 'a' && ch <= 'z');
+    if (!allowed)
+      return false;
+  }
+  return true;
+}
+
+// Check that every key and every value of a crush location is a valid crush
+// name.  The first offending pair is logged at level 1 and false is returned.
+bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
+                                      const map<string,string>& loc)
+{
+  for (const auto &kv : loc) {
+    const bool key_ok = is_valid_crush_name(kv.first);
+    if (key_ok && is_valid_crush_name(kv.second))
+      continue;
+    ldout(cct, 1) << "loc["
+                  << kv.first << "] = '"
+                  << kv.second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
+                  << dendl;
+    return false;
+  }
+  return true;
+}
+
+// Re-run one "take ... choose ... emit" portion of a rule over an existing
+// mapping, swapping overfull devices for underfull ones where possible.
+//
+//   stack          (type, fan-out) pairs, one per choose level, outermost first
+//   overfull       devices we would like to move data away from
+//   underfull      preferred replacement devices, tried in order
+//   more_underfull second-choice replacement devices, tried only when no
+//                  entry of underfull fits
+//   orig           the original mapping
+//   i              cursor into orig; advanced past every consumed entry
+//   used           replacement devices already handed out; updated here
+//   pw             in: working set to start from; out: resulting working set
+//   root_bucket    bucket the current take step starts from (must be < 0)
+//   rule           passed through to get_parent_of_type()
+//
+// Always returns 0.
+//
+// NOTE(review): the level-10 log statements that print *i and *tmpi can be
+// reached with the iterator equal to orig.end(); left as in the original.
+int CrushWrapper::_choose_type_stack(
+  CephContext *cct,
+  const vector<pair<int,int>>& stack,
+  const set<int>& overfull,
+  const vector<int>& underfull,
+  const vector<int>& more_underfull,
+  const vector<int>& orig,
+  vector<int>::const_iterator& i,
+  set<int>& used,
+  vector<int> *pw,
+  int root_bucket,
+  int rule) const
+{
+  vector<int> w = *pw;
+
+  ldout(cct, 10) << __func__ << " stack " << stack
+                 << " orig " << orig
+                 << " at " << *i
+                 << " pw " << *pw
+                 << dendl;
+  ceph_assert(root_bucket < 0);
+  // cumulative_fanout[j] = product of the fan-outs of all levels below j,
+  // i.e. how many entries of orig one choice at level j accounts for
+  vector<int> cumulative_fanout(stack.size());
+  int f = 1;
+  for (int j = (int)stack.size() - 1; j >= 0; --j) {
+    cumulative_fanout[j] = f;
+    f *= stack[j].second;
+  }
+  ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
+                 << dendl;
+
+  // identify underfull targets for each intermediate level.
+  // this serves two purposes:
+  //   1. we can tell when we are selecting a bucket that does not have any underfull
+  //      devices beneath it.  that means that if the current input includes an overfull
+  //      device, we won't be able to find an underfull device with this parent to
+  //      swap for it.
+  //   2. when we decide we should reject a bucket due to the above, this list gives us
+  //      a list of peers to consider that *do* have underfull devices available..  (we
+  //      are careful to pick one that has the same parent.)
+  vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
+  underfull_buckets.resize(stack.size() - 1);
+  for (auto osd : underfull) {
+    int item = osd;
+    // climb from the device towards the root, one stack level at a time
+    for (int j = (int)stack.size() - 2; j >= 0; --j) {
+      int type = stack[j].first;
+      item = get_parent_of_type(item, type, rule);
+      ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
+                     << " is " << item << dendl;
+      if (!subtree_contains(root_bucket, item)) {
+        ldout(cct, 20) << __func__ << " not in root subtree " << root_bucket << dendl;
+        continue;
+      }
+      underfull_buckets[j].insert(item);
+    }
+  }
+  ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
+
+  for (unsigned j = 0; j < stack.size(); ++j) {
+    int type = stack[j].first;
+    int fanout = stack[j].second;
+    int cum_fanout = cumulative_fanout[j];
+    ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
+                   << " cumulative " << cum_fanout
+                   << " w " << w << dendl;
+    // output of this level; becomes the working set of the next one
+    vector<int> o;
+    // intermediate levels only peek at orig through tmpi; the leaf level
+    // advances the caller's cursor i
+    auto tmpi = i;
+    if (i == orig.end()) {
+      ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
+      break;
+    }
+    for (auto from : w) {
+      ldout(cct, 10) << " from " << from << dendl;
+      // identify leaves under each choice.  we use this to check whether any of these
+      // leaves are overfull.  (if so, we need to make sure there are underfull candidates
+      // to swap for them.)
+      vector<set<int>> leaves;
+      leaves.resize(fanout);
+      for (int pos = 0; pos < fanout; ++pos) {
+        if (type > 0) {
+          // non-leaf
+          int item = get_parent_of_type(*tmpi, type, rule);
+          o.push_back(item);
+          int n = cum_fanout;
+          while (n-- && tmpi != orig.end()) {
+            leaves[pos].insert(*tmpi++);
+          }
+          ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
+                         << " of type " << type << " over leaves " << leaves[pos] << dendl;
+        } else {
+          // leaf
+          bool replaced = false;
+          if (overfull.count(*i)) {
+            // first choice: an unused underfull device under the same parent
+            // that is not already part of the original mapping
+            for (auto item : underfull) {
+              ldout(cct, 10) << __func__ << " pos " << pos
+                             << " was " << *i << " considering " << item
+                             << dendl;
+              if (used.count(item)) {
+                ldout(cct, 20) << __func__ << " in used " << used << dendl;
+                continue;
+              }
+              if (!subtree_contains(from, item)) {
+                ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
+                continue;
+              }
+              if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
+                ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
+                continue;
+              }
+              o.push_back(item);
+              used.insert(item);
+              ldout(cct, 10) << __func__ << " pos " << pos << " replace "
+                             << *i << " -> " << item << dendl;
+              replaced = true;
+              ceph_assert(i != orig.end());
+              ++i;
+              break;
+            }
+            if (!replaced) {
+              // second choice: same filters over the more_underfull list
+              for (auto item : more_underfull) {
+                ldout(cct, 10) << __func__ << " more underfull pos " << pos
+                               << " was " << *i << " considering " << item
+                               << dendl;
+                if (used.count(item)) {
+                  ldout(cct, 20) << __func__ << " in used " << used << dendl;
+                  continue;
+                }
+                if (!subtree_contains(from, item)) {
+                  ldout(cct, 20) << __func__ << " not in subtree " << from << dendl;
+                  continue;
+                }
+                if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
+                  ldout(cct, 20) << __func__ << " in orig " << orig << dendl;
+                  continue;
+                }
+                o.push_back(item);
+                used.insert(item);
+                ldout(cct, 10) << __func__ << " pos " << pos << " replace "
+                               << *i << " -> " << item << dendl;
+                replaced = true;
+                // ceph_assert, like the sibling branches: a plain assert
+                // disappears when NDEBUG is defined
+                ceph_assert(i != orig.end());
+                ++i;
+                break;
+              }
+            }
+          }
+          if (!replaced) {
+            ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
+                           << dendl;
+            ceph_assert(i != orig.end());
+            o.push_back(*i);
+            ++i;
+          }
+          if (i == orig.end()) {
+            ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
+            break;
+          }
+        }
+      }
+      if (j + 1 < stack.size()) {
+        // check if any buckets have overfull leaves but no underfull candidates
+        for (int pos = 0; pos < fanout; ++pos) {
+          if (underfull_buckets[j].count(o[pos]) == 0) {
+            // are any leaves overfull?
+            bool any_overfull = false;
+            for (auto osd : leaves[pos]) {
+              if (overfull.count(osd)) {
+                any_overfull = true;
+                break;
+              }
+            }
+            if (any_overfull) {
+              ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
+                             << ">0 leaves " << leaves[pos] << " is overfull; alts "
+                             << underfull_buckets[j]
+                             << dendl;
+              for (auto alt : underfull_buckets[j]) {
+                if (std::find(o.begin(), o.end(), alt) == o.end()) {
+                  // see if alt has the same parent
+                  if (j == 0 ||
+                      get_parent_of_type(o[pos], stack[j-1].first, rule) ==
+                      get_parent_of_type(alt, stack[j-1].first, rule)) {
+                    if (j) {
+                      ldout(cct, 10) << " replacing " << o[pos]
+                                     << " (which has no underfull leaves) with " << alt
+                                     << " (same parent "
+                                     << get_parent_of_type(alt, stack[j-1].first, rule) << " type "
+                                     << type << ")" << dendl;
+                    } else {
+                      ldout(cct, 10) << " replacing " << o[pos]
+                                     << " (which has no underfull leaves) with " << alt
+                                     << " (first level)" << dendl;
+                    }
+                    o[pos] = alt;
+                    break;
+                  } else {
+                    ldout(cct, 30) << " alt " << alt << " for " << o[pos]
+                                   << " has different parent, skipping" << dendl;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      if (i == orig.end()) {
+        ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
+        break;
+      }
+    }
+    ldout(cct, 10) << __func__ << " w <- " << o << " was " << w << dendl;
+    w.swap(o);
+  }
+  *pw = w;
+  return 0;
+}
+
+/**
+ * Replay the steps of rule @p ruleno and rebuild the mapping @p orig.
+ *
+ * TAKE resets the working set to the taken item, CHOOSE_* steps only
+ * accumulate (type, fan-out) pairs, CHOOSELEAF_* and EMIT hand the
+ * accumulated stack to _choose_type_stack(), and EMIT appends the working
+ * set to @p out.  Any other step op is ignored.
+ *
+ * @param cct            context used for logging
+ * @param ruleno         rule to replay (must exist; asserted)
+ * @param maxout         added to non-positive replica counts in a step
+ * @param overfull       forwarded to _choose_type_stack()
+ * @param underfull      forwarded to _choose_type_stack()
+ * @param more_underfull forwarded to _choose_type_stack()
+ * @param orig           the original mapping being walked
+ * @param out            cleared on entry, receives the emitted items
+ * @return 0 on success, or the negative value returned by
+ *         _choose_type_stack()
+ */
+int CrushWrapper::try_remap_rule(
+  CephContext *cct,
+  int ruleno,
+  int maxout,
+  const set<int>& overfull,
+  const vector<int>& underfull,
+  const vector<int>& more_underfull,
+  const vector<int>& orig,
+  vector<int> *out) const
+{
+  const crush_map *map = crush;
+  const crush_rule *rule = get_rule(ruleno);
+  ceph_assert(rule);
+
+  ldout(cct, 10) << __func__ << " ruleno " << ruleno
+		 << " numrep " << maxout << " overfull " << overfull
+		 << " underfull " << underfull
+		 << " more_underfull " << more_underfull
+		 << " orig " << orig
+		 << dendl;
+  vector<int> w; // working set
+  out->clear();
+
+  auto i = orig.begin();
+  set<int> used;
+
+  vector<pair<int,int>> type_stack;  // (type, fan-out)
+  int root_bucket = 0;
+
+  // a non-positive replica count in a step is relative to maxout
+  auto resolve_numrep = [maxout](int requested) {
+    return requested <= 0 ? requested + maxout : requested;
+  };
+  // run the accumulated stack; it is reset only when the call succeeds
+  // (on failure the caller returns immediately anyway)
+  auto run_type_stack = [&]() {
+    const int r = _choose_type_stack(cct, type_stack, overfull, underfull,
+				     more_underfull, orig, i, used, &w,
+				     root_bucket, ruleno);
+    if (r >= 0)
+      type_stack.clear();
+    return r;
+  };
+
+  for (unsigned step = 0; step < rule->len; ++step) {
+    const crush_rule_step& cur = rule->steps[step];
+    ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
+    switch (cur.op) {
+    case CRUSH_RULE_TAKE:
+      {
+	const int item = cur.arg1;
+	const int bucket_idx = -1 - item;
+	const bool is_device = item >= 0 && item < map->max_devices;
+	const bool is_bucket = bucket_idx >= 0 &&
+	  bucket_idx < map->max_buckets &&
+	  map->buckets[bucket_idx];
+	if (!is_device && !is_bucket) {
+	  // invalid take: log it and leave the working set untouched
+	  ldout(cct, 1) << " bad take value " << item << dendl;
+	  break;
+	}
+	w.assign(1, item);
+	root_bucket = item;
+	ldout(cct, 10) << __func__ << " take " << w << dendl;
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      {
+	const int type = cur.arg2;
+	type_stack.push_back(make_pair(type, resolve_numrep(cur.arg1)));
+	// chooseleaf on a non-leaf type implies one leaf per chosen bucket
+	if (type > 0)
+	  type_stack.push_back(make_pair(0, 1));
+	const int r = run_type_stack();
+	if (r < 0)
+	  return r;
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      // plain choose steps are only queued; they run at the next
+      // chooseleaf or emit
+      type_stack.push_back(make_pair(static_cast<int>(cur.arg2),
+				     resolve_numrep(cur.arg1)));
+      break;
+
+    case CRUSH_RULE_EMIT:
+      ldout(cct, 10) << " emit " << w << dendl;
+      if (!type_stack.empty()) {
+	const int r = run_type_stack();
+	if (r < 0)
+	  return r;
+      }
+      out->insert(out->end(), w.begin(), w.end());
+      w.clear();
+      break;
+
+    default:
+      // ignore
+      break;
+    }
+  }
+
+  return 0;
+}
+
+
+/**
+ * Set the weight-set weights of item @p id inside a single bucket.
+ *
+ * If the bucket has no weight-set in @p cmap yet, one is created and
+ * seeded from the bucket's own item weights.  Every occurrence of @p id in
+ * the bucket then gets @p weight (one value per position).  When anything
+ * changed, the per-position sums of the bucket are pushed upward through
+ * choose_args_adjust_item_weight().
+ *
+ * @param cct      context used for logging
+ * @param cmap     choose_args map to modify (passed by value; its args
+ *                 array is modified through the contained pointer)
+ * @param bucketid id of the bucket to look in (negative bucket id)
+ * @param id       item whose weight-set entries are updated
+ * @param weight   new weights, one per weight-set position
+ * @param ss       optional stream receiving a human-readable reason when
+ *                 nothing is done
+ * @return number of changes made; 0 if the bucket does not exist, has no
+ *         slot in @p cmap, or the position count does not match
+ */
+int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int bucketid,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  int changed = 0;
+  int bidx = -1 - bucketid;
+  // validate the index and the slot before touching the bucket
+  if (bidx < 0 || bidx >= crush->max_buckets ||
+      crush->buckets[bidx] == nullptr) {
+    if (ss)
+      *ss << "bucket " << bucketid << " does not exist";
+    ldout(cct, 10) << __func__ << " bucket " << bucketid
+		   << " does not exist" << dendl;
+    return 0;
+  }
+  crush_bucket *b = crush->buckets[bidx];
+  if (bidx >= (int)cmap.size) {
+    if (ss)
+      *ss << "no weight-set for bucket " << b->id;
+    ldout(cct, 10) << __func__ << " no crush_choose_arg for bucket " << b->id
+		   << dendl;
+    return 0;
+  }
+  crush_choose_arg *carg = &cmap.args[bidx];
+  if (carg->weight_set == nullptr) {
+    // create a weight-set for this bucket and populate it with the
+    // bucket weights
+    unsigned positions = get_choose_args_positions(cmap);
+    carg->weight_set_positions = positions;
+    // calloc(count, element_size): count first, element size second
+    carg->weight_set = static_cast<crush_weight_set*>(
+      calloc(positions, sizeof(crush_weight_set)));
+    // calloc may legitimately return NULL for a zero-sized request
+    ceph_assert(carg->weight_set || positions == 0);
+    for (unsigned p = 0; p < positions; ++p) {
+      carg->weight_set[p].size = b->size;
+      carg->weight_set[p].weights =
+	static_cast<__u32*>(calloc(b->size, sizeof(__u32)));
+      ceph_assert(carg->weight_set[p].weights || b->size == 0);
+      for (unsigned i = 0; i < b->size; ++i) {
+	carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
+      }
+    }
+    changed++;
+  }
+  if (carg->weight_set_positions != weight.size()) {
+    if (ss)
+      *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
+    ldout(cct, 10) << __func__ << " weight_set_positions != " << weight.size()
+		   << " for bucket " << b->id << dendl;
+    return 0;
+  }
+  // overwrite the weights of every occurrence of the item in this bucket
+  for (unsigned i = 0; i < b->size; i++) {
+    if (b->items[i] == id) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+	carg->weight_set[j].weights[i] = weight[j];
+      }
+      ldout(cct, 5) << __func__ << " set " << id << " to " << weight
+		    << " in bucket " << b->id << dendl;
+      changed++;
+    }
+  }
+  if (changed) {
+    // recompute this bucket's per-position total and propagate it to
+    // whichever buckets contain this bucket
+    vector<int> bucket_weight(weight.size(), 0);
+    for (unsigned i = 0; i < b->size; i++) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+	bucket_weight[j] += carg->weight_set[j].weights[i];
+      }
+    }
+    choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
+  }
+  return changed;
+}
+
+/**
+ * Apply @p weight to item @p id in the weight-sets of every bucket.
+ *
+ * Each existing bucket is offered to
+ * _choose_args_adjust_item_weight_in_bucket() and the change counts are
+ * summed.
+ *
+ * @return the total number of changes, or -ENOENT (with a message in
+ *         @p ss, if given) when no bucket reported a change
+ */
+int CrushWrapper::choose_args_adjust_item_weight(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
+  int total = 0;
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    // empty slots in the bucket array are skipped
+    if (crush_bucket *bucket = crush->buckets[slot]) {
+      total += _choose_args_adjust_item_weight_in_bucket(
+	cct, cmap, bucket->id, id, weight, ss);
+    }
+  }
+  if (total)
+    return total;
+  if (ss)
+    *ss << "item " << id << " not found in crush map";
+  return -ENOENT;
+}
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
new file mode 100644
index 00000000..136ad538
--- /dev/null
+++ b/src/crush/CrushWrapper.h
@@ -0,0 +1,1657 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_WRAPPER_H
+#define CEPH_CRUSH_WRAPPER_H
+
+#include <stdlib.h>
+#include <map>
+#include <set>
+#include <string>
+
+#include <iosfwd>
+
+#include "include/types.h"
+
+extern "C" {
+#include "crush.h"
+#include "hash.h"
+#include "mapper.h"
+#include "builder.h"
+}
+
+#include "include/ceph_assert.h"
+#include "include/err.h"
+#include "include/encoding.h"
+#include "include/mempool.h"
+
+#include "common/Mutex.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace CrushTreeDumper {
+ typedef mempool::osdmap::map<int64_t,string> name_map_t;
+}
+
+WRITE_RAW_ENCODER(crush_rule_mask) // it's all u8's
+
+// Append a crush_rule_step to bl by encoding its op, arg1 and arg2 fields,
+// in that order.
+inline void encode(const crush_rule_step &s, bufferlist &bl)
+{
+ using ceph::encode;
+ encode(s.op, bl);
+ encode(s.arg1, bl);
+ encode(s.arg2, bl);
+}
+// Read a crush_rule_step from p: op, arg1, arg2, the same order encode()
+// writes them.
+inline void decode(crush_rule_step &s, bufferlist::const_iterator &p)
+{
+ using ceph::decode;
+ decode(s.op, p);
+ decode(s.arg1, p);
+ decode(s.arg2, p);
+}
+
+class CrushWrapper {
+public:
+ // magic value used by OSDMap for a "default" fallback choose_args, used if
+ // the choose_arg_map passed to do_rule does not exist. if this also
+ // doesn't exist, fall back to canonical weights.
+ enum {
+ DEFAULT_CHOOSE_ARGS = -1
+ };
+
+ std::map<int32_t, string> type_map; /* bucket/device type names */
+ std::map<int32_t, string> name_map; /* bucket/device names */
+ std::map<int32_t, string> rule_name_map;
+
+ std::map<int32_t, int32_t> class_map; /* item id -> class id */
+ std::map<int32_t, string> class_name; /* class id -> class name */
+ std::map<string, int32_t> class_rname; /* class name -> class id */
+ std::map<int32_t, map<int32_t, int32_t> > class_bucket; /* bucket[id][class] == id */
+ std::map<int64_t, crush_choose_arg_map> choose_args;
+
+private:
+ struct crush_map *crush = nullptr;
+
+ bool have_uniform_rules = false;
+
+ /* reverse maps */
+ mutable bool have_rmaps = false;
+ mutable std::map<string, int> type_rmap, name_rmap, rule_name_rmap;
+ // Build the three name -> id reverse maps from type_map, name_map and
+ // rule_name_map. Does nothing if have_rmaps is already set; sets it
+ // afterwards. Callable from const methods because the reverse maps and
+ // the flag are mutable.
+ void build_rmaps() const {
+ if (have_rmaps) return;
+ build_rmap(type_map, type_rmap);
+ build_rmap(name_map, name_rmap);
+ build_rmap(rule_name_map, rule_name_rmap);
+ have_rmaps = true;
+ }
+ // Replace the contents of r with the inverse of f (name -> id). f is
+ // walked in ascending id order, so if two ids share a name the larger id
+ // ends up in r.
+ void build_rmap(const map<int, string> &f, std::map<string, int> &r) const {
+   r.clear();
+   for (const auto& entry : f) {
+     r[entry.second] = entry.first;
+   }
+ }
+
+public:
+ CrushWrapper(const CrushWrapper& other);
+ const CrushWrapper& operator=(const CrushWrapper& other);
+
+ // Default constructor: allocates a fresh crush map via create().
+ CrushWrapper() {
+ create();
+ }
+ // Destroys the wrapped crush map (if any) and clears choose_args.
+ ~CrushWrapper() {
+ if (crush)
+ crush_destroy(crush);
+ choose_args_clear();
+ }
+
+ // Raw access to the wrapped crush_map pointer (not a copy).
+ crush_map *get_crush_map() { return crush; }
+
+ /* building */
+ // (Re)initialize the wrapper: destroy any existing crush map, allocate a
+ // new one with crush_create(), clear choose_args, invalidate the cached
+ // reverse maps and apply the default tunables. Asserts that the
+ // allocation succeeded.
+ void create() {
+ if (crush)
+ crush_destroy(crush);
+ crush = crush_create();
+ choose_args_clear();
+ ceph_assert(crush);
+ have_rmaps = false;
+
+ set_tunables_default();
+ }
+
+ /**
+ * true if any rule has a rule id != its position in the array
+ *
+ * These indicate "ruleset" IDs that were created by older versions
+ * of Ceph. They are cleaned up in renumber_rules so that eventually
+ * we can remove the code for handling them.
+ */
+ bool has_legacy_rule_ids() const;
+
+ /**
+ * fix rules whose ruleid != ruleset
+ *
+ * These rules were created in older versions of Ceph. The concept
+ * of a ruleset no longer exists.
+ *
+ * Return a map of old ID -> new ID. Caller must update OSDMap
+ * to use new IDs.
+ */
+ std::map<int, int> renumber_rules();
+
+ /// true if any buckets that aren't straw2
+ bool has_non_straw2_buckets() const;
+
+ // tunables
+ // Each set_tunables_<release>() below overwrites the same seven tunable
+ // fields of the crush map with that release's profile.
+
+ // argonaut profile: local tries 2, fallback tries 5, total tries 19, all
+ // chooseleaf_* flags off, legacy bucket algorithms.
+ void set_tunables_argonaut() {
+ crush->choose_local_tries = 2;
+ crush->choose_local_fallback_tries = 5;
+ crush->choose_total_tries = 19;
+ crush->chooseleaf_descend_once = 0;
+ crush->chooseleaf_vary_r = 0;
+ crush->chooseleaf_stable = 0;
+ crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ // bobtail profile: no local retries, 50 total tries, descend_once on.
+ void set_tunables_bobtail() {
+ crush->choose_local_tries = 0;
+ crush->choose_local_fallback_tries = 0;
+ crush->choose_total_tries = 50;
+ crush->chooseleaf_descend_once = 1;
+ crush->chooseleaf_vary_r = 0;
+ crush->chooseleaf_stable = 0;
+ crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ // firefly profile: bobtail plus chooseleaf_vary_r = 1.
+ void set_tunables_firefly() {
+ crush->choose_local_tries = 0;
+ crush->choose_local_fallback_tries = 0;
+ crush->choose_total_tries = 50;
+ crush->chooseleaf_descend_once = 1;
+ crush->chooseleaf_vary_r = 1;
+ crush->chooseleaf_stable = 0;
+ crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ // hammer profile: firefly values, but the allowed bucket algorithms are
+ // spelled out as uniform, list, straw and straw2.
+ void set_tunables_hammer() {
+ crush->choose_local_tries = 0;
+ crush->choose_local_fallback_tries = 0;
+ crush->choose_total_tries = 50;
+ crush->chooseleaf_descend_once = 1;
+ crush->chooseleaf_vary_r = 1;
+ crush->chooseleaf_stable = 0;
+ crush->allowed_bucket_algs =
+ (1 << CRUSH_BUCKET_UNIFORM) |
+ (1 << CRUSH_BUCKET_LIST) |
+ (1 << CRUSH_BUCKET_STRAW) |
+ (1 << CRUSH_BUCKET_STRAW2);
+ }
+ // jewel profile: hammer plus chooseleaf_stable = 1.
+ void set_tunables_jewel() {
+ crush->choose_local_tries = 0;
+ crush->choose_local_fallback_tries = 0;
+ crush->choose_total_tries = 50;
+ crush->chooseleaf_descend_once = 1;
+ crush->chooseleaf_vary_r = 1;
+ crush->chooseleaf_stable = 1;
+ crush->allowed_bucket_algs =
+ (1 << CRUSH_BUCKET_UNIFORM) |
+ (1 << CRUSH_BUCKET_LIST) |
+ (1 << CRUSH_BUCKET_STRAW) |
+ (1 << CRUSH_BUCKET_STRAW2);
+ }
+
+ // legacy == argonaut profile with straw_calc_version 0.
+ void set_tunables_legacy() {
+ set_tunables_argonaut();
+ crush->straw_calc_version = 0;
+ }
+ // optimal == jewel profile with straw_calc_version 1.
+ void set_tunables_optimal() {
+ set_tunables_jewel();
+ crush->straw_calc_version = 1;
+ }
+ // default currently sets exactly the same values as optimal.
+ void set_tunables_default() {
+ set_tunables_jewel();
+ crush->straw_calc_version = 1;
+ }
+
+ // Plain getters/setters for the individual tunable fields of the wrapped
+ // crush map. Setters store the value as given unless noted otherwise.
+ int get_choose_local_tries() const {
+ return crush->choose_local_tries;
+ }
+ void set_choose_local_tries(int n) {
+ crush->choose_local_tries = n;
+ }
+
+ int get_choose_local_fallback_tries() const {
+ return crush->choose_local_fallback_tries;
+ }
+ void set_choose_local_fallback_tries(int n) {
+ crush->choose_local_fallback_tries = n;
+ }
+
+ int get_choose_total_tries() const {
+ return crush->choose_total_tries;
+ }
+ void set_choose_total_tries(int n) {
+ crush->choose_total_tries = n;
+ }
+
+ int get_chooseleaf_descend_once() const {
+ return crush->chooseleaf_descend_once;
+ }
+ // Unlike the other setters, this one normalizes n to 0 or 1.
+ void set_chooseleaf_descend_once(int n) {
+ crush->chooseleaf_descend_once = !!n;
+ }
+
+ int get_chooseleaf_vary_r() const {
+ return crush->chooseleaf_vary_r;
+ }
+ void set_chooseleaf_vary_r(int n) {
+ crush->chooseleaf_vary_r = n;
+ }
+
+ int get_chooseleaf_stable() const {
+ return crush->chooseleaf_stable;
+ }
+ void set_chooseleaf_stable(int n) {
+ crush->chooseleaf_stable = n;
+ }
+
+ int get_straw_calc_version() const {
+ return crush->straw_calc_version;
+ }
+ void set_straw_calc_version(int n) {
+ crush->straw_calc_version = n;
+ }
+
+ // Bitmask with one bit per bucket algorithm, (1 << CRUSH_BUCKET_*).
+ unsigned get_allowed_bucket_algs() const {
+ return crush->allowed_bucket_algs;
+ }
+ void set_allowed_bucket_algs(unsigned n) {
+ crush->allowed_bucket_algs = n;
+ }
+
+ // has_<release>_tunables(): true when all seven tunable fields match the
+ // values that set_tunables_<release>() assigns. straw_calc_version is not
+ // part of any of these comparisons.
+ bool has_argonaut_tunables() const {
+ return
+ crush->choose_local_tries == 2 &&
+ crush->choose_local_fallback_tries == 5 &&
+ crush->choose_total_tries == 19 &&
+ crush->chooseleaf_descend_once == 0 &&
+ crush->chooseleaf_vary_r == 0 &&
+ crush->chooseleaf_stable == 0 &&
+ crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ bool has_bobtail_tunables() const {
+ return
+ crush->choose_local_tries == 0 &&
+ crush->choose_local_fallback_tries == 0 &&
+ crush->choose_total_tries == 50 &&
+ crush->chooseleaf_descend_once == 1 &&
+ crush->chooseleaf_vary_r == 0 &&
+ crush->chooseleaf_stable == 0 &&
+ crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ bool has_firefly_tunables() const {
+ return
+ crush->choose_local_tries == 0 &&
+ crush->choose_local_fallback_tries == 0 &&
+ crush->choose_total_tries == 50 &&
+ crush->chooseleaf_descend_once == 1 &&
+ crush->chooseleaf_vary_r == 1 &&
+ crush->chooseleaf_stable == 0 &&
+ crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+ }
+ bool has_hammer_tunables() const {
+ return
+ crush->choose_local_tries == 0 &&
+ crush->choose_local_fallback_tries == 0 &&
+ crush->choose_total_tries == 50 &&
+ crush->chooseleaf_descend_once == 1 &&
+ crush->chooseleaf_vary_r == 1 &&
+ crush->chooseleaf_stable == 0 &&
+ crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+ (1 << CRUSH_BUCKET_LIST) |
+ (1 << CRUSH_BUCKET_STRAW) |
+ (1 << CRUSH_BUCKET_STRAW2));
+ }
+ bool has_jewel_tunables() const {
+ return
+ crush->choose_local_tries == 0 &&
+ crush->choose_local_fallback_tries == 0 &&
+ crush->choose_total_tries == 50 &&
+ crush->chooseleaf_descend_once == 1 &&
+ crush->chooseleaf_vary_r == 1 &&
+ crush->chooseleaf_stable == 1 &&
+ crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+ (1 << CRUSH_BUCKET_LIST) |
+ (1 << CRUSH_BUCKET_STRAW) |
+ (1 << CRUSH_BUCKET_STRAW2));
+ }
+
+ // "optimal" is an alias for the jewel profile check.
+ bool has_optimal_tunables() const {
+ return has_jewel_tunables();
+ }
+ // "legacy" is an alias for the argonaut profile check.
+ bool has_legacy_tunables() const {
+ return has_argonaut_tunables();
+ }
+
+ // The has_nondefault_tunables*() checks compare against the values that
+ // set_tunables_argonaut() assigns (2 / 5 / 19 and all flags 0); each one
+ // covers a separate group of fields.
+
+ // true if any of the three "tries" counters differs from 2 / 5 / 19
+ bool has_nondefault_tunables() const {
+ return
+ (crush->choose_local_tries != 2 ||
+ crush->choose_local_fallback_tries != 5 ||
+ crush->choose_total_tries != 19);
+ }
+ // true if chooseleaf_descend_once is enabled
+ bool has_nondefault_tunables2() const {
+ return
+ crush->chooseleaf_descend_once != 0;
+ }
+ // true if chooseleaf_vary_r is enabled
+ bool has_nondefault_tunables3() const {
+ return
+ crush->chooseleaf_vary_r != 0;
+ }
+ // true if chooseleaf_stable is enabled
+ bool has_nondefault_tunables5() const {
+ return
+ crush->chooseleaf_stable != 0;
+ }
+
+ bool has_v2_rules() const;
+ bool has_v3_rules() const;
+ bool has_v4_buckets() const;
+ bool has_v5_rules() const;
+ bool has_choose_args() const; // any choose_args
+ bool has_incompat_choose_args() const; // choose_args that can't be made compat
+
+ bool is_v2_rule(unsigned ruleid) const;
+ bool is_v3_rule(unsigned ruleid) const;
+ bool is_v5_rule(unsigned ruleid) const;
+
+ // Name of the oldest release profile that satisfies everything this map
+ // uses. The checks run from the newest requirement down, so the first
+ // match wins.
+ string get_min_required_version() const {
+   if (has_v5_rules() || has_nondefault_tunables5()) {
+     return "jewel";
+   }
+   if (has_v4_buckets()) {
+     return "hammer";
+   }
+   if (has_nondefault_tunables3()) {
+     return "firefly";
+   }
+   if (has_nondefault_tunables2() || has_nondefault_tunables()) {
+     return "bobtail";
+   }
+   return "argonaut";
+ }
+
+ // default bucket types
+ // Pick the bucket algorithm to use by default: the first entry of the
+ // preference list whose bit is set in allowed_bucket_algs, or 0 if none
+ // of them is allowed.
+ unsigned get_default_bucket_alg() const {
+   // most preferred first
+   const unsigned preference[] = {
+     CRUSH_BUCKET_STRAW2,
+     CRUSH_BUCKET_STRAW,
+     CRUSH_BUCKET_TREE,
+     CRUSH_BUCKET_LIST,
+     CRUSH_BUCKET_UNIFORM,
+   };
+   for (unsigned alg : preference) {
+     if (crush->allowed_bucket_algs & (1 << alg))
+       return alg;
+   }
+   return 0;
+ }
+
+ // bucket types
+ // Number of entries in type_map.
+ int get_num_type_names() const {
+   return static_cast<int>(type_map.size());
+ }
+ // Largest type id present in type_map, or 0 when the map is empty.
+ int get_max_type_id() const {
+   return type_map.empty() ? 0 : type_map.rbegin()->first;
+ }
+ // Look up a type id by name via the reverse map (built on demand).
+ // Returns -1 if the name is unknown.
+ int get_type_id(const string& name) const {
+   build_rmaps();
+   auto hit = type_rmap.find(name);
+   if (hit == type_rmap.end())
+     return -1;
+   return hit->second;
+ }
+ // Name of type t, or a null pointer if t is not in type_map. The pointer
+ // refers to the string stored inside type_map.
+ const char *get_type_name(int t) const {
+   auto hit = type_map.find(t);
+   return hit == type_map.end() ? nullptr : hit->second.c_str();
+ }
+ // Set (or replace) the name of type i and keep the cached reverse map in
+ // step with type_map.
+ void set_type_name(int i, const string& name) {
+   auto existing = type_map.find(i);
+   if (existing != type_map.end() && existing->second != name) {
+     // Renaming: the cached reverse map would keep the old name pointing
+     // at i. Drop the cache so the next lookup rebuilds it from the
+     // forward maps.
+     have_rmaps = false;
+   }
+   type_map[i] = name;
+   if (have_rmaps)
+     type_rmap[name] = i;
+ }
+
+ // item/bucket names
+ // true if some item or bucket has this name (uses the reverse map, which
+ // is built on demand)
+ bool name_exists(const string& name) const {
+ build_rmaps();
+ return name_rmap.count(name);
+ }
+ // true if id i has an entry in name_map
+ bool item_exists(int i) const {
+ return name_map.count(i);
+ }
+ // Look up an item/bucket id by name. NOTE: an unknown name yields 0,
+ // which is indistinguishable from a real id 0; use name_exists() first
+ // when that matters.
+ int get_item_id(const string& name) const {
+   build_rmaps();
+   auto hit = name_rmap.find(name);
+   if (hit == name_rmap.end())
+     return 0; /* hrm */
+   return hit->second;
+ }
+ // Name of item/bucket t, or a null pointer if t has no entry in
+ // name_map. The pointer refers to the string stored inside name_map.
+ const char *get_item_name(int t) const {
+   auto hit = name_map.find(t);
+   return hit == name_map.end() ? nullptr : hit->second.c_str();
+ }
+ // Set (or replace) the name of item/bucket i.
+ // Returns -EINVAL if the name is not a valid crush name, 0 otherwise.
+ int set_item_name(int i, const string& name) {
+   if (!is_valid_crush_name(name))
+     return -EINVAL;
+   auto existing = name_map.find(i);
+   if (existing != name_map.end() && existing->second != name) {
+     // Renaming: the cached reverse map would keep the old name pointing
+     // at i (so name_exists(old) would stay true). Drop the cache so the
+     // next lookup rebuilds it from the forward maps.
+     have_rmaps = false;
+   }
+   name_map[i] = name;
+   if (have_rmaps)
+     name_rmap[name] = i;
+   return 0;
+ }
+ // Exchange the names of ids a and b. An id without a name gets an empty
+ // entry created by the map lookup. The reverse map is patched only if it
+ // has been built.
+ void swap_names(int a, int b) {
+   string &name_of_a = name_map[a];
+   string &name_of_b = name_map[b];
+   name_of_a.swap(name_of_b);
+   if (!have_rmaps)
+     return;
+   // after the swap name_of_b holds a's former name and vice versa
+   name_rmap[name_of_b] = b;
+   name_rmap[name_of_a] = a;
+ }
+ int split_id_class(int i, int *idout, int *classout) const;
+
+ // true if a device class with this name is registered
+ bool class_exists(const string& name) const {
+   return class_rname.find(name) != class_rname.end();
+ }
+ // Name of class id i, or a null pointer if the id is unknown.
+ const char *get_class_name(int i) const {
+   auto hit = class_name.find(i);
+   return hit == class_name.end() ? nullptr : hit->second.c_str();
+ }
+ // Id of the class called name, or -EINVAL if there is no such class.
+ int get_class_id(const string& name) const {
+   auto hit = class_rname.find(name);
+   if (hit == class_rname.end())
+     return -EINVAL;
+   return hit->second;
+ }
+ // Unregister a device class by name. Both directions (name -> id and
+ // id -> name) must be present; otherwise nothing is removed and -ENOENT
+ // is returned. Returns 0 on success.
+ int remove_class_name(const string& name) {
+   auto by_name = class_rname.find(name);
+   if (by_name == class_rname.end())
+     return -ENOENT;
+   auto by_id = class_name.find(by_name->second);
+   if (by_id == class_name.end())
+     return -ENOENT;
+   class_name.erase(by_id);
+   class_rname.erase(by_name);
+   return 0;
+ }
+
+ int32_t _alloc_class_id() const;
+
+ // Return the id of class name, registering it under a newly allocated id
+ // (both directions) if it is not known yet.
+ int get_or_create_class_id(const string& name) {
+   const int known = get_class_id(name);
+   if (known >= 0)
+     return known;
+   const int fresh = _alloc_class_id();
+   class_name[fresh] = name;
+   class_rname[name] = fresh;
+   return fresh;
+ }
+
+ // Class name of item t; a null pointer if t has no class assigned (or
+ // the assigned class id has no name).
+ const char *get_item_class(int t) const {
+   auto hit = class_map.find(t);
+   return hit == class_map.end() ? nullptr : get_class_name(hit->second);
+ }
+ // Class id of item t, or -ENOENT if t has no class assigned.
+ int get_item_class_id(int t) const {
+   auto hit = class_map.find(t);
+   return hit == class_map.end() ? -ENOENT : hit->second;
+ }
+ // Assign item i to the class called name, creating the class if needed.
+ // Returns -EINVAL for an invalid crush name, 0 otherwise.
+ int set_item_class(int i, const string& name) {
+ if (!is_valid_crush_name(name))
+ return -EINVAL;
+ class_map[i] = get_or_create_class_id(name);
+ return 0;
+ }
+ // Assign item i to class id c without any validation.
+ // NOTE: unlike the string overload this returns c, not 0.
+ int set_item_class(int i, int c) {
+ class_map[i] = c;
+ return c;
+ }
+ // Fill *devices with every non-negative item id in class_map that
+ // belongs to the class called name. *devices is always cleared first and
+ // is left empty if the class does not exist.
+ void get_devices_by_class(const string &name, set<int> *devices) const {
+   ceph_assert(devices);
+   devices->clear();
+   if (!class_exists(name))
+     return;
+   const auto wanted = get_class_id(name);
+   for (auto it = class_map.begin(); it != class_map.end(); ++it) {
+     // negative ids are buckets; only devices are reported
+     if (it->first < 0)
+       continue;
+     if (it->second == wanted)
+       devices->insert(it->first);
+   }
+ }
+ // Drop item i's class assignment; a no-op if it has none.
+ void class_remove_item(int i) {
+   class_map.erase(i);
+ }
+ int can_rename_item(const string& srcname,
+ const string& dstname,
+ ostream *ss) const;
+ int rename_item(const string& srcname,
+ const string& dstname,
+ ostream *ss);
+ int can_rename_bucket(const string& srcname,
+ const string& dstname,
+ ostream *ss) const;
+ int rename_bucket(const string& srcname,
+ const string& dstname,
+ ostream *ss);
+
+ // rule names
+ int rename_rule(const string& srcname,
+ const string& dstname,
+ ostream *ss);
+ // true if a rule with this name exists (reverse map is built on demand)
+ bool rule_exists(string name) const {
+   build_rmaps();
+   return rule_name_rmap.find(name) != rule_name_rmap.end();
+ }
+ // Id of the rule called name, or -ENOENT if there is no such rule.
+ int get_rule_id(string name) const {
+   build_rmaps();
+   auto hit = rule_name_rmap.find(name);
+   if (hit == rule_name_rmap.end())
+     return -ENOENT;
+   return hit->second;
+ }
+ // Name of rule t, or a null pointer if t has no entry in rule_name_map.
+ const char *get_rule_name(int t) const {
+   auto hit = rule_name_map.find(t);
+   return hit == rule_name_map.end() ? nullptr : hit->second.c_str();
+ }
+ // Set (or replace) the name of rule i and keep the cached reverse map in
+ // step with rule_name_map.
+ void set_rule_name(int i, const string& name) {
+   auto existing = rule_name_map.find(i);
+   if (existing != rule_name_map.end() && existing->second != name) {
+     // Renaming: the cached reverse map would keep the old name pointing
+     // at i. Drop the cache so the next lookup rebuilds it from the
+     // forward maps.
+     have_rmaps = false;
+   }
+   rule_name_map[i] = name;
+   if (have_rmaps)
+     rule_name_rmap[name] = i;
+ }
+ // An item counts as a "shadow" item when it has a name and that name is
+ // NOT a valid crush name. Items without a name are not shadow items.
+ bool is_shadow_item(int id) const {
+ const char *name = get_item_name(id);
+ return name && !is_valid_crush_name(name);
+ }
+
+
+ /**
+ * find tree nodes referenced by rules by a 'take' command
+ *
+ * Note that these may not be parentless roots.
+ */
+ void find_takes(set<int> *roots) const;
+ void find_takes_by_rule(int rule, set<int> *roots) const;
+
+ /**
+ * find tree roots
+ *
+ * These are parentless nodes in the map.
+ */
+ void find_roots(set<int> *roots) const;
+
+
+ /**
+ * find tree roots that contain shadow (device class) items only
+ */
+ // Adds every root for which is_shadow_item() is true to *roots
+ // (*roots is not cleared first).
+ void find_shadow_roots(set<int> *roots) const {
+   set<int> candidates;
+   find_roots(&candidates);
+   for (auto it = candidates.begin(); it != candidates.end(); ++it) {
+     if (!is_shadow_item(*it))
+       continue;
+     roots->insert(*it);
+   }
+ }
+
+ /**
+ * find tree roots that are not shadow (device class) items
+ *
+ * These are parentless nodes in the map that are not shadow
+ * items for device classes.
+ */
+ // Adds every root for which is_shadow_item() is false to *roots
+ // (*roots is not cleared first).
+ void find_nonshadow_roots(set<int> *roots) const {
+   set<int> candidates;
+   find_roots(&candidates);
+   for (auto it = candidates.begin(); it != candidates.end(); ++it) {
+     if (is_shadow_item(*it))
+       continue;
+     roots->insert(*it);
+   }
+ }
+
+ /**
+ * see if an item is contained within a subtree
+ *
+ * @param root haystack
+ * @param item needle
+ * @return true if the item is located beneath the given node
+ */
+ bool subtree_contains(int root, int item) const;
+
+private:
+ /**
+ * search for an item in any bucket
+ *
+ * @param i item
+ * @return true if present
+ */
+ bool _search_item_exists(int i) const;
+ bool is_parent_of(int child, int p) const;
+public:
+
+ /**
+ * see if item is located where we think it is
+ *
+ * This verifies that the given item is located at a particular
+ * location in the hierarchy. However, that check is imprecise; we
+ * are actually verifying that the most specific location key/value
+ * is correct. For example, if loc specifies that rack=foo and
+ * host=bar, it will verify that host=bar is correct; any placement
+ * above that level in the hierarchy is ignored. This matches the
+ * semantics for insert_item().
+ *
+ * @param cct cct
+ * @param item item id
+ * @param loc location to check (map of type to bucket names)
+ * @param weight optional pointer to weight of item at that location
+ * @return true if item is at specified location
+ */
+ bool check_item_loc(CephContext *cct, int item, const map<string,string>& loc, int *iweight);
+ // Floating-point convenience overload: runs the integer check and, if
+ // weight is non-null, stores the weight converted from 16.16 fixed point
+ // (raw value / 0x10000).
+ bool check_item_loc(CephContext *cct, int item, const map<string,string>& loc, float *weight) {
+   // Start from a defined value: the integer overload is not guaranteed
+   // (from what this header shows) to write iweight on every path, and
+   // reading it uninitialized below would be undefined behavior.
+   int iweight = 0;
+   bool ret = check_item_loc(cct, item, loc, &iweight);
+   if (weight)
+     *weight = (float)iweight / (float)0x10000;
+   return ret;
+ }
+
+
+ /**
+ * returns the (type, name) of the parent bucket of id
+ *
+ * FIXME: ambiguous for items that occur multiple times in the map
+ */
+ pair<string,string> get_immediate_parent(int id, int *ret = NULL) const;
+
+ int get_immediate_parent_id(int id, int *parent) const;
+
+ /**
+ * return ancestor of the given type, or 0 if none
+ * can pass in a specific crush **rule** to return ancestor from that rule only
+ * (parent is always a bucket and thus <0)
+ */
+ int get_parent_of_type(int id, int type, int rule = -1) const;
+
+ /**
+ * get the fully qualified location of a device by successively finding
+ * parents beginning at ID and ending at highest type number specified in
+ * the CRUSH map which assumes that if device foo is under device bar, the
+ * type_id of foo < bar where type_id is the integer specified in the CRUSH map
+ *
+ * returns the location in the form of (type=foo) where type is a type of bucket
+ * specified in the CRUSH map and foo is a name specified in the CRUSH map
+ */
+ map<string, string> get_full_location(int id) const;
+
+ /**
+ * return location map for a item, by name
+ */
+ int get_full_location(
+ const string& name,
+ std::map<string,string> *ploc);
+
+ /*
+ * identical to get_full_location(int id) although it returns the type/name
+ * pairs in the order they occur in the hierarchy.
+ *
+ * returns -ENOENT if id is not found.
+ */
+ int get_full_location_ordered(int id, vector<pair<string, string> >& path) const;
+
+ /*
+ * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path),
+ * although it returns a concatenated string with the type/name pairs in descending
+ * hierarchical order with format key1=val1,key2=val2.
+ *
+ * returns the location in descending hierarchy as a string.
+ */
+ string get_full_location_ordered_string(int id) const;
+
+ /**
+ * returns (type_id, type) of all parent buckets between id and
+ * default, can be used to check for anomalous CRUSH maps
+ */
+ map<int, string> get_parent_hierarchy(int id) const;
+
+ /**
+ * enumerate immediate children of given node
+ *
+ * @param id parent bucket or device id
+ * @return number of items, or error
+ */
+ int get_children(int id, list<int> *children) const;
+ /**
+ * enumerate all children of given node
+ *
+ * @param id parent bucket or device id
+ * @return number of items, or error
+ */
+ int get_all_children(int id, set<int> *children) const;
+ void get_children_of_type(int id,
+ int type,
+ vector<int> *children,
+ bool exclude_shadow = true) const;
+ /**
+ * enumerate all subtrees by type
+ */
+ void get_subtree_of_type(int type, vector<int> *subtrees);
+
+
+ /**
+ * verify upmapping results.
+ * return 0 on success or a negative errno on error.
+ */
+ int verify_upmap(CephContext *cct,
+ int rule_id,
+ int pool_size,
+ const vector<int>& up);
+
+ /**
+ * enumerate leaves(devices) of given node
+ *
+ * @param name parent bucket name
+ * @return 0 on success or a negative errno on error.
+ */
+ int get_leaves(const string &name, set<int> *leaves) const;
+
+private:
+ int _get_leaves(int id, list<int> *leaves) const; // worker
+
+public:
+ /**
+ * insert an item into the map at a specific position
+ *
+ * Add an item at a specific location in the hierarchy.
+ * Specifically, we look for the most specific location constraint
+ * for which a bucket already exists, and then create intervening
+ * buckets beneath that in order to place the item.
+ *
+ * Note that any location specifiers *above* the most specific match
+ * are ignored. For example, if we specify that osd.12 goes in
+ * host=foo, rack=bar, and row=baz, and rack=bar is the most
+ * specific match, we will create host=foo beneath that point and
+ * put osd.12 inside it. However, we will not verify that rack=bar
+ * is beneath row=baz or move it.
+ *
+ * In short, we will build out a hierarchy, and move leaves around,
+ * but not adjust the hierarchy's internal structure. Yet.
+ *
+ * If the item is already present in the map, we will return EEXIST.
+ * If the location key/value pairs are nonsensical
+ * (rack=nameofdevice), or the location specifiers do not attach us
+ * to any existing part of the hierarchy, we will return EINVAL.
+ *
+ * @param cct cct
+ * @param id item id
+ * @param weight item weight
+ * @param name item name
+ * @param loc location (map of type to bucket names)
+ * @param init_weight_sets initialize weight-set weights to weight (vs 0)
+ * @return 0 for success, negative on error
+ */
+ int insert_item(CephContext *cct, int id, float weight, string name,
+ const map<string,string>& loc,
+ bool init_weight_sets=true);
+
+ /**
+ * move a bucket in the hierarchy to the given location
+ *
+ * This has the same location and ancestor creation behavior as
+ * insert_item(), but will relocate the specified existing bucket.
+ *
+ * @param cct cct
+ * @param id bucket id
+ * @param loc location (map of type to bucket names)
+ * @return 0 for success, negative on error
+ */
+ int move_bucket(CephContext *cct, int id, const map<string,string>& loc);
+
+ /**
+ * swap bucket contents of two buckets without touching bucket ids
+ *
+ * @param cct cct
+ * @param src bucket a
+ * @param dst bucket b
+ * @return 0 for success, negative on error
+ */
+ int swap_bucket(CephContext *cct, int src, int dst);
+
+ /**
+ * add a link to an existing bucket in the hierarchy to the new location
+ *
+ * This has the same location and ancestor creation behavior as
+ * insert_item(), but will add a new link to the specified existing
+ * bucket.
+ *
+ * @param cct cct
+ * @param id bucket id
+ * @param loc location (map of type to bucket names)
+ * @return 0 for success, negative on error
+ */
+ int link_bucket(CephContext *cct, int id, const map<string,string>& loc);
+
+ /**
+ * add or update an item's position in the map
+ *
+ * This is analogous to insert_item, except we will move an item if
+ * it is already present.
+ *
+ * @param cct cct
+ * @param id item id
+ * @param weight item weight
+ * @param name item name
+ * @param loc location (map of type to bucket names)
+ * @return 0 for no change, 1 for successful change, negative on error
+ */
+ int update_item(CephContext *cct, int id, float weight, string name, const map<string,string>& loc);
+
+ /**
+ * create or move an item, but do not adjust its weight if it already exists
+ *
+ * @param cct cct
+ * @param item item id
+ * @param weight initial item weight (if we need to create it)
+ * @param name item name
+ * @param loc location (map of type to bucket names)
+ * @param init_weight_sets initialize weight-set values to weight (vs 0)
+ * @return 0 for no change, 1 for successful change, negative on error
+ */
+ int create_or_move_item(CephContext *cct, int item, float weight, string name,
+ const map<string,string>& loc,
+ bool init_weight_sets=true);
+
+ /**
+ * remove all instances of an item from the map
+ *
+ * @param cct cct
+ * @param id item id to remove
+ * @param unlink_only unlink but do not remove bucket (useful if multiple links or not empty)
+ * @return 0 on success, negative on error
+ */
+ int remove_item(CephContext *cct, int id, bool unlink_only);
+
+ /**
+ * recursively remove buckets starting at item and stop removing
+ * when a bucket is in use.
+ *
+ * @param item id to remove
+ * @return 0 on success, negative on error
+ */
+ int remove_root(CephContext *cct, int item);
+
+private:
+ // private helpers (their implementations are not part of this header)
+ bool _maybe_remove_last_instance(CephContext *cct, int id, bool unlink_only);
+ int _remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+ bool _bucket_is_in_use(int id);
+public:
+ /**
+ * remove all instances of an item nested beneath a certain point from the map
+ *
+ * @param cct cct
+ * @param id item id to remove
+ * @param ancestor ancestor item id under which to search for id
+ * @param unlink_only unlink but do not remove bucket (useful if bucket has multiple links or is not empty)
+ * @return 0 on success, negative on error
+ */
+ int remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+
+ /**
+ * calculate the locality/distance from a given id to a crush location map
+ *
+ * Specifically, we look for the lowest-valued type for which the
+ * location of id matches that described in loc.
+ *
+ * @param cct cct
+ * @param id the existing id in the map
+ * @param loc a set of key=value pairs describing a location in the hierarchy
+ */
+ int get_common_ancestor_distance(CephContext *cct, int id,
+ const std::multimap<string,string>& loc) const;
+
+ /**
+ * parse a set of key/value pairs out of a string vector
+ *
+ * These are used to describe a location in the CRUSH hierarchy.
+ *
+ * @param args list of strings (each key= or key=value)
+ * @param ploc pointer to a resulting location map or multimap
+ */
+ static int parse_loc_map(const std::vector<string>& args,
+ std::map<string,string> *ploc);
+ static int parse_loc_multimap(const std::vector<string>& args,
+ std::multimap<string,string> *ploc);
+
+
+ /**
+ * get an item's weight
+ *
+ * Will return the weight for the first instance it finds.
+ *
+ * @param id item id to check
+ * @return weight of item
+ */
+ int get_item_weight(int id) const;
+  /// floating-point view of get_item_weight(): the integer weight divided
+  /// by 0x10000.
+  float get_item_weightf(int id) const {
+    const float raw = (float)get_item_weight(id);
+    return raw / (float)0x10000;
+  }
+ int get_item_weight_in_loc(int id, const map<string,string> &loc);
+  /// floating-point view of get_item_weight_in_loc(): the integer weight
+  /// divided by 0x10000.
+  float get_item_weightf_in_loc(int id, const map<string,string> &loc) {
+    const float raw = (float)get_item_weight_in_loc(id, loc);
+    return raw / (float)0x10000;
+  }
+
+  /**
+   * check that a float weight can be converted to an integer weight
+   *
+   * Callers scale the weight by 0x10000 and cast it to int; this verifies
+   * that the scaled value fits.  The check is done in floating point
+   * because converting a negative, NaN or too-large floating value to an
+   * unsigned integer is undefined behavior.
+   *
+   * @param weight weight as a float
+   * @return 0 if the scaled weight fits in an int, -EOVERFLOW otherwise
+   */
+  int validate_weightf(float weight) {
+    const double scaled = (double)weight * (double)0x10000;
+    const double limit = (double)std::numeric_limits<int>::max() + 1.0;
+    // values in (-1, 0) truncate to 0 and stay acceptable; the negated
+    // comparison also rejects NaN
+    if (!(scaled > -1.0 && scaled < limit)) {
+      return -EOVERFLOW;
+    }
+    return 0;
+  }
+ int adjust_item_weight(CephContext *cct, int id, int weight,
+ bool update_weight_sets=true);
+  /// float wrapper for adjust_item_weight(): validates the weight, scales
+  /// it by 0x10000 and forwards.  Returns the validation error if any.
+  int adjust_item_weightf(CephContext *cct, int id, float weight,
+                          bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int scaled = (int)(weight * (float)0x10000);
+    return adjust_item_weight(cct, id, scaled, update_weight_sets);
+  }
+ int adjust_item_weight_in_bucket(CephContext *cct, int id, int weight,
+ int bucket_id,
+ bool update_weight_sets);
+ int adjust_item_weight_in_loc(CephContext *cct, int id, int weight,
+ const map<string,string>& loc,
+ bool update_weight_sets=true);
+  /// float wrapper for adjust_item_weight_in_loc(): validates the weight,
+  /// scales it by 0x10000 and forwards.  Returns the validation error if any.
+  int adjust_item_weightf_in_loc(CephContext *cct, int id, float weight,
+                                 const map<string,string>& loc,
+                                 bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int scaled = (int)(weight * (float)0x10000);
+    return adjust_item_weight_in_loc(cct, id, scaled, loc, update_weight_sets);
+  }
+ void reweight(CephContext *cct);
+ void reweight_bucket(crush_bucket *b,
+ crush_choose_arg_map& arg_map,
+ vector<uint32_t> *weightv);
+
+ int adjust_subtree_weight(CephContext *cct, int id, int weight,
+ bool update_weight_sets=true);
+  /// float wrapper for adjust_subtree_weight(): validates the weight,
+  /// scales it by 0x10000 and forwards.  Returns the validation error if any.
+  int adjust_subtree_weightf(CephContext *cct, int id, float weight,
+                             bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int scaled = (int)(weight * (float)0x10000);
+    return adjust_subtree_weight(cct, id, scaled, update_weight_sets);
+  }
+
+ /// check if item id is present in the map hierarchy
+ bool check_item_present(int id) const;
+
+
+ /*** devices ***/
+  /// @return crush->max_devices, or 0 when there is no crush map
+  int get_max_devices() const {
+    if (crush)
+      return crush->max_devices;
+    return 0;
+  }
+
+
+ /*** rules ***/
+private:
+  /**
+   * look up a rule slot
+   *
+   * @return an error pointer (-ENOENT) when there is no crush map, 0 when
+   * ruleno is beyond max_rules, otherwise the content of the slot (which
+   * may itself be NULL).  Callers must check for both NULL and IS_ERR().
+   */
+  crush_rule *get_rule(unsigned ruleno) const {
+    if (!crush)
+      return (crush_rule *)(-ENOENT);
+    if (ruleno < crush->max_rules)
+      return crush->rules[ruleno];
+    return 0;
+  }
+  /**
+   * look up one step of a rule
+   *
+   * @param ruleno rule id
+   * @param step index of the step within the rule
+   * @return pointer to the step, or an error pointer (-EINVAL) if the rule
+   * does not exist or step is out of range
+   */
+  crush_rule_step *get_rule_step(unsigned ruleno, unsigned step) const {
+    crush_rule *n = get_rule(ruleno);
+    // get_rule() reports a missing rule as NULL, which IS_ERR() does not catch
+    if (!n || IS_ERR(n)) return (crush_rule_step *)(-EINVAL);
+    if (step >= n->len) return (crush_rule_step *)(-EINVAL);
+    return &n->steps[step];
+  }
+
+public:
+ /* accessors */
+  /// @return crush->max_rules, or 0 when there is no crush map
+  int get_max_rules() const {
+    if (crush)
+      return crush->max_rules;
+    return 0;
+  }
+  /// @return true if ruleno is within max_rules and its slot is non-NULL
+  bool rule_exists(unsigned ruleno) const {
+    return crush &&
+      ruleno < crush->max_rules &&
+      crush->rules[ruleno] != NULL;
+  }
+  /**
+   * check whether a rule contains a TAKE step for a given item
+   *
+   * @param ruleno rule id
+   * @param take item id to look for as the argument of a TAKE step
+   * @return true if such a step exists; false otherwise, including when
+   * the rule does not exist
+   */
+  bool rule_has_take(unsigned ruleno, int take) const {
+    if (!crush) return false;
+    crush_rule *rule = get_rule(ruleno);
+    // a missing rule comes back as NULL; do not dereference it
+    if (!rule || IS_ERR(rule)) return false;
+    for (unsigned i = 0; i < rule->len; ++i) {
+      if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+          rule->steps[i].arg1 == take) {
+        return true;
+      }
+    }
+    return false;
+  }
+  /// @return number of steps in the rule, or a negative error code
+  /// (-ENOENT when the rule does not exist)
+  int get_rule_len(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (IS_ERR(r)) return PTR_ERR(r);
+    // get_rule() reports a missing rule as NULL rather than an error pointer
+    if (!r) return -ENOENT;
+    return r->len;
+  }
+  /// @return mask.ruleset of the rule, or -1 if the rule does not exist
+  int get_rule_mask_ruleset(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // a missing rule may be NULL as well as an error pointer
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.ruleset;
+  }
+  /// @return mask.type of the rule, or -1 if the rule does not exist
+  int get_rule_mask_type(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // a missing rule may be NULL as well as an error pointer
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.type;
+  }
+  /// @return mask.min_size of the rule, or -1 if the rule does not exist
+  int get_rule_mask_min_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // a missing rule may be NULL as well as an error pointer
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.min_size;
+  }
+  /// @return mask.max_size of the rule, or -1 if the rule does not exist
+  int get_rule_mask_max_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // a missing rule may be NULL as well as an error pointer
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size;
+  }
+  /// @return op of the given rule step, or the negative error from
+  /// get_rule_step()
+  int get_rule_op(unsigned ruleno, unsigned step) const {
+    crush_rule_step *rs = get_rule_step(ruleno, step);
+    if (IS_ERR(rs))
+      return PTR_ERR(rs);
+    return rs->op;
+  }
+  /// @return arg1 of the given rule step, or the negative error from
+  /// get_rule_step()
+  int get_rule_arg1(unsigned ruleno, unsigned step) const {
+    crush_rule_step *rs = get_rule_step(ruleno, step);
+    if (IS_ERR(rs))
+      return PTR_ERR(rs);
+    return rs->arg1;
+  }
+  /// @return arg2 of the given rule step, or the negative error from
+  /// get_rule_step()
+  int get_rule_arg2(unsigned ruleno, unsigned step) const {
+    crush_rule_step *rs = get_rule_step(ruleno, step);
+    if (IS_ERR(rs))
+      return PTR_ERR(rs);
+    return rs->arg2;
+  }
+
+private:
+ float _get_take_weight_osd_map(int root, map<int,float> *pmap) const;
+ void _normalize_weight_map(float sum, const map<int,float>& m,
+ map<int,float> *pmap) const;
+
+public:
+ /**
+ * calculate a map of osds to weights for a given rule
+ *
+ * Generate a map of which OSDs get how much relative weight for a
+ * given rule.
+ *
+ * @param ruleno [in] rule id
+ * @param pmap [out] map of osd to weight
+ * @return 0 for success, or negative error code
+ */
+ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap) const;
+
+ /**
+ * calculate a map of osds to weights for a given starting root
+ *
+ * Generate a map of which OSDs get how much relative weight for a
+ * given starting root
+ *
+ * @param root node
+ * @param pmap [out] map of osd to weight
+ * @return 0 for success, or negative error code
+ */
+ int get_take_weight_osd_map(int root, map<int,float> *pmap) const;
+
+ /* modifiers */
+
+  /**
+   * allocate a new rule and add it to the map
+   *
+   * @param ruleno rule id to use (a negative value lets crush_add_rule
+   * pick the first free slot)
+   * @param len number of steps in the rule
+   * @param type rule type
+   * @param minsize mask min_size
+   * @param maxsize mask max_size
+   * @return the rule id on success, negative on error
+   */
+  int add_rule(int ruleno, int len, int type, int minsize, int maxsize) {
+    if (!crush) return -ENOENT;
+    crush_rule *n = crush_make_rule(len, ruleno, type, minsize, maxsize);
+    ceph_assert(n);
+    ruleno = crush_add_rule(crush, n, ruleno);
+    if (ruleno < 0) {
+      // the map did not take the rule; release it instead of leaking it
+      free(n);
+    }
+    return ruleno;
+  }
+  /// set mask.max_size of a rule
+  /// @return the value stored, or -1 if the rule does not exist
+  int set_rule_mask_max_size(unsigned ruleno, int max_size) {
+    crush_rule *r = get_rule(ruleno);
+    // a missing rule may be NULL as well as an error pointer
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size = max_size;
+  }
+  /// fill in step number `step` of rule `ruleno` with (op, arg1, arg2)
+  /// @return 0 on success, -ENOENT without a crush map, -1 if the rule
+  /// does not exist
+  int set_rule_step(unsigned ruleno, unsigned step, int op, int arg1, int arg2) {
+    if (!crush)
+      return -ENOENT;
+    if (crush_rule *rule = get_rule(ruleno)) {
+      crush_rule_set_step(rule, step, op, arg1, arg2);
+      return 0;
+    }
+    return -1;
+  }
+ // Convenience wrappers around set_rule_step(): each one fixes the step
+ // opcode and forwards its arguments as arg1 (and arg2 where applicable).
+ // They return whatever set_rule_step() returns.
+
+ // TAKE step: val is passed as arg1
+ int set_rule_step_take(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_TAKE, val, 0);
+ }
+ // SET_* steps: val is passed as arg1, arg2 is unused (0)
+ int set_rule_step_set_choose_tries(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_TRIES, val, 0);
+ }
+ int set_rule_step_set_choose_local_tries(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES, val, 0);
+ }
+ int set_rule_step_set_choose_local_fallback_tries(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES, val, 0);
+ }
+ int set_rule_step_set_chooseleaf_tries(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_TRIES, val, 0);
+ }
+ int set_rule_step_set_chooseleaf_vary_r(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_VARY_R, val, 0);
+ }
+ int set_rule_step_set_chooseleaf_stable(unsigned ruleno, unsigned step, int val) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_STABLE, val, 0);
+ }
+ // CHOOSE/CHOOSELEAF steps: val is passed as arg1 and type as arg2
+ int set_rule_step_choose_firstn(unsigned ruleno, unsigned step, int val, int type) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_FIRSTN, val, type);
+ }
+ int set_rule_step_choose_indep(unsigned ruleno, unsigned step, int val, int type) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_INDEP, val, type);
+ }
+ int set_rule_step_choose_leaf_firstn(unsigned ruleno, unsigned step, int val, int type) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_FIRSTN, val, type);
+ }
+ int set_rule_step_choose_leaf_indep(unsigned ruleno, unsigned step, int val, int type) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_INDEP, val, type);
+ }
+ // EMIT step: takes no arguments
+ int set_rule_step_emit(unsigned ruleno, unsigned step) {
+ return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
+ }
+
+ int add_simple_rule(
+ string name, string root_name, string failure_domain_type,
+ string device_class,
+ string mode, int rule_type, ostream *err = 0);
+
+ /**
+ * @param rno rule[set] id to use, -1 to pick the lowest available
+ */
+ int add_simple_rule_at(
+ string name, string root_name,
+ string failure_domain_type, string device_class, string mode,
+ int rule_type, int rno, ostream *err = 0);
+
+ int remove_rule(int ruleno);
+
+
+ /** buckets **/
+  /**
+   * look up a bucket by id (read-only)
+   *
+   * @return the bucket, or an error pointer: -EINVAL without a crush map,
+   * -ENOENT if the id is out of range or its slot is empty
+   */
+  const crush_bucket *get_bucket(int id) const {
+    if (!crush)
+      return (crush_bucket *)(-EINVAL);
+    // bucket id -1 lives in slot 0, -2 in slot 1, ...; a non-negative id
+    // wraps to a huge unsigned slot and is rejected by the range check
+    const unsigned int slot = (unsigned int)(-1 - id);
+    const unsigned int nslots = crush->max_buckets;
+    if (slot >= nslots)
+      return (crush_bucket *)(-ENOENT);
+    crush_bucket *found = crush->buckets[slot];
+    if (found != NULL)
+      return found;
+    return (crush_bucket *)(-ENOENT);
+  }
+private:
+  /// mutable variant of get_bucket(); same lookup and same error pointers
+  /// as the const overload, to which it delegates
+  crush_bucket *get_bucket(int id) {
+    const CrushWrapper *self = this;
+    return const_cast<crush_bucket *>(self->get_bucket(id));
+  }
+ /**
+ * detach a bucket from its parent and adjust the parent weight
+ *
+ * returns the weight of the detached bucket
+ **/
+ int detach_bucket(CephContext *cct, int item);
+
+ int get_new_bucket_id();
+
+public:
+  /// @return crush->max_buckets, or -EINVAL when there is no crush map
+  int get_max_buckets() const {
+    if (crush)
+      return crush->max_buckets;
+    return -EINVAL;
+  }
+  /// @return the id crush_get_next_bucket_id() proposes, or -EINVAL when
+  /// there is no crush map
+  int get_next_bucket_id() const {
+    if (crush)
+      return crush_get_next_bucket_id(crush);
+    return -EINVAL;
+  }
+  /// @return true if get_bucket(id) yields a bucket rather than an error
+  bool bucket_exists(int id) const {
+    return !IS_ERR(get_bucket(id));
+  }
+  // Simple bucket field accessors.  The int variants return the negative
+  // error from get_bucket() when the bucket cannot be found; the float
+  // variant returns 0 in that case.
+
+  /// @return bucket weight, or a negative error code
+  int get_bucket_weight(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return bucket->weight;
+  }
+  /// @return bucket weight divided by 0x10000, or 0 if there is no such bucket
+  float get_bucket_weightf(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return 0;
+    return bucket->weight / (float)0x10000;
+  }
+  /// @return bucket type, or a negative error code
+  int get_bucket_type(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return bucket->type;
+  }
+  /// @return bucket algorithm, or a negative error code
+  int get_bucket_alg(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return bucket->alg;
+  }
+  /// @return bucket hash, or a negative error code
+  int get_bucket_hash(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return bucket->hash;
+  }
+  /// @return number of items in the bucket, or a negative error code
+  int get_bucket_size(int id) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return bucket->size;
+  }
+  /**
+   * get the item stored at a position of a bucket
+   *
+   * @param id bucket id
+   * @param pos index into the bucket's item array
+   * @return the item id, the negative error from get_bucket() if the
+   * bucket does not exist, or -EINVAL if pos is out of range.  Note that
+   * item ids of nested buckets are negative too, so callers that need to
+   * tell the cases apart should validate pos against get_bucket_size().
+   */
+  int get_bucket_item(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    // b is a valid pointer here, so PTR_ERR(b) would return a truncated
+    // address rather than an error code
+    if ((__u32)pos >= b->size)
+      return -EINVAL;
+    return b->items[pos];
+  }
+  /// @return crush_get_bucket_item_weight() for position pos of bucket id,
+  /// or the negative error from get_bucket()
+  int get_bucket_item_weight(int id, int pos) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return PTR_ERR(bucket);
+    return crush_get_bucket_item_weight(bucket, pos);
+  }
+  /// @return the same weight divided by 0x10000, or 0 if there is no such
+  /// bucket
+  float get_bucket_item_weightf(int id, int pos) const {
+    const crush_bucket *bucket = get_bucket(id);
+    if (IS_ERR(bucket))
+      return 0;
+    const float raw = (float)crush_get_bucket_item_weight(bucket, pos);
+    return raw / (float)0x10000;
+  }
+
+ /* modifiers */
+ int add_bucket(int bucketno, int alg, int hash, int type, int size,
+ int *items, int *weights, int *idout);
+ int bucket_add_item(crush_bucket *bucket, int item, int weight);
+ int bucket_remove_item(struct crush_bucket *bucket, int item);
+ int bucket_adjust_item_weight(
+ CephContext *cct, struct crush_bucket *bucket, int item, int weight,
+ bool adjust_weight_sets);
+
+ /**
+ * finish building the map; requires a crush map to be present (asserted)
+ *
+ * The order matters: crush_finalize() recomputes max_devices from the
+ * bucket contents, and only afterwards is max_devices raised to cover
+ * the highest id found in name_map.
+ */
+ void finalize() {
+ ceph_assert(crush);
+ crush_finalize(crush);
+ // name_map is ordered, so rbegin() is its highest id
+ if (!name_map.empty() &&
+ name_map.rbegin()->first >= crush->max_devices) {
+ crush->max_devices = name_map.rbegin()->first + 1;
+ }
+ have_uniform_rules = !has_legacy_rule_ids();
+ build_rmaps();
+ }
+ int bucket_set_alg(int id, int alg);
+
+ int update_device_class(int id, const string& class_name, const string& name, ostream *ss);
+ int remove_device_class(CephContext *cct, int id, ostream *ss);
+ int device_class_clone(
+ int original, int device_class,
+ const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
+ const std::set<int32_t>& used_ids,
+ int *clone,
+ map<int,map<int,vector<int>>> *cmap_item_weight);
+ bool class_is_in_use(int class_id, ostream *ss = nullptr);
+ int rename_class(const string& srcname, const string& dstname);
+ int populate_classes(
+ const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
+ int get_rules_by_class(const string &class_name, set<int> *rules);
+ int get_rules_by_osd(int osd, set<int> *rules);
+ bool _class_is_dead(int class_id);
+ void cleanup_dead_classes();
+ int rebuild_roots_with_classes(CephContext *cct);
+ /* remove unused roots generated for class devices */
+ int trim_roots_with_class(CephContext *cct);
+
+ int reclassify(
+ CephContext *cct,
+ ostream& out,
+ const map<string,string>& classify_root,
+ const map<string,pair<string,string>>& classify_bucket
+ );
+
+ int set_subtree_class(const string& name, const string& class_name);
+
+  /**
+   * (re)allocate the zero-initialized choose_tries array
+   *
+   * Any previous array is released first.  If the allocation fails,
+   * choose_tries is left NULL.
+   */
+  void start_choose_profile() {
+    free(crush->choose_tries);
+    /*
+     * the original choose_total_tries value was off by one (it
+     * counted "retries" and not "tries").  add one to alloc.
+     */
+    // calloc() already zero-fills, so no memset is needed (and a memset
+    // on a failed allocation would write through NULL)
+    crush->choose_tries = (__u32 *)calloc(crush->choose_total_tries + 1,
+                                          sizeof(*crush->choose_tries));
+  }
+  /// release the choose_tries array and clear the pointer
+  void stop_choose_profile() {
+    __u32 *old = crush->choose_tries;
+    crush->choose_tries = 0;
+    free(old);
+  }
+
+  /**
+   * expose the choose_tries array
+   *
+   * @param vec [out] set to the array when one is allocated, untouched otherwise
+   * @return choose_total_tries when an array is allocated, 0 otherwise
+   */
+  int get_choose_profile(__u32 **vec) {
+    if (!crush->choose_tries)
+      return 0;
+    *vec = crush->choose_tries;
+    return crush->choose_total_tries;
+  }
+
+
+ /// overwrite crush->max_devices with m; the crush map is dereferenced
+ /// without a NULL check here
+ void set_max_devices(int m) {
+ crush->max_devices = m;
+ }
+
+  /**
+   * find the rule matching a (ruleset, type, size) triple
+   *
+   * When have_uniform_rules is set, the rule at index `ruleset` is tried
+   * first; otherwise (or if it does not match) the lookup falls back to
+   * crush_find_rule().
+   *
+   * @return rule id, or -1 when there is no crush map (other failures are
+   * whatever crush_find_rule() returns)
+   */
+  int find_rule(int ruleset, int type, int size) const {
+    if (!crush) return -1;
+    // the ruleset >= 0 test keeps a negative value from indexing
+    // crush->rules[] out of bounds
+    if (have_uniform_rules &&
+        ruleset >= 0 &&
+        ruleset < (int)crush->max_rules &&
+        crush->rules[ruleset] &&
+        crush->rules[ruleset]->mask.type == type &&
+        crush->rules[ruleset]->mask.min_size <= size &&
+        crush->rules[ruleset]->mask.max_size >= size) {
+      return ruleset;
+    }
+    return crush_find_rule(crush, ruleset, type, size);
+  }
+
+  /// @return true if any existing rule has mask.ruleset == ruleset; false
+  /// otherwise, including when there is no crush map
+  bool ruleset_exists(const int ruleset) const {
+    // guard like the other accessors: the loop bound dereferences crush
+    if (!crush) return false;
+    for (size_t i = 0; i < crush->max_rules; ++i) {
+      if (rule_exists(i) && crush->rules[i]->mask.ruleset == ruleset) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+ /**
+ * Return the lowest numbered ruleset of type `type`
+ *
+ * @returns a ruleset ID, or -1 if no matching rules found.
+ */
+  int find_first_ruleset(int type) const {
+    int result = -1;
+
+    // guard like the other accessors: the loop bound dereferences crush
+    if (!crush) return result;
+
+    for (size_t i = 0; i < crush->max_rules; ++i) {
+      if (crush->rules[i]
+          && crush->rules[i]->mask.type == type
+          && (crush->rules[i]->mask.ruleset < result || result == -1)) {
+        result = crush->rules[i]->mask.ruleset;
+      }
+    }
+
+    return result;
+  }
+
+  /// @return true if choose_args has an entry for choose_args_index
+  bool have_choose_args(int64_t choose_args_index) const {
+    return choose_args.find(choose_args_index) != choose_args.end();
+  }
+
+  /**
+   * look up a choose_args map, falling back to DEFAULT_CHOOSE_ARGS
+   *
+   * @return the entry for choose_args_index, else the DEFAULT_CHOOSE_ARGS
+   * entry, else an empty map (args NULL, size 0)
+   */
+  crush_choose_arg_map choose_args_get_with_fallback(
+    int64_t choose_args_index) const {
+    auto found = choose_args.find(choose_args_index);
+    if (found == choose_args.end())
+      found = choose_args.find(DEFAULT_CHOOSE_ARGS);
+    if (found != choose_args.end())
+      return found->second;
+    crush_choose_arg_map none;
+    none.args = NULL;
+    none.size = 0;
+    return none;
+  }
+  /// @return the choose_args entry for choose_args_index, or an empty map
+  /// (args NULL, size 0) if there is none
+  crush_choose_arg_map choose_args_get(int64_t choose_args_index) const {
+    auto found = choose_args.find(choose_args_index);
+    if (found != choose_args.end())
+      return found->second;
+    crush_choose_arg_map none;
+    none.args = NULL;
+    none.size = 0;
+    return none;
+  }
+
+  /// free everything hanging off a choose_arg map: each weight set's
+  /// weights, the weight_set and ids arrays, and the args array itself
+  void destroy_choose_args(crush_choose_arg_map arg_map) {
+    for (__u32 b = 0; b < arg_map.size; b++) {
+      crush_choose_arg &carg = arg_map.args[b];
+      for (__u32 p = 0; p < carg.weight_set_positions; p++)
+        free(carg.weight_set[p].weights);
+      // free(NULL) is a no-op, so no explicit NULL tests are needed
+      free(carg.weight_set);
+      free(carg.ids);
+    }
+    free(arg_map.args);
+  }
+
+ /**
+ * create a choose_args entry with one slot per bucket position
+ *
+ * Straw2 buckets get `positions` weight sets, each initialized from the
+ * bucket's item weights; every other slot gets no weight set.  No slot
+ * gets an ids array.
+ *
+ * @param id choose_args index to create
+ * @param positions number of weight-set positions (must be non-zero)
+ * @return false if an entry for id already exists, true otherwise
+ */
+ bool create_choose_args(int64_t id, int positions) {
+ if (choose_args.count(id))
+ return false;
+ ceph_assert(positions);
+ auto &cmap = choose_args[id];
+ // NOTE(review): none of the calloc() results below are checked
+ cmap.args = static_cast<crush_choose_arg*>(calloc(sizeof(crush_choose_arg),
+ crush->max_buckets));
+ cmap.size = crush->max_buckets;
+ for (int bidx=0; bidx < crush->max_buckets; ++bidx) {
+ crush_bucket *b = crush->buckets[bidx];
+ auto &carg = cmap.args[bidx];
+ carg.ids = NULL;
+ carg.ids_size = 0;
+ if (b && b->alg == CRUSH_BUCKET_STRAW2) {
+ crush_bucket_straw2 *sb = reinterpret_cast<crush_bucket_straw2*>(b);
+ carg.weight_set_positions = positions;
+ carg.weight_set = static_cast<crush_weight_set*>(calloc(sizeof(crush_weight_set),
+ carg.weight_set_positions));
+ // initialize with canonical weights
+ for (int pos = 0; pos < positions; ++pos) {
+ carg.weight_set[pos].size = b->size;
+ // 4 == bytes per weight; presumably sizeof(__u32) — TODO confirm
+ carg.weight_set[pos].weights = (__u32*)calloc(4, b->size);
+ for (unsigned i = 0; i < b->size; ++i) {
+ carg.weight_set[pos].weights[i] = sb->item_weights[i];
+ }
+ }
+ } else {
+ // empty slot or non-straw2 bucket: no weight set
+ carg.weight_set = NULL;
+ carg.weight_set_positions = 0;
+ }
+ }
+ return true;
+ }
+
+  /// destroy and drop the choose_args entry for id, if there is one
+  void rm_choose_args(int64_t id) {
+    auto entry = choose_args.find(id);
+    if (entry == choose_args.end())
+      return;
+    destroy_choose_args(entry->second);
+    choose_args.erase(entry);
+  }
+
+  /// destroy every choose_args entry and empty the container
+  void choose_args_clear() {
+    for (const auto& entry : choose_args) {
+      destroy_choose_args(entry.second);
+    }
+    choose_args.clear();
+  }
+
+ // remove choose_args for buckets that no longer exist, create them for new buckets
+ void update_choose_args(CephContext *cct);
+
+ // adjust choose_args_map weight, preserving the hierarchical summation
+ // property. used by callers optimizing layouts by tweaking weights.
+ int _choose_args_adjust_item_weight_in_bucket(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int bucketid,
+ int id,
+ const vector<int>& weight,
+ ostream *ss);
+ int choose_args_adjust_item_weight(
+ CephContext *cct,
+ crush_choose_arg_map cmap,
+ int id, const vector<int>& weight,
+ ostream *ss);
+  /// double-precision wrapper for choose_args_adjust_item_weight(): every
+  /// weight is scaled by 0x10000 and truncated to int before forwarding
+  int choose_args_adjust_item_weightf(
+    CephContext *cct,
+    crush_choose_arg_map cmap,
+    int id, const vector<double>& weightf,
+    ostream *ss) {
+    vector<int> weight;
+    weight.reserve(weightf.size());
+    for (const double w : weightf) {
+      weight.push_back((int)(w * (double)0x10000));
+    }
+    return choose_args_adjust_item_weight(cct, cmap, id, weight, ss);
+  }
+
+  /// @return weight_set_positions of the first slot that has a non-zero
+  /// value, or 1 if no slot has one
+  int get_choose_args_positions(crush_choose_arg_map cmap) {
+    // infer positions from other buckets
+    unsigned slot = 0;
+    while (slot < cmap.size && !cmap.args[slot].weight_set_positions)
+      ++slot;
+    if (slot < cmap.size)
+      return cmap.args[slot].weight_set_positions;
+    return 1;
+  }
+
+ /**
+ * run a rule for input x and collect the resulting items
+ *
+ * @param rule rule id passed to crush_do_rule()
+ * @param x input value passed to crush_do_rule()
+ * @param out [out] resized to the number of results and filled with them
+ * @param maxout maximum number of results
+ * @param weight container of weights; its data pointer and size are passed on
+ * @param choose_args_index choose_args to use (with fallback, see
+ * choose_args_get_with_fallback())
+ */
+ template<typename WeightVector>
+ void do_rule(int rule, int x, vector<int>& out, int maxout,
+ const WeightVector& weight,
+ uint64_t choose_args_index) const {
+ // NOTE(review): both arrays are variable-length arrays (a compiler
+ // extension in C++) sized from runtime values
+ int rawout[maxout];
+ char work[crush_work_size(crush, maxout)];
+ crush_init_workspace(crush, work);
+ crush_choose_arg_map arg_map = choose_args_get_with_fallback(
+ choose_args_index);
+ int numrep = crush_do_rule(crush, rule, x, rawout, maxout,
+ std::data(weight), std::size(weight),
+ work, arg_map.args);
+ // a negative result is treated as "no results"
+ if (numrep < 0)
+ numrep = 0;
+ out.resize(numrep);
+ for (int i=0; i<numrep; i++)
+ out[i] = rawout[i];
+ }
+
+ int _choose_type_stack(
+ CephContext *cct,
+ const vector<pair<int,int>>& stack,
+ const set<int>& overfull,
+ const vector<int>& underfull,
+ const vector<int>& more_underfull,
+ const vector<int>& orig,
+ vector<int>::const_iterator& i,
+ set<int>& used,
+ vector<int> *pw,
+ int root_bucket,
+ int rule) const;
+
+ int try_remap_rule(
+ CephContext *cct,
+ int rule,
+ int maxout,
+ const set<int>& overfull,
+ const vector<int>& underfull,
+ const vector<int>& more_underfull,
+ const vector<int>& orig,
+ vector<int> *out) const;
+
+  /**
+   * check whether a pool size fits the first rule matching (ruleset, type)
+   *
+   * @param ss receives an explanation when the size is out of the rule's range
+   * @return true if size is within [min_size, max_size] of that rule;
+   * false if it is not, or if no rule matches
+   */
+  bool check_crush_rule(int ruleset, int type, int size, ostream& ss) {
+    ceph_assert(crush);
+
+    for (__u32 idx = 0; idx < crush->max_rules; idx++) {
+      const crush_rule *rule = crush->rules[idx];
+      if (!rule ||
+          rule->mask.ruleset != ruleset ||
+          rule->mask.type != type)
+        continue;
+
+      // only the first matching rule is considered
+      if (size < rule->mask.min_size) {
+        ss << "pool size is smaller than the crush rule min size";
+        return false;
+      }
+      if (size > rule->mask.max_size) {
+        ss << "pool size is bigger than the crush rule max size";
+        return false;
+      }
+      return true;
+    }
+
+    return false;
+  }
+
+ void encode(bufferlist &bl, uint64_t features) const;
+ void decode(bufferlist::const_iterator &blp);
+ void decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp);
+ void dump(Formatter *f) const;
+ void dump_rules(Formatter *f) const;
+ void dump_rule(int ruleset, Formatter *f) const;
+ void dump_tunables(Formatter *f) const;
+ void dump_choose_args(Formatter *f) const;
+ void list_rules(Formatter *f) const;
+ void list_rules(ostream *ss) const;
+ void dump_tree(ostream *out,
+ Formatter *f,
+ const CrushTreeDumper::name_map_t& ws,
+ bool show_shadow = false) const;
+ /// convenience overload: dump the tree with an empty name map (and the
+ /// default show_shadow of the full overload)
+ void dump_tree(ostream *out, Formatter *f) {
+ dump_tree(out, f, CrushTreeDumper::name_map_t());
+ }
+ void dump_tree(Formatter *f,
+ const CrushTreeDumper::name_map_t& ws) const;
+ static void generate_test_instances(list<CrushWrapper*>& o);
+
+ int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct);
+
+ static bool is_valid_crush_name(const string& s);
+ static bool is_valid_crush_loc(CephContext *cct,
+ const map<string,string>& loc);
+};
+WRITE_CLASS_ENCODER_FEATURES(CrushWrapper)
+
+#endif
diff --git a/src/crush/CrushWrapper.i b/src/crush/CrushWrapper.i
new file mode 100644
index 00000000..76340611
--- /dev/null
+++ b/src/crush/CrushWrapper.i
@@ -0,0 +1,47 @@
+/* File : CrushWrapper.i */
+%module CrushWrapper
+%{
+#include "CrushWrapper.h"
+%}
+
+%include typemaps.i
+
+// This tells SWIG to treat 'int *items' as a special case: the Perl
+// argument must be a reference to an array, whose elements are copied
+// into a freshly malloc'ed C int array.
+%typemap(in) int *items {
+	AV *tempav;
+	I32 len;
+	int i;
+	SV **tv;
+
+	if (!SvROK($input))
+		croak("$input is not a reference.");
+	if (SvTYPE(SvRV($input)) != SVt_PVAV)
+		croak("$input is not an array.");
+
+	tempav = (AV*)SvRV($input);
+	/* av_len() is the highest index, i.e. element count - 1 (-1 if empty) */
+	len = av_len(tempav);
+	$1 = (int *) malloc((len+1)*sizeof(int));
+	/* malloc(0) may legitimately return NULL for an empty array */
+	if (!$1 && len >= 0)
+		croak("out of memory");
+	for (i = 0; i <= len; i++) {
+		tv = av_fetch(tempav, i, 0);
+		/* av_fetch() returns NULL for a nonexistent element; store 0 */
+		$1[i] = tv ? (int) SvIV(*tv) : 0;
+	}
+}
+
+%apply int *items { int *weights };
+%apply double *OUTPUT { double *min, double *max, double *avg };
+
+/* Let's just grab the original header file here */
+%include "CrushWrapper.h"
+
+%clear double *min, double *max, double *avg;
diff --git a/src/crush/builder.c b/src/crush/builder.c
new file mode 100644
index 00000000..68dfcb69
--- /dev/null
+++ b/src/crush/builder.c
@@ -0,0 +1,1525 @@
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "crush/crush.h"
+#include "builder.h"
+
+#define dprintk(args...) /* printf(args) */
+
+#define BUG_ON(x) assert(!(x))
+
+/*
+ * allocate a zero-filled crush map and apply set_optimal_crush_map() to it.
+ * returns NULL if the allocation fails.
+ */
+struct crush_map *crush_create()
+{
+	struct crush_map *map = calloc(1, sizeof(*map));
+
+	if (map)
+		set_optimal_crush_map(map);
+	return map;
+}
+
+/*
+ * finalize should be called _after_ all buckets are added to the map.
+ *
+ * recomputes max_devices (highest item id found in any bucket, plus one)
+ * and working_size (the workspace needed for this map).
+ */
+void crush_finalize(struct crush_map *map)
+{
+	int pos;
+	__u32 k;
+
+	/* fixed header, plus the array of pointers to per-bucket workspace */
+	map->working_size = sizeof(struct crush_work);
+	map->working_size += map->max_buckets *
+		sizeof(struct crush_work_bucket *);
+
+	map->max_devices = 0;
+	for (pos = 0; pos < map->max_buckets; pos++) {
+		struct crush_bucket *bucket = map->buckets[pos];
+
+		if (bucket == 0)
+			continue;
+
+		/* calc max_devices */
+		for (k = 0; k < bucket->size; k++) {
+			if (bucket->items[k] >= map->max_devices)
+				map->max_devices = bucket->items[k] + 1;
+		}
+
+		/* every bucket algorithm needs the same base workspace:
+		   permutation variables and the pointer to the
+		   permutation array... */
+		map->working_size += sizeof(struct crush_work_bucket);
+		/* ...and every bucket has a permutation array. */
+		map->working_size += bucket->size * sizeof(__u32);
+	}
+}
+
+
+
+/** rules **/
+
+/*
+ * add a rule to the map.
+ *
+ * ruleno < 0 picks the first free slot; otherwise the rule is stored at
+ * slot ruleno (replacing the pointer of whatever was stored there).
+ *
+ * returns the slot used, -ENOSPC if the slot would be >= CRUSH_MAX_RULES,
+ * or -ENOMEM if the rules array could not be grown.  On error the map is
+ * left unchanged and the caller keeps ownership of rule.
+ */
+int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
+{
+	__u32 r;
+
+	if (ruleno < 0) {
+		for (r=0; r < map->max_rules; r++)
+			if (map->rules[r] == 0)
+				break;
+	}
+	else
+		r = ruleno;
+
+	/* check the slot we are about to use, not just the current size */
+	if (r >= CRUSH_MAX_RULES)
+		return -ENOSPC;
+
+	if (r >= map->max_rules) {
+		/* expand array; only commit the new size once realloc succeeded */
+		__u32 newsize = r + 1;
+		void *_realloc = realloc(map->rules, newsize * sizeof(map->rules[0]));
+		if (_realloc == NULL)
+			return -ENOMEM;
+		map->rules = _realloc;
+		memset(map->rules + map->max_rules, 0,
+		       (newsize - map->max_rules) * sizeof(map->rules[0]));
+		map->max_rules = newsize;
+	}
+
+	/* add it */
+	map->rules[r] = rule;
+	return r;
+}
+
+/*
+ * allocate a rule with room for len steps and fill in its length and mask.
+ * the steps themselves are left uninitialized.  returns NULL if the
+ * allocation fails.
+ */
+struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
+{
+	struct crush_rule *r = malloc(crush_rule_size(len));
+
+	if (r == NULL)
+		return NULL;
+
+	r->len = len;
+	r->mask.max_size = maxsize;
+	r->mask.min_size = minsize;
+	r->mask.type = type;
+	r->mask.ruleset = ruleset;
+	return r;
+}
+
+/*
+ * set step n of a rule to (op, arg1, arg2).
+ *
+ * be careful; the only bounds check is the assert below against rule->len,
+ * so nothing is verified when asserts are compiled out, and rule->len
+ * itself is trusted to match the buffer you allocated.
+ */
+void crush_rule_set_step(struct crush_rule *rule, int n, int op, int arg1, int arg2)
+{
+ assert((__u32)n < rule->len);
+ rule->steps[n].op = op;
+ rule->steps[n].arg1 = arg1;
+ rule->steps[n].arg2 = arg2;
+}
+
+
+/** buckets **/
+/*
+ * return the id of the first unused bucket slot (slot 0 is id -1, slot 1
+ * is id -2, ...).  if every slot is in use this is the id just past the
+ * end of the current array.
+ */
+int crush_get_next_bucket_id(struct crush_map *map)
+{
+	int slot = 0;
+
+	while (slot < map->max_buckets && map->buckets[slot] != 0)
+		slot++;
+	return -1 - slot;
+}
+
+
+/*
+ * add a bucket to the map.
+ *
+ * id must be negative, or 0 to let the map pick the next free id.
+ * the id actually used is stored in bucket->id and, if idout is not NULL,
+ * in *idout.
+ *
+ * returns 0 on success, -EINVAL for a positive id, -EEXIST if the slot is
+ * already taken, or -ENOMEM if the buckets array could not be grown.
+ */
+int crush_add_bucket(struct crush_map *map,
+		     int id,
+		     struct crush_bucket *bucket,
+		     int *idout)
+{
+	int pos;
+
+	/* find a bucket id */
+	if (id == 0)
+		id = crush_get_next_bucket_id(map);
+	/* a positive id would turn into a negative array index below */
+	if (id > 0)
+		return -EINVAL;
+	pos = -1 - id;
+
+	while (pos >= map->max_buckets) {
+		/* expand array; only commit the new size once realloc succeeded */
+		int oldsize = map->max_buckets;
+		int newsize = oldsize ? oldsize * 2 : 8;
+		void *_realloc = realloc(map->buckets, newsize * sizeof(map->buckets[0]));
+		if (_realloc == NULL)
+			return -ENOMEM;
+		map->buckets = _realloc;
+		memset(map->buckets + oldsize, 0, (newsize-oldsize) * sizeof(map->buckets[0]));
+		map->max_buckets = newsize;
+	}
+
+	if (map->buckets[pos] != 0) {
+		return -EEXIST;
+	}
+
+	/* add it */
+	bucket->id = id;
+	map->buckets[pos] = bucket;
+
+	if (idout) *idout = id;
+	return 0;
+}
+
+/*
+ * clear the bucket's slot in the map and destroy the bucket.
+ * always returns 0.
+ */
+int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket)
+{
+	int slot = -1 - bucket->id;
+
+	assert(slot < map->max_buckets);
+	map->buckets[slot] = NULL;
+	crush_destroy_bucket(bucket);
+	return 0;
+}
+
+
+/* uniform bucket */
+
+/*
+ * allocate a uniform bucket holding a copy of the given items, all with
+ * the same item_weight; the bucket weight is size * item_weight.
+ *
+ * returns NULL if an allocation fails or if size * item_weight is
+ * rejected by crush_multiplication_is_unsafe().
+ */
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int hash, int type, int size,
+ int *items,
+ int item_weight)
+{
+ int i;
+ struct crush_bucket_uniform *bucket;
+
+ bucket = malloc(sizeof(*bucket));
+ if (!bucket)
+ return NULL;
+ /* zero-fill so the err path can free() members that were never set */
+ memset(bucket, 0, sizeof(*bucket));
+ bucket->h.alg = CRUSH_BUCKET_UNIFORM;
+ bucket->h.hash = hash;
+ bucket->h.type = type;
+ bucket->h.size = size;
+
+ if (crush_multiplication_is_unsafe(size, item_weight))
+ goto err;
+
+ bucket->h.weight = size * item_weight;
+ bucket->item_weight = item_weight;
+ bucket->h.items = malloc(sizeof(__s32)*size);
+
+ if (!bucket->h.items)
+ goto err;
+
+ for (i=0; i<size; i++)
+ bucket->h.items[i] = items[i];
+
+ return bucket;
+err:
+ free(bucket->h.items);
+ free(bucket);
+ return NULL;
+}
+
+
+/* list bucket */
+
+/*
+ * allocate a list bucket holding copies of the given items and weights.
+ * sum_weights[i] is the running total of weights[0..i]; the bucket weight
+ * is the total of all weights.
+ *
+ * returns NULL if an allocation fails or if adding a weight is rejected
+ * by crush_addition_is_unsafe().
+ */
+struct crush_bucket_list*
+crush_make_list_bucket(int hash, int type, int size,
+ int *items,
+ int *weights)
+{
+ int i;
+ int w;
+ struct crush_bucket_list *bucket;
+
+ bucket = malloc(sizeof(*bucket));
+ if (!bucket)
+ return NULL;
+ /* zero-fill so the err path can free() members that were never set */
+ memset(bucket, 0, sizeof(*bucket));
+ bucket->h.alg = CRUSH_BUCKET_LIST;
+ bucket->h.hash = hash;
+ bucket->h.type = type;
+ bucket->h.size = size;
+
+ bucket->h.items = malloc(sizeof(__s32)*size);
+ if (!bucket->h.items)
+ goto err;
+
+
+ bucket->item_weights = malloc(sizeof(__u32)*size);
+ if (!bucket->item_weights)
+ goto err;
+ bucket->sum_weights = malloc(sizeof(__u32)*size);
+ if (!bucket->sum_weights)
+ goto err;
+ /* w accumulates the running sum of the weights seen so far */
+ w = 0;
+ for (i=0; i<size; i++) {
+ bucket->h.items[i] = items[i];
+ bucket->item_weights[i] = weights[i];
+
+ if (crush_addition_is_unsafe(w, weights[i]))
+ goto err;
+
+ w += weights[i];
+ bucket->sum_weights[i] = w;
+ /*dprintk("pos %d item %d weight %d sum %d\n",
+ i, items[i], weights[i], bucket->sum_weights[i]);*/
+ }
+
+ bucket->h.weight = w;
+
+ return bucket;
+err:
+ free(bucket->sum_weights);
+ free(bucket->item_weights);
+ free(bucket->h.items);
+ free(bucket);
+ return NULL;
+}
+
+
+/* tree bucket */
+
+/*
+ * height of tree node n: the number of trailing zero bits in n.
+ * n == 0 has no lowest set bit; return 0 for it instead of looping forever.
+ */
+static int height(int n) {
+	int h = 0;
+	if (n == 0)
+		return 0;
+	while ((n & 1) == 0) {
+		h++;
+		n = n >> 1;
+	}
+	return h;
+}
+/* non-zero iff bit (h+1) of n is set */
+static int on_right(int n, int h) {
+	const int bit = 1 << (h + 1);
+	return n & bit;
+}
+/*
+ * parent of tree node n: step by 2^height(n), downwards if on_right()
+ * says so, upwards otherwise.
+ */
+static int parent(int n)
+{
+	const int h = height(n);
+	const int step = 1 << h;
+
+	return on_right(n, h) ? n - step : n + step;
+}
+
+/*
+ * tree depth used for a bucket of `size` items: 0 for an empty bucket,
+ * otherwise 1 plus the number of significant bits in size - 1.
+ */
+static int calc_depth(int size)
+{
+	int depth;
+	int rest;
+
+	if (size == 0)
+		return 0;
+
+	depth = 1;
+	for (rest = size - 1; rest; rest = rest >> 1)
+		depth++;
+	return depth;
+}
+
+/*
+ * allocate a tree bucket holding a copy of the given items.  each item's
+ * weight is stored at the node given by crush_calc_tree_node() and added
+ * to that node's ancestors, so the node at num_nodes/2 ends up with the
+ * total bucket weight (checked by the BUG_ON below).
+ *
+ * an empty bucket (size == 0) has no items, no nodes and weight 0.
+ *
+ * returns NULL if an allocation fails or if adding a weight is rejected
+ * by crush_addition_is_unsafe().
+ */
+struct crush_bucket_tree*
+crush_make_tree_bucket(int hash, int type, int size,
+ int *items, /* in leaf order */
+ int *weights)
+{
+ struct crush_bucket_tree *bucket;
+ int depth;
+ int node;
+ int i, j;
+
+ bucket = malloc(sizeof(*bucket));
+ if (!bucket)
+ return NULL;
+ /* zero-fill so the err path can free() members that were never set */
+ memset(bucket, 0, sizeof(*bucket));
+ bucket->h.alg = CRUSH_BUCKET_TREE;
+ bucket->h.hash = hash;
+ bucket->h.type = type;
+ bucket->h.size = size;
+
+ if (size == 0) {
+ bucket->h.items = NULL;
+ bucket->h.weight = 0;
+ bucket->node_weights = NULL;
+ bucket->num_nodes = 0;
+ /* printf("size 0 depth 0 nodes 0\n"); */
+ return bucket;
+ }
+
+ bucket->h.items = malloc(sizeof(__s32)*size);
+ if (!bucket->h.items)
+ goto err;
+
+ /* calc tree depth */
+ depth = calc_depth(size);
+ bucket->num_nodes = 1 << depth;
+ dprintk("size %d depth %d nodes %d\n", size, depth, bucket->num_nodes);
+
+ bucket->node_weights = malloc(sizeof(__u32)*bucket->num_nodes);
+ if (!bucket->node_weights)
+ goto err;
+
+ memset(bucket->h.items, 0, sizeof(__s32)*bucket->h.size);
+ memset(bucket->node_weights, 0, sizeof(__u32)*bucket->num_nodes);
+
+ for (i=0; i<size; i++) {
+ bucket->h.items[i] = items[i];
+ node = crush_calc_tree_node(i);
+ dprintk("item %d node %d weight %d\n", i, node, weights[i]);
+ bucket->node_weights[node] = weights[i];
+
+ if (crush_addition_is_unsafe(bucket->h.weight, weights[i]))
+ goto err;
+
+ bucket->h.weight += weights[i];
+ /* add this item's weight to each of its depth-1 ancestors */
+ for (j=1; j<depth; j++) {
+ node = parent(node);
+
+ if (crush_addition_is_unsafe(bucket->node_weights[node], weights[i]))
+ goto err;
+
+ bucket->node_weights[node] += weights[i];
+ dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+ }
+ }
+ BUG_ON(bucket->node_weights[bucket->num_nodes/2] != bucket->h.weight);
+
+ return bucket;
+err:
+ free(bucket->node_weights);
+ free(bucket->h.items);
+ free(bucket);
+ return NULL;
+}
+
+
+
+/* straw bucket */
+
+/*
+ * this code was written 8 years ago. i have a vague recollection of
+ * drawing boxes underneath bars of different lengths, where the bar
+ * length represented the probability/weight, and that there was some
+ * trial and error involved in arriving at this implementation.
+ * however, reading the code now after all this time, the intuition
+ * that motivated it is lost on me. lame. my only excuse is that I now
+ * know that the approach is fundamentally flawed and am not
+ * particularly motivated to reconstruct the flawed reasoning.
+ *
+ * as best as i can remember, the idea is: sort the weights, and start
+ * with the smallest. arbitrarily scale it at 1.0 (16-bit fixed
+ * point). look at the next larger weight, and calculate the scaling
+ * factor for that straw based on the relative difference in weight so
+ * far. what's not clear to me now is why we are looking at wnext
+ * (the delta to the next bigger weight) for all remaining weights,
+ * and slicing things horizontally instead of considering just the
+ * next item or set of items. or why pow() is used the way it is.
+ *
+ * note that the original version 1 of this function made special
+ * accommodation for the case where straw lengths were identical. this
+ * is also flawed in a non-obvious way; version 2 drops the special
+ * handling and appears to work just as well.
+ *
+ * moral of the story: if you do something clever, write down why it
+ * works.
+ */
+/*
+ * Recompute bucket->straws[] from bucket->item_weights[].
+ *
+ * Items are visited in ascending weight order.  Zero-weight items get a
+ * straw of 0; the first non-zero item gets 1.0 in 16-bit fixed point
+ * (0x10000) and the running factor is scaled with pow() before each
+ * heavier item.  map->straw_calc_version selects the variant:
+ *   - version 0 reuses the same straw for a run of equal weights and
+ *     does not decrement numleft for zero-weight items;
+ *   - version >= 1 has no equal-weight special case and decrements
+ *     numleft for every item, zero-weight ones included.
+ *
+ * Returns 0 on success, or -ENOMEM if the scratch array cannot be
+ * allocated.
+ */
+int crush_calc_straw(struct crush_map *map, struct crush_bucket_straw *bucket)
+{
+	int *reverse;
+	int i, j, k;
+	double straw, wbelow, lastw, wnext, pbelow;
+	int numleft;
+	int size = bucket->h.size;
+	__u32 *weights = bucket->item_weights;
+
+	/*
+	 * Nothing to compute for an empty bucket.  Returning here also
+	 * avoids malloc(0), which is allowed to return NULL and would
+	 * otherwise be misreported as -ENOMEM.
+	 */
+	if (size == 0)
+		return 0;
+
+	/*
+	 * reverse[] lists item indices ordered by ascending weight
+	 * (simple insertion sort)
+	 */
+	reverse = malloc(sizeof(int) * size);
+	if (!reverse)
+		return -ENOMEM;
+	reverse[0] = 0;
+	for (i=1; i<size; i++) {
+		for (j=0; j<i; j++) {
+			if (weights[i] < weights[reverse[j]]) {
+				/* insert here */
+				for (k=i; k>j; k--)
+					reverse[k] = reverse[k-1];
+				reverse[j] = i;
+				break;
+			}
+		}
+		/* not smaller than anything seen so far: append */
+		if (j == i)
+			reverse[i] = i;
+	}
+
+	numleft = size;
+	straw = 1.0;
+	wbelow = 0;
+	lastw = 0;
+
+	i=0;
+	while (i < size) {
+		if (map->straw_calc_version == 0) {
+			/* zero weight items get 0 length straws! */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
+				continue;
+			}
+
+			/* set this item's straw */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
+			i++;
+			if (i == size)
+				break;
+
+			/* same weight as previous? */
+			if (weights[reverse[i]] == weights[reverse[i-1]]) {
+				dprintk("same as previous\n");
+				continue;
+			}
+
+			/* adjust straw for next guy */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			for (j=i; j<size; j++)
+				if (weights[reverse[j]] == weights[reverse[i]])
+					numleft--;
+				else
+					break;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
+
+			lastw = weights[reverse[i-1]];
+		} else if (map->straw_calc_version >= 1) {
+			/* zero weight items get 0 length straws! */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
+				numleft--;
+				continue;
+			}
+
+			/* set this item's straw */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
+			i++;
+			if (i == size)
+				break;
+
+			/* adjust straw for next guy */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			numleft--;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
+
+			lastw = weights[reverse[i-1]];
+		}
+	}
+
+	free(reverse);
+	return 0;
+}
+
+/*
+ * Allocate and fill a CRUSH_BUCKET_STRAW bucket from `size` entries of
+ * items[] and weights[], then compute the straw lengths with
+ * crush_calc_straw() using map's straw_calc_version.
+ *
+ * Returns the new bucket, or NULL if an allocation fails, the total
+ * weight would overflow, or crush_calc_straw() fails.  On failure
+ * everything allocated so far is released.
+ */
+struct crush_bucket_straw *
+crush_make_straw_bucket(struct crush_map *map,
+			int hash,
+			int type,
+			int size,
+			int *items,
+			int *weights)
+{
+	struct crush_bucket_straw *bucket;
+	int i;
+
+	bucket = malloc(sizeof(*bucket));
+	if (!bucket)
+		return NULL;
+	/* zeroing keeps the array pointers NULL so the err path can free() them */
+	memset(bucket, 0, sizeof(*bucket));
+	bucket->h.alg = CRUSH_BUCKET_STRAW;
+	bucket->h.hash = hash;
+	bucket->h.type = type;
+	bucket->h.size = size;
+
+	bucket->h.items = malloc(sizeof(__s32)*size);
+	if (!bucket->h.items)
+		goto err;
+	bucket->item_weights = malloc(sizeof(__u32)*size);
+	if (!bucket->item_weights)
+		goto err;
+	bucket->straws = malloc(sizeof(__u32)*size);
+	if (!bucket->straws)
+		goto err;
+
+	bucket->h.weight = 0;
+	for (i=0; i<size; i++) {
+		/* refuse to build a bucket whose total weight wraps around,
+		 * as the tree constructor and the add/reweight paths do */
+		if (crush_addition_is_unsafe(bucket->h.weight, weights[i]))
+			goto err;
+
+		bucket->h.items[i] = items[i];
+		bucket->h.weight += weights[i];
+		bucket->item_weights[i] = weights[i];
+	}
+
+	if (crush_calc_straw(map, bucket) < 0)
+		goto err;
+
+	return bucket;
+err:
+	free(bucket->straws);
+	free(bucket->item_weights);
+	free(bucket->h.items);
+	free(bucket);
+	return NULL;
+}
+
+/*
+ * Allocate and fill a CRUSH_BUCKET_STRAW2 bucket from `size` entries of
+ * items[] and weights[].  The map argument is not used by this
+ * function.
+ *
+ * Returns the new bucket, or NULL if an allocation fails or the total
+ * weight would overflow.  On failure everything allocated so far is
+ * released.
+ */
+struct crush_bucket_straw2 *
+crush_make_straw2_bucket(struct crush_map *map,
+			 int hash,
+			 int type,
+			 int size,
+			 int *items,
+			 int *weights)
+{
+	struct crush_bucket_straw2 *bucket;
+	int i;
+
+	bucket = malloc(sizeof(*bucket));
+	if (!bucket)
+		return NULL;
+	/* zeroing keeps the array pointers NULL so the err path can free() them */
+	memset(bucket, 0, sizeof(*bucket));
+	bucket->h.alg = CRUSH_BUCKET_STRAW2;
+	bucket->h.hash = hash;
+	bucket->h.type = type;
+	bucket->h.size = size;
+
+	bucket->h.items = malloc(sizeof(__s32)*size);
+	if (!bucket->h.items)
+		goto err;
+	bucket->item_weights = malloc(sizeof(__u32)*size);
+	if (!bucket->item_weights)
+		goto err;
+
+	bucket->h.weight = 0;
+	for (i=0; i<size; i++) {
+		/* refuse to build a bucket whose total weight wraps around,
+		 * as the tree constructor and the add/reweight paths do */
+		if (crush_addition_is_unsafe(bucket->h.weight, weights[i]))
+			goto err;
+
+		bucket->h.items[i] = items[i];
+		bucket->h.weight += weights[i];
+		bucket->item_weights[i] = weights[i];
+	}
+
+	return bucket;
+err:
+	free(bucket->item_weights);
+	free(bucket->h.items);
+	free(bucket);
+	return NULL;
+}
+
+
+
+/*
+ * Build a bucket of the requested algorithm by forwarding to the
+ * matching crush_make_*_bucket() constructor.
+ *
+ * A uniform bucket takes a single per-item weight: weights[0] when the
+ * bucket is non-empty and weights is non-NULL, 0 otherwise.
+ *
+ * Returns the constructor's result (NULL on its failure), or NULL when
+ * alg is not one of the handled algorithms.
+ */
+struct crush_bucket*
+crush_make_bucket(struct crush_map *map,
+		  int alg, int hash, int type, int size,
+		  int *items,
+		  int *weights)
+{
+	if (alg == CRUSH_BUCKET_UNIFORM) {
+		int item_weight = (size && weights) ? weights[0] : 0;
+
+		return (struct crush_bucket *)crush_make_uniform_bucket(hash, type, size, items, item_weight);
+	}
+	if (alg == CRUSH_BUCKET_LIST)
+		return (struct crush_bucket *)crush_make_list_bucket(hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_TREE)
+		return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_STRAW)
+		return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_STRAW2)
+		return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights);
+
+	/* unknown algorithm */
+	return NULL;
+}
+
+
+/************************************************/
+
+/*
+ * Append item to a uniform bucket.
+ *
+ * Returns 0 on success, -EINVAL if weight differs from the bucket's
+ * item_weight, -ERANGE if the bucket weight would overflow, or -ENOMEM
+ * if the items array cannot be grown.  The bucket is left unmodified on
+ * every error.
+ */
+int crush_add_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	void *_realloc = NULL;
+
+	/* Every item of a CRUSH_BUCKET_UNIFORM bucket shares
+	   bucket->item_weight, so an item offered with any other
+	   weight is rejected. */
+	if (bucket->item_weight != weight) {
+		return -EINVAL;
+	}
+
+	/* validate before touching the bucket so a failure changes nothing */
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+
+	bucket->h.items[newsize-1] = item;
+	bucket->h.weight += weight;
+	bucket->h.size++;
+
+	return 0;
+}
+
+/*
+ * Append item with the given weight to a list bucket.
+ *
+ * The three parallel arrays (items, item_weights, sum_weights) are each
+ * grown by one slot; the new sum_weights entry is the previous running
+ * sum plus weight (or just weight for the first item).
+ *
+ * Returns 0 on success, -ENOMEM if an array cannot be grown, or -ERANGE
+ * if the running sum would overflow.  On -ERANGE the arrays have already
+ * been grown and the new item/weight slots written, but h.size and
+ * h.weight are unchanged.
+ */
+int crush_add_list_bucket_item(struct crush_bucket_list *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	const int last = newsize - 1;	/* slot being filled */
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	grown = realloc(bucket->sum_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->sum_weights = grown;
+
+	bucket->h.items[last] = item;
+	bucket->item_weights[last] = weight;
+
+	if (newsize <= 1) {
+		/* first item: the running sum starts here */
+		bucket->sum_weights[last] = weight;
+	} else {
+		if (crush_addition_is_unsafe(bucket->sum_weights[last-1], weight))
+			return -ERANGE;
+		bucket->sum_weights[last] = bucket->sum_weights[last-1] + weight;
+	}
+
+	bucket->h.weight += weight;
+	bucket->h.size++;
+	return 0;
+}
+
+/*
+ * Append item with the given weight as the next leaf of a tree bucket.
+ *
+ * The items array grows by one slot and node_weights is resized to
+ * 1 << calc_depth(newsize) entries.  Entries added by the resize are
+ * zeroed before use, so the interior nodes of a freshly created right
+ * subtree start from 0 instead of whatever realloc() left there.
+ *
+ * Returns 0 on success, -ERANGE if a weight sum would overflow, or
+ * -ENOMEM if an array cannot be grown.  The overflow of the total weight
+ * is checked before anything is modified, and num_nodes is only updated
+ * once node_weights has actually been resized.
+ */
+int crush_add_tree_bucket_item(struct crush_bucket_tree *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	int depth = calc_depth(newsize);
+	/* node counts before and after, derived from the item counts */
+	int old_nodes = bucket->h.size ? 1 << calc_depth(bucket->h.size) : 0;
+	int new_nodes = 1 << depth;
+	int node;
+	int root;
+	int j;
+	void *_realloc = NULL;
+
+	/* fail early: nothing has been touched yet */
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	if ((_realloc = realloc(bucket->node_weights, sizeof(__u32)*new_nodes)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->node_weights = _realloc;
+	}
+	bucket->num_nodes = new_nodes;
+
+	/* realloc() does not initialize the added tail; the loop below
+	 * does "+=" on nodes in it, so they must start from 0 */
+	if (new_nodes > old_nodes)
+		memset(bucket->node_weights + old_nodes, 0,
+		       sizeof(__u32) * (new_nodes - old_nodes));
+
+	node = crush_calc_tree_node(newsize-1);
+	bucket->node_weights[node] = weight;
+
+	/* if the depth increased, initialize the new root node's weight
+	 * before adding the bucket item */
+	root = bucket->num_nodes/2;
+	if (depth >= 2 && (node - 1) == root) {
+		/* the new item is the first node of the right sub tree, so
+		 * the root's initial weight is the left sub tree's weight
+		 */
+		bucket->node_weights[root] = bucket->node_weights[root/2];
+	}
+
+	for (j=1; j<depth; j++) {
+		node = parent(node);
+
+		if (crush_addition_is_unsafe(bucket->node_weights[node], weight))
+			return -ERANGE;
+
+		bucket->node_weights[node] += weight;
+		dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+	}
+
+	bucket->h.items[newsize-1] = item;
+	bucket->h.weight += weight;
+	bucket->h.size++;
+
+	return 0;
+}
+
+/*
+ * Append item with the given weight to a straw bucket and recompute all
+ * straw lengths.
+ *
+ * Returns the result of crush_calc_straw() on success, -ENOMEM if an
+ * array cannot be grown, or -ERANGE if the bucket weight would
+ * overflow.  On -ERANGE the arrays have already been grown and the new
+ * slots written, but h.size and h.weight are unchanged.
+ */
+int crush_add_straw_bucket_item(struct crush_map *map,
+				struct crush_bucket_straw *bucket,
+				int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	const int last = newsize - 1;	/* slot being filled */
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	grown = realloc(bucket->straws, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->straws = grown;
+
+	bucket->h.items[last] = item;
+	bucket->item_weights[last] = weight;
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	bucket->h.weight += weight;
+	bucket->h.size++;
+
+	/* every straw depends on the full weight set, so redo them all */
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Append item with the given weight to a straw2 bucket.  The map
+ * argument is not used by this function.
+ *
+ * Returns 0 on success, -ENOMEM if an array cannot be grown, or -ERANGE
+ * if the bucket weight would overflow.  On -ERANGE the arrays have
+ * already been grown and the new slots written, but h.size and h.weight
+ * are unchanged.
+ */
+int crush_add_straw2_bucket_item(struct crush_map *map,
+				 struct crush_bucket_straw2 *bucket,
+				 int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	const int last = newsize - 1;	/* slot being filled */
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	bucket->h.items[last] = item;
+	bucket->item_weights[last] = weight;
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	bucket->h.weight += weight;
+	bucket->h.size++;
+
+	return 0;
+}
+
+/*
+ * Add item with the given weight to bucket b, dispatching on b->alg to
+ * the algorithm-specific crush_add_*_bucket_item() helper.
+ *
+ * Returns the helper's result, or -1 if b->alg is not a handled
+ * algorithm.
+ */
+int crush_bucket_add_item(struct crush_map *map,
+			  struct crush_bucket *b, int item, int weight)
+{
+	int ret = -1;	/* result for an unknown algorithm */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		ret = crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_LIST:
+		ret = crush_add_list_bucket_item((struct crush_bucket_list *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_TREE:
+		ret = crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		ret = crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		ret = crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+/************************************************/
+
+/*
+ * Remove the first occurrence of item from a uniform bucket.
+ *
+ * Later items are shifted down by one, h.size is decremented and
+ * h.weight is reduced by item_weight (clamped at 0).  The items array is
+ * then shrunk, unless the bucket became empty, in which case the
+ * allocation is simply kept.
+ *
+ * Returns 0 on success, -ENOENT if item is not in the bucket, or
+ * -ENOMEM if shrinking the array fails (the item has been removed by
+ * then).
+ */
+int crush_remove_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item)
+{
+	unsigned i, j;
+	int newsize;
+	void *_realloc = NULL;
+
+	for (i = 0; i < bucket->h.size; i++)
+		if (bucket->h.items[i] == item)
+			break;
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	/* stop one short of the end: items[j+1] must stay inside the array */
+	for (j = i; j + 1 < bucket->h.size; j++)
+		bucket->h.items[j] = bucket->h.items[j+1];
+	newsize = --bucket->h.size;
+	if (bucket->item_weight < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weight;
+	else
+		bucket->h.weight = 0;
+
+	if (newsize == 0) {
+		/* don't bother reallocating a 0-length array: realloc(p, 0)
+		 * may free p and return NULL, which would look like -ENOMEM
+		 * and leave h.items dangling */
+		return 0;
+	}
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of item from a list bucket.
+ *
+ * Later entries of items, item_weights and sum_weights are shifted down
+ * by one; each shifted running sum is reduced by the removed item's
+ * weight.  h.weight is reduced by that weight (clamped at 0) and h.size
+ * is decremented.  The arrays are then shrunk, unless the bucket became
+ * empty, in which case the allocations are simply kept.
+ *
+ * Returns 0 on success, -ENOENT if item is not in the bucket, or
+ * -ENOMEM if shrinking an array fails (the item has been removed by
+ * then).
+ */
+int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item)
+{
+	unsigned i, j;
+	int newsize;
+	unsigned weight;
+	void *_realloc = NULL;
+
+	for (i = 0; i < bucket->h.size; i++)
+		if (bucket->h.items[i] == item)
+			break;
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	weight = bucket->item_weights[i];
+	/* stop one short of the end: index j+1 must stay inside the arrays */
+	for (j = i; j + 1 < bucket->h.size; j++) {
+		bucket->h.items[j] = bucket->h.items[j+1];
+		bucket->item_weights[j] = bucket->item_weights[j+1];
+		bucket->sum_weights[j] = bucket->sum_weights[j+1] - weight;
+	}
+	if (weight < bucket->h.weight)
+		bucket->h.weight -= weight;
+	else
+		bucket->h.weight = 0;
+	newsize = --bucket->h.size;
+
+	if (newsize == 0) {
+		/* don't bother reallocating 0-length arrays: realloc(p, 0)
+		 * may free p and return NULL, which would look like -ENOMEM
+		 * and leave the bucket with dangling pointers */
+		return 0;
+	}
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->item_weights = _realloc;
+	}
+	if ((_realloc = realloc(bucket->sum_weights, sizeof(__u32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->sum_weights = _realloc;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of item from a tree bucket.
+ *
+ * The slot is not compacted: items[i] is set to 0, the leaf's weight is
+ * zeroed and subtracted from its ancestors and from h.weight (clamped
+ * at 0).  Afterwards trailing leaves whose weight is 0 are trimmed from
+ * the end of the bucket, shrinking items and, when the depth changes,
+ * node_weights.
+ *
+ * Returns 0 on success, -ENOENT if item is not in the bucket, or
+ * -ENOMEM if shrinking an array fails (the item has been cleared by
+ * then, but h.size is not yet updated).
+ */
+int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item)
+{
+	unsigned i;
+	unsigned newsize;
+	/* the size does not change while searching, so neither does this */
+	int depth = calc_depth(bucket->h.size);
+
+	for (i = 0; i < bucket->h.size; i++) {
+		int node;
+		unsigned weight;
+		int j;
+
+		if (bucket->h.items[i] != item)
+			continue;
+
+		bucket->h.items[i] = 0;
+		node = crush_calc_tree_node(i);
+		weight = bucket->node_weights[node];
+		bucket->node_weights[node] = 0;
+
+		/* take the leaf's weight out of every ancestor */
+		for (j = 1; j < depth; j++) {
+			node = parent(node);
+			bucket->node_weights[node] -= weight;
+			dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+		}
+		if (weight < bucket->h.weight)
+			bucket->h.weight -= weight;
+		else
+			bucket->h.weight = 0;
+		break;
+	}
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	/* trim trailing leaves that carry no weight */
+	newsize = bucket->h.size;
+	while (newsize > 0) {
+		int node = crush_calc_tree_node(newsize - 1);
+		if (bucket->node_weights[node])
+			break;
+		--newsize;
+	}
+
+	if (newsize != bucket->h.size) {
+		int olddepth, newdepth;
+
+		void *_realloc = NULL;
+
+		/* skip a 0-length realloc: realloc(p, 0) may free p and
+		 * return NULL, which would look like -ENOMEM and leave
+		 * h.items dangling */
+		if (newsize > 0) {
+			if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+				return -ENOMEM;
+			} else {
+				bucket->h.items = _realloc;
+			}
+		}
+
+		olddepth = calc_depth(bucket->h.size);
+		newdepth = calc_depth(newsize);
+		if (olddepth != newdepth) {
+			int nodes = 1 << newdepth;
+
+			if ((_realloc = realloc(bucket->node_weights,
+						sizeof(__u32)*nodes)) == NULL) {
+				return -ENOMEM;
+			} else {
+				bucket->node_weights = _realloc;
+			}
+			/* only record the new count once the resize succeeded */
+			bucket->num_nodes = nodes;
+		}
+
+		bucket->h.size = newsize;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of item from a straw bucket and recompute
+ * the straw lengths.
+ *
+ * h.weight is reduced by the item's weight (clamped at 0); later entries
+ * of items and item_weights are shifted down by one.  If the bucket
+ * becomes empty the arrays are left allocated and 0 is returned without
+ * recomputing straws.
+ *
+ * Returns the result of crush_calc_straw() on success, -ENOENT if item
+ * is not in the bucket, or -ENOMEM if shrinking an array fails.
+ */
+int crush_remove_straw_bucket_item(struct crush_map *map,
+				   struct crush_bucket_straw *bucket, int item)
+{
+	int newsize = bucket->h.size - 1;
+	unsigned pos, k;
+	void *shrunk;
+
+	/* locate the item */
+	for (pos = 0; pos < bucket->h.size; pos++)
+		if (bucket->h.items[pos] == item)
+			break;
+	if (pos == bucket->h.size)
+		return -ENOENT;
+
+	if (bucket->item_weights[pos] < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weights[pos];
+	else
+		bucket->h.weight = 0;
+
+	/* close the gap */
+	for (k = pos; k < bucket->h.size - 1; k++) {
+		bucket->h.items[k] = bucket->h.items[k+1];
+		bucket->item_weights[k] = bucket->item_weights[k+1];
+	}
+
+	bucket->h.size--;
+	if (bucket->h.size == 0) {
+		/* don't bother reallocating */
+		return 0;
+	}
+
+	shrunk = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->h.items = shrunk;
+
+	shrunk = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->item_weights = shrunk;
+
+	shrunk = realloc(bucket->straws, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->straws = shrunk;
+
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Remove the first occurrence of item from a straw2 bucket.  The map
+ * argument is not used by this function.
+ *
+ * h.weight is reduced by the item's weight (clamped at 0); later entries
+ * of items and item_weights are shifted down by one.  If the bucket
+ * becomes empty the arrays are left allocated.
+ *
+ * Returns 0 on success, -ENOENT if item is not in the bucket, or
+ * -ENOMEM if shrinking an array fails.
+ */
+int crush_remove_straw2_bucket_item(struct crush_map *map,
+				    struct crush_bucket_straw2 *bucket, int item)
+{
+	int newsize = bucket->h.size - 1;
+	unsigned pos, k;
+	void *shrunk;
+
+	/* locate the item */
+	for (pos = 0; pos < bucket->h.size; pos++)
+		if (bucket->h.items[pos] == item)
+			break;
+	if (pos == bucket->h.size)
+		return -ENOENT;
+
+	if (bucket->item_weights[pos] < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weights[pos];
+	else
+		bucket->h.weight = 0;
+
+	/* close the gap */
+	for (k = pos; k < bucket->h.size - 1; k++) {
+		bucket->h.items[k] = bucket->h.items[k+1];
+		bucket->item_weights[k] = bucket->item_weights[k+1];
+	}
+
+	bucket->h.size--;
+	if (newsize == 0) {
+		/* don't bother reallocating a 0-length array. */
+		return 0;
+	}
+
+	shrunk = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->h.items = shrunk;
+
+	shrunk = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->item_weights = shrunk;
+
+	return 0;
+}
+
+/*
+ * Remove item from bucket b, dispatching on b->alg to the
+ * algorithm-specific crush_remove_*_bucket_item() helper.
+ *
+ * Returns the helper's result, or -1 if b->alg is not a handled
+ * algorithm.
+ */
+int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *b, int item)
+{
+	int ret = -1;	/* result for an unknown algorithm */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		ret = crush_remove_uniform_bucket_item((struct crush_bucket_uniform *)b, item);
+		break;
+	case CRUSH_BUCKET_LIST:
+		ret = crush_remove_list_bucket_item((struct crush_bucket_list *)b, item);
+		break;
+	case CRUSH_BUCKET_TREE:
+		ret = crush_remove_tree_bucket_item((struct crush_bucket_tree *)b, item);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		ret = crush_remove_straw_bucket_item(map, (struct crush_bucket_straw *)b, item);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		ret = crush_remove_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+
+/************************************************/
+
+/*
+ * Set the per-item weight of a uniform bucket.  The item argument is
+ * not consulted: the new weight applies to every item, and h.weight is
+ * recomputed as item_weight * h.size.
+ *
+ * Returns the resulting change of the bucket's total weight.
+ */
+int crush_adjust_uniform_bucket_item_weight(struct crush_bucket_uniform *bucket, int item, int weight)
+{
+	int delta;
+
+	/* computed against the old item_weight, before it is overwritten */
+	delta = (weight - bucket->item_weight) * bucket->h.size;
+
+	bucket->item_weight = weight;
+	bucket->h.weight = bucket->item_weight * bucket->h.size;
+
+	return delta;
+}
+
+/*
+ * Change the weight of the first occurrence of item in a list bucket.
+ *
+ * The difference is applied to item_weights[i], to h.weight and to every
+ * running sum from the item's position to the end of the bucket.
+ *
+ * Returns the weight difference, or 0 if item is not in the bucket.
+ */
+int crush_adjust_list_bucket_item_weight(struct crush_bucket_list *bucket, int item, int weight)
+{
+	unsigned pos, k;
+	int delta;
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	/* every running sum at or after the item includes its weight */
+	for (k = pos; k < bucket->h.size; k++)
+		bucket->sum_weights[k] += delta;
+
+	return delta;
+}
+
+/*
+ * Change the weight of the first occurrence of item in a tree bucket.
+ *
+ * The difference is applied to the item's leaf node, to h.weight and to
+ * each of the depth-1 ancestors reached through parent().
+ *
+ * Returns the weight difference, or 0 if item is not in the bucket.
+ */
+int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree *bucket, int item, int weight)
+{
+	unsigned depth = calc_depth(bucket->h.size);
+	unsigned pos, level;
+	int node;
+	int delta;
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	node = crush_calc_tree_node(pos);
+	delta = weight - bucket->node_weights[node];
+	bucket->node_weights[node] = weight;
+	bucket->h.weight += delta;
+
+	/* walk up towards the root, adjusting each ancestor */
+	for (level = 1; level < depth; level++) {
+		node = parent(node);
+		bucket->node_weights[node] += delta;
+	}
+
+	return delta;
+}
+
+/*
+ * Change the weight of the first occurrence of item in a straw bucket
+ * and recompute the straw lengths.
+ *
+ * Returns the weight difference, 0 if item is not in the bucket, or the
+ * negative error from crush_calc_straw() if recomputing fails (the
+ * weight has already been changed by then).
+ */
+int crush_adjust_straw_bucket_item_weight(struct crush_map *map,
+					  struct crush_bucket_straw *bucket,
+					  int item, int weight)
+{
+	unsigned pos = 0;
+	int delta;
+	int err;
+
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	err = crush_calc_straw(map, bucket);
+	return (err < 0) ? err : delta;
+}
+
+/*
+ * Change the weight of the first occurrence of item in a straw2 bucket.
+ * The map argument is not used by this function.
+ *
+ * Returns the weight difference, or 0 if item is not in the bucket.
+ */
+int crush_adjust_straw2_bucket_item_weight(struct crush_map *map,
+					   struct crush_bucket_straw2 *bucket,
+					   int item, int weight)
+{
+	unsigned pos = 0;
+	int delta;
+
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	return delta;
+}
+
+/*
+ * Change the weight of item in bucket b, dispatching on b->alg to the
+ * algorithm-specific crush_adjust_*_bucket_item_weight() helper.
+ *
+ * Returns the helper's result, or -1 if b->alg is not a handled
+ * algorithm.
+ */
+int crush_bucket_adjust_item_weight(struct crush_map *map,
+				    struct crush_bucket *b,
+				    int item, int weight)
+{
+	int ret = -1;	/* result for an unknown algorithm */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		ret = crush_adjust_uniform_bucket_item_weight((struct crush_bucket_uniform *)b,
+							      item, weight);
+		break;
+	case CRUSH_BUCKET_LIST:
+		ret = crush_adjust_list_bucket_item_weight((struct crush_bucket_list *)b,
+							   item, weight);
+		break;
+	case CRUSH_BUCKET_TREE:
+		ret = crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree *)b,
+							   item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		ret = crush_adjust_straw_bucket_item_weight(map,
+							    (struct crush_bucket_straw *)b,
+							    item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		ret = crush_adjust_straw2_bucket_item_weight(map,
+							     (struct crush_bucket_straw2 *)b,
+							     item, weight);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+/************************************************/
+
+/*
+ * Recompute the weight of a uniform bucket from its children.
+ *
+ * Items with a negative id are child buckets (map->buckets[-1-id]);
+ * each is reweighted recursively (its return value is not checked) and
+ * its weight accumulated.  Items with a non-negative id are only
+ * counted.  When child buckets outnumber the other items, item_weight
+ * becomes the average child-bucket weight; otherwise it is left as is.
+ * h.weight is always reset to item_weight * h.size.
+ *
+ * Returns 0, or -ERANGE if summing the child weights would overflow.
+ */
+static int crush_reweight_uniform_bucket(struct crush_map *map, struct crush_bucket_uniform *bucket)
+{
+	unsigned idx;
+	unsigned child_sum = 0, child_buckets = 0, leaf_items = 0;
+
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		const int id = bucket->h.items[idx];
+		struct crush_bucket *child;
+
+		if (id >= 0) {
+			leaf_items++;
+			continue;
+		}
+
+		child = map->buckets[-1-id];
+		crush_reweight_bucket(map, child);
+
+		if (crush_addition_is_unsafe(child_sum, child->weight))
+			return -ERANGE;
+
+		child_sum += child->weight;
+		child_buckets++;
+	}
+
+	/* more bucket children than leaves, average! */
+	if (child_buckets > leaf_items)
+		bucket->item_weight = child_sum / child_buckets;
+	bucket->h.weight = bucket->item_weight * bucket->h.size;
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a list bucket from its items.
+ *
+ * Items with a negative id are child buckets (map->buckets[-1-id]);
+ * each is reweighted recursively (its return value is not checked) and
+ * its weight copied into item_weights[].  h.weight becomes the sum of
+ * all item weights.  sum_weights[] is not touched here.
+ *
+ * Returns 0, or -ERANGE if the sum would overflow.
+ */
+static int crush_reweight_list_bucket(struct crush_map *map, struct crush_bucket_list *bucket)
+{
+	unsigned idx;
+
+	bucket->h.weight = 0;
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		const int id = bucket->h.items[idx];
+		__u32 *slot = &bucket->item_weights[idx];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			*slot = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, *slot))
+			return -ERANGE;
+
+		bucket->h.weight += *slot;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a tree bucket from its items.
+ *
+ * Items with a negative id are child buckets (map->buckets[-1-id]);
+ * each is reweighted recursively (its return value is not checked) and
+ * its weight stored in the item's leaf node.  h.weight becomes the sum
+ * of all leaf weights.  Interior node weights are not updated here.
+ *
+ * Returns 0, or -ERANGE if the sum would overflow.
+ */
+static int crush_reweight_tree_bucket(struct crush_map *map, struct crush_bucket_tree *bucket)
+{
+	unsigned idx;
+
+	bucket->h.weight = 0;
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		const int id = bucket->h.items[idx];
+		__u32 *leaf = &bucket->node_weights[crush_calc_tree_node(idx)];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			*leaf = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, *leaf))
+			return -ERANGE;
+
+		bucket->h.weight += *leaf;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a straw bucket from its items, then rebuild
+ * the straw lengths.
+ *
+ * Items with a negative id are child buckets (map->buckets[-1-id]);
+ * each is reweighted recursively (its return value is not checked) and
+ * its weight copied into item_weights[].  h.weight becomes the sum of
+ * all item weights.
+ *
+ * Returns 0 on success, -ERANGE if the sum would overflow, or the
+ * negative error from crush_calc_straw() if the straws could not be
+ * recomputed.
+ */
+static int crush_reweight_straw_bucket(struct crush_map *map, struct crush_bucket_straw *bucket)
+{
+	unsigned i;
+
+	bucket->h.weight = 0;
+	for (i = 0; i < bucket->h.size; i++) {
+		int id = bucket->h.items[i];
+		if (id < 0) {
+			struct crush_bucket *c = map->buckets[-1-id];
+			crush_reweight_bucket(map, c);
+			bucket->item_weights[i] = c->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i]))
+			return -ERANGE;
+
+		bucket->h.weight += bucket->item_weights[i];
+	}
+
+	/* report a failed recomputation instead of claiming success with
+	 * stale straws */
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Recompute the weight of a straw2 bucket from its items.
+ *
+ * Items with a negative id are child buckets (map->buckets[-1-id]);
+ * each is reweighted recursively (its return value is not checked) and
+ * its weight copied into item_weights[].  h.weight becomes the sum of
+ * all item weights.
+ *
+ * Returns 0, or -ERANGE if the sum would overflow.
+ */
+static int crush_reweight_straw2_bucket(struct crush_map *map, struct crush_bucket_straw2 *bucket)
+{
+	unsigned idx;
+
+	bucket->h.weight = 0;
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		const int id = bucket->h.items[idx];
+		__u32 *slot = &bucket->item_weights[idx];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			*slot = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, *slot))
+			return -ERANGE;
+
+		bucket->h.weight += *slot;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of bucket b (and, recursively, of its child
+ * buckets), dispatching on b->alg to the algorithm-specific
+ * crush_reweight_*_bucket() helper.
+ *
+ * Returns the helper's result, or -1 if b->alg is not a handled
+ * algorithm.
+ */
+int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *b)
+{
+	int ret = -1;	/* result for an unknown algorithm */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		ret = crush_reweight_uniform_bucket(map, (struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		ret = crush_reweight_list_bucket(map, (struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		ret = crush_reweight_tree_bucket(map, (struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		ret = crush_reweight_straw_bucket(map, (struct crush_bucket_straw *)b);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		ret = crush_reweight_straw2_bucket(map, (struct crush_bucket_straw2 *)b);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Build a crush_choose_arg array with one entry per bucket slot of map,
+ * pre-filled with each bucket's own item weights and item ids.
+ *
+ * Everything lives in a single malloc() block laid out as:
+ *   crush_choose_arg[max_buckets]
+ *   crush_weight_set[bucket_count * num_positions]
+ *   __u32 weights[sum_bucket_size * num_positions]
+ *   __s32 ids[sum_bucket_size]
+ * The returned pointer is the start of that block, so a single free()
+ * (see crush_destroy_choose_args()) releases all of it.  Entries for
+ * empty bucket slots are zeroed.
+ *
+ * NOTE(review): every bucket is read through a crush_bucket_straw2
+ * pointer (item_weights), whatever its alg -- presumably callers only
+ * use this on straw2 maps; confirm.
+ *
+ * Returns the array, or NULL if the allocation fails.
+ */
+struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions)
+{
+	int b;
+	int sum_bucket_size = 0;
+	int bucket_count = 0;
+	/* first pass: count buckets and items to size the allocation */
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == 0)
+			continue;
+		sum_bucket_size += map->buckets[b]->size;
+		bucket_count++;
+	}
+	dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n",
+		sum_bucket_size, map->max_buckets, bucket_count);
+	int size = (sizeof(struct crush_choose_arg) * map->max_buckets +
+		    sizeof(struct crush_weight_set) * bucket_count * num_positions +
+		    sizeof(__u32) * sum_bucket_size * num_positions + // weights
+		    sizeof(__s32) * sum_bucket_size); // ids
+	char *space = malloc(size);
+	/* every pointer below is carved out of space; bail out before
+	 * dereferencing any of them */
+	if (!space)
+		return NULL;
+	struct crush_choose_arg *arg = (struct crush_choose_arg *)space;
+	struct crush_weight_set *weight_set = (struct crush_weight_set *)(arg + map->max_buckets);
+	__u32 *weights = (__u32 *)(weight_set + bucket_count * num_positions);
+	char *weight_set_ends __attribute__((unused)) = (char*)weights;
+	__s32 *ids = (__s32 *)(weights + sum_bucket_size * num_positions);
+	char *weights_end __attribute__((unused)) = (char *)ids;
+	char *ids_end __attribute__((unused)) = (char *)(ids + sum_bucket_size);
+	BUG_ON(space + size != ids_end);
+	/* second pass: hand each bucket its slice of every region */
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == 0) {
+			memset(&arg[b], '\0', sizeof(struct crush_choose_arg));
+			continue;
+		}
+		struct crush_bucket_straw2 *bucket = (struct crush_bucket_straw2 *)map->buckets[b];
+
+		int position;
+		for (position = 0; position < num_positions; position++) {
+			memcpy(weights, bucket->item_weights, sizeof(__u32) * bucket->h.size);
+			weight_set[position].weights = weights;
+			weight_set[position].size = bucket->h.size;
+			dprintk("moving weight %d bytes forward\n", (int)((weights + bucket->h.size) - weights));
+			weights += bucket->h.size;
+		}
+		arg[b].weight_set = weight_set;
+		arg[b].weight_set_positions = num_positions;
+		weight_set += position;
+
+		memcpy(ids, bucket->h.items, sizeof(__s32) * bucket->h.size);
+		arg[b].ids = ids;
+		arg[b].ids_size = bucket->h.size;
+		ids += bucket->h.size;
+	}
+	/* each cursor must have landed exactly on the end of its region */
+	BUG_ON((char*)weight_set_ends != (char*)weight_set);
+	BUG_ON((char*)weights_end != (char*)weights);
+	BUG_ON((char*)ids != (char*)ids_end);
+	return arg;
+}
+
+/*
+ * Release a crush_choose_arg array with free().  crush_make_choose_args()
+ * returns the start of its single allocation, so this one call frees the
+ * whole array.  Passing NULL is harmless, as with free().
+ */
+void crush_destroy_choose_args(struct crush_choose_arg *args)
+{
+ free(args);
+}
+
+/***************************/
+
+/* methods to check for safe arithmetic operations */
+
+/*
+ * Returns 1 if a + b would exceed the largest __u32 value, 0 otherwise.
+ * The test is done by subtraction, so it cannot itself overflow.
+ */
+int crush_addition_is_unsafe(__u32 a, __u32 b)
+{
+	const __u32 headroom = ((__u32)(-1)) - b;	/* largest a that still fits */
+
+	return headroom < a;
+}
+
+/*
+ * Returns 1 if a * b would exceed the largest __u32 value, 0 otherwise.
+ *
+ * A product with a zero factor is 0 and can never overflow, so it is
+ * reported as safe whichever operand is zero.
+ */
+int crush_multiplication_is_unsafe(__u32 a, __u32 b)
+{
+	/* zero factor: always safe; this also prevents division by zero */
+	if (!a || !b)
+		return 0;
+	if ((((__u32)(-1)) / b) < a)
+		return 1;
+	else
+		return 0;
+}
+
+/***************************/
+
+/* methods to configure crush_map */
+
+/*
+ * Load the legacy tunable values into map, including
+ * straw_calc_version 0 and the legacy set of allowed bucket algorithms.
+ */
+void set_legacy_crush_map(struct crush_map *map) {
+	/* retry limits */
+	map->choose_total_tries = 19;
+	map->choose_local_tries = 2;
+	map->choose_local_fallback_tries = 5;
+
+	/* chooseleaf behaviour switches, all off */
+	map->chooseleaf_descend_once = 0;
+	map->chooseleaf_vary_r = 0;
+	map->chooseleaf_stable = 0;
+
+	map->straw_calc_version = 0;
+
+	// by default, use legacy types, and also exclude tree,
+	// since it was buggy.
+	map->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+}
+
+/*
+ * Load the "optimal" tunable values into map and allow the uniform,
+ * list, straw and straw2 bucket algorithms (tree is not in the mask).
+ * straw_calc_version is not modified by this function.
+ */
+void set_optimal_crush_map(struct crush_map *map) {
+	/* retry limits */
+	map->choose_total_tries = 50;
+	map->choose_local_tries = 0;
+	map->choose_local_fallback_tries = 0;
+
+	/* chooseleaf behaviour switches, all on */
+	map->chooseleaf_descend_once = 1;
+	map->chooseleaf_vary_r = 1;
+	map->chooseleaf_stable = 1;
+
+	map->allowed_bucket_algs = (
+		(1 << CRUSH_BUCKET_UNIFORM) |
+		(1 << CRUSH_BUCKET_LIST) |
+		(1 << CRUSH_BUCKET_STRAW) |
+		(1 << CRUSH_BUCKET_STRAW2));
+}
diff --git a/src/crush/builder.h b/src/crush/builder.h
new file mode 100644
index 00000000..bdf0a4b9
--- /dev/null
+++ b/src/crush/builder.h
@@ -0,0 +1,344 @@
+#ifndef CEPH_CRUSH_BUILDER_H
+#define CEPH_CRUSH_BUILDER_H
+
+#include "include/int_types.h"
+
+struct crush_bucket;
+struct crush_choose_arg;
+struct crush_map;
+struct crush_rule;
+
+/** @ingroup API
+ *
+ * Allocate a crush_map with __malloc(3)__ and initialize it. The
+ * caller is responsible for deallocating the crush_map with
+ * crush_destroy().
+ *
+ * The content of the allocated crush_map is set with
+ * set_optimal_crush_map(). The caller is responsible for setting each
+ * tunable in the __crush_map__ for backward compatibility or mapping
+ * stability.
+ *
+ * The function takes no argument; it is declared with (void) so that
+ * C compilers see a real prototype and reject calls with arguments.
+ *
+ * @returns a pointer to the newly created crush_map or NULL
+ */
+extern struct crush_map *crush_create(void);
+/** @ingroup API
+ *
+ * Analyze the content of __map__ and set the internal values required
+ * before it can be used to map values with crush_do_rule(). The caller
+ * must make sure it is run before crush_do_rule() and after any
+ * function that modifies the __map__ (crush_add_bucket(), etc.).
+ *
+ * @param map the crush_map
+ */
+extern void crush_finalize(struct crush_map *map);
+
+/* rules */
+/** @ingroup API
+ *
+ * Allocate an empty crush_rule structure large enough to store __len__ steps.
+ * Steps can be added to a rule via crush_rule_set_step(). The __ruleset__
+ * is a user defined integer, not used by __libcrush__ and stored in
+ * the allocated rule at __rule->mask.ruleset__.
+ *
+ * The rule is designed to allow crush_do_rule() to get at least __minsize__ items
+ * and at most __maxsize__ items.
+ *
+ * The __type__ is defined by the caller and will be used by
+ * crush_find_rule() when looking for a rule and by
+ * __CRUSH_RULE_CHOOSE*__ steps when looking for items.
+ *
+ * The caller is responsible for deallocating the returned pointer via
+ * crush_destroy_rule().
+ *
+ * If __malloc(3)__ fails, return NULL.
+ *
+ * @param len number of steps in the rule
+ * @param ruleset user defined value
+ * @param type user defined value
+ * @param minsize minimum number of items the rule can map
+ * @param maxsize maximum number of items the rule can map
+ *
+ * @returns a pointer to the newly created rule or NULL
+ */
+extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize);
+/** @ingroup API
+ *
+ * Set the __pos__ step of the __rule__ to an operand and up to two arguments.
+ * The value of the operand __op__ determines if the arguments are used and how:
+ *
+ * - __CRUSH_RULE_NOOP__ do nothing.
+ * - __CRUSH_RULE_TAKE__ select the __arg1__ item
+ * - __CRUSH_RULE_EMIT__ append the selection to the results and clear
+ * the selection
+ *
+ * - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
+ * recursively explore each bucket currently selected, looking for
+ * __arg1__ items of type __arg2__ and select them.
+ * - __CRUSH_RULE_CHOOSELEAF_FIRSTN__ and __CRUSH_RULE_CHOOSELEAF_INDEP__
+ * recursively explore each bucket currently selected, looking for
+ * __arg1__ leaves within all the buckets of type __arg2__ and
+ * select them.
+ *
+ * In all __CHOOSE__ steps, if __arg1__ is less than or equal to zero,
+ * the number of items to select is the __max_result__ argument of
+ * crush_do_rule() minus the absolute value of __arg1__. It is common to
+ * set __arg1__ to zero to select as many items as requested by __max_result__.
+ *
+ * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
+ *
+ * The CHOOSE_FIRSTN and CHOOSE_INDEP rule step look for buckets of
+ * a given type, randomly selecting them. If they are unlucky and
+ * find the same bucket twice, they will try N+1 times (N being the
+ * value of the choose_total_tries tunable). If there is a previous
+ * SET_CHOOSE_TRIES step in the same rule, it will try C times
+ * instead (C being the value of the argument of the
+ * SET_CHOOSE_TRIES step).
+ *
+ * Note: the __choose_total_tries__ tunable defined in crush_map is
+ * the number of retries, not the number of tries. The number of tries
+ * is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the
+ * number of tries and does not need the + 1. This confusing
+ * difference is inherited from an off-by-one bug from years ago.
+ *
+ * The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule step do the same
+ * as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
+ * each bucket found, looking for a single device. The same device
+ * may be found in two different buckets because the crush map is
+ * not a strict hierarchy, it is a DAG. When such a collision
+ * happens, they will try again. The number of times they try to
+ * find a non colliding device is:
+ *
+ * - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
+ * step: try N + 1 times (N being the value of the
+ * __choose_total_tries__ tunable defined in crush_map)
+ *
+ * - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
+ * step: try P times (P being the value of the argument of the
+ * SET_CHOOSELEAF_TRIES rule step)
+ *
+ * - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
+ * step: try 1 time.
+ *
+ * - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
+ * P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
+ * rule step)
+ *
+ * @param rule the rule in which the step is inserted
+ * @param pos the zero based step index
+ * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
+ * @param arg1 first argument for __op__
+ * @param arg2 second argument for __op__
+ */
+extern void crush_rule_set_step(struct crush_rule *rule, int pos, int op, int arg1, int arg2);
+/** @ingroup API
+ *
+ * Add the __rule__ into the crush __map__ and assign it the
+ * __ruleno__ unique identifier. If __ruleno__ is -1, the function will
+ * assign the lowest available identifier. The __ruleno__ value must be
+ * a positive integer lower than __CRUSH_MAX_RULES__.
+ *
+ * - return -ENOSPC if the rule identifier is >= __CRUSH_MAX_RULES__
+ * - return -ENOMEM if __realloc(3)__ fails to expand the array of
+ * rules in the __map__
+ *
+ * @param map the crush_map
+ * @param rule the rule to add to the __map__
+ * @param ruleno a positive integer < __CRUSH_MAX_RULES__ or -1
+ *
+ * @returns the rule unique identifier on success, < 0 on error
+ */
+extern int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno);
+
+/* buckets */
+/*
+ * NOTE(review): presumably returns a bucket identifier that is not yet
+ * used in __map__ (bucket identifiers are negative, see
+ * crush_add_bucket()) -- confirm against the definition in builder.c.
+ */
+extern int crush_get_next_bucket_id(struct crush_map *map);
+/** @ingroup API
+ *
+ * Add __bucket__ into the crush __map__ and assign it the
+ * __bucketno__ unique identifier. If __bucketno__ is 0, the function
+ * will assign the lowest available identifier. The bucket identifier
+ * must be a negative integer. The bucket identifier is returned via
+ * __idout__.
+ *
+ * - return -ENOMEM if __realloc(3)__ fails to expand the array of
+ * buckets in the __map__
+ * - return -EEXIST if the __bucketno__ identifier is already assigned
+ * to another bucket.
+ *
+ * @param[in] map the crush_map
+ * @param[in] bucketno the bucket unique identifier or 0
+ * @param[in] bucket the bucket to add to the __map__
+ * @param[out] idout a pointer to the bucket identifier
+ *
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_add_bucket(struct crush_map *map,
+ int bucketno,
+ struct crush_bucket *bucket, int *idout);
+/** @ingroup API
+ *
+ * Allocate a crush_bucket with __malloc(3)__ and initialize it. The
+ * content of the bucket is filled with __size__ items from
+ * __items__. The item selection is set to use __alg__ which is one of
+ * ::CRUSH_BUCKET_UNIFORM , ::CRUSH_BUCKET_LIST or
+ * ::CRUSH_BUCKET_STRAW2. The initial __items__ are assigned a
+ * weight from the __weights__ array, depending on the value of
+ * __alg__. If __alg__ is ::CRUSH_BUCKET_UNIFORM, all items are set
+ * to have a weight equal to __weights[0]__, otherwise the weight of
+ * __items[x]__ is set to be the value of __weights[x]__.
+ *
+ * The caller is responsible for deallocating the returned pointer via
+ * crush_destroy_bucket().
+ *
+ * @param map __unused__
+ * @param alg algorithm for item selection
+ * @param hash always set to CRUSH_HASH_RJENKINS1
+ * @param type user defined bucket type
+ * @param size of the __items__ array
+ * @param items array of __size__ items
+ * @param weights the weight of each item in __items__, depending on __alg__
+ *
+ * @returns a pointer to the newly created bucket or NULL
+ */
+struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights);
+/**
+ * Build an array of crush_choose_arg for the buckets of __map__, with
+ * __num_positions__ weight set positions per bucket.
+ * NOTE(review): presumably each entry also receives a copy of the
+ * bucket items as its __ids__ and the whole result is one allocation
+ * -- confirm against the definition in builder.c.
+ *
+ * @param map the crush_map
+ * @param num_positions number of weight set positions for each bucket
+ *
+ * @returns a pointer to the crush_choose_arg array
+ */
+extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions);
+/**
+ * Release __args__ with __free(3)__.
+ * NOTE(review): presumably meant for arrays returned by
+ * crush_make_choose_args() -- confirm.
+ *
+ * @param args the crush_choose_arg array to release
+ */
+extern void crush_destroy_choose_args(struct crush_choose_arg *args);
+/** @ingroup API
+ *
+ * Add __item__ to __bucket__ with __weight__. The weight of the new
+ * item is added to the weight of the bucket so that it reflects
+ * the total weight of all items.
+ *
+ * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM, the value of __weight__ must be equal to
+ * __(struct crush_bucket_uniform *)bucket->item_weight__.
+ *
+ * - return -ENOMEM if the __bucket__ cannot be resized with __realloc(3)__.
+ * - return -ERANGE if adding __weight__ to the weight of the bucket overflows.
+ * - return -EINVAL if __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM and
+ * the __weight__ is not equal to __(struct crush_bucket_uniform *)bucket->item_weight__.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
+/** @ingroup API
+ *
+ * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM,
+ * __(struct crush_bucket_uniform *)bucket->item_weight__ is set to __weight__ and the
+ * weight of the bucket is set to be the number of items in the bucket times the weight.
+ * The return value is the difference between the new bucket weight and the former
+ * bucket weight. The __item__ argument is ignored.
+ *
+ * If __bucket->alg__ is different from ::CRUSH_BUCKET_UNIFORM,
+ * set the __weight__ of __item__ in __bucket__. The former weight of the
+ * item is subtracted from the weight of the bucket and the new weight is added.
+ * The return value is the difference between the new item weight and the former
+ * item weight.
+ *
+ * @returns the difference between the new weight and the former weight
+ */
+extern int crush_bucket_adjust_item_weight(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
+/** @ingroup API
+ *
+ * Recursively update the weight of __bucket__ and its children, deep
+ * Recursively update the weight of __bucket__ and its children, depth
+ * items it contains.
+ *
+ * - return -ERANGE if the sum of the weight of the items in __bucket__ overflows.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @param map a crush_map containing __bucket__
+ * @param bucket the root of the tree to reweight
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *bucket);
+/** @ingroup API
+ *
+ * Remove __bucket__ from __map__ and deallocate it via crush_destroy_bucket().
+ * __assert(3)__ that __bucket__ is in __map__. The caller is responsible for
+ * making sure the bucket is not the child of any other bucket in the __map__.
+ *
+ * @param map a crush_map containing __bucket__
+ * @param bucket the bucket to remove from __map__
+ * @returns 0
+ */
+extern int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket);
+/** @ingroup API
+ *
+ * Remove __item__ from __bucket__ and subtract the item weight from
+ * the bucket weight. If the weight of the item is greater than the
+ * weight of the bucket, silently set the bucket weight to zero.
+ *
+ * - return -ENOMEM if the __bucket__ cannot be sized down with __realloc(3)__.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @param map __unused__
+ * @param bucket the bucket from which __item__ is removed
+ * @param item the item to remove from __bucket__
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *bucket, int item);
+
+/*
+ * Per-algorithm bucket constructors.
+ *
+ * The uniform variant takes a single __item_weight__ where the others
+ * take a __weights__ array, and the straw variant additionally takes
+ * the __map__.
+ * NOTE(review): presumably these are what crush_make_bucket() calls
+ * depending on __alg__, with the same meaning for __hash__, __type__,
+ * __size__, __items__ and __weights__ -- confirm against builder.c.
+ */
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int hash, int type, int size,
+ int *items,
+ int item_weight);
+struct crush_bucket_list*
+crush_make_list_bucket(int hash, int type, int size,
+ int *items,
+ int *weights);
+struct crush_bucket_tree*
+crush_make_tree_bucket(int hash, int type, int size,
+ int *items, /* in leaf order */
+ int *weights);
+struct crush_bucket_straw *
+crush_make_straw_bucket(struct crush_map *map,
+ int hash, int type, int size,
+ int *items,
+ int *weights);
+
+/*
+ * Overflow checks for __u32 arithmetic. Both return 1 when the
+ * operation on __a__ and __b__ is considered unsafe and 0 otherwise.
+ * crush_multiplication_is_unsafe() returns 0 whenever __a__ is zero and
+ * 1 when __a__ is not zero and __b__ is zero.
+ */
+extern int crush_addition_is_unsafe(__u32 a, __u32 b);
+extern int crush_multiplication_is_unsafe(__u32 a, __u32 b);
+
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the most ancient behavior,
+ * for backward compatibility purposes only.
+ *
+ * - choose_local_tries == 2
+ * - choose_local_fallback_tries == 5
+ * - choose_total_tries == 19
+ * - chooseleaf_descend_once == 0
+ * - chooseleaf_vary_r == 0
+ * - straw_calc_version == 0
+ * - chooseleaf_stable == 0
+ * - allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
+extern void set_legacy_crush_map(struct crush_map *map);
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the optimal behavior. These
+ * are the values set by crush_create(). It does not guarantee a
+ * stable mapping after an upgrade.
+ *
+ * For instance when a bug is fixed it may significantly change the
+ * mapping. In that case a new tunable (say tunable_new) is added so
+ * the caller can control when the bug fix is activated. The
+ * set_optimal_crush_map() function will always set all tunables,
+ * including tunable_new, to fix all bugs even if it means changing
+ * the mapping. If the caller needs fine grained control on the
+ * tunables to upgrade to a new version without changing the mapping,
+ * it needs to set the __crush_map__ tunables individually.
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
+extern void set_optimal_crush_map(struct crush_map *map);
+
+#endif
diff --git a/src/crush/crush.c b/src/crush/crush.c
new file mode 100644
index 00000000..5bf94c04
--- /dev/null
+++ b/src/crush/crush.c
@@ -0,0 +1,137 @@
+#ifdef __KERNEL__
+# include <linux/slab.h>
+# include <linux/crush/crush.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+#endif
+
+/**
+ * crush_bucket_alg_name - Get the printable name of a bucket algorithm
+ * @alg: one of the CRUSH_BUCKET_* values
+ *
+ * Returns a constant string, "unknown" for any value that is not
+ * recognized.
+ */
+const char *crush_bucket_alg_name(int alg)
+{
+	const char *name = "unknown";
+
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		name = "uniform";
+		break;
+	case CRUSH_BUCKET_LIST:
+		name = "list";
+		break;
+	case CRUSH_BUCKET_TREE:
+		name = "tree";
+		break;
+	case CRUSH_BUCKET_STRAW:
+		name = "straw";
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		name = "straw2";
+		break;
+	}
+	return name;
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ *
+ * Returns 0 when @p is not a valid index (a negative @p is caught by
+ * the unsigned comparison) or when the bucket algorithm is not
+ * recognized.
+ */
+int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
+{
+	int weight = 0;
+
+	if ((__u32)p < b->size) {
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			/* one weight shared by all the items */
+			weight = ((const struct crush_bucket_uniform *)b)->item_weight;
+			break;
+		case CRUSH_BUCKET_LIST:
+			weight = ((const struct crush_bucket_list *)b)->item_weights[p];
+			break;
+		case CRUSH_BUCKET_TREE:
+			/* item weights are stored at the matching tree node */
+			weight = ((const struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
+			break;
+		case CRUSH_BUCKET_STRAW:
+			weight = ((const struct crush_bucket_straw *)b)->item_weights[p];
+			break;
+		case CRUSH_BUCKET_STRAW2:
+			weight = ((const struct crush_bucket_straw2 *)b)->item_weights[p];
+			break;
+		}
+	}
+	return weight;
+}
+
+/*
+ * Free the items array of a uniform bucket, then the bucket itself.
+ */
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.items);
+ kfree(b);
+}
+
+/*
+ * Free the item_weights, sum_weights and items arrays of a list
+ * bucket, then the bucket itself.
+ */
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+/*
+ * Free the items and node_weights arrays of a tree bucket, then the
+ * bucket itself.
+ */
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->h.items);
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+/*
+ * Free the straws, item_weights and items arrays of a straw bucket,
+ * then the bucket itself.
+ */
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+/*
+ * Free the item_weights and items arrays of a straw2 bucket, then the
+ * bucket itself.
+ */
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+ kfree(b->item_weights);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+/**
+ * crush_destroy_bucket - Destroy a bucket of any known algorithm
+ * @b: bucket pointer
+ *
+ * Hands the bucket to the destructor matching its algorithm. Nothing
+ * is freed when the algorithm is not one of the CRUSH_BUCKET_* values
+ * handled below.
+ */
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	const int alg = b->alg;
+
+	if (alg == CRUSH_BUCKET_UNIFORM)
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+	else if (alg == CRUSH_BUCKET_LIST)
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+	else if (alg == CRUSH_BUCKET_TREE)
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+	else if (alg == CRUSH_BUCKET_STRAW)
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+	else if (alg == CRUSH_BUCKET_STRAW2)
+		crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ *
+ * Destroys every bucket and rule referenced by the map, then the
+ * arrays holding them and finally the map itself.
+ */
+void crush_destroy(struct crush_map *map)
+{
+	/* buckets: slots may be empty, only destroy the ones in use */
+	if (map->buckets) {
+		__s32 i;
+
+		for (i = 0; i < map->max_buckets; i++) {
+			if (map->buckets[i] != NULL)
+				crush_destroy_bucket(map->buckets[i]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules: every slot is handed to crush_destroy_rule() as is */
+	if (map->rules) {
+		__u32 r;
+
+		for (r = 0; r < map->max_rules; r++)
+			crush_destroy_rule(map->rules[r]);
+		kfree(map->rules);
+	}
+
+#ifndef __KERNEL__
+	/* only released in non-kernel builds */
+	kfree(map->choose_tries);
+#endif
+	kfree(map);
+}
+
+/**
+ * crush_destroy_rule - Free a crush_rule
+ * @rule: rule pointer, passed to kfree() as is
+ */
+void crush_destroy_rule(struct crush_rule *rule)
+{
+ kfree(rule);
+}
diff --git a/src/crush/crush.h b/src/crush/crush.h
new file mode 100644
index 00000000..dd08aa7b
--- /dev/null
+++ b/src/crush/crush.h
@@ -0,0 +1,549 @@
+#ifndef CEPH_CRUSH_CRUSH_H
+#define CEPH_CRUSH_CRUSH_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2.1
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
+
+#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
+
+#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
+#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
+
+#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
+/** @ingroup API
+ * The equivalent of NULL for an item, i.e. the absence of an item.
+ */
+#define CRUSH_ITEM_NONE 0x7fffffff
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices. A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+ __u32 op; /* operation to perform, see enum crush_opcodes */
+ __s32 arg1; /* first argument, its meaning depends on op */
+ __s32 arg2; /* second argument, its meaning depends on op */
+};
+
+/** @ingroup API
+ *
+ * Operations that can be stored in the __op__ field of a
+ * crush_rule_step. The value 5 is not assigned to any operation here.
+ */
+enum crush_opcodes {
+ /*! do nothing
+ */
+ CRUSH_RULE_NOOP = 0,
+ CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
+ CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+ /* arg2 = type */
+ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
+ CRUSH_RULE_EMIT = 4, /* no args */
+ /* NOTE(review): presumably arg1 = num leaves to pick, arg2 = type
+ of the buckets to explore, for both CHOOSELEAF ops -- confirm */
+ CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
+ CRUSH_RULE_CHOOSELEAF_INDEP = 7,
+
+ CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
+ CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
+ /* NOTE(review): the four steps below presumably override the
+ crush_map tunable of the same name -- confirm in the mapper */
+ CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
+ CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
+ CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
+ CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N 0
+#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+ __u8 ruleset; /* 8 bits wide, in line with CRUSH_MAX_RULESET (1<<8) */
+ __u8 type; /* NOTE(review): presumably the user defined rule type -- confirm */
+ __u8 min_size; /* NOTE(review): presumably the smallest result size served -- confirm */
+ __u8 max_size; /* NOTE(review): presumably the largest result size served -- confirm */
+};
+
+/*
+ * A rule: a mask followed by the steps. The steps are a zero-length
+ * trailing array, so the storage needed for a rule of __len__ steps is
+ * given by crush_rule_size(len), not by sizeof(struct crush_rule).
+ */
+struct crush_rule {
+ __u32 len; /* NOTE(review): presumably the number of entries in steps -- confirm */
+ struct crush_rule_mask mask;
+ struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+ (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).
+ */
+
+/** @ingroup API
+ *
+ * Items within a bucket are chosen with crush_do_rule() using one of
+ * three algorithms representing a tradeoff between performance and
+ * reorganization efficiency. If you are unsure of which bucket type
+ * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
+ *
+ * The table summarizes how the speed of each option measures up
+ * against mapping stability when items are added or removed.
+ *
+ * Bucket Alg Speed Additions Removals
+ * ------------------------------------------------
+ * uniform O(1) poor poor
+ * list O(n) optimal poor
+ * straw2 O(n) optimal optimal
+ */
+enum crush_algorithm {
+ /*!
+ * Devices are rarely added individually in a large system.
+ * Instead, new storage is typically deployed in blocks of identical
+ * devices, often as an additional shelf in a server rack or perhaps
+ * an entire cabinet. Devices reaching their end of life are often
+ * similarly decommissioned as a set (individual failures aside),
+ * making it natural to treat them as a unit. CRUSH uniform buckets
+ * are used to represent an identical set of devices in such
+ * circumstances. The key advantage in doing so is performance
+ * related: CRUSH can map replicas into uniform buckets in constant
+ * time. In cases where the uniformity restrictions are not
+ * appropriate, other bucket types can be used. If the size of a
+ * uniform bucket changes, there is a complete reshuffling of data
+ * between devices, much like conventional hash-based distribution
+ * strategies.
+ */
+ CRUSH_BUCKET_UNIFORM = 1,
+ /*!
+ * List buckets structure their contents as a linked list, and
+ * can contain items with arbitrary weights. To place a
+ * replica, CRUSH begins at the head of the list with the most
+ * recently added item and compares its weight to the sum of
+ * all remaining items' weights. Depending on the value of
+ * hash( x , r , item), either the current item is chosen with
+ * the appropriate probability, or the process continues
+ * recursively down the list. This is a natural and intuitive
+ * choice for an expanding cluster: either an object is
+ * relocated to the newest device with some appropriate
+ * probability, or it remains on the older devices as before.
+ * The result is optimal data migration when items are added
+ * to the bucket. Items removed from the middle or tail of the
+ * list, however, can result in a significant amount of
+ * unnecessary movement, making list buckets most suitable for
+ * circumstances in which they never (or very rarely) shrink.
+ */
+ CRUSH_BUCKET_LIST = 2,
+ /*! @cond INTERNAL */
+ CRUSH_BUCKET_TREE = 3,
+ CRUSH_BUCKET_STRAW = 4,
+ /*! @endcond */
+ /*!
+ * List and tree buckets are structured such that a limited
+ * number of hash values need to be calculated and compared to
+ * weights in order to select a bucket item. In doing so,
+ * they divide and conquer in a way that either gives certain
+ * items precedence (e. g., those at the beginning of a list)
+ * or obviates the need to consider entire subtrees of items
+ * at all. That improves the performance of the replica
+ * placement process, but can also introduce suboptimal
+ * reorganization behavior when the contents of a bucket
+ * change due to an addition, removal, or re-weighting of an
+ * item.
+ *
+ * The straw2 bucket type allows all items to fairly "compete"
+ * against each other for replica placement through a process
+ * analogous to a draw of straws. To place a replica, a straw
+ * of random length is drawn for each item in the bucket. The
+ * item with the longest straw wins. The length of each straw
+ * is initially a value in a fixed range. Each straw length
+ * is scaled by a factor based on the item's weight so that
+ * heavily weighted items are more likely to win the draw.
+ * Although this process is almost twice as slow (on average)
+ * than a list bucket and even slower than a tree bucket
+ * (which scales logarithmically), straw2 buckets result in
+ * optimal data movement between nested items when modified.
+ */
+ CRUSH_BUCKET_STRAW2 = 5,
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+/*
+ * although tree was a legacy algorithm, it has been buggy, so
+ * exclude it.
+ */
+#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
+ (1 << CRUSH_BUCKET_UNIFORM) | \
+ (1 << CRUSH_BUCKET_LIST) | \
+ (1 << CRUSH_BUCKET_STRAW))
+
+/** @ingroup API
+ *
+ * A bucket contains __size__ __items__ which are either positive
+ * numbers or negative numbers that reference other buckets and is
+ * uniquely identified with __id__ which is a negative number. The
+ * __weight__ of a bucket is the cumulative weight of all its
+ * children. A bucket is assigned a ::crush_algorithm that is used by
+ * crush_do_rule() to draw an item depending on its weight. A bucket
+ * can be assigned a strictly positive (> 0) __type__ defined by the
+ * caller. The __type__ can be used by crush_do_rule(), when it is
+ * given as an argument of a rule step.
+ *
+ * A pointer to crush_bucket can safely be cast into the following
+ * structure, depending on the value of __alg__:
+ *
+ * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
+ * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
+ * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
+ *
+ * The weight of each item depends on the algorithm and the
+ * information about it is available in the corresponding structure
+ * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
+ *
+ * See crush_map for more information on how __id__ is used
+ * to reference the bucket.
+ */
+struct crush_bucket {
+ __s32 id; /*!< bucket identifier, < 0 and unique within a crush_map */
+ __u16 type; /*!< > 0 bucket type, defined by the caller */
+ __u8 alg; /*!< the item selection ::crush_algorithm */
+ /*! @cond INTERNAL */
+ __u8 hash; /* which hash function to use, CRUSH_HASH_* */
+ /*! @endcond */
+ __u32 weight; /*!< 16.16 fixed point cumulated children weight */
+ __u32 size; /*!< size of the __items__ array */
+ __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights for each item in a bucket. The size of the
+ * array must be exactly the size of the straw2 bucket, just as the
+ * item_weights array.
+ *
+ */
+struct crush_weight_set {
+ __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
+ __u32 size; /*!< size of the __weights__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for a given straw2 bucket, for
+ * placement purposes.
+ *
+ * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
+ * replacement weights found at __weight_set[N]__ are used instead of
+ * the weights from __item_weights__. If __N__ is greater than or equal to
+ * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
+ * used instead. For instance if __weight_set__ is:
+ *
+ * [ [ 0x10000, 0x20000 ], // position 0
+ * [ 0x20000, 0x40000 ] ] // position 1
+ *
+ * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
+ * choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ]
+ * choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ]
+ * etc.
+ *
+ */
+struct crush_choose_arg {
+ __s32 *ids; /*!< values to use instead of items */
+ __u32 ids_size; /*!< size of the __ids__ array */
+ struct crush_weight_set *weight_set; /*!< weight replacements for a given position */
+ __u32 weight_set_positions; /*!< size of the __weight_set__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for each bucket in the crushmap. The
+ * __size__ of the __args__ array must be exactly the same as the
+ * __map->max_buckets__.
+ *
+ * The __crush_choose_arg__ at index N will be used when choosing
+ * an item from the __map->buckets[N]__ bucket, provided it
+ * is a straw2 bucket.
+ *
+ */
+struct crush_choose_arg_map {
+ struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
+ __u32 size; /*!< size of the __args__ array */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
+ */
+struct crush_bucket_uniform {
+ struct crush_bucket h; /*!< generic bucket information */
+ __u32 item_weight; /*!< 16.16 fixed point weight for each item */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_LIST.
+ *
+ * The weight of __h.items[i]__ is __item_weights[i]__ for i in
+ * [0,__h.size__[. The __sum_weights[i]__ is the sum of the __item_weights[j]__
+ * for j in [0,i[.
+ *
+ */
+struct crush_bucket_list {
+ struct crush_bucket h; /*!< generic bucket information */
+ __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
+ __u32 *sum_weights; /*!< 16.16 fixed point sum of the weights */
+};
+
+/*
+ * Bucket layout used when __h.alg__ == CRUSH_BUCKET_TREE.
+ *
+ * The weight of the item at index p is stored at
+ * node_weights[crush_calc_tree_node(p)].
+ */
+struct crush_bucket_tree {
+ struct crush_bucket h; /* note: h.size is _tree_ size, not number of
+ actual items */
+ __u8 num_nodes; /* NOTE(review): presumably the number of entries in node_weights -- confirm */
+ __u32 *node_weights; /* weights, indexed by tree node */
+};
+
+/*
+ * Bucket layout used when __h.alg__ == CRUSH_BUCKET_STRAW.
+ *
+ * The weight of the item at index i is __item_weights[i]__.
+ */
+struct crush_bucket_straw {
+ struct crush_bucket h; /* generic bucket information */
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *straws; /* 16-bit fixed point */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
+ *
+ * The weight of __h.items[i]__ is __item_weights[i]__ for i in
+ * [0,__h.size__[.
+ */
+struct crush_bucket_straw2 {
+ struct crush_bucket h; /*!< generic bucket information */
+ __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
+};
+
+
+
+/** @ingroup API
+ *
+ * A crush map defines a hierarchy of crush_bucket that ends with leaves
+ * (buckets and leaves are called items) and a set of crush_rule to
+ * map an integer to items with the crush_do_rule() function.
+ *
+ */
+struct crush_map {
+ /*! An array of crush_bucket pointers of size __max_buckets__.
+ * An element of the array may be NULL if the bucket was removed with
+ * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
+ * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
+ */
+ struct crush_bucket **buckets;
+ /*! An array of crush_rule pointers of size __max_rules__.
+ * An element of the array may be NULL if the rule was removed (there is
+ * no API to do so but there may be one in the future). The rules must be added
+ * with crush_add_rule().
+ */
+ struct crush_rule **rules;
+ __s32 max_buckets; /*!< the size of __buckets__ */
+ __u32 max_rules; /*!< the size of __rules__ */
+ /*! The value of the highest item stored in the crush_map + 1
+ */
+ __s32 max_devices;
+
+ /*! Backward compatibility tunable. It implements a bad solution
+ * and must always be set to 0 except for backward compatibility
+ * purposes
+ */
+ __u32 choose_local_tries;
+ /*! Backward compatibility tunable. It implements a bad solution
+ * and must always be set to 0 except for backward compatibility
+ * purposes
+ */
+ __u32 choose_local_fallback_tries;
+ /*! Tunable. The default value when the CHOOSE_TRIES or
+ * CHOOSELEAF_TRIES steps are omitted in a rule. See the
+ * documentation for crush_rule_set_step() for more
+ * information
+ */
+ __u32 choose_total_tries;
+ /*! Backward compatibility tunable. It should always be set
+ * to 1 except for backward compatibility. Implemented in 2012
+ * it was generalized late 2013 and is mostly unused except
+ * in one border case, reason why it must be set to 1.
+ *
+ * Attempt chooseleaf inner descent once for firstn mode; on
+ * reject retry outer descent. Note that this does *not*
+ * apply to a collision: in that case we will retry as we
+ * used to.
+ */
+ __u32 chooseleaf_descend_once;
+ /*! Backward compatibility tunable. It is a fix for bad
+ * mappings implemented in 2014 at
+ * https://github.com/ceph/ceph/pull/1185. It should always
+ * be set to 1 except for backward compatibility.
+ *
+ * If non-zero, feed r into chooseleaf, bit-shifted right by
+ * (r-1) bits. a value of 1 is best for new clusters. for
+ * legacy clusters that want to limit reshuffling, a value of
+ * 3 or 4 will make the mappings line up a bit better with
+ * previous mappings.
+ */
+ __u8 chooseleaf_vary_r;
+
+ /*! Backward compatibility tunable. It is an improvement that
+ * avoids unnecessary mapping changes, implemented at
+ * https://github.com/ceph/ceph/pull/6572 and explained in
+ * this post: "chooseleaf may cause some unnecessary pg
+ * migrations" in October 2015
+ * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
+ * It should always be set to 1 except for backward compatibility.
+ */
+ __u8 chooseleaf_stable;
+
+ /*! @cond INTERNAL */
+ /* This value is calculated after decode or construction by
+ the builder. It is exposed here (rather than having a
+ 'build CRUSH working space' function) so that callers can
+ reserve a static buffer, allocate space on the stack, or
+ otherwise avoid calling into the heap allocator if they
+ want to. The size of the working space depends on the map,
+ while the size of the scratch vector passed to the mapper
+ depends on the size of the desired result set.
+
+ Nothing stops the caller from allocating both in one swell
+ foop and passing in two points, though. */
+ size_t working_size;
+
+#ifndef __KERNEL__
+ /*! @endcond */
+ /*! Backward compatibility tunable. It is a fix for the straw
+ * scaler values for the straw algorithm which is deprecated
+ * (straw2 replaces it) implemented at
+ * https://github.com/ceph/ceph/pull/3057. It should always
+ * be set to 1 except for backward compatibility.
+ *
+ */
+ __u8 straw_calc_version;
+
+ /*! @cond INTERNAL */
+ /*
+ * allowed bucket algs is a bitmask, here the bit positions
+ * are CRUSH_BUCKET_*. note that these are *bits* and
+ * CRUSH_BUCKET_* values are not, so we need to or together (1
+ * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
+ * minimize confusion (bucket type values start at 1).
+ */
+ __u32 allowed_bucket_algs;
+
+ __u32 *choose_tries;
+#endif
+ /*! @endcond */
+};
+
+
+/* crush.c */
+/** @ingroup API
+ *
+ * Return the 16.16 fixed point weight of the item at __pos__ (zero
+ * based index) within the bucket __b__. If __pos__ is negative or
+ * greater or equal to the number of items in the bucket, return 0.
+ *
+ * @param b the bucket containing items
+ * @param pos the zero based index of the item
+ *
+ * @returns the 16.16 fixed point item weight
+ */
+extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
+/*
+ * Per-algorithm bucket destructors, one for each bucket structure.
+ * NOTE(review): presumably these are what crush_destroy_bucket()
+ * dispatches to based on the bucket's alg field -- confirm in crush.c.
+ */
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
+/** @ingroup API
+ *
+ * Deallocate a bucket created via crush_add_bucket().
+ *
+ * @param b the bucket to deallocate
+ */
+extern void crush_destroy_bucket(struct crush_bucket *b);
+/** @ingroup API
+ *
+ * Deallocate a rule created via crush_add_rule().
+ *
+ * @param r the rule to deallocate
+ */
+extern void crush_destroy_rule(struct crush_rule *r);
+/** @ingroup API
+ *
+ * Deallocate the __map__, previously allocated with crush_create().
+ *
+ * @param map the crush map
+ */
+extern void crush_destroy(struct crush_map *map);
+
+/* Map the zero-based index i to the odd number 2*(i+1)-1 (0->1, 1->3, ...). */
+static inline int crush_calc_tree_node(int i)
+{
+	const int next = i + 1;
+
+	return (next << 1) - 1;
+}
+
+/*
+ * Translate a CRUSH_BUCKET_* algorithm id into its lower-case name.
+ * Any value that is not one of the five known ids maps to "unknown".
+ */
+static inline const char *crush_alg_name(int alg)
+{
+	if (alg == CRUSH_BUCKET_UNIFORM)
+		return "uniform";
+	if (alg == CRUSH_BUCKET_LIST)
+		return "list";
+	if (alg == CRUSH_BUCKET_TREE)
+		return "tree";
+	if (alg == CRUSH_BUCKET_STRAW)
+		return "straw";
+	if (alg == CRUSH_BUCKET_STRAW2)
+		return "straw2";
+
+	return "unknown";
+}
+
+/* ---------------------------------------------------------------------
+ Private
+ --------------------------------------------------------------------- */
+
+/* These data structures are private to the CRUSH implementation. They
+ are exposed in this header file because builder needs their
+ definitions to calculate the total working size.
+
+ Moving this out of the crush map allows us to treat the CRUSH map as
+ immutable within the mapper and removes the requirement for a CRUSH
+ map lock. */
+
+/* Working state kept for a single bucket: a (possibly partial)
+ permutation of the bucket's items and the input it was built for. */
+struct crush_work_bucket {
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm; /* Permutation of the bucket's items */
+};
+
+/* Container for the per-bucket working state.
+ NOTE(review): the length and indexing of the work array are not
+ visible in this header -- confirm against the code that sets it up. */
+struct crush_work {
+ struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
+#endif
diff --git a/src/crush/crush_compat.h b/src/crush/crush_compat.h
new file mode 100644
index 00000000..08eb4eab
--- /dev/null
+++ b/src/crush/crush_compat.h
@@ -0,0 +1,39 @@
+#ifndef CEPH_CRUSH_COMPAT_H
+#define CEPH_CRUSH_COMPAT_H
+
+#include "include/int_types.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* asm-generic/bug.h */
+
+/* Userspace stand-in: trips assert() when x is true (and therefore, like
+ * any assert(), expands to nothing when NDEBUG is defined). */
+#define BUG_ON(x) assert(!(x))
+
+/* linux/kernel.h */
+
+/* Integer limits: the unsigned maximum is ~0 truncated to the type, the
+ * signed maximum is that value shifted right by one, and the signed
+ * minimum is -max - 1. */
+#define U8_MAX ((__u8)~0U)
+#define S8_MAX ((__s8)(U8_MAX>>1))
+#define S8_MIN ((__s8)(-S8_MAX - 1))
+#define U16_MAX ((__u16)~0U)
+#define S16_MAX ((__s16)(U16_MAX>>1))
+#define S16_MIN ((__s16)(-S16_MAX - 1))
+#define U32_MAX ((__u32)~0U)
+#define S32_MAX ((__s32)(U32_MAX>>1))
+#define S32_MIN ((__s32)(-S32_MAX - 1))
+#define U64_MAX ((__u64)~0ULL)
+#define S64_MAX ((__s64)(U64_MAX>>1))
+#define S64_MIN ((__s64)(-S64_MAX - 1))
+
+/* linux/math64.h */
+
+/* Plain C division; the macro does not guard against a zero divisor. */
+#define div64_s64(dividend, divisor) ((dividend) / (divisor))
+
+/* linux/slab.h */
+
+/* kmalloc() maps straight to malloc(); the flags argument is discarded. */
+#define kmalloc(size, flags) malloc(size)
+/* kfree() skips NULL; note that the argument x is evaluated twice. */
+#define kfree(x) do { if (x) free(x); } while (0)
+
+#endif /* CEPH_CRUSH_COMPAT_H */
diff --git a/src/crush/crush_ln_table.h b/src/crush/crush_ln_table.h
new file mode 100644
index 00000000..aae534c9
--- /dev/null
+++ b/src/crush/crush_ln_table.h
@@ -0,0 +1,164 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/*
+ * __RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * __RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ * for k in [0, 128] (hence the 128*2+2 entries).
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
+ 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+ 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+ 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+ 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+ 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+ 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+ 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+ 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+ 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+ 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+ 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+ 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+ 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+ 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+ 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+ 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+ 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+ 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+ 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+ 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+ 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+ 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+ 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+ 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+ 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+ 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+ 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+ 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+ 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+ 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+ 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+ 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+ 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+ 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+ 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+ 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+ 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+ 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+ 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+ 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+ 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+ 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+ 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+ 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+ 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+ 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+ 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+ 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+ 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+ 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+ 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+ 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+ 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+ 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+ 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+ 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+ 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+ 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+ 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+ 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+ 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+ 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+ 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+ 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+ 0x0000800000000000ll, 0x0000ffff00000000ll,
+};
+
+/*
+ * __LL_tbl[k] = 2^48*log2(1.0+k/2^15), for k in [0, 255]
+ */
+static __s64 __LL_tbl[256] = {
+ 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+ 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+ 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+ 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+ 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+ 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+ 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+ 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+ 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+ 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+ 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+ 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+ 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+ 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+ 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+ 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+ 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+ 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+ 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+ 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+ 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+ 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+ 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+ 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+ 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+ 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+ 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+ 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+ 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+ 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+ 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+ 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+ 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+ 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+ 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+ 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+ 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+ 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+ 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+ 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+ 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+ 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+ 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+ 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+ 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+ 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+ 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+ 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+ 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+ 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+ 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+ 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+ 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+ 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+ 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+ 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+ 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+ 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+ 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+ 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+ 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+ 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+ 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+ 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+#endif
diff --git a/src/crush/grammar.h b/src/crush/grammar.h
new file mode 100644
index 00000000..42a6068b
--- /dev/null
+++ b/src/crush/grammar.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2008 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CRUSH_GRAMMAR_H
+#define CEPH_CRUSH_GRAMMAR_H
+
+//#define BOOST_SPIRIT_DEBUG
+
+#ifdef USE_BOOST_SPIRIT_OLD_HDR
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/tree/ast.hpp>
+#include <boost/spirit/tree/tree_to_xml.hpp>
+#else
+#define BOOST_SPIRIT_USE_OLD_NAMESPACE
+#include <boost/spirit/include/classic_core.hpp>
+#include <boost/spirit/include/classic_ast.hpp>
+#include <boost/spirit/include/classic_tree_to_xml.hpp>
+#endif
+using namespace boost::spirit;
+
+/*
+ * Boost.Spirit (classic) grammar for the textual crush map.
+ *
+ * The anonymous enum lists the ids that are attached to the rules of
+ * definition<> through parser_tag<>; each rule below carries the tag of
+ * the same name (rule `integer` is tagged _int).
+ */
+struct crush_grammar : public grammar<crush_grammar>
+{
+ enum {
+ _int = 1,
+ _posint,
+ _negint,
+ _name,
+ _device,
+ _bucket_type,
+ _bucket_id,
+ _bucket_alg,
+ _bucket_hash,
+ _bucket_item,
+ _bucket,
+ _step_take,
+ _step_set_chooseleaf_tries,
+ _step_set_chooseleaf_vary_r,
+ _step_set_chooseleaf_stable,
+ _step_set_choose_tries,
+ _step_set_choose_local_tries,
+ _step_set_choose_local_fallback_tries,
+ _step_choose,
+ _step_chooseleaf,
+ _step_emit,
+ _step,
+ _crushrule,
+ _weight_set_weights,
+ _weight_set,
+ _choose_arg_ids,
+ _choose_arg,
+ _choose_args,
+ _crushmap,
+ _tunable,
+ };
+
+ // Rule set instantiated by Spirit for a given scanner type.
+ template <typename ScannerT>
+ struct definition
+ {
+ rule<ScannerT, parser_context<>, parser_tag<_int> > integer;
+ rule<ScannerT, parser_context<>, parser_tag<_posint> > posint;
+ rule<ScannerT, parser_context<>, parser_tag<_negint> > negint;
+ rule<ScannerT, parser_context<>, parser_tag<_name> > name;
+
+ rule<ScannerT, parser_context<>, parser_tag<_tunable> > tunable;
+
+ rule<ScannerT, parser_context<>, parser_tag<_device> > device;
+
+ rule<ScannerT, parser_context<>, parser_tag<_bucket_type> > bucket_type;
+
+ rule<ScannerT, parser_context<>, parser_tag<_bucket_id> > bucket_id;
+ rule<ScannerT, parser_context<>, parser_tag<_bucket_alg> > bucket_alg;
+ rule<ScannerT, parser_context<>, parser_tag<_bucket_hash> > bucket_hash;
+ rule<ScannerT, parser_context<>, parser_tag<_bucket_item> > bucket_item;
+ rule<ScannerT, parser_context<>, parser_tag<_bucket> > bucket;
+
+ rule<ScannerT, parser_context<>, parser_tag<_step_take> > step_take;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_choose_tries> > step_set_choose_tries;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_choose_local_tries> > step_set_choose_local_tries;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_choose_local_fallback_tries> > step_set_choose_local_fallback_tries;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_chooseleaf_tries> > step_set_chooseleaf_tries;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_chooseleaf_vary_r> > step_set_chooseleaf_vary_r;
+ rule<ScannerT, parser_context<>, parser_tag<_step_set_chooseleaf_stable> > step_set_chooseleaf_stable;
+ rule<ScannerT, parser_context<>, parser_tag<_step_choose> > step_choose;
+ rule<ScannerT, parser_context<>, parser_tag<_step_chooseleaf> > step_chooseleaf;
+ rule<ScannerT, parser_context<>, parser_tag<_step_emit> > step_emit;
+ rule<ScannerT, parser_context<>, parser_tag<_step> > step;
+ rule<ScannerT, parser_context<>, parser_tag<_crushrule> > crushrule;
+ rule<ScannerT, parser_context<>, parser_tag<_weight_set_weights> > weight_set_weights;
+ rule<ScannerT, parser_context<>, parser_tag<_weight_set> > weight_set;
+ rule<ScannerT, parser_context<>, parser_tag<_choose_arg_ids> > choose_arg_ids;
+ rule<ScannerT, parser_context<>, parser_tag<_choose_arg> > choose_arg;
+ rule<ScannerT, parser_context<>, parser_tag<_choose_args> > choose_args;
+
+ rule<ScannerT, parser_context<>, parser_tag<_crushmap> > crushmap;
+
+ // Wires up every rule; the grammar object itself is not consulted.
+ definition(crush_grammar const& /*self*/)
+ {
+ // base types
+ // integer: optional '-' followed by digits; posint: digits only;
+ // negint: mandatory '-' followed by digits.
+ integer = leaf_node_d[ lexeme_d[
+ (!ch_p('-') >> +digit_p)
+ ] ];
+ posint = leaf_node_d[ lexeme_d[ +digit_p ] ];
+ negint = leaf_node_d[ lexeme_d[ ch_p('-') >> +digit_p ] ];
+ // name: one or more alphanumerics, '-', '_' or '.'
+ name = leaf_node_d[ lexeme_d[ +( alnum_p || ch_p('-') || ch_p('_') || ch_p('.')) ] ];
+
+ // tunables
+ tunable = str_p("tunable") >> name >> posint;
+
+ // devices
+ // "device <id> <name>" with an optional trailing "class <name>"
+ device = str_p("device") >> posint >> name >> !( str_p("class") >> name );
+
+ // bucket types
+ bucket_type = str_p("type") >> posint >> name;
+
+ // buckets
+ bucket_id = str_p("id") >> negint >> !( str_p("class") >> name );
+ bucket_alg = str_p("alg") >> name;
+ bucket_hash = str_p("hash") >> ( integer |
+ str_p("rjenkins1") );
+ bucket_item = str_p("item") >> name
+ >> !( str_p("weight") >> real_p )
+ >> !( str_p("pos") >> posint );
+ // "<type name> <bucket name> { ids, alg, hashes, items }"
+ bucket = name >> name >> '{' >> *bucket_id >> bucket_alg >> *bucket_hash >> *bucket_item >> '}';
+
+ // rules
+ step_take = str_p("take") >> name >> !( str_p("class") >> name );
+ step_set_choose_tries = str_p("set_choose_tries") >> posint;
+ step_set_choose_local_tries = str_p("set_choose_local_tries") >> posint;
+ step_set_choose_local_fallback_tries = str_p("set_choose_local_fallback_tries") >> posint;
+ step_set_chooseleaf_tries = str_p("set_chooseleaf_tries") >> posint;
+ step_set_chooseleaf_vary_r = str_p("set_chooseleaf_vary_r") >> posint;
+ step_set_chooseleaf_stable = str_p("set_chooseleaf_stable") >> posint;
+ step_choose = str_p("choose")
+ >> ( str_p("indep") | str_p("firstn") )
+ >> integer
+ >> str_p("type") >> name;
+ step_chooseleaf = str_p("chooseleaf")
+ >> ( str_p("indep") | str_p("firstn") )
+ >> integer
+ >> str_p("type") >> name;
+ step_emit = str_p("emit");
+ step = str_p("step") >> ( step_take |
+ step_set_choose_tries |
+ step_set_choose_local_tries |
+ step_set_choose_local_fallback_tries |
+ step_set_chooseleaf_tries |
+ step_set_chooseleaf_vary_r |
+ step_set_chooseleaf_stable |
+ step_choose |
+ step_chooseleaf |
+ step_emit );
+ // The rule name is optional; "id" and "ruleset" are both accepted
+ // as the keyword in front of the rule number.
+ crushrule = str_p("rule") >> !name >> '{'
+ >> (str_p("id") | str_p("ruleset")) >> posint
+ >> str_p("type") >> ( str_p("replicated") | str_p("erasure") )
+ >> str_p("min_size") >> posint
+ >> str_p("max_size") >> posint
+ >> +step
+ >> '}';
+
+ // choose_args: weight_set is a bracketed list of bracketed real lists
+ weight_set_weights = str_p("[") >> *real_p >> str_p("]");
+ weight_set = str_p("weight_set") >> str_p("[")
+ >> *weight_set_weights
+ >> str_p("]");
+ choose_arg_ids = str_p("ids") >> str_p("[") >> *integer >> str_p("]");
+ choose_arg = str_p("{") >> str_p("bucket_id") >> negint
+ >> !weight_set
+ >> !choose_arg_ids
+ >> str_p("}");
+ choose_args = str_p("choose_args") >> posint >> str_p("{") >> *choose_arg >> str_p("}");
+
+ // the whole crush map
+ // sections must appear in this order: tunables/devices/types,
+ // then buckets/rules, then choose_args
+ crushmap = *(tunable | device | bucket_type) >> *(bucket | crushrule) >> *choose_args;
+ }
+
+ // Entry rule of the grammar.
+ rule<ScannerT, parser_context<>, parser_tag<_crushmap> > const&
+ start() const { return crushmap; }
+ };
+};
+
+#endif
diff --git a/src/crush/hash.c b/src/crush/hash.c
new file mode 100644
index 00000000..ed123af4
--- /dev/null
+++ b/src/crush/hash.c
@@ -0,0 +1,151 @@
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+/*
+ * Hash one 32-bit value: start from crush_hash_seed ^ a and run two
+ * crush_hashmix rounds with the fixed constants x and y.
+ * crush_hashmix updates all three of its operands, so the order of the
+ * calls (and of the operands) determines the result; do not reorder.
+ */
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+/*
+ * Hash two 32-bit values: start from crush_hash_seed ^ a ^ b and run
+ * three crush_hashmix rounds. crush_hashmix updates its operands in
+ * place, so the call order determines the result; do not reorder.
+ */
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+/*
+ * Hash three 32-bit values: start from crush_hash_seed ^ a ^ b ^ c and
+ * run five crush_hashmix rounds. crush_hashmix updates its operands in
+ * place, so the call order determines the result; do not reorder.
+ */
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+/*
+ * Hash four 32-bit values: start from crush_hash_seed ^ a ^ b ^ c ^ d
+ * and run six crush_hashmix rounds. crush_hashmix updates its operands
+ * in place, so the call order determines the result; do not reorder.
+ */
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+/*
+ * Hash five 32-bit values: start from crush_hash_seed ^ a ^ b ^ c ^ d ^ e
+ * and run eight crush_hashmix rounds. crush_hashmix updates its operands
+ * in place, so the call order determines the result; do not reorder.
+ */
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+/*
+ * Hash one value with the hash function selected by type.
+ * An unrecognized type yields 0.
+ */
+__u32 crush_hash32(int type, __u32 a)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1(a);
+
+	return 0;
+}
+
+/*
+ * Hash two values with the hash function selected by type.
+ * An unrecognized type yields 0.
+ */
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_2(a, b);
+
+	return 0;
+}
+
+/*
+ * Hash three values with the hash function selected by type.
+ * An unrecognized type yields 0.
+ */
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_3(a, b, c);
+
+	return 0;
+}
+
+/*
+ * Hash four values with the hash function selected by type.
+ * An unrecognized type yields 0.
+ */
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+
+	return 0;
+}
+
+/*
+ * Hash five values with the hash function selected by type.
+ * An unrecognized type yields 0.
+ */
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+
+	return 0;
+}
+
+/*
+ * Return the printable name of a hash type: "rjenkins1" for
+ * CRUSH_HASH_RJENKINS1 and "unknown" for anything else.
+ */
+const char *crush_hash_name(int type)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return "rjenkins1";
+
+	return "unknown";
+}
diff --git a/src/crush/hash.h b/src/crush/hash.h
new file mode 100644
index 00000000..d1d90258
--- /dev/null
+++ b/src/crush/hash.h
@@ -0,0 +1,23 @@
+#ifndef CEPH_CRUSH_HASH_H
+#define CEPH_CRUSH_HASH_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/* Identifier of the Robert Jenkins based hash implemented in hash.c. */
+#define CRUSH_HASH_RJENKINS1 0
+
+/* Hash type used by default. */
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+/* Returns "rjenkins1" for CRUSH_HASH_RJENKINS1 and "unknown" otherwise. */
+extern const char *crush_hash_name(int type);
+
+/* crush_hash32[_N](): hash one to five 32-bit inputs with the hash
+ * selected by type; an unrecognized type yields 0. */
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e);
+
+#endif
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
new file mode 100644
index 00000000..73f92a77
--- /dev/null
+++ b/src/crush/mapper.c
@@ -0,0 +1,1105 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
+#endif
+#include "crush_ln_table.h"
+#include "mapper.h"
+
+#define dprintk(args...) /* printf(args) */
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - look up the rule matching a ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ *
+ * Returns the index of the first populated rule slot whose mask carries
+ * the requested ruleset and type and whose [min_size, max_size] range
+ * contains @size, or -1 when no rule qualifies.
+ */
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
+{
+	__u32 slot;
+
+	for (slot = 0; slot < map->max_rules; slot++) {
+		const struct crush_rule *candidate = map->rules[slot];
+
+		/* empty slot */
+		if (!candidate)
+			continue;
+		if (candidate->mask.ruleset != ruleset)
+			continue;
+		if (candidate->mask.type != type)
+			continue;
+		/* requested size outside the rule's range */
+		if (candidate->mask.min_size > size)
+			continue;
+		if (candidate->mask.max_size < size)
+			continue;
+
+		return slot;
+	}
+
+	return -1;
+}
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ *
+ * @bucket: bucket whose items[] are permuted
+ * @work: per-bucket scratch state. perm_x remembers the @x the current
+ * permutation was built for, perm_n is the number of leading
+ * perm[] entries already finalized, and perm_n == 0xffff marks
+ * the r=0 shortcut where only perm[0] is valid.
+ * @x: crush input value
+ * @r: replica position; reduced modulo bucket->size before use
+ *
+ * Returns the entry of bucket->items[] found at the permuted index.
+ * bucket->size is used as a divisor, so the bucket must not be empty.
+ */
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+ struct crush_work_bucket *work,
+ int x, int r)
+{
+ unsigned int pr = r % bucket->size;
+ unsigned int i, s;
+
+ /* start a new permutation if @x has changed */
+ if (work->perm_x != (__u32)x || work->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ work->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ /* only the first slot is computed; the rest is built lazily */
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ work->perm[0] = s;
+ work->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ /* start from the identity permutation */
+ for (i = 0; i < bucket->size; i++)
+ work->perm[i] = i;
+ work->perm_n = 0;
+ } else if (work->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ /* rebuild the identity with slot 0 swapped with the chosen index */
+ for (i = 1; i < bucket->size; i++)
+ work->perm[i] = i;
+ work->perm[work->perm[0]] = 0;
+ work->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < work->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+ while (work->perm_n <= pr) {
+ unsigned int p = work->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ /* pick one of the not yet finalized slots [p, size) */
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned int t = work->perm[p + i];
+ work->perm[p + i] = work->perm[p];
+ work->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ work->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
+
+ s = work->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/*
+ * uniform
+ *
+ * Uniform buckets delegate to the permutation-based chooser, using the
+ * generic bucket header embedded in the uniform bucket.
+ */
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+ struct crush_work_bucket *work, int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, work, x, r);
+}
+
+/*
+ * list
+ *
+ * Walk the items from the last one down to the first. For each item a
+ * 16-bit hash value is scaled by the running sum of weights; the item
+ * is taken when the scaled value falls below its own weight. If no
+ * item is taken the first item is returned.
+ */
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int idx = bucket->h.size - 1;
+
+	while (idx >= 0) {
+		__u64 roll = crush_hash32_4(bucket->h.hash, x,
+					    bucket->h.items[idx],
+					    r, bucket->h.id) & 0xffff;
+		__u64 scaled;
+
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			idx, x, r, bucket->h.items[idx],
+			bucket->item_weights[idx],
+			bucket->sum_weights[idx], roll);
+
+		/* scale the 16-bit roll into [0, sum_weights[idx]) */
+		scaled = (roll * bucket->sum_weights[idx]) >> 16;
+		if (scaled < bucket->item_weights[idx])
+			return bucket->h.items[idx];
+
+		idx--;
+	}
+
+	dprintk("bad list sums for bucket %d\n", bucket->h.id);
+	return bucket->h.items[0];
+}
+
+
+/* (binary) tree */
+
+/*
+ * Height of node @n: the number of trailing zero bits in its label.
+ * @n must be non-zero, otherwise the loop never terminates.
+ */
+static int height(int n)
+{
+	int levels;
+
+	for (levels = 0; !(n & 1); n >>= 1)
+		levels++;
+
+	return levels;
+}
+
+/* Label of the left child of node @x: half of the node's span below it. */
+static int left(int x)
+{
+	int half_span = 1 << (height(x) - 1);
+
+	return x - half_span;
+}
+
+/* Label of the right child of node @x: half of the node's span above it. */
+static int right(int x)
+{
+	int half_span = 1 << (height(x) - 1);
+
+	return x + half_span;
+}
+
+/* Non-zero when node @x is a leaf, i.e. its label is odd. */
+static int terminal(int x)
+{
+	return (x & 1) ? 1 : 0;
+}
+
+/*
+ * Descend the weighted binary tree from the root to a leaf. At every
+ * inner node a hash value is scaled into [0, node weight) and compared
+ * with the weight of the left subtree to pick a side. The leaf label
+ * shifted right by one indexes the bucket's items.
+ */
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	/* start at root */
+	int node = bucket->num_nodes >> 1;
+
+	while (!terminal(node)) {
+		/* pick point in [0, w) */
+		__u32 subtree_weight = bucket->node_weights[node];
+		__u64 point = (__u64)crush_hash32_4(bucket->h.hash, x, node, r,
+						    bucket->h.id) *
+			      (__u64)subtree_weight;
+		int left_child = left(node);
+
+		point >>= 32;
+
+		/* descend to the left or right? */
+		node = (point < bucket->node_weights[left_child]) ?
+			left_child : right(node);
+	}
+
+	return bucket->h.items[node >> 1];
+}
+
+
+/*
+ * straw
+ *
+ * Every item draws a 16-bit hash value scaled by its straw length; the
+ * item with the largest draw wins, the earliest item winning ties.
+ */
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	__u32 idx;
+	int winner = 0;
+	__u64 winning_draw = 0;
+
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		__u64 draw = crush_hash32_3(bucket->h.hash, x,
+					    bucket->h.items[idx], r) & 0xffff;
+
+		draw *= bucket->straws[idx];
+
+		/* the first item always seeds the maximum */
+		if (idx != 0 && draw <= winning_draw)
+			continue;
+
+		winner = idx;
+		winning_draw = draw;
+	}
+
+	return bucket->h.items[winner];
+}
+
+/*
+ * compute 2^44*log2(input+1)
+ *
+ * Fixed-point, table-driven logarithm. The input is incremented,
+ * normalized so that bit 15 or bit 16 is set, and then split into a
+ * coarse part looked up in __RH_LH_tbl and a fine part looked up in
+ * __LL_tbl. The result is the exponent in the bits above bit 44 plus
+ * the sum of the two table logarithms, rescaled to the same fixed point.
+ *
+ * NOTE(review): the only caller in this file masks its argument to
+ * 16 bits; larger inputs are presumably outside the table range - confirm.
+ */
+static __u64 crush_ln(unsigned int xin)
+{
+ unsigned int x = xin;
+ int iexpon, index1, index2;
+ __u64 RH, LH, LL, xl64, result;
+
+ /* work on input+1 so that an input of 0 is well defined */
+ x++;
+
+ /* normalize input */
+ iexpon = 15;
+
+ // figure out number of bits we need to shift and
+ // do it in one step instead of iteratively
+ if (!(x & 0x18000)) {
+ /* leading zeros above bit 15 of the 32-bit value */
+ int bits = __builtin_clz(x & 0x1FFFF) - 16;
+ x <<= bits;
+ iexpon = 15 - bits;
+ }
+
+ /* the table interleaves RH (even index) and LH (odd index) entries */
+ index1 = (x >> 8) << 1;
+ /* RH ~ 2^56/index1 */
+ RH = __RH_LH_tbl[index1 - 256];
+ /* LH ~ 2^48 * log2(index1/256) */
+ LH = __RH_LH_tbl[index1 + 1 - 256];
+
+ /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+ xl64 = (__s64)x * RH;
+ xl64 >>= 48;
+
+ /* integer part of the logarithm, placed above the 44 fraction bits */
+ result = iexpon;
+ result <<= (12 + 32);
+
+ index2 = xl64 & 0xff;
+ /* LL ~ 2^48*log2(1.0+index2/2^15) */
+ LL = __LL_tbl[index2];
+
+ LH = LH + LL;
+
+ /* rescale the fraction from 2^48 to 2^44 */
+ LH >>= (48 - 12 - 32);
+ result += LH;
+
+ return result;
+}
+
+
+/*
+ * straw2
+ *
+ * Suppose we have two osds, osd.0 and osd.1, with weights 8 and 4 respectively. This means:
+ * a). For osd.0, the time interval between io requests follows an exponential distribution
+ * with lambda equal to 8
+ * b). For osd.1, the time interval between io requests follows an exponential distribution
+ * with lambda equal to 4
+ * c). If we draw one sample from each osd's exponential random variable and pick the osd
+ * whose request arrives first (the minimum draw), then the total pgs on each osd
+ * is proportional to its weight.
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ */
+
+/*
+ * Weights to use for @bucket at output @position: the bucket's own
+ * item_weights unless @arg supplies a weight set, in which case the
+ * set for @position is used (positions past the end use the last set).
+ */
+static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+					    const struct crush_choose_arg *arg,
+					    int position)
+{
+	if (arg != NULL && arg->weight_set != NULL) {
+		int slot = position;
+
+		if (slot >= arg->weight_set_positions)
+			slot = arg->weight_set_positions - 1;
+		return arg->weight_set[slot].weights;
+	}
+
+	return bucket->item_weights;
+}
+
+/*
+ * Ids to feed into the hash for @bucket: the replacement ids carried
+ * by @arg when present, otherwise the bucket's own items.
+ */
+static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+					const struct crush_choose_arg *arg)
+{
+	if (arg != NULL && arg->ids != NULL)
+		return arg->ids;
+
+	return bucket->h.items;
+}
+
+/*
+ * Draw an exponential random variable with the inversion method: hash
+ * the inputs to a uniform 16-bit value, take its logarithm and divide
+ * by the weight.
+ *
+ * for reference, see the exponential distribution example at:
+ * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples
+ */
+static inline __s64 generate_exponential_distribution(int type, int x, int y, int z,
+						       int weight)
+{
+	/* keep the low 16 bits of the hash as the uniform sample */
+	unsigned int sample = crush_hash32_3(type, x, y, z) & 0xffff;
+	__s64 scaled_ln;
+
+	/*
+	 * for some reason slightly less than 0x10000 produces
+	 * a slightly more accurate distribution... probably a
+	 * rounding effect.
+	 *
+	 * the natural log lookup table maps [0,0xffff]
+	 * (corresponding to real numbers [1/0x10000, 1]) to
+	 * [0, 0xffffffffffff] (corresponding to real numbers
+	 * [-11.090355,0]).
+	 */
+	scaled_ln = crush_ln(sample) - 0x1000000000000ll;
+
+	/*
+	 * divide by 16.16 fixed-point weight. note
+	 * that the ln value is negative, so a larger
+	 * weight means a larger (less negative) value
+	 * for draw.
+	 */
+	return div64_s64(scaled_ln, weight);
+}
+
+/*
+ * straw2 selection: every item with a non-zero weight draws an
+ * exponential random variable, zero-weight items draw S64_MIN, and the
+ * item with the largest draw wins (the earliest item winning ties).
+ * Weights and hashed ids may be overridden through @arg; the returned
+ * value always comes from the bucket's own items.
+ */
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
+				int x, int r, const struct crush_choose_arg *arg,
+				int position)
+{
+	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
+	__s32 *ids = get_choose_arg_ids(bucket, arg);
+	unsigned int idx, winner = 0;
+	__s64 winning_draw = 0;
+
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		__s64 draw = S64_MIN;
+
+		dprintk("weight 0x%x item %d\n", weights[idx], ids[idx]);
+		if (weights[idx])
+			draw = generate_exponential_distribution(bucket->h.hash, x,
+								 ids[idx], r,
+								 weights[idx]);
+
+		/* the first item always seeds the maximum */
+		if (idx == 0 || draw > winning_draw) {
+			winner = idx;
+			winning_draw = draw;
+		}
+	}
+
+	return bucket->h.items[winner];
+}
+
+
+/*
+ * Dispatch to the choose method matching the bucket's algorithm. The
+ * bucket must not be empty. An unrecognized algorithm falls back to
+ * the bucket's first item.
+ */
+static int crush_bucket_choose(const struct crush_bucket *in,
+			       struct crush_work_bucket *work,
+			       int x, int r,
+			       const struct crush_choose_arg *arg,
+			       int position)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	BUG_ON(in->size == 0);
+
+	if (in->alg == CRUSH_BUCKET_UNIFORM)
+		return bucket_uniform_choose(
+			(const struct crush_bucket_uniform *)in,
+			work, x, r);
+
+	if (in->alg == CRUSH_BUCKET_LIST)
+		return bucket_list_choose(
+			(const struct crush_bucket_list *)in,
+			x, r);
+
+	if (in->alg == CRUSH_BUCKET_TREE)
+		return bucket_tree_choose(
+			(const struct crush_bucket_tree *)in,
+			x, r);
+
+	if (in->alg == CRUSH_BUCKET_STRAW)
+		return bucket_straw_choose(
+			(const struct crush_bucket_straw *)in,
+			x, r);
+
+	if (in->alg == CRUSH_BUCKET_STRAW2)
+		return bucket_straw2_choose(
+			(const struct crush_bucket_straw2 *)in,
+			x, r, arg, position);
+
+	dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
+	return in->items[0];
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ *
+ * Devices without an entry in @weight are out, a weight of 0x10000 or
+ * more is always in, a weight of 0 is always out, and anything in
+ * between is in when a 16-bit hash of (@x, @item) falls below the weight.
+ */
+static int is_out(const struct crush_map *map,
+		  const __u32 *weight, int weight_max,
+		  int item, int x)
+{
+	__u32 item_weight;
+
+	if (item >= weight_max)
+		return 1;
+
+	item_weight = weight[item];
+	if (item_weight >= 0x10000)
+		return 0;
+	if (item_weight == 0)
+		return 1;
+
+	return ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+		< item_weight) ? 0 : 1;
+}
+
+/**
+ * crush_choose_firstn - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @work: working space holding the per-bucket scratch state, indexed by -1-bucket id
+ * @bucket: the bucket we are choosing an item from
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @out_size: size of the out vector
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls (shifted right by vary_r-1; 0 passes r=0)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
+ * @choose_args: per-bucket choose arguments indexed by -1-bucket id, or NULL
+ *
+ * Returns the output position after the last item placed in @out.
+ */
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int out_size,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
+ int recurse_to_leaf,
+ unsigned int vary_r,
+ unsigned int stable,
+ int *out2,
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
+{
+ int rep;
+ unsigned int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ const struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ int count = out_size;
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \
+recurse_tries %d local_retries %d local_fallback_retries %d \
+parent_r %d stable %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r, stable);
+
+ for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep + parent_r;
+ /* r' = r + f_total */
+ r += ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (local_fallback_retries > 0 &&
+ flocal >= (in->size>>1) &&
+ flocal > local_fallback_retries)
+ item = bucket_perm_choose(
+ in, work->work[-1-in->id],
+ x, r);
+ else
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r,
+ (choose_args ? &choose_args[-1-in->id] : 0),
+ outpos);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ skip_rep = 1;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ skip_rep = 1;
+ break;
+ }
+ in = map->buckets[-1-item];
+ retry_bucket = 1;
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ reject = 0;
+ if (!collide && recurse_to_leaf) {
+ if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
+ if (crush_choose_firstn(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, stable ? 1 : outpos+1, 0,
+ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ stable,
+ NULL,
+ sub_r,
+ choose_args) <= outpos)
+ /* didn't get leaf */
+ reject = 1;
+ } else {
+ /* we already have a leaf! */
+ out2[outpos] = item;
+ }
+ }
+
+ if (!reject && !collide) {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ weight_max,
+ item, x);
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal <= local_retries)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < tries)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %u flocal %u\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("CHOOSE got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ count--;
+#ifndef __KERNEL__
+ /* userspace only: histogram of how many failures each choice took */
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+ }
+
+ dprintk("CHOOSE returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ * @map: the crush_map
+ * @work: working space holding the per-bucket scratch state, indexed by -1-bucket id
+ * @bucket: the bucket we are choosing items from
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @x: crush input value
+ * @left: number of output slots to fill, starting at @outpos
+ * @numrep: replica count; used as the stride by which r advances on each retry pass
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: first position in @out (and @out2) written by this call
+ * @tries: maximum number of passes over the still unfilled slots
+ * @recurse_tries: @tries value handed to the recursive chooseleaf call
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
+ * @choose_args: per-bucket choose arguments indexed by -1-bucket id, or NULL
+ *
+ * Every slot in [outpos, outpos+left) is written: slots that could not
+ * be filled end up as CRUSH_ITEM_NONE.
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_work *work,
+ const struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
+{
+ const struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ /* each pass retries every slot that is still undefined */
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+ if (out2 && ftotal) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(
+ in, work->work[-1-in->id],
+ x, r,
+ (choose_args ? &choose_args[-1-in->id] : 0),
+ outpos);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ /* slot stays undefined and is retried on the next pass */
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(
+ map,
+ work,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r, choose_args);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out? */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ /* anything still undefined after the last pass becomes NONE */
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+#ifndef __KERNEL__
+ /* userspace only: histogram of how many passes were needed */
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+ if (out2) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
+}
+
+
+/*
+ * Lay out a caller-supplied chunk of memory as the working area for a
+ * CRUSH placement computation.
+ *
+ * Newly allocated memory must go through this function before it is
+ * handed to crush_do_rule. After that it may be reused for any number
+ * of calls as long as the map has not changed. If the map /has/
+ * changed, make sure the allocation still covers the map's working
+ * size and run crush_init_workspace again.
+ *
+ * A workspace that is kept between calls should be thread-local; the
+ * design deliberately got rid of locking, so please do not bring it back.
+ */
+void crush_init_workspace(const struct crush_map *m, void *v)
+{
+	/*
+	 * Walk a cursor through the buffer: each pointer is set to the
+	 * cursor first, and the cursor is then advanced past the space
+	 * that pointer refers to.
+	 */
+	struct crush_work *w = (struct crush_work *)v;
+	char *cursor = (char *)v + sizeof(struct crush_work);
+	__s32 idx;
+
+	/* table of per-bucket work pointers */
+	w->work = (struct crush_work_bucket **)cursor;
+	cursor += m->max_buckets * sizeof(struct crush_work_bucket *);
+
+	for (idx = 0; idx < m->max_buckets; ++idx) {
+		struct crush_work_bucket *slot;
+
+		if (m->buckets[idx] == 0)
+			continue;
+
+		slot = (struct crush_work_bucket *)cursor;
+		w->work[idx] = slot;
+		switch (m->buckets[idx]->alg) {
+		default:
+			cursor += sizeof(struct crush_work_bucket);
+			break;
+		}
+
+		/* the permutation array follows the work bucket itself */
+		slot->perm_x = 0;
+		slot->perm_n = 0;
+		slot->perm = (__u32 *)cursor;
+		cursor += m->buckets[idx]->size * sizeof(__u32);
+	}
+
+	BUG_ON((char *)cursor - (char *)w != m->working_size);
+}
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @cwin: pointer to at least crush_work_size(map, result_max) bytes of
+ * memory initialized with crush_init_workspace(). It must not be
+ * NULL: three scratch vectors of @result_max ints each are carved
+ * out right behind the first map->working_size bytes.
+ * @choose_args: per-bucket choose arguments indexed by -1-bucket id, or NULL
+ *
+ * Returns the number of entries stored in @result. 0 is returned when
+ * @ruleno is out of range or refers to an empty rule slot.
+ */
+int crush_do_rule(const struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ const __u32 *weight, int weight_max,
+ void *cwin, const struct crush_choose_arg *choose_args)
+{
+ int result_len;
+ struct crush_work *cw = cwin;
+ /* scratch vectors live directly behind the per-bucket working state */
+ int *a = (int *)((char *)cw + map->working_size);
+ int *b = a + result_max;
+ int *c = b + result_max;
+ int *w = a;
+ int *o = b;
+ int recurse_to_leaf;
+ int wsize = 0;
+ int osize;
+ int *tmp;
+ const struct crush_rule *rule;
+ __u32 step;
+ int i, j;
+ int numrep;
+ int out_size;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
+ int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+ int vary_r = map->chooseleaf_vary_r;
+ int stable = map->chooseleaf_stable;
+
+ if ((__u32)ruleno >= map->max_rules) {
+ dprintk(" bad ruleno %d\n", ruleno);
+ return 0;
+ }
+
+ rule = map->rules[ruleno];
+ /* rule slots may be empty (see crush_find_rule); never dereference one */
+ if (!rule) {
+ dprintk(" no rule %d\n", ruleno);
+ return 0;
+ }
+ result_len = 0;
+
+ for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
+ const struct crush_rule_step *curstep = &rule->steps[step];
+
+ switch (curstep->op) {
+ case CRUSH_RULE_TAKE:
+ /* accept a valid device id or the id of an existing bucket */
+ if ((curstep->arg1 >= 0 &&
+ curstep->arg1 < map->max_devices) ||
+ (-1-curstep->arg1 >= 0 &&
+ -1-curstep->arg1 < map->max_buckets &&
+ map->buckets[-1-curstep->arg1])) {
+ w[0] = curstep->arg1;
+ wsize = 1;
+ } else {
+ dprintk(" bad take value %d\n", curstep->arg1);
+ }
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+ if (curstep->arg1 >= 0)
+ stable = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ /* fall through */
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ if (wsize == 0)
+ break;
+
+ recurse_to_leaf =
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ int bno;
+ numrep = curstep->arg1;
+ /* a non-positive count is relative to result_max */
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ /* make sure bucket id is valid */
+ bno = -1 - w[i];
+ if (bno < 0 || bno >= map->max_buckets) {
+ // w[i] is probably CRUSH_ITEM_NONE
+ dprintk(" bad w[i] %d\n", w[i]);
+ continue;
+ }
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ cw,
+ map->buckets[bno],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ result_max-osize,
+ choose_tries,
+ recurse_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
+ recurse_to_leaf,
+ vary_r,
+ stable,
+ c+osize,
+ 0,
+ choose_args);
+ } else {
+ out_size = ((numrep < (result_max-osize)) ?
+ numrep : (result_max-osize));
+ crush_choose_indep(
+ map,
+ cw,
+ map->buckets[bno],
+ weight, weight_max,
+ x, out_size, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0,
+ choose_args);
+ osize += out_size;
+ }
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap o and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ dprintk(" unknown op %d at step %d\n",
+ curstep->op, step);
+ break;
+ }
+ }
+
+ return result_len;
+}
diff --git a/src/crush/mapper.h b/src/crush/mapper.h
new file mode 100644
index 00000000..e76be767
--- /dev/null
+++ b/src/crush/mapper.h
@@ -0,0 +1,93 @@
+#ifndef CEPH_CRUSH_MAPPER_H
+#define CEPH_CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2.1
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
+/** @ingroup API
+ *
+ * Map __x__ to __result_max__ items and store them in the __result__
+ * array. The mapping is done by following each step of the rule
+ * __ruleno__. See crush_make_rule(), crush_rule_set_step() and
+ * crush_add_rule() for more information on how the rules are created,
+ * populated and added to the crush __map__.
+ *
+ * The return value is the number of items in the __result__
+ * array. If the caller asked for __result_max__ items and the return
+ * value is X where X < __result_max__, the content of __result[0,X[__
+ * is defined but the content of __result[X,result_max[__ is
+ * undefined. For example:
+ *
+ * crush_do_rule(map, ruleno=1, x=1, result, result_max=3,...) == 1
+ * result[0] is set
+ * result[1] is undefined
+ * result[2] is undefined
+ *
+ * An entry in the __result__ array is either an item in the crush
+ * __map__ or ::CRUSH_ITEM_NONE if no item was found. For example:
+ *
+ * crush_do_rule(map, ruleno=1, x=1, result, result_max=4,...) == 2
+ * result[0] is CRUSH_ITEM_NONE
+ * result[1] is item number 5
+ * result[2] is undefined
+ * result[3] is undefined
+ *
+ * The __weight__ array contains the probabilities that a leaf is
+ * ignored even if it is selected. It is a 16.16 fixed point
+ * number in the range [0x00000,0x10000]. The lower the value, the
+ * more often the leaf is ignored. For instance:
+ *
+ * - weight[leaf] == 0x00000 == 0.0 always ignore
+ * - weight[leaf] == 0x10000 == 1.0 never ignore
+ * - weight[leaf] == 0x08000 == 0.5 ignore 50% of the time
+ * - weight[leaf] == 0x04000 == 0.25 ignore 75% of the time
+ * - etc.
+ *
+ * During mapping, each leaf is checked against the __weight__ array,
+ * using the leaf as an index. If there is no entry in __weight__ for
+ * the leaf, it is ignored. If there is an entry, the leaf will be
+ * ignored some of the time, depending on the probability.
+ *
+ * The __cwin__ argument must be set as follows:
+ *
+ * char __cwin__[crush_work_size(__map__, __result_max__)];
+ * crush_init_workspace(__map__, __cwin__);
+ *
+ * @param map the crush_map
+ * @param ruleno a positive integer < __CRUSH_MAX_RULES__
+ * @param x the value to map to __result_max__ items
+ * @param result an array of items of size __result_max__
+ * @param result_max the size of the __result__ array
+ * @param weights an array of weights of size __weight_max__
+ * @param weight_max the size of the __weights__ array
+ * @param cwin must be a char array initialized by crush_init_workspace
+ * @param choose_args weights and ids for each known bucket
+ *
+ * @return 0 on error or the size of __result__ on success
+ */
+extern int crush_do_rule(const struct crush_map *map,
+ int ruleno,
+ int x, int *result, int result_max,
+ const __u32 *weights, int weight_max,
+ void *cwin, const struct crush_choose_arg *choose_args);
+
+/*
+ * Number of bytes of workspace needed for a given crush_map and
+ * result_max: the map's working_size plus room for three vectors of
+ * result_max 32-bit entries. The caller allocates that much wherever it
+ * sees fit - on the stack, in a long-lived per-thread buffer, etc.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+				     int result_max)
+{
+	size_t scratch_bytes = result_max * 3 * sizeof(__u32);
+
+	return map->working_size + scratch_bytes;
+}
+
+extern void crush_init_workspace(const struct crush_map *m, void *v);
+
+#endif
diff --git a/src/crush/old_sample.txt b/src/crush/old_sample.txt
new file mode 100644
index 00000000..54cf06a7
--- /dev/null
+++ b/src/crush/old_sample.txt
@@ -0,0 +1,82 @@
+
+# first define our types
+<types>
+ <type osd>
+ type_id = 0
+ </type>
+ <type cab>
+ type_id = 2
+ </type>
+ <type row>
+ type_id = 3
+ </type>
+ <type pool>
+ type_id = 10
+ </type>
+</types>
+
+# hierarchy
+<devices>
+ <osd osd001>
+ id 1
+ weight 500
+ </osd>
+ <osd osd002>
+ id 2
+ weight 500
+ </osd>
+ <osd osd003>
+ id 3
+ weight 500
+ </osd>
+ <osd osd004>
+ id 4
+ weight 500
+ </osd>
+ <osd osd005>
+ id 5
+ weight 500
+ </osd>
+</devices>
+
+<buckets>
+ <cab cab-d2>
+ alg straw
+ id -12
+ <item osd001/>
+ <item osd002/>
+ <item osd003/>
+ <item osd004>
+ weight 600
+ </item>
+ </cab>
+
+# <pool newlayout>
+# <item satapool>
+# weight 1.0
+# </item>
+# <item fcpool>
+# weight 3.0
+# </item>
+# </pool>
+</buckets>
+
+<devices>
+ <osd osd006>
+ id 5
+ weight 500
+ </osd>
+</devices>
+
+# rules
+<rules>
+ <rule normal>
+ pool 0
+ type replicated
+ min_size 1
+ max_size 4
+ step take root
+ step choose_indep 0 osd
+ step emit
+ </rule>
+</rules>
diff --git a/src/crush/sample.txt b/src/crush/sample.txt
new file mode 100644
index 00000000..f7e0ac39
--- /dev/null
+++ b/src/crush/sample.txt
@@ -0,0 +1,47 @@
+
+# devices
+device 1 osd001
+device 2 osd002
+device 3 osd003 down # same as offload 1.0
+device 4 osd004 offload 0 # 0.0 -> normal, 1.0 -> failed
+device 5 osd005 offload 0.1
+device 6 osd006 offload 0.1
+
+# hierarchy
+type 0 osd # 'device' is actually the default for 0
+type 2 cab
+type 3 row
+type 10 pool
+
+cab root {
+ id -1 # optional
+ alg tree # required
+ item osd001
+ item osd002 weight 600 pos 1
+ item osd003 weight 600 pos 0
+ item osd004 weight 600 pos 3
+ item osd005 weight 600 pos 4
+}
+
+# rules
+rule normal {
+ # these are required.
+ pool 0
+ type replicated
+ min_size 1
+ max_size 4
+ # need 1 or more of these.
+ step take root
+ step choose firstn 0 type osd
+ step emit
+}
+
+rule {
+ pool 1
+ type erasure
+ min_size 3
+ max_size 6
+ step take root
+ step choose indep 0 type osd
+ step emit
+}
diff --git a/src/crush/types.h b/src/crush/types.h
new file mode 100644
index 00000000..919eed25
--- /dev/null
+++ b/src/crush/types.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_CRUSH_TYPES_H
+#define CEPH_CRUSH_TYPES_H
+
+#ifdef KERNEL
+# define free(x) kfree(x)
+#else
+# include <stdlib.h>
+#endif
+
+
+#include <linux/types.h> /* just for int types */
+
+#ifndef BUG_ON
+# define BUG_ON(x) ceph_assert(!(x))
+#endif
+
+#endif