25 files changed, 13491 insertions, 0 deletions
diff --git a/src/crush/CMakeLists.txt b/src/crush/CMakeLists.txt
new file mode 100644
index 000000000..1c875d594
--- /dev/null
+++ b/src/crush/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(crush_srcs  # C and C++ sources compiled into the crush object library below
+  builder.c
+  mapper.c
+  crush.c
+  hash.c
+  CrushWrapper.cc
+  CrushCompiler.cc
+  CrushTester.cc
+  CrushLocation.cc)
+
+add_library(crush_objs OBJECT ${crush_srcs})  # OBJECT library: sources are compiled, but no archive or shared library is produced
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
new file mode 100644
index 000000000..eafda63af
--- /dev/null
+++ b/src/crush/CrushCompiler.cc
@@ -0,0 +1,1286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CrushCompiler.h"
+
+#if defined(_AIX)
+#define EBADE ECORRUPT
+#endif
+
+#ifndef EBADE
+#define EBADE EFTYPE
+#endif
+#include <string>
+#include "common/errno.h"
+#include <boost/algorithm/string.hpp>
+
+using std::cout;
+using std::istream;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+using std::vector;
+
+// -------------
+
+// Write the name of type `t`: its registered name, else "device" for 0, else "type<t>".
+static void print_type_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *label = crush.get_type_name(t))
+    out << label;
+  else if (t != 0)
+    out << "type" << t;
+  else
+    out << "device";
+}
+
+// Write the name of item `t`: its registered name, else "device<t>" or "bucket<-1-t>".
+static void print_item_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *label = crush.get_item_name(t))
+    out << label;
+  else if (t < 0)
+    out << "bucket" << (-1-t);
+  else
+    out << "device" << t;
+}
+
+// Emit one "id <cid> class <name>" line for every class entry recorded for bucket `t`.
+static void print_bucket_class_ids(ostream& out, int t, CrushWrapper &crush)
+{
+  auto found = crush.class_bucket.find(t);
+  if (found == crush.class_bucket.end())
+    return;  // nothing recorded for this bucket
+  for (auto &entry : found->second) {
+    const int cid = entry.second;
+    const char *cls = crush.get_class_name(entry.first);
+    ceph_assert(cls);
+    out << "\tid " << cid << " class " << cls << "\t\t# do not change unnecessarily\n";
+  }
+}
+
+// Append " class <name>" when item `t` has a class; otherwise write nothing.
+static void print_item_class(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *cls = crush.get_item_class(t))
+    out << " class " << cls;
+}
+
+// Append " class <name>" for class id `t`, or a "# unexpected class" note when it has no name.
+static void print_class(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *cls = crush.get_class_name(t))
+    out << " class " << cls;
+  else
+    out << " # unexpected class " << t;
+}
+
+// Write the name of rule `t`, falling back to "rule<t>" when the rule has no name.
+static void print_rule_name(ostream& out, int t, CrushWrapper &crush)
+{
+  if (const char *label = crush.get_rule_name(t))
+    out << label;
+  else
+    out << "rule" << t;
+}
+
+static void print_fixedpoint(ostream& out, int i)  // write fixed-point `i` (scaled by 0x10000) with three decimals
+{
+  char s[20];
+  snprintf(s, sizeof(s), "%.3f", static_cast<double>(i) / 0x10000);  // double, not float: a 24-bit mantissa rounds large values
+  out << s;
+}
+
+// Write the definition of bucket `i` to `out` in crush-map text form:
+//   <type> <name> { id, per-class ids, weight (as a comment), alg, hash, items }
+// A bucket whose stored name is not a valid crush name produces no output.
+// Always returns 0.
+int CrushCompiler::decompile_bucket_impl(int i, ostream &out)
+{
+  const char *label = crush.get_item_name(i);
+  if (label && !crush.is_valid_crush_name(label))
+    return 0;
+
+  // Opening line "<type> <name> {" followed by the bucket id(s).
+  print_type_name(out, crush.get_bucket_type(i), crush);
+  out << " ";
+  print_item_name(out, i, crush);
+  out << " {\n";
+  out << "\tid " << i << "\t\t# do not change unnecessarily\n";
+  print_bucket_class_ids(out, i, crush);
+
+  // The bucket's total weight is written as a '#' comment line.
+  out << "\t# weight ";
+  print_fixedpoint(out, crush.get_bucket_weight(i));
+  out << "\n";
+
+  const int count = crush.get_bucket_size(i);
+  const int alg = crush.get_bucket_alg(i);
+  out << "\talg " << crush_bucket_alg_name(alg);
+
+  // Add an alg-specific hint; uniform and tree buckets also list item positions.
+  const bool show_pos = (alg == CRUSH_BUCKET_UNIFORM || alg == CRUSH_BUCKET_TREE);
+  if (alg == CRUSH_BUCKET_UNIFORM)
+    out << "\t# do not change bucket size (" << count << ") unnecessarily";
+  else if (alg == CRUSH_BUCKET_LIST)
+    out << "\t# add new items at the end; do not change order unnecessarily";
+  else if (alg == CRUSH_BUCKET_TREE)
+    out << "\t# do not change pos for existing items unnecessarily";
+  out << "\n";
+
+  // Numeric hash id, annotated with its symbolic name.
+  const int hash = crush.get_bucket_hash(i);
+  out << "\thash " << hash << "\t# " << crush_hash_name(hash) << "\n";
+
+  // One "item <name> weight <w> [pos <n>]" line per bucket member.
+  for (int slot = 0; slot < count; ++slot) {
+    const int member = crush.get_bucket_item(i, slot);
+    const int weight = crush.get_bucket_item_weight(i, slot);
+    out << "\titem ";
+    print_item_name(out, member, crush);
+    out << " weight ";
+    print_fixedpoint(out, weight);
+    if (show_pos)
+      out << " pos " << slot;
+    out << "\n";
+  }
+
+  out << "}\n";
+  return 0;
+}
+
+/* Basically, we just descend recursively into all of the buckets,
+ * executing a depth-first traversal of the graph. Since the buckets form a
+ * directed acyclic graph, this should work just fine. The graph isn't
+ * necessarily a tree, so we have to keep track of what buckets we already
+ * outputted. We don't want to output anything twice. We also keep track of
+ * what buckets are in progress so that we can detect cycles. These can
+ * arise through user error.
+ */
+int CrushCompiler::decompile_bucket(int cur,  // writes bucket `cur` after every bucket it contains; returns 0 or a negative error
+				    std::map<int, dcb_state_t>& dcb_states,  // per-bucket progress markers shared by the whole recursion
+				    ostream &out)
+{
+  if ((cur == 0) || (!crush.bucket_exists(cur)))  // id 0 and ids that are not existing buckets produce no output
+    return 0;
+
+  std::map<int, dcb_state_t>::iterator c = dcb_states.find(cur);
+  if (c == dcb_states.end()) {
+    // Mark this bucket as "in progress."
+    std::map<int, dcb_state_t>::value_type val(cur, DCB_STATE_IN_PROGRESS);
+    std::pair <std::map<int, dcb_state_t>::iterator, bool> rval
+      (dcb_states.insert(val));
+    ceph_assert(rval.second);  // find() just missed, so the insert must have created the entry
+    c = rval.first;  // kept so the state can be flipped to DONE at the end
+  }
+  else if (c->second == DCB_STATE_DONE) {
+    // We already did this bucket.
+    return 0;
+  }
+  else if (c->second == DCB_STATE_IN_PROGRESS) {
+    err << "decompile_crush_bucket: logic error: tried to decompile "
+	"a bucket that is already being decompiled" << std::endl;
+    return -EBADE;
+  }
+  else {
+    err << "decompile_crush_bucket: logic error: illegal bucket state! "
+	 << c->second << std::endl;
+    return -EBADE;
+  }
+
+  int bsize = crush.get_bucket_size(cur);
+  for (int i = 0; i < bsize; ++i) {  // handle the members first so they are written before `cur`
+    int item = crush.get_bucket_item(cur, i);
+    std::map<int, dcb_state_t>::iterator d = dcb_states.find(item);
+    if (d == dcb_states.end()) {  // not visited yet: recurse (ids that are not buckets return 0 right away)
+      int ret = decompile_bucket(item, dcb_states, out);
+      if (ret)
+	return ret;
+    }
+    else if (d->second == DCB_STATE_IN_PROGRESS) {  // the member is one of our own ancestors: a cycle
+      err << "decompile_crush_bucket: error: while trying to output bucket "
+	   << cur << ", we found out that it contains one of the buckets that "
+	   << "contain it. This is not allowed. The buckets must form a "
+	   <<  "directed acyclic graph." << std::endl;
+      return -EINVAL;
+    }
+    else if (d->second != DCB_STATE_DONE) {  // DONE members need no work; any other value is invalid
+      err << "decompile_crush_bucket: logic error: illegal bucket state "
+	   << d->second << std::endl;
+      return -EBADE;
+    }
+  }
+  decompile_bucket_impl(cur, out);  // result is not checked here
+  c->second = DCB_STATE_DONE;
+  return 0;
+}
+
+// Emit one weight-set position as "      [ w0 w1 ... ]\n"; always returns 0.
+int CrushCompiler::decompile_weight_set_weights(crush_weight_set weight_set, ostream &out)
+{
+  out << "      [ ";
+  for (__u32 idx = 0; idx != weight_set.size; ++idx) {
+    print_fixedpoint(out, weight_set.weights[idx]);
+    out << " ";
+  }
+  out << "]\n";
+  return 0;
+}
+
+// Emit a "weight_set [ ... ]" section holding `size` positions.
+// Returns 0, or the first negative value reported while emitting a position.
+int CrushCompiler::decompile_weight_set(crush_weight_set *weight_set, __u32 size, ostream &out)
+{
+  out << "    weight_set [\n";
+  for (__u32 pos = 0; pos != size; ++pos) {
+    const int rc = decompile_weight_set_weights(weight_set[pos], out);
+    if (rc < 0)
+      return rc;
+  }
+  out << "    ]\n";
+  return 0;
+}
+
+// Emit the "    ids [ ... ]" line listing the `size` entries of `ids`.
+// Always returns 0.
+int CrushCompiler::decompile_ids(__s32 *ids, __u32 size, ostream &out)
+{
+  out << "    ids [ ";
+  for (__u32 pos = 0; pos != size; ++pos)
+    out << ids[pos] << " ";
+  out << "]\n";
+  return 0;
+}
+
+// Emit one "{ bucket_id <n> ... }" entry of a choose_args section.
+// The weight_set and ids parts are written only when they are non-empty.
+// Returns 0, or the first negative value reported by a helper.
+int CrushCompiler::decompile_choose_arg(crush_choose_arg *arg, int bucket_id, ostream &out)
+{
+  out << "  {\n";
+  out << "    bucket_id " << bucket_id << "\n";
+  if (arg->weight_set_positions > 0) {
+    const int rc = decompile_weight_set(arg->weight_set, arg->weight_set_positions, out);
+    if (rc < 0)
+      return rc;
+  }
+  if (arg->ids_size > 0) {
+    const int rc = decompile_ids(arg->ids, arg->ids_size, out);
+    if (rc < 0)
+      return rc;
+  }
+  out << "  }\n";
+  return 0;
+}
+
+int CrushCompiler::decompile_choose_arg_map(crush_choose_arg_map arg_map,
+                                            ostream &out)
+{
+  for (__u32 i = 0; i < arg_map.size; i++) {
+    if ((arg_map.args[i].ids_size == 0) &&
+        (arg_map.args[i].weight_set_positions == 0))
+      continue;
+    int r = decompile_choose_arg(&arg_map.args[i], -1-i, out);
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+// Emit one "choose_args <index> { ... }" section; returns 0, or the negative value from emitting the map.
+int CrushCompiler::decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
+                                         ostream &out)
+{
+  out << "choose_args " << i.first << " {\n";
+  const int rc = decompile_choose_arg_map(i.second, out);
+  if (rc >= 0)
+    out << "}\n";  // the closing brace is only written on success
+  return rc < 0 ? rc : 0;
+}
+
+// Write the whole crush map to `out` as text: tunables, devices, types, buckets, rules, then choose_args.
+int CrushCompiler::decompile(ostream &out)  // returns 0, or the first error reported by a helper
+{
+  out << "# begin crush map\n";
+
+  // only dump tunables if they differ from the defaults
+  if (crush.get_choose_local_tries() != 2)
+    out << "tunable choose_local_tries " << crush.get_choose_local_tries() << "\n";
+  if (crush.get_choose_local_fallback_tries() != 5)
+    out << "tunable choose_local_fallback_tries " << crush.get_choose_local_fallback_tries() << "\n";
+  if (crush.get_choose_total_tries() != 19)
+    out << "tunable choose_total_tries " << crush.get_choose_total_tries() << "\n";
+  if (crush.get_chooseleaf_descend_once() != 0)
+    out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n";
+  if (crush.get_chooseleaf_vary_r() != 0)
+    out << "tunable chooseleaf_vary_r " << crush.get_chooseleaf_vary_r() << "\n";
+  if (crush.get_chooseleaf_stable() != 0)
+    out << "tunable chooseleaf_stable " << crush.get_chooseleaf_stable() << "\n";
+  if (crush.get_straw_calc_version() != 0)
+    out << "tunable straw_calc_version " << crush.get_straw_calc_version() << "\n";
+  if (crush.get_allowed_bucket_algs() != CRUSH_LEGACY_ALLOWED_BUCKET_ALGS)
+    out << "tunable allowed_bucket_algs " << crush.get_allowed_bucket_algs() << "\n";
+
+  out << "\n# devices\n";
+  for (int i=0; i<crush.get_max_devices(); i++) {
+    const char *name = crush.get_item_name(i);
+    if (name) {  // device ids without a name are not listed
+      out << "device " << i << " " << name;
+      print_item_class(out, i, crush);
+      out << "\n";
+    }
+  }
+  
+  out << "\n# types\n";
+  int n = crush.get_num_type_names();  // number of named types still to be printed
+  for (int i=0; n; i++) {  // type ids may have gaps, so count printed names instead of bounding i
+    const char *name = crush.get_type_name(i);
+    if (!name) {
+      if (i == 0) out << "type 0 osd\n";  // an unnamed type 0 is written as "osd"
+      continue;
+    }
+    n--;
+    out << "type " << i << " " << name << "\n";
+  }
+
+  out << "\n# buckets\n";
+  std::map<int, dcb_state_t> dcb_states;  // progress markers shared by all decompile_bucket() calls
+  for (int bucket = -1; bucket > -1-crush.get_max_buckets(); --bucket) {  // ids -1 down to -max_buckets
+    int ret = decompile_bucket(bucket, dcb_states, out);
+    if (ret)
+      return ret;
+  }
+
+  out << "\n# rules\n";
+  for (int i=0; i<crush.get_max_rules(); i++) {
+    if (!crush.rule_exists(i))
+      continue;
+    out << "rule ";
+    if (crush.get_rule_name(i))  // unnamed rules are written without a name
+      print_rule_name(out, i, crush);
+    out << " {\n";
+    out << "\tid " << i << "\n";
+    if (i != crush.get_rule_mask_ruleset(i)) {
+      out << "\t# WARNING: ruleset " << crush.get_rule_mask_ruleset(i) << " != id " << i << "; this will not recompile to the same map\n";
+    }
+
+    switch (crush.get_rule_mask_type(i)) {
+    case CEPH_PG_TYPE_REPLICATED:
+      out << "\ttype replicated\n";
+      break;
+    case CEPH_PG_TYPE_ERASURE:
+      out << "\ttype erasure\n";
+      break;
+    default:
+      out << "\ttype " << crush.get_rule_mask_type(i) << "\n";  // other types are written numerically
+    }
+
+    out << "\tmin_size " << crush.get_rule_mask_min_size(i) << "\n";
+    out << "\tmax_size " << crush.get_rule_mask_max_size(i) << "\n";
+
+    for (int j=0; j<crush.get_rule_len(i); j++) {  // one "step ..." line per rule step; ops not listed below print nothing
+      switch (crush.get_rule_op(i, j)) {
+      case CRUSH_RULE_NOOP:
+	out << "\tstep noop\n";
+	break;
+      case CRUSH_RULE_TAKE:
+	out << "\tstep take ";
+	{
+          int step_item = crush.get_rule_arg1(i, j);
+          int original_item;
+          int c;
+          int res = crush.split_id_class(step_item, &original_item, &c);
+          if (res < 0)
+            return res;
+	  if (c >= 0)  // a class was split off: print the original item followed by " class <name>"
+            step_item = original_item;
+          print_item_name(out, step_item, crush);
+	  if (c >= 0)
+	    print_class(out, c, crush);
+	}
+	out << "\n";
+	break;
+      case CRUSH_RULE_EMIT:
+	out << "\tstep emit\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSE_TRIES:
+	out << "\tstep set_choose_tries " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+	out << "\tstep set_choose_local_tries " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+	out << "\tstep set_choose_local_fallback_tries " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+	out << "\tstep set_chooseleaf_tries " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+	out << "\tstep set_chooseleaf_vary_r " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+	out << "\tstep set_chooseleaf_stable " << crush.get_rule_arg1(i, j)
+	    << "\n";
+	break;
+      case CRUSH_RULE_CHOOSE_FIRSTN:
+	out << "\tstep choose firstn "
+	    << crush.get_rule_arg1(i, j) 
+	    << " type ";
+	print_type_name(out, crush.get_rule_arg2(i, j), crush);
+	out << "\n";
+	break;
+      case CRUSH_RULE_CHOOSE_INDEP:
+	out << "\tstep choose indep "
+	    << crush.get_rule_arg1(i, j) 
+	    << " type ";
+	print_type_name(out, crush.get_rule_arg2(i, j), crush);
+	out << "\n";
+	break;
+      case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+	out << "\tstep chooseleaf firstn "
+	    << crush.get_rule_arg1(i, j) 
+	    << " type ";
+	print_type_name(out, crush.get_rule_arg2(i, j), crush);
+	out << "\n";
+	break;
+      case CRUSH_RULE_CHOOSELEAF_INDEP:
+	out << "\tstep chooseleaf indep "
+	    << crush.get_rule_arg1(i, j) 
+	    << " type ";
+	print_type_name(out, crush.get_rule_arg2(i, j), crush);
+	out << "\n";
+	break;
+      }
+    }
+    out << "}\n";
+  }
+  if (crush.choose_args.size() > 0) {
+    out << "\n# choose_args\n";
+    for (const auto &i : crush.choose_args) {  // bind by reference: no copy of each map element
+      int ret = decompile_choose_args(i, out);
+      if (ret)
+        return ret;
+    }
+  }
+  out << "\n# end crush map" << std::endl;
+  return 0;
+}
+
+
+// ================================================================
+
+string CrushCompiler::string_node(node_t &node)  // the node's matched text with surrounding whitespace trimmed
+{
+  return boost::trim_copy(string(std::begin(node.value), std::end(node.value)));
+}
+
+// Parse the node's trimmed text as a base-10 integer (strtol result converted to int).
+int CrushCompiler::int_node(node_t &node)
+{
+  return strtol(string_node(node).c_str(), 0, 10);
+}
+
+// Parse the node's trimmed text as a float via strtof; conversion errors are not reported.
+float CrushCompiler::float_node(node_t &node)
+{
+  return strtof(string_node(node).c_str(), 0);
+}
+
+// Handle a "device <id> <name> [class <class>]" declaration; returns 0, or -1 if the name is already defined.
+int CrushCompiler::parse_device(iter_t const& i)
+{
+  int id = int_node(i->children[1]);
+  string name = string_node(i->children[2]);
+  if (item_id.count(name)) {  // reject duplicates before the crush map is modified
+    err << "item " << name << " defined twice" << std::endl;
+    return -1;
+  }
+  crush.set_item_name(id, name.c_str());
+  item_id[name] = id;
+  id_item[id] = name;
+
+  if (verbose) err << "device " << id << " '" << name << "'";
+
+  if (i->children.size() > 3) {  // optional trailing "class <name>"
+    string c = string_node(i->children[4]);
+    crush.set_item_class(id, c);
+    if (verbose) err << " class" << " '" << c << "'" << std::endl;
+  } else {
+    if (verbose) err << std::endl;  // finish the verbose line started above
+  }
+  return 0;
+}
+
+// Apply one "tunable <name> <value>" statement to the crush map.
+// Returns 0 on success, or -1 when the tunable name is not recognized.
+int CrushCompiler::parse_tunable(iter_t const& i)
+{
+  const string key = string_node(i->children[1]);
+  const int value = int_node(i->children[2]);
+
+  if (key == "choose_local_tries") {
+    crush.set_choose_local_tries(value);
+  } else if (key == "choose_local_fallback_tries") {
+    crush.set_choose_local_fallback_tries(value);
+  } else if (key == "choose_total_tries") {
+    crush.set_choose_total_tries(value);
+  } else if (key == "chooseleaf_descend_once") {
+    crush.set_chooseleaf_descend_once(value);
+  } else if (key == "chooseleaf_vary_r") {
+    crush.set_chooseleaf_vary_r(value);
+  } else if (key == "chooseleaf_stable") {
+    crush.set_chooseleaf_stable(value);
+  } else if (key == "straw_calc_version") {
+    crush.set_straw_calc_version(value);
+  } else if (key == "allowed_bucket_algs") {
+    crush.set_allowed_bucket_algs(value);
+  } else {
+    err << "tunable " << key << " not recognized" << std::endl;
+    return -1;
+  }
+
+  /*
+    current crop of tunables are all now "safe".  re-enable this when we
+    add new ones that are ... new.
+  if (!unsafe_tunables) {
+    err << "tunables are NOT FULLY IMPLEMENTED; enable with --enable-unsafe-tunables to enable this feature" << std::endl;
+    return -1;
+  }
+  */
+
+  if (verbose) err << "tunable " << key << " " << value << std::endl;
+  return 0;
+}
+
+int CrushCompiler::parse_bucket_type(iter_t const& i)  // handles "type <id> <name>"; always returns 0
+{
+  const int tid = int_node(i->children[1]);
+  const string tname = string_node(i->children[2]);
+  if (verbose) err << "type " << tid << " '" << tname << "'" << std::endl;
+  type_id[tname] = tid;
+  crush.set_type_name(tid, tname.c_str());
+  return 0;
+}
+
+// Parse one bucket definition ("<type> <name> { id ... alg ... hash ... item ... }") and add it to the crush map.
+// Pass 1 collects id/alg/hash and the explicitly requested item positions; pass 2 places the items and sums weights.
+// Returns 0 on success, or a negative value (-1, -EINVAL, -ERANGE, or the add_bucket()/set_item_name() result).
+int CrushCompiler::parse_bucket(iter_t const& i)
+{
+  string tname = string_node(i->children[0]);
+  if (!type_id.count(tname)) {
+    err << "bucket type '" << tname << "' is not defined" << std::endl;
+    return -1;
+  }
+  int type = type_id[tname];
+
+  string name = string_node(i->children[1]);
+  if (item_id.count(name)) {
+    err << "bucket or device '" << name << "' is already defined" << std::endl;
+    return -1;
+  }
+
+  int id = 0;  // none, yet!
+  int alg = -1;
+  int hash = 0;
+  set<int> used_items;  // positions claimed with an explicit "pos"
+  int size = 0;
+  map<int32_t, int32_t> class_id;  // class id -> bucket id given on an "id <n> class <name>" line
+  
+  for (unsigned p=3; p<i->children.size()-1; p++) {  // body lines sit between child 3 and the last child
+    iter_t sub = i->children.begin() + p;
+    string tag = string_node(sub->children[0]);
+    if (tag == "id") {
+      int maybe_id = int_node(sub->children[1]);
+      if (verbose) err << "bucket " << name << " id " << maybe_id;
+      if (sub->children.size() > 2) {
+        string class_name = string_node(sub->children[3]);
+        // note that we do not verify class existence here,
+        // as this bucket might come from an empty shadow tree
+        // which currently has no OSDs but is still referenced by a rule!
+        int cid = crush.get_or_create_class_id(class_name);
+        if (class_id.count(cid) != 0) {
+          err << "duplicate device class " << class_name << " for bucket " << name << std::endl;
+          return -ERANGE;
+        }
+        class_id[cid] = maybe_id;
+        if (verbose) err << " class" << " '" << class_name << "'" << std::endl;
+      } else {
+        id = maybe_id;
+        if (verbose) err << std::endl;
+      }
+    } else if (tag == "alg") {
+      string a = string_node(sub->children[1]);
+      if (a == "uniform")
+	alg = CRUSH_BUCKET_UNIFORM;
+      else if (a == "list")
+	alg = CRUSH_BUCKET_LIST;
+      else if (a == "tree")
+	alg = CRUSH_BUCKET_TREE;
+      else if (a == "straw")
+	alg = CRUSH_BUCKET_STRAW;
+      else if (a == "straw2")
+	alg = CRUSH_BUCKET_STRAW2;
+      else {
+	err << "unknown bucket alg '" << a << "'" << std::endl << std::endl;
+	return -EINVAL;
+      }
+    }
+    else if (tag == "hash") {
+      string a = string_node(sub->children[1]);
+      if (a == "rjenkins1")
+	hash = CRUSH_HASH_RJENKINS1;
+      else
+	hash = atoi(a.c_str());  // anything else is read as a numeric hash id
+    }
+    else if (tag == "item") {
+      // first, just determine which item pos's are already used
+      size++;
+      for (unsigned q = 2; q < sub->children.size(); q++) {  // options come as keyword/value pairs
+	string key = string_node(sub->children[q++]);
+	if (key == "pos") {
+	  int pos = int_node(sub->children[q]);
+	  if (used_items.count(pos)) {
+	    err << "item '" << string_node(sub->children[1]) << "' in bucket '" << name << "' has explicit pos " << pos << ", which is occupied" << std::endl;
+	    return -1;
+	  }
+	  used_items.insert(pos);
+	}
+      }
+    }
+    else ceph_abort();
+  }
+
+  // now do the items.
+  if (!used_items.empty())
+    size = std::max(size, *used_items.rbegin());
+  vector<int> items(size);
+  vector<int> weights(size);
+
+  int curpos = 0;  // next candidate position for items without an explicit "pos"
+  unsigned bucketweight = 0;
+  bool have_uniform_weight = false;
+  unsigned uniform_weight = 0;
+  for (unsigned p=3; p<i->children.size()-1; p++) {
+    iter_t sub = i->children.begin() + p;
+    string tag = string_node(sub->children[0]);
+    if (tag == "item") {
+
+      string iname = string_node(sub->children[1]);
+      if (!item_id.count(iname)) {
+	err << "item '" << iname << "' in bucket '" << name << "' is not defined" << std::endl;
+	return -1;
+      }
+      int itemid = item_id[iname];
+
+      unsigned weight = 0x10000;  // default weight is 1.0 in fixed point
+      if (item_weight.count(itemid))
+	weight = item_weight[itemid];
+
+      int pos = -1;
+      for (unsigned q = 2; q < sub->children.size(); q++) {
+	string key = string_node(sub->children[q++]);
+	if (key == "weight") {
+	  weight = float_node(sub->children[q]) * (float)0x10000;
+	  if (weight > CRUSH_MAX_DEVICE_WEIGHT && itemid >= 0) {
+	    err << "device weight limited to " << CRUSH_MAX_DEVICE_WEIGHT / 0x10000 << std::endl;
+	    return -ERANGE;
+	  }
+	  else if (weight > CRUSH_MAX_BUCKET_WEIGHT && itemid < 0) {
+	    err << "bucket weight limited to " << CRUSH_MAX_BUCKET_WEIGHT / 0x10000
+	        << " to prevent overflow" << std::endl;
+	    return -ERANGE;
+	  }
+	}
+	else if (key == "pos") 
+	  pos = int_node(sub->children[q]);
+	else
+	  ceph_abort();
+
+      }
+      if (alg == CRUSH_BUCKET_UNIFORM) {
+	if (!have_uniform_weight) {
+	  have_uniform_weight = true;
+	  uniform_weight = weight;
+	} else {
+	  if (uniform_weight != weight) {
+	    err << "item '" << iname << "' in uniform bucket '" << name << "' has weight " << (float)weight/(float)0x10000
+		<< " but previous item(s) have weight " << (float)uniform_weight/(float)0x10000
+		<< "; uniform bucket items must all have identical weights." << std::endl;
+	    return -1;
+	  }
+	}
+      }
+
+      if (pos >= size) {
+	err << "item '" << iname << "' in bucket '" << name << "' has pos " << pos << " >= size " << size << std::endl;
+	return -1;
+      }
+      if (pos < 0) {  // no explicit pos: take the first position not claimed explicitly
+	while (used_items.count(curpos)) curpos++;
+	pos = curpos++;
+      }
+      items[pos] = itemid;
+      weights[pos] = weight;
+
+      if (crush_addition_is_unsafe(bucketweight, weight)) {
+        err << "oh no! our bucket weights are overflowing all over the place, better lower the item weights" << std::endl;
+        return -ERANGE;
+      }
+
+      bucketweight += weight;
+    }
+  }
+
+  if (id == 0) {  // no explicit id: take the first negative id not yet present in id_item
+    for (id=-1; id_item.count(id); id--) ;
+  }
+
+  for (auto &entry : class_id)
+    class_bucket[id][entry.first] = entry.second;
+
+  if (verbose) err << "bucket " << name << " (" << id << ") " << size << " items and weight "
+		   << (float)bucketweight / (float)0x10000 << std::endl;
+  id_item[id] = name;
+  item_id[name] = id;
+  item_weight[id] = bucketweight;  // used as the default weight when this bucket is an item of another bucket
+  
+  ceph_assert(id != 0);
+  int idout;
+  int r = crush.add_bucket(id, alg, hash, type, size,
+                           items.data(), weights.data(), &idout);
+  if (r < 0) {
+    if (r == -EEXIST)
+      err << "Duplicate bucket id " << id << std::endl;
+    else
+      err << "add_bucket failed " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  r = crush.set_item_name(id, name.c_str());
+  return r;
+}
+
+int CrushCompiler::parse_rule(iter_t const& i)  // parses one rule definition and adds it to the crush map; returns 0 or a negative error
+{
+  int start;  // rule name is optional!
+ 
+  string rname = string_node(i->children[1]);
+  if (rname != "{") {  // child 1 is the rule name unless it is already the opening brace
+    if (rule_id.count(rname)) {
+      err << "rule name '" << rname << "' already defined\n" << std::endl;
+      return -1;
+    }
+    start = 4;
+  } else {
+    rname = string();
+    start = 3;
+  }
+
+  int ruleno = int_node(i->children[start]);  // value of the rule id field
+
+  string tname = string_node(i->children[start+2]);  // rule type keyword
+  int type;
+  if (tname == "replicated")
+    type = CEPH_PG_TYPE_REPLICATED;
+  else if (tname == "erasure")
+    type = CEPH_PG_TYPE_ERASURE;
+  else 
+    ceph_abort();  // any other type keyword aborts here
+
+  int minsize = int_node(i->children[start+4]);
+  int maxsize = int_node(i->children[start+6]);
+  
+  int steps = i->children.size() - start - 8;  // steps occupy children[start+7 .. size-2]
+  //err << "num steps " << steps << std::endl;
+
+  if (crush.rule_exists(ruleno)) {
+    err << "rule " << ruleno << " already exists" << std::endl;
+    return -1;
+  }
+  int r = crush.add_rule(ruleno, steps, type, minsize, maxsize);
+  if (r != ruleno) {  // anything but the requested rule number counts as failure
+    err << "unable to add rule id " << ruleno << " for rule '" << rname
+	<< "'" << std::endl;
+    return -1;
+  }
+  if (rname.length()) {
+    crush.set_rule_name(ruleno, rname.c_str());
+    rule_id[rname] = ruleno;
+  }
+
+  int step = 0;  // index of the next rule step to fill in
+  for (iter_t p = i->children.begin() + start + 7; step < steps; p++) {  // every case below consumes exactly one step
+    iter_t s = p->children.begin() + 1;  // child 1 of the step node identifies the step kind
+    int stepid = s->value.id().to_long();
+    switch (stepid) {
+    case crush_grammar::_step_take: 
+      {
+	string item = string_node(s->children[1]);
+	if (!item_id.count(item)) {
+	  err << "in rule '" << rname << "' item '" << item << "' not defined" << std::endl;
+	  return -1;
+	}
+        int id = item_id[item];
+        int c = -1;  // stays -1 when the step names no class
+        string class_name;
+        if (s->children.size() > 2) {  // optional "class <name>" suffix
+          class_name = string_node(s->children[3]);
+          c = crush.get_class_id(class_name);
+          if (c < 0)
+            return c;  // note: no message is written to err for this failure
+          if (crush.class_bucket.count(id) == 0) {
+            err << "in rule '" << rname << "' step take " << item
+                << " has no class information" << std::endl;
+            return -EINVAL;
+          }
+          if (crush.class_bucket[id].count(c) == 0) {
+            err << "in rule '" << rname << "' step take " << item
+                << " no matching bucket for class " << class_name << std::endl;
+            return -EINVAL;
+          }
+          id = crush.class_bucket[id][c];  // take the bucket id recorded for this class instead
+        }
+        if (verbose) {
+          err << "rule " << rname << " take " << item;
+          if (c < 0)
+            err << std::endl;
+          else
+            err << " remapped to " << crush.get_item_name(id) << std::endl;
+        }
+
+	crush.set_rule_step_take(ruleno, step++, id);
+      }
+      break;
+
+    case crush_grammar::_step_set_choose_tries:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_choose_tries(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_set_choose_local_tries:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_choose_local_tries(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_set_choose_local_fallback_tries:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_choose_local_fallback_tries(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_set_chooseleaf_tries:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_chooseleaf_tries(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_set_chooseleaf_vary_r:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_chooseleaf_vary_r(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_set_chooseleaf_stable:
+      {
+	int val = int_node(s->children[1]);
+	crush.set_rule_step_set_chooseleaf_stable(ruleno, step++, val);
+      }
+      break;
+
+    case crush_grammar::_step_choose:
+    case crush_grammar::_step_chooseleaf:
+      {
+	string type = string_node(s->children[4]);  // bucket type name; shadows the rule-level `type`
+	if (!type_id.count(type)) {
+	  err << "in rule '" << rname << "' type '" << type << "' not defined" << std::endl;
+	  return -1;
+	}
+	string choose = string_node(s->children[0]);  // "choose" or "chooseleaf"
+	string mode = string_node(s->children[1]);  // "firstn" or "indep"
+	if (choose == "choose") {
+	  if (mode == "firstn")
+	    crush.set_rule_step_choose_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]);
+	  else if (mode == "indep")
+	    crush.set_rule_step_choose_indep(ruleno, step++, int_node(s->children[2]), type_id[type]);
+	  else ceph_abort();
+	} else if (choose == "chooseleaf") {
+	  if (mode == "firstn") 
+	    crush.set_rule_step_choose_leaf_firstn(ruleno, step++, int_node(s->children[2]), type_id[type]);
+	  else if (mode == "indep")
+	    crush.set_rule_step_choose_leaf_indep(ruleno, step++, int_node(s->children[2]), type_id[type]);
+	  else ceph_abort();
+	} else ceph_abort();
+      }
+      break;
+
+    case crush_grammar::_step_emit:
+      crush.set_rule_step_emit(ruleno, step++);
+      break;
+
+    default:
+      err << "bad crush step " << stepid << std::endl;
+      return -1;
+    }
+  }
+  ceph_assert(step == steps);
+  return 0;
+}
+
+// Parse one "[ w0 w1 ... ]" list into `weight_set` (one weight per bucket item). Returns 0, -1 on a count mismatch, or -ENOMEM.
+int CrushCompiler::parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set)
+{
+  __u32 size = i->children.size() - 2;  // -2 for the enclosing [ ]
+  __u32 bucket_size = crush.get_bucket_size(bucket_id);
+  if (size != bucket_size) {
+    err << bucket_id << " needs exactly " << bucket_size << " weights but got " << size << std::endl;
+    return -1;
+  }
+  weight_set->weights = (__u32 *)calloc(size, sizeof(__u32));
+  if (!weight_set->weights && size > 0)
+    return -ENOMEM;  // calloc can fail; never write through a null pointer
+  weight_set->size = size;  // recorded only once the array exists
+  iter_t p = i->children.begin() + 1;  // the first weight follows the opening '['
+  for (__u32 pos = 0; pos < size; ++pos, ++p)
+    weight_set->weights[pos] = float_node(*p) * (float)0x10000;  // store as fixed point
+  return 0;
+}
+
+// Parse a "weight_set [ [..] [..] ]" clause into arg->weight_set, one entry per nested weight list.
+// Returns 0 on success, -ENOMEM if the array cannot be allocated, -1 on bad syntax, or a nested list's error.
+int CrushCompiler::parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg)
+{
+  const __u32 positions = i->children.size() - 3;  // -3: the leading "weight_set" keyword and the enclosing [ ]
+  arg->weight_set = (crush_weight_set *)calloc(positions, sizeof(crush_weight_set));
+  if (!arg->weight_set && positions > 0)
+    return -ENOMEM;  // calloc can fail; never index into a null array
+  arg->weight_set_positions = positions;  // recorded only once the array exists
+  __u32 pos = 0;
+  for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
+    if ((int)p->value.id().to_long() != crush_grammar::_weight_set_weights)
+      continue;  // only nested weight lists are of interest here
+    if (pos >= positions) {
+      err << "invalid weight_set syntax" << std::endl;
+      return -1;
+    }
+    int r = parse_weight_set_weights(p, bucket_id, &arg->weight_set[pos++]);
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+// Parse an "ids [ ... ]" clause into arg->ids (one id per bucket item). Returns 0, -1 on a count mismatch, or -ENOMEM.
+int CrushCompiler::parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *arg)
+{
+  __u32 size = i->children.size() - 3;  // -3 for the leading "ids" keyword and the enclosing [ ]
+  __u32 bucket_size = crush.get_bucket_size(bucket_id);
+  if (size != bucket_size) {
+    err << bucket_id << " needs exactly " << bucket_size << " ids but got " << size << std::endl;
+    return -1;
+  }
+  arg->ids = (__s32 *)calloc(size, sizeof(__s32));
+  if (!arg->ids && size > 0)
+    return -ENOMEM;  // calloc can fail; never write through a null pointer
+  arg->ids_size = size;  // recorded only once the array exists
+  for (__u32 pos = 0; pos < size; pos++)
+    arg->ids[pos] = int_node(i->children[2 + pos]);  // ids start after the keyword and '['
+  return 0;
+}
+
+// Parse one "{ bucket_id <n> ... }" entry into the slot of `args` that belongs to that bucket.
+// Returns 0, -1 if the bucket id is out of range or unknown, or the negative result of a nested clause.
+int CrushCompiler::parse_choose_arg(iter_t const& i, crush_choose_arg *args)
+{
+  const int bucket_id = int_node(i->children[2]);
+  const int slot = -1-bucket_id;  // index of this bucket inside `args`
+  if (slot < 0 || slot >= crush.get_max_buckets()) {
+    err << bucket_id << " is out of range" << std::endl;
+    return -1;
+  }
+  if (!crush.bucket_exists(bucket_id)) {
+    err << bucket_id << " does not exist" << std::endl;
+    return -1;
+  }
+  crush_choose_arg *arg = &args[slot];
+  for (iter_t p = i->children.begin(); p != i->children.end(); ++p) {
+    const int kind = (int)p->value.id().to_long();
+    int rc = 0;  // children of any other kind are ignored
+    if (kind == crush_grammar::_weight_set)
+      rc = parse_weight_set(p, bucket_id, arg);
+    else if (kind == crush_grammar::_choose_arg_ids)
+      rc = parse_choose_arg_ids(p, bucket_id, arg);
+    if (rc < 0)
+      return rc;
+  }
+  return 0;
+}
+
+// Parse one choose_args section and store the resulting map in
+// crush.choose_args under the index read from children[1].
+// Returns 0 on success, a negative value if the index is already present,
+// get_max_buckets() fails, or one of the entries does not parse.
+int CrushCompiler::parse_choose_args(iter_t const& i)
+{
+  const int index = int_node(i->children[1]);
+  if (crush.choose_args.count(index) != 0) {
+    err << index << " duplicated" << std::endl;
+    return -1;
+  }
+
+  const auto bucket_slots = crush.get_max_buckets();
+  if (bucket_slots < 0) {
+    err << "get_max_buckets() returned error" << std::endl;
+    return -1;
+  }
+
+  // one zero-initialized element per bucket slot
+  crush_choose_arg_map arg_map;
+  arg_map.size = bucket_slots;
+  arg_map.args = (crush_choose_arg *)calloc(arg_map.size, sizeof(crush_choose_arg));
+
+  // skip the first two children; of the rest only choose_arg nodes are parsed
+  for (iter_t child = i->children.begin() + 2; child != i->children.end(); ++child) {
+    if ((int)child->value.id().to_long() != crush_grammar::_choose_arg)
+      continue;
+    const int r = parse_choose_arg(child, arg_map.args);
+    if (r < 0) {
+      // hand the partially filled map back for destruction before bailing out
+      crush.destroy_choose_args(arg_map);
+      return r;
+    }
+  }
+
+  crush.choose_args[index] = arg_map;
+  return 0;
+}
+
+// Scan every bucket node below i and record each id given on its leading
+// "id ..." lines in id_item, mapped to an empty name.
+void CrushCompiler::find_used_bucket_ids(iter_t const& i)
+{
+  for (iter_t node = i->children.begin(); node != i->children.end(); ++node) {
+    if ((int)node->value.id().to_long() != crush_grammar::_bucket)
+      continue;
+
+    // walk the bucket's children from index 3 and stop at the first one
+    // whose tag is not "id"
+    for (iter_t line = node->children.begin() + 3;
+         line != node->children.end();
+         ++line) {
+      if (string_node(line->children[0]) != "id")
+        break;
+      const int used_id = int_node(line->children[1]);
+      id_item[used_id] = string();
+    }
+  }
+}
+
+// Walk the top-level nodes of the parse tree and dispatch each one to its
+// parser, then finalize the crush map.
+// Returns 0 on success or the first negative value a parser returns.
+int CrushCompiler::parse_crush(iter_t const& i)
+{
+  // record the ids claimed by "id" lines before any bucket is parsed
+  find_used_bucket_ids(i);
+
+  bool rules_started = false;
+  for (iter_t node = i->children.begin(); node != i->children.end(); ++node) {
+    int ret = 0;
+    switch (node->value.id().to_long()) {
+    case crush_grammar::_tunable:
+      ret = parse_tunable(node);
+      break;
+    case crush_grammar::_device:
+      ret = parse_device(node);
+      break;
+    case crush_grammar::_bucket_type:
+      ret = parse_bucket_type(node);
+      break;
+    case crush_grammar::_bucket:
+      // a bucket that follows a rule is rejected
+      if (rules_started) {
+        err << "buckets must be defined before rules" << std::endl;
+        return -1;
+      }
+      ret = parse_bucket(node);
+      break;
+    case crush_grammar::_crushrule:
+      // populate_classes() runs once, right before the first rule is parsed
+      if (!rules_started) {
+        rules_started = true;
+        crush.populate_classes(class_bucket);
+      }
+      ret = parse_rule(node);
+      break;
+    case crush_grammar::_choose_args:
+      ret = parse_choose_args(node);
+      break;
+    default:
+      // no other node id is handled at the top level
+      ceph_abort();
+    }
+    if (ret < 0)
+      return ret;
+  }
+
+  crush.finalize();
+  return 0;
+}
+
+// Squash each run of whitespace (other than '\n') into a single space.
+// A run at the very start of the result or at the end of the input is
+// dropped entirely; '\n' characters are copied through unchanged.
+// When verbose > 3 the input and output are echoed on err.
+string CrushCompiler::consolidate_whitespace(string in)
+{
+  string out;
+
+  bool white = false;  // a whitespace run has been seen but not yet emitted
+  for (const char c : in) {
+    // isspace() has undefined behavior for negative arguments other than
+    // EOF, so convert through unsigned char to be safe on bytes >= 0x80.
+    if (c != '\n' && isspace(static_cast<unsigned char>(c))) {
+      white = true;
+      continue;
+    }
+    if (white) {
+      // emit the pending run as one space, unless nothing precedes it
+      if (out.length())
+        out += " ";
+      white = false;
+    }
+    out += c;
+  }
+  if (verbose > 3)
+    err << " \"" << in << "\" -> \"" << out << "\"" << std::endl;
+  return out;
+}
+
+// Debug helper: print the parse subtree rooted at i on err, one node per
+// line.  Each line shows "dump", ind tabs, the node id, the matched text
+// and the number of children; children are printed with ind+1.
+void CrushCompiler::dump(iter_t const& i, int ind)
+{
+  err << "dump";
+  // The indentation belongs on the same stream as the rest of the line;
+  // sending it to cout split one logical line across two streams.
+  for (int j=0; j<ind; j++)
+    err << "\t";
+  long id = i->value.id().to_long();
+  err << id << "\t";
+  err << "'" << string(i->value.begin(), i->value.end())
+      << "' " << i->children.size() << " children" << std::endl;
+  for (unsigned int j = 0; j < i->children.size(); j++)
+    dump(i->children.begin() + j, ind+1);
+}
+
+/**
+ * Reorder bucket nodes in the parse tree when a bucket lists, as an item,
+ * a bucket that is defined after it, e.g.
+ *
+ *   rack using_foo { item foo }
+ *   host foo { ... }
+ *
+ * Compiled in that order, the item could not be identified when selecting
+ * in the bucket that uses it, so the two nodes are swapped.
+ *
+ * Returns 0 on success, -1 if two buckets list each other as items.
+ **/
+int CrushCompiler::adjust_bucket_item_place(iter_t const &i)
+{
+  map<string,set<string> > items_of;  // bucket name -> names of its items
+  map<string,iter_t> node_of;         // bucket name -> its node in the tree
+  vector<string> order;               // bucket names in order of appearance
+
+  for (iter_t node = i->children.begin(); node != i->children.end(); ++node) {
+    if ((int)node->value.id().to_long() != crush_grammar::_bucket)
+      continue;
+
+    const string name = string_node(node->children[1]);
+    order.push_back(name);
+    node_of[name] = node;
+
+    // only children 3 .. size-2 are inspected, and only bucket items count
+    for (unsigned q = 3; q < node->children.size() - 1; ++q) {
+      iter_t sub = node->children.begin() + q;
+      if ((int)sub->value.id().to_long() == crush_grammar::_bucket_item)
+        items_of[name].insert(string_node(sub->children[1]));
+    }
+  }
+
+  // for each pair (earlier, later): swap the nodes if earlier uses later
+  for (unsigned a = 0; a < order.size(); ++a) {
+    for (unsigned b = a + 1; b < order.size(); ++b) {
+      if (!items_of[order[a]].count(order[b]))
+        continue;
+      if (items_of[order[b]].count(order[a])) {
+        err << "bucket  '" <<  order[a] << "' and bucket '"
+            << order[b] << "' are included each other" << std::endl;
+        return -1;
+      }
+      std::iter_swap(node_of[order[a]], node_of[order[b]]);
+    }
+  }
+
+  return 0;
+}
+
+// Compile the textual crush map read from `in` into `crush`.
+//
+// The input is read line by line, comments are stripped, whitespace is
+// consolidated and the lines are joined into one string that is handed to
+// the spirit grammar.  On a parse error the offending line is reported on
+// err, prefixed with infn ("<input>" when infn is null).
+//
+// Returns 0 on success and a negative value on a parse error or when a
+// later stage (adjust_bucket_item_place / parse_crush) fails.
+int CrushCompiler::compile(istream& in, const char *infn)
+{
+  if (!infn)
+    infn = "<input>";
+
+  // always start with legacy tunables, so that the compiled result of
+  // a given crush file is fixed for all time.
+  crush.set_tunables_legacy();
+
+  string big;             // the whole input, comment-free and whitespace-consolidated
+  string str;             // the line currently being processed
+  int line = 1;           // 1-based number of the current line
+  map<int,int> line_pos;  // pos -> line
+  map<int,string> line_val;  // line number -> raw text of that line (comments included)
+  while (getline(in, str)) {
+    // remove newline
+    int l = str.length();
+    if (l && str[l - 1] == '\n')
+      str.erase(l-1, 1);
+
+    // keep the unmodified line for error messages
+    line_val[line] = str;
+
+    // strip comment
+    int n = str.find("#");
+    if (n >= 0)
+      str.erase(n, str.length()-n);
+    
+    if (verbose>1) err << line << ": " << str << std::endl;
+
+    // work around spirit crankiness by removing extraneous
+    // whitespace.  there is probably a more elegant solution, but
+    // this only broke with the latest spirit (with the switchover to
+    // "classic"), i don't want to spend too much time figuring it
+    // out.
+    string stripped = consolidate_whitespace(str);
+    // separate this line from the previous content with a single space
+    if (stripped.length() && big.length() && big[big.length()-1] != ' ') big += " ";
+
+    // remember where this line starts inside big; a later line that starts
+    // at the same offset (e.g. after an empty line) overwrites the entry
+    line_pos[big.length()] = line;
+    line++;
+    big += stripped;
+  }
+  
+  if (verbose > 2) err << "whole file is: \"" << big << "\"" << std::endl;
+  
+  crush_grammar crushg;
+  const char *start = big.c_str();
+  //tree_parse_info<const char *> info = ast_parse(start, crushg, space_p);
+  auto info = ast_parse(start, crushg, boost::spirit::space_p);
+
+  // parse error?
+  if (!info.full) {
+    // offset inside big at which the parser stopped
+    int cpos = info.stop - start;
+    //out << "cpos " << cpos << std::endl;
+    //out << " linemap " << line_pos << std::endl;
+    ceph_assert(!line_pos.empty());
+    // find the last recorded line start that is <= cpos
+    map<int,int>::iterator p = line_pos.upper_bound(cpos);
+    if (p != line_pos.begin())
+      --p;
+    int line = p->second;
+    int pos = cpos - p->first;
+    // NOTE(review): pos is an offset into the consolidated text, while
+    // line_val holds the raw line, so the excerpt may not start exactly at
+    // the failing token when the line contained extra whitespace.
+    err << infn << ":" << line //<< ":" << (pos+1)
+	<< " error: parse error at '" << line_val[line].substr(pos) << "'" << std::endl;
+    return -1;
+  }
+  
+  // reorder buckets that are used before they are defined, then build the map
+  int r = adjust_bucket_item_place(info.trees.begin());
+  if (r < 0) {
+    return r;
+  }
+  //out << "parsing succeeded\n";
+  //dump(info.trees.begin());
+  return parse_crush(info.trees.begin());
+}
diff --git a/src/crush/CrushCompiler.h b/src/crush/CrushCompiler.h
new file mode 100644
index 000000000..26dac58cb
--- /dev/null
+++ b/src/crush/CrushCompiler.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_COMPILER_H
+#define CEPH_CRUSH_COMPILER_H
+
+#include "crush/CrushWrapper.h"
+#include "crush/grammar.h"
+
+#include <map>
+#include <iostream>
+
+// Translates between the textual crush map format and a CrushWrapper:
+// compile() parses text into `crush`, decompile() writes `crush` as text.
+// Diagnostics are written to the `err` stream given at construction.
+class CrushCompiler {
+  CrushWrapper& crush;   // the map being compiled into / decompiled from
+  std::ostream& err;     // destination for diagnostics
+  int verbose;           // verbosity level; higher values enable more tracing on err
+  bool unsafe_tunables;  // set by enable_unsafe_tunables(); false by default
+
+  // decompile
+  // state of a bucket while decompiling
+  enum dcb_state_t {
+    DCB_STATE_IN_PROGRESS = 0,
+    DCB_STATE_DONE
+  };
+
+  int decompile_weight_set_weights(crush_weight_set weight_set,
+				   std::ostream &out);
+  int decompile_weight_set(crush_weight_set *weight_set,
+			   __u32 size,
+			   std::ostream &out);
+  int decompile_choose_arg(crush_choose_arg *arg,
+			   int bucket_id,
+			   std::ostream &out);
+  int decompile_ids(int *ids,
+		    __u32 size,
+		    std::ostream &out);
+  int decompile_choose_arg_map(crush_choose_arg_map arg_map,
+			       std::ostream &out);
+  int decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
+			    std::ostream &out);
+  int decompile_bucket_impl(int i, std::ostream &out);
+  int decompile_bucket(int cur,
+		       std::map<int, dcb_state_t>& dcb_states,
+		       std::ostream &out);
+
+  // compile
+  // parse tree types produced by the spirit grammar over a char buffer
+  typedef char const*         iterator_t;
+  typedef boost::spirit::tree_match<iterator_t> parse_tree_match_t;
+  typedef parse_tree_match_t::tree_iterator iter_t;
+  typedef parse_tree_match_t::node_t node_t;
+
+  // lookup tables filled while compiling
+  // (presumably name <-> id mappings, going by the member names; id_item is
+  // pre-seeded with used bucket ids by find_used_bucket_ids())
+  std::map<std::string, int> item_id;
+  std::map<int, std::string> id_item;
+  std::map<int, unsigned> item_weight;
+  std::map<std::string, int> type_id;
+  std::map<std::string, int> rule_id;
+  std::map<int32_t, std::map<int32_t, int32_t> > class_bucket; // bucket id -> class id -> shadow bucket id
+
+  // convert a parse tree node to a value
+  std::string string_node(node_t &node);
+  int int_node(node_t &node); 
+  float float_node(node_t &node);
+
+  // One parser per grammar element.  The ones whose definitions are visible
+  // alongside this header return 0 on success and a negative value on error.
+  int parse_tunable(iter_t const& i);
+  int parse_device(iter_t const& i);
+  int parse_bucket_type(iter_t const& i);
+  int parse_bucket(iter_t const& i);
+  int parse_rule(iter_t const& i);
+  int parse_weight_set_weights(iter_t const& i, int bucket_id, crush_weight_set *weight_set);
+  int parse_weight_set(iter_t const& i, int bucket_id, crush_choose_arg *arg);
+  int parse_choose_arg_ids(iter_t const& i, int bucket_id, crush_choose_arg *args);
+  int parse_choose_arg(iter_t const& i, crush_choose_arg *args);
+  int parse_choose_args(iter_t const& i);
+  // record the bucket ids named on "id" lines in id_item
+  void find_used_bucket_ids(iter_t const& i);
+  // dispatch every top-level node to its parser, then finalize the map
+  int parse_crush(iter_t const& i);  
+  // debug helper: print a parse subtree
+  void dump(iter_t const& i, int ind=1);
+  // squash runs of whitespace (except newlines) to a single space
+  std::string consolidate_whitespace(std::string in);
+  // swap bucket nodes so that a bucket used as an item precedes its user
+  int adjust_bucket_item_place(iter_t const &i);
+
+public:
+  // c:  the map to operate on; eo: stream for diagnostics;
+  // verbosity: tracing level, 0 by default
+  CrushCompiler(CrushWrapper& c, std::ostream& eo, int verbosity=0)
+    : crush(c), err(eo), verbose(verbosity),
+      unsafe_tunables(false) {}
+  ~CrushCompiler() {}
+
+  // set the unsafe_tunables flag
+  void enable_unsafe_tunables() {
+    unsafe_tunables = true;
+  }
+
+  // write the map as text to out
+  int decompile(std::ostream& out);
+  // parse text from in; infn is the name used in error messages
+  // ("<input>" when null).  Returns 0 on success, negative on error.
+  int compile(std::istream& in, const char *infn=0);
+};
+
+#endif
diff --git a/src/crush/CrushLocation.cc b/src/crush/CrushLocation.cc
new file mode 100644
index 000000000..05b36d810
--- /dev/null
+++ b/src/crush/CrushLocation.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "CrushLocation.h"
+#include "CrushWrapper.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "include/str_list.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/common_fwd.h"
+#include "include/compat.h"
+
+#include "common/SubProcess.h"
+
+#include <vector>
+
+namespace TOPNSPC::crush {
+
+// Re-read the location from the crush_location config option.
+// An empty option leaves the current location untouched and returns 0;
+// otherwise the result of _parse() is returned.
+int CrushLocation::update_from_conf()
+{
+  if (cct->_conf->crush_location.length() == 0) {
+    return 0;
+  }
+  return _parse(cct->_conf->crush_location);
+}
+
+// Parse s (key=value pairs separated by ';', ',', space or tab) and, on
+// success, replace the stored location with the result.
+// Returns 0 on success; on a parse failure the stored location is kept
+// and -EINVAL is returned.
+int CrushLocation::_parse(const std::string& s)
+{
+  std::multimap<std::string,std::string> new_crush_location;
+  std::vector<std::string> lvec;
+  get_str_vec(s, ";, \t", lvec);
+  int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location);
+  // loc is read in both branches below, so take the lock before either
+  std::lock_guard l(lock);
+  if (r < 0) {
+    // report the string that actually failed to parse: it may come from
+    // the location hook rather than from the crush_location option
+    lderr(cct) << "warning: crush_location '" << s
+	       << "' does not parse, keeping original crush_location "
+	       << loc << dendl;
+    return -EINVAL;
+  }
+  loc.swap(new_crush_location);
+  lgeneric_dout(cct, 10) << "crush_location is " << loc << dendl;
+  return 0;
+}
+
+// Run the configured crush_location_hook and parse its stdout as the new
+// location.
+//
+// Returns 0 when no hook is configured or the location was updated, and a
+// negative value when the hook is not readable, cannot be spawned, its
+// output cannot be read, it does not exit cleanly, or its output does not
+// parse.
+int CrushLocation::update_from_hook()
+{
+  if (cct->_conf->crush_location_hook.length() == 0)
+    return 0;
+
+  if (0 != access(cct->_conf->crush_location_hook.c_str(), R_OK)) {
+    // save errno right away (the logging below may overwrite it) and
+    // return it negated, like every other error path in this class
+    const int access_errno = errno;
+    lderr(cct) << "the user define crush location hook: " << cct->_conf->crush_location_hook
+               << " may not exist or can not access it" << dendl;
+    return -access_errno;
+  }
+
+  // stdin is closed; stdout and stderr are captured through pipes
+  SubProcessTimed hook(
+    cct->_conf->crush_location_hook.c_str(),
+    SubProcess::CLOSE, SubProcess::PIPE, SubProcess::PIPE,
+    cct->_conf->crush_location_hook_timeout);
+  hook.add_cmd_args(
+    "--cluster", cct->_conf->cluster.c_str(),
+    "--id", cct->_conf->name.get_id().c_str(),
+    "--type", cct->_conf->name.get_type_str(),
+    NULL);
+  int ret = hook.spawn();
+  if (ret != 0) {
+    lderr(cct) << "error: failed run " << cct->_conf->crush_location_hook << ": "
+	       << hook.err() << dendl;
+    return ret;
+  }
+
+  ceph::buffer::list bl;
+  ret = bl.read_fd(hook.get_stdout(), 100 * 1024);
+  if (ret < 0) {
+    lderr(cct) << "error: failed read stdout from "
+	       << cct->_conf->crush_location_hook
+	       << ": " << cpp_strerror(-ret) << dendl;
+    // also log whatever the hook wrote to stderr
+    ceph::buffer::list err;
+    err.read_fd(hook.get_stderr(), 100 * 1024);
+    lderr(cct) << "stderr:\n";
+    err.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // always join the child, even after a failed read
+  if (hook.join() != 0) {
+    lderr(cct) << "error: failed to join: " << hook.err() << dendl;
+    return -EINVAL;
+  }
+
+  if (ret < 0)
+    return ret;
+
+  std::string out;
+  bl.begin().copy(bl.length(), out);
+  // drop trailing whitespace (erases everything if the output is blank)
+  out.erase(out.find_last_not_of(" \n\r\t")+1);
+  return _parse(out);
+}
+
+// Establish the initial location: from the crush_location option if it is
+// set, else from the location hook if one is configured, else the default
+// host=<short hostname>, root=default.
+int CrushLocation::init_on_startup()
+{
+  if (cct->_conf->crush_location.length() != 0)
+    return update_from_conf();
+  if (cct->_conf->crush_location_hook.length() != 0)
+    return update_from_hook();
+
+  // nothing configured: start with a sane default
+  char hostname[HOST_NAME_MAX + 1];
+  if (gethostname(hostname, sizeof(hostname)) < 0)
+    strcpy(hostname, "unknown_host");
+
+  // use the short hostname: cut at the first '.'
+  if (char *dot = strchr(hostname, '.'))
+    *dot = '\0';
+
+  std::lock_guard guard(lock);
+  loc.clear();
+  loc.emplace("host", hostname);
+  loc.emplace("root", "default");
+  return 0;
+}
+
+// Return a copy of the current location, taken while holding the lock.
+std::multimap<std::string,std::string> CrushLocation::get_location() const
+{
+  std::lock_guard guard{lock};
+  std::multimap<std::string,std::string> snapshot = loc;
+  return snapshot;
+}
+
+// Print the location as a comma-separated list of quoted "type=name" pairs.
+std::ostream& operator<<(std::ostream& os, const CrushLocation& loc)
+{
+  const char *separator = "";  // nothing before the first pair
+  for (auto& [type, pos] : loc.get_location()) {
+    os << separator << '"' << type << '=' << pos << '"';
+    separator = ", ";
+  }
+  return os;
+}
+
+}
diff --git a/src/crush/CrushLocation.h b/src/crush/CrushLocation.h
new file mode 100644
index 000000000..678135c2e
--- /dev/null
+++ b/src/crush/CrushLocation.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_LOCATION_H
+#define CEPH_CRUSH_LOCATION_H
+
+#include <iosfwd>
+#include <map>
+#include <string>
+
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+
+namespace TOPNSPC::crush {
+
+// Holds the crush location of this process as a multimap of
+// (type, name) pairs.  The location is read and replaced under `lock`.
+class CrushLocation {
+public:
+  // Initializes the location right away via init_on_startup(); its return
+  // value is not checked here.
+  explicit CrushLocation(CephContext *c) : cct(c) {
+    init_on_startup();
+  }
+
+  int update_from_conf();  ///< refresh from config
+  int update_from_hook();  ///< call hook, if present
+  int init_on_startup();   ///< config, else hook, else host/root default
+
+  /// return a copy of the current location
+  std::multimap<std::string,std::string> get_location() const;
+
+private:
+  /// parse a location string and, on success, store it in loc
+  int _parse(const std::string& s);
+  CephContext *cct;  // provides configuration and logging
+  std::multimap<std::string,std::string> loc;  // current location
+  mutable ceph::mutex lock = ceph::make_mutex("CrushLocation");  // guards loc
+};
+
+std::ostream& operator<<(std::ostream& os, const CrushLocation& loc);
+}
+#endif
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
new file mode 100644
index 000000000..9e59096f3
--- /dev/null
+++ b/src/crush/CrushTester.cc
@@ -0,0 +1,807 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/icl/interval_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+
+#include "common/SubProcess.h"
+#include "common/fork_function.h"
+
+#include "include/stringify.h"
+#include "CrushTester.h"
+#include "CrushTreeDumper.h"
+#include "include/ceph_features.h"
+
+
+using std::cerr;
+using std::cout;
+using std::map;
+using std::ostringstream;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+// Store the weight of device dev as a 16.16 fixed-point value: f is scaled
+// by 0x10000 and clamped to the range [0, 0x10000].
+void CrushTester::set_device_weight(int dev, float f)
+{
+  const int scaled = (int)(f * 0x10000);
+  device_weight[dev] = std::max(0, std::min(scaled, 0x10000));
+}
+
+// Compute an upper bound on the number of replicas rule ruleno can place.
+//
+// For every type the rule chooses, the bound is the number of named items
+// of that type, capped by the count the rule asks for; the result is the
+// smallest such positive bound, or max(max_buckets, max_devices) when no
+// type yields a smaller one.
+int CrushTester::get_maximum_affected_by_rule(int ruleno)
+{
+  vector<int> affected_types;
+  map<int,int> replications_by_type;
+
+  // collect type and requested count from every step whose op code is
+  // >= 2 and != 4 (the steps that choose items of a given type)
+  const int num_steps = crush.get_rule_len(ruleno);
+  for (int step = 0; step < num_steps; ++step) {
+    const int op = crush.get_rule_op(ruleno, step);
+    if (op < 2 || op == 4)
+      continue;
+    const int requested = crush.get_rule_arg1(ruleno, step);
+    const int type = crush.get_rule_arg2(ruleno, step);
+    affected_types.push_back(type);
+    replications_by_type[type] = requested;
+  }
+
+  // count, per affected type, the name_map entries whose bucket type matches
+  map<int,int> max_devices_of_type;
+  for (const int type : affected_types) {
+    for (const auto& named : crush.name_map) {
+      if (crush.get_bucket_type(named.first) == type)
+        max_devices_of_type[type]++;
+    }
+  }
+
+  // a positive requested count that is below the available count wins
+  for (const int type : affected_types) {
+    const int requested = replications_by_type[type];
+    if (requested > 0 && requested < max_devices_of_type[type])
+      max_devices_of_type[type] = requested;
+  }
+
+  // the smallest positive per-type bound is the overall upper bound
+  int max_affected = std::max(crush.get_max_buckets(), crush.get_max_devices());
+  for (const int type : affected_types) {
+    const int available = max_devices_of_type[type];
+    if (available > 0 && available < max_affected)
+      max_affected = available;
+  }
+
+  return max_affected;
+}
+
+
+// Map every device id that is present in the crush map to a dense index:
+// the first present id gets 0, the next one 1, and so on.
+map<int,int> CrushTester::get_collapsed_mapping()
+{
+  map<int, int> collapse_mask;
+  const int num_to_check = crush.get_max_devices();
+
+  int next_id = 0;
+  for (int dev = 0; dev < num_to_check; ++dev) {
+    if (!crush.check_item_present(dev))
+      continue;
+    collapse_mask[dev] = next_id;
+    ++next_id;
+  }
+
+  return collapse_mask;
+}
+
+// Simulate failures by zeroing entries of `weight`.
+//
+// Nothing happens unless mark_down_device_ratio is positive.  Otherwise
+// the buckets that sit directly above devices are shuffled, the first
+// mark_down_bucket_ratio of them are visited, and in each visited bucket
+// the first mark_down_device_ratio of its shuffled items get weight 0.
+void CrushTester::adjust_weights(vector<__u32>& weight)
+{
+  if (!(mark_down_device_ratio > 0))
+    return;
+
+  // active buckets
+  vector<int> bucket_ids;
+  for (int i = 0; i < crush.get_max_buckets(); i++) {
+    int id = -1 - i;
+    if (crush.get_bucket_weight(id) > 0) {
+      bucket_ids.push_back(id);
+    }
+  }
+
+  // get buckets that are one level above a device: non-empty buckets
+  // whose first child has an id >= 0
+  vector<int> buckets_above_devices;
+  for (int id : bucket_ids) {
+    if (crush.get_bucket_size(id) == 0)
+      continue;
+    int first_child = crush.get_bucket_item(id, 0); // returns the ID of the bucket or device
+    if (first_child >= 0) {
+      buckets_above_devices.push_back(id);
+    }
+  }
+
+  // permute bucket list; with fewer than two entries there is nothing to
+  // permute, and the modulus below would be zero
+  const unsigned num_candidates = buckets_above_devices.size();
+  if (num_candidates > 1) {
+    for (unsigned i = 0; i < num_candidates; i++) {
+      unsigned j = lrand48() % (num_candidates - 1);
+      std::swap(buckets_above_devices[i], buckets_above_devices[j]);
+    }
+  }
+
+  // calculate how many buckets we need to reap, never more than we have
+  int num_buckets_to_visit = (int) (mark_down_bucket_ratio * num_candidates);
+  num_buckets_to_visit = std::min(num_buckets_to_visit, (int) num_candidates);
+
+  for (int i = 0; i < num_buckets_to_visit; i++) {
+    int id = buckets_above_devices[i];
+    int size = crush.get_bucket_size(id);
+    vector<int> items;
+    for (int o = 0; o < size; o++)
+      items.push_back(crush.get_bucket_item(id, o));
+
+    // permute items (same zero-modulus guard as above)
+    if (size > 1) {
+      for (int o = 0; o < size; o++) {
+        int j = lrand48() % (size - 1);
+        std::swap(items[o], items[j]);
+      }
+    }
+
+    int local_devices_to_visit = std::min((int) (mark_down_device_ratio*size), size);
+    for (int o = 0; o < local_devices_to_visit; o++) {
+      // take the item from the permuted list, and only touch weight for
+      // ids that are valid indexes (a child may itself be a bucket)
+      int item = items[o];
+      if (item >= 0 && (unsigned) item < weight.size())
+        weight[item] = 0;
+    }
+  }
+}
+
+// Decide whether the devices in `in` form an acceptable placement for rule
+// ruleno.  The placement is rejected when a device has weight 0, when a
+// device id occurs more than once, or (unless the rule only chooses the
+// lowest type and that type is named "osd") when two devices share a
+// location name for one of the types the rule chooses.
+bool CrushTester::check_valid_placement(int ruleno, vector<int> in, const vector<__u32>& weight)
+{
+  bool valid_placement = true;
+  vector<int> included_devices;
+  map<string,string> seen_devices;
+
+  // first the easy check: every device must be "up"; stop at the first
+  // one that is not
+  for (const int dev : in) {
+    if (weight[dev] == 0) {
+      valid_placement = false;
+      break;
+    }
+    included_devices.push_back(dev);
+  }
+
+  // number of steps in the rule
+  const int rule_size = crush.get_rule_len(ruleno);
+
+  // smallest type id in the type map, and its name
+  int min_map_type = crush.get_num_type_names();
+  for (const auto& type_entry : crush.type_map) {
+    if (type_entry.first < min_map_type)
+      min_map_type = type_entry.first;
+  }
+  string min_map_type_name = crush.type_map[min_map_type];
+
+  // names of the types chosen by the rule (steps with op >= 2 and != 4)
+  vector<string> affected_types;
+  for (int step = 0; step < rule_size; ++step) {
+    const int op = crush.get_rule_op(ruleno, step);
+    if (op >= 2 && op != 4) {
+      const int affected_type = crush.get_rule_arg2(ruleno, step);
+      affected_types.push_back(crush.get_type_name(affected_type));
+    }
+  }
+
+  // are we only dealing with osd's?
+  const bool only_osd_affected =
+    affected_types.size() == 1 &&
+    affected_types.back() == min_map_type_name &&
+    min_map_type_name == "osd";
+
+  // no device id may appear twice
+  for (const int dev : included_devices) {
+    if (std::count(included_devices.begin(), included_devices.end(), dev) > 1)
+      valid_placement = false;
+  }
+
+  // with more than just osd's affected, also look for devices that share
+  // a location for any affected type
+  if (!only_osd_affected) {
+    for (const int dev : included_devices) {
+      if (!valid_placement)
+        break;
+
+      // (type name -> location name) for this device
+      map<string,string> hierarchy = crush.get_full_location(dev);
+
+      for (const string& type_name : affected_types) {
+        const string& location = hierarchy[type_name];
+        if (seen_devices.count(location)) {
+          valid_placement = false;
+          break;
+        }
+        // remember the location, keyed by name, with the type it was seen for
+        seen_devices[location] = type_name;
+      }
+    }
+  }
+
+  return valid_placement;
+}
+
+// Pick a random placement for rule ruleno that passes
+// check_valid_placement().
+//
+// At most min(maxout, get_maximum_affected_by_rule(ruleno)) devices are
+// drawn uniformly from [0, max_devices); up to 100 draws are attempted.
+// On success the placement is stored in `out` and 0 is returned.
+// Returns -EINVAL when the total weight is 0, there are no devices, the
+// number of devices to draw is negative, or no valid draw was found.
+int CrushTester::random_placement(int ruleno, vector<int>& out, int maxout, vector<__u32>& weight)
+{
+  // total weight of the system; summed in a wide unsigned type so that
+  // many full-weight devices cannot overflow the accumulator
+  unsigned long long total_weight = 0;
+  for (unsigned i = 0; i < weight.size(); i++)
+    total_weight += weight[i];
+
+  const int max_devices = crush.get_max_devices();
+  if (total_weight == 0 || max_devices == 0)
+    return -EINVAL;
+
+  // determine the real maximum number of devices to return
+  const int devices_requested = std::min(maxout, get_maximum_affected_by_rule(ruleno));
+  if (devices_requested < 0)
+    return -EINVAL;
+
+  // fill the trial vector in place instead of going through a
+  // variable-length array, which is not standard C++
+  vector<int> trial_placement(devices_requested);
+  const int max_tries = 100;
+  bool accept_placement = false;
+  for (int attempt = 0; attempt < max_tries && !accept_placement; attempt++) {
+    for (int i = 0; i < devices_requested; i++)
+      trial_placement[i] = lrand48() % max_devices;
+    accept_placement = check_valid_placement(ruleno, trial_placement, weight);
+  }
+
+  if (!accept_placement)
+    return -EINVAL;
+
+  // save our random placement to the out vector
+  out.assign(trial_placement.begin(), trial_placement.end());
+  return 0;
+}
+
+// Append one CSV line "index,v0,v1,...\n" built from the int values in
+// vector_data to dst.
+void CrushTester::write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<int> vector_data)
+{
+  stringstream row(stringstream::in | stringstream::out);
+
+  // the index comes first, then every value preceded by a comma
+  row << index;
+  for (const int value : vector_data)
+    row << ',' << value;
+  row << '\n';
+
+  dst.push_back(row.str());
+}
+
+// Append one CSV line "index,v0,v1,...\n" built from the float values in
+// vector_data to dst.
+void CrushTester::write_integer_indexed_vector_data_string(vector<string> &dst, int index, vector<float> vector_data)
+{
+  stringstream row(stringstream::in | stringstream::out);
+
+  // the index comes first, then every value preceded by a comma
+  row << index;
+  for (const float value : vector_data)
+    row << ',' << value;
+  row << '\n';
+
+  dst.push_back(row.str());
+}
+
+// Append the CSV line "index,scalar_data\n" (int payload) to dst.
+void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst, int index, int scalar_data)
+{
+  stringstream row(stringstream::in | stringstream::out);
+  row << index << ',' << scalar_data << '\n';
+  dst.push_back(row.str());
+}
+// Append the CSV line "index,scalar_data\n" (float payload) to dst.
+void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst, int index, float scalar_data)
+{
+  stringstream row(stringstream::in | stringstream::out);
+  row << index << ',' << scalar_data << '\n';
+  dst.push_back(row.str());
+}
+
+// Run test() through fork_function() with the given timeout (in seconds).
+// Returns whatever fork_function() returns; a timeout (-ETIMEDOUT) is also
+// reported on err.
+int CrushTester::test_with_fork(int timeout)
+{
+  ostringstream sink;
+  const int result = fork_function(timeout, sink, [&]() { return test(); });
+  if (result == -ETIMEDOUT)
+    err << "timed out during smoke test (" << timeout << " seconds)";
+  return result;
+}
+
+namespace {
+  // Thrown by CrushWalker when an item cannot be resolved; carries the id
+  // of the offending item next to the message.
+  class BadCrushMap : public std::runtime_error {
+  public:
+    int item;
+    BadCrushMap(const char* msg, int id)
+      : std::runtime_error(msg), item(id) {}
+  };
+
+  // Tree dumper that prints nothing: it throws BadCrushMap when an item
+  // it visits has no name, no type name, or an id that is too large.
+  class CrushWalker : public CrushTreeDumper::Dumper<void> {
+    using DumbFormatter = void;
+    using Parent = CrushTreeDumper::Dumper<DumbFormatter>;
+    int max_id;
+  public:
+    CrushWalker(const CrushWrapper *crush, unsigned max_id)
+      : Parent(crush, CrushTreeDumper::name_map_t()), max_id(max_id) {}
+
+    void dump_item(const CrushTreeDumper::Item &qi, DumbFormatter *) override {
+      // non-bucket items are checked against type 0
+      int type = 0;
+      if (qi.is_bucket()) {
+        if (!crush->get_item_name(qi.id))
+          throw BadCrushMap("unknown item name", qi.id);
+        type = crush->get_bucket_type(qi.id);
+      } else if (max_id > 0 && qi.id >= max_id) {
+        // the id limit only applies when max_id is positive
+        throw BadCrushMap("item id too large", qi.id);
+      }
+      if (!crush->get_type_name(type))
+        throw BadCrushMap("unknown type name", qi.id);
+    }
+  };
+}
+
+// Verify that every item reachable in the crush map has a name and a type
+// name, and that device ids stay below max_id when max_id is positive.
+// Returns true if so; otherwise reports the offending item on err and
+// returns false.
+bool CrushTester::check_name_maps(unsigned max_id) const
+{
+  CrushWalker walker(&crush, max_id);
+  try {
+    // walk the whole map to see if it is self-contained
+    walker.dump(nullptr);
+    // also make sure the maps can handle a stray OSD (id >= 0) that is
+    // not listed in the crush map: "ceph osd tree" will try to print it
+    const CrushTreeDumper::Item stray(0, 0, 0, 0);
+    walker.dump_item(stray, nullptr);
+    return true;
+  } catch (const BadCrushMap& e) {
+    err << e.what() << ": item#" << e.item << std::endl;
+  }
+  return false;
+}
+
+// Return the name of the rule, or "rule<N>" when the rule has no name.
+static string get_rule_name(CrushWrapper& crush, int rule)
+{
+  const auto name = crush.get_rule_name(rule);
+  if (!name)
+    return string("rule") + std::to_string(rule);
+  return name;
+}
+
+// Report, on err, rules that share a ruleset and type and whose
+// [min_size, max_size] ranges overlap.
+void CrushTester::check_overlapped_rules() const
+{
+  namespace icl = boost::icl;
+  typedef std::set<string> RuleNames;
+  typedef icl::interval_map<int, RuleNames> Rules;
+  // <ruleset, type> => interval_map<size, {names}>
+  typedef std::map<std::pair<int, int>, Rules> RuleSets;
+  using interval = icl::interval<int>;
+
+  // mimic the logic of crush_find_rule(), but it only return the first matched
+  // one, but I am collecting all of them by the overlapped sizes.
+  RuleSets rulesets;
+  for (int rule = 0; rule < crush.get_max_rules(); rule++) {
+    if (!crush.rule_exists(rule)) {
+      continue;
+    }
+    Rules& rules = rulesets[{crush.get_rule_mask_ruleset(rule),
+			     crush.get_rule_mask_type(rule)}];
+    // adding to the interval_map merges the name sets of overlapping ranges
+    rules += make_pair(interval::closed(crush.get_rule_mask_min_size(rule),
+					crush.get_rule_mask_max_size(rule)),
+		       RuleNames{get_rule_name(crush, rule)});
+  }
+  // iterate by reference: copying would duplicate a whole interval_map
+  // (outer loop) or a whole name set (inner loop) on every step
+  for (const auto& entry : rulesets) {
+    const auto& ruleset_type = entry.first;
+    const Rules& rules = entry.second;
+    for (const auto& r : rules) {
+      const RuleNames& names = r.second;
+      // if there are more than one rules covering the same size range,
+      // print them out.
+      if (names.size() > 1) {
+	err << "overlapped rules in ruleset " << ruleset_type.first << ": "
+	    << boost::join(names, ", ") << "\n";
+      }
+    }
+  }
+}
+
+int CrushTester::test()
+{
+  // Map every x in [min_x, max_x] through each selected rule for each replica
+  // count, reporting on `err` whatever the output_* flags ask for. Negative
+  // (unset) rule/x ranges default to all rules and x = 0..1023. Returns 0.
+  if (min_rule < 0 || max_rule < 0) {
+    min_rule = 0;
+    max_rule = crush.get_max_rules() - 1;
+  }
+  if (min_x < 0 || max_x < 0) {
+    min_x = 0;
+    max_x = 1023;
+  }
+  // the x range is divided by num_batches below; zero would divide by zero
+  if (num_batches < 1)
+    num_batches = 1;
+
+  // initial osd weights
+  vector<__u32> weight;
+
+  /*
+   * note device weight is set by crushtool
+   * (likely due to a given a command line option)
+   */
+  for (int o = 0; o < crush.get_max_devices(); o++) {
+    if (device_weight.count(o)) {
+      weight.push_back(device_weight[o]);
+    } else if (crush.check_item_present(o)) {
+      weight.push_back(0x10000);
+    } else {
+      weight.push_back(0);
+    }
+  }
+
+  if (output_utilization_all)
+    cerr << "devices weights (hex): " << std::hex << weight << std::dec << std::endl;
+
+  // make adjustments
+  adjust_weights(weight);
+
+  if (output_choose_tries)
+    crush.start_choose_profile();
+
+  for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+    if (!crush.rule_exists(r)) {
+      if (output_statistics)
+        err << "rule " << r << " dne" << std::endl;
+      continue;
+    }
+    if (ruleset >= 0 &&
+	crush.get_rule_mask_ruleset(r) != ruleset) {
+      continue;
+    }
+    int minr = min_rep, maxr = max_rep;
+    if (min_rep < 0 || max_rep < 0) {
+      minr = crush.get_rule_mask_min_size(r);
+      maxr = crush.get_rule_mask_max_size(r);
+    }
+    // unnamed rules have a null name; the helper substitutes "rule<id>"
+    const string rule_name = get_rule_name(crush, r);
+
+    if (output_statistics)
+      err << "rule " << r << " (" << rule_name
+      << "), x = " << min_x << ".." << max_x
+      << ", numrep = " << minr << ".." << maxr
+      << std::endl;
+
+    for (int nr = minr; nr <= maxr; nr++) {
+      vector<int> per(crush.get_max_devices());
+      map<int,int> sizes;
+
+      int num_objects = ((max_x - min_x) + 1);
+
+      // create a structure to hold data for post-processing
+      tester_data_set tester_data;
+      vector<float> vector_data_buffer_f;
+
+      // per-batch observed placements and weight-proportional expectations
+      map<int, vector<int> > batch_per;
+      map<int, vector<float> > batch_expected;
+      int objects_per_batch = num_objects / num_batches;
+      int batch_min = min_x;
+      int batch_max = min_x + objects_per_batch - 1;
+
+      // get the total weight of the system (64-bit so the sum cannot overflow)
+      uint64_t total_weight = 0;
+      for (unsigned i = 0; i < per.size(); i++)
+        total_weight += weight[i];
+
+      if (total_weight == 0)
+	continue;
+
+      // compute the expected number of objects stored per device in the absence of weighting
+      float expected_objects = std::min(nr, get_maximum_affected_by_rule(r)) * num_objects;
+
+      // compute each device's proportional weight
+      vector<float> proportional_weights( per.size() );
+      for (unsigned i = 0; i < per.size(); i++)
+        proportional_weights[i] = (float) weight[i] / (float) total_weight;
+
+      if (output_data_file) {
+        // stage the absolute weight information for post-processing
+        for (unsigned i = 0; i < per.size(); i++) {
+          tester_data.absolute_weights[i] = (float) weight[i] / (float)0x10000;
+        }
+
+        // stage the proportional weight information for post-processing
+        for (unsigned i = 0; i < per.size(); i++) {
+          if (proportional_weights[i] > 0 )
+            tester_data.proportional_weights[i] = proportional_weights[i];
+
+          tester_data.proportional_weights_all[i] = proportional_weights[i];
+        }
+      }
+      // compute the expected number of objects stored per device when a device's weight is considered
+      vector<float> num_objects_expected( per.size() );
+      for (unsigned i = 0; i < per.size(); i++)
+        num_objects_expected[i] = (proportional_weights[i]*expected_objects);
+
+      for (int current_batch = 0; current_batch < num_batches; current_batch++) {
+        // the last batch takes whatever the integer division left over
+        if (current_batch == (num_batches - 1)) {
+          batch_max = max_x;
+          objects_per_batch = (batch_max - batch_min + 1);
+        }
+
+        float batch_expected_objects = std::min(nr, get_maximum_affected_by_rule(r)) * objects_per_batch;
+        vector<float> batch_num_objects_expected( per.size() );
+        for (unsigned i = 0; i < per.size() ; i++)
+          batch_num_objects_expected[i] = (proportional_weights[i]*batch_expected_objects);
+
+        // create a vector to hold placement results temporarily
+        vector<int> temporary_per ( per.size() );
+
+        for (int x = batch_min; x <= batch_max; x++) {
+          // create a vector to hold the results of a CRUSH placement or RNG simulation
+          vector<int> out;
+
+          if (use_crush) {
+            if (output_mappings)
+	      err << "CRUSH"; // prepend CRUSH to placement output
+            uint32_t real_x = x;
+            if (pool_id != -1) {
+              real_x = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, (uint32_t)pool_id);
+            }
+            crush.do_rule(r, real_x, out, nr, weight, 0);
+          } else {
+            if (output_mappings)
+	      err << "RNG"; // prepend RNG to placement output to denote simulation
+            // test our new monte carlo placement generator
+            random_placement(r, out, nr, weight);
+          }
+
+	  if (output_mappings)
+	    err << " rule " << r << " x " << x << " " << out << std::endl;
+
+          if (output_data_file)
+            write_integer_indexed_vector_data_string(tester_data.placement_information, x, out);
+
+          bool has_item_none = false;
+          for (unsigned i = 0; i < out.size(); i++) {
+            if (out[i] != CRUSH_ITEM_NONE) {
+              per[out[i]]++;
+              temporary_per[out[i]]++;
+            } else {
+              has_item_none = true;
+            }
+          }
+
+          sizes[out.size()]++;
+          if (output_bad_mappings && 
+              (out.size() != (unsigned)nr ||
+               has_item_none)) {
+            err << "bad mapping rule " << r << " x " << x << " num_rep " << nr << " result " << out << std::endl;
+          }
+        }
+
+        // record the finished batch once instead of copying after every object
+        batch_per[current_batch] = temporary_per;
+        batch_expected[current_batch] = batch_num_objects_expected;
+
+        batch_min = batch_max + 1;
+        batch_max = batch_min + objects_per_batch - 1;
+      }
+
+      for (unsigned i = 0; i < per.size(); i++)
+        if (output_utilization && !output_statistics)
+          err << "  device " << i
+          << ":\t" << per[i] << std::endl;
+
+      for (map<int,int>::iterator p = sizes.begin(); p != sizes.end(); ++p)
+        if (output_statistics)
+          err << "rule " << r << " (" << rule_name << ") num_rep " << nr
+          << " result size == " << p->first << ":\t"
+          << p->second << "/" << (max_x-min_x+1) << std::endl;
+
+      if (output_statistics)
+        for (unsigned i = 0; i < per.size(); i++) {
+          if (output_utilization) {
+            if (num_objects_expected[i] > 0 && per[i] > 0) {
+              err << "  device " << i << ":\t"
+                  << "\t" << " stored " << ": " << per[i]
+                  << "\t" << " expected " << ": " << num_objects_expected[i]
+                  << std::endl;
+            }
+          } else if (output_utilization_all) {
+            err << "  device " << i << ":\t"
+                << "\t" << " stored " << ": " << per[i]
+                << "\t" << " expected " << ": " << num_objects_expected[i]
+                << std::endl;
+          }
+        }
+
+      if (output_data_file)
+        for (unsigned i = 0; i < per.size(); i++) {
+          vector_data_buffer_f.clear();
+          vector_data_buffer_f.push_back( (float) per[i]);
+          vector_data_buffer_f.push_back( (float) num_objects_expected[i]);
+
+          write_integer_indexed_vector_data_string(tester_data.device_utilization_all, i, vector_data_buffer_f);
+
+          if (num_objects_expected[i] > 0 && per[i] > 0)
+            write_integer_indexed_vector_data_string(tester_data.device_utilization, i, vector_data_buffer_f);
+        }
+
+      if (output_data_file && num_batches > 1) {
+        // stage batch utilization information for post-processing
+        for (int i = 0; i < num_batches; i++) {
+          write_integer_indexed_vector_data_string(tester_data.batch_device_utilization_all, i, batch_per[i]);
+          write_integer_indexed_vector_data_string(tester_data.batch_device_expected_utilization_all, i, batch_expected[i]);
+        }
+      }
+
+      if (output_csv)
+        write_data_set_to_csv(output_data_file_name+rule_name,tester_data);
+    }
+  }
+
+  if (output_choose_tries) {
+    __u32 *v = 0;
+    int n = crush.get_choose_profile(&v);
+    for (int i=0; i<n; i++) {
+      cout.setf(std::ios::right);
+      cout << std::setw(2)
+      << i << ": " << std::setw(9) << v[i];
+      cout.unsetf(std::ios::right);
+      cout << std::endl;
+    }
+
+    crush.stop_choose_profile();
+  }
+
+  return 0;
+}
+
+int CrushTester::compare(CrushWrapper& crush2)
+{
+  // Map each x through every selected rule on both this map and crush2 with
+  // identical weights; returns 0 if all mappings agree, -1 otherwise.
+  if (min_rule < 0 || max_rule < 0) {
+    min_rule = 0;
+    max_rule = crush.get_max_rules() - 1;
+  }
+  if (min_x < 0 || max_x < 0) {
+    min_x = 0;
+    max_x = 1023;
+  }
+
+  // initial osd weights
+  vector<__u32> weight;
+
+  /*
+   * note device weight is set by crushtool
+   * (likely due to a given a command line option)
+   */
+  for (int o = 0; o < crush.get_max_devices(); o++) {
+    if (device_weight.count(o)) {
+      weight.push_back(device_weight[o]);
+    } else if (crush.check_item_present(o)) {
+      weight.push_back(0x10000);
+    } else {
+      weight.push_back(0);
+    }
+  }
+
+  // make adjustments
+  adjust_weights(weight);
+
+  int ret = 0;
+  for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+    if (!crush.rule_exists(r)) {
+      if (output_statistics)
+        err << "rule " << r << " dne" << std::endl;
+      continue;
+    }
+    if (ruleset >= 0 &&
+	crush.get_rule_mask_ruleset(r) != ruleset) {
+      continue;
+    }
+    int minr = min_rep, maxr = max_rep;
+    if (min_rep < 0 || max_rep < 0) {
+      minr = crush.get_rule_mask_min_size(r);
+      maxr = crush.get_rule_mask_max_size(r);
+    }
+    int bad = 0;
+    for (int nr = minr; nr <= maxr; nr++) {
+      for (int x = min_x; x <= max_x; ++x) {
+	vector<int> out;
+	crush.do_rule(r, x, out, nr, weight, 0);
+	vector<int> out2;
+	crush2.do_rule(r, x, out2, nr, weight, 0);
+	if (out != out2) {
+	  ++bad;
+	}
+      }
+    }
+    if (bad)
+      ret = -1;
+    int max = (maxr - minr + 1) * (max_x - min_x + 1);
+    // an empty range compares nothing; report 0 instead of printing 0/0 as NaN
+    double ratio = max > 0 ? (double)bad / (double)max : 0.0;
+    cout << "rule " << r << " had " << bad << "/" << max
+	 << " mismatched mappings (" << ratio << ")" << std::endl;
+  }
+  if (ret) {
+    cerr << "warning: maps are NOT equivalent" << std::endl;
+  } else {
+    cout << "maps appear equivalent" << std::endl;
+  }
+  return ret;
+}
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
new file mode 100644
index 000000000..1bbc01a70
--- /dev/null
+++ b/src/crush/CrushTester.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_TESTER_H
+#define CEPH_CRUSH_TESTER_H
+
+#include "crush/CrushWrapper.h"
+
+#include <fstream>
+
+// Runs CRUSH placement simulations against a CrushWrapper map, reporting on `err`.
+class CrushTester {
+  CrushWrapper& crush;
+  std::ostream& err;
+
+  std::map<int, int> device_weight;
+  int min_rule, max_rule;
+  int ruleset;
+  int min_x, max_x;
+  int min_rep, max_rep;
+  int64_t pool_id;
+
+  int num_batches;
+  bool use_crush;
+
+  float mark_down_device_ratio;
+  float mark_down_bucket_ratio;
+
+  bool output_utilization;
+  bool output_utilization_all;
+  bool output_statistics;
+  bool output_mappings;
+  bool output_bad_mappings;
+  bool output_choose_tries;
+
+  bool output_data_file;
+  bool output_csv;
+
+  std::string output_data_file_name;
+
+/*
+ * mark a ratio of devices down, can be used to simulate placement distributions
+ * under degraded cluster conditions
+ */
+  void adjust_weights(std::vector<__u32>& weight);
+
+  /*
+   * Get the maximum number of devices that could be selected to satisfy ruleno.
+   */
+  int get_maximum_affected_by_rule(int ruleno);
+
+  /*
+   * for maps where in devices have non-sequential id numbers, return a mapping of device id
+   * to a sequential id number. For example, if we have devices with id's 0 1 4 5 6 return a map
+   * where:
+   *     0 = 0
+   *     1 = 1
+   *     4 = 2
+   *     5 = 3
+   *     6 = 4
+   *
+   * which can help make post-processing easier
+   */
+  std::map<int,int> get_collapsed_mapping();
+
+  /*
+   * Essentially a re-implementation of CRUSH. Given a vector of devices
+   * check that the vector represents a valid placement for a given ruleno.
+   */
+  bool check_valid_placement(int ruleno, std::vector<int> in, const std::vector<__u32>& weight);
+
+  /*
+   * Generate a random selection of devices which satisfies ruleno. Essentially a
+   * monte-carlo simulator for CRUSH placements which can be used to compare the
+   * statistical distribution of the CRUSH algorithm to a random number generator
+   */
+  int random_placement(int ruleno, std::vector<int>& out, int maxout, std::vector<__u32>& weight);
+
+  // scaffolding to store data for off-line processing
+   struct tester_data_set {
+     std::vector<std::string> device_utilization;
+     std::vector<std::string> device_utilization_all;
+     std::vector<std::string> placement_information;
+     std::vector<std::string> batch_device_utilization_all;
+     std::vector<std::string> batch_device_expected_utilization_all;
+     std::map<int, float> proportional_weights;
+     std::map<int, float> proportional_weights_all;
+     std::map<int, float> absolute_weights;
+   } ;
+
+  void write_to_csv(std::ofstream& csv_file, const std::vector<std::string>& payload)
+   {
+     if (csv_file.good())
+       for (const auto& row : payload)
+         csv_file << row;
+   }
+
+  void write_to_csv(std::ofstream& csv_file, const std::map<int, float>& payload)
+   {
+     if (csv_file.good())
+       for (const auto& kv : payload)
+         csv_file << kv.first << ',' << kv.second << std::endl;
+   }
+
+   void write_data_set_to_csv(std::string user_tag, tester_data_set& tester_data)
+   {
+     std::ofstream device_utilization_file((user_tag + (std::string)"-device_utilization.csv").c_str());
+     std::ofstream device_utilization_all_file((user_tag + (std::string)"-device_utilization_all.csv").c_str());
+     std::ofstream placement_information_file((user_tag + (std::string)"-placement_information.csv").c_str());
+     std::ofstream proportional_weights_file((user_tag + (std::string)"-proportional_weights.csv").c_str());
+     std::ofstream proportional_weights_all_file((user_tag + (std::string)"-proportional_weights_all.csv").c_str());
+     std::ofstream absolute_weights_file((user_tag + (std::string)"-absolute_weights.csv").c_str());
+     // write the headers
+     device_utilization_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl;
+     device_utilization_all_file << "Device ID, Number of Objects Stored, Number of Objects Expected" << std::endl;
+     proportional_weights_file << "Device ID, Proportional Weight" << std::endl;
+     proportional_weights_all_file << "Device ID, Proportional Weight" << std::endl;
+     absolute_weights_file << "Device ID, Absolute Weight" << std::endl;
+
+     placement_information_file << "Input";
+     for (int i = 0; i < max_rep; i++) {
+       placement_information_file << ", OSD" << i;
+     }
+     placement_information_file << std::endl;
+
+     write_to_csv(device_utilization_file, tester_data.device_utilization);
+     write_to_csv(device_utilization_all_file, tester_data.device_utilization_all);
+     write_to_csv(placement_information_file, tester_data.placement_information);
+     write_to_csv(proportional_weights_file, tester_data.proportional_weights);
+     write_to_csv(proportional_weights_all_file, tester_data.proportional_weights_all);
+     write_to_csv(absolute_weights_file, tester_data.absolute_weights);
+
+     device_utilization_file.close();
+     device_utilization_all_file.close();
+     placement_information_file.close();
+     proportional_weights_file.close();
+     proportional_weights_all_file.close();
+     absolute_weights_file.close();
+
+     if (num_batches > 1) {
+       // batch rows hold one value per device, hence the all-devices data set
+       std::ofstream batch_device_utilization_all_file ((user_tag + (std::string)"-batch_device_utilization_all.csv").c_str());
+       std::ofstream batch_device_expected_utilization_all_file ((user_tag + (std::string)"-batch_device_expected_utilization_all.csv").c_str());
+       batch_device_utilization_all_file << "Batch Round";
+       for (unsigned i = 0; i < tester_data.device_utilization_all.size(); i++) {
+         batch_device_utilization_all_file << ", Objects Stored on OSD" << i;
+       }
+       batch_device_utilization_all_file << std::endl;
+
+       batch_device_expected_utilization_all_file << "Batch Round";
+       for (unsigned i = 0; i < tester_data.device_utilization_all.size(); i++) {
+         batch_device_expected_utilization_all_file << ", Objects Expected on OSD" << i;
+       }
+       batch_device_expected_utilization_all_file << std::endl;
+
+       write_to_csv(batch_device_utilization_all_file, tester_data.batch_device_utilization_all);
+       write_to_csv(batch_device_expected_utilization_all_file, tester_data.batch_device_expected_utilization_all);
+       batch_device_expected_utilization_all_file.close();
+       batch_device_utilization_all_file.close();
+     }
+   }
+
+   void write_integer_indexed_vector_data_string(std::vector<std::string> &dst, int index, std::vector<int> vector_data);
+   void write_integer_indexed_vector_data_string(std::vector<std::string> &dst, int index, std::vector<float> vector_data);
+   void write_integer_indexed_scalar_data_string(std::vector<std::string> &dst, int index, int scalar_data);
+   void write_integer_indexed_scalar_data_string(std::vector<std::string> &dst, int index, float scalar_data);
+
+public:
+  CrushTester(CrushWrapper& c, std::ostream& eo)
+    : crush(c), err(eo),
+      min_rule(-1), max_rule(-1),
+      ruleset(-1),
+      min_x(-1), max_x(-1),
+      min_rep(-1), max_rep(-1),
+      pool_id(-1),
+      num_batches(1),
+      use_crush(true),
+      mark_down_device_ratio(0.0),
+      mark_down_bucket_ratio(1.0),
+      output_utilization(false),
+      output_utilization_all(false),
+      output_statistics(false),
+      output_mappings(false),
+      output_bad_mappings(false),
+      output_choose_tries(false),
+      output_data_file(false),
+      output_csv(false),
+      output_data_file_name("")
+  { }
+
+  void set_output_data_file_name(std::string name) {
+    output_data_file_name = name;
+  }
+  std::string get_output_data_file_name() const {
+    return output_data_file_name;
+  }
+
+  void set_output_data_file(bool b) {
+     output_data_file = b;
+  }
+  bool get_output_data_file() const {
+    return output_data_file;
+  }
+
+  void set_output_csv(bool b) {
+     output_csv = b;
+  }
+  bool get_output_csv() const {
+    return output_csv;
+  }
+
+  void set_output_utilization(bool b) {
+    output_utilization = b;
+  }
+  bool get_output_utilization() const {
+    return output_utilization;
+  }
+
+  void set_output_utilization_all(bool b) {
+    output_utilization_all = b;
+  }
+  bool get_output_utilization_all() const {
+    return output_utilization_all;
+  }
+
+  void set_output_statistics(bool b) {
+    output_statistics = b;
+  }
+  bool get_output_statistics() const {
+    return output_statistics;
+  }
+
+  void set_output_mappings(bool b) {
+    output_mappings = b;
+  }
+  bool get_output_mappings() const {
+    return output_mappings;
+  }
+
+  void set_output_bad_mappings(bool b) {
+    output_bad_mappings = b;
+  }
+  bool get_output_bad_mappings() const {
+    return output_bad_mappings;
+  }
+
+  void set_output_choose_tries(bool b) {
+    output_choose_tries = b;
+  }
+  bool get_output_choose_tries() const {
+    return output_choose_tries;
+  }
+
+  void set_batches(int b) {
+    num_batches = b;
+  }
+  int get_batches() const {
+    return num_batches;
+  }
+
+  void set_random_placement() {
+    use_crush = false;
+  }
+  bool get_random_placement() const {
+    return use_crush == false;
+  }
+
+  void set_bucket_down_ratio(float bucket_ratio) {
+    mark_down_bucket_ratio = bucket_ratio;
+  }
+  float get_bucket_down_ratio() const {
+    return mark_down_bucket_ratio;
+  }
+
+  void set_device_down_ratio(float device_ratio) {
+    mark_down_device_ratio = device_ratio;
+  }
+  float get_device_down_ratio() const {
+    return mark_down_device_ratio;
+  }
+  // historical misnomer for the getter above, kept for existing callers
+  float set_device_down_ratio() const { return get_device_down_ratio(); }
+
+  void set_device_weight(int dev, float f);
+
+  void set_min_rep(int r) {
+    min_rep = r;
+  }
+  int get_min_rep() const {
+    return min_rep;
+  }
+
+  void set_max_rep(int r) {
+    max_rep = r;
+  }
+  int get_max_rep() const {
+    return max_rep;
+  }
+
+  void set_num_rep(int r) {
+    min_rep = max_rep = r;
+  }
+
+  void set_min_x(int x) {
+    min_x = x;
+  }
+  int get_min_x() const {
+    return min_x;
+  }
+
+  void set_pool_id(int64_t x){
+    pool_id = x;
+  }
+
+  void set_max_x(int x) {
+    max_x = x;
+  }
+  int get_max_x() const {
+    return max_x;
+  }
+
+  void set_x(int x) {
+    min_x = max_x = x;
+  }
+
+  void set_min_rule(int rule) {
+    min_rule = rule;
+  }
+  int get_min_rule() const {
+    return min_rule;
+  }
+
+  void set_max_rule(int rule) {
+    max_rule = rule;
+  }
+  int get_max_rule() const {
+    return max_rule;
+  }
+
+  void set_rule(int rule) {
+    min_rule = max_rule = rule;
+  }
+
+  void set_ruleset(int rs) {
+    ruleset = rs;
+  }
+
+  /**
+   * check if any bucket/nodes is referencing an unknown name or type
+   * @param max_id rejects any non-bucket items with id less than this number,
+   *               pass 0 to disable this check
+   * @return false if an dangling name/type is referenced or an item id is too
+   *         large, true otherwise
+   */
+  bool check_name_maps(unsigned max_id = 0) const;
+  /**
+   * print out overlapped crush rules belonging to the same ruleset
+   */
+  void check_overlapped_rules() const;
+  int test();
+  int test_with_fork(int timeout);
+
+  int compare(CrushWrapper& other);
+};
+
+#endif
diff --git a/src/crush/CrushTreeDumper.h b/src/crush/CrushTreeDumper.h
new file mode 100644
index 000000000..a10c0f2c1
--- /dev/null
+++ b/src/crush/CrushTreeDumper.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub@mirantis.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRUSH_TREE_DUMPER_H
+#define CRUSH_TREE_DUMPER_H
+
+#include "CrushWrapper.h"
+#include "include/stringify.h"
+
+/**
+ * CrushTreeDumper:
+ * A helper class and functions to dump a crush tree.
+ *
+ * Example:
+ *
+ *  class SimpleDumper : public CrushTreeDumper::Dumper<ostream> {
+ *  public:
+ *    SimpleDumper(const CrushWrapper *crush, const name_map_t& wsnames) :
+ *      CrushTreeDumper::Dumper<ostream>(crush, wsnames) {}
+ *  protected:
+ *    virtual void dump_item(const CrushTreeDumper::Item &qi, ostream *out) {
+ *      *out << qi.id;
+ *      for (int k = 0; k < qi.depth; k++)
+ *        *out << "-";
+ *      if (qi.is_bucket())
+ *        *out << crush->get_item_name(qi.id);
+ *      else
+ *        *out << "osd." << qi.id;
+ *      *out << "\n";
+ *    }
+ *  };
+ *
+ *  SimpleDumper(crush, wsnames).dump(out);
+ *
+ */
+
+namespace CrushTreeDumper {
+
+  // One queued node of the tree walk: a crush item id plus where it hangs.
+  struct Item {
+    int id = 0;                // crush id; negative ids denote buckets
+    int parent = 0;            // id of the bucket it was expanded from
+    int depth = 0;             // nesting level, 0 for a root
+    float weight = 0;          // weight as a float
+    std::list<int> children;   // child ids, filled in when it is expanded
+    Item() = default;
+    Item(int item_id, int parent_id, int level, float wt)
+      : id(item_id), parent(parent_id), depth(level), weight(wt) {}
+    bool is_bucket() const { return id < 0; }
+  };
+
+  // Walks a crush hierarchy root by root, depth first, handing every visited
+  // item to dump_item(). The inherited std::list is the queue of pending items.
+  template <typename F>
+  class Dumper : public std::list<Item> {
+  public:
+    explicit Dumper(const CrushWrapper *crush_,
+		    const name_map_t& weight_set_names_)
+      : Dumper(crush_, weight_set_names_, false) {}
+    explicit Dumper(const CrushWrapper *crush_,
+                    const name_map_t& weight_set_names_,
+                    bool show_shadow)
+      : crush(crush_), weight_set_names(weight_set_names_) {
+      if (show_shadow) {
+        crush->find_roots(&roots);
+      } else {
+        crush->find_nonshadow_roots(&roots);
+      }
+      root = roots.begin();
+    }
+
+    virtual ~Dumper() {}
+
+    virtual void reset() {
+      root = roots.begin();
+      touched.clear();
+      clear();
+    }
+
+    virtual bool should_dump_leaf(int i) const {
+      return true;
+    }
+    virtual bool should_dump_empty_bucket() const {
+      return true;
+    }
+
+    bool should_dump(int id) {
+      if (id >= 0)
+	return should_dump_leaf(id);
+      if (should_dump_empty_bucket())
+	return true;
+      int s = crush->get_bucket_size(id);
+      for (int k = s - 1; k >= 0; k--) {
+	int c = crush->get_bucket_item(id, k);
+	if (should_dump(c))
+	  return true;
+      }
+      return false;
+    }
+
+    bool next(Item &qi) {
+      if (empty()) {
+	while (root != roots.end() && !should_dump(*root))
+	  ++root;
+	if (root == roots.end())
+	  return false;
+	push_back(Item(*root, 0, 0, crush->get_bucket_weightf(*root)));
+	++root;
+      }
+
+      qi = front();
+      pop_front();
+      touched.insert(qi.id);
+
+      if (qi.is_bucket()) {
+	// queue bucket contents, sorted by (class, name)
+	int s = crush->get_bucket_size(qi.id);
+	std::map<std::string, std::pair<int,float>> sorted;
+	for (int k = s - 1; k >= 0; k--) {
+	  int id = crush->get_bucket_item(qi.id, k);
+	  if (should_dump(id)) {
+	    std::string sort_by;
+	    if (id >= 0) {
+	      const char *c = crush->get_item_class(id);
+	      sort_by = c ? c : "";
+	      sort_by += "_";
+	      char nn[80];
+	      snprintf(nn, sizeof(nn), "osd.%08d", id);
+	      sort_by += nn;
+	    } else {
+	      // a bucket without a recorded name sorts by its id instead
+	      const char *n = crush->get_item_name(id);
+	      sort_by = "_" + (n ? std::string(n) : stringify(id));
+	    }
+	    sorted[sort_by] = std::make_pair(
+	      id, crush->get_bucket_item_weightf(qi.id, k));
+	  }
+	}
+	for (auto p = sorted.rbegin(); p != sorted.rend(); ++p) {
+	  qi.children.push_back(p->second.first);
+	  push_front(Item(p->second.first, qi.id, qi.depth + 1,
+			  p->second.second));
+	}
+      }
+      return true;
+    }
+
+    void dump(F *f) {
+      reset();
+      Item qi;
+      while (next(qi))
+	dump_item(qi, f);
+    }
+
+    bool is_touched(int id) const { return touched.count(id) > 0; }
+
+    void set_root(const std::string& bucket) {
+      roots.clear();
+      if (crush->name_exists(bucket)) {
+	int i = crush->get_item_id(bucket);
+	roots.insert(i);
+      }
+      root = roots.begin();  // the old iterator died with the old set contents
+    }
+
+  protected:
+    virtual void dump_item(const Item &qi, F *f) = 0;
+
+    const CrushWrapper *crush;
+    const name_map_t &weight_set_names;
+
+  private:
+    std::set<int> touched;
+    std::set<int> roots;
+    std::set<int>::iterator root;
+  };
+
+  // Emit the formatter fields for one tree item: id, device class, name and
+  // type, plus, for items with a parent bucket, its weight in each weight set.
+  inline void dump_item_fields(const CrushWrapper *crush,
+			       const name_map_t& weight_set_names,
+			       const Item &qi, ceph::Formatter *f) {
+    f->dump_int("id", qi.id);
+    if (const char *c = crush->get_item_class(qi.id))
+      f->dump_string("device_class", c);
+    if (qi.is_bucket()) {
+      int type = crush->get_bucket_type(qi.id);
+      f->dump_string("name", crush->get_item_name(qi.id));
+      f->dump_string("type", crush->get_type_name(type));
+      f->dump_int("type_id", type);
+    } else {
+      f->dump_stream("name") << "osd." << qi.id;
+      f->dump_string("type", crush->get_type_name(0));
+      f->dump_int("type_id", 0);
+      f->dump_float("crush_weight", qi.weight);
+      f->dump_unsigned("depth", qi.depth);
+    }
+    if (qi.parent < 0) {
+      f->open_object_section("pool_weights");
+      for (auto& p : crush->choose_args) {
+	const crush_choose_arg_map& cmap = p.second;
+	int bidx = -1 - qi.parent;
+	const crush_bucket *b = crush->get_bucket(qi.parent);
+	if (b &&
+	    bidx < (int)cmap.size &&
+	    cmap.args[bidx].weight_set &&
+	    cmap.args[bidx].weight_set_positions >= 1) {
+	  // find qi's slot in its parent; the weight sets are indexed by slot
+	  const int nslots = (int)cmap.args[bidx].weight_set[0].size;
+	  const int nitems = crush->get_bucket_size(qi.parent);
+	  int bpos = 0;
+	  while (bpos < nslots && bpos < nitems && b->items[bpos] != qi.id)
+	    ++bpos;
+	  if (bpos >= nslots || bpos >= nitems)
+	    continue;  // no slot for qi here: indexing weights would overrun
+	  std::string name;
+	  if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+	    name = "(compat)";
+	  } else {
+	    auto q = weight_set_names.find(p.first);
+	    name = q != weight_set_names.end() ? q->second : stringify(p.first);
+	  }
+	  f->open_array_section(name.c_str());
+	  for (unsigned opos = 0; opos < cmap.args[bidx].weight_set_positions; ++opos) {
+	    float w = (float)cmap.args[bidx].weight_set[opos].weights[bpos] / (float)0x10000;
+	    f->dump_float("weight", w);
+	  }
+	  f->close_section();
+	}
+      }
+      f->close_section();
+    }
+  }
+
+  // Emit the ids of a bucket's children as the "children" array.
+  // Items that are not buckets produce no output at all. The crush
+  // argument is unused here.
+  inline void dump_bucket_children(const CrushWrapper *crush,
+				   const Item &qi, ceph::Formatter *f) {
+    if (qi.is_bucket()) {
+      f->open_array_section("children");
+      for (int child_id : qi.children) {
+	f->dump_int("child", child_id);
+      }
+      f->close_section();
+    }
+  }
+
+  // Dumper that writes each item as an "item" object section on a Formatter.
+  class FormattingDumper : public Dumper<ceph::Formatter> {
+  public:
+    explicit FormattingDumper(const CrushWrapper *crush,
+			      const name_map_t& weight_set_names)
+      : FormattingDumper(crush, weight_set_names, false) {}
+    explicit FormattingDumper(const CrushWrapper *crush,
+                              const name_map_t& weight_set_names,
+                              bool show_shadow)
+      : Dumper<ceph::Formatter>(crush, weight_set_names, show_shadow) {}
+
+  protected:
+    virtual void dump_item_fields(const Item &qi, ceph::Formatter *f) {
+      CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
+    }
+    virtual void dump_bucket_children(const Item &qi, ceph::Formatter *f) {
+      CrushTreeDumper::dump_bucket_children(crush, qi, f);
+    }
+
+    void dump_item(const Item &qi, ceph::Formatter *f) override {
+      f->open_object_section("item");
+      this->dump_item_fields(qi, f);
+      this->dump_bucket_children(qi, f);
+      f->close_section();
+    }
+  };
+
+}
+
+#endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
new file mode 100644
index 000000000..064c4c9b9
--- /dev/null
+++ b/src/crush/CrushWrapper.cc
@@ -0,0 +1,4247 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/osd_types.h"
+#include "common/debug.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "common/TextTable.h"
+#include "include/stringify.h"
+
+#include "CrushWrapper.h"
+#include "CrushTreeDumper.h"
+
+#define dout_subsys ceph_subsys_crush
+
+using std::cout;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::decode_nohead;
+using ceph::encode;
+using ceph::Formatter;
+
+// True if any rule's mask.ruleset differs from the slot index it occupies.
+bool CrushWrapper::has_legacy_rule_ids() const
+{
+  for (unsigned idx = 0; idx < crush->max_rules; ++idx) {
+    const crush_rule *rule = crush->rules[idx];
+    const bool mismatched = rule && rule->mask.ruleset != idx;
+    if (mismatched)
+      return true;
+  }
+  return false;
+}
+
+// Set every rule's ruleset to its slot index; returns old ruleset -> new id.
+std::map<int, int> CrushWrapper::renumber_rules()
+{
+  std::map<int, int> old_to_new;
+  for (unsigned slot = 0; slot < crush->max_rules; ++slot) {
+    crush_rule *rule = crush->rules[slot];
+    if (!rule || rule->mask.ruleset == slot) continue;
+    old_to_new[rule->mask.ruleset] = slot;
+    rule->mask.ruleset = slot;
+  }
+  return old_to_new;
+}
+
+// True if at least one existing bucket uses an algorithm other than straw2.
+bool CrushWrapper::has_non_straw2_buckets() const
+{
+  for (int pos = 0; pos < crush->max_buckets; ++pos) {
+    const crush_bucket *bucket = crush->buckets[pos];
+    if (bucket && bucket->alg != CRUSH_BUCKET_STRAW2) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// True if any rule slot satisfies is_v2_rule().
+bool CrushWrapper::has_v2_rules() const
+{
+  unsigned ruleno = 0;
+  while (ruleno < crush->max_rules && !is_v2_rule(ruleno))
+    ++ruleno;
+  // stopping before max_rules means a v2 rule was found
+  return ruleno < crush->max_rules;
+}
+
+// v2 rules use an indep choose step or a SET_CHOOSE(LEAF)_TRIES step.
+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
+{
+  const crush_rule *rule =
+    ruleid < crush->max_rules ? crush->rules[ruleid] : nullptr;
+  for (unsigned s = 0; rule && s < rule->len; ++s) {
+    switch (rule->steps[s].op) {
+    case CRUSH_RULE_CHOOSE_INDEP:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+    case CRUSH_RULE_SET_CHOOSE_TRIES:
+    case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// True if any rule slot satisfies is_v3_rule().
+bool CrushWrapper::has_v3_rules() const
+{
+  unsigned ruleno = 0;
+  while (ruleno < crush->max_rules && !is_v3_rule(ruleno))
+    ++ruleno;
+  // stopping before max_rules means a v3 rule was found
+  return ruleno < crush->max_rules;
+}
+
+// v3 rules are those containing a SET_CHOOSELEAF_VARY_R step.
+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
+{
+  const crush_rule *rule =
+    ruleid < crush->max_rules ? crush->rules[ruleid] : nullptr;
+  if (!rule) {
+    // out-of-range id or empty slot
+    return false;
+  }
+  unsigned s = 0;
+  while (s < rule->len && rule->steps[s].op != CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
+    ++s;
+  // s reaches len only when no step matched
+  return s < rule->len;
+}
+
+// True if at least one existing bucket uses the straw2 algorithm.
+bool CrushWrapper::has_v4_buckets() const
+{
+  for (int pos = 0; pos < crush->max_buckets; ++pos) {
+    const crush_bucket *bucket = crush->buckets[pos];
+    if (bucket && bucket->alg == CRUSH_BUCKET_STRAW2) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// True if any rule slot satisfies is_v5_rule().
+bool CrushWrapper::has_v5_rules() const
+{
+  unsigned ruleno = 0;
+  while (ruleno < crush->max_rules && !is_v5_rule(ruleno))
+    ++ruleno;
+  // stopping before max_rules means a v5 rule was found
+  return ruleno < crush->max_rules;
+}
+
+// v5 rules are those containing a SET_CHOOSELEAF_STABLE step.
+bool CrushWrapper::is_v5_rule(unsigned ruleid) const
+{
+  const crush_rule *rule =
+    ruleid < crush->max_rules ? crush->rules[ruleid] : nullptr;
+  if (!rule) {
+    // out-of-range id or empty slot
+    return false;
+  }
+  unsigned s = 0;
+  while (s < rule->len && rule->steps[s].op != CRUSH_RULE_SET_CHOOSELEAF_STABLE)
+    ++s;
+  // s reaches len only when no step matched
+  return s < rule->len;
+}
+
+// True when at least one choose_args map is present.
+bool CrushWrapper::has_choose_args() const {
+  return choose_args.size() != 0;
+}
+
+// Returns true when choose_args holds more than one map, when its only map
+// is keyed by something other than DEFAULT_CHOOSE_ARGS, or when any entry
+// of that map has ids or more than one weight_set position.  Returns false
+// when choose_args is empty or none of those conditions hold.
+bool CrushWrapper::has_incompat_choose_args() const
+{
+  if (choose_args.empty())
+    return false;
+  const auto first = choose_args.begin();
+  if (choose_args.size() > 1 || first->first != DEFAULT_CHOOSE_ARGS)
+    return true;
+  const crush_choose_arg_map &args_map = first->second;
+  for (__u32 n = 0; n < args_map.size; ++n) {
+    const crush_choose_arg &entry = args_map.args[n];
+    const bool no_or_one_position =
+      entry.weight_set_positions == 0 || entry.weight_set_positions == 1;
+    if (entry.ids_size != 0 || !no_or_one_position)
+      return true;
+  }
+  return false;
+}
+
+// Resolve item i into (base item id, class id) by splitting its name at '~';
+// a name without '~' gives (i, -1).  Returns 0, -EINVAL or -ENOENT.
+int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
+{
+  if (!item_exists(i))
+    return -EINVAL;
+  const string full_name = get_item_name(i);
+  const size_t tilde = full_name.find('~');
+  if (tilde != string::npos) {
+    const string base = full_name.substr(0, tilde);
+    const string cls = full_name.substr(tilde + 1);
+    if (!name_exists(base) || !class_exists(cls))
+      return -ENOENT;
+    *idout = get_item_id(base);
+    *classout = get_class_id(cls);
+  } else {
+    *idout = i;
+    *classout = -1;
+  }
+  return 0;
+}
+
+// Check (without modifying anything) whether srcname can be renamed to
+// dstname.  Returns 0 if so; otherwise -EEXIST, -EINVAL, -EALREADY or
+// -ENOENT, with a message on *ss unless ss is null (as in rename_rule()).
+int CrushWrapper::can_rename_item(const string& srcname,
+                                  const string& dstname,
+                                  ostream *ss) const
+{
+  ostringstream msg;
+  int ret = 0;
+  const bool have_dst = name_exists(dstname);
+  if (!name_exists(srcname)) {
+    msg << "srcname = '" << srcname << "' does not exist";
+    if (have_dst)
+      msg << " and dstname = '" << dstname << "' already exists";
+    ret = have_dst ? -EALREADY : -ENOENT;
+  } else if (have_dst) {
+    msg << "dstname = '" << dstname << "' already exists";
+    ret = -EEXIST;
+  } else if (!is_valid_crush_name(dstname)) {
+    msg << "dstname = '" << dstname << "' does not match [-_.0-9a-zA-Z]+";
+    ret = -EINVAL;
+  }
+  if (ss && ret < 0)
+    *ss << msg.str();
+  return ret;
+}
+
+// Rename item srcname to dstname once can_rename_item() approves.
+int CrushWrapper::rename_item(const string& srcname,
+                              const string& dstname,
+                              ostream *ss)
+{
+  const int check = can_rename_item(srcname, dstname, ss);
+  if (check >= 0)
+    return set_item_name(get_item_id(srcname), dstname);
+  return check;
+}
+
+// can_rename_item() plus a check that srcname is a bucket (-ENOTDIR if not).
+int CrushWrapper::can_rename_bucket(const string& srcname,
+                                    const string& dstname,
+                                    ostream *ss) const
+{
+  const int check = can_rename_item(srcname, dstname, ss);
+  if (check != 0)
+    return check;
+  const int src_id = get_item_id(srcname);
+  if (src_id < 0)
+    return 0;
+  *ss << "srcname = '" << srcname << "' is not a bucket "
+      << "because its id = " << src_id << " is >= 0";
+  return -ENOTDIR;
+}
+
+// Rename bucket srcname to dstname once can_rename_bucket() approves.
+int CrushWrapper::rename_bucket(const string& srcname,
+                                const string& dstname,
+                                ostream *ss)
+{
+  const int check = can_rename_bucket(srcname, dstname, ss);
+  if (check >= 0)
+    return set_item_name(get_item_id(srcname), dstname);
+  return check;
+}
+
+// Rename rule srcname to dstname, updating rule_name_rmap when have_rmaps is
+// set.  Returns 0, -ENOENT (no such source) or -EEXIST (destination taken).
+int CrushWrapper::rename_rule(const string& srcname,
+                              const string& dstname,
+                              ostream *ss)
+{
+  if (!rule_exists(srcname)) {
+    if (ss)
+      *ss << "source rule name '" << srcname << "' does not exist";
+    return -ENOENT;
+  }
+  if (rule_exists(dstname)) {
+    if (ss)
+      *ss << "destination rule name '" << dstname << "' already exists";
+    return -EEXIST;
+  }
+  const int id = get_rule_id(srcname);
+  auto entry = rule_name_map.find(id);
+  ceph_assert(entry != rule_name_map.end());
+  entry->second = dstname;
+  if (!have_rmaps)
+    return 0;
+  rule_name_rmap.erase(srcname);
+  rule_name_rmap[dstname] = id;
+  return 0;
+}
+
+// Collect the arg1 of every TAKE step, across all rules, into *roots.
+void CrushWrapper::find_takes(set<int> *roots) const
+{
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    const crush_rule *rule = crush->rules[ruleno];
+    const unsigned nsteps = rule ? rule->len : 0;
+    for (unsigned s = 0; s < nsteps; ++s) {
+      if (rule->steps[s].op == CRUSH_RULE_TAKE)
+	roots->insert(rule->steps[s].arg1);
+    }
+  }
+}
+
+// Collect the arg1 of every TAKE step of one rule; unknown rules add nothing.
+void CrushWrapper::find_takes_by_rule(int rule, set<int> *roots) const
+{
+  const bool in_range = rule >= 0 && rule < (int)crush->max_rules;
+  const crush_rule *r = in_range ? crush->rules[rule] : nullptr;
+  if (!r)
+    return;
+  for (unsigned s = 0; s < r->len; ++s) {
+    if (r->steps[s].op == CRUSH_RULE_TAKE)
+      roots->insert(r->steps[s].arg1);
+  }
+}
+
+// Add to *roots every bucket that no bucket lists as one of its items.
+void CrushWrapper::find_roots(set<int> *roots) const
+{
+  for (int pos = 0; pos < crush->max_buckets; ++pos) {
+    const crush_bucket *bucket = crush->buckets[pos];
+    const bool is_root = bucket && !_search_item_exists(bucket->id);
+    if (is_root)
+      roots->insert(bucket->id);
+  }
+}
+
+// Depth-first test for whether item is root itself or lies anywhere in the
+// subtree below root.  Buckets that get_bucket() cannot find contain nothing.
+bool CrushWrapper::subtree_contains(int root, int item) const
+{
+  if (root == item)
+    return true;
+  if (root >= 0)
+    return false;	// root is a leaf
+
+  const crush_bucket *bucket = get_bucket(root);
+  if (IS_ERR(bucket))
+    return false;
+
+  bool found = false;
+  for (unsigned n = 0; !found && n < bucket->size; ++n)
+    found = subtree_contains(bucket->items[n], item);
+  return found;
+}
+
+// If no bucket lists item any more (and, for a bucket, it is not in use),
+// drop the bookkeeping that remains for it and rebuild the class roots.
+// Returns false, changing nothing, when item is still referenced.
+bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool unlink_only)
+{
+  const bool is_bucket = item < 0;
+  if (_search_item_exists(item) || (is_bucket && _bucket_is_in_use(item)))
+    return false;
+
+  if (is_bucket && !unlink_only) {
+    crush_bucket *doomed = get_bucket(item);
+    ldout(cct, 5) << "_maybe_remove_last_instance removing bucket " << item << dendl;
+    crush_remove_bucket(crush, doomed);
+    if (class_bucket.count(item) != 0)
+      class_bucket.erase(item);
+    class_remove_item(item);
+    update_choose_args(cct);
+  }
+  // device names always go; bucket names only when not merely unlinking
+  const bool drop_name = !is_bucket || !unlink_only;
+  if (drop_name && name_map.count(item)) {
+    ldout(cct, 5) << "_maybe_remove_last_instance removing name for item " << item << dendl;
+    name_map.erase(item);
+    have_rmaps = false;
+    if (!is_bucket && !unlink_only)
+      class_remove_item(item);
+  }
+  rebuild_roots_with_classes(cct);
+  return true;
+}
+
+// Recursively destroy the bucket tree rooted at item (child buckets first),
+// together with its name, class_bucket entry and class membership.  Device
+// children are skipped.  Returns 0, or the error of a failed recursive call.
+int CrushWrapper::remove_root(CephContext *cct, int item)
+{
+  crush_bucket *root = get_bucket(item);
+  if (IS_ERR(root)) {
+    // Must be idempotent: 'crush link' can place one host under several
+    // roots, so different shadow trees may reference the same buckets
+    // (hosts, racks, etc.).  While all shadow trees are being rebuilt we
+    // can therefore be asked to destroy a bucket that is already gone.
+    return 0;
+  }
+
+  for (unsigned n = 0; n < root->size; ++n) {
+    const int child = root->items[n];
+    if (child < 0) {
+      const int err = remove_root(cct, child);
+      if (err < 0)
+	return err;
+    }
+  }
+
+  crush_remove_bucket(crush, root);
+  if (name_map.erase(item) > 0)
+    have_rmaps = false;
+  if (class_bucket.count(item) != 0)
+    class_bucket.erase(item);
+  class_remove_item(item);
+  update_choose_args(cct);
+  return 0;
+}
+
+void CrushWrapper::update_choose_args(CephContext *cct)
+{ // Reconcile every choose_args map with the current buckets; a null cct only disables logging.
+  for (auto& i : choose_args) {
+    crush_choose_arg_map &arg_map = i.second;
+    assert(arg_map.size == (unsigned)crush->max_buckets);
+    unsigned positions = get_choose_args_positions(arg_map);
+    for (int j = 0; j < crush->max_buckets; ++j) {
+      crush_bucket *b = crush->buckets[j];
+      assert(j < (int)arg_map.size);
+      auto& carg = arg_map.args[j];
+      // strip out choose_args for buckets that no longer exist or are not straw2
+      if (!b || b->alg != CRUSH_BUCKET_STRAW2) {
+	if (carg.ids) {
+	  if (cct)
+	    ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
+			  << (-1-j) << " ids" << dendl;
+	  free(carg.ids);
+	  carg.ids = 0;
+	  carg.ids_size = 0;
+	}
+	if (carg.weight_set) {
+	  if (cct)
+	    ldout(cct,10) << __func__ << " removing " << i.first << " bucket "
+			  << (-1-j) << " weight_sets" << dendl;
+	  for (unsigned p = 0; p < carg.weight_set_positions; ++p) {
+	    free(carg.weight_set[p].weights);
+	  }
+	  free(carg.weight_set);
+	  carg.weight_set = 0;
+	  carg.weight_set_positions = 0;
+	}
+	continue;
+      }
+      if (carg.weight_set_positions == 0) {
+	continue;	// no weight_set for this bucket; nothing to check
+      }
+      if (carg.weight_set_positions != positions) {
+	if (cct)
+	  lderr(cct) << __func__ << " " << i.first << " bucket "
+		     << (-1-j) << " positions " << carg.weight_set_positions
+		     << " -> " << positions << dendl;
+	continue;	// position count disagrees with the map's; log and leave it untouched
+      }
+      // mis-sized weight_sets?  this shouldn't ever happen; resize to b->size if it does.
+      for (unsigned p = 0; p < positions; ++p) {
+	if (carg.weight_set[p].size != b->size) {
+	  if (cct)
+	    lderr(cct) << __func__ << " fixing " << i.first << " bucket "
+		       << (-1-j) << " position " << p
+		       << " size " << carg.weight_set[p].size << " -> "
+		       << b->size << dendl;
+	  auto old_ws = carg.weight_set[p];	// struct copy: keeps the old size and weights pointer
+	  carg.weight_set[p].size = b->size;
+	  carg.weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));	// zero-filled; result is not checked
+	  auto max = std::min<unsigned>(old_ws.size, b->size);
+	  for (unsigned k = 0; k < max; ++k) {
+	    carg.weight_set[p].weights[k] = old_ws.weights[k];
+	  }
+	  free(old_ws.weights);
+	}
+      }
+    }
+  }
+}
+
+// Remove item from every bucket that links it, then let
+// _maybe_remove_last_instance() clean up.  Unless unlink_only is set, a
+// bucket must exist (-ENOENT), be empty (-ENOTEMPTY) and be unused (-EBUSY).
+// Returns 0 if anything was removed, else -ENOENT.
+int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
+{
+  ldout(cct, 5) << "remove_item " << item
+		<< (unlink_only ? " unlink_only":"") << dendl;
+
+  int ret = -ENOENT;
+
+  if (item < 0 && !unlink_only) {
+    crush_bucket *t = get_bucket(item);
+    if (IS_ERR(t)) {
+      ldout(cct, 1) << "remove_item bucket " << item << " does not exist"
+		    << dendl;
+      return -ENOENT;
+    }
+    if (t->size) {
+      ldout(cct, 1) << "remove_item bucket " << item << " has " << t->size
+		    << " items, not empty" << dendl;
+      return -ENOTEMPTY;
+    }
+    if (_bucket_is_in_use(item))
+      return -EBUSY;
+  }
+
+  for (int i = 0; i < crush->max_buckets; i++) {
+    crush_bucket *b = crush->buckets[i];
+    if (!b)
+      continue;
+    // j, not i: keep the item index distinct from the bucket index above
+    for (unsigned j = 0; j < b->size; ++j) {
+      if (b->items[j] != item)
+	continue;
+      ldout(cct, 5) << "remove_item removing item " << item
+		    << " from bucket " << b->id << dendl;
+      adjust_item_weight_in_bucket(cct, item, 0, b->id, true);
+      bucket_remove_item(b, item);
+      ret = 0;
+    }
+  }
+
+  if (_maybe_remove_last_instance(cct, item, unlink_only))
+    ret = 0;
+  return ret;
+}
+
+// True if any bucket lists item among its direct children.
+bool CrushWrapper::_search_item_exists(int item) const
+{
+  for (int pos = 0; pos < crush->max_buckets; ++pos) {
+    const crush_bucket *bucket = crush->buckets[pos];
+    const unsigned count = bucket ? bucket->size : 0;
+    for (unsigned n = 0; n < count; ++n) {
+      if (bucket->items[n] == item)
+	return true;
+    }
+  }
+  return false;
+}
+
+// True if item is an id recorded in class_bucket or the target of a TAKE step.
+bool CrushWrapper::_bucket_is_in_use(int item)
+{
+  for (auto &by_class : class_bucket)
+    for (auto &ids : by_class.second)
+      if (ids.second == item)
+	return true;
+  for (unsigned i = 0; i < crush->max_rules; ++i) {
+    crush_rule *r = crush->rules[i];
+    if (!r)
+      continue;
+    for (unsigned j = 0; j < r->len; ++j) {
+      if (r->steps[j].op != CRUSH_RULE_TAKE)
+	continue;
+      int step_item = r->steps[j].arg1;
+      if (step_item == item)
+	return true;
+      // an unresolvable target must not end the scan: later steps may match
+      int original_item, c;
+      if (split_id_class(step_item, &original_item, &c) >= 0 && original_item == item)
+	return true;
+    }
+  }
+  return false;
+}
+
+// Unlink item from ancestor, and call remove_item_under() for each bucket
+// listed in ancestor.  Returns 0 if at least one link was removed, -ENOENT
+// if none was found, -EINVAL if ancestor is not an existing bucket.
+int CrushWrapper::_remove_item_under(
+  CephContext *cct, int item, int ancestor, bool unlink_only)
+{
+  ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
+		<< (unlink_only ? " unlink_only":"") << dendl;
+
+  if (ancestor >= 0 || !bucket_exists(ancestor))
+    return -EINVAL;
+
+  bool unlinked = false;
+  crush_bucket *parent = get_bucket(ancestor);
+
+  // parent->size is deliberately re-read on every pass, as the body may
+  // modify the bucket through bucket_remove_item()
+  for (unsigned pos = 0; pos < parent->size; ++pos) {
+    const int child = parent->items[pos];
+    if (child == item) {
+      ldout(cct, 5) << "_remove_item_under removing item " << item
+		    << " from bucket " << parent->id << dendl;
+      adjust_item_weight_in_bucket(cct, item, 0, parent->id, true);
+      bucket_remove_item(parent, item);
+      unlinked = true;
+      continue;
+    }
+    if (child < 0 && remove_item_under(cct, item, child, unlink_only) == 0)
+      unlinked = true;
+  }
+  return unlinked ? 0 : -ENOENT;
+}
+
+// Unlink item beneath ancestor, then drop it if that was its last instance.
+// Errors: -EBUSY, those of _remove_item_under(), -ENOENT, -ENOTEMPTY.
+int CrushWrapper::remove_item_under(
+  CephContext *cct, int item, int ancestor, bool unlink_only)
+{
+  ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
+		<< (unlink_only ? " unlink_only":"") << dendl;
+
+  const bool deleting = !unlink_only;
+  if (deleting && _bucket_is_in_use(item))
+    return -EBUSY;
+
+  int ret = _remove_item_under(cct, item, ancestor, unlink_only);
+  if (ret < 0)
+    return ret;
+
+  if (deleting && item < 0) {
+    crush_bucket *bucket = get_bucket(item);
+    if (IS_ERR(bucket)) {
+      ldout(cct, 1) << "remove_item_under bucket " << item
+                    << " does not exist" << dendl;
+      return -ENOENT;
+    }
+    if (bucket->size) {
+      ldout(cct, 1) << "remove_item_under bucket " << item << " has " << bucket->size
+		    << " items, not empty" << dendl;
+      return -ENOTEMPTY;
+    }
+  }
+
+  if (_maybe_remove_last_instance(cct, item, unlink_only))
+    ret = 0;
+  return ret;
+}
+
+// Walk type_map from the lowest type id upward and return the id of the
+// first type at which id's location matches an entry of loc for that type.
+// Returns -ENOENT if id does not exist, -ERANGE if nothing matches.
+int CrushWrapper::get_common_ancestor_distance(CephContext *cct, int id,
+			       const std::multimap<string,string>& loc) const
+{
+  ldout(cct, 5) << __func__ << " " << id << " " << loc << dendl;
+  if (!item_exists(id))
+    return -ENOENT;
+  map<string,string> id_loc = get_full_location(id);
+  ldout(cct, 20) << " id is at " << id_loc << dendl;
+
+  for (const auto &type : type_map) {
+    const string &type_name = type.second;
+    auto mine = id_loc.find(type_name);
+    if (mine == id_loc.end())
+      continue;
+    auto theirs = loc.find(type_name);
+    while (theirs != loc.end() && theirs->first == type_name) {
+      if (theirs->second == mine->second)
+	return type.first;
+      ++theirs;
+    }
+  }
+  return -ERANGE;
+}
+
+// Parse "key=value" strings into *ploc (cleared first).  Returns -EINVAL at
+// the first argument with no '=' or an empty value; a repeated key overwrites.
+int CrushWrapper::parse_loc_map(const std::vector<string>& args,
+				std::map<string,string> *ploc)
+{
+  ploc->clear();
+  for (const string &arg : args) {
+    const char *begin = arg.c_str();
+    const char *eq = strchr(begin, '=');
+    if (!eq)
+      return -EINVAL;
+    const string value(eq + 1);
+    if (value.empty())
+      return -EINVAL;
+    (*ploc)[string(begin, eq)] = value;
+  }
+  return 0;
+}
+
+// Parse "key=value" strings into *ploc (cleared first), keeping repeated
+// keys.  Returns -EINVAL at the first argument with no '=' or an empty value.
+int CrushWrapper::parse_loc_multimap(const std::vector<string>& args,
+					    std::multimap<string,string> *ploc)
+{
+  ploc->clear();
+  for (const string &arg : args) {
+    const char *begin = arg.c_str();
+    const char *eq = strchr(begin, '=');
+    if (!eq)
+      return -EINVAL;
+    const string value(eq + 1);
+    if (value.empty())
+      return -EINVAL;
+    ploc->insert(make_pair(string(begin, eq), value));
+  }
+  return 0;
+}
+
+// Check whether item is a direct child of the bucket that loc names for the
+// lowest non-device type it specifies; *weight (if set) gets its weight there.
+bool CrushWrapper::check_item_loc(CephContext *cct, int item, const map<string,string>& loc,
+				  int *weight)
+{
+  ldout(cct, 5) << "check_item_loc item " << item << " loc " << loc << dendl;
+
+  for (const auto &type : type_map) {
+    if (type.first == 0)
+      continue;	// ignore device
+
+    const auto where = loc.find(type.second);
+    if (where == loc.end()) {
+      ldout(cct, 2) << "warning: did not specify location for '" << type.second << "' level (levels are "
+		    << type_map << ")" << dendl;
+      continue;
+    }
+    const string &bucket_name = where->second;
+    if (!name_exists(bucket_name)) {
+      ldout(cct, 5) << "check_item_loc bucket " << bucket_name << " dne" << dendl;
+      return false;
+    }
+
+    const int id = get_item_id(bucket_name);
+    if (id >= 0) {
+      ldout(cct, 5) << "check_item_loc requested " << bucket_name << " for type " << type.second
+		    << " is a device, not bucket" << dendl;
+      return false;
+    }
+
+    ceph_assert(bucket_exists(id));
+    crush_bucket *b = get_bucket(id);
+
+    // the first specified level decides the outcome either way
+    for (unsigned j = 0; j < b->size; ++j) {
+      if (b->items[j] != item)
+	continue;
+      ldout(cct, 2) << "check_item_loc " << item << " exists in bucket " << b->id << dendl;
+      if (weight)
+	*weight = crush_get_bucket_item_weight(b, j);
+      return true;
+    }
+    return false;
+  }
+
+  ldout(cct, 2) << __func__ << " item " << item << " loc " << loc << dendl;
+  return false;
+}
+
+// Return the pairs produced by get_full_location_ordered(id) as a map.  If
+// a key occurs more than once along the path, the first one encountered is
+// kept (insertion never overwrites an existing key).
+map<string, string> CrushWrapper::get_full_location(int id) const
+{
+  vector<pair<string, string> > path;
+  get_full_location_ordered(id, path);
+
+  map<string,string> by_type;
+  for (const auto &coord : path)
+    by_type.insert(coord);
+  return by_type;
+}
+
+// Look up name and store its full location in *ploc; -ENOENT if unknown.
+int CrushWrapper::get_full_location(const string& name,
+				    map<string,string> *ploc)
+{
+  build_rmaps();	// name_rmap is only consulted after this call
+  const auto hit = name_rmap.find(name);
+  if (hit == name_rmap.end())
+    return -ENOENT;
+  *ploc = get_full_location(hit->second);
+  return 0;
+}
+
+// Append each ancestor of id to path, nearest parent first, until
+// get_immediate_parent() reports failure.  Returns -ENOENT if id is unknown.
+int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string> >& path) const
+{
+  if (!item_exists(id))
+    return -ENOENT;
+  int rc;
+  for (int cur = id; ; cur = get_item_id(path.back().second)) {
+    pair<string, string> parent = get_immediate_parent(cur, &rc);
+    if (rc != 0)
+      break;
+    path.push_back(parent);
+  }
+  return 0;
+}
+
+// Render id's ancestry as comma-separated "first=second" pairs, last first.
+string CrushWrapper::get_full_location_ordered_string(int id) const
+{
+  vector<pair<string, string> > path;
+  get_full_location_ordered(id, path);
+  string out;
+  // path is nearest-parent-first, so walk it backwards
+  for (auto coord = path.rbegin(); coord != path.rend(); ++coord) {
+    if (!out.empty())
+      out += ",";
+    out += coord->first + "=" + coord->second;
+  }
+  return out;
+}
+
+// Map each type id above id's own type (up to the largest id in type_map)
+// to the .first member of the pair get_immediate_parent() returns for the
+// successive ancestors of id.
+map<int, string> CrushWrapper::get_parent_hierarchy(int id) const
+{
+  map<int,string> hierarchy;
+  pair<string, string> coord = get_immediate_parent(id);
+
+  // Start counting from id's own type.  A negative type is assumed to
+  // mean id is an OSD, which is treated as type 0.
+  // FIXME: change behavior in get_item_type
+  int level = get_bucket_type(id);
+  if (level < 0)
+    level = 0;
+
+  // the largest type id in type_map bounds the walk
+  int top_level = 0;
+  if (!type_map.empty())
+    top_level = type_map.rbegin()->first;
+
+  // looked up unconditionally, even if the loop below never runs
+  int ancestor = get_item_id(coord.second);
+  for (++level; level <= top_level; ++level) {
+    hierarchy[level] = coord.first;
+    if (level == top_level)
+      break;	// nothing above the top level to look up
+
+    // move on to the next ancestor's coordinates
+    coord = get_immediate_parent(ancestor);
+    ancestor = get_item_id(coord.second);
+  }
+
+  return hierarchy;
+}
+
+// Append the direct children of bucket id to *children and return how many
+// there are.  Leaves (id >= 0) have none and return 0; a bucket that
+// get_bucket() cannot find gives -ENOENT.
+int CrushWrapper::get_children(int id, list<int> *children) const
+{
+  if (id >= 0)
+    return 0;
+
+  auto *bucket = get_bucket(id);
+  if (IS_ERR(bucket))
+    return -ENOENT;
+
+  // copy the item array, in order, onto the end of the list
+  children->insert(children->end(),
+		   bucket->items, bucket->items + bucket->size);
+  return bucket->size;
+}
+
+// Add every descendant of bucket id to *children.  Returns the number of
+// descendants visited (counted per link, so not necessarily the number of
+// distinct ids added), 0 for a leaf, or -ENOENT for an unknown bucket.
+int CrushWrapper::get_all_children(int id, set<int> *children) const
+{
+  if (id >= 0)
+    return 0;		// leaf
+
+  auto *bucket = get_bucket(id);
+  if (IS_ERR(bucket))
+    return -ENOENT;
+
+  int visited = 0;
+  for (unsigned n = 0; n < bucket->size; ++n) {
+    const int child = bucket->items[n];
+    children->insert(child);
+    const int below = get_all_children(child, children);
+    if (below < 0)
+      return below;
+    visited += 1 + below;
+  }
+  return visited;
+}
+
+// Collect into *children every item of the given type found at or below id.
+// Descent stops at the first bucket of the requested type, and at buckets
+// whose type is lower than requested.  With exclude_shadow, buckets for
+// which is_shadow_item() is true are not reported.
+void CrushWrapper::get_children_of_type(int id,
+                                        int type,
+					vector<int> *children,
+					bool exclude_shadow) const
+{
+  if (id >= 0) {
+    // a device only counts when leaves (type 0) were asked for
+    if (type == 0)
+      children->push_back(id);
+    return;
+  }
+
+  auto bucket = get_bucket(id);
+  if (IS_ERR(bucket) || bucket->type < type)
+    return;	// unknown bucket, or already below the requested level
+  if (bucket->type == type) {
+    const bool shadow = is_shadow_item(bucket->id);
+    if (!(shadow && exclude_shadow))
+      children->push_back(bucket->id);
+    return;
+  }
+
+  for (unsigned n = 0; n < bucket->size; ++n)
+    get_children_of_type(bucket->items[n], type, children, exclude_shadow);
+}
+
+// Check a proposed up set against rule rule_id.  For every choose /
+// chooseleaf step on a non-device type, the osds in up must not violate that
+// step's bucket limits; at each emit, the osds consumed must lie under the
+// bucket of the preceding take.  Returns 0, -ENOENT (no rule) or -EINVAL.
+int CrushWrapper::verify_upmap(CephContext *cct,
+                               int rule_id,
+                               int pool_size,
+                               const vector<int>& up)
+{
+  auto rule = get_rule(rule_id);
+  if (IS_ERR(rule) || !rule) {
+    lderr(cct) << __func__ << " rule " << rule_id << " does not exist"
+               << dendl;
+    return -ENOENT;
+  }
+  int root_bucket = 0;
+  int cursor = 0;
+  std::map<int, int> type_stack;
+  for (unsigned step = 0; step < rule->len; ++step) {
+    auto curstep = &rule->steps[step];
+    ldout(cct, 10) << __func__ << " step " << step << dendl;
+    switch (curstep->op) {
+    case CRUSH_RULE_TAKE:
+      root_bucket = curstep->arg1;
+      break;
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      {
+        int numrep = curstep->arg1;
+        int type = curstep->arg2;
+        if (numrep <= 0)
+          numrep += pool_size;
+        type_stack.emplace(type, numrep);
+        if (type == 0) // osd
+          break;
+        map<int, set<int>> osds_by_parent; // parent_of_desired_type -> osds
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            osds_by_parent[parent].insert(osd);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        for (const auto &entry : osds_by_parent) { // by reference: no set copy
+          if (entry.second.size() > 1) {
+            lderr(cct) << __func__ << " multiple osds " << entry.second
+                       << " come from same failure domain " << entry.first
+                       << dendl;
+            return -EINVAL;
+          }
+        }
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      {
+        int numrep = curstep->arg1;
+        int type = curstep->arg2;
+        if (numrep <= 0)
+          numrep += pool_size;
+        type_stack.emplace(type, numrep);
+        if (type == 0) // osd
+          break;
+        set<int> parents_of_type;
+        for (auto osd : up) {
+          auto parent = get_parent_of_type(osd, type, rule_id);
+          if (parent < 0) {
+            parents_of_type.insert(parent);
+          } else {
+            ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd
+                          << ", skipping for now"
+                          << dendl;
+          }
+        }
+        if ((int)parents_of_type.size() > numrep) {
+          lderr(cct) << __func__ << " number of buckets "
+                     << parents_of_type.size() << " exceeds desired " << numrep
+                     << dendl;
+          return -EINVAL;
+        }
+      }
+      break;
+
+    case CRUSH_RULE_EMIT:
+      if (root_bucket < 0) {
+        int num_osds = 1;
+        for (auto &item : type_stack) {
+          num_osds *= item.second;
+        }
+        // validate the osd's in subtree
+        for (int c = 0; cursor < (int)up.size() && c < num_osds; ++cursor, ++c) {
+          int osd = up[cursor];
+          if (!subtree_contains(root_bucket, osd)) {
+            lderr(cct) << __func__ << " osd " << osd << " not in bucket " << root_bucket << dendl;
+            return -EINVAL;
+          }
+        }
+      }
+      type_stack.clear();
+      root_bucket = 0;
+      break;
+    default:
+      // ignore
+      break;
+    }
+  }
+  return 0;
+}
+
+// Append every device beneath id to *leaves (id itself if it is a device).
+// Returns 0, or -ENOENT if a bucket on the way cannot be found.
+int CrushWrapper::_get_leaves(int id, list<int> *leaves) const
+{
+  ceph_assert(leaves);
+
+  if (id >= 0) {
+    // id is already a leaf
+    leaves->push_back(id);
+    return 0;
+  }
+
+  auto bucket = get_bucket(id);
+  if (IS_ERR(bucket))
+    return -ENOENT;
+
+  for (unsigned n = 0; n < bucket->size; ++n) {
+    const int child = bucket->items[n];
+    if (child >= 0) {
+      leaves->push_back(child);
+      continue;
+    }
+    // nested bucket: descend
+    const int err = _get_leaves(child, leaves);
+    if (err < 0)
+      return err;
+  }
+  return 0;
+}
+
+// Replace *leaves with the set of devices found beneath the item called
+// name (or with the item itself when its id is >= 0).  Returns 0 on
+// success, -ENOENT if name is unknown, or the error from _get_leaves(); in
+// the error cases *leaves is left empty.
+int CrushWrapper::get_leaves(const string &name, set<int> *leaves) const
+{
+  ceph_assert(leaves);
+  leaves->clear();
+
+  if (!name_exists(name))
+    return -ENOENT;
+
+  const int id = get_item_id(name);
+  if (id >= 0) {
+    // already a leaf: it is its own only member
+    leaves->insert(id);
+    return 0;
+  }
+
+  // gather in traversal order first, then fold into the set
+  list<int> found;
+  const int err = _get_leaves(id, &found);
+  if (err < 0)
+    return err;
+
+  leaves->insert(found.begin(), found.end());
+  return 0;
+}
+
+// Insert item (given name and float weight) at loc (type name -> bucket
+// name); see the declaration in CrushWrapper.h for more detail.  Walking
+// types from the lowest id up, each named bucket that does not exist is
+// created around the current item; at the first existing bucket the current
+// item is added with weight 0 and the walk stops.  Returns 0 or -errno.
+int CrushWrapper::insert_item(
+  CephContext *cct, int item, float weight, string name,
+  const map<string,string>& loc,  // typename -> bucketname
+  bool init_weight_sets)
+{
+  ldout(cct, 5) << "insert_item item " << item << " weight " << weight
+		<< " name " << name << " loc " << loc << dendl;
+
+  if (!is_valid_crush_name(name))
+    return -EINVAL;
+  if (!is_valid_crush_loc(cct, loc))
+    return -EINVAL;
+
+  int r = validate_weightf(weight);
+  if (r < 0)
+    return r;
+
+  if (name_exists(name)) {
+    if (get_item_id(name) != item) {
+      ldout(cct, 10) << "device name '" << name << "' already exists as id "
+		     << get_item_id(name) << dendl;
+      return -EEXIST;
+    }
+  } else {
+    set_item_name(item, name);
+  }
+
+  int cur = item;
+
+  for (auto p = type_map.begin(); p != type_map.end(); ++p) {
+    // ignore device type
+    if (p->first == 0)
+      continue;
+
+    // skip types that are unspecified
+    map<string,string>::const_iterator q = loc.find(p->second);
+    if (q == loc.end()) {
+      ldout(cct, 2) << "warning: did not specify location for '"
+		    << p->second << "' level (levels are "
+		    << type_map << ")" << dendl;
+      continue;
+    }
+
+    if (!name_exists(q->second)) {
+      ldout(cct, 5) << "insert_item creating bucket " << q->second << dendl;
+      int zero_weight = 0, new_bucket_id;
+      int err = add_bucket(0, 0,
+			   CRUSH_HASH_DEFAULT, p->first, 1, &cur, &zero_weight, &new_bucket_id);
+      if (err < 0) {
+        ldout(cct, 1) << "add_bucket failure error: " << cpp_strerror(err)
+		      << dendl;
+        return err;
+      }
+      set_item_name(new_bucket_id, q->second);
+      cur = new_bucket_id;
+      continue;
+    }
+
+    // add to an existing bucket
+    int id = get_item_id(q->second);
+    if (!bucket_exists(id)) {
+      ldout(cct, 1) << "insert_item doesn't have bucket " << id << dendl;
+      return -EINVAL;
+    }
+
+    // check that we aren't creating a cycle.
+    if (subtree_contains(id, cur)) {
+      ldout(cct, 1) << "insert_item item " << cur << " already exists beneath "
+		    << id << dendl;
+      return -EINVAL;
+    }
+
+    // we have done sanity check above
+    crush_bucket *b = get_bucket(id);
+
+    if (p->first != b->type) {
+      // use find(): operator[] would insert an entry for an unknown type
+      auto bt = type_map.find(b->type);
+      ldout(cct, 1) << "insert_item existing bucket has type "
+	<< "'" << (bt != type_map.end() ? bt->second : string()) << "' != "
+	<< "'" << p->second << "'" << dendl;
+      return -EINVAL;
+    }
+
+    // are we forming a loop?
+    if (subtree_contains(cur, b->id)) {
+      ldout(cct, 1) << "insert_item " << cur << " already contains " << b->id
+		    << "; cannot form loop" << dendl;
+      return -ELOOP;
+    }
+
+    ldout(cct, 5) << "insert_item adding " << cur << " weight " << weight
+		  << " to bucket " << id << dendl;
+    [[maybe_unused]] int added = bucket_add_item(b, cur, 0);
+    ceph_assert(!added);
+    break;
+  }
+
+  // adjust the item's weight in location
+  if (adjust_item_weightf_in_loc(cct, item, weight, loc,
+				 item >= 0 && init_weight_sets) > 0) {
+    if (item >= crush->max_devices) {
+      crush->max_devices = item + 1;
+      ldout(cct, 5) << "insert_item max_devices now " << crush->max_devices
+		    << dendl;
+    }
+    r = rebuild_roots_with_classes(cct);
+    if (r < 0) {
+      ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
+                    << cpp_strerror(r) << dendl;
+      return r;
+    }
+    return 0;
+  }
+
+  ldout(cct, 1) << "error: didn't find anywhere to add item " << item
+		<< " in " << loc << dendl;
+  return -EINVAL;
+}
+
+
+// Relocate an existing bucket to the place described by loc.
+//
+// The bucket is detached from its current parent and then re-inserted under
+// its current name, carrying the weight that detach_bucket() reported.
+//
+//   id   bucket id; non-negative ids are rejected with -EINVAL
+//   loc  destination, typename -> bucketname
+//
+// Returns -ENOENT if the item is unknown, otherwise the insert_item() result.
+int CrushWrapper::move_bucket(
+  CephContext *cct, int id, const map<string,string>& loc)
+{
+  // only buckets (negative ids) can be moved with this call
+  const bool is_bucket = id < 0;
+  if (!is_bucket) {
+    return -EINVAL;
+  }
+  if (!item_exists(id)) {
+    return -ENOENT;
+  }
+
+  // capture the name up front; it is needed for the re-insert
+  const string bucket_name = get_item_name(id);
+
+  // unhook the bucket from its parent and remember what it weighed
+  const int detached_weight = detach_bucket(cct, id);
+
+  // hook it back in at the requested location (16.16 fixed point -> float)
+  const float weightf = detached_weight / (float)0x10000;
+  return insert_item(cct, id, weightf, bucket_name, loc, false);
+}
+
+// Unlink a bucket from its immediate parent.
+//
+// The bucket's entry in the parent is zero-weighted and then removed. A
+// bucket without a parent (get_bucket() on the parent yields -ENOENT) is left
+// alone. Asserts that the bucket exists and that afterwards it is no longer
+// found at its former location.
+//
+// Returns the weight the bucket had on entry, -EINVAL when there is no crush
+// map or item is not a bucket id, or the error encoded in the parent lookup.
+int CrushWrapper::detach_bucket(CephContext *cct, int item)
+{
+  if (!crush || item >= 0) {
+    return (-EINVAL);
+  }
+
+  // the bucket we are asked to detach has to be there
+  ceph_assert(bucket_exists(item));
+
+  // remember its weight; this is what the caller gets back
+  crush_bucket *self = get_bucket(item);
+  const unsigned saved_weight = self->weight;
+
+  // (type name, bucket name) of the immediate parent
+  const pair<string, string> parent_loc = get_immediate_parent(item);
+  crush_bucket *parent = get_bucket(get_item_id(parent_loc.second));
+
+  if (IS_ERR(parent)) {
+    // "no such parent" is fine, anything else is reported to the caller
+    const int err = PTR_ERR(parent);
+    if (err != -ENOENT) {
+      return err;
+    }
+  } else {
+    // drop the weight first so ancestors are adjusted, then unlink
+    adjust_item_weight_in_bucket(cct, item, 0, parent->id, true);
+    bucket_remove_item(parent, item);
+  }
+
+  // sanity check: the item must be gone from where it used to be
+  map<string,string> former_loc;
+  former_loc[parent_loc.first] = parent_loc.second;
+  int leftover_weight = 0;
+  const bool still_there = check_item_loc(cct, item, former_loc,
+                                          &leftover_weight);
+  bool successful_detach = !still_there;
+  ceph_assert(successful_detach);
+  ceph_assert(leftover_weight == 0);
+
+  return saved_weight;
+}
+
+// Report whether p is an ancestor of child.
+//
+// Walks upward one parent at a time via get_immediate_parent_id() until
+// either p is met or an item without a parent is reached.
+bool CrushWrapper::is_parent_of(int child, int p) const
+{
+  int ancestor = 0;
+  for (int cur = child;
+       get_immediate_parent_id(cur, &ancestor) == 0;
+       cur = ancestor) {
+    if (ancestor == p) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Exchange the contents, weights and names of two buckets.
+//
+// Both ids must be negative, must exist, and neither bucket may be an
+// ancestor of the other; otherwise -EINVAL is returned. On success the
+// result of rebuild_roots_with_classes() is returned.
+int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
+{
+  const bool both_buckets = src < 0 && dst < 0;
+  if (!both_buckets) {
+    return -EINVAL;
+  }
+  if (!item_exists(src) || !item_exists(dst)) {
+    return -EINVAL;
+  }
+  crush_bucket *a = get_bucket(src);
+  crush_bucket *b = get_bucket(dst);
+  // refuse to swap a bucket with one of its own ancestors
+  if (is_parent_of(a->id, b->id) || is_parent_of(b->id, a->id)) {
+    return -EINVAL;
+  }
+
+  // exchange the weights the two buckets carry in the hierarchy
+  const unsigned weight_a = a->weight;
+  const unsigned weight_b = b->weight;
+  adjust_item_weight(cct, a->id, weight_b);
+  adjust_item_weight(cct, b->id, weight_a);
+
+  // exchange the children
+  const unsigned count_a = a->size;
+  const unsigned count_b = b->size;
+
+  // step 1: park everything from a in a side table, always taking slot 0
+  map<int,unsigned> parked;
+  for (unsigned n = 0; n < count_a; ++n) {
+    const int child = a->items[0];
+    parked[child] = crush_get_bucket_item_weight(a, 0);
+    bucket_remove_item(a, child);
+  }
+  ceph_assert(a->size == 0);
+  ceph_assert(b->size == count_b);
+
+  // step 2: move b's children straight into a
+  for (unsigned n = 0; n < count_b; ++n) {
+    const int child = b->items[0];
+    const int child_weight = crush_get_bucket_item_weight(b, 0);
+    bucket_remove_item(b, child);
+    bucket_add_item(a, child, child_weight);
+  }
+  ceph_assert(a->size == count_b);
+  ceph_assert(b->size == 0);
+
+  // step 3: the parked children of a go into b
+  for (const auto& entry : parked) {
+    bucket_add_item(b, entry.first, entry.second);
+  }
+  ceph_assert(a->size == count_b);
+  ceph_assert(b->size == count_a);
+
+  // finally exchange the names
+  swap_names(src, dst);
+  return rebuild_roots_with_classes(cct);
+}
+
+// Insert an existing bucket at an additional location.
+//
+// Unlike move_bucket() the bucket is not detached first; it is simply passed
+// to insert_item() with its current name and weight.
+//
+//   id   bucket id; non-negative ids are rejected with -EINVAL
+//   loc  location to link at, typename -> bucketname
+//
+// Returns -ENOENT if the item is unknown, otherwise the insert_item() result.
+int CrushWrapper::link_bucket(
+  CephContext *cct, int id, const map<string,string>& loc)
+{
+  // only buckets (negative ids) can be linked
+  if (id >= 0) {
+    return -EINVAL;
+  }
+  if (!item_exists(id)) {
+    return -ENOENT;
+  }
+
+  // name and current weight of the bucket being linked
+  const string bucket_name = get_item_name(id);
+  const unsigned current_weight = get_bucket(id)->weight;
+
+  // 16.16 fixed point -> float
+  const float weightf = current_weight / (float)0x10000;
+  return insert_item(cct, id, weightf, bucket_name, loc);
+}
+
+// Make sure item lives at loc, creating or moving it as needed.
+//
+// If the item is already at loc nothing is done. If it exists elsewhere its
+// current weight replaces the weight argument and it is removed before being
+// inserted at loc.
+//
+// Returns -EINVAL for an invalid name, 0 if nothing changed, 1 if the item
+// was inserted, or the non-zero result of insert_item().
+int CrushWrapper::create_or_move_item(
+  CephContext *cct, int item, float weight, string name,
+  const map<string,string>& loc,  // typename -> bucketname
+  bool init_weight_sets)
+{
+  if (!is_valid_crush_name(name)) {
+    return -EINVAL;
+  }
+
+  int old_iweight;  // filled in by check_item_loc(); not consulted here
+  if (check_item_loc(cct, item, loc, &old_iweight)) {
+    ldout(cct, 5) << "create_or_move_item " << item << " already at " << loc
+                  << dendl;
+    return 0;
+  }
+
+  if (_search_item_exists(item)) {
+    // the item exists somewhere else: keep its weight and pull it out
+    weight = get_item_weightf(item);
+    ldout(cct, 10) << "create_or_move_item " << item
+                   << " exists with weight " << weight << dendl;
+    remove_item(cct, item, true);
+  }
+
+  ldout(cct, 5) << "create_or_move_item adding " << item
+                << " weight " << weight
+                << " at " << loc << dendl;
+  const bool init_sets = item >= 0 && init_weight_sets;
+  const int r = insert_item(cct, item, weight, name, loc, init_sets);
+  // a clean insert is reported as "changed"
+  return r == 0 ? 1 : r;
+}
+
+// Bring item to the given weight, name and location.
+//
+// If the item is already at loc only its weight and/or name are adjusted;
+// otherwise any existing instance is removed and the item is inserted at loc.
+//
+// Returns a negative error for an invalid name, location or weight, or when
+// rebuilding the class roots or inserting fails; 1 if anything changed;
+// otherwise the (non-negative) result of validate_weightf().
+int CrushWrapper::update_item(
+  CephContext *cct, int item, float weight, string name,
+  const map<string,string>& loc)  // typename -> bucketname
+{
+  ldout(cct, 5) << "update_item item " << item << " weight " << weight
+                << " name " << name << " loc " << loc << dendl;
+
+  if (!is_valid_crush_name(name) || !is_valid_crush_loc(cct, loc)) {
+    return -EINVAL;
+  }
+
+  int ret = validate_weightf(weight);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // weights are compared in their quantized 16.16 fixed-point form
+  const int wanted_iweight = (int)(weight * (float)0x10000);
+  int current_iweight;
+  if (!check_item_loc(cct, item, loc, &current_iweight)) {
+    // not at loc: drop whatever instance exists and insert afresh
+    if (item_exists(item)) {
+      remove_item(cct, item, true);
+    }
+    ldout(cct, 5) << "update_item adding " << item << " weight " << weight
+                  << " at " << loc << dendl;
+    ret = insert_item(cct, item, weight, name, loc);
+    return ret == 0 ? 1 : ret;  // 1 == changed
+  }
+
+  ldout(cct, 5) << "update_item " << item << " already at " << loc << dendl;
+  if (current_iweight != wanted_iweight) {
+    ldout(cct, 5) << "update_item " << item << " adjusting weight "
+                  << ((float)current_iweight/(float)0x10000) << " -> " << weight
+                  << dendl;
+    adjust_item_weight_in_loc(cct, item, wanted_iweight, loc);
+    ret = rebuild_roots_with_classes(cct);
+    if (ret < 0) {
+      ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
+                    << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    ret = 1;
+  }
+  if (get_item_name(item) != name) {
+    ldout(cct, 5) << "update_item setting " << item << " name to " << name
+                  << dendl;
+    set_item_name(item, name);
+    ret = 1;
+  }
+  return ret;
+}
+
+// Look up the weight of an item anywhere in the map.
+//
+// Buckets are scanned in slot order. A bucket whose own id matches yields
+// the bucket weight; otherwise the first matching child entry yields the
+// weight recorded for it in that bucket. Returns -ENOENT if nothing matches.
+int CrushWrapper::get_item_weight(int id) const
+{
+  const int nbuckets = crush->max_buckets;
+  for (int slot = 0; slot < nbuckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket) {
+      continue;  // unused slot
+    }
+    if (bucket->id == id) {
+      return bucket->weight;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] == id) {
+        return crush_get_bucket_item_weight(bucket, pos);
+      }
+    }
+  }
+  return -ENOENT;
+}
+
+// Look up the weight of an item within the buckets named by loc.
+//
+// Each location value is resolved to a bucket; entries that do not name an
+// existing bucket are skipped. Returns the weight of the first matching
+// child entry, or -ENOENT if the item is in none of the buckets.
+int CrushWrapper::get_item_weight_in_loc(int id, const map<string,string> &loc)
+{
+  for (const auto& level : loc) {
+    const int bucket_id = get_item_id(level.second);
+    if (!bucket_exists(bucket_id)) {
+      continue;
+    }
+    crush_bucket *bucket = get_bucket(bucket_id);
+    for (unsigned int pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] != id) {
+        continue;
+      }
+      return crush_get_bucket_item_weight(bucket, pos);
+    }
+  }
+  return -ENOENT;
+}
+
+// Set the weight of an item in every bucket that contains it.
+//
+// Each populated bucket slot is handed to adjust_item_weight_in_bucket().
+// Returns the number of buckets in which a change was reported, or -ENOENT
+// if there was none.
+int CrushWrapper::adjust_item_weight(CephContext *cct, int id, int weight,
+                                     bool update_weight_sets)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight
+                << " update_weight_sets=" << (int)update_weight_sets
+                << dendl;
+  int touched = 0;
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    if (crush->buckets[slot] == nullptr) {
+      continue;
+    }
+    // slot n holds the bucket with id -1-n
+    const int bucket_id = -1 - slot;
+    if (adjust_item_weight_in_bucket(cct, id, weight, bucket_id,
+                                     update_weight_sets) > 0) {
+      ++touched;
+    }
+  }
+  return touched ? touched : -ENOENT;
+}
+
+// Set the weight of an item inside one specific bucket.
+//
+// For every entry of the bucket that matches id the weight is adjusted and
+// the bucket's new total is pushed to the buckets containing it (through
+// adjust_item_weight(), without touching weight-sets). Afterwards, for each
+// choose_args map that has a weight-set for this bucket, the per-position
+// sums are recomputed and passed to choose_args_adjust_item_weight().
+//
+// Returns the number of matching entries, or -ENOENT if the bucket does not
+// exist or does not contain the item.
+int CrushWrapper::adjust_item_weight_in_bucket(
+  CephContext *cct, int id, int weight,
+  int bucket_id,
+  bool update_weight_sets)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight
+                << " in bucket " << bucket_id
+                << " update_weight_sets=" << (int)update_weight_sets
+                << dendl;
+  if (!bucket_exists(bucket_id)) {
+    return -ENOENT;
+  }
+
+  crush_bucket *bucket = get_bucket(bucket_id);
+  int changed = 0;
+  for (unsigned int pos = 0; pos < bucket->size; ++pos) {
+    if (bucket->items[pos] != id) {
+      continue;
+    }
+    int diff = bucket_adjust_item_weight(cct, bucket, id, weight,
+                                         update_weight_sets);
+    ldout(cct, 5) << __func__ << " " << id << " diff " << diff
+                  << " in bucket " << bucket_id << dendl;
+    // let the containing buckets see this bucket's new total
+    adjust_item_weight(cct, bucket_id, bucket->weight, false);
+    ++changed;
+  }
+
+  // update weight-sets so they continue to sum
+  for (auto& entry : choose_args) {
+    auto& cmap = entry.second;
+    if (!cmap.args) {
+      continue;
+    }
+    crush_choose_arg *arg = &cmap.args[-1 - bucket_id];
+    if (!arg->weight_set) {
+      continue;
+    }
+    ceph_assert(arg->weight_set_positions > 0);
+    // one running total per weight-set position
+    vector<int> sums(arg->weight_set_positions);
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      for (unsigned j = 0; j < arg->weight_set_positions; ++j) {
+        sums[j] += arg->weight_set[j].weights[pos];
+      }
+    }
+    ldout(cct,5) << __func__ << "  adjusting bucket " << bucket_id
+                 << " cmap " << entry.first << " weights to " << sums << dendl;
+    ostringstream ss;
+    choose_args_adjust_item_weight(cct, cmap, bucket_id, sums, &ss);
+  }
+
+  return changed ? changed : -ENOENT;
+}
+
+// Set the weight of an item within the buckets named by loc.
+//
+// Location values that do not name an existing bucket are skipped. Returns
+// the number of buckets in which a change was reported, or -ENOENT if there
+// was none.
+int CrushWrapper::adjust_item_weight_in_loc(
+  CephContext *cct, int id, int weight,
+  const map<string,string>& loc,
+  bool update_weight_sets)
+{
+  ldout(cct, 5) << "adjust_item_weight_in_loc " << id << " weight " << weight
+                << " in " << loc
+                << " update_weight_sets=" << (int)update_weight_sets
+                << dendl;
+  int touched = 0;
+  for (const auto& level : loc) {
+    const int bucket_id = get_item_id(level.second);
+    if (!bucket_exists(bucket_id)) {
+      continue;
+    }
+    if (adjust_item_weight_in_bucket(cct, id, weight, bucket_id,
+                                     update_weight_sets) > 0) {
+      ++touched;
+    }
+  }
+  return touched ? touched : -ENOENT;
+}
+
+// Set the weight of every device found beneath bucket id.
+//
+// The subtree is walked breadth first. Each device entry (non-negative id)
+// is adjusted inside the bucket that holds it; nested buckets that can be
+// resolved are queued for later, others are skipped.
+//
+// Returns the number of device entries visited, the error encoded by
+// get_bucket() if id is not a usable bucket, or the negative result of
+// rebuild_roots_with_classes().
+int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight,
+                                        bool update_weight_sets)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
+  crush_bucket *b = get_bucket(id);
+  if (IS_ERR(b))
+    return PTR_ERR(b);
+  int changed = 0;
+  list<crush_bucket*> q;
+  q.push_back(b);
+  while (!q.empty()) {
+    b = q.front();
+    q.pop_front();
+    for (unsigned i=0; i<b->size; ++i) {
+      int n = b->items[i];
+      if (n >= 0) {
+        // device: adjust it in place; counted whether or not the helper
+        // reports a change
+        adjust_item_weight_in_bucket(cct, n, weight, b->id, update_weight_sets);
+        ++changed;
+      } else {
+        // nested bucket: descend later
+        crush_bucket *sub = get_bucket(n);
+        if (IS_ERR(sub))
+          continue;
+        q.push_back(sub);
+      }
+    }
+  }
+  int ret = rebuild_roots_with_classes(cct);
+  if (ret < 0) {
+    ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
+                  << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  return changed;
+}
+
+// Report whether id appears as a child entry of any bucket in the map.
+//
+// Returns as soon as the first occurrence is found rather than scanning the
+// remaining buckets.
+bool CrushWrapper::check_item_present(int id) const
+{
+  for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
+    crush_bucket *b = crush->buckets[bidx];
+    if (!b)
+      continue;  // unused slot
+    for (unsigned i = 0; i < b->size; i++) {
+      if (b->items[i] == id)
+        return true;
+    }
+  }
+  return false;
+}
+
+
+// Find the first non-shadow bucket that directly contains id.
+//
+// Returns (parent type name, parent bucket name). If no bucket contains the
+// item an empty pair is returned. When _ret is non-null it receives 0 on
+// success and -ENOENT otherwise. The name and type lookups use map::at().
+pair<string,string> CrushWrapper::get_immediate_parent(int id, int *_ret) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket || is_shadow_item(bucket->id)) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] != id) {
+        continue;
+      }
+      string parent_name = name_map.at(bucket->id);
+      string parent_type = type_map.at(bucket->type);
+      if (_ret) {
+        *_ret = 0;
+      }
+      return make_pair(parent_type, parent_name);
+    }
+  }
+
+  if (_ret) {
+    *_ret = -ENOENT;
+  }
+  return pair<string, string>();
+}
+
+// Find the id of the first non-shadow bucket that directly contains id.
+//
+// On success *parent is set and 0 is returned; otherwise *parent is left
+// untouched and -ENOENT is returned.
+int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket || is_shadow_item(bucket->id)) {
+      continue;
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      if (bucket->items[pos] != id) {
+        continue;
+      }
+      *parent = bucket->id;
+      return 0;
+    }
+  }
+  return -ENOENT;
+}
+
+// Find the ancestor of item that has the given bucket type.
+//
+// With rule < 0 the hierarchy is climbed parent by parent until a bucket of
+// the requested type is reached. With a rule, the buckets of that type
+// beneath the rule's take roots are searched for one containing the item.
+//
+// Returns the id of the matching bucket, or 0 if none is found.
+int CrushWrapper::get_parent_of_type(int item, int type, int rule) const
+{
+  if (rule < 0) {
+    // no rule specified: climb until the type matches or we run out of parents
+    for (;;) {
+      int parent = 0;
+      if (get_immediate_parent_id(item, &parent) < 0) {
+        return 0;
+      }
+      item = parent;
+      if (get_bucket_type(item) == type) {
+        return item;
+      }
+    }
+  }
+
+  set<int> take_roots;
+  find_takes_by_rule(rule, &take_roots);
+  for (auto root : take_roots) {
+    vector<int> typed_buckets;
+    get_children_of_type(root, type, &typed_buckets, false);
+    for (auto bucket_id : typed_buckets) {
+      if (!subtree_contains(bucket_id, item)) {
+        continue;
+      }
+      // This assumes that no two different buckets reachable from a single
+      // crush rule share the same device, which should generally be true.
+      return bucket_id;
+    }
+  }
+  return 0; // not found
+}
+
+// Collect, into subtrees, the buckets of the given type beneath every root
+// returned by find_roots(). Roots that cannot be resolved to a bucket are
+// skipped.
+void CrushWrapper::get_subtree_of_type(int type, vector<int> *subtrees)
+{
+  set<int> all_roots;
+  find_roots(&all_roots);
+  for (auto root_id : all_roots) {
+    crush_bucket *root = get_bucket(root_id);
+    if (!IS_ERR(root)) {
+      get_children_of_type(root->id, type, subtrees);
+    }
+  }
+}
+
+// Report whether any rule takes a bucket registered in class_bucket for
+// the given class.
+//
+// When the class is in use and ss is non-null, a message listing the names
+// of the referencing rules is written to it. A rule appears once per
+// matching TAKE step / class_bucket entry.
+bool CrushWrapper::class_is_in_use(int class_id, ostream *ss)
+{
+  vector<unsigned> referencing;
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (!rule) {
+      continue;
+    }
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE) {
+        continue;
+      }
+      const int taken = rule->steps[s].arg1;
+      // does any bucket map this class to the taken id?
+      for (auto &per_bucket : class_bucket) {
+        auto& by_class = per_bucket.second;
+        auto hit = by_class.find(class_id);
+        if (hit != by_class.end() && hit->second == taken) {
+          referencing.push_back(ruleno);
+        }
+      }
+    }
+  }
+
+  if (referencing.empty()) {
+    return false;
+  }
+  if (ss) {
+    ostringstream os;
+    for (auto &ruleno : referencing) {
+      os << "'" << get_rule_name(ruleno) <<"',";
+    }
+    string names(os.str());
+    names.resize(names.size() - 1); // drop last ','
+    *ss << "still referenced by crush_rule(s): " << names;
+  }
+  return true;
+}
+
+// Rename a device class.
+//
+// Buckets (negative ids) recorded in class_map for this class have names of
+// the form "<base>~<class>"; their class suffix is rewritten as well.
+//
+// Returns -ENOENT if srcname is unknown, -EEXIST if dstname is already
+// taken, 0 on success.
+int CrushWrapper::rename_class(const string& srcname, const string& dstname)
+{
+  auto src = class_rname.find(srcname);
+  if (src == class_rname.end()) {
+    return -ENOENT;
+  }
+  if (class_rname.find(dstname) != class_rname.end()) {
+    return -EEXIST;
+  }
+
+  const int class_id = src->second;
+  ceph_assert(class_name.count(class_id));
+
+  // rename any shadow buckets of old class name
+  for (auto &entry : class_map) {
+    const bool is_bucket = entry.first < 0;
+    if (!is_bucket || entry.second != class_id) {
+      continue;
+    }
+    string old_name = get_item_name(entry.first);
+    size_t tilde = old_name.find("~");
+    ceph_assert(tilde != string::npos);
+    string base = old_name.substr(0, tilde);
+    string suffix = old_name.substr(tilde + 1);
+    ceph_assert(suffix == srcname);
+    // set_item_name is deliberately bypassed: the "~" makes the name
+    // intentionally invalid
+    name_map[entry.first] = base + "~" + dstname;
+    have_rmaps = false;
+  }
+
+  // rename class: replace both directions of the id <-> name mapping
+  class_rname.erase(srcname);
+  class_name.erase(class_id);
+  class_rname[dstname] = class_id;
+  class_name[class_id] = dstname;
+  return 0;
+}
+
+// Build the per-class clones of every non-shadow root.
+//
+// old_class_bucket supplies the shadow ids that were in use before; they are
+// collected and passed along to device_class_clone() together with the old
+// mapping itself. Returns the first negative result of device_class_clone(),
+// or 0.
+int CrushWrapper::populate_classes(
+  const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket)
+{
+  // build set of previous used shadow ids
+  set<int32_t> used_ids;
+  for (const auto& per_bucket : old_class_bucket) {
+    for (const auto& by_class : per_bucket.second) {
+      used_ids.insert(by_class.second);
+    }
+  }
+
+  // Weight values are accumulated for each carg and bucket as we go. Because
+  // the walk is depth first, the nested bucket weights are available by the
+  // time the containing buckets are finished.
+  // cargs -> bno -> [bucket weight for each position]
+  map<int,map<int,vector<int>>> cmap_item_weight;
+
+  set<int> roots;
+  find_nonshadow_roots(&roots);
+  for (auto &root : roots) {
+    assert(root < 0);
+    for (auto &cls : class_name) {
+      int clone;
+      const int res = device_class_clone(root, cls.first, old_class_bucket,
+                                         used_ids, &clone, &cmap_item_weight);
+      if (res < 0) {
+        return res;
+      }
+    }
+  }
+  return 0;
+}
+
+// Remove every shadow root bucket.
+//
+// Returns the first non-zero result of remove_root(), or 0.
+int CrushWrapper::trim_roots_with_class(CephContext *cct)
+{
+  set<int> shadow_roots;
+  find_shadow_roots(&shadow_roots);
+  for (auto &root : shadow_roots) {
+    if (root < 0) {
+      const int res = remove_root(cct, root);
+      if (res != 0) {
+        return res;
+      }
+    }
+  }
+  // No reweight is needed: removal only happens from the root downwards.
+  return 0;
+}
+
+// Pick an unused, non-negative class id.
+//
+// Normally this is one past the largest id in class_name. When that id is
+// not usable (the largest id is already INT32_MAX, or the successor would be
+// negative) a random starting point is chosen and ids are probed in
+// ascending order, wrapping from INT32_MAX to 0. Aborts if every id is taken.
+//
+// The wrap checks are explicit comparisons; they do not rely on signed
+// integer overflow.
+int32_t CrushWrapper::_alloc_class_id() const {
+  if (class_name.empty()) {
+    return 0;
+  }
+  constexpr int32_t max_id = std::numeric_limits<int32_t>::max();
+  const int32_t last = class_name.rbegin()->first;
+  if (last >= -1 && last < max_id) {
+    return last + 1;
+  }
+  // wrapped, pick a random start and do exhaustive search
+  uint32_t upperlimit = max_id;
+  upperlimit++;
+  int32_t class_id = rand() % upperlimit;
+  const auto start = class_id;
+  do {
+    if (!class_name.count(class_id)) {
+      return class_id;
+    }
+    // advance, wrapping back to 0 after the largest id
+    class_id = (class_id == max_id) ? 0 : class_id + 1;
+  } while (class_id != start);
+  ceph_abort_msg("no available class id");
+}
+
+// Assign new_class to every device found beneath the named subtree.
+//
+// The class is looked up or created first. The subtree is then walked
+// breadth first: devices get their class_map entry set, nested buckets are
+// queued.
+//
+// Returns -ENOENT if the subtree name is unknown, the error encoded by
+// get_bucket() if a visited id is not a usable bucket, 0 on success.
+int CrushWrapper::set_subtree_class(
+  const string& subtree,
+  const string& new_class)
+{
+  if (!name_exists(subtree)) {
+    return -ENOENT;
+  }
+
+  const int class_id = get_or_create_class_id(new_class);
+  list<int> pending = { get_item_id(subtree) };
+  while (!pending.empty()) {
+    const int bucket_id = pending.front();
+    pending.pop_front();
+    crush_bucket *bucket = get_bucket(bucket_id);
+    if (IS_ERR(bucket)) {
+      return PTR_ERR(bucket);
+    }
+    for (unsigned pos = 0; pos < bucket->size; ++pos) {
+      const int child = bucket->items[pos];
+      if (child < 0) {
+        pending.push_back(child);       // nested bucket, visit later
+      } else {
+        class_map[child] = class_id;    // device
+      }
+    }
+  }
+  return 0;
+}
+
+// Convert a legacy hierarchy into one that uses device classes.
+//
+//   out              receives progress and error messages (some progress
+//                    output is also written to cout)
+//   classify_root    root bucket name -> class name. Every bucket beneath
+//                    such a root is moved to a freshly allocated id, and an
+//                    empty bucket named "<name>~<class>" is left behind under
+//                    the old id and registered in class_bucket for the new
+//                    bucket. Rules that already take a class-specific variant
+//                    of the root make the call fail.
+//   classify_bucket  match pattern -> (class name, default parent name).
+//                    A pattern is "prefix%", "%suffix" or an exact name.
+//                    Matching buckets have their items moved into a "base"
+//                    bucket (looked up or created), their devices assigned to
+//                    the class, and are themselves renamed "<base>~<class>".
+//
+// Returns 0 on success or a negative error code.
+int CrushWrapper::reclassify(
+  CephContext *cct,
+  ostream& out,
+  const map<string,string>& classify_root,
+  const map<string,pair<string,string>>& classify_bucket
+  )
+{
+  map<int,string> reclassified_bucket; // orig_id -> class
+
+  // classify_root
+  for (auto& i : classify_root) {
+    string root = i.first;
+    if (!name_exists(root)) {
+      out << "root " << root << " does not exist" << std::endl;
+      return -EINVAL;
+    }
+    int root_id = get_item_id(root);
+    string new_class = i.second;
+    int new_class_id = get_or_create_class_id(new_class);
+    out << "classify_root " << root << " (" << root_id
+        << ") as " << new_class << std::endl;
+
+    // validate rules: refuse if a rule already takes root with a class
+    for (unsigned j = 0; j < crush->max_rules; j++) {
+      if (crush->rules[j]) {
+        auto rule = crush->rules[j];
+        for (unsigned k = 0; k < rule->len; ++k) {
+          if (rule->steps[k].op == CRUSH_RULE_TAKE) {
+            int step_item = get_rule_arg1(j, k);
+            int original_item;
+            int c;
+            int res = split_id_class(step_item, &original_item, &c);
+            if (res < 0)
+              return res;
+            if (c >= 0) {
+              if (original_item == root_id) {
+                out << "  rule " << j << " includes take on root "
+                    << root << " class " << c << std::endl;
+                return -EINVAL;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // rebuild new buckets for root
+    //cout << "before class_bucket: " << class_bucket << std::endl;
+    map<int,int> renumber;
+    list<int> q;
+    q.push_back(root_id);
+    while (!q.empty()) {
+      int id = q.front();
+      q.pop_front();
+      crush_bucket *bucket = get_bucket(id);
+      if (IS_ERR(bucket)) {
+        out << "cannot find bucket " << id
+            << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
+        return PTR_ERR(bucket);
+      }
+
+      // move bucket: the existing bucket object gets the new id, and an
+      // empty bucket of the same alg/hash/type takes over the old id
+      int new_id = get_new_bucket_id();
+      out << "  renumbering bucket " << id << " -> " << new_id << std::endl;
+      renumber[id] = new_id;
+      crush->buckets[-1-new_id] = bucket;
+      bucket->id = new_id;
+      crush->buckets[-1-id] = crush_make_bucket(crush,
+                                                bucket->alg,
+                                                bucket->hash,
+                                                bucket->type,
+                                                0, NULL, NULL);
+      crush->buckets[-1-id]->id = id;
+      // the choose_args entry follows the bucket to its new slot
+      for (auto& ca : choose_args) {
+        ca.second.args[-1-new_id] = ca.second.args[-1-id];
+        memset(&ca.second.args[-1-id], 0, sizeof(ca.second.args[0]));
+      }
+      class_bucket.erase(id);
+      class_bucket[new_id][new_class_id] = id;
+      name_map[new_id] = string(get_item_name(id));
+      name_map[id] = string(get_item_name(id)) + "~" + new_class;
+
+      for (unsigned j = 0; j < bucket->size; ++j) {
+        if (bucket->items[j] < 0) {
+          q.push_front(bucket->items[j]);
+        } else {
+          // we don't reclassify the device here; if the users wants that,
+          // they can pass --set-subtree-class separately.
+        }
+      }
+    }
+    //cout << "mid class_bucket: " << class_bucket << std::endl;
+
+    // point every reference to a renumbered bucket at its new id
+    for (int bidx = 0; bidx < crush->max_buckets; ++bidx) {
+      crush_bucket *b = crush->buckets[bidx];
+      if (!b) {
+        continue;
+      }
+      for (unsigned j = 0; j < b->size; ++j) {
+        if (renumber.count(b->items[j])) {
+          b->items[j] = renumber[b->items[j]];
+        }
+      }
+    }
+
+    int r = rebuild_roots_with_classes(cct);
+    if (r < 0) {
+      out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+          << std::endl;
+      return r;
+    }
+    //cout << "final class_bucket: " << class_bucket << std::endl;
+  }
+
+  // classify_bucket
+  map<int,int> send_to;  // source bucket -> dest bucket
+  map<int,map<int,int>> new_class_bucket;
+  map<int,string> new_bucket_names;
+  map<int,map<string,string>> new_buckets;
+  map<string,int> new_bucket_by_name;
+  for (auto& i : classify_bucket) {
+    const string& match = i.first;  // prefix% or %suffix
+    const string& new_class = i.second.first;
+    const string& default_parent = i.second.second;
+    if (!name_exists(default_parent)) {
+      out << "default parent " << default_parent << " does not exist"
+          << std::endl;
+      return -EINVAL;
+    }
+    int default_parent_id = get_item_id(default_parent);
+    crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
+    // get_bucket() reports failure through an error pointer, so a plain
+    // null check would not catch a name that is not a bucket
+    if (IS_ERR(default_parent_bucket)) {
+      out << "default parent " << default_parent << " is not a bucket"
+          << std::endl;
+      return -EINVAL;
+    }
+    string default_parent_type_name = get_type_name(default_parent_bucket->type);
+
+    out << "classify_bucket " << match << " as " << new_class
+        << " default bucket " << default_parent
+        << " (" << default_parent_type_name << ")" << std::endl;
+
+    int new_class_id = get_or_create_class_id(new_class);
+    for (int j = 0; j < crush->max_buckets; ++j) {
+      crush_bucket *b = crush->buckets[j];
+      if (!b || is_shadow_item(b->id)) {
+        continue;
+      }
+      string name = get_item_name(b->id);
+      if (name.length() < match.length()) {
+        continue;
+      }
+      // derive the base bucket name from the part matched by '%'
+      string basename;
+      if (match[0] == '%') {
+        if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
+          continue;
+        }
+        basename = name.substr(0, name.size() - match.size() + 1);
+      } else if (match[match.size() - 1] == '%') {
+        if (match.substr(0, match.size() - 1) !=
+            name.substr(0, match.size() - 1)) {
+          continue;
+        }
+        basename = name.substr(match.size() - 1);
+      } else if (match == name) {
+        basename = default_parent;
+      } else {
+        continue;
+      }
+      cout << "match " << match << " to " << name << " basename " << basename
+           << std::endl;
+      // look up or create basename bucket
+      int base_id;
+      if (name_exists(basename)) {
+        base_id = get_item_id(basename);
+        cout << "  have base " << base_id << std::endl;
+      } else if (new_bucket_by_name.count(basename)) {
+        base_id = new_bucket_by_name[basename];
+        cout << "  already creating base " << base_id << std::endl;
+      } else {
+        base_id = get_new_bucket_id();
+        crush->buckets[-1-base_id] = crush_make_bucket(crush,
+                                                       b->alg,
+                                                       b->hash,
+                                                       b->type,
+                                                       0, NULL, NULL);
+        crush->buckets[-1-base_id]->id = base_id;
+        name_map[base_id] = basename;
+        new_bucket_by_name[basename] = base_id;
+        cout << "  created base " << base_id << std::endl;
+
+        new_buckets[base_id][default_parent_type_name] = default_parent;
+      }
+      send_to[b->id] = base_id;
+      new_class_bucket[base_id][new_class_id] = b->id;
+      new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
+
+      // make sure devices are classified
+      for (unsigned k = 0; k < b->size; ++k) {
+        int item = b->items[k];
+        if (item >= 0) {
+          class_map[item] = new_class_id;
+        }
+      }
+    }
+  }
+
+  // no name_exists() works below,
+  have_rmaps = false;
+
+  // copy items around
+  //cout << "send_to " << send_to << std::endl;
+  set<int> roots;
+  find_roots(&roots);
+  for (auto& i : send_to) {
+    crush_bucket *from = get_bucket(i.first);
+    crush_bucket *to = get_bucket(i.second);
+    cout << "moving items from " << from->id << " (" << get_item_name(from->id)
+         << ") to " << to->id << " (" << get_item_name(to->id) << ")"
+         << std::endl;
+    for (unsigned j = 0; j < from->size; ++j) {
+      int item = from->items[j];
+      int r;
+      map<string,string> to_loc;
+      to_loc[get_type_name(to->type)] = get_item_name(to->id);
+      if (item >= 0) {
+        // device: insert it into the destination with the weight it has
+        // in the source bucket, unless it is already there
+        if (subtree_contains(to->id, item)) {
+          continue;
+        }
+        map<string,string> from_loc;
+        from_loc[get_type_name(from->type)] = get_item_name(from->id);
+        auto w = get_item_weightf_in_loc(item, from_loc);
+        r = insert_item(cct, item,
+                        w,
+                        get_item_name(item),
+                        to_loc);
+      } else {
+        // bucket: it must itself be reclassified; link its destination
+        if (!send_to.count(item)) {
+          lderr(cct) << "item " << item << " in bucket " << from->id
+               << " is not also a reclassified bucket" << dendl;
+          return -EINVAL;
+        }
+        int newitem = send_to[item];
+        if (subtree_contains(to->id, newitem)) {
+          continue;
+        }
+        r = link_bucket(cct, newitem, to_loc);
+      }
+      if (r != 0) {
+        cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+             << std::endl;
+        return r;
+      }
+    }
+  }
+
+  // make sure new buckets have parents
+  for (auto& i : new_buckets) {
+    int parent;
+    if (get_immediate_parent_id(i.first, &parent) < 0) {
+      cout << "new bucket " << i.first << " missing parent, adding at "
+           << i.second << std::endl;
+      int r = link_bucket(cct, i.first, i.second);
+      if (r != 0) {
+        cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+             << std::endl;
+        return r;
+      }
+    }
+  }
+
+  // set class mappings
+  //cout << "pre class_bucket: " << class_bucket << std::endl;
+  for (auto& i : new_class_bucket) {
+    for (auto& j : i.second) {
+      class_bucket[i.first][j.first] = j.second;
+    }
+
+  }
+  //cout << "post class_bucket: " << class_bucket << std::endl;
+  for (auto& i : new_bucket_names) {
+    name_map[i.first] = i.second;
+  }
+
+  int r = rebuild_roots_with_classes(cct);
+  if (r < 0) {
+    out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+        << std::endl;
+    return r;
+  }
+  //cout << "final class_bucket: " << class_bucket << std::endl;
+
+  return 0;
+}
+
+// Return the id of a free bucket slot, growing the tables if necessary.
+//
+// Slots are probed from id -1 downwards. If every slot is occupied, the
+// bucket array and the args array of every choose_args map are extended by
+// one entry. The new bucket slot is set to NULL and the new choose_arg entry
+// is zeroed, so neither contains indeterminate data until the caller fills
+// it in.
+int CrushWrapper::get_new_bucket_id()
+{
+  int id = -1;
+  // test the bound first so the array is never read past its end
+  while (-1-id < crush->max_buckets &&
+         crush->buckets[-1-id]) {
+    id--;
+  }
+  if (-1-id == crush->max_buckets) {
+    ++crush->max_buckets;
+    crush->buckets = (struct crush_bucket**)realloc(
+      crush->buckets,
+      sizeof(crush->buckets[0]) * crush->max_buckets);
+    crush->buckets[-1-id] = NULL;
+    for (auto& i : choose_args) {
+      assert(i.second.size == (__u32)crush->max_buckets - 1);
+      ++i.second.size;
+      i.second.args = (struct crush_choose_arg*)realloc(
+        i.second.args,
+        sizeof(i.second.args[0]) * i.second.size);
+      memset(&i.second.args[i.second.size - 1], 0, sizeof(i.second.args[0]));
+    }
+  }
+  return id;
+}
+
+// Recompute bucket weights from every non-shadow root downwards, including
+// the weight-sets of every choose_args map, then rebuild the class roots.
+// Asserts that both the reweight and the rebuild succeed.
+void CrushWrapper::reweight(CephContext *cct)
+{
+  set<int> roots;
+  find_nonshadow_roots(&roots);
+  for (auto root_id : roots) {
+    if (root_id >= 0) {
+      continue;  // not a bucket
+    }
+    crush_bucket *root = get_bucket(root_id);
+    ldout(cct, 5) << "reweight root bucket " << root_id << dendl;
+    int r = crush_reweight_bucket(crush, root);
+    ceph_assert(r == 0);
+
+    for (auto& carg : choose_args) {
+      //cout << "carg " << carg.first << std::endl;
+      vector<uint32_t> top_level;  // top-level weights are discarded
+      reweight_bucket(root, carg.second, &top_level);
+    }
+  }
+  int r = rebuild_roots_with_classes(cct);
+  ceph_assert(r == 0);
+}
+
+// Recursively sum the weight-set weights beneath bucket b for one
+// choose_args map.
+//
+// weightv is resized to the bucket's number of weight-set positions and
+// receives one total per position. For nested buckets the recursively
+// computed total is also written back into this bucket's weight entry for
+// that child.
+//
+// NOTE(review): every read and write goes through the first weight-set
+// (index 0) regardless of pos — confirm whether per-position weight-sets
+// were intended.
+void CrushWrapper::reweight_bucket(
+  crush_bucket *b,
+  crush_choose_arg_map& arg_map,
+  vector<uint32_t> *weightv)
+{
+  const int idx = -1 - b->id;
+  crush_choose_arg& arg = arg_map.args[idx];
+  const unsigned npos = arg.weight_set_positions;
+  //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
+  weightv->resize(npos);
+  for (unsigned i = 0; i < b->size; ++i) {
+    const int item = b->items[i];
+    if (item < 0) {
+      // nested bucket: compute its totals first
+      vector<uint32_t> subw(npos);
+      crush_bucket *sub = get_bucket(item);
+      assert(sub);
+      reweight_bucket(sub, arg_map, &subw);
+      for (unsigned pos = 0; pos < npos; ++pos) {
+        (*weightv)[pos] += subw[pos];
+        // stash the real bucket weight as the weight for this reference
+        arg.weight_set[0].weights[i] = subw[pos];
+      }
+    } else {
+      // device: add its recorded weight to every position
+      for (unsigned pos = 0; pos < npos; ++pos) {
+        (*weightv)[pos] += arg.weight_set[0].weights[i];
+      }
+    }
+  }
+  //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
+}
+
+// Create a simple rule: take a root, choose by failure domain, emit.
+//
+//   name                 name of the new rule; must not exist yet
+//   root_name            item to take; must exist
+//   failure_domain_name  type to choose leaves across; empty means choose
+//                        directly (type 0)
+//   device_class         if non-empty, take the root's bucket for that class
+//   mode                 "firstn" or "indep"
+//   rule_type            passed through to crush_make_rule()
+//   rno                  rule number to use, or negative to pick the first
+//                        number that is free both as rule and as ruleset
+//   err                  optional stream for a human readable error; may be
+//                        null
+//
+// Returns the rule number on success, or -EEXIST / -ENOENT / -EINVAL / the
+// error from crush_add_rule() on failure.
+int CrushWrapper::add_simple_rule_at(
+  string name, string root_name,
+  string failure_domain_name,
+  string device_class,
+  string mode, int rule_type,
+  int rno,
+  ostream *err)
+{
+  if (rule_exists(name)) {
+    if (err)
+      *err << "rule " << name << " exists";
+    return -EEXIST;
+  }
+  if (rno >= 0) {
+    if (rule_exists(rno)) {
+      if (err)
+        *err << "rule with ruleno " << rno << " exists";
+      return -EEXIST;
+    }
+    if (ruleset_exists(rno)) {
+      if (err)
+        *err << "ruleset " << rno << " exists";
+      return -EEXIST;
+    }
+  } else {
+    // pick the first number unused both as rule and as ruleset
+    for (rno = 0; rno < get_max_rules(); rno++) {
+      if (!rule_exists(rno) && !ruleset_exists(rno))
+        break;
+    }
+  }
+  if (!name_exists(root_name)) {
+    if (err)
+      *err << "root item " << root_name << " does not exist";
+    return -ENOENT;
+  }
+  int root = get_item_id(root_name);
+  int type = 0;
+  if (failure_domain_name.length()) {
+    type = get_type_id(failure_domain_name);
+    if (type < 0) {
+      if (err)
+        *err << "unknown type " << failure_domain_name;
+      return -EINVAL;
+    }
+  }
+  if (device_class.size()) {
+    if (!class_exists(device_class)) {
+      if (err)
+        *err << "device class " << device_class << " does not exist";
+      return -EINVAL;
+    }
+    int c = get_class_id(device_class);
+    if (class_bucket.count(root) == 0 ||
+        class_bucket[root].count(c) == 0) {
+      if (err)
+        *err << "root " << root_name << " has no devices with class "
+             << device_class;
+      return -EINVAL;
+    }
+    // take the class-specific bucket instead of the plain root
+    root = class_bucket[root][c];
+  }
+  if (mode != "firstn" && mode != "indep") {
+    if (err)
+      *err << "unknown mode " << mode;
+    return -EINVAL;
+  }
+
+  // indep rules carry two extra "set tries" steps
+  int steps = 3;
+  if (mode == "indep")
+    steps = 5;
+  int min_rep = mode == "firstn" ? 1 : 3;
+  int max_rep = mode == "firstn" ? 10 : 20;
+  //set the ruleset the same as rule_id(rno)
+  crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_rep, max_rep);
+  ceph_assert(rule);
+  int step = 0;
+  if (mode == "indep") {
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+  }
+  crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
+  if (type)
+    crush_rule_set_step(rule, step++,
+                        mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
+                        CRUSH_RULE_CHOOSELEAF_INDEP,
+                        CRUSH_CHOOSE_N,
+                        type);
+  else
+    crush_rule_set_step(rule, step++,
+                        mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
+                        CRUSH_RULE_CHOOSE_INDEP,
+                        CRUSH_CHOOSE_N,
+                        0);
+  crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+
+  int ret = crush_add_rule(crush, rule, rno);
+  if(ret < 0) {
+    // err is optional here as everywhere else in this function
+    if (err)
+      *err << "failed to add rule " << rno << " because " << cpp_strerror(ret);
+    return ret;
+  }
+  set_rule_name(rno, name);
+  have_rmaps = false;
+  return rno;
+}
+
+// Same as add_simple_rule_at(), but without a caller-chosen rule number:
+// -1 is passed so that add_simple_rule_at() selects the slot itself.
+int CrushWrapper::add_simple_rule(
+  string name, string root_name,
+  string failure_domain_name,
+  string device_class,
+  string mode, int rule_type,
+  ostream *err)
+{
+  const int any_rule_number = -1;
+  return add_simple_rule_at(name, root_name,
+			    failure_domain_name, device_class,
+			    mode, rule_type,
+			    any_rule_number, err);
+}
+
+// Walk the bucket tree below `root` breadth first and record the weight of
+// every device (non-negative item id) found into *pmap, keyed by device id.
+// Returns the sum of all recorded weights.
+float CrushWrapper::_get_take_weight_osd_map(int root,
+					     map<int,float> *pmap) const
+{
+  float total = 0.0;
+  list<int> pending;   // bucket ids still to be expanded
+  pending.push_back(root);
+  while (!pending.empty()) {
+    const int bucket_id = pending.front();
+    pending.pop_front();
+    // bucket ids are negative; -1-id is the slot in crush->buckets
+    crush_bucket *bucket = crush->buckets[-1-bucket_id];
+    ceph_assert(bucket);
+    for (unsigned idx = 0; idx < bucket->size; ++idx) {
+      const int child = bucket->items[idx];
+      if (child < 0) {
+	// a nested bucket: visit it on a later iteration
+	pending.push_back(child);
+	continue;
+      }
+      // a device
+      float device_weight = crush_get_bucket_item_weight(bucket, idx);
+      (*pmap)[child] = device_weight;
+      total += device_weight;
+    }
+  }
+  return total;
+}
+
+// Divide every weight in `m` by `sum` and merge the result into *pmap:
+// keys not yet present are inserted, keys already present are accumulated.
+void CrushWrapper::_normalize_weight_map(float sum,
+					 const map<int,float>& m,
+					 map<int,float> *pmap) const
+{
+  for (map<int,float>::const_iterator src = m.begin(); src != m.end(); ++src) {
+    const float share = src->second / sum;
+    // insert() leaves an existing entry untouched and tells us so
+    std::pair<map<int,float>::iterator, bool> slot =
+      pmap->insert(std::make_pair(src->first, share));
+    if (!slot.second)
+      slot.first->second += share;
+  }
+}
+
+// Collect the device weights found below `root`, normalize them by their
+// sum and merge them into *pmap.  Always returns 0.
+int CrushWrapper::get_take_weight_osd_map(int root, map<int,float> *pmap) const
+{
+  map<int,float> raw_weights;
+  const float total = _get_take_weight_osd_map(root, &raw_weights);
+  _normalize_weight_map(total, raw_weights, pmap);
+  return 0;
+}
+
+// Build a normalized device weight map for rule `ruleno` by merging one
+// normalized map per TAKE step into *pmap.
+// Returns 0, or -ENOENT if the rule number is out of range or unused.
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno,
+					  map<int,float> *pmap) const
+{
+  if (ruleno >= crush->max_rules || crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  const crush_rule *rule = crush->rules[ruleno];
+
+  // FIXME: if there are multiple takes that place a different number of
+  // objects we do not take that into account.  (Also, note that doing this
+  // right is also a function of the pool, since the crush rule
+  // might choose 2 + choose 2 but pool size may only be 3.)
+  for (unsigned s = 0; s < rule->len; ++s) {
+    // only TAKE steps contribute; for any other step there is nothing to merge
+    if (rule->steps[s].op != CRUSH_RULE_TAKE)
+      continue;
+
+    map<int,float> take_weights;
+    float total = 0;
+    const int target = rule->steps[s].arg1;
+    if (target < 0) {
+      // taking a bucket: gather the devices beneath it
+      total += _get_take_weight_osd_map(target, &take_weights);
+    } else {
+      // taking a single device directly
+      take_weights[target] = 1.0;
+      total = 1.0;
+    }
+    _normalize_weight_map(total, take_weights, pmap);
+  }
+
+  return 0;
+}
+
+// Destroy rule `ruleno`, drop its name and rebuild the per-class shadow
+// roots.  Returns -ENOENT if the rule number is out of range (including
+// negative) or unused, otherwise the result of rebuild_roots_with_classes().
+int CrushWrapper::remove_rule(int ruleno)
+{
+  // reject negative numbers too: they would index before crush->rules
+  if (ruleno < 0 || ruleno >= (int)crush->max_rules)
+    return -ENOENT;
+  if (crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_destroy_rule(crush->rules[ruleno]);
+  crush->rules[ruleno] = NULL;
+  rule_name_map.erase(ruleno);
+  have_rmaps = false;
+  return rebuild_roots_with_classes(nullptr);
+}
+
+// Set the weight of `item` inside `bucket`.  When adjust_weight_sets is
+// true, the same weight is first written into every position of every
+// choose_args weight set for this bucket.  `item` must be in the bucket.
+// Returns the result of crush_bucket_adjust_item_weight().
+int CrushWrapper::bucket_adjust_item_weight(
+  CephContext *cct, crush_bucket *bucket, int item, int weight,
+  bool adjust_weight_sets)
+{
+  if (adjust_weight_sets) {
+    // locate the item's slot within the bucket
+    unsigned slot = 0;
+    while (slot < bucket->size && bucket->items[slot] != item)
+      ++slot;
+    ceph_assert(slot != bucket->size);
+
+    for (auto &entry : choose_args) {
+      crush_choose_arg &arg = entry.second.args[-1-bucket->id];
+      for (__u32 p = 0; p < arg.weight_set_positions; ++p)
+	arg.weight_set[p].weights[slot] = weight;
+    }
+  }
+  return crush_bucket_adjust_item_weight(crush, bucket, item, weight);
+}
+
+// Create a bucket and register it in the crush map, then give it an entry
+// in every choose_args map.
+//
+//  bucketno  requested bucket id, forwarded to crush_add_bucket()
+//  alg       bucket algorithm; 0 means use get_default_bucket_alg()
+//  hash/type forwarded to crush_make_bucket()
+//  size      number of entries in items[] / weights[]
+//  idout     receives the id of the new bucket; must not be null
+//
+// Returns -EINVAL if no algorithm could be determined, a negative error
+// from crush_add_bucket() if registration failed, otherwise its result.
+int CrushWrapper::add_bucket(
+  int bucketno, int alg, int hash, int type, int size,
+  int *items, int *weights, int *idout)
+{
+  if (alg == 0) {
+    alg = get_default_bucket_alg();
+    if (alg == 0)
+      return -EINVAL;
+  }
+  crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
+				      weights);
+  ceph_assert(b);
+  ceph_assert(idout);
+  int r = crush_add_bucket(crush, bucketno, b, idout);
+  if (r < 0) {
+    // The bucket was not attached to the map, so *idout is not a valid id
+    // to index choose_args with, and we still own the bucket.
+    crush_destroy_bucket(b);
+    return r;
+  }
+  int pos = -1 - *idout;
+  for (auto& p : choose_args) {
+    crush_choose_arg_map& cmap = p.second;
+    unsigned new_size = crush->max_buckets;
+    // make sure the args array covers every bucket slot of the map
+    if (cmap.args) {
+      if ((int)cmap.size < crush->max_buckets) {
+	cmap.args = static_cast<crush_choose_arg*>(realloc(
+	  cmap.args,
+	  sizeof(crush_choose_arg) * new_size));
+        ceph_assert(cmap.args);
+	// zero only the newly added tail
+	memset(&cmap.args[cmap.size], 0,
+	       sizeof(crush_choose_arg) * (new_size - cmap.size));
+	cmap.size = new_size;
+      }
+    } else {
+      cmap.args = static_cast<crush_choose_arg*>(calloc(new_size,
+							sizeof(crush_choose_arg)));
+      ceph_assert(cmap.args);
+      cmap.size = new_size;
+    }
+    if (size > 0) {
+      int positions = get_choose_args_positions(cmap);
+      crush_choose_arg& carg = cmap.args[pos];
+      // one crush_weight_set per position (not per item): the loop below
+      // indexes weight_set[0..positions)
+      carg.weight_set = static_cast<crush_weight_set*>(calloc(positions,
+						  sizeof(crush_weight_set)));
+      carg.weight_set_positions = positions;
+      for (int ppos = 0; ppos < positions; ++ppos) {
+	carg.weight_set[ppos].weights = (__u32*)calloc(size, sizeof(__u32));
+	carg.weight_set[ppos].size = size;
+	// every position starts out with the bucket's real item weights
+	for (int bpos = 0; bpos < size; ++bpos) {
+	  carg.weight_set[ppos].weights[bpos] = weights[bpos];
+	}
+      }
+    }
+    assert(crush->max_buckets == (int)cmap.size);
+  }
+  return r;
+}
+
+// Append `item` with `weight` to `bucket`, and grow every choose_args
+// weight set (and ids array, where one is in use) of that bucket by one
+// entry holding the same weight / item id.
+// Returns 0, or the negative result of crush_bucket_add_item().
+int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
+{
+  // sampled before the add below: the size the bucket is expected to reach
+  const __u32 grown = bucket->size + 1;
+  const int rc = crush_bucket_add_item(crush, bucket, item, weight);
+  if (rc < 0)
+    return rc;
+
+  for (auto &entry : choose_args) {
+    crush_choose_arg &arg = entry.second.args[-1-bucket->id];
+    for (__u32 p = 0; p < arg.weight_set_positions; ++p) {
+      crush_weight_set &ws = arg.weight_set[p];
+      ws.weights = (__u32*)realloc(ws.weights, grown * sizeof(__u32));
+      ceph_assert(ws.size + 1 == grown);
+      ws.weights[ws.size] = weight;
+      ws.size = grown;
+    }
+    if (arg.ids_size) {
+      arg.ids = (__s32 *)realloc(arg.ids, grown * sizeof(__s32));
+      ceph_assert(arg.ids_size + 1 == grown);
+      arg.ids[arg.ids_size] = item;
+      arg.ids_size = grown;
+    }
+  }
+  return 0;
+}
+
+// Remove `item` from `bucket`, and drop the matching entry from every
+// choose_args weight set (and ids array, where one is in use) of that
+// bucket.  `item` must be present in the bucket.
+// Returns 0, or the negative result of crush_bucket_remove_item().
+int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
+{
+  // both sampled before the removal below
+  const __u32 shrunk = bucket->size - 1;
+  unsigned slot = 0;
+  while (slot < bucket->size && bucket->items[slot] != item)
+    ++slot;
+  ceph_assert(slot != bucket->size);
+
+  const int rc = crush_bucket_remove_item(crush, bucket, item);
+  if (rc < 0)
+    return rc;
+
+  for (auto &entry : choose_args) {
+    crush_choose_arg &arg = entry.second.args[-1-bucket->id];
+    for (__u32 p = 0; p < arg.weight_set_positions; ++p) {
+      crush_weight_set &ws = arg.weight_set[p];
+      ceph_assert(ws.size - 1 == shrunk);
+      // close the gap left by the removed entry
+      for (__u32 k = slot; k < shrunk; ++k)
+	ws.weights[k] = ws.weights[k+1];
+      if (shrunk == 0) {
+	free(ws.weights);
+	ws.weights = NULL;
+      } else {
+	ws.weights = (__u32*)realloc(ws.weights, shrunk * sizeof(__u32));
+      }
+      ws.size = shrunk;
+    }
+    if (arg.ids_size) {
+      ceph_assert(arg.ids_size - 1 == shrunk);
+      for (__u32 k = slot; k < shrunk; ++k)
+	arg.ids[k] = arg.ids[k+1];
+      if (shrunk == 0) {
+	free(arg.ids);
+	arg.ids = NULL;
+      } else {
+	arg.ids = (__s32 *)realloc(arg.ids, shrunk * sizeof(__s32));
+      }
+      arg.ids_size = shrunk;
+    }
+  }
+  return 0;
+}
+
+// Change the algorithm of bucket `bid` to `alg`.
+// Returns 0 on success, -ENOENT if the bucket cannot be looked up.
+int CrushWrapper::bucket_set_alg(int bid, int alg)
+{
+  crush_bucket *b = get_bucket(bid);
+  // get_bucket() results are checked with IS_ERR() elsewhere in this file,
+  // so a failed lookup may come back as an error pointer rather than NULL;
+  // treat both as "no such bucket" instead of writing through the pointer.
+  if (!b || IS_ERR(b)) {
+    return -ENOENT;
+  }
+  b->alg = alg;
+  return 0;
+}
+
+// Bind device `id` to device class `class_name`.
+//
+//  id          item id; must exist, and must be a device (non-negative)
+//  class_name  class to assign; created if it does not exist yet
+//  name        item name, used only in messages
+//  ss          stream receiving a human readable explanation
+//
+// Returns 1 if the class was changed, 0 if it was already set, -EBUSY if
+// the item is bound to a different class, -EINVAL for a negative id, or a
+// negative error from rebuild_roots_with_classes().
+int CrushWrapper::update_device_class(int id,
+                                      const string& class_name,
+                                      const string& name,
+                                      ostream *ss)
+{
+  ceph_assert(item_exists(id));
+  auto old_class_name = get_item_class(id);
+  if (old_class_name && old_class_name != class_name) {
+    *ss << "osd." << id << " has already bound to class '" << old_class_name
+        << "', can not reset class to '" << class_name  << "'; "
+        << "use 'ceph osd crush rm-device-class <id>' to "
+        << "remove old class first";
+    return -EBUSY;
+  }
+
+  // validate the id before touching the class tables, so that a rejected
+  // request does not leave a freshly created class behind
+  if (id < 0) {
+    *ss << name << " id " << id << " is negative";
+    return -EINVAL;
+  }
+  int class_id = get_or_create_class_id(class_name);
+
+  if (class_map.count(id) != 0 && class_map[id] == class_id) {
+    *ss << name << " already set to class " << class_name << ". ";
+    return 0;
+  }
+
+  set_item_class(id, class_id);
+
+  int r = rebuild_roots_with_classes(nullptr);
+  if (r < 0)
+    return r;
+  return 1;
+}
+
+// Unbind item `id` from its device class and rebuild the per-class shadow
+// roots.  `ss` must not be null; it receives a human readable explanation.
+// Returns 0 on success (including when the item had no class), -ENOENT if
+// the item has no name, or a negative error from
+// rebuild_roots_with_classes().
+int CrushWrapper::remove_device_class(CephContext *cct, int id, ostream *ss)
+{
+  ceph_assert(ss);
+  const char *name = get_item_name(id);
+  if (!name) {
+    *ss << "osd." << id << " does not have a name";
+    return -ENOENT;
+  }
+
+  const char *class_name = get_item_class(id);
+  if (!class_name) {
+    *ss << "osd." << id << " has not been bound to a specific class yet";
+    return 0;
+  }
+  // Keep our own copy: the calls below modify the class tables (the rebuild
+  // starts by cleaning up dead classes), so the pointer returned by
+  // get_item_class() may no longer be valid when we report an error.
+  // NOTE(review): assumes that pointer refers to storage owned by those
+  // tables -- confirm against get_item_class().
+  const string saved_class_name(class_name);
+  class_remove_item(id);
+
+  int r = rebuild_roots_with_classes(cct);
+  if (r < 0) {
+    *ss << "unable to rebuild roots with class '" << saved_class_name << "' "
+        << "of osd." << id << ": " << cpp_strerror(r);
+    return r;
+  }
+  return 0;
+}
+
+// Recursively build the "shadow" copy of bucket `original_id` that contains
+// only devices of class `device_class` (and the shadow copies of its child
+// buckets).  The copy is named "<item name>~<class name>".
+//
+//  original_id       bucket to clone
+//  device_class      class id to filter devices by
+//  old_class_bucket  previous (bucket -> class -> shadow id) table; ids
+//                    found there are reused
+//  used_ids          ids that must not be picked for a new shadow bucket
+//  clone             receives the id of the shadow bucket
+//  cmap_item_weight  choose_args id -> shadow bucket id -> per-position sum
+//                    of that bucket's weight-set weights; read for child
+//                    buckets and filled in for the bucket created here
+//
+// Returns 0 on success (also when the shadow bucket already exists),
+// -ECHILD / -EBADF if the item or class has no name, or an error from the
+// crush builder calls.
+// NOTE(review): `copy` does not appear to be released on the early error
+// returns below -- confirm whether that leak matters to callers.
+int CrushWrapper::device_class_clone(
+  int original_id, int device_class,
+  const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket,
+  const std::set<int32_t>& used_ids,
+  int *clone,
+  map<int,map<int,vector<int>>> *cmap_item_weight)
+{
+  const char *item_name = get_item_name(original_id);
+  if (item_name == NULL)
+    return -ECHILD;
+  const char *class_name = get_class_name(device_class);
+  if (class_name == NULL)
+    return -EBADF;
+  string copy_name = item_name + string("~") + class_name;
+  if (name_exists(copy_name)) {
+    // already cloned
+    *clone = get_item_id(copy_name);
+    return 0;
+  }
+
+  crush_bucket *original = get_bucket(original_id);
+  ceph_assert(!IS_ERR(original));
+  crush_bucket *copy = crush_make_bucket(crush,
+					 original->alg,
+					 original->hash,
+					 original->type,
+					 0, NULL, NULL);
+  ceph_assert(copy);
+
+  vector<unsigned> item_orig_pos;  // new item pos -> orig item pos
+  for (unsigned i = 0; i < original->size; i++) {
+    int item = original->items[i];
+    int weight = crush_get_bucket_item_weight(original, i);
+    if (item >= 0) {
+      // device: keep it only if it belongs to the requested class
+      if (class_map.count(item) != 0 && class_map[item] == device_class) {
+	int res = crush_bucket_add_item(crush, copy, item, weight);
+	if (res)
+	  return res;
+      } else {
+	continue;
+      }
+    } else {
+      // child bucket: clone it first, then link the clone with its weight
+      int child_copy_id;
+      int res = device_class_clone(item, device_class, old_class_bucket,
+				   used_ids, &child_copy_id,
+				   cmap_item_weight);
+      if (res < 0)
+	return res;
+      crush_bucket *child_copy = get_bucket(child_copy_id);
+      ceph_assert(!IS_ERR(child_copy));
+      res = crush_bucket_add_item(crush, copy, child_copy_id,
+				  child_copy->weight);
+      if (res)
+	return res;
+    }
+    item_orig_pos.push_back(i);
+  }
+  ceph_assert(item_orig_pos.size() == copy->size);
+
+  int bno = 0;
+  if (old_class_bucket.count(original_id) &&
+      old_class_bucket.at(original_id).count(device_class)) {
+    bno = old_class_bucket.at(original_id).at(device_class);
+  } else {
+    // pick a new shadow bucket id that is not used by the current map
+    // *or* any previous shadow buckets.
+    bno = -1;
+    while (((-1-bno) < crush->max_buckets && crush->buckets[-1-bno]) ||
+	   used_ids.count(bno)) {
+      --bno;
+    }
+  }
+  int res = crush_add_bucket(crush, bno, copy, clone);
+  if (res)
+    return res;
+  ceph_assert(!bno || bno == *clone);
+
+  res = set_item_class(*clone, device_class);
+  if (res < 0)
+    return res;
+
+  // we do not use set_item_name because the name is intentionally invalid
+  name_map[*clone] = copy_name;
+  if (have_rmaps)
+    name_rmap[copy_name] = *clone;
+  class_bucket[original_id][device_class] = *clone;
+
+  // set up choose_args for the new bucket.
+  for (auto& w : choose_args) {
+    crush_choose_arg_map& cmap = w.second;
+    if (crush->max_buckets > (int)cmap.size) {
+      // the map grew when the shadow bucket was added; extend and zero-fill
+      unsigned new_size = crush->max_buckets;
+      cmap.args = static_cast<crush_choose_arg*>(realloc(cmap.args,
+					     new_size * sizeof(cmap.args[0])));
+      ceph_assert(cmap.args);
+      memset(cmap.args + cmap.size, 0,
+	     (new_size - cmap.size) * sizeof(cmap.args[0]));
+      cmap.size = new_size;
+    }
+    auto& o = cmap.args[-1-original_id];
+    auto& n = cmap.args[-1-bno];
+    n.ids_size = 0; // FIXME: implement me someday
+    n.weight_set_positions = o.weight_set_positions;
+    n.weight_set = static_cast<crush_weight_set*>(calloc(
+      n.weight_set_positions, sizeof(crush_weight_set)));
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
+      n.weight_set[s].size = copy->size;
+      n.weight_set[s].weights = (__u32*)calloc(copy->size, sizeof(__u32));
+    }
+    // One running sum per position.  This vector must outlive the position
+    // loop: re-creating it per position would zero the sums of all earlier
+    // positions before they are recorded.
+    vector<int> bucket_weights(n.weight_set_positions);
+    for (size_t s = 0; s < n.weight_set_positions; ++s) {
+      for (size_t i = 0; i < copy->size; ++i) {
+	int item = copy->items[i];
+	if (item >= 0) {
+	  // device: inherit the weight from the original bucket's weight set
+	  n.weight_set[s].weights[i] = o.weight_set[s].weights[item_orig_pos[i]];
+	} else if ((*cmap_item_weight)[w.first].count(item)) {
+	  // shadow child bucket: use the sum recorded when it was cloned
+	  n.weight_set[s].weights[i] = (*cmap_item_weight)[w.first][item][s];
+	} else {
+	  n.weight_set[s].weights[i] = 0;
+	}
+	bucket_weights[s] += n.weight_set[s].weights[i];
+      }
+    }
+    // only record sums when there is at least one position, so that no
+    // empty entry appears for buckets without a weight set
+    if (n.weight_set_positions > 0)
+      (*cmap_item_weight)[w.first][bno] = bucket_weights;
+  }
+  return 0;
+}
+
+// Fill *rules with the numbers of all rules having a TAKE step whose item
+// resolves (via split_id_class) to class `class_name`.  *rules is cleared
+// first.  Returns 0, -ENOENT if the class does not exist, or a negative
+// error from split_id_class().
+int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
+{
+  ceph_assert(rules);
+  rules->clear();
+  if (!class_exists(class_name))
+    return -ENOENT;
+
+  const int wanted = get_class_id(class_name);
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (rule == nullptr)
+      continue;
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE)
+        continue;
+      int base_item;
+      int item_class;
+      const int rc = split_id_class(rule->steps[s].arg1,
+                                    &base_item, &item_class);
+      if (rc < 0)
+        return rc;
+      if (item_class != -1 && item_class == wanted) {
+        rules->insert(ruleno);
+        break;   // one matching step is enough for this rule
+      }
+    }
+  }
+  return 0;
+}
+
+// Fill *rules with the numbers of all rules that might reference the given
+// osd, i.e. rules with a TAKE step whose leaves (per _get_leaves) include
+// it.  *rules is cleared first.  Returns 0, -EINVAL for a negative osd, or
+// a negative error from _get_leaves().
+int CrushWrapper::get_rules_by_osd(int osd, set<int> *rules)
+{
+  ceph_assert(rules);
+  rules->clear();
+  if (osd < 0)
+    return -EINVAL;
+
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (rule == nullptr)
+      continue;
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE)
+        continue;
+      list<int> leaves;
+      const int rc = _get_leaves(rule->steps[s].arg1, &leaves);
+      if (rc < 0)
+        return rc; // propagate fatal errors!
+
+      bool found = false;
+      for (auto leaf = leaves.begin(); leaf != leaves.end() && !found; ++leaf) {
+        ceph_assert(*leaf >= 0);
+        found = (*leaf == osd);
+      }
+      if (found) {
+        rules->insert(ruleno);
+        break;   // one matching step is enough for this rule
+      }
+    }
+  }
+  return 0;
+}
+
+// A class is "dead" when no device (non-negative id in class_map) is bound
+// to it and no rule TAKEs one of its shadow buckets.
+bool CrushWrapper::_class_is_dead(int class_id)
+{
+  // still bound to a device?
+  for (auto &binding : class_map) {
+    const bool is_device = binding.first >= 0;
+    if (is_device && binding.second == class_id)
+      return false;
+  }
+
+  // still taken by a rule through one of its shadow buckets?
+  for (unsigned ruleno = 0; ruleno < crush->max_rules; ++ruleno) {
+    crush_rule *rule = crush->rules[ruleno];
+    if (rule == nullptr)
+      continue;
+    for (unsigned s = 0; s < rule->len; ++s) {
+      if (rule->steps[s].op != CRUSH_RULE_TAKE)
+        continue;
+      const int taken = rule->steps[s].arg1;
+      for (auto &per_bucket : class_bucket) {
+        auto shadow = per_bucket.second.find(class_id);
+        if (shadow != per_bucket.second.end() && shadow->second == taken)
+          return false;
+      }
+    }
+  }
+  // no more referenced by any devices or crush rules
+  return true;
+}
+
+// Remove the name of every class for which _class_is_dead() is true.
+void CrushWrapper::cleanup_dead_classes()
+{
+  for (auto it = class_name.begin(); it != class_name.end(); ) {
+    // step past the current entry before it can be removed
+    auto current = it++;
+    if (!_class_is_dead(current->first))
+      continue;
+    // copy the name first so the removal works on its own string
+    string doomed = current->second;
+    remove_class_name(doomed);
+  }
+}
+
+// Rebuild the per-class shadow hierarchy: snapshot the current
+// class_bucket table, clean up dead classes, trim the existing per-class
+// roots, then repopulate from scratch using the snapshot.
+// Returns a negative error from trim_roots_with_class(), otherwise the
+// result of populate_classes().
+int CrushWrapper::rebuild_roots_with_classes(CephContext *cct)
+{
+  // taken before anything is modified
+  std::map<int32_t, map<int32_t, int32_t> > previous_class_bucket = class_bucket;
+
+  cleanup_dead_classes();
+
+  const int trimmed = trim_roots_with_class(cct);
+  if (trimmed < 0)
+    return trimmed;
+
+  class_bucket.clear();
+  return populate_classes(previous_class_bucket);
+}
+
+// Serialize the crush map into `bl`.
+//
+// Layout, in order: magic, max_buckets / max_rules / max_devices, one
+// record per bucket slot, one record per rule slot, the type / item / rule
+// name maps, the tunables, and -- only if `features` includes
+// SERVER_LUMINOUS -- the device class tables and the choose_args.
+//
+// `features` also decides whether chooseleaf_stable is written
+// (CEPH_FEATURE_CRUSH_TUNABLES5) and whether straw2 item weights are
+// replaced by the first choose_args weight set (when the map has
+// choose_args but CRUSH_CHOOSE_ARGS is not in `features`).
+void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
+{
+  using ceph::encode;
+  ceph_assert(crush);
+
+  __u32 magic = CRUSH_MAGIC;
+  encode(magic, bl);
+
+  encode(crush->max_buckets, bl);
+  encode(crush->max_rules, bl);
+  encode(crush->max_devices, bl);
+
+  // Compat mode: the map has choose_args but the feature bit is absent, so
+  // the first choose_args map is folded into the straw2 bucket weights
+  // below.
+  bool encode_compat_choose_args = false;
+  crush_choose_arg_map arg_map;
+  memset(&arg_map, '\0', sizeof(arg_map));
+  if (has_choose_args() &&
+      !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
+    ceph_assert(!has_incompat_choose_args());
+    encode_compat_choose_args = true;
+    arg_map = choose_args.begin()->second;
+  }
+
+  // buckets
+  for (int i=0; i<crush->max_buckets; i++) {
+    // an alg of 0 marks an empty slot; nothing else is written for it
+    __u32 alg = 0;
+    if (crush->buckets[i]) alg = crush->buckets[i]->alg;
+    encode(alg, bl);
+    if (!alg)
+      continue;
+
+    // common bucket header (note that alg is written a second time here)
+    encode(crush->buckets[i]->id, bl);
+    encode(crush->buckets[i]->type, bl);
+    encode(crush->buckets[i]->alg, bl);
+    encode(crush->buckets[i]->hash, bl);
+    encode(crush->buckets[i]->weight, bl);
+    encode(crush->buckets[i]->size, bl);
+    for (unsigned j=0; j<crush->buckets[i]->size; j++)
+      encode(crush->buckets[i]->items[j], bl);
+
+    // algorithm specific payload
+    switch (crush->buckets[i]->alg) {
+    case CRUSH_BUCKET_UNIFORM:
+      encode((reinterpret_cast<crush_bucket_uniform*>(crush->buckets[i]))->item_weight, bl);
+      break;
+
+    case CRUSH_BUCKET_LIST:
+      for (unsigned j=0; j<crush->buckets[i]->size; j++) {
+	encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->item_weights[j], bl);
+	encode((reinterpret_cast<crush_bucket_list*>(crush->buckets[i]))->sum_weights[j], bl);
+      }
+      break;
+
+    case CRUSH_BUCKET_TREE:
+      encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes, bl);
+      for (unsigned j=0; j<(reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->num_nodes; j++)
+	encode((reinterpret_cast<crush_bucket_tree*>(crush->buckets[i]))->node_weights[j], bl);
+      break;
+
+    case CRUSH_BUCKET_STRAW:
+      for (unsigned j=0; j<crush->buckets[i]->size; j++) {
+	encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->item_weights[j], bl);
+	encode((reinterpret_cast<crush_bucket_straw*>(crush->buckets[i]))->straws[j], bl);
+      }
+      break;
+
+    case CRUSH_BUCKET_STRAW2:
+      {
+	// in compat mode, position 0 of the bucket's weight set (if it has
+	// one) stands in for the real item weights
+	__u32 *weights;
+	if (encode_compat_choose_args &&
+	    arg_map.args[i].weight_set_positions > 0) {
+	  weights = arg_map.args[i].weight_set[0].weights;
+	} else {
+	  weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
+	}
+	for (unsigned j=0; j<crush->buckets[i]->size; j++) {
+	  encode(weights[j], bl);
+	}
+      }
+      break;
+
+    default:
+      ceph_abort();
+      break;
+    }
+  }
+
+  // rules
+  for (unsigned i=0; i<crush->max_rules; i++) {
+    // presence flag first; empty slots carry nothing else
+    __u32 yes = crush->rules[i] ? 1:0;
+    encode(yes, bl);
+    if (!yes)
+      continue;
+
+    encode(crush->rules[i]->len, bl);
+    encode(crush->rules[i]->mask, bl);
+    for (unsigned j=0; j<crush->rules[i]->len; j++)
+      encode(crush->rules[i]->steps[j], bl);
+  }
+
+  // name info
+  encode(type_map, bl);
+  encode(name_map, bl);
+  encode(rule_name_map, bl);
+
+  // tunables
+  encode(crush->choose_local_tries, bl);
+  encode(crush->choose_local_fallback_tries, bl);
+  encode(crush->choose_total_tries, bl);
+  encode(crush->chooseleaf_descend_once, bl);
+  encode(crush->chooseleaf_vary_r, bl);
+  encode(crush->straw_calc_version, bl);
+  encode(crush->allowed_bucket_algs, bl);
+  if (features & CEPH_FEATURE_CRUSH_TUNABLES5) {
+    encode(crush->chooseleaf_stable, bl);
+  }
+
+  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+    // device classes
+    encode(class_map, bl);
+    encode(class_name, bl);
+    encode(class_bucket, bl);
+
+    // choose args
+    __u32 size = (__u32)choose_args.size();
+    encode(size, bl);
+    for (auto c : choose_args) {
+      encode(c.first, bl);
+      crush_choose_arg_map arg_map = c.second;
+      // first pass: count the buckets that actually carry a weight set or
+      // ids, because only those are written
+      size = 0;
+      for (__u32 i = 0; i < arg_map.size; i++) {
+	crush_choose_arg *arg = &arg_map.args[i];
+	if (arg->weight_set_positions == 0 &&
+	    arg->ids_size == 0)
+	  continue;
+	size++;
+      }
+      encode(size, bl);
+      // second pass: bucket index, then its weight sets, then its ids
+      for (__u32 i = 0; i < arg_map.size; i++) {
+	crush_choose_arg *arg = &arg_map.args[i];
+	if (arg->weight_set_positions == 0 &&
+	    arg->ids_size == 0)
+	  continue;
+	encode(i, bl);
+	encode(arg->weight_set_positions, bl);
+	for (__u32 j = 0; j < arg->weight_set_positions; j++) {
+	  crush_weight_set *weight_set = &arg->weight_set[j];
+	  encode(weight_set->size, bl);
+	  for (__u32 k = 0; k < weight_set->size; k++)
+	    encode(weight_set->weights[k], bl);
+	}
+	encode(arg->ids_size, bl);
+	for (__u32 j = 0; j < arg->ids_size; j++)
+	  encode(arg->ids[j], bl);
+      }
+    }
+  }
+}
+
+// Decode a map<int32,string> whose keys may have been written as either 32
+// or 64 bit integers.  A zero where the string length is expected is taken
+// to be the upper half of a 64 bit key, and the length is read again; this
+// relies on the strings never being empty.
+static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::const_iterator& blp)
+{
+  m.clear();
+  __u32 remaining;
+  decode(remaining, blp);
+  for (; remaining > 0; --remaining) {
+    __s32 key;
+    decode(key, blp);
+
+    __u32 len;
+    decode(len, blp);
+    if (len == 0) {
+      // der, key was actually 64-bits!
+      decode(len, blp);
+    }
+    string& value = m[key];
+    decode_nohead(len, value, blp);
+  }
+}
+
+// Rebuild this crush map from the encoded form at `blp`.
+//
+// Starts from a fresh map (create()), checks the magic number, then reads
+// buckets, rules, name maps, tunables, device classes and choose_args in
+// that order.  Everything after the name maps is optional: each group is
+// read only if the buffer has not ended yet, so maps written without the
+// later fields decode too and keep the legacy tunables set up front.
+//
+// Throws ceph::buffer::malformed_input on a bad magic number.  Any
+// exception raised after that point causes crush_destroy(crush) and is
+// rethrown.
+void CrushWrapper::decode(bufferlist::const_iterator& blp)
+{
+  using ceph::decode;
+  create();
+
+  __u32 magic;
+  decode(magic, blp);
+  if (magic != CRUSH_MAGIC)
+    throw ceph::buffer::malformed_input("bad magic number");
+
+  decode(crush->max_buckets, blp);
+  decode(crush->max_rules, blp);
+  decode(crush->max_devices, blp);
+
+  // legacy tunables, unless we decode something newer
+  set_tunables_legacy();
+
+  try {
+    // buckets
+    crush->buckets = (crush_bucket**)calloc(1, crush->max_buckets * sizeof(crush_bucket*));
+    for (int i=0; i<crush->max_buckets; i++) {
+      decode_crush_bucket(&crush->buckets[i], blp);
+    }
+
+    // rules
+    crush->rules = (crush_rule**)calloc(1, crush->max_rules * sizeof(crush_rule*));
+    for (unsigned i = 0; i < crush->max_rules; ++i) {
+      // presence flag: 0 means this rule slot is empty
+      __u32 yes;
+      decode(yes, blp);
+      if (!yes) {
+	crush->rules[i] = NULL;
+	continue;
+      }
+
+      // the step count comes first because it determines the allocation
+      __u32 len;
+      decode(len, blp);
+      crush->rules[i] = reinterpret_cast<crush_rule*>(calloc(1, crush_rule_size(len)));
+      crush->rules[i]->len = len;
+      decode(crush->rules[i]->mask, blp);
+      for (unsigned j=0; j<crush->rules[i]->len; j++)
+	decode(crush->rules[i]->steps[j], blp);
+    }
+
+    // name info
+    // NOTE: we had a bug where we were encoding int instead of int32, which means the
+    // 'key' field for these maps may be either 32 or 64 bits, depending.  tolerate
+    // both by assuming the string is always non-empty.
+    decode_32_or_64_string_map(type_map, blp);
+    decode_32_or_64_string_map(name_map, blp);
+    decode_32_or_64_string_map(rule_name_map, blp);
+
+    // tunables
+    // (each group below is present only if the buffer has not ended yet)
+    if (!blp.end()) {
+      decode(crush->choose_local_tries, blp);
+      decode(crush->choose_local_fallback_tries, blp);
+      decode(crush->choose_total_tries, blp);
+    }
+    if (!blp.end()) {
+      decode(crush->chooseleaf_descend_once, blp);
+    }
+    if (!blp.end()) {
+      decode(crush->chooseleaf_vary_r, blp);
+    }
+    if (!blp.end()) {
+      decode(crush->straw_calc_version, blp);
+    }
+    if (!blp.end()) {
+      decode(crush->allowed_bucket_algs, blp);
+    }
+    if (!blp.end()) {
+      decode(crush->chooseleaf_stable, blp);
+    }
+    // device classes
+    if (!blp.end()) {
+      decode(class_map, blp);
+      decode(class_name, blp);
+      // rebuild the reverse (name -> id) lookup from the decoded names
+      for (auto &c : class_name)
+	class_rname[c.second] = c.first;
+      decode(class_bucket, blp);
+    }
+    // choose args
+    if (!blp.end()) {
+      __u32 choose_args_size;
+      decode(choose_args_size, blp);
+      for (__u32 i = 0; i < choose_args_size; i++) {
+        typename decltype(choose_args)::key_type choose_args_index;
+	decode(choose_args_index, blp);
+	// one (zeroed) slot per bucket slot; only the buckets listed in the
+	// encoding get filled in below
+	// NOTE(review): arg_map is stored into choose_args only after it has
+	// been fully decoded, so an exception part-way through looks like it
+	// would leave these allocations unreferenced -- confirm.
+	crush_choose_arg_map arg_map;
+	arg_map.size = crush->max_buckets;
+	arg_map.args = static_cast<crush_choose_arg*>(calloc(
+	  arg_map.size, sizeof(crush_choose_arg)));
+	__u32 size;
+	decode(size, blp);
+	for (__u32 j = 0; j < size; j++) {
+	  __u32 bucket_index;
+	  decode(bucket_index, blp);
+	  ceph_assert(bucket_index < arg_map.size);
+	  crush_choose_arg *arg = &arg_map.args[bucket_index];
+	  decode(arg->weight_set_positions, blp);
+	  if (arg->weight_set_positions) {
+	    arg->weight_set = static_cast<crush_weight_set*>(calloc(
+	      arg->weight_set_positions, sizeof(crush_weight_set)));
+	    for (__u32 k = 0; k < arg->weight_set_positions; k++) {
+	      crush_weight_set *weight_set = &arg->weight_set[k];
+	      decode(weight_set->size, blp);
+	      weight_set->weights = (__u32*)calloc(
+		weight_set->size, sizeof(__u32));
+	      for (__u32 l = 0; l < weight_set->size; l++)
+		decode(weight_set->weights[l], blp);
+	    }
+	  }
+	  decode(arg->ids_size, blp);
+	  if (arg->ids_size) {
+	    // NOTE(review): crush->buckets[bucket_index] is dereferenced
+	    // without a null check; an encoding that lists ids for an empty
+	    // bucket slot would presumably crash here -- confirm.
+	    ceph_assert(arg->ids_size == crush->buckets[bucket_index]->size);
+	    arg->ids = (__s32 *)calloc(arg->ids_size, sizeof(__s32));
+	    for (__u32 k = 0; k < arg->ids_size; k++)
+	      decode(arg->ids[k], blp);
+	  }
+	}
+	choose_args[choose_args_index] = arg_map;
+      }
+    }
+    update_choose_args(nullptr); // in case we decode a legacy "corrupted" map
+    finalize();
+  }
+  catch (...) {
+    // tear down the partially built map before passing the error on
+    crush_destroy(crush);
+    throw;
+  }
+}
+
+// Decode one bucket slot from `blp` into *bptr.
+//
+// The slot starts with an algorithm id: 0 means "empty slot" and sets
+// *bptr to NULL.  Otherwise a bucket structure of the matching type is
+// allocated, stored in *bptr, and filled from the common header followed by
+// the algorithm specific payload.
+//
+// Throws ceph::buffer::malformed_input if the algorithm is not supported,
+// or if the algorithm repeated inside the bucket header disagrees with the
+// one that selected the structure type.
+void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::const_iterator &blp)
+{
+  using ceph::decode;
+  __u32 alg;
+  decode(alg, blp);
+  if (!alg) {
+    *bptr = NULL;
+    return;
+  }
+
+  // the leading algorithm id decides which concrete struct to allocate
+  int size = 0;
+  switch (alg) {
+  case CRUSH_BUCKET_UNIFORM:
+    size = sizeof(crush_bucket_uniform);
+    break;
+  case CRUSH_BUCKET_LIST:
+    size = sizeof(crush_bucket_list);
+    break;
+  case CRUSH_BUCKET_TREE:
+    size = sizeof(crush_bucket_tree);
+    break;
+  case CRUSH_BUCKET_STRAW:
+    size = sizeof(crush_bucket_straw);
+    break;
+  case CRUSH_BUCKET_STRAW2:
+    size = sizeof(crush_bucket_straw2);
+    break;
+  default:
+    {
+      char str[128];
+      snprintf(str, sizeof(str), "unsupported bucket algorithm: %d", alg);
+      throw ceph::buffer::malformed_input(str);
+    }
+  }
+  crush_bucket *bucket = reinterpret_cast<crush_bucket*>(calloc(1, size));
+  *bptr = bucket;
+
+  decode(bucket->id, blp);
+  decode(bucket->type, blp);
+  decode(bucket->alg, blp);
+  if (bucket->alg != alg) {
+    // The payload switch below casts the bucket according to bucket->alg,
+    // but the allocation above was sized by the leading alg.  If malformed
+    // input makes the two disagree, decoding would write through a struct
+    // type larger than the allocation.  Restore the algorithm that matches
+    // the allocation, so that whoever releases the bucket sees a consistent
+    // object, and reject the input.
+    bucket->alg = alg;
+    throw ceph::buffer::malformed_input("bucket algorithm mismatch");
+  }
+  decode(bucket->hash, blp);
+  decode(bucket->weight, blp);
+  decode(bucket->size, blp);
+
+  bucket->items = (__s32*)calloc(1, bucket->size * sizeof(__s32));
+  for (unsigned j = 0; j < bucket->size; ++j) {
+    decode(bucket->items[j], blp);
+  }
+
+  // algorithm specific payload
+  switch (bucket->alg) {
+  case CRUSH_BUCKET_UNIFORM:
+    decode((reinterpret_cast<crush_bucket_uniform*>(bucket))->item_weight, blp);
+    break;
+
+  case CRUSH_BUCKET_LIST: {
+    crush_bucket_list* cbl = reinterpret_cast<crush_bucket_list*>(bucket);
+    cbl->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    cbl->sum_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+
+    // stored interleaved: item weight, then running sum, per item
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbl->item_weights[j], blp);
+      decode(cbl->sum_weights[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_TREE: {
+    crush_bucket_tree* cbt = reinterpret_cast<crush_bucket_tree*>(bucket);
+    decode(cbt->num_nodes, blp);
+    cbt->node_weights = (__u32*)calloc(1, cbt->num_nodes * sizeof(__u32));
+    for (unsigned j=0; j<cbt->num_nodes; j++) {
+      decode(cbt->node_weights[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_STRAW: {
+    crush_bucket_straw* cbs = reinterpret_cast<crush_bucket_straw*>(bucket);
+    cbs->straws = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    // stored interleaved: item weight, then straw, per item
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbs->item_weights[j], blp);
+      decode(cbs->straws[j], blp);
+    }
+    break;
+  }
+
+  case CRUSH_BUCKET_STRAW2: {
+    crush_bucket_straw2* cbs = reinterpret_cast<crush_bucket_straw2*>(bucket);
+    cbs->item_weights = (__u32*)calloc(1, bucket->size * sizeof(__u32));
+    for (unsigned j = 0; j < bucket->size; ++j) {
+      decode(cbs->item_weights[j], blp);
+    }
+    break;
+  }
+
+  default:
+    // We should have handled this case in the first switch statement
+    ceph_abort();
+    break;
+  }
+}
+
+  
+// Dump the whole crush map to `f`: a "devices" array, a "types" array, a
+// "buckets" array (with each bucket's items), the "rules" array, the
+// "tunables" object and finally the choose_args.
+void CrushWrapper::dump(Formatter *f) const
+{
+  // --- devices ---
+  f->open_array_section("devices");
+  const int max_devices = get_max_devices();
+  for (int dev = 0; dev < max_devices; ++dev) {
+    f->open_object_section("device");
+    f->dump_int("id", dev);
+    if (const char *dev_name = get_item_name(dev)) {
+      f->dump_string("name", dev_name);
+    } else {
+      // unnamed devices get a synthetic "device<N>" name
+      char fallback[20];
+      snprintf(fallback, sizeof(fallback), "device%d", dev);
+      f->dump_string("name", fallback);
+    }
+    if (const char *dev_class = get_item_class(dev))
+      f->dump_string("class", dev_class);
+    f->close_section();
+  }
+  f->close_section();
+
+  // --- types ---
+  f->open_array_section("types");
+  // type ids may be sparse: keep probing ids until every named type has
+  // been emitted
+  int remaining = get_num_type_names();
+  for (int type_id = 0; remaining; ++type_id) {
+    const char *type_name = get_type_name(type_id);
+    if (type_name) {
+      --remaining;
+      f->open_object_section("type");
+      f->dump_int("type_id", type_id);
+      f->dump_string("name", type_name);
+      f->close_section();
+    } else if (type_id == 0) {
+      // type 0 without a name is reported as "device"
+      f->open_object_section("type");
+      f->dump_int("type_id", 0);
+      f->dump_string("name", "device");
+      f->close_section();
+    }
+  }
+  f->close_section();
+
+  // --- buckets ---
+  f->open_array_section("buckets");
+  const int lowest_bucket = -1 - get_max_buckets();
+  for (int bucket = -1; bucket > lowest_bucket; --bucket) {
+    if (!bucket_exists(bucket))
+      continue;
+    f->open_object_section("bucket");
+    f->dump_int("id", bucket);
+    if (const char *bucket_name = get_item_name(bucket))
+      f->dump_string("name", bucket_name);
+    const int bucket_type = get_bucket_type(bucket);
+    f->dump_int("type_id", bucket_type);
+    if (const char *bucket_type_name = get_type_name(bucket_type))
+      f->dump_string("type_name", bucket_type_name);
+    f->dump_int("weight", get_bucket_weight(bucket));
+    f->dump_string("alg", crush_bucket_alg_name(get_bucket_alg(bucket)));
+    f->dump_string("hash", crush_hash_name(get_bucket_hash(bucket)));
+
+    f->open_array_section("items");
+    for (int pos = 0; pos < get_bucket_size(bucket); ++pos) {
+      f->open_object_section("item");
+      f->dump_int("id", get_bucket_item(bucket, pos));
+      f->dump_int("weight", get_bucket_item_weight(bucket, pos));
+      f->dump_int("pos", pos);
+      f->close_section();
+    }
+    f->close_section();   // items
+    f->close_section();   // bucket
+  }
+  f->close_section();
+
+  // --- rules ---
+  f->open_array_section("rules");
+  dump_rules(f);
+  f->close_section();
+
+  // --- tunables ---
+  f->open_object_section("tunables");
+  dump_tunables(f);
+  f->close_section();
+
+  dump_choose_args(f);
+}
+
+namespace {
+  // Walks the crush hierarchy depth first, starting from every root, and
+  // writes each visited bucket/device to a Formatter.
+  class TreeDumper {
+    typedef CrushTreeDumper::Item Item;
+    const CrushWrapper *crush;
+    const CrushTreeDumper::name_map_t& weight_set_names;
+  public:
+    explicit TreeDumper(const CrushWrapper *crush,
+			const CrushTreeDumper::name_map_t& wsnames)
+      : crush(crush), weight_set_names(wsnames) {}
+
+    // Dump one subtree per root reported by find_roots().
+    void dump(Formatter *f) {
+      set<int> roots;
+      crush->find_roots(&roots);
+      for (const int root_id : roots) {
+	const Item root_item(root_id, 0, 0, crush->get_bucket_weightf(root_id));
+	dump_item(root_item, f);
+      }
+    }
+
+  private:
+    // Emit a single node; buckets additionally carry their children.
+    void dump_item(const Item& qi, Formatter* f) {
+      const bool bucket = qi.is_bucket();
+      if (bucket)
+	f->open_object_section("bucket");
+      else
+	f->open_object_section("device");
+      CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
+      if (bucket)
+	dump_bucket_children(qi, f);
+      f->close_section();
+    }
+
+    // Emit every item of bucket `parent` as an "items" array, one level deeper.
+    void dump_bucket_children(const Item& parent, Formatter* f) {
+      f->open_array_section("items");
+      const int child_count = crush->get_bucket_size(parent.id);
+      const int child_depth = parent.depth + 1;
+      for (int idx = 0; idx < child_count; ++idx) {
+	const int child_id = crush->get_bucket_item(parent.id, idx);
+	const float child_weight = crush->get_bucket_item_weightf(parent.id, idx);
+	dump_item(Item(child_id, parent.id, child_depth, child_weight), f);
+      }
+      f->close_section();
+    }
+  };
+}
+
+// Dump the hierarchy as nested bucket/device objects; f must not be null.
+void CrushWrapper::dump_tree(
+  Formatter *f,
+  const CrushTreeDumper::name_map_t& weight_set_names) const
+{
+  ceph_assert(f);
+  TreeDumper walker(this, weight_set_names);
+  walker.dump(f);
+}
+
+// Dump the tunables, the best matching named profile and the derived
+// feature/version requirements to the formatter.
+void CrushWrapper::dump_tunables(Formatter *f) const
+{
+  f->dump_int("choose_local_tries", get_choose_local_tries());
+  f->dump_int("choose_local_fallback_tries", get_choose_local_fallback_tries());
+  f->dump_int("choose_total_tries", get_choose_total_tries());
+  f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once());
+  f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r());
+  f->dump_int("chooseleaf_stable", get_chooseleaf_stable());
+  f->dump_int("straw_calc_version", get_straw_calc_version());
+  f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs());
+
+  // name the profile; the checks run from jewel down to argonaut and the
+  // first one that matches wins
+  const char *profile = "unknown";
+  if (has_jewel_tunables()) {
+    profile = "jewel";
+  } else if (has_hammer_tunables()) {
+    profile = "hammer";
+  } else if (has_firefly_tunables()) {
+    profile = "firefly";
+  } else if (has_bobtail_tunables()) {
+    profile = "bobtail";
+  } else if (has_argonaut_tunables()) {
+    profile = "argonaut";
+  }
+  f->dump_string("profile", profile);
+
+  const int optimal = (int)has_optimal_tunables();
+  f->dump_int("optimal_tunables", optimal);
+  const int legacy = (int)has_legacy_tunables();
+  f->dump_int("legacy_tunables", legacy);
+
+  // report the minimum version required
+  f->dump_string("minimum_required_version", get_min_required_version());
+
+  f->dump_int("require_feature_tunables", (int)has_nondefault_tunables());
+  f->dump_int("require_feature_tunables2", (int)has_nondefault_tunables2());
+  f->dump_int("has_v2_rules", (int)has_v2_rules());
+  f->dump_int("require_feature_tunables3", (int)has_nondefault_tunables3());
+  f->dump_int("has_v3_rules", (int)has_v3_rules());
+  f->dump_int("has_v4_buckets", (int)has_v4_buckets());
+  f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5());
+  f->dump_int("has_v5_rules", (int)has_v5_rules());
+}
+
+// Dump every choose_args map: one array per map id, holding one object per
+// bucket that has either a weight set or an id remapping.
+void CrushWrapper::dump_choose_args(Formatter *f) const
+{
+  f->open_object_section("choose_args");
+  for (const auto& entry : choose_args) {
+    const crush_choose_arg_map& arg_map = entry.second;
+    f->open_array_section(stringify(entry.first).c_str());
+    for (__u32 idx = 0; idx < arg_map.size; ++idx) {
+      const crush_choose_arg *arg = &arg_map.args[idx];
+      const bool has_weights = arg->weight_set_positions > 0;
+      const bool has_ids = arg->ids_size > 0;
+      if (!has_weights && !has_ids)
+	continue;   // nothing recorded for this bucket
+      f->open_object_section("choose_args");
+      // slot idx in the args array belongs to bucket id -1-idx
+      const int bucket_index = idx;
+      f->dump_int("bucket_id", -1-bucket_index);
+      if (has_weights) {
+	f->open_array_section("weight_set");
+	for (__u32 position = 0; position < arg->weight_set_positions; ++position) {
+	  f->open_array_section("weights");
+	  const __u32 *weights = arg->weight_set[position].weights;
+	  const __u32 count = arg->weight_set[position].size;
+	  for (__u32 k = 0; k < count; ++k) {
+	    // stored weights are scaled by 0x10000
+	    f->dump_float("weight", (float)weights[k]/(float)0x10000);
+	  }
+	  f->close_section();
+	}
+	f->close_section();
+      }
+      if (has_ids) {
+	f->open_array_section("ids");
+	for (__u32 k = 0; k < arg->ids_size; ++k)
+	  f->dump_int("id", arg->ids[k]);
+	f->close_section();
+      }
+      f->close_section();
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump every existing rule, in ascending rule id order.
+void CrushWrapper::dump_rules(Formatter *f) const
+{
+  for (int rule_id = 0; rule_id < get_max_rules(); ++rule_id) {
+    if (rule_exists(rule_id)) {
+      dump_rule(rule_id, f);
+    }
+  }
+}
+
+/**
+ * Dump a single rule (id, optional name, mask fields and every step) as a
+ * "rule" object.
+ *
+ * @param ruleset id of the rule to dump
+ * @param f       formatter that receives the output
+ */
+void CrushWrapper::dump_rule(int ruleset, Formatter *f) const
+{
+  // All four choose/chooseleaf variants share the same layout.
+  // get_type_name() is null-checked elsewhere in this file, so treat a missing
+  // name as "" (like item_name in the TAKE step) instead of handing a null
+  // pointer to dump_string().
+  auto dump_choose_step = [&](const char *op, int step) {
+    f->dump_string("op", op);
+    f->dump_int("num", get_rule_arg1(ruleset, step));
+    const char *type_name = get_type_name(get_rule_arg2(ruleset, step));
+    f->dump_string("type", type_name ? type_name : "");
+  };
+
+  f->open_object_section("rule");
+  f->dump_int("rule_id", ruleset);
+  const char *rule_name = get_rule_name(ruleset);
+  if (rule_name)
+    f->dump_string("rule_name", rule_name);
+  f->dump_int("ruleset", get_rule_mask_ruleset(ruleset));
+  f->dump_int("type", get_rule_mask_type(ruleset));
+  f->dump_int("min_size", get_rule_mask_min_size(ruleset));
+  f->dump_int("max_size", get_rule_mask_max_size(ruleset));
+  f->open_array_section("steps");
+  for (int j=0; j<get_rule_len(ruleset); j++) {
+    f->open_object_section("step");
+    switch (get_rule_op(ruleset, j)) {
+    case CRUSH_RULE_NOOP:
+      f->dump_string("op", "noop");
+      break;
+    case CRUSH_RULE_TAKE:
+      f->dump_string("op", "take");
+      {
+        int item = get_rule_arg1(ruleset, j);
+        f->dump_int("item", item);
+
+        const char *name = get_item_name(item);
+        f->dump_string("item_name", name ? name : "");
+      }
+      break;
+    case CRUSH_RULE_EMIT:
+      f->dump_string("op", "emit");
+      break;
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+      dump_choose_step("choose_firstn", j);
+      break;
+    case CRUSH_RULE_CHOOSE_INDEP:
+      dump_choose_step("choose_indep", j);
+      break;
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+      dump_choose_step("chooseleaf_firstn", j);
+      break;
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      dump_choose_step("chooseleaf_indep", j);
+      break;
+    case CRUSH_RULE_SET_CHOOSE_TRIES:
+      f->dump_string("op", "set_choose_tries");
+      f->dump_int("num", get_rule_arg1(ruleset, j));
+      break;
+    case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+      f->dump_string("op", "set_chooseleaf_tries");
+      f->dump_int("num", get_rule_arg1(ruleset, j));
+      break;
+    default:
+      // op without a symbolic name here: dump the raw step
+      f->dump_int("opcode", get_rule_op(ruleset, j));
+      f->dump_int("arg1", get_rule_arg1(ruleset, j));
+      f->dump_int("arg2", get_rule_arg2(ruleset, j));
+    }
+    f->close_section();
+  }
+  f->close_section();
+  f->close_section();
+}
+
+// Emit the name of every existing rule as a "name" string.
+void CrushWrapper::list_rules(Formatter *f) const
+{
+  for (int rule_id = 0; rule_id < get_max_rules(); ++rule_id) {
+    if (rule_exists(rule_id)) {
+      f->dump_string("name", get_rule_name(rule_id));
+    }
+  }
+}
+
+// Write the name of every existing rule to *ss, one per line.
+void CrushWrapper::list_rules(ostream *ss) const
+{
+  for (int rule_id = 0; rule_id < get_max_rules(); ++rule_id) {
+    if (rule_exists(rule_id)) {
+      *ss << get_rule_name(rule_id) << "\n";
+    }
+  }
+}
+
+/**
+ * Renders the crush tree into a TextTable: one row per item with its id,
+ * device class, weight, one column per choose_args map, and an indented
+ * "TYPE NAME" column.
+ */
+class CrushTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
+public:
+  typedef CrushTreeDumper::Dumper<TextTable> Parent;
+
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+				const CrushTreeDumper::name_map_t& wsnames)
+    : Parent(crush, wsnames) {}
+  explicit CrushTreePlainDumper(const CrushWrapper *crush,
+                                const CrushTreeDumper::name_map_t& wsnames,
+                                bool show_shadow)
+    : Parent(crush, wsnames, show_shadow) {}
+
+
+  // Define the table columns, then let the base class walk the tree.
+  void dump(TextTable *tbl) {
+    tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+    tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+    // one extra column per choose_args map; dump_item() fills them in the
+    // same iteration order
+    for (auto& p : crush->choose_args) {
+      if (p.first == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
+	tbl->define_column("(compat)", TextTable::LEFT, TextTable::RIGHT);
+      } else {
+	string name;
+	auto q = weight_set_names.find(p.first);
+	name = q != weight_set_names.end() ? q->second :
+	  stringify(p.first);
+	tbl->define_column(name.c_str(), TextTable::LEFT, TextTable::RIGHT);
+      }
+    }
+    tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+    Parent::dump(tbl);
+  }
+
+protected:
+  // Append the row for one item.
+  void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
+    const char *c = crush->get_item_class(qi.id);
+    if (!c)
+      c = "";
+    *tbl << qi.id
+	 << c
+	 << weightf_t(qi.weight);
+    for (auto& p : crush->choose_args) {
+      if (qi.parent < 0) {
+	const crush_choose_arg_map cmap = crush->choose_args_get(p.first);
+	int bidx = -1 - qi.parent;
+	const crush_bucket *b = crush->get_bucket(qi.parent);
+	if (b &&
+	    bidx < (int)cmap.size &&
+	    cmap.args[bidx].weight_set &&
+	    cmap.args[bidx].weight_set_positions >= 1) {
+	  const crush_weight_set& ws = cmap.args[bidx].weight_set[0];
+	  // find the item's position in the parent bucket, staying inside
+	  // both the bucket's item array and the weight set
+	  int pos;
+	  for (pos = 0;
+	       pos < (int)ws.size &&
+		 pos < (int)b->size &&
+		 b->items[pos] != qi.id;
+	       ++pos) ;
+	  if (pos < (int)ws.size && pos < (int)b->size) {
+	    *tbl << weightf_t((float)ws.weights[pos] / (float)0x10000);
+	    continue;
+	  }
+	  // not found: fall through and emit an empty cell rather than
+	  // reading one element past the end of the weights array
+	}
+      }
+      *tbl << "";
+    }
+    ostringstream ss;
+    for (int k=0; k < qi.depth; k++) {
+      ss << "    ";
+    }
+    if (qi.is_bucket()) {
+      ss << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
+	 << crush->get_item_name(qi.id);
+    } else {
+      ss << "osd." << qi.id;
+    }
+    *tbl << ss.str();
+    *tbl << TextTable::endrow;
+  }
+};
+
+
+// Formatter flavour of the tree dump: a "nodes" array produced by the base
+// class, followed by a "stray" array of items the walk did not touch.
+class CrushTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
+public:
+  typedef CrushTreeDumper::FormattingDumper Parent;
+
+  explicit CrushTreeFormattingDumper(
+    const CrushWrapper *crush,
+    const CrushTreeDumper::name_map_t& wsnames)
+    : Parent(crush, wsnames) {}
+
+  explicit CrushTreeFormattingDumper(
+    const CrushWrapper *crush,
+    const CrushTreeDumper::name_map_t& wsnames,
+    bool show_shadow)
+    : Parent(crush, wsnames, show_shadow) {}
+
+  void dump(Formatter *f) {
+    f->open_array_section("nodes");
+    Parent::dump(f);
+    f->close_section();
+
+    // Stray buckets (negative ids) do not occur, so only ids 0..max_id
+    // need to be scanned for stray osds; max_id is the largest key in
+    // name_map, or -1 (empty scan) when the map has no entries.
+    f->open_array_section("stray");
+    const int32_t max_id =
+      crush->name_map.empty() ? -1 : crush->name_map.rbegin()->first;
+    for (int32_t id = 0; id <= max_id; ++id) {
+      if (!crush->item_exists(id))
+        continue;
+      if (is_touched(id))
+        continue;
+      if (!should_dump(id))
+        continue;
+      dump_item(CrushTreeDumper::Item(id, 0, 0, 0), f);
+    }
+    f->close_section();
+  }
+};
+
+
+// Dump the tree as a plain text table to *out and/or as structured output
+// to f; either pointer may be null to skip that form.
+void CrushWrapper::dump_tree(
+  ostream *out,
+  Formatter *f,
+  const CrushTreeDumper::name_map_t& weight_set_names,
+  bool show_shadow) const
+{
+  if (out) {
+    TextTable table;
+    CrushTreePlainDumper plain(this, weight_set_names, show_shadow);
+    plain.dump(&table);
+    *out << table;
+  }
+  if (f) {
+    CrushTreeFormattingDumper structured(this, weight_set_names, show_shadow);
+    structured.dump(f);
+  }
+}
+
+// Append test instances to o; currently just one default-constructed
+// wrapper, whose ownership passes to the list's consumer.
+void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
+{
+  CrushWrapper *instance = new CrushWrapper;
+  o.push_back(instance);
+  // fixme
+}
+
+/**
+ * Pick the CRUSH ruleset ID that newly created replicated pools
+ * should use by default.
+ *
+ * A negative "osd_pool_default_crush_rule" setting means "use the first
+ * replicated ruleset"; otherwise the configured id is used if it exists.
+ *
+ * @returns a ruleset ID (>=0) or -1 if no suitable ruleset found
+ */
+int CrushWrapper::get_osd_pool_default_crush_replicated_ruleset(CephContext *cct)
+{
+  const int configured = cct->_conf.get_val<int64_t>("osd_pool_default_crush_rule");
+  if (configured < 0) {
+    return find_first_ruleset(pg_pool_t::TYPE_REPLICATED);
+  }
+  if (ruleset_exists(configured)) {
+    return configured;
+  }
+  return -1; // match find_first_ruleset() retval
+}
+
+// A crush name is valid when it is non-empty and made up only of
+// '-', '_', '.', ASCII digits and ASCII letters.
+bool CrushWrapper::is_valid_crush_name(const string& s)
+{
+  if (s.empty())
+    return false;
+  for (const char ch : s) {
+    const bool punct = ch == '-' || ch == '_' || ch == '.';
+    const bool digit = ch >= '0' && ch <= '9';
+    const bool upper = ch >= 'A' && ch <= 'Z';
+    const bool lower = ch >= 'a' && ch <= 'z';
+    if (!(punct || digit || upper || lower))
+      return false;
+  }
+  return true;
+}
+
+// A crush location is valid when every key and every value is a valid
+// crush name; the first offending pair is logged at level 1.
+bool CrushWrapper::is_valid_crush_loc(CephContext *cct,
+                                      const map<string,string>& loc)
+{
+  for (const auto& kv : loc) {
+    if (is_valid_crush_name(kv.first) &&
+        is_valid_crush_name(kv.second)) {
+      continue;
+    }
+    ldout(cct, 1) << "loc["
+                  << kv.first << "] = '"
+                  << kv.second << "' not a valid crush name ([A-Za-z0-9_-.]+)"
+                  << dendl;
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Replay one stack of (type, fan-out) choose levels over the original
+ * mapping, swapping overfull leaves for underfull ones where possible.
+ *
+ * @param cct            context used for logging
+ * @param stack          (type, fan-out) per level, outermost first
+ * @param overfull       devices that should be moved away from
+ * @param underfull      preferred replacement candidates
+ * @param more_underfull replacement candidates tried when none of
+ *                       `underfull` fits
+ * @param orig           the original mapping being walked
+ * @param i              cursor into `orig`; advanced past every leaf that
+ *                       is consumed (kept or replaced)
+ * @param used           replacements picked so far; each new pick is added
+ * @param pw             in: the working set to expand; out: the result of
+ *                       the last level processed
+ * @param root_bucket    bucket taken by the rule; must be negative
+ * @param rule           rule id passed through to get_parent_of_type()
+ * @returns 0 (no other value is returned by this implementation)
+ */
+int CrushWrapper::_choose_type_stack(
+  CephContext *cct,
+  const vector<pair<int,int>>& stack,
+  const set<int>& overfull,
+  const vector<int>& underfull,
+  const vector<int>& more_underfull,
+  const vector<int>& orig,
+  vector<int>::const_iterator& i,
+  set<int>& used,
+  vector<int> *pw,
+  int root_bucket,
+  int rule) const
+{
+  vector<int> w = *pw;
+
+  ldout(cct, 10) << __func__ << " stack " << stack
+		 << " orig " << orig
+		 << " at " << *i
+		 << " pw " << *pw
+		 << dendl;
+  ceph_assert(root_bucket < 0);
+  // cumulative_fanout[j] = product of the fan-outs of all levels below j,
+  // i.e. how many leaves of orig each choice made at level j covers
+  vector<int> cumulative_fanout(stack.size());
+  int f = 1;
+  for (int j = (int)stack.size() - 1; j >= 0; --j) {
+    cumulative_fanout[j] = f;
+    f *= stack[j].second;
+  }
+  ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
+		 << dendl;
+
+  // identify underfull targets for each intermediate level.
+  // this serves two purposes:
+  //   1. we can tell when we are selecting a bucket that does not have any underfull
+  //      devices beneath it.  that means that if the current input includes an overfull
+  //      device, we won't be able to find an underfull device with this parent to
+  //      swap for it.
+  //   2. when we decide we should reject a bucket due to the above, this list gives us
+  //      a list of peers to consider that *do* have underfull devices available..  (we
+  //      are careful to pick one that has the same parent.)
+  // NOTE(review): assumes stack is non-empty (stack.size() - 1 is unsigned);
+  // the visible caller only passes non-empty stacks -- confirm for others.
+  vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
+  underfull_buckets.resize(stack.size() - 1);
+  for (auto osd : underfull) {
+    int item = osd;
+    for (int j = (int)stack.size() - 2; j >= 0; --j) {
+      int type = stack[j].first;
+      item = get_parent_of_type(item, type, rule);
+      ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
+		     << " is " << item << dendl;
+      if (!subtree_contains(root_bucket, item)) {
+        ldout(cct, 20) << __func__ << " not in root subtree " << root_bucket << dendl;
+        continue;
+      }
+      underfull_buckets[j].insert(item);
+    }
+  }
+  ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
+
+  for (unsigned j = 0; j < stack.size(); ++j) {
+    int type = stack[j].first;
+    int fanout = stack[j].second;
+    int cum_fanout = cumulative_fanout[j];
+    ldout(cct, 10) << " level " << j << ": type " << type << " fanout " << fanout
+		   << " cumulative " << cum_fanout
+		   << " w " << w << dendl;
+    vector<int> o;   // output of this level; becomes w for the next one
+    auto tmpi = i;   // look-ahead cursor for non-leaf levels; i itself only
+		     // moves when a leaf is consumed
+    if (i == orig.end()) {
+      ldout(cct, 10) << __func__ << " end of orig, break 0" << dendl;
+      break;
+    }
+    for (auto from : w) {
+      ldout(cct, 10) << " from " << from << dendl;
+      // identify leaves under each choice.  we use this to check whether any of these
+      // leaves are overfull.  (if so, we need to make sure there are underfull candidates
+      // to swap for them.)
+      vector<set<int>> leaves;
+      leaves.resize(fanout);
+      for (int pos = 0; pos < fanout; ++pos) {
+	if (type > 0) {
+	  // non-leaf
+	  // NOTE(review): *tmpi is read here and in the log below without a
+	  // check against orig.end(); confirm orig always holds enough
+	  // entries for the full fan-out.
+	  int item = get_parent_of_type(*tmpi, type, rule);
+	  o.push_back(item);
+	  int n = cum_fanout;
+	  while (n-- && tmpi != orig.end()) {
+	    leaves[pos].insert(*tmpi++);
+	  }
+	  ldout(cct, 10) << __func__ << "   from " << *tmpi << " got " << item
+			 << " of type " << type << " over leaves " << leaves[pos] << dendl;
+	} else {
+	  // leaf
+	  bool replaced = false;
+	  if (overfull.count(*i)) {
+	    // first choice: an unused underfull device under `from` that is
+	    // not already part of orig
+	    for (auto item : underfull) {
+	      ldout(cct, 10) << __func__ << " pos " << pos
+			     << " was " << *i << " considering " << item
+			     << dendl;
+	      if (used.count(item)) {
+		ldout(cct, 20) << __func__ << "   in used " << used << dendl;
+		continue;
+	      }
+	      if (!subtree_contains(from, item)) {
+		ldout(cct, 20) << __func__ << "   not in subtree " << from << dendl;
+		continue;
+	      }
+	      if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
+		ldout(cct, 20) << __func__ << "   in orig " << orig << dendl;
+		continue;
+	      }
+	      o.push_back(item);
+	      used.insert(item);
+	      ldout(cct, 10) << __func__ << " pos " << pos << " replace "
+			     << *i << " -> " << item << dendl;
+	      replaced = true;
+              ceph_assert(i != orig.end());
+	      ++i;
+	      break;
+	    }
+	    if (!replaced) {
+	      // second choice: same filters, applied to more_underfull
+	      for (auto item : more_underfull) {
+	        ldout(cct, 10) << __func__ << " more underfull pos " << pos
+			       << " was " << *i << " considering " << item
+			       << dendl;
+	        if (used.count(item)) {
+		  ldout(cct, 20) << __func__ << "   in used " << used << dendl;
+		  continue;
+	        }
+	        if (!subtree_contains(from, item)) {
+		  ldout(cct, 20) << __func__ << "   not in subtree " << from << dendl;
+		  continue;
+	        }
+	        if (std::find(orig.begin(), orig.end(), item) != orig.end()) {
+		  ldout(cct, 20) << __func__ << "   in orig " << orig << dendl;
+		  continue;
+	        }
+	        o.push_back(item);
+	        used.insert(item);
+	        ldout(cct, 10) << __func__ << " pos " << pos << " replace "
+			       << *i << " -> " << item << dendl;
+	        replaced = true;
+                ceph_assert(i != orig.end());
+	        ++i;
+	        break;
+	      }
+	    }
+	  }
+	  if (!replaced) {
+	    ldout(cct, 10) << __func__ << " pos " << pos << " keep " << *i
+			   << dendl;
+            ceph_assert(i != orig.end());
+	    o.push_back(*i);
+	    ++i;
+	  }
+	  if (i == orig.end()) {
+	    ldout(cct, 10) << __func__ << " end of orig, break 1" << dendl;
+	    break;
+	  }
+	}
+      }
+      if (j + 1 < stack.size()) {
+	// check if any buckets have overfull leaves but no underfull candidates
+	// NOTE(review): o collects the picks for every `from` in w while
+	// leaves is rebuilt per `from`, so o[pos] indexes from the start of
+	// o -- confirm this is intended when w has more than one entry.
+	for (int pos = 0; pos < fanout; ++pos) {
+	  if (underfull_buckets[j].count(o[pos]) == 0) {
+	    // are any leaves overfull?
+	    bool any_overfull = false;
+	    for (auto osd : leaves[pos]) {
+	      if (overfull.count(osd)) {
+		any_overfull = true;
+		break;
+	      }
+	    }
+	    if (any_overfull) {
+	      ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
+			     << ">0 leaves " << leaves[pos] << " is overfull; alts "
+			     << underfull_buckets[j]
+			     << dendl;
+	      for (auto alt : underfull_buckets[j]) {
+		if (std::find(o.begin(), o.end(), alt) == o.end()) {
+		  // see if alt has the same parent
+		  if (j == 0 ||
+		      get_parent_of_type(o[pos], stack[j-1].first, rule) ==
+		      get_parent_of_type(alt, stack[j-1].first, rule)) {
+		    if (j)
+		      ldout(cct, 10) << "  replacing " << o[pos]
+				     << " (which has no underfull leaves) with " << alt
+				     << " (same parent "
+				     << get_parent_of_type(alt, stack[j-1].first, rule) << " type "
+				     << type << ")" << dendl;
+		    else
+		      ldout(cct, 10) << "  replacing " << o[pos]
+				     << " (which has no underfull leaves) with " << alt
+				     << " (first level)" << dendl;
+		    o[pos] = alt;
+		    break;
+		  } else {
+		    ldout(cct, 30) << "  alt " << alt << " for " << o[pos]
+				   << " has different parent, skipping" << dendl;
+		  }
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      if (i == orig.end()) {
+	ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
+	break;
+      }
+    }
+    ldout(cct, 10) << __func__ << "  w <- " << o << " was " << w << dendl;
+    w.swap(o);
+  }
+  *pw = w;
+  return 0;
+}
+
+/**
+ * Walk the steps of rule `ruleno` over the original mapping `orig` and
+ * build a remapped result in *out via _choose_type_stack().
+ *
+ * @param ruleno         rule to follow; must exist
+ * @param maxout         added to a step's count when that count is <= 0
+ * @param overfull       devices to move away from
+ * @param underfull      preferred replacement candidates
+ * @param more_underfull secondary replacement candidates
+ * @param orig           the original mapping
+ * @param out            cleared, then filled with the items emitted
+ * @returns 0, or a negative value propagated from _choose_type_stack()
+ */
+int CrushWrapper::try_remap_rule(
+  CephContext *cct,
+  int ruleno,
+  int maxout,
+  const set<int>& overfull,
+  const vector<int>& underfull,
+  const vector<int>& more_underfull,
+  const vector<int>& orig,
+  vector<int> *out) const
+{
+  const crush_map *cmap = crush;
+  const crush_rule *rule = get_rule(ruleno);
+  ceph_assert(rule);
+
+  ldout(cct, 10) << __func__ << " ruleno " << ruleno
+		<< " numrep " << maxout << " overfull " << overfull
+		<< " underfull " << underfull
+		<< " more_underfull " << more_underfull
+		<< " orig " << orig
+		<< dendl;
+  vector<int> w; // working set
+  out->clear();
+
+  auto i = orig.begin();
+  set<int> used;
+
+  vector<pair<int,int>> type_stack;  // (type, fan-out)
+  int root_bucket = 0;
+
+  // Run the pending stack through _choose_type_stack(); the stack is
+  // cleared only when that call did not fail.
+  auto resolve_stack = [&]() -> int {
+    const int r = _choose_type_stack(cct, type_stack, overfull, underfull,
+				     more_underfull, orig, i, used, &w,
+				     root_bucket, ruleno);
+    if (r >= 0)
+      type_stack.clear();
+    return r;
+  };
+
+  for (unsigned step = 0; step < rule->len; ++step) {
+    const crush_rule_step *curstep = &rule->steps[step];
+    ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl;
+
+    // effective replica count of a choose step: counts <= 0 are relative
+    // to maxout
+    const int numrep =
+      curstep->arg1 > 0 ? curstep->arg1 : curstep->arg1 + maxout;
+
+    switch (curstep->op) {
+    case CRUSH_RULE_TAKE:
+      {
+	const int item = curstep->arg1;
+	const int bidx = -1 - item;
+	const bool is_device = item >= 0 && item < cmap->max_devices;
+	const bool is_bucket = !is_device &&
+	  bidx >= 0 && bidx < cmap->max_buckets && cmap->buckets[bidx];
+	if (is_device || is_bucket) {
+	  w.clear();
+	  w.push_back(item);
+	  root_bucket = item;
+	  ldout(cct, 10) << __func__ << " take " << w << dendl;
+	} else {
+	  ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl;
+	}
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+    case CRUSH_RULE_CHOOSELEAF_INDEP:
+      {
+	const int type = curstep->arg2;
+	type_stack.emplace_back(type, numrep);
+	// a chooseleaf on a non-device type implies one leaf per choice
+	if (type > 0)
+	  type_stack.emplace_back(0, 1);
+	const int r = resolve_stack();
+	if (r < 0)
+	  return r;
+      }
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      // plain choose steps only accumulate; they are resolved at the next
+      // chooseleaf or emit
+      type_stack.emplace_back(curstep->arg2, numrep);
+      break;
+
+    case CRUSH_RULE_EMIT:
+      ldout(cct, 10) << " emit " << w << dendl;
+      if (!type_stack.empty()) {
+	const int r = resolve_stack();
+	if (r < 0)
+	  return r;
+      }
+      out->insert(out->end(), w.begin(), w.end());
+      w.clear();
+      break;
+
+    default:
+      // ignore
+      break;
+    }
+  }
+
+  return 0;
+}
+
+
+/**
+ * Set the weight-set weights of item `id` inside one bucket of a
+ * choose_args map, creating the bucket's weight-set from the bucket's own
+ * item weights if it has none yet.  When anything changed, the bucket's
+ * summed weights are pushed to wherever the bucket itself appears as an
+ * item.
+ *
+ * @param cct      context used for logging
+ * @param cmap     choose_args map to modify (struct passed by value;
+ *                 presumably its args storage is shared with the caller's
+ *                 copy -- confirm)
+ * @param bucketid id of the bucket to look in; indexes crush->buckets as
+ *                 -1-bucketid without a range check
+ * @param id       item whose weights are set
+ * @param weight   one weight per weight-set position
+ * @param ss       optional stream for a human readable message
+ * @returns number of changes made (weight-set creation counts as one);
+ *          0 if the bucket has no choose_arg slot or the number of
+ *          positions does not match weight.size()
+ */
+int CrushWrapper::_choose_args_adjust_item_weight_in_bucket(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int bucketid,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  int changed = 0;
+  int bidx = -1 - bucketid;
+  crush_bucket *b = crush->buckets[bidx];
+  if (bidx >= (int)cmap.size) {
+    if (ss)
+      *ss << "no weight-set for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  no crush_choose_arg for bucket " << b->id
+		   << dendl;
+    return 0;
+  }
+  crush_choose_arg *carg = &cmap.args[bidx];
+  if (carg->weight_set == NULL) {
+    // create a weight-set for this bucket and populate it with the
+    // bucket weights
+    unsigned positions = get_choose_args_positions(cmap);
+    carg->weight_set_positions = positions;
+    // calloc takes (element count, element size), in that order
+    carg->weight_set = static_cast<crush_weight_set*>(
+      calloc(positions, sizeof(crush_weight_set)));
+    for (unsigned p = 0; p < positions; ++p) {
+      carg->weight_set[p].size = b->size;
+      carg->weight_set[p].weights = (__u32*)calloc(b->size, sizeof(__u32));
+      for (unsigned i = 0; i < b->size; ++i) {
+	carg->weight_set[p].weights[i] = crush_get_bucket_item_weight(b, i);
+      }
+    }
+    changed++;
+  }
+  if (carg->weight_set_positions != weight.size()) {
+    if (ss)
+      *ss << "weight_set_positions != " << weight.size() << " for bucket " << b->id;
+    ldout(cct, 10) << __func__ << "  weight_set_positions != " << weight.size()
+		   << " for bucket " << b->id << dendl;
+    return 0;
+  }
+  // overwrite the weights of every occurrence of id in this bucket
+  for (unsigned i = 0; i < b->size; i++) {
+    if (b->items[i] == id) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+	carg->weight_set[j].weights[i] = weight[j];
+      }
+      ldout(cct, 5) << __func__ << "  set " << id << " to " << weight
+		    << " in bucket " << b->id << dendl;
+      changed++;
+    }
+  }
+  if (changed) {
+    // re-sum the bucket per position and apply the totals to this bucket's
+    // own entries in other buckets; its message and return value are
+    // deliberately ignored
+    vector<int> bucket_weight(weight.size(), 0);
+    for (unsigned i = 0; i < b->size; i++) {
+      for (unsigned j = 0; j < weight.size(); ++j) {
+	bucket_weight[j] += carg->weight_set[j].weights[i];
+      }
+    }
+    choose_args_adjust_item_weight(cct, cmap, b->id, bucket_weight, nullptr);
+  }
+  return changed;
+}
+
+/**
+ * Apply `weight` to item `id` in every existing bucket of the choose_args
+ * map.
+ *
+ * @returns the total number of changes, or -ENOENT when nothing changed
+ *          (in which case a message is written to *ss if ss is set)
+ */
+int CrushWrapper::choose_args_adjust_item_weight(
+  CephContext *cct,
+  crush_choose_arg_map cmap,
+  int id,
+  const vector<int>& weight,
+  ostream *ss)
+{
+  ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
+  int total_changed = 0;
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (bucket != nullptr) {
+      total_changed += _choose_args_adjust_item_weight_in_bucket(
+	cct, cmap, bucket->id, id, weight, ss);
+    }
+  }
+  if (total_changed)
+    return total_changed;
+  if (ss)
+    *ss << "item " << id << " not found in crush map";
+  return -ENOENT;
+}
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
new file mode 100644
index 000000000..d33d4bcf4
--- /dev/null
+++ b/src/crush/CrushWrapper.h
@@ -0,0 +1,1677 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CRUSH_WRAPPER_H
+#define CEPH_CRUSH_WRAPPER_H
+
+#include <stdlib.h>
+#include <map>
+#include <set>
+#include <string>
+
+#include <iosfwd>
+
+#include "include/types.h"
+
+extern "C" {
+#include "crush.h"
+#include "hash.h"
+#include "mapper.h"
+#include "builder.h"
+}
+
+#include "include/ceph_assert.h"
+#include "include/err.h"
+#include "include/encoding.h"
+#include "include/mempool.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+namespace CrushTreeDumper {
+typedef mempool::osdmap::map<int64_t,std::string> name_map_t;
+}
+
+WRITE_RAW_ENCODER(crush_rule_mask)   // it's all u8's
+
+/// Encode a crush_rule_step into @p bl as its op, arg1 and arg2 fields,
+/// in that order (decode() below reads them back in the same order).
+inline void encode(const crush_rule_step &s, ceph::buffer::list &bl)
+{
+  using ceph::encode;
+  encode(s.op, bl);
+  encode(s.arg1, bl);
+  encode(s.arg2, bl);
+}
+/// Decode a crush_rule_step from @p p, reading op, arg1 and arg2 in the
+/// same order that encode() writes them.
+inline void decode(crush_rule_step &s, ceph::buffer::list::const_iterator &p)
+{
+  using ceph::decode;
+  decode(s.op, p);
+  decode(s.arg1, p);
+  decode(s.arg2, p);
+}
+
+class CrushWrapper {
+public:
+  // magic value used by OSDMap for a "default" fallback choose_args, used if
+  // the choose_arg_map passed to do_rule does not exist.  if this also
+  // doesn't exist, fall back to canonical weights.
+  enum {
+    DEFAULT_CHOOSE_ARGS = -1
+  };
+
+  std::map<int32_t, std::string> type_map; // item(bucket/device) type id ==> item type name
+  std::map<int32_t, std::string> name_map; // item id ==> item name
+  std::map<int32_t, std::string> rule_name_map;
+
+  std::map<int32_t, int32_t> class_map; /* item id -> class id */
+  std::map<int32_t, std::string> class_name; /* class id -> class name */
+  std::map<std::string, int32_t> class_rname; /* class name -> class id */
+  std::map<int32_t, std::map<int32_t, int32_t> > class_bucket; /* bucket[id][class] == id */
+  std::map<int64_t, crush_choose_arg_map> choose_args;
+
+private:
+  struct crush_map *crush = nullptr;
+
+  bool have_uniform_rules = false;
+
+  /* reverse maps */
+  mutable bool have_rmaps = false;
+  mutable std::map<std::string, int> type_rmap, name_rmap, rule_name_rmap;
+  /// Populate the three name -> id reverse maps on first use; later calls
+  /// are no-ops until have_rmaps is reset.
+  void build_rmaps() const {
+    if (!have_rmaps) {
+      build_rmap(type_map, type_rmap);
+      build_rmap(name_map, name_rmap);
+      build_rmap(rule_name_map, rule_name_rmap);
+      have_rmaps = true;
+    }
+  }
+  /// Rebuild @p r as the inverse of @p f (name -> id).  If several ids share
+  /// a name, the entry visited last (the highest id) wins.
+  void build_rmap(const std::map<int, std::string> &f, std::map<std::string, int> &r) const {
+    r.clear();
+    for (const auto& entry : f) {
+      r[entry.second] = entry.first;
+    }
+  }
+
+public:
+  CrushWrapper(const CrushWrapper& other);
+  const CrushWrapper& operator=(const CrushWrapper& other);
+
+  /// Construct a wrapper around a freshly created crush map (see create()).
+  CrushWrapper() {
+    create();
+  }
+  /// Destroy the underlying crush map, if any, and release all choose_args.
+  ~CrushWrapper() {
+    if (crush)
+      crush_destroy(crush);
+    choose_args_clear();
+  }
+
+  /// Direct access to the wrapped crush_map pointer (still held by this wrapper).
+  crush_map *get_crush_map() { return crush; }
+
+  /* building */
+  /// Discard any existing crush map and choose_args, allocate a new empty
+  /// map, invalidate the cached reverse maps and apply the default tunables.
+  void create() {
+    if (crush)
+      crush_destroy(crush);
+    crush = crush_create();
+    choose_args_clear();
+    ceph_assert(crush);
+    // name maps are about to change; force the reverse maps to be rebuilt
+    have_rmaps = false;
+
+    set_tunables_default();
+  }
+
+  /**
+   * true if any rule has a rule id != its position in the array
+   *
+   * These indicate "ruleset" IDs that were created by older versions
+   * of Ceph.  They are cleaned up in renumber_rules so that eventually
+   * we can remove the code for handling them.
+   */
+  bool has_legacy_rule_ids() const;
+
+  /**
+   * fix rules whose ruleid != ruleset
+   *
+   * These rules were created in older versions of Ceph.  The concept
+   * of a ruleset no longer exists.
+   *
+   * Return a map of old ID -> new ID.  Caller must update OSDMap
+   * to use new IDs.
+   */
+  std::map<int, int> renumber_rules();
+
+  /// true if any bucket in the map is not a straw2 bucket
+  bool has_non_straw2_buckets() const;
+
+  // tunables
+  /// Apply the "argonaut" tunable profile: local retries 2 / fallback 5,
+  /// 19 total tries, descend_once, vary_r and stable all 0, and the legacy
+  /// set of allowed bucket algorithms.
+  void set_tunables_argonaut() {
+    crush->choose_local_tries = 2;
+    crush->choose_local_fallback_tries = 5;
+    crush->choose_total_tries = 19;
+    crush->chooseleaf_descend_once = 0;
+    crush->chooseleaf_vary_r = 0;
+    crush->chooseleaf_stable = 0;
+    crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// Apply the "bobtail" tunable profile: no local retries, 50 total tries,
+  /// descend_once enabled, vary_r and stable 0, legacy bucket algorithms.
+  void set_tunables_bobtail() {
+    crush->choose_local_tries = 0;
+    crush->choose_local_fallback_tries = 0;
+    crush->choose_total_tries = 50;
+    crush->chooseleaf_descend_once = 1;
+    crush->chooseleaf_vary_r = 0;
+    crush->chooseleaf_stable = 0;
+    crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// Apply the "firefly" tunable profile: as bobtail, but with
+  /// chooseleaf_vary_r set to 1.
+  void set_tunables_firefly() {
+    crush->choose_local_tries = 0;
+    crush->choose_local_fallback_tries = 0;
+    crush->choose_total_tries = 50;
+    crush->chooseleaf_descend_once = 1;
+    crush->chooseleaf_vary_r = 1;
+    crush->chooseleaf_stable = 0;
+    crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// Apply the "hammer" tunable profile: the same numeric tunables as
+  /// firefly, with the allowed bucket algorithms set explicitly to
+  /// uniform, list, straw and straw2.
+  void set_tunables_hammer() {
+    crush->choose_local_tries = 0;
+    crush->choose_local_fallback_tries = 0;
+    crush->choose_total_tries = 50;
+    crush->chooseleaf_descend_once = 1;
+    crush->chooseleaf_vary_r = 1;
+    crush->chooseleaf_stable = 0;
+    crush->allowed_bucket_algs =
+      (1 << CRUSH_BUCKET_UNIFORM) |
+      (1 << CRUSH_BUCKET_LIST) |
+      (1 << CRUSH_BUCKET_STRAW) |
+      (1 << CRUSH_BUCKET_STRAW2);
+  }
+  /// Apply the "jewel" tunable profile: as hammer, but with
+  /// chooseleaf_stable set to 1.
+  void set_tunables_jewel() {
+    crush->choose_local_tries = 0;
+    crush->choose_local_fallback_tries = 0;
+    crush->choose_total_tries = 50;
+    crush->chooseleaf_descend_once = 1;
+    crush->chooseleaf_vary_r = 1;
+    crush->chooseleaf_stable = 1;
+    crush->allowed_bucket_algs =
+      (1 << CRUSH_BUCKET_UNIFORM) |
+      (1 << CRUSH_BUCKET_LIST) |
+      (1 << CRUSH_BUCKET_STRAW) |
+      (1 << CRUSH_BUCKET_STRAW2);
+  }
+
+  /// Legacy profile: the argonaut tunables plus straw_calc_version 0.
+  void set_tunables_legacy() {
+    set_tunables_argonaut();
+    crush->straw_calc_version = 0;
+  }
+  /// Optimal profile: the jewel tunables plus straw_calc_version 1.
+  void set_tunables_optimal() {
+    set_tunables_jewel();
+    crush->straw_calc_version = 1;
+  }
+  /// Default profile applied by create(); currently sets the same values
+  /// as set_tunables_optimal() (jewel tunables, straw_calc_version 1).
+  void set_tunables_default() {
+    set_tunables_jewel();
+    crush->straw_calc_version = 1;
+  }
+
+  /// Current value of the choose_local_tries tunable.
+  int get_choose_local_tries() const {
+    return crush->choose_local_tries;
+  }
+  /// Set the choose_local_tries tunable to @p n (stored as given).
+  void set_choose_local_tries(int n) {
+    crush->choose_local_tries = n;
+  }
+
+  /// Current value of the choose_local_fallback_tries tunable.
+  int get_choose_local_fallback_tries() const {
+    return crush->choose_local_fallback_tries;
+  }
+  /// Set the choose_local_fallback_tries tunable to @p n (stored as given).
+  void set_choose_local_fallback_tries(int n) {
+    crush->choose_local_fallback_tries = n;
+  }
+
+  /// Current value of the choose_total_tries tunable.
+  int get_choose_total_tries() const {
+    return crush->choose_total_tries;
+  }
+  /// Set the choose_total_tries tunable to @p n (stored as given).
+  void set_choose_total_tries(int n) {
+    crush->choose_total_tries = n;
+  }
+
+  /// Current value of the chooseleaf_descend_once tunable.
+  int get_chooseleaf_descend_once() const {
+    return crush->chooseleaf_descend_once;
+  }
+  /// Set the chooseleaf_descend_once tunable; any non-zero @p n is
+  /// normalized to 1.
+  void set_chooseleaf_descend_once(int n) {
+    crush->chooseleaf_descend_once = !!n;
+  }
+
+  /// Current value of the chooseleaf_vary_r tunable.
+  int get_chooseleaf_vary_r() const {
+    return crush->chooseleaf_vary_r;
+  }
+  /// Set the chooseleaf_vary_r tunable to @p n (stored as given, not
+  /// normalized to 0/1).
+  void set_chooseleaf_vary_r(int n) {
+    crush->chooseleaf_vary_r = n;
+  }
+
+  /// Current value of the chooseleaf_stable tunable.
+  int get_chooseleaf_stable() const {
+    return crush->chooseleaf_stable;
+  }
+  /// Set the chooseleaf_stable tunable to @p n (stored as given).
+  void set_chooseleaf_stable(int n) {
+    crush->chooseleaf_stable = n;
+  }
+
+  /// Current value of the straw_calc_version setting.
+  int get_straw_calc_version() const {
+    return crush->straw_calc_version;
+  }
+  /// Set straw_calc_version to @p n (the presets in this class use 0 or 1).
+  void set_straw_calc_version(int n) {
+    crush->straw_calc_version = n;
+  }
+
+  /// Bitmask of permitted bucket algorithms; bit (1 << CRUSH_BUCKET_x) is
+  /// set when algorithm x is allowed.
+  unsigned get_allowed_bucket_algs() const {
+    return crush->allowed_bucket_algs;
+  }
+  /// Replace the bitmask of permitted bucket algorithms with @p n.
+  void set_allowed_bucket_algs(unsigned n) {
+    crush->allowed_bucket_algs = n;
+  }
+
+  /// True if every tunable matches the values written by
+  /// set_tunables_argonaut().  straw_calc_version is not examined.
+  bool has_argonaut_tunables() const {
+    return
+      crush->choose_local_tries == 2 &&
+      crush->choose_local_fallback_tries == 5 &&
+      crush->choose_total_tries == 19 &&
+      crush->chooseleaf_descend_once == 0 &&
+      crush->chooseleaf_vary_r == 0 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// True if every tunable matches the values written by
+  /// set_tunables_bobtail().  straw_calc_version is not examined.
+  bool has_bobtail_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 0 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// True if every tunable matches the values written by
+  /// set_tunables_firefly().  straw_calc_version is not examined.
+  bool has_firefly_tunables() const {
+    return
+      crush->choose_local_tries == 0 &&
+      crush->choose_local_fallback_tries == 0 &&
+      crush->choose_total_tries == 50 &&
+      crush->chooseleaf_descend_once == 1 &&
+      crush->chooseleaf_vary_r == 1 &&
+      crush->chooseleaf_stable == 0 &&
+      crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+  }
+  /// True if every tunable matches the values written by
+  /// set_tunables_hammer().  straw_calc_version is not examined.
+  bool has_hammer_tunables() const {
+    // retry counts
+    if (crush->choose_local_tries != 0 ||
+        crush->choose_local_fallback_tries != 0 ||
+        crush->choose_total_tries != 50) {
+      return false;
+    }
+    // chooseleaf behaviour flags
+    if (crush->chooseleaf_descend_once != 1 ||
+        crush->chooseleaf_vary_r != 1 ||
+        crush->chooseleaf_stable != 0) {
+      return false;
+    }
+    // exactly uniform, list, straw and straw2 must be allowed
+    return crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+                                          (1 << CRUSH_BUCKET_LIST) |
+                                          (1 << CRUSH_BUCKET_STRAW) |
+                                          (1 << CRUSH_BUCKET_STRAW2));
+  }
+  /// True if every tunable matches the values written by
+  /// set_tunables_jewel().  straw_calc_version is not examined.
+  bool has_jewel_tunables() const {
+    // retry counts
+    if (crush->choose_local_tries != 0 ||
+        crush->choose_local_fallback_tries != 0 ||
+        crush->choose_total_tries != 50) {
+      return false;
+    }
+    // chooseleaf behaviour flags (jewel turns chooseleaf_stable on)
+    if (crush->chooseleaf_descend_once != 1 ||
+        crush->chooseleaf_vary_r != 1 ||
+        crush->chooseleaf_stable != 1) {
+      return false;
+    }
+    // exactly uniform, list, straw and straw2 must be allowed
+    return crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) |
+                                          (1 << CRUSH_BUCKET_LIST) |
+                                          (1 << CRUSH_BUCKET_STRAW) |
+                                          (1 << CRUSH_BUCKET_STRAW2));
+  }
+
+  /// True if the tunables match the jewel profile.  Note that, unlike
+  /// set_tunables_optimal(), this does not look at straw_calc_version.
+  bool has_optimal_tunables() const {
+    return has_jewel_tunables();
+  }
+  /// True if the tunables match the argonaut profile.  Note that, unlike
+  /// set_tunables_legacy(), this does not look at straw_calc_version.
+  bool has_legacy_tunables() const {
+    return has_argonaut_tunables();
+  }
+
+  /// True if any of the three retry tunables differs from the argonaut
+  /// values (2, 5, 19).
+  bool has_nondefault_tunables() const {
+    return
+      (crush->choose_local_tries != 2 ||
+       crush->choose_local_fallback_tries != 5 ||
+       crush->choose_total_tries != 19);
+  }
+  /// True if chooseleaf_descend_once is enabled (non-zero).
+  bool has_nondefault_tunables2() const {
+    return
+      crush->chooseleaf_descend_once != 0;
+  }
+  /// True if chooseleaf_vary_r is non-zero.
+  bool has_nondefault_tunables3() const {
+    return
+      crush->chooseleaf_vary_r != 0;
+  }
+  /// True if chooseleaf_stable is non-zero.
+  bool has_nondefault_tunables5() const {
+    return
+        crush->chooseleaf_stable != 0;
+  }
+
+  bool has_v2_rules() const;
+  bool has_v3_rules() const;
+  bool has_v4_buckets() const;
+  bool has_v5_rules() const;
+  bool has_choose_args() const;          // any choose_args
+  bool has_incompat_choose_args() const; // choose_args that can't be made compat
+
+  bool is_v2_rule(unsigned ruleid) const;
+  bool is_v3_rule(unsigned ruleid) const;
+  bool is_v5_rule(unsigned ruleid) const;
+
+  /// Name of the release reported as the minimum required for the features
+  /// this map uses.  The checks run in a fixed order and the first one that
+  /// matches decides the answer; "argonaut" is the fallback.
+  std::string get_min_required_version() const {
+    if (has_v5_rules() || has_nondefault_tunables5()) {
+      return "jewel";
+    }
+    if (has_v4_buckets()) {
+      return "hammer";
+    }
+    if (has_nondefault_tunables3()) {
+      return "firefly";
+    }
+    if (has_nondefault_tunables2() || has_nondefault_tunables()) {
+      return "bobtail";
+    }
+    return "argonaut";
+  }
+
+  // default bucket types
+  /// Pick the most preferred bucket algorithm that allowed_bucket_algs
+  /// permits, or 0 if none of the known algorithms is allowed.
+  unsigned get_default_bucket_alg() const {
+    // candidates, most preferred first
+    const unsigned preference[] = {
+      CRUSH_BUCKET_STRAW2,
+      CRUSH_BUCKET_STRAW,
+      CRUSH_BUCKET_TREE,
+      CRUSH_BUCKET_LIST,
+      CRUSH_BUCKET_UNIFORM,
+    };
+    for (unsigned alg : preference) {
+      if (crush->allowed_bucket_algs & (1 << alg)) {
+        return alg;
+      }
+    }
+    return 0;
+  }
+
+  // bucket types
+  /// Number of entries in type_map (type ids that have a name).
+  int get_num_type_names() const {
+    return type_map.size();
+  }
+  /// Largest type id present in type_map, or 0 when no types are defined.
+  int get_max_type_id() const {
+    // the map is ordered by id, so the last entry carries the maximum
+    return type_map.empty() ? 0 : type_map.rbegin()->first;
+  }
+  /// Look up a type id by name; returns -1 when the name is unknown.
+  /// Because -1 is also a representable id, see get_validated_type_id()
+  /// for an unambiguous variant.
+  int get_type_id(const std::string& name) const {
+    build_rmaps();
+    auto hit = type_rmap.find(name);
+    return hit == type_rmap.end() ? -1 : hit->second;
+  }
+  /// Look up a type id by name, distinguishing "not found" from an id that
+  /// happens to equal -1.
+  ///
+  /// @param name type name to resolve
+  /// @param id receives the type id on success; untouched on failure
+  /// @return 0 on success, -1 if the name is unknown
+  int get_validated_type_id(const std::string& name, int *id) const {
+    const int found = get_type_id(name);
+    // -1 is only an error if the name really is absent from the reverse map
+    const bool missing = (found == -1) && type_rmap.count(name) == 0;
+    if (missing) {
+      return -1;
+    }
+    *id = found;
+    return 0;
+  }
+  /// Name of type @p t, or a null pointer if the type has no name.  The
+  /// returned pointer refers to the string stored in type_map.
+  const char *get_type_name(int t) const {
+    auto entry = type_map.find(t);
+    if (entry == type_map.end()) {
+      return nullptr;
+    }
+    return entry->second.c_str();
+  }
+  /**
+   * Assign (or change) the name of type id @p i.
+   *
+   * If the reverse maps have already been built they are updated in place.
+   * When the id previously carried a different name, the old reverse entry
+   * is removed as well; otherwise get_type_id(old name) would keep
+   * resolving to @p i until the reverse maps happened to be rebuilt, making
+   * lookups depend on cache state.
+   *
+   * @param i type id
+   * @param name new type name
+   */
+  void set_type_name(int i, const std::string& name) {
+    if (have_rmaps) {
+      auto prev = type_map.find(i);
+      if (prev != type_map.end() && prev->second != name) {
+        // only drop the old name if it still points at this id
+        auto stale = type_rmap.find(prev->second);
+        if (stale != type_rmap.end() && stale->second == i)
+          type_rmap.erase(stale);
+      }
+      type_rmap[name] = i;
+    }
+    type_map[i] = name;
+  }
+
+  // item/bucket names
+  /// True if some item (device or bucket) currently has the given name.
+  bool name_exists(const std::string& name) const {
+    build_rmaps();
+    return name_rmap.find(name) != name_rmap.end();
+  }
+  /// True if item id @p i has an entry in name_map.
+  bool item_exists(int i) const {
+    return name_map.find(i) != name_map.end();
+  }
+  /// Resolve an item name to its id.  Returns 0 when the name is unknown,
+  /// which is indistinguishable from item id 0; callers that care should
+  /// check name_exists() first.
+  int get_item_id(const std::string& name) const {
+    build_rmaps();
+    auto hit = name_rmap.find(name);
+    if (hit != name_rmap.end()) {
+      return hit->second;
+    }
+    return 0;  /* hrm */
+  }
+  /// Name of item @p t, or a null pointer if the item has no name.  The
+  /// returned pointer refers to the string stored in name_map.
+  const char *get_item_name(int t) const {
+    auto entry = name_map.find(t);
+    return entry == name_map.end() ? nullptr : entry->second.c_str();
+  }
+  /**
+   * Assign (or change) the name of item @p i.
+   *
+   * If the reverse maps have already been built they are updated in place.
+   * When the item previously carried a different name, the old reverse
+   * entry is removed as well; otherwise name_exists(old name) and
+   * get_item_id(old name) would keep succeeding until the reverse maps
+   * happened to be rebuilt, making lookups depend on cache state.
+   *
+   * @param i item id
+   * @param name new item name
+   * @return 0 on success, -EINVAL if @p name is not a valid crush name
+   */
+  int set_item_name(int i, const std::string& name) {
+    if (!is_valid_crush_name(name))
+      return -EINVAL;
+    if (have_rmaps) {
+      auto prev = name_map.find(i);
+      if (prev != name_map.end() && prev->second != name) {
+        // only drop the old name if it still points at this item
+        auto stale = name_rmap.find(prev->second);
+        if (stale != name_rmap.end() && stale->second == i)
+          name_rmap.erase(stale);
+      }
+      name_rmap[name] = i;
+    }
+    name_map[i] = name;
+    return 0;
+  }
+  /// Exchange the names of items @p a and @p b, keeping the reverse map in
+  /// step when it has been built.  An id without a name gets an empty-name
+  /// entry, since operator[] creates it.
+  void swap_names(int a, int b) {
+    std::string& slot_a = name_map[a];
+    std::string& slot_b = name_map[b];
+    slot_a.swap(slot_b);
+    if (have_rmaps) {
+      // slot_b now holds a's former name, slot_a holds b's former name
+      name_rmap[slot_b] = b;
+      name_rmap[slot_a] = a;
+    }
+  }
+  int split_id_class(int i, int *idout, int *classout) const;
+
+  /// True if a device class with the given name is registered.
+  bool class_exists(const std::string& name) const {
+    return class_rname.find(name) != class_rname.end();
+  }
+  /// Name of class id @p i, or a null pointer if the id is unknown.  The
+  /// returned pointer refers to the string stored in class_name.
+  const char *get_class_name(int i) const {
+    auto entry = class_name.find(i);
+    return entry == class_name.end() ? nullptr : entry->second.c_str();
+  }
+  /// Resolve a class name to its id; returns -EINVAL if no such class exists.
+  int get_class_id(const std::string& name) const {
+    auto entry = class_rname.find(name);
+    if (entry == class_rname.end()) {
+      return -EINVAL;
+    }
+    return entry->second;
+  }
+  /// Remove a class from both the name -> id and id -> name maps.
+  ///
+  /// @return 0 on success, -ENOENT if the name is unknown or the two maps
+  ///         disagree (no id -> name entry); nothing is erased in that case
+  int remove_class_name(const std::string& name) {
+    auto by_name = class_rname.find(name);
+    if (by_name == class_rname.end()) {
+      return -ENOENT;
+    }
+    auto by_id = class_name.find(by_name->second);
+    if (by_id == class_name.end()) {
+      return -ENOENT;
+    }
+    class_rname.erase(by_name);
+    class_name.erase(by_id);
+    return 0;
+  }
+
+  int32_t _alloc_class_id() const;
+
+  /// Return the id of class @p name, registering the class under a newly
+  /// allocated id if it does not exist yet.
+  int get_or_create_class_id(const std::string& name) {
+    const int existing = get_class_id(name);
+    if (existing >= 0) {
+      return existing;
+    }
+    const int fresh = _alloc_class_id();
+    class_name[fresh] = name;
+    class_rname[name] = fresh;
+    return fresh;
+  }
+
+  /// Class name assigned to item @p t, or a null pointer if the item has no
+  /// class (or its class id has no name).
+  const char *get_item_class(int t) const {
+    auto assigned = class_map.find(t);
+    if (assigned != class_map.end()) {
+      return get_class_name(assigned->second);
+    }
+    return nullptr;
+  }
+  /// Class id assigned to item @p t, or -ENOENT if the item has no class.
+  int get_item_class_id(int t) const {
+    auto assigned = class_map.find(t);
+    return assigned != class_map.end() ? assigned->second : -ENOENT;
+  }
+  /// Assign item @p i to the class called @p name, creating the class if
+  /// needed.
+  ///
+  /// @return 0 on success, -EINVAL if @p name is not a valid crush name
+  int set_item_class(int i, const std::string& name) {
+    if (is_valid_crush_name(name)) {
+      const int cid = get_or_create_class_id(name);
+      class_map[i] = cid;
+      return 0;
+    }
+    return -EINVAL;
+  }
+  /// Assign item @p i to class id @p c without validating the id.
+  /// Note: unlike the string overload this returns the class id, not 0.
+  int set_item_class(int i, int c) {
+    class_map[i] = c;
+    return c;
+  }
+  /// Collect the ids of all devices assigned to class @p name.
+  ///
+  /// @param name class name
+  /// @param devices output set; always cleared first, and left empty when
+  ///        the class does not exist
+  void get_devices_by_class(const std::string &name,
+                            std::set<int> *devices) const {
+    ceph_assert(devices);
+    devices->clear();
+    auto known = class_rname.find(name);
+    if (known == class_rname.end()) {
+      return;
+    }
+    const int wanted = known->second;
+    for (const auto& assignment : class_map) {
+      // negative ids are skipped: only ids >= 0 are reported
+      const bool non_negative_id = assignment.first >= 0;
+      if (non_negative_id && assignment.second == wanted) {
+        devices->insert(assignment.first);
+      }
+    }
+  }
+  /// Drop the class assignment of item @p i; a no-op if it has none.
+  void class_remove_item(int i) {
+    // erase-by-key already does nothing when the key is absent
+    class_map.erase(i);
+  }
+  int can_rename_item(const std::string& srcname,
+		      const std::string& dstname,
+		      std::ostream *ss) const;
+  int rename_item(const std::string& srcname,
+		  const std::string& dstname,
+		  std::ostream *ss);
+  int can_rename_bucket(const std::string& srcname,
+			const std::string& dstname,
+			std::ostream *ss) const;
+  int rename_bucket(const std::string& srcname,
+		    const std::string& dstname,
+		    std::ostream *ss);
+
+  // rule names
+  int rename_rule(const std::string& srcname,
+                  const std::string& dstname,
+                  std::ostream *ss);
+  /// True if a rule with the given name is registered.
+  bool rule_exists(std::string name) const {
+    build_rmaps();
+    return rule_name_rmap.find(name) != rule_name_rmap.end();
+  }
+  /// Resolve a rule name to its id; returns -ENOENT if the name is unknown.
+  int get_rule_id(std::string name) const {
+    build_rmaps();
+    auto hit = rule_name_rmap.find(name);
+    return hit == rule_name_rmap.end() ? -ENOENT : hit->second;
+  }
+  /// Name of rule @p t, or a null pointer if the rule has no name.  The
+  /// returned pointer refers to the string stored in rule_name_map.
+  const char *get_rule_name(int t) const {
+    auto entry = rule_name_map.find(t);
+    return entry == rule_name_map.end() ? nullptr : entry->second.c_str();
+  }
+  /**
+   * Assign (or change) the name of rule @p i.
+   *
+   * If the reverse maps have already been built they are updated in place.
+   * When the rule previously carried a different name, the old reverse
+   * entry is removed as well; otherwise rule_exists(old name) and
+   * get_rule_id(old name) would keep succeeding until the reverse maps
+   * happened to be rebuilt, making lookups depend on cache state.
+   *
+   * @param i rule id
+   * @param name new rule name
+   */
+  void set_rule_name(int i, const std::string& name) {
+    if (have_rmaps) {
+      auto prev = rule_name_map.find(i);
+      if (prev != rule_name_map.end() && prev->second != name) {
+        // only drop the old name if it still points at this rule
+        auto stale = rule_name_rmap.find(prev->second);
+        if (stale != rule_name_rmap.end() && stale->second == i)
+          rule_name_rmap.erase(stale);
+      }
+      rule_name_rmap[name] = i;
+    }
+    rule_name_map[i] = name;
+  }
+  /// True if the item has a name and that name is not a valid crush name.
+  /// Items without a name are never reported as shadow items.
+  bool is_shadow_item(int id) const {
+    const char *name = get_item_name(id);
+    if (!name) {
+      return false;
+    }
+    return !is_valid_crush_name(name);
+  }
+
+
+  /**
+   * find tree nodes referenced by rules by a 'take' command
+   *
+   * Note that these may not be parentless roots.
+   */
+  void find_takes(std::set<int> *roots) const;
+  void find_takes_by_rule(int rule, std::set<int> *roots) const;
+
+  /**
+   * find tree roots
+   *
+   * These are parentless nodes in the map.
+   */
+  void find_roots(std::set<int> *roots) const;
+
+
+  /**
+   * find tree roots that contain shadow (device class) items only
+   */
+  void find_shadow_roots(std::set<int> *roots) const {
+    std::set<int> candidates;
+    find_roots(&candidates);
+    // keep only the roots that is_shadow_item() accepts; *roots is added to,
+    // not cleared
+    for (int root : candidates) {
+      if (!is_shadow_item(root)) {
+        continue;
+      }
+      roots->insert(root);
+    }
+  }
+
+  /**
+   * find tree roots that are not shadow (device class) items
+   *
+   * These are parentless nodes in the map that are not shadow
+   * items for device classes.
+   */
+  void find_nonshadow_roots(std::set<int> *roots) const {
+    std::set<int> candidates;
+    find_roots(&candidates);
+    // keep only the roots that is_shadow_item() rejects; *roots is added to,
+    // not cleared
+    for (int root : candidates) {
+      if (is_shadow_item(root)) {
+        continue;
+      }
+      roots->insert(root);
+    }
+  }
+
+  /**
+   * see if an item is contained within a subtree
+   *
+   * @param root haystack
+   * @param item needle
+   * @return true if the item is located beneath the given node
+   */
+  bool subtree_contains(int root, int item) const;
+
+private:
+  /**
+   * search for an item in any bucket
+   *
+   * @param i item
+   * @return true if present
+   */
+  bool _search_item_exists(int i) const;
+  bool is_parent_of(int child, int p) const;
+public:
+
+  /**
+   * see if item is located where we think it is
+   *
+   * This verifies that the given item is located at a particular
+   * location in the hierarchy.  However, that check is imprecise; we
+   * are actually verifying that the most specific location key/value
+   * is correct.  For example, if loc specifies that rack=foo and
+   * host=bar, it will verify that host=bar is correct; any placement
+   * above that level in the hierarchy is ignored.  This matches the
+   * semantics for insert_item().
+   *
+   * @param cct cct
+   * @param item item id
+   * @param loc location to check (map of type to bucket names)
+   * @param weight optional pointer to weight of item at that location
+   * @return true if item is at specified location
+   */
+  bool check_item_loc(CephContext *cct, int item,
+		      const std::map<std::string,std::string>& loc,
+		      int *iweight);
+  /// Floating-point convenience overload: runs the integer check_item_loc()
+  /// and, if @p weight is non-null, stores the item weight divided by
+  /// 0x10000 into it.
+  bool check_item_loc(CephContext *cct, int item,
+                      const std::map<std::string,std::string>& loc,
+                      float *weight) {
+    // Start from 0 so that *weight is well defined even if the integer
+    // overload returns without storing a weight (its body is not visible
+    // here; previously this read an uninitialized int in that case).
+    int iweight = 0;
+    const bool ret = check_item_loc(cct, item, loc, &iweight);
+    if (weight)
+      *weight = (float)iweight / (float)0x10000;
+    return ret;
+  }
+
+
+  /**
+   * returns the (type, name) of the parent bucket of id
+   *
+   * FIXME: ambiguous for items that occur multiple times in the map
+   */
+  std::pair<std::string,std::string> get_immediate_parent(int id, int *ret = NULL) const;
+
+  int get_immediate_parent_id(int id, int *parent) const;
+
+  /**
+   * return ancestor of the given type, or 0 if none
+   * optionally pass a specific crush rule to return the ancestor from that rule only
+   * (parent is always a bucket and thus <0)
+   */
+  int get_parent_of_type(int id, int type, int rule = -1) const;
+
+  /**
+   * get the fully qualified location of a device by successively finding
+   * parents beginning at ID and ending at highest type number specified in
+   * the CRUSH map which assumes that if device foo is under device bar, the
+   * type_id of foo < bar where type_id is the integer specified in the CRUSH map
+   *
+   * returns the location in the form of (type=foo) where type is a type of bucket
+   * specified in the CRUSH map and foo is a name specified in the CRUSH map
+   */
+  std::map<std::string, std::string> get_full_location(int id) const;
+
+  /**
+   * return location map for an item, by name
+   */
+  int get_full_location(
+    const std::string& name,
+    std::map<std::string,std::string> *ploc);
+
+  /*
+   * identical to get_full_location(int id) although it returns the type/name
+   * pairs in the order they occur in the hierarchy.
+   *
+   * returns -ENOENT if id is not found.
+   */
+  int get_full_location_ordered(
+    int id,
+    std::vector<std::pair<std::string, std::string> >& path) const;
+
+  /*
+   * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path),
+   * although it returns a concatenated string with the type/name pairs in descending
+   * hierarchical order with format key1=val1,key2=val2.
+   *
+   * returns the location in descending hierarchy as a string.
+   */
+  std::string get_full_location_ordered_string(int id) const;
+
+  /**
+   * returns (type_id, type) of all parent buckets between id and
+   * default, can be used to check for anomalous CRUSH maps
+   */
+  std::map<int, std::string> get_parent_hierarchy(int id) const;
+
+  /**
+   * enumerate immediate children of given node
+   *
+   * @param id parent bucket or device id
+   * @return number of items, or error
+   */
+  int get_children(int id, std::list<int> *children) const;
+  /**
+   * enumerate all children of given node
+   *
+   * @param id parent bucket or device id
+   * @return number of items, or error
+   */
+  int get_all_children(int id, std::set<int> *children) const;
+  void get_children_of_type(int id,
+                            int type,
+			    std::vector<int> *children,
+			    bool exclude_shadow = true) const;
+  /**
+   * enumerate all subtrees by type
+   */
+  void get_subtree_of_type(int type, std::vector<int> *subtrees);
+
+
+  /**
+   * verify upmapping results.
+   * return 0 on success or a negative errno on error.
+   */
+  int verify_upmap(CephContext *cct,
+                   int rule_id,
+                   int pool_size,
+                   const std::vector<int>& up);
+
+  /**
+    * enumerate leaves(devices) of given node
+    *
+    * @param name parent bucket name
+    * @return 0 on success or a negative errno on error.
+    */
+  int get_leaves(const std::string &name, std::set<int> *leaves) const;
+
+private:
+  int _get_leaves(int id, std::list<int> *leaves) const; // worker
+
+public:
+  /**
+   * insert an item into the map at a specific position
+   *
+   * Add an item at a specific location of the hierarchy.
+   * Specifically, we look for the most specific location constraint
+   * for which a bucket already exists, and then create intervening
+   * buckets beneath that in order to place the item.
+   *
+   * Note that any location specifiers *above* the most specific match
+   * are ignored.  For example, if we specify that osd.12 goes in
+   * host=foo, rack=bar, and row=baz, and rack=bar is the most
+   * specific match, we will create host=foo beneath that point and
+   * put osd.12 inside it.  However, we will not verify that rack=bar
+   * is beneath row=baz or move it.
+   *
+   * In short, we will build out a hierarchy, and move leaves around,
+   * but not adjust the hierarchy's internal structure.  Yet.
+   *
+   * If the item is already present in the map, we will return EEXIST.
+   * If the location key/value pairs are nonsensical
+   * (rack=nameofdevice), or the location specifiers do not attach us
+   * to any existing part of the hierarchy, we will return EINVAL.
+   *
+   * @param cct cct
+   * @param id item id
+   * @param weight item weight
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @param init_weight_sets initialize weight-set weights to weight (vs 0)
+   * @return 0 for success, negative on error
+   */
+  int insert_item(CephContext *cct, int id, float weight, std::string name,
+		  const std::map<std::string,std::string>& loc,
+		  bool init_weight_sets=true);
+
+  /**
+   * move a bucket in the hierarchy to the given location
+   *
+   * This has the same location and ancestor creation behavior as
+   * insert_item(), but will relocate the specified existing bucket.
+   *
+   * @param cct cct
+   * @param id bucket id
+   * @param loc location (map of type to bucket names)
+   * @return 0 for success, negative on error
+   */
+  int move_bucket(CephContext *cct, int id, const std::map<std::string,std::string>& loc);
+
+  /**
+   * swap bucket contents of two buckets without touching bucket ids
+   *
+   * @param cct cct
+   * @param src bucket a
+   * @param dst bucket b
+   * @return 0 for success, negative on error
+   */
+  int swap_bucket(CephContext *cct, int src, int dst);
+
+  /**
+   * add a link to an existing bucket in the hierarchy to the new location
+   *
+   * This has the same location and ancestor creation behavior as
+   * insert_item(), but will add a new link to the specified existing
+   * bucket.
+   *
+   * @param cct cct
+   * @param id bucket id
+   * @param loc location (map of type to bucket names)
+   * @return 0 for success, negative on error
+   */
+  int link_bucket(CephContext *cct, int id,
+		  const std::map<std::string,std::string>& loc);
+
+  /**
+   * add or update an item's position in the map
+   *
+   * This is analogous to insert_item, except we will move an item if
+   * it is already present.
+   *
+   * @param cct cct
+   * @param id item id
+   * @param weight item weight
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @return 0 for no change, 1 for successful change, negative on error
+   */
+  int update_item(CephContext *cct, int id, float weight, std::string name,
+		  const std::map<std::string, std::string>& loc);
+
+  /**
+   * create or move an item, but do not adjust its weight if it already exists
+   *
+   * @param cct cct
+   * @param item item id
+   * @param weight initial item weight (if we need to create it)
+   * @param name item name
+   * @param loc location (map of type to bucket names)
+   * @param init_weight_sets initialize weight-set values to weight (vs 0)
+   * @return 0 for no change, 1 for successful change, negative on error
+   */
+  int create_or_move_item(CephContext *cct, int item, float weight,
+			  std::string name,
+			  const std::map<std::string,std::string>& loc,
+			  bool init_weight_sets=true);
+
+  /**
+   * remove all instances of an item from the map
+   *
+   * @param cct cct
+   * @param id item id to remove
+   * @param unlink_only unlink but do not remove bucket (useful if multiple links or not empty)
+   * @return 0 on success, negative on error
+   */
+  int remove_item(CephContext *cct, int id, bool unlink_only);
+
+  /**
+   * recursively remove buckets starting at item and stop removing
+   * when a bucket is in use.
+   *
+   * @param item id to remove
+   * @return 0 on success, negative on error
+   */
+  int remove_root(CephContext *cct, int item);
+
+  /**
+   * remove all instances of an item nested beneath a certain point from the map
+   *
+   * @param cct cct
+   * @param id item id to remove
+   * @param ancestor ancestor item id under which to search for id
+   * @param unlink_only unlink but do not remove bucket (useful if bucket has multiple links or is not empty)
+   * @return 0 on success, negative on error
+   */
+private:
+  bool _maybe_remove_last_instance(CephContext *cct, int id, bool unlink_only);
+  int _remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+  bool _bucket_is_in_use(int id);
+public:
+  int remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+
+  /**
+   * calculate the locality/distance from a given id to a crush location map
+   *
+   * Specifically, we look for the lowest-valued type for which the
+   * location of id matches that described in loc.
+   *
+   * @param cct cct
+   * @param id the existing id in the map
+   * @param loc a set of key=value pairs describing a location in the hierarchy
+   */
+  int get_common_ancestor_distance(CephContext *cct, int id,
+				   const std::multimap<std::string,std::string>& loc) const;
+
+  /**
+   * parse a set of key/value pairs out of a string vector
+   *
+   * These are used to describe a location in the CRUSH hierarchy.
+   *
+   * @param args list of strings (each key= or key=value)
+   * @param ploc pointer to a resulting location map or multimap
+   */
+  static int parse_loc_map(const std::vector<std::string>& args,
+			   std::map<std::string,std::string> *ploc);
+  static int parse_loc_multimap(const std::vector<std::string>& args,
+				std::multimap<std::string,std::string> *ploc);
+
+
+  /**
+   * get an item's weight
+   *
+   * Will return the weight for the first instance it finds.
+   *
+   * @param id item id to check
+   * @return weight of item
+   */
+  int get_item_weight(int id) const;
+  /// get_item_weight() converted to floating point by dividing by 0x10000.
+  float get_item_weightf(int id) const {
+    const int raw = get_item_weight(id);
+    return static_cast<float>(raw) / static_cast<float>(0x10000);
+  }
+  int get_item_weight_in_loc(int id,
+			     const std::map<std::string, std::string> &loc);
+  /// get_item_weight_in_loc() converted to floating point by dividing by
+  /// 0x10000.
+  float get_item_weightf_in_loc(int id,
+                                const std::map<std::string, std::string> &loc) {
+    const int raw = get_item_weight_in_loc(id, loc);
+    return static_cast<float>(raw) / static_cast<float>(0x10000);
+  }
+
+  /**
+   * Check that a floating-point weight, once scaled by 0x10000, fits in
+   * an int.
+   *
+   * The range check is done in floating point.  Converting a negative,
+   * infinite or NaN float to an unsigned integer first (as this used to do)
+   * is undefined behavior, so such inputs are now rejected explicitly.
+   * Values whose scaled form truncates to zero (including tiny negative
+   * ones) are still accepted.
+   *
+   * @param weight weight as a floating-point number
+   * @return 0 if the weight is representable, -EOVERFLOW otherwise
+   */
+  int validate_weightf(float weight) {
+    // scaling by a power of two in double is exact and cannot overflow
+    const double scaled = static_cast<double>(weight) * 0x10000;
+    const double limit = static_cast<double>(std::numeric_limits<int>::max());
+    // written as a negated conjunction so that NaN, for which every
+    // comparison is false, is rejected too
+    if (!(scaled > -1.0 && scaled <= limit)) {
+      return -EOVERFLOW;
+    }
+    return 0;
+  }
+  int adjust_item_weight(CephContext *cct, int id, int weight,
+			 bool update_weight_sets=true);
+  /// Floating-point front end for adjust_item_weight(): validates @p weight
+  /// with validate_weightf(), scales it by 0x10000 and forwards the call.
+  ///
+  /// @return the validation error if the weight is out of range, otherwise
+  ///         whatever adjust_item_weight() returns
+  int adjust_item_weightf(CephContext *cct, int id, float weight,
+                          bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int iweight = static_cast<int>(weight * static_cast<float>(0x10000));
+    return adjust_item_weight(cct, id, iweight, update_weight_sets);
+  }
+  int adjust_item_weight_in_bucket(CephContext *cct, int id, int weight,
+				   int bucket_id,
+				   bool update_weight_sets);
+  int adjust_item_weight_in_loc(CephContext *cct, int id, int weight,
+				const std::map<std::string,std::string>& loc,
+				bool update_weight_sets=true);
+  /// Floating-point front end for adjust_item_weight_in_loc(): validates
+  /// @p weight with validate_weightf(), scales it by 0x10000 and forwards
+  /// the call.
+  ///
+  /// @return the validation error if the weight is out of range, otherwise
+  ///         whatever adjust_item_weight_in_loc() returns
+  int adjust_item_weightf_in_loc(CephContext *cct, int id, float weight,
+                                 const std::map<std::string,std::string>& loc,
+                                 bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int iweight = static_cast<int>(weight * static_cast<float>(0x10000));
+    return adjust_item_weight_in_loc(cct, id, iweight, loc, update_weight_sets);
+  }
+  void reweight(CephContext *cct);
+  void reweight_bucket(crush_bucket *b,
+		       crush_choose_arg_map& arg_map,
+		       std::vector<uint32_t> *weightv);
+
+  int adjust_subtree_weight(CephContext *cct, int id, int weight,
+			    bool update_weight_sets=true);
+  /// Floating-point front end for adjust_subtree_weight(): validates
+  /// @p weight with validate_weightf(), scales it by 0x10000 and forwards
+  /// the call.
+  ///
+  /// @return the validation error if the weight is out of range, otherwise
+  ///         whatever adjust_subtree_weight() returns
+  int adjust_subtree_weightf(CephContext *cct, int id, float weight,
+                             bool update_weight_sets=true) {
+    const int err = validate_weightf(weight);
+    if (err < 0)
+      return err;
+    const int iweight = static_cast<int>(weight * static_cast<float>(0x10000));
+    return adjust_subtree_weight(cct, id, iweight, update_weight_sets);
+  }
+
+  /// check if item id is present in the map hierarchy
+  bool check_item_present(int id) const;
+
+
+  /*** devices ***/
+  /// max_devices of the underlying map, or 0 when no map is loaded.
+  int get_max_devices() const {
+    return crush ? crush->max_devices : 0;
+  }
+
+
+  /*** rules ***/
+private:
+  /**
+   * Look up a rule slot by index.
+   *
+   * Three outcomes are possible and callers must handle each of them:
+   *  - no crush map: an error-encoded pointer carrying -ENOENT;
+   *  - ruleno >= max_rules: a null pointer;
+   *  - otherwise: the content of rules[ruleno], which rule_exists() treats
+   *    as possibly NULL.
+   * IS_ERR() alone is therefore not a sufficient validity check.
+   */
+  crush_rule *get_rule(unsigned ruleno) const {
+    if (!crush) return (crush_rule *)(-ENOENT);
+    if (ruleno >= crush->max_rules)
+      return 0;
+    return crush->rules[ruleno];
+  }
+  /**
+   * Look up one step of a rule.
+   *
+   * @return pointer to the step, or an error-encoded pointer (-EINVAL) when
+   *         the rule cannot be resolved or @p step is past the rule length
+   */
+  crush_rule_step *get_rule_step(unsigned ruleno, unsigned step) const {
+    crush_rule *n = get_rule(ruleno);
+    // get_rule() yields NULL (not an error pointer) for an out-of-range
+    // ruleno, so NULL has to be rejected before n->len is read.
+    if (!n || IS_ERR(n)) return (crush_rule_step *)(-EINVAL);
+    if (step >= n->len) return (crush_rule_step *)(-EINVAL);
+    return &n->steps[step];
+  }
+
+public:
+  /* accessors */
+  /// max_rules of the underlying map, or 0 when no map is loaded.
+  int get_max_rules() const {
+    return crush ? crush->max_rules : 0;
+  }
+  /// True when a map is loaded, @p ruleno is in range and its slot is set.
+  bool rule_exists(unsigned ruleno) const {
+    return crush &&
+      ruleno < crush->max_rules &&
+      crush->rules[ruleno] != NULL;
+  }
+  /**
+   * Tell whether rule @p ruleno contains a CRUSH_RULE_TAKE step whose arg1
+   * equals @p take.
+   *
+   * @return false when there is no map, the rule does not exist, or no such
+   *         step is present
+   */
+  bool rule_has_take(unsigned ruleno, int take) const {
+    if (!crush) return false;
+    crush_rule *rule = get_rule(ruleno);
+    // get_rule() returns NULL for a missing rule; it must not be dereferenced.
+    if (!rule || IS_ERR(rule)) return false;
+    for (unsigned i = 0; i < rule->len; ++i) {
+      if (rule->steps[i].op == CRUSH_RULE_TAKE &&
+	  rule->steps[i].arg1 == take) {
+	return true;
+      }
+    }
+    return false;
+  }
+  /// Number of steps of rule @p ruleno, or a negative errno when there is no
+  /// map or no such rule.
+  int get_rule_len(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    if (IS_ERR(r)) return PTR_ERR(r);
+    // get_rule() returns NULL (not an error pointer) for a missing rule.
+    if (!r) return -ENOENT;
+    return r->len;
+  }
+  /// mask.ruleset of rule @p ruleno, or -1 when the rule cannot be resolved.
+  int get_rule_mask_ruleset(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // NULL means "no such rule"; IS_ERR() does not cover it.
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.ruleset;
+  }
+  /// mask.type of rule @p ruleno, or -1 when the rule cannot be resolved.
+  int get_rule_mask_type(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // NULL means "no such rule"; IS_ERR() does not cover it.
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.type;
+  }
+  /// mask.min_size of rule @p ruleno, or -1 when the rule cannot be resolved.
+  int get_rule_mask_min_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // NULL means "no such rule"; IS_ERR() does not cover it.
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.min_size;
+  }
+  /// mask.max_size of rule @p ruleno, or -1 when the rule cannot be resolved.
+  int get_rule_mask_max_size(unsigned ruleno) const {
+    crush_rule *r = get_rule(ruleno);
+    // NULL means "no such rule"; IS_ERR() does not cover it.
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size;
+  }
+  /// op of step @p step in rule @p ruleno, or the negative errno encoded by
+  /// get_rule_step() when the step cannot be resolved.
+  int get_rule_op(unsigned ruleno, unsigned step) const {
+    crush_rule_step *s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->op;
+  }
+  /// arg1 of step @p step in rule @p ruleno, or the negative errno encoded by
+  /// get_rule_step() when the step cannot be resolved.
+  int get_rule_arg1(unsigned ruleno, unsigned step) const {
+    crush_rule_step *s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->arg1;
+  }
+  /// arg2 of step @p step in rule @p ruleno, or the negative errno encoded by
+  /// get_rule_step() when the step cannot be resolved.
+  int get_rule_arg2(unsigned ruleno, unsigned step) const {
+    crush_rule_step *s = get_rule_step(ruleno, step);
+    if (IS_ERR(s)) return PTR_ERR(s);
+    return s->arg2;
+  }
+
+private:
+  float _get_take_weight_osd_map(int root, std::map<int,float> *pmap) const;
+  void _normalize_weight_map(float sum, const std::map<int,float>& m,
+			     std::map<int,float> *pmap) const;
+
+public:
+  /**
+   * calculate a map of osds to weights for a given rule
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given rule.
+   *
+   * @param ruleno [in] rule id
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_rule_weight_osd_map(unsigned ruleno, std::map<int,float> *pmap) const;
+
+  /**
+   * calculate a map of osds to weights for a given starting root
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given starting root
+   *
+   * @param root node
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_take_weight_osd_map(int root, std::map<int,float> *pmap) const;
+
+  /* modifiers */
+
+  /**
+   * Allocate a rule with room for @p len steps and insert it into the map.
+   *
+   * @param ruleno slot to use; it is also passed to crush_make_rule() as the
+   *               ruleset value
+   * @return the rule number reported by crush_add_rule(), -ENOENT when no
+   *         map is loaded, or the negative errno from crush_add_rule()
+   */
+  int add_rule(int ruleno, int len, int type, int minsize, int maxsize) {
+    if (!crush) return -ENOENT;
+    crush_rule *n = crush_make_rule(len, ruleno, type, minsize, maxsize);
+    ceph_assert(n);
+    const int r = crush_add_rule(crush, n, ruleno);
+    if (r < 0) {
+      // The map did not take ownership of the rule; crush_make_rule()
+      // obtained it with malloc(), so release it instead of leaking.
+      free(n);
+    }
+    return r;
+  }
+  /// Store @p max_size in the mask of rule @p ruleno.
+  /// @return the value as stored in the mask, or -1 when the rule cannot be
+  ///         resolved
+  int set_rule_mask_max_size(unsigned ruleno, int max_size) {
+    crush_rule *r = get_rule(ruleno);
+    // NULL means "no such rule"; IS_ERR() does not cover it.
+    if (!r || IS_ERR(r)) return -1;
+    return r->mask.max_size = max_size;
+  }
+  /// Fill step @p step of rule @p ruleno with (@p op, @p arg1, @p arg2).
+  /// @return 0 on success, -ENOENT without a map, -1 when the rule is missing
+  int set_rule_step(unsigned ruleno, unsigned step, int op, int arg1, int arg2) {
+    if (!crush)
+      return -ENOENT;
+    crush_rule *rule = get_rule(ruleno);
+    if (rule == NULL)
+      return -1;
+    crush_rule_set_step(rule, step, op, arg1, arg2);
+    return 0;
+  }
+  /// Make step @p step of rule @p ruleno a CRUSH_RULE_TAKE with arg1 = @p val.
+  int set_rule_step_take(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_TAKE, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSE_TRIES with arg1 = @p val.
+  int set_rule_step_set_choose_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_TRIES, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES with arg1 = @p val.
+  int set_rule_step_set_choose_local_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES with
+  /// arg1 = @p val.
+  int set_rule_step_set_choose_local_fallback_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSELEAF_TRIES with arg1 = @p val.
+  int set_rule_step_set_chooseleaf_tries(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_TRIES, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSELEAF_VARY_R with arg1 = @p val.
+  int set_rule_step_set_chooseleaf_vary_r(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_VARY_R, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_SET_CHOOSELEAF_STABLE with arg1 = @p val.
+  int set_rule_step_set_chooseleaf_stable(unsigned ruleno, unsigned step, int val) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_STABLE, val, 0);
+  }
+  /// Make step @p step a CRUSH_RULE_CHOOSE_FIRSTN with arg1 = @p val and
+  /// arg2 = @p type.
+  int set_rule_step_choose_firstn(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_FIRSTN, val, type);
+  }
+  /// Make step @p step a CRUSH_RULE_CHOOSE_INDEP with arg1 = @p val and
+  /// arg2 = @p type.
+  int set_rule_step_choose_indep(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_INDEP, val, type);
+  }
+  /// Make step @p step a CRUSH_RULE_CHOOSELEAF_FIRSTN with arg1 = @p val and
+  /// arg2 = @p type.
+  int set_rule_step_choose_leaf_firstn(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_FIRSTN, val, type);
+  }
+  /// Make step @p step a CRUSH_RULE_CHOOSELEAF_INDEP with arg1 = @p val and
+  /// arg2 = @p type.
+  int set_rule_step_choose_leaf_indep(unsigned ruleno, unsigned step, int val, int type) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_INDEP, val, type);
+  }
+  /// Make step @p step of rule @p ruleno a CRUSH_RULE_EMIT (both args 0).
+  int set_rule_step_emit(unsigned ruleno, unsigned step) {
+    return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
+  }
+
+  int add_simple_rule(
+    std::string name, std::string root_name, std::string failure_domain_type,
+    std::string device_class, std::string mode, int rule_type,
+    std::ostream *err = 0);
+
+  /**
+   * @param rno rule[set] id to use, -1 to pick the lowest available
+   */
+  int add_simple_rule_at(
+    std::string name, std::string root_name,
+    std::string failure_domain_type, std::string device_class, std::string mode,
+    int rule_type, int rno, std::ostream *err = 0);
+
+  int remove_rule(int ruleno);
+
+
+  /** buckets **/
+  /**
+   * Read-only bucket lookup.
+   *
+   * Bucket ids map to array slot (-1 - id). The result is an error-encoded
+   * pointer: -EINVAL without a map, -ENOENT when the slot is out of range or
+   * empty. Check it with IS_ERR().
+   */
+  const crush_bucket *get_bucket(int id) const {
+    if (!crush)
+      return (crush_bucket *)(-EINVAL);
+    // Non-negative ids wrap to a huge unsigned slot and fail the range test.
+    const unsigned int slot = (unsigned int)(-1 - id);
+    const unsigned int limit = crush->max_buckets;
+    if (slot < limit) {
+      crush_bucket *found = crush->buckets[slot];
+      if (found != NULL)
+	return found;
+    }
+    return (crush_bucket *)(-ENOENT);
+  }
+private:
+  /// Mutable bucket lookup; same contract as the const overload, to which it
+  /// delegates (error-encoded pointer: -EINVAL without a map, -ENOENT when
+  /// the bucket is missing).
+  crush_bucket *get_bucket(int id) {
+    const CrushWrapper *self = this;
+    return const_cast<crush_bucket *>(self->get_bucket(id));
+  }
+  /**
+   * detach a bucket from its parent and adjust the parent weight
+   *
+   * returns the weight of the detached bucket
+   **/
+  int detach_bucket(CephContext *cct, int item);
+
+  int get_new_bucket_id();
+
+public:
+  /// max_buckets of the underlying map, or -EINVAL when no map is loaded.
+  int get_max_buckets() const {
+    return crush ? crush->max_buckets : -EINVAL;
+  }
+  /// Result of crush_get_next_bucket_id() for the map, or -EINVAL when no
+  /// map is loaded.
+  int get_next_bucket_id() const {
+    return crush ? crush_get_next_bucket_id(crush) : -EINVAL;
+  }
+  /// True when get_bucket(@p id) resolves to a real bucket.
+  bool bucket_exists(int id) const {
+    return !IS_ERR(get_bucket(id));
+  }
+  /// weight field of bucket @p id, or the negative errno encoded by
+  /// get_bucket() when the lookup fails.
+  int get_bucket_weight(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->weight;
+  }
+  /// weight field of bucket @p id divided by 0x10000; 0 when the lookup
+  /// fails (no error is reported to the caller).
+  float get_bucket_weightf(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return 0;
+    return b->weight / (float)0x10000;
+  }
+  /// type field of bucket @p id, or the negative errno encoded by
+  /// get_bucket() when the lookup fails.
+  int get_bucket_type(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->type;
+  }
+  /// alg field of bucket @p id, or the negative errno encoded by
+  /// get_bucket() when the lookup fails.
+  int get_bucket_alg(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->alg;
+  }
+  /// hash field of bucket @p id, or the negative errno encoded by
+  /// get_bucket() when the lookup fails.
+  int get_bucket_hash(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->hash;
+  }
+  /// size field (number of items) of bucket @p id, or the negative errno
+  /// encoded by get_bucket() when the lookup fails.
+  int get_bucket_size(int id) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return b->size;
+  }
+  /**
+   * Item stored at position @p pos of bucket @p id.
+   *
+   * @return the item, the negative errno from get_bucket() when the bucket
+   *         lookup fails, or -EINVAL when @p pos is outside the bucket.
+   *         NOTE(review): items may themselves be negative (bucket ids), so
+   *         callers that need to tell errors apart should validate @p pos
+   *         against get_bucket_size() first.
+   */
+  int get_bucket_item(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    // b is a valid pointer here, so PTR_ERR(b) would only yield a truncated
+    // address; report a real error code instead.
+    if ((__u32)pos >= b->size)
+      return -EINVAL;
+    return b->items[pos];
+  }
+  /// Result of crush_get_bucket_item_weight() for position @p pos of bucket
+  /// @p id, or the negative errno encoded by get_bucket() on lookup failure.
+  int get_bucket_item_weight(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return PTR_ERR(b);
+    return crush_get_bucket_item_weight(b, pos);
+  }
+  /// Float flavour of get_bucket_item_weight(): the integer weight divided
+  /// by 0x10000, or 0 when the bucket lookup fails.
+  float get_bucket_item_weightf(int id, int pos) const {
+    const crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) return 0;
+    return (float)crush_get_bucket_item_weight(b, pos) / (float)0x10000;
+  }
+
+  /* modifiers */
+  int add_bucket(int bucketno, int alg, int hash, int type, int size,
+		 int *items, int *weights, int *idout);
+  int bucket_add_item(crush_bucket *bucket, int item, int weight);
+  int bucket_remove_item(struct crush_bucket *bucket, int item);
+  int bucket_adjust_item_weight(
+    CephContext *cct, struct crush_bucket *bucket, int item, int weight,
+    bool adjust_weight_sets);
+
+  /**
+   * Finish building the map: run crush_finalize(), then refresh the
+   * wrapper-side derived state. Asserts that a map is loaded.
+   */
+  void finalize() {
+    ceph_assert(crush);
+    crush_finalize(crush);
+    // Raise max_devices when the highest key in name_map lies at or beyond
+    // the value crush_finalize() left behind.
+    if (!name_map.empty() &&
+	name_map.rbegin()->first >= crush->max_devices) {
+      crush->max_devices = name_map.rbegin()->first + 1;
+    }
+    have_uniform_rules = !has_legacy_rule_ids();
+    build_rmaps();
+  }
+  int bucket_set_alg(int id, int alg);
+
+  int update_device_class(int id, const std::string& class_name,
+			  const std::string& name, std::ostream *ss);
+  int remove_device_class(CephContext *cct, int id, std::ostream *ss);
+  int device_class_clone(
+    int original, int device_class,
+    const std::map<int32_t, std::map<int32_t, int32_t>>& old_class_bucket,
+    const std::set<int32_t>& used_ids,
+    int *clone,
+    std::map<int, std::map<int,std::vector<int>>> *cmap_item_weight);
+  bool class_is_in_use(int class_id, std::ostream *ss = nullptr);
+  int rename_class(const std::string& srcname, const std::string& dstname);
+  int populate_classes(
+    const std::map<int32_t, std::map<int32_t, int32_t>>& old_class_bucket);
+  int get_rules_by_class(const std::string &class_name, std::set<int> *rules);
+  int get_rules_by_osd(int osd, std::set<int> *rules);
+  bool _class_is_dead(int class_id);
+  void cleanup_dead_classes();
+  int rebuild_roots_with_classes(CephContext *cct);
+  /* remove unused roots generated for class devices */
+  int trim_roots_with_class(CephContext *cct);
+
+  int reclassify(
+    CephContext *cct,
+    std::ostream& out,
+    const std::map<std::string,std::string>& classify_root,
+    const std::map<std::string,std::pair<std::string,std::string>>& classify_bucket
+    );
+
+  int set_subtree_class(const std::string& name, const std::string& class_name);
+
+  /// (Re)allocate the zeroed choose_tries histogram of the map, releasing
+  /// any previous one.
+  void start_choose_profile() {
+    free(crush->choose_tries);
+    /*
+     * the original choose_total_tries value was off by one (it
+     * counted "retries" and not "tries").  add one to alloc.
+     *
+     * calloc() takes (count, element size) and already returns zeroed
+     * memory, so no separate memset is needed (and a failed allocation is
+     * no longer written through).
+     */
+    crush->choose_tries = (__u32 *)calloc(crush->choose_total_tries + 1,
+					  sizeof(*crush->choose_tries));
+  }
+  /// Release the choose_tries histogram and reset the pointer so that
+  /// get_choose_profile() reports nothing.
+  void stop_choose_profile() {
+    free(crush->choose_tries);
+    crush->choose_tries = 0;
+  }
+
+  /// Hand out the choose_tries histogram through @p vec.
+  /// @return choose_total_tries when a histogram exists, otherwise 0 (and
+  ///         @p vec is left untouched)
+  int get_choose_profile(__u32 **vec) {
+    if (!crush->choose_tries)
+      return 0;
+    *vec = crush->choose_tries;
+    return crush->choose_total_tries;
+  }
+
+
+  /// Overwrite max_devices of the map with @p m; the map pointer is not
+  /// checked here.
+  void set_max_devices(int m) {
+    crush->max_devices = m;
+  }
+
+  /**
+   * Find the rule serving (@p ruleset, @p type, @p size).
+   *
+   * When have_uniform_rules is set, the slot indexed by @p ruleset is tried
+   * first as a shortcut; anything else is delegated to crush_find_rule().
+   *
+   * @return rule number, -1 when no map is loaded, or whatever
+   *         crush_find_rule() reports
+   */
+  int find_rule(int ruleset, int type, int size) const {
+    if (!crush) return -1;
+    if (have_uniform_rules &&
+	ruleset >= 0 &&  // a negative value must never index rules[]
+	ruleset < (int)crush->max_rules &&
+	crush->rules[ruleset] &&
+	crush->rules[ruleset]->mask.type == type &&
+	crush->rules[ruleset]->mask.min_size <= size &&
+	crush->rules[ruleset]->mask.max_size >= size) {
+      return ruleset;
+    }
+    return crush_find_rule(crush, ruleset, type, size);
+  }
+
+  /// True when some existing rule carries mask.ruleset == @p ruleset; false
+  /// when no map is loaded.
+  bool ruleset_exists(const int ruleset) const {
+    // Same guard as the other accessors: the loop bound dereferences crush.
+    if (!crush) return false;
+    for (size_t i = 0; i < crush->max_rules; ++i) {
+      if (rule_exists(i) && crush->rules[i]->mask.ruleset == ruleset) {
+	return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Return the lowest numbered ruleset of type `type`
+   *
+   * @returns a ruleset ID, or -1 if no matching rules found.
+   */
+  int find_first_ruleset(int type) const {
+    int result = -1;
+
+    // Same guard as the other accessors: the loop bound dereferences crush.
+    if (!crush) return result;
+
+    for (size_t i = 0; i < crush->max_rules; ++i) {
+      if (crush->rules[i]
+          && crush->rules[i]->mask.type == type
+          && (crush->rules[i]->mask.ruleset < result || result == -1)) {
+        result = crush->rules[i]->mask.ruleset;
+      }
+    }
+
+    return result;
+  }
+
+  /// True when choose_args holds an entry for @p choose_args_index.
+  bool have_choose_args(int64_t choose_args_index) const {
+    return choose_args.count(choose_args_index) != 0;
+  }
+
+  /// Copy of the choose_args entry for @p choose_args_index; falls back to
+  /// the DEFAULT_CHOOSE_ARGS entry, then to an empty map (args NULL, size 0).
+  crush_choose_arg_map choose_args_get_with_fallback(
+    int64_t choose_args_index) const {
+    auto it = choose_args.find(choose_args_index);
+    if (it == choose_args.end())
+      it = choose_args.find(DEFAULT_CHOOSE_ARGS);
+    if (it != choose_args.end())
+      return it->second;
+
+    crush_choose_arg_map empty;
+    empty.args = NULL;
+    empty.size = 0;
+    return empty;
+  }
+  /// Copy of the choose_args entry for @p choose_args_index, or an empty map
+  /// (args NULL, size 0) when there is none. No fallback is attempted.
+  crush_choose_arg_map choose_args_get(int64_t choose_args_index) const {
+    const auto it = choose_args.find(choose_args_index);
+    if (it != choose_args.end())
+      return it->second;
+
+    crush_choose_arg_map empty;
+    empty.args = NULL;
+    empty.size = 0;
+    return empty;
+  }
+
+  /// Free everything hanging off @p arg_map: each weight vector, each
+  /// weight_set array, each ids array, and finally the args array itself.
+  void destroy_choose_args(crush_choose_arg_map arg_map) {
+    for (__u32 b = 0; b < arg_map.size; b++) {
+      crush_choose_arg &arg = arg_map.args[b];
+      for (__u32 p = 0; p < arg.weight_set_positions; p++)
+	free(arg.weight_set[p].weights);
+      // free(NULL) is a no-op, so no explicit null tests are needed.
+      free(arg.weight_set);
+      free(arg.ids);
+    }
+    free(arg_map.args);
+  }
+
+  /**
+   * Create a choose_args entry @p id with one slot per bucket position.
+   *
+   * STRAW2 buckets get @p positions weight sets, each initialised from the
+   * bucket's item_weights; every other slot gets no weight set. ids are
+   * left empty for all slots.
+   *
+   * @return false when an entry for @p id already exists, true otherwise
+   */
+  bool create_choose_args(int64_t id, int positions) {
+    if (choose_args.count(id))
+      return false;
+    ceph_assert(positions);
+    auto &cmap = choose_args[id];
+    // calloc() takes (element count, element size).
+    cmap.args = static_cast<crush_choose_arg*>(calloc(crush->max_buckets,
+					  sizeof(crush_choose_arg)));
+    cmap.size = crush->max_buckets;
+    for (int bidx=0; bidx < crush->max_buckets; ++bidx) {
+      crush_bucket *b = crush->buckets[bidx];
+      auto &carg = cmap.args[bidx];
+      carg.ids = NULL;
+      carg.ids_size = 0;
+      if (b && b->alg == CRUSH_BUCKET_STRAW2) {
+	crush_bucket_straw2 *sb = reinterpret_cast<crush_bucket_straw2*>(b);
+	carg.weight_set_positions = positions;
+	carg.weight_set = static_cast<crush_weight_set*>(calloc(carg.weight_set_positions,
+						    sizeof(crush_weight_set)));
+	// initialize with canonical weights
+	for (int pos = 0; pos < positions; ++pos) {
+	  carg.weight_set[pos].size = b->size;
+	  carg.weight_set[pos].weights = (__u32*)calloc(b->size, sizeof(__u32));
+	  for (unsigned i = 0; i < b->size; ++i) {
+	    carg.weight_set[pos].weights[i] = sb->item_weights[i];
+	  }
+	}
+      } else {
+	carg.weight_set = NULL;
+	carg.weight_set_positions = 0;
+      }
+    }
+    return true;
+  }
+
+  /// Destroy and erase the choose_args entry @p id, if there is one.
+  void rm_choose_args(int64_t id) {
+    auto found = choose_args.find(id);
+    if (found == choose_args.end())
+      return;
+    destroy_choose_args(found->second);
+    choose_args.erase(found);
+  }
+
+  /// Destroy every choose_args entry and empty the container.
+  void choose_args_clear() {
+    // Iterate by reference; there is no need to copy each map entry.
+    for (auto &w : choose_args)
+      destroy_choose_args(w.second);
+    choose_args.clear();
+  }
+
+  // remove choose_args for buckets that no longer exist, create them for new buckets
+  void update_choose_args(CephContext *cct);
+
+  // adjust choose_args_map weight, preserving the hierarchical summation
+  // property.  used by callers optimizing layouts by tweaking weights.
+  int _choose_args_adjust_item_weight_in_bucket(
+    CephContext *cct,
+    crush_choose_arg_map cmap,
+    int bucketid,
+    int id,
+    const std::vector<int>& weight,
+    std::ostream *ss);
+  int choose_args_adjust_item_weight(
+    CephContext *cct,
+    crush_choose_arg_map cmap,
+    int id, const std::vector<int>& weight,
+    std::ostream *ss);
+  /// Float flavour of choose_args_adjust_item_weight(): every entry of
+  /// @p weightf is scaled by 0x10000 and truncated to int before forwarding.
+  int choose_args_adjust_item_weightf(
+    CephContext *cct,
+    crush_choose_arg_map cmap,
+    int id, const std::vector<double>& weightf,
+    std::ostream *ss) {
+    std::vector<int> weight;
+    weight.reserve(weightf.size());
+    for (const double w : weightf)
+      weight.push_back((int)(w * (double)0x10000));
+    return choose_args_adjust_item_weight(cct, cmap, id, weight, ss);
+  }
+
+  /// Number of weight-set positions used by @p cmap: the first non-zero
+  /// weight_set_positions among its args, or 1 when none has any.
+  int get_choose_args_positions(crush_choose_arg_map cmap) {
+    for (unsigned idx = 0; idx != cmap.size; ++idx) {
+      const auto positions = cmap.args[idx].weight_set_positions;
+      if (positions != 0)
+	return positions;
+    }
+    return 1;
+  }
+
+  /**
+   * Run crush_do_rule() for input @p x and copy the result into @p out.
+   *
+   * @param rule rule number passed to crush_do_rule()
+   * @param out [out] resized to the number of results and filled with them;
+   *            emptied when crush_do_rule() reports a negative count
+   * @param maxout capacity of the result buffer
+   * @param weight contiguous weight container (read via std::data/std::size)
+   * @param choose_args_index choose_args entry to use, resolved with
+   *        choose_args_get_with_fallback()
+   */
+  template<typename WeightVector>
+  void do_rule(int rule, int x, std::vector<int>& out, int maxout,
+	       const WeightVector& weight,
+	       uint64_t choose_args_index) const {
+    // Both scratch buffers are runtime-sized arrays with automatic storage
+    // (a compiler extension in C++); their size grows with maxout and with
+    // crush_work_size().
+    int rawout[maxout];
+    char work[crush_work_size(crush, maxout)];
+    crush_init_workspace(crush, work);
+    crush_choose_arg_map arg_map = choose_args_get_with_fallback(
+      choose_args_index);
+    int numrep = crush_do_rule(crush, rule, x, rawout, maxout,
+			       std::data(weight), std::size(weight),
+			       work, arg_map.args);
+    // A negative count is treated as "no results".
+    if (numrep < 0)
+      numrep = 0;
+    out.resize(numrep);
+    for (int i=0; i<numrep; i++)
+      out[i] = rawout[i];
+  }
+
+  int _choose_type_stack(
+    CephContext *cct,
+    const std::vector<std::pair<int,int>>& stack,
+    const std::set<int>& overfull,
+    const std::vector<int>& underfull,
+    const std::vector<int>& more_underfull,
+    const std::vector<int>& orig,
+    std::vector<int>::const_iterator& i,
+    std::set<int>& used,
+    std::vector<int> *pw,
+    int root_bucket,
+    int rule) const;
+
+  int try_remap_rule(
+    CephContext *cct,
+    int rule,
+    int maxout,
+    const std::set<int>& overfull,
+    const std::vector<int>& underfull,
+    const std::vector<int>& more_underfull,
+    const std::vector<int>& orig,
+    std::vector<int> *out) const;
+
+  /**
+   * Check @p size against the first rule matching (@p ruleset, @p type).
+   *
+   * @param ss receives an explanation when the size is out of bounds
+   * @return true when the size fits that rule; false when it does not or
+   *         when no rule matches (nothing is written to @p ss in that case)
+   */
+  bool check_crush_rule(int ruleset, int type, int size, std::ostream& ss) {
+    ceph_assert(crush);
+
+    for (__u32 idx = 0; idx < crush->max_rules; idx++) {
+      const crush_rule *rule = crush->rules[idx];
+      if (!rule ||
+	  rule->mask.ruleset != ruleset ||
+	  rule->mask.type != type)
+	continue;
+
+      // Only the first matching rule is consulted.
+      if (size < rule->mask.min_size) {
+	ss << "pool size is smaller than the crush rule min size";
+	return false;
+      }
+      if (size > rule->mask.max_size) {
+	ss << "pool size is bigger than the crush rule max size";
+	return false;
+      }
+      return true;
+    }
+
+    return false;
+  }
+
+  void encode(ceph::buffer::list &bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator &blp);
+  void decode_crush_bucket(crush_bucket** bptr,
+			   ceph::buffer::list::const_iterator &blp);
+  void dump(ceph::Formatter *f) const;
+  void dump_rules(ceph::Formatter *f) const;
+  void dump_rule(int ruleset, ceph::Formatter *f) const;
+  void dump_tunables(ceph::Formatter *f) const;
+  void dump_choose_args(ceph::Formatter *f) const;
+  void list_rules(ceph::Formatter *f) const;
+  void list_rules(std::ostream *ss) const;
+  void dump_tree(std::ostream *out,
+                 ceph::Formatter *f,
+		 const CrushTreeDumper::name_map_t& ws,
+                 bool show_shadow = false) const;
+  /// Convenience overload: dump the tree with an empty name map and the
+  /// default show_shadow setting.
+  void dump_tree(std::ostream *out, ceph::Formatter *f) {
+    dump_tree(out, f, CrushTreeDumper::name_map_t());
+  }
+  void dump_tree(ceph::Formatter *f,
+		 const CrushTreeDumper::name_map_t& ws) const;
+  static void generate_test_instances(std::list<CrushWrapper*>& o);
+
+  int get_osd_pool_default_crush_replicated_ruleset(CephContext *cct);
+
+  static bool is_valid_crush_name(const std::string& s);
+  static bool is_valid_crush_loc(CephContext *cct,
+				 const std::map<std::string,std::string>& loc);
+};
+WRITE_CLASS_ENCODER_FEATURES(CrushWrapper)
+
+#endif
diff --git a/src/crush/CrushWrapper.i b/src/crush/CrushWrapper.i
new file mode 100644
index 000000000..76340611b
--- /dev/null
+++ b/src/crush/CrushWrapper.i
@@ -0,0 +1,47 @@
+/* File : CrushWrapper.i */
+%module CrushWrapper
+%{
+#include "CrushWrapper.h"
+%}
+
+%include typemaps.i
+
+// This tells SWIG to treat 'int *items' as a special case: the Perl array
+// reference passed in is converted into a freshly malloc'ed C int array.
+%typemap(in) int *items {
+  AV *tempav;
+  I32 len;
+  int i;
+  SV **tv;
+
+  if (!SvROK($input))
+	croak("$input is not a reference.");
+  if (SvTYPE(SvRV($input)) != SVt_PVAV)
+	croak("$input is not an array.");
+
+  tempav = (AV*)SvRV($input);
+  /* av_len() is the highest index, i.e. -1 for an empty array */
+  len = av_len(tempav);
+  $1 = (int *) malloc((len+1)*sizeof(int));
+  if (!$1 && len >= 0)
+	croak("out of memory while converting array argument.");
+  for (i = 0; i <= len; i++) {
+	tv = av_fetch(tempav, i, 0);
+	/* av_fetch() returns NULL for a slot that does not exist; use 0 */
+	$1[i] = tv ? (int) SvIV(*tv) : 0;
+  }
+}
+
+%apply int *items { int *weights };
+%apply double *OUTPUT { double *min, double *max, double *avg };
+
+/* Let's just grab the original header file here */
+%include "CrushWrapper.h"      
+
+%clear double *min, double *max, double *avg;
diff --git a/src/crush/builder.c b/src/crush/builder.c
new file mode 100644
index 000000000..25788e290
--- /dev/null
+++ b/src/crush/builder.c
@@ -0,0 +1,1531 @@
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "crush/crush.h"
+#include "builder.h"
+
+#define dprintk(args...) /* printf(args) */
+
+#define BUG_ON(x) assert(!(x))
+
+/*
+ * Allocate a zero-filled crush_map and apply set_optimal_crush_map() to it.
+ * Returns NULL when the allocation fails.
+ */
+struct crush_map *crush_create()
+{
+	struct crush_map *map = calloc(1, sizeof(*map));
+
+	if (map == NULL)
+		return NULL;
+	set_optimal_crush_map(map);
+	return map;
+}
+
+/*
+ * finalize should be called _after_ all buckets are added to the map.
+ */
+void crush_finalize(struct crush_map *map)
+{
+	int b;
+	__u32 i;
+
+	/* Calculate the needed working space while we do other
+	   finalization tasks. */
+	map->working_size = sizeof(struct crush_work);
+	/* Space for the array of pointers to per-bucket workspace */
+	map->working_size += map->max_buckets *
+		sizeof(struct crush_work_bucket *);
+
+	/* calc max_devices: one more than the largest item found in any
+	   bucket, recomputed from scratch */
+	map->max_devices = 0;
+	for (b=0; b<map->max_buckets; b++) {
+		/* empty slots contribute nothing */
+		if (map->buckets[b] == 0)
+			continue;
+		for (i=0; i<map->buckets[b]->size; i++)
+			if (map->buckets[b]->items[i] >= map->max_devices)
+				map->max_devices = map->buckets[b]->items[i] + 1;
+
+		/* only a default case exists, so every bucket algorithm is
+		   accounted for in the same way */
+		switch (map->buckets[b]->alg) {
+		default:
+			/* The base case, permutation variables and
+			   the pointer to the permutation array. */
+			map->working_size += sizeof(struct crush_work_bucket);
+			break;
+		}
+		/* Every bucket has a permutation array. */
+		map->working_size += map->buckets[b]->size * sizeof(__u32);
+	}
+}
+
+
+
+/** rules **/
+
+/*
+ * Store rule in the map.
+ *
+ * ruleno < 0 picks the first free slot (possibly one past the current end);
+ * otherwise the given slot is used and any previous content is overwritten.
+ * The rules array is grown as needed.
+ *
+ * Returns the slot used, -ENOSPC when the array already holds
+ * CRUSH_MAX_RULES slots and must grow, or -ENOMEM when growing fails. On
+ * failure the map is left unchanged and the caller still owns rule.
+ *
+ * NOTE(review): the -ENOSPC test looks at the current array size rather
+ * than at the requested slot, so an explicit ruleno above CRUSH_MAX_RULES
+ * is not rejected here - confirm whether callers guarantee the bound.
+ */
+int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
+{
+	__u32 r;
+
+	if (ruleno < 0) {
+		for (r=0; r < map->max_rules; r++)
+			if (map->rules[r] == 0)
+				break;
+		assert(r < CRUSH_MAX_RULES);
+	}
+	else
+		r = ruleno;
+
+	if (r >= map->max_rules) {
+		/* expand array */
+		__u32 oldsize = map->max_rules;
+		__u32 newsize = r+1;
+		void *_realloc = NULL;
+		if (map->max_rules +1 > CRUSH_MAX_RULES)
+			return -ENOSPC;
+		_realloc = realloc(map->rules, newsize * sizeof(map->rules[0]));
+		if (_realloc == NULL)
+			return -ENOMEM;
+		/* publish the new size only once the array really has it, so
+		   a failed realloc cannot leave max_rules beyond the buffer */
+		map->rules = _realloc;
+		map->max_rules = newsize;
+		memset(map->rules + oldsize, 0, (newsize-oldsize) * sizeof(map->rules[0]));
+	}
+
+	/* add it */
+	map->rules[r] = rule;
+	return r;
+}
+
+/*
+ * Allocate a rule sized for len steps and fill in its length and mask.
+ * The steps themselves are left uninitialised. Returns NULL when the
+ * allocation fails.
+ */
+struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
+{
+	struct crush_rule *made = malloc(crush_rule_size(len));
+
+	if (made == NULL)
+		return NULL;
+
+	made->mask.ruleset = ruleset;
+	made->mask.type = type;
+	made->mask.min_size = minsize;
+	made->mask.max_size = maxsize;
+	made->len = len;
+	return made;
+}
+
+/*
+ * Fill step n of rule with (op, arg1, arg2).
+ *
+ * be careful; n is only checked (by assertion) against rule->len, not
+ * against the size of the buffer that was actually allocated for the rule.
+ */
+void crush_rule_set_step(struct crush_rule *rule, int n, int op, int arg1, int arg2)
+{
+	/* the unsigned cast makes a negative n fail the check as well */
+	assert((__u32)n < rule->len);
+	rule->steps[n].op = op;
+	rule->steps[n].arg1 = arg1;
+	rule->steps[n].arg2 = arg2;
+}
+
+
+/** buckets **/
+/*
+ * Id of the first free bucket slot: -1 - slot, where slot is the first
+ * empty position, or max_buckets when every position is taken.
+ */
+int crush_get_next_bucket_id(struct crush_map *map)
+{
+	int slot = 0;
+
+	while (slot < map->max_buckets && map->buckets[slot] != 0)
+		slot++;
+	return -1 - slot;
+}
+
+
+/*
+ * Insert bucket into the map under the given id (0 = pick the next free
+ * id). Bucket ids are negative; id maps to array slot -1 - id. The bucket
+ * array is doubled (starting at 8) until the slot fits.
+ *
+ * Returns 0 on success and stores the id used in *idout when idout is not
+ * NULL. Returns -EINVAL for a positive id, -ENOMEM when the array cannot
+ * grow and -EEXIST when the slot is already occupied; the bucket is not
+ * modified in those cases.
+ */
+int crush_add_bucket(struct crush_map *map,
+		     int id,
+		     struct crush_bucket *bucket,
+		     int *idout)
+{
+	int pos;
+
+	/* find a bucket id */
+	if (id == 0)
+		id = crush_get_next_bucket_id(map);
+	pos = -1 - id;
+	/* a positive id would index in front of the array */
+	if (pos < 0)
+		return -EINVAL;
+
+	while (pos >= map->max_buckets) {
+		/* expand array */
+		int oldsize = map->max_buckets;
+		int newsize = oldsize ? oldsize * 2 : 8;
+		void *_realloc = realloc(map->buckets, newsize * sizeof(map->buckets[0]));
+		if (_realloc == NULL)
+			return -ENOMEM;
+		/* publish the new size only once the array really has it, so
+		   a failed realloc cannot leave max_buckets beyond the buffer */
+		map->buckets = _realloc;
+		map->max_buckets = newsize;
+		memset(map->buckets + oldsize, 0, (newsize-oldsize) * sizeof(map->buckets[0]));
+	}
+
+	if (map->buckets[pos] != 0) {
+		return -EEXIST;
+	}
+
+	/* add it */
+	bucket->id = id;
+	map->buckets[pos] = bucket;
+
+	if (idout) *idout = id;
+	return 0;
+}
+
+/*
+ * Clear the map slot derived from bucket->id and destroy the bucket.
+ * Always returns 0; only the upper bound of the slot is asserted.
+ */
+int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket)
+{
+	int pos = -1 - bucket->id;
+       assert(pos < map->max_buckets);
+	map->buckets[pos] = NULL;
+	crush_destroy_bucket(bucket);
+	return 0;
+}
+
+
+/* uniform bucket */
+
+/*
+ * Build a uniform bucket holding a copy of the size entries of items, each
+ * with weight item_weight; the bucket weight is size * item_weight.
+ * Returns NULL on allocation failure or when that product is flagged by
+ * crush_multiplication_is_unsafe().
+ */
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int hash, int type, int size,
+			  int *items,
+			  int item_weight)
+{
+	int i;
+	struct crush_bucket_uniform *bucket;
+
+	bucket = malloc(sizeof(*bucket));
+        if (!bucket)
+                return NULL;
+	/* zero-fill so the err path can free members unconditionally */
+	memset(bucket, 0, sizeof(*bucket));
+	bucket->h.alg = CRUSH_BUCKET_UNIFORM;
+	bucket->h.hash = hash;
+	bucket->h.type = type;
+	bucket->h.size = size;
+
+	if (crush_multiplication_is_unsafe(size, item_weight))
+                goto err;
+
+	bucket->h.weight = size * item_weight;
+	bucket->item_weight = item_weight;
+
+	/* an empty bucket carries no items array */
+	if (size == 0) {
+		return bucket;
+	}
+	bucket->h.items = malloc(sizeof(__s32)*size);
+
+        if (!bucket->h.items)
+                goto err;
+
+	for (i=0; i<size; i++)
+		bucket->h.items[i] = items[i];
+
+	return bucket;
+err:
+        free(bucket->h.items);
+        free(bucket);
+        return NULL;
+}
+
+
+/* list bucket */
+
+/*
+ * Build a list bucket from size (item, weight) pairs. Besides copies of the
+ * items and weights it stores, for each position, the running sum of the
+ * weights up to and including that position; the bucket weight is the total.
+ * Returns NULL on allocation failure or when the running sum is flagged by
+ * crush_addition_is_unsafe().
+ */
+struct crush_bucket_list*
+crush_make_list_bucket(int hash, int type, int size,
+		       int *items,
+		       int *weights)
+{
+	int i;
+	int w;
+	struct crush_bucket_list *bucket;
+
+	bucket = malloc(sizeof(*bucket));
+        if (!bucket)
+                return NULL;
+	/* zero-fill so the err path can free members unconditionally */
+	memset(bucket, 0, sizeof(*bucket));
+	bucket->h.alg = CRUSH_BUCKET_LIST;
+	bucket->h.hash = hash;
+	bucket->h.type = type;
+	bucket->h.size = size;
+
+	/* an empty bucket carries no arrays */
+	if (size == 0) {
+		return bucket;
+	}
+
+	bucket->h.items = malloc(sizeof(__s32)*size);
+        if (!bucket->h.items)
+                goto err;
+
+
+        bucket->item_weights = malloc(sizeof(__u32)*size);
+        if (!bucket->item_weights)
+                goto err;
+	bucket->sum_weights = malloc(sizeof(__u32)*size);
+        if (!bucket->sum_weights)
+                goto err;
+	w = 0;
+	for (i=0; i<size; i++) {
+		bucket->h.items[i] = items[i];
+		bucket->item_weights[i] = weights[i];
+
+		if (crush_addition_is_unsafe(w, weights[i]))
+                        goto err;
+
+		w += weights[i];
+		bucket->sum_weights[i] = w;
+		/*dprintk("pos %d item %d weight %d sum %d\n",
+		  i, items[i], weights[i], bucket->sum_weights[i]);*/
+	}
+
+	bucket->h.weight = w;
+
+	return bucket;
+err:
+        free(bucket->sum_weights);
+        free(bucket->item_weights);
+        free(bucket->h.items);
+        free(bucket);
+        return NULL;
+}
+
+
+/* tree bucket */
+
+/* Number of trailing zero bits of n; does not terminate for n == 0. */
+static int height(int n) {
+	int level;
+
+	for (level = 0; (n & 1) == 0; level++)
+		n = n >> 1;
+	return level;
+}
+/* Non-zero when bit (h+1) of n is set, which parent() reads as "n is the
+   right-hand child at height h". */
+static int on_right(int n, int h) {
+	return n & (1 << (h+1));
+}
+/* Node number of the parent of n: one (1 << height) step down for a
+   right-hand child, one step up otherwise. */
+static int parent(int n)
+{
+	const int level = height(n);
+	const int step = 1 << level;
+
+	return on_right(n, level) ? n - step : n + step;
+}
+
+/* Tree depth used for a bucket of size items: 0 for an empty bucket,
+   otherwise 1 plus the number of significant bits of (size - 1). */
+static int calc_depth(int size)
+{
+	int depth;
+	int rest;
+
+	if (size == 0)
+		return 0;
+
+	depth = 1;
+	for (rest = size - 1; rest; rest = rest >> 1)
+		depth++;
+	return depth;
+}
+
+/*
+ * Build a tree bucket from size (item, weight) pairs given in leaf order.
+ * node_weights has 1 << calc_depth(size) entries; each leaf weight is
+ * stored at crush_calc_tree_node(i) and added to its depth-1 ancestors, so
+ * the node at num_nodes/2 ends up holding the total bucket weight.
+ * Returns NULL on allocation failure or when a weight sum is flagged by
+ * crush_addition_is_unsafe().
+ */
+struct crush_bucket_tree*
+crush_make_tree_bucket(int hash, int type, int size,
+		       int *items,    /* in leaf order */
+		       int *weights)
+{
+	struct crush_bucket_tree *bucket;
+	int depth;
+	int node;
+	int i, j;
+
+	bucket = malloc(sizeof(*bucket));
+        if (!bucket)
+                return NULL;
+	/* zero-fill so the err path can free members unconditionally */
+	memset(bucket, 0, sizeof(*bucket));
+	bucket->h.alg = CRUSH_BUCKET_TREE;
+	bucket->h.hash = hash;
+	bucket->h.type = type;
+	bucket->h.size = size;
+
+	/* an empty bucket carries no arrays */
+	if (size == 0) {
+		/* printf("size 0 depth 0 nodes 0\n"); */
+		return bucket;
+	}
+
+	bucket->h.items = malloc(sizeof(__s32)*size);
+        if (!bucket->h.items)
+                goto err;
+
+	/* calc tree depth */
+	depth = calc_depth(size);
+	bucket->num_nodes = 1 << depth;
+	dprintk("size %d depth %d nodes %d\n", size, depth, bucket->num_nodes);
+
+        bucket->node_weights = malloc(sizeof(__u32)*bucket->num_nodes);
+        if (!bucket->node_weights)
+                goto err;
+
+	memset(bucket->h.items, 0, sizeof(__s32)*bucket->h.size);
+	memset(bucket->node_weights, 0, sizeof(__u32)*bucket->num_nodes);
+
+	for (i=0; i<size; i++) {
+		bucket->h.items[i] = items[i];
+		node = crush_calc_tree_node(i);
+		dprintk("item %d node %d weight %d\n", i, node, weights[i]);
+		bucket->node_weights[node] = weights[i];
+
+		if (crush_addition_is_unsafe(bucket->h.weight, weights[i]))
+                        goto err;
+
+		bucket->h.weight += weights[i];
+		/* propagate the leaf weight to each ancestor */
+		for (j=1; j<depth; j++) {
+			node = parent(node);
+
+                        if (crush_addition_is_unsafe(bucket->node_weights[node], weights[i]))
+                                goto err;
+
+			bucket->node_weights[node] += weights[i];
+			dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+		}
+	}
+	/* sanity check: the root node must carry the total weight */
+	BUG_ON(bucket->node_weights[bucket->num_nodes/2] != bucket->h.weight);
+
+	return bucket;
+err:
+        free(bucket->node_weights);
+        free(bucket->h.items);
+        free(bucket);
+        return NULL;
+}
+
+
+
+/* straw bucket */
+
+/*
+ * this code was written 8 years ago.  i have a vague recollection of
+ * drawing boxes underneath bars of different lengths, where the bar
+ * length represented the probability/weight, and that there was some
+ * trial and error involved in arriving at this implementation.
+ * however, reading the code now after all this time, the intuition
+ * that motivated it is lost on me.  lame.  my only excuse is that I now
+ * know that the approach is fundamentally flawed and am not
+ * particularly motivated to reconstruct the flawed reasoning.
+ *
+ * as best as i can remember, the idea is: sort the weights, and start
+ * with the smallest.  arbitrarily scale it at 1.0 (16-bit fixed
+ * point).  look at the next larger weight, and calculate the scaling
+ * factor for that straw based on the relative difference in weight so
+ * far.  what's not clear to me now is why we are looking at wnext
+ * (the delta to the next bigger weight) for all remaining weights,
+ * and slicing things horizontally instead of considering just the
+ * next item or set of items.  or why pow() is used the way it is.
+ *
+ * note that the original version 1 of this function made special
+ * accommodation for the case where straw lengths were identical.  this
+ * is also flawed in a non-obvious way; version 2 drops the special
+ * handling and appears to work just as well.
+ *
+ * moral of the story: if you do something clever, write down why it
+ * works.
+ */
+/*
+ * Recompute bucket->straws[] from bucket->item_weights[].
+ *
+ * map->straw_calc_version selects the variant: 0 runs the first branch
+ * below, any value >= 1 runs the second.  In both, items are visited in
+ * ascending weight order and zero-weight items get a straw of 0.
+ *
+ * Returns 0 on success, or -ENOMEM if the temporary index array cannot
+ * be allocated.
+ */
+int crush_calc_straw(struct crush_map *map, struct crush_bucket_straw *bucket)
+{
+	int *reverse;
+	int i, j, k;
+	double straw, wbelow, lastw, wnext, pbelow;
+	int numleft;
+	int size = bucket->h.size;
+	__u32 *weights = bucket->item_weights;
+
+	/* reverse sort by weight (simple insertion sort) */
+	/* after the loop, reverse[] lists item indices by ascending weight */
+	reverse = malloc(sizeof(int) * size);
+        if (!reverse)
+                return -ENOMEM;
+	if (size)
+		reverse[0] = 0;
+	for (i=1; i<size; i++) {
+		for (j=0; j<i; j++) {
+			if (weights[i] < weights[reverse[j]]) {
+				/* insert here */
+				for (k=i; k>j; k--)
+					reverse[k] = reverse[k-1];
+				reverse[j] = i;
+				break;
+			}
+		}
+		/* not smaller than anything seen so far: append at the end */
+		if (j == i)
+			reverse[i] = i;
+	}
+
+	numleft = size;
+	straw = 1.0;
+	wbelow = 0;
+	lastw = 0;
+
+	i=0;
+	while (i < size) {
+		if (map->straw_calc_version == 0) {
+			/* zero weight items get 0 length straws! */
+			/* note: numleft is not decremented here, unlike the >= 1 branch */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
+				continue;
+			}
+
+			/* set this item's straw */
+			/* straw is scaled by 0x10000 before being stored */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
+			i++;
+			if (i == size)
+				break;
+
+			/* same weight as previous? */
+			/* equal weights reuse the current straw value unchanged */
+			if (weights[reverse[i]] == weights[reverse[i-1]]) {
+				dprintk("same as previous\n");
+				continue;
+			}
+
+			/* adjust straw for next guy */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			/* drop every item sharing the next weight from numleft */
+			for (j=i; j<size; j++)
+				if (weights[reverse[j]] == weights[reverse[i]])
+					numleft--;
+				else
+					break;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf  wnext %lf  pbelow %lf  numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
+
+			lastw = weights[reverse[i-1]];
+		} else if (map->straw_calc_version >= 1) {
+			/* zero weight items get 0 length straws! */
+			if (weights[reverse[i]] == 0) {
+				bucket->straws[reverse[i]] = 0;
+				i++;
+				numleft--;
+				continue;
+			}
+
+			/* set this item's straw */
+			bucket->straws[reverse[i]] = straw * 0x10000;
+			dprintk("item %d at %d weight %d straw %d (%lf)\n",
+				bucket->h.items[reverse[i]],
+				reverse[i], weights[reverse[i]],
+				bucket->straws[reverse[i]], straw);
+			i++;
+			if (i == size)
+				break;
+
+			/* adjust straw for next guy */
+			/* no equal-weight shortcut here: one item leaves numleft per step */
+			wbelow += ((double)weights[reverse[i-1]] - lastw) *
+				numleft;
+			numleft--;
+			wnext = numleft * (weights[reverse[i]] -
+					   weights[reverse[i-1]]);
+			pbelow = wbelow / (wbelow + wnext);
+			dprintk("wbelow %lf  wnext %lf  pbelow %lf  numleft %d\n",
+				wbelow, wnext, pbelow, numleft);
+
+			straw *= pow((double)1.0 / pbelow, (double)1.0 /
+				     (double)numleft);
+
+			lastw = weights[reverse[i-1]];
+		}
+	}
+
+	free(reverse);
+	return 0;
+}
+
+/*
+ * Allocate a straw bucket, copy in `size` items and weights, and
+ * compute the straw lengths with crush_calc_straw().
+ *
+ * Returns the new bucket, or NULL if any allocation or the straw
+ * calculation fails.
+ */
+struct crush_bucket_straw *
+crush_make_straw_bucket(struct crush_map *map,
+			int hash,
+			int type,
+			int size,
+			int *items,
+			int *weights)
+{
+	struct crush_bucket_straw *straw;
+	int n;
+
+	straw = malloc(sizeof(*straw));
+	if (straw == NULL)
+		return NULL;
+	memset(straw, 0, sizeof(*straw));
+	straw->h.alg = CRUSH_BUCKET_STRAW;
+	straw->h.hash = hash;
+	straw->h.type = type;
+	straw->h.size = size;
+
+	/* three parallel arrays, one slot per item */
+	straw->h.items = malloc(sizeof(__s32)*size);
+	if (straw->h.items == NULL)
+		goto fail;
+	straw->item_weights = malloc(sizeof(__u32)*size);
+	if (straw->item_weights == NULL)
+		goto fail;
+	straw->straws = malloc(sizeof(__u32)*size);
+	if (straw->straws == NULL)
+		goto fail;
+
+	n = 0;
+	while (n < size) {
+		straw->h.items[n] = items[n];
+		straw->item_weights[n] = weights[n];
+		straw->h.weight += weights[n];
+		n++;
+	}
+
+	if (crush_calc_straw(map, straw) < 0)
+		goto fail;
+
+	return straw;
+
+fail:
+	/* pointers never allocated are still NULL from the memset above */
+	free(straw->straws);
+	free(straw->item_weights);
+	free(straw->h.items);
+	free(straw);
+	return NULL;
+}
+
+/*
+ * Allocate a straw2 bucket and copy in `size` items and weights.
+ *
+ * A bucket with size 0 is returned without item or weight arrays.
+ * Returns NULL if an allocation fails.  `map` is not used here.
+ */
+struct crush_bucket_straw2 *
+crush_make_straw2_bucket(struct crush_map *map,
+			 int hash,
+			 int type,
+			 int size,
+			 int *items,
+			 int *weights)
+{
+	struct crush_bucket_straw2 *s2;
+	int n;
+
+	s2 = malloc(sizeof(*s2));
+	if (s2 == NULL)
+		return NULL;
+	memset(s2, 0, sizeof(*s2));
+	s2->h.alg = CRUSH_BUCKET_STRAW2;
+	s2->h.hash = hash;
+	s2->h.type = type;
+	s2->h.size = size;
+
+	if (size == 0)
+		return s2;
+
+	s2->h.items = malloc(sizeof(__s32)*size);
+	if (s2->h.items == NULL)
+		goto fail;
+	s2->item_weights = malloc(sizeof(__u32)*size);
+	if (s2->item_weights == NULL)
+		goto fail;
+
+	for (n = 0; n < size; n++) {
+		s2->item_weights[n] = weights[n];
+		s2->h.items[n] = items[n];
+		s2->h.weight += weights[n];
+	}
+
+	return s2;
+
+fail:
+	free(s2->item_weights);
+	free(s2->h.items);
+	free(s2);
+	return NULL;
+}
+
+
+
+/*
+ * Create a bucket of the requested algorithm by dispatching to the
+ * matching crush_make_*_bucket() constructor.
+ *
+ * For uniform buckets the per-item weight is weights[0] when both
+ * `size` and `weights` are non-zero, otherwise 0.  Returns NULL for an
+ * unrecognised `alg` or when the constructor fails.
+ */
+struct crush_bucket*
+crush_make_bucket(struct crush_map *map,
+		  int alg, int hash, int type, int size,
+		  int *items,
+		  int *weights)
+{
+	if (alg == CRUSH_BUCKET_UNIFORM) {
+		int uniform_weight = (size && weights) ? weights[0] : 0;
+
+		return (struct crush_bucket *)crush_make_uniform_bucket(hash, type, size, items, uniform_weight);
+	}
+	if (alg == CRUSH_BUCKET_LIST)
+		return (struct crush_bucket *)crush_make_list_bucket(hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_TREE)
+		return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_STRAW)
+		return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
+	if (alg == CRUSH_BUCKET_STRAW2)
+		return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights);
+
+	return NULL;
+}
+
+
+/************************************************/
+
+/*
+ * Append `item` to a uniform bucket.
+ *
+ * Every item of a uniform bucket shares bucket->item_weight, so
+ * `weight` must equal it; otherwise -EINVAL is returned.  Returns
+ * -ENOMEM if the item array cannot grow, -ERANGE if the bucket weight
+ * would overflow, and 0 on success.
+ */
+int crush_add_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	void *grown;
+
+	if (bucket->item_weight != weight)
+		return -EINVAL;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	/* the slot is filled before the overflow check, as before */
+	bucket->h.items[newsize-1] = item;
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	bucket->h.size++;
+	bucket->h.weight += weight;
+
+	return 0;
+}
+
+/*
+ * Append `item` with `weight` to a list bucket.
+ *
+ * Grows items[], item_weights[] and sum_weights[] by one slot each and
+ * stores the running sum for the new slot.  Returns -ENOMEM if a
+ * realloc fails, -ERANGE if the running sum would overflow, 0 on
+ * success.
+ */
+int crush_add_list_bucket_item(struct crush_bucket_list *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	grown = realloc(bucket->sum_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->sum_weights = grown;
+
+	bucket->h.items[newsize-1] = item;
+	bucket->item_weights[newsize-1] = weight;
+
+	if (newsize <= 1) {
+		/* first item: the running sum is just its own weight */
+		bucket->sum_weights[newsize-1] = weight;
+	} else {
+		if (crush_addition_is_unsafe(bucket->sum_weights[newsize-2], weight))
+			return -ERANGE;
+
+		bucket->sum_weights[newsize-1] = bucket->sum_weights[newsize-2] + weight;
+	}
+
+	bucket->h.weight += weight;
+	bucket->h.size++;
+	return 0;
+}
+
+/*
+ * Append `item` with `weight` as the next leaf of a tree bucket.
+ *
+ * Grows items[] to the new size and node_weights[] to 1 << depth
+ * entries, stores the leaf weight, and adds it to each ancestor node.
+ * Returns -ENOMEM if a realloc fails, -ERANGE if a node weight or the
+ * bucket weight would overflow, 0 on success.
+ */
+int crush_add_tree_bucket_item(struct crush_bucket_tree *bucket, int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	int depth = calc_depth(newsize);;
+	int node;
+	int j;
+	void *_realloc = NULL;
+
+	/* NOTE(review): num_nodes is updated before the reallocs below can fail */
+	bucket->num_nodes = 1 << depth;
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	if ((_realloc = realloc(bucket->node_weights, sizeof(__u32)*bucket->num_nodes)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->node_weights = _realloc;
+	}
+
+	/* leaf node for the new item, i.e. index newsize-1 */
+	node = crush_calc_tree_node(newsize-1);
+	bucket->node_weights[node] = weight;
+
+	/* if the depth increase, we need to initialize the new root node's weight before add bucket item */
+	/* NOTE(review): only the new leaf and the new root are written here; other
+	 * nodes gained by a depth increase appear to be left as realloc returned
+	 * them - confirm */
+	int root = bucket->num_nodes/2;
+	if (depth >= 2 && (node - 1) == root) {
+		/* if the new item is the first node in right sub tree, so
+		* the root node initial weight is left sub tree's weight
+		*/
+		bucket->node_weights[root] = bucket->node_weights[root/2];
+	}
+
+	/* add the new weight to each of the depth-1 ancestors */
+	for (j=1; j<depth; j++) {
+		node = parent(node);
+
+                if (crush_addition_is_unsafe(bucket->node_weights[node], weight))
+                        return -ERANGE;
+
+		bucket->node_weights[node] += weight;
+                dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+	}
+
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+                return -ERANGE;
+	
+	/* the item, weight and size are only committed once all checks pass */
+	bucket->h.items[newsize-1] = item;
+        bucket->h.weight += weight;
+        bucket->h.size++;
+
+	return 0;
+}
+
+/*
+ * Append `item` with `weight` to a straw bucket and recompute the
+ * straw lengths.
+ *
+ * Returns -ENOMEM if an array cannot grow, -ERANGE if the bucket
+ * weight would overflow, otherwise the result of crush_calc_straw().
+ */
+int crush_add_straw_bucket_item(struct crush_map *map,
+				struct crush_bucket_straw *bucket,
+				int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	grown = realloc(bucket->straws, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->straws = grown;
+
+	bucket->item_weights[newsize-1] = weight;
+	bucket->h.items[newsize-1] = item;
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	bucket->h.size++;
+	bucket->h.weight += weight;
+
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Append `item` with `weight` to a straw2 bucket.
+ *
+ * Returns -ENOMEM if an array cannot grow, -ERANGE if the bucket
+ * weight would overflow, 0 on success.  `map` is not used here.
+ */
+int crush_add_straw2_bucket_item(struct crush_map *map,
+				 struct crush_bucket_straw2 *bucket,
+				 int item, int weight)
+{
+	int newsize = bucket->h.size + 1;
+	void *grown;
+
+	grown = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->h.items = grown;
+
+	grown = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (grown == NULL)
+		return -ENOMEM;
+	bucket->item_weights = grown;
+
+	bucket->item_weights[newsize-1] = weight;
+	bucket->h.items[newsize-1] = item;
+
+	if (crush_addition_is_unsafe(bucket->h.weight, weight))
+		return -ERANGE;
+
+	bucket->h.size++;
+	bucket->h.weight += weight;
+
+	return 0;
+}
+
+/*
+ * Add `item` with `weight` to bucket `b`, dispatching on b->alg to the
+ * algorithm-specific helper.  Returns that helper's result, or -1 for
+ * an unrecognised algorithm.
+ */
+int crush_bucket_add_item(struct crush_map *map,
+			  struct crush_bucket *b, int item, int weight)
+{
+	int rc = -1;	/* stays -1 when the algorithm is unknown */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		rc = crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_LIST:
+		rc = crush_add_list_bucket_item((struct crush_bucket_list *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_TREE:
+		rc = crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		rc = crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		rc = crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight);
+		break;
+	default:
+		break;
+	}
+	return rc;
+}
+
+/************************************************/
+
+/*
+ * Remove the first occurrence of `item` from a uniform bucket.
+ *
+ * The items after it are shifted down one slot, the bucket size is
+ * decremented and bucket->item_weight is subtracted from the bucket
+ * weight (clamped at 0).  Returns -ENOENT if `item` is not present,
+ * -ENOMEM if shrinking the item array fails, 0 on success.
+ */
+int crush_remove_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item)
+{
+	unsigned i, j;
+	int newsize;
+	void *_realloc = NULL;
+
+	for (i = 0; i < bucket->h.size; i++)
+		if (bucket->h.items[i] == item)
+			break;
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	/* stop one short of the end so items[j+1] never reads past the array */
+	for (j = i; j + 1 < bucket->h.size; j++)
+		bucket->h.items[j] = bucket->h.items[j+1];
+	newsize = --bucket->h.size;
+	if (bucket->item_weight < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weight;
+	else
+		bucket->h.weight = 0;
+
+	if (newsize == 0) {
+		/*
+		 * don't bother reallocating a 0-length array: realloc(ptr, 0)
+		 * may free ptr and return NULL, which would be misreported as
+		 * -ENOMEM and leave bucket->h.items dangling.
+		 */
+		return 0;
+	}
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of `item` from a list bucket.
+ *
+ * Later entries of items[], item_weights[] and sum_weights[] are
+ * shifted down one slot, with the removed weight subtracted from each
+ * shifted running sum.  The bucket weight is reduced by the removed
+ * weight (clamped at 0).  Returns -ENOENT if `item` is not present,
+ * -ENOMEM if shrinking an array fails, 0 on success.
+ */
+int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item)
+{
+	unsigned i, j;
+	int newsize;
+	unsigned weight;
+	void *_realloc = NULL;
+
+	for (i = 0; i < bucket->h.size; i++)
+		if (bucket->h.items[i] == item)
+			break;
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	weight = bucket->item_weights[i];
+	/* stop one short of the end so the [j+1] reads stay inside the arrays */
+	for (j = i; j + 1 < bucket->h.size; j++) {
+		bucket->h.items[j] = bucket->h.items[j+1];
+		bucket->item_weights[j] = bucket->item_weights[j+1];
+		bucket->sum_weights[j] = bucket->sum_weights[j+1] - weight;
+	}
+	if (weight < bucket->h.weight)
+		bucket->h.weight -= weight;
+	else
+		bucket->h.weight = 0;
+	newsize = --bucket->h.size;
+
+	if (newsize == 0) {
+		/*
+		 * don't bother reallocating 0-length arrays: realloc(ptr, 0)
+		 * may free ptr and return NULL, which would be misreported as
+		 * -ENOMEM and leave the bucket with dangling pointers.
+		 */
+		return 0;
+	}
+
+	if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->h.items = _realloc;
+	}
+	if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->item_weights = _realloc;
+	}
+	if ((_realloc = realloc(bucket->sum_weights, sizeof(__u32)*newsize)) == NULL) {
+		return -ENOMEM;
+	} else {
+		bucket->sum_weights = _realloc;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of `item` from a tree bucket.
+ *
+ * The item's slot is set to 0 rather than compacted: its leaf weight
+ * is cleared and subtracted from every ancestor node and from the
+ * bucket weight (clamped at 0).  Trailing leaves whose weight is 0 are
+ * then trimmed, shrinking items[] and, when the depth changes,
+ * node_weights[].  Returns -ENOENT if `item` is not present, -ENOMEM
+ * if a realloc fails, 0 on success.
+ */
+int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item)
+{
+	unsigned i;
+	unsigned newsize;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		int node;
+		unsigned weight;
+		int j;
+		int depth = calc_depth(bucket->h.size);
+
+		if (bucket->h.items[i] != item)
+			continue;
+
+		bucket->h.items[i] = 0;
+		node = crush_calc_tree_node(i);
+		weight = bucket->node_weights[node];
+		bucket->node_weights[node] = 0;
+
+		/* take the removed weight out of every ancestor */
+		for (j = 1; j < depth; j++) {
+			node = parent(node);
+			bucket->node_weights[node] -= weight;
+			dprintk(" node %d weight %d\n", node, bucket->node_weights[node]);
+		}
+		if (weight < bucket->h.weight)
+			bucket->h.weight -= weight;
+		else
+			bucket->h.weight = 0;
+		break;
+	}
+	if (i == bucket->h.size)
+		return -ENOENT;
+
+	/* trim trailing leaves that carry no weight */
+	newsize = bucket->h.size;
+	while (newsize > 0) {
+		int node = crush_calc_tree_node(newsize - 1);
+		if (bucket->node_weights[node])
+			break;
+		--newsize;
+	}
+
+	if (newsize != bucket->h.size) {
+		int olddepth, newdepth;
+		void *_realloc = NULL;
+
+		/*
+		 * Skip the shrink when nothing is left: realloc(ptr, 0) may
+		 * free ptr and return NULL, which would be misreported as
+		 * -ENOMEM and leave bucket->h.items dangling.
+		 */
+		if (newsize > 0) {
+			if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
+				return -ENOMEM;
+			} else {
+				bucket->h.items = _realloc;
+			}
+		}
+
+		olddepth = calc_depth(bucket->h.size);
+		newdepth = calc_depth(newsize);
+		if (olddepth != newdepth) {
+			bucket->num_nodes = 1 << newdepth;
+			if ((_realloc = realloc(bucket->node_weights,
+						sizeof(__u32)*bucket->num_nodes)) == NULL) {
+				return -ENOMEM;
+			} else {
+				bucket->node_weights = _realloc;
+			}
+		}
+
+		bucket->h.size = newsize;
+	}
+	return 0;
+}
+
+/*
+ * Remove the first occurrence of `item` from a straw bucket and
+ * recompute the straw lengths.
+ *
+ * Returns -ENOENT if `item` is not present and -ENOMEM if shrinking an
+ * array fails.  When the bucket becomes empty the arrays are left as
+ * they are and 0 is returned; otherwise the result of
+ * crush_calc_straw() is returned.
+ */
+int crush_remove_straw_bucket_item(struct crush_map *map,
+				   struct crush_bucket_straw *bucket, int item)
+{
+	int newsize = bucket->h.size - 1;
+	unsigned pos, k;
+	void *shrunk;
+
+	/* locate the item first */
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return -ENOENT;
+
+	/* drop its weight from the bucket total, never going below zero */
+	if (bucket->item_weights[pos] < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weights[pos];
+	else
+		bucket->h.weight = 0;
+
+	/* close the gap */
+	for (k = pos; k < bucket->h.size - 1; k++) {
+		bucket->h.items[k] = bucket->h.items[k+1];
+		bucket->item_weights[k] = bucket->item_weights[k+1];
+	}
+
+	bucket->h.size--;
+	if (bucket->h.size == 0) {
+		/* don't bother reallocating */
+		return 0;
+	}
+
+	shrunk = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->h.items = shrunk;
+
+	shrunk = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->item_weights = shrunk;
+
+	shrunk = realloc(bucket->straws, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->straws = shrunk;
+
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Remove the first occurrence of `item` from a straw2 bucket.
+ *
+ * Returns -ENOENT if `item` is not present, -ENOMEM if shrinking an
+ * array fails, 0 on success.  When the bucket becomes empty the arrays
+ * are left as they are.  `map` is not used here.
+ */
+int crush_remove_straw2_bucket_item(struct crush_map *map,
+				    struct crush_bucket_straw2 *bucket, int item)
+{
+	int newsize = bucket->h.size - 1;
+	unsigned pos, k;
+	void *shrunk;
+
+	/* locate the item first */
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return -ENOENT;
+
+	/* drop its weight from the bucket total, never going below zero */
+	if (bucket->item_weights[pos] < bucket->h.weight)
+		bucket->h.weight -= bucket->item_weights[pos];
+	else
+		bucket->h.weight = 0;
+
+	/* close the gap */
+	for (k = pos; k < bucket->h.size - 1; k++) {
+		bucket->h.items[k] = bucket->h.items[k+1];
+		bucket->item_weights[k] = bucket->item_weights[k+1];
+	}
+
+	bucket->h.size--;
+	if (newsize == 0) {
+		/* don't bother reallocating a 0-length array. */
+		return 0;
+	}
+
+	shrunk = realloc(bucket->h.items, sizeof(__s32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->h.items = shrunk;
+
+	shrunk = realloc(bucket->item_weights, sizeof(__u32)*newsize);
+	if (shrunk == NULL)
+		return -ENOMEM;
+	bucket->item_weights = shrunk;
+
+	return 0;
+}
+
+/*
+ * Remove `item` from bucket `b`, dispatching on b->alg to the
+ * algorithm-specific helper.  Returns that helper's result, or -1 for
+ * an unrecognised algorithm.
+ */
+int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *b, int item)
+{
+	int rc = -1;	/* stays -1 when the algorithm is unknown */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		rc = crush_remove_uniform_bucket_item((struct crush_bucket_uniform *)b, item);
+		break;
+	case CRUSH_BUCKET_LIST:
+		rc = crush_remove_list_bucket_item((struct crush_bucket_list *)b, item);
+		break;
+	case CRUSH_BUCKET_TREE:
+		rc = crush_remove_tree_bucket_item((struct crush_bucket_tree *)b, item);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		rc = crush_remove_straw_bucket_item(map, (struct crush_bucket_straw *)b, item);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		rc = crush_remove_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item);
+		break;
+	default:
+		break;
+	}
+	return rc;
+}
+
+
+/************************************************/
+
+/*
+ * Set the shared per-item weight of a uniform bucket to `weight` and
+ * recompute the bucket weight as item_weight * size.  `item` is not
+ * consulted.  Returns the change in total bucket weight.
+ */
+int crush_adjust_uniform_bucket_item_weight(struct crush_bucket_uniform *bucket, int item, int weight)
+{
+	int delta;
+
+	/* computed against the old item_weight, before it is overwritten */
+	delta = (weight - bucket->item_weight) * bucket->h.size;
+
+	bucket->item_weight = weight;
+	bucket->h.weight = bucket->item_weight * bucket->h.size;
+
+	return delta;
+}
+
+/*
+ * Change the weight of `item` in a list bucket to `weight`.
+ *
+ * The difference is applied to the bucket weight and to every running
+ * sum from the item's position onward.  Returns the weight difference,
+ * or 0 if `item` is not in the bucket.
+ */
+int crush_adjust_list_bucket_item_weight(struct crush_bucket_list *bucket, int item, int weight)
+{
+	int delta;
+	unsigned pos, k;
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	k = pos;
+	while (k < bucket->h.size) {
+		bucket->sum_weights[k] += delta;
+		k++;
+	}
+
+	return delta;
+}
+
+/*
+ * Change the weight of `item` in a tree bucket to `weight`.
+ *
+ * The difference is applied to the item's leaf node, to each ancestor
+ * node and to the bucket weight.  Returns the weight difference, or 0
+ * if `item` is not in the bucket.
+ */
+int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree *bucket, int item, int weight)
+{
+	int delta;
+	int cur;
+	unsigned pos, lvl;
+	unsigned levels = calc_depth(bucket->h.size);
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	cur = crush_calc_tree_node(pos);
+	delta = weight - bucket->node_weights[cur];
+	bucket->node_weights[cur] = weight;
+	bucket->h.weight += delta;
+
+	/* carry the difference up through the ancestors */
+	lvl = 1;
+	while (lvl < levels) {
+		cur = parent(cur);
+		bucket->node_weights[cur] += delta;
+		lvl++;
+	}
+
+	return delta;
+}
+
+/*
+ * Change the weight of `item` in a straw bucket to `weight` and
+ * recompute the straw lengths.
+ *
+ * Returns the weight difference, 0 if `item` is not in the bucket, or
+ * the negative error from crush_calc_straw().
+ */
+int crush_adjust_straw_bucket_item_weight(struct crush_map *map,
+					  struct crush_bucket_straw *bucket,
+					  int item, int weight)
+{
+	unsigned pos;
+	int delta;
+	int rc;
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	rc = crush_calc_straw(map, bucket);
+	return (rc < 0) ? rc : delta;
+}
+
+/*
+ * Change the weight of `item` in a straw2 bucket to `weight`.
+ *
+ * Returns the weight difference, or 0 if `item` is not in the bucket.
+ * `map` is not used here.
+ */
+int crush_adjust_straw2_bucket_item_weight(struct crush_map *map,
+					   struct crush_bucket_straw2 *bucket,
+					   int item, int weight)
+{
+	unsigned pos;
+	int delta;
+
+	pos = 0;
+	while (pos < bucket->h.size && bucket->h.items[pos] != item)
+		pos++;
+	if (pos == bucket->h.size)
+		return 0;
+
+	delta = weight - bucket->item_weights[pos];
+	bucket->item_weights[pos] = weight;
+	bucket->h.weight += delta;
+
+	return delta;
+}
+
+/*
+ * Change the weight of `item` in bucket `b`, dispatching on b->alg to
+ * the algorithm-specific helper.  Returns that helper's result, or -1
+ * for an unrecognised algorithm.
+ */
+int crush_bucket_adjust_item_weight(struct crush_map *map,
+				    struct crush_bucket *b,
+				    int item, int weight)
+{
+	int rc = -1;	/* stays -1 when the algorithm is unknown */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		rc = crush_adjust_uniform_bucket_item_weight((struct crush_bucket_uniform *)b,
+							     item, weight);
+		break;
+	case CRUSH_BUCKET_LIST:
+		rc = crush_adjust_list_bucket_item_weight((struct crush_bucket_list *)b,
+							  item, weight);
+		break;
+	case CRUSH_BUCKET_TREE:
+		rc = crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree *)b,
+							  item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		rc = crush_adjust_straw_bucket_item_weight(map,
+							   (struct crush_bucket_straw *)b,
+							   item, weight);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		rc = crush_adjust_straw2_bucket_item_weight(map,
+							    (struct crush_bucket_straw2 *)b,
+							    item, weight);
+		break;
+	default:
+		break;
+	}
+	return rc;
+}
+
+/************************************************/
+
+/*
+ * Recompute the weight of a uniform bucket from its children.
+ *
+ * Each child bucket (negative id) is reweighted first and its weight
+ * summed.  If child buckets outnumber leaf items, item_weight becomes
+ * the average child weight.  The bucket weight is then
+ * item_weight * size.  Returns -ERANGE if the sum would overflow, 0
+ * otherwise.
+ */
+static int crush_reweight_uniform_bucket(struct crush_map *map, struct crush_bucket_uniform *bucket)
+{
+	unsigned pos;
+	unsigned total = 0, nbuckets = 0, nleaves = 0;
+
+	for (pos = 0; pos < bucket->h.size; pos++) {
+		int id = bucket->h.items[pos];
+		struct crush_bucket *child;
+
+		if (id >= 0) {
+			nleaves++;
+			continue;
+		}
+
+		/* negative ids name buckets: slot -1-id in map->buckets */
+		child = map->buckets[-1-id];
+		crush_reweight_bucket(map, child);
+
+		if (crush_addition_is_unsafe(total, child->weight))
+			return -ERANGE;
+
+		total += child->weight;
+		nbuckets++;
+	}
+
+	if (nbuckets > nleaves)
+		bucket->item_weight = total / nbuckets;  // more bucket children than leaves, average!
+	bucket->h.weight = bucket->item_weight * bucket->h.size;
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a list bucket as the sum of its item
+ * weights.  Child buckets (negative ids) are reweighted first and
+ * their weight copied into item_weights[].  Returns -ERANGE if the sum
+ * would overflow, 0 otherwise.
+ */
+static int crush_reweight_list_bucket(struct crush_map *map, struct crush_bucket_list *bucket)
+{
+	unsigned pos = 0;
+
+	bucket->h.weight = 0;
+	while (pos < bucket->h.size) {
+		int id = bucket->h.items[pos];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			bucket->item_weights[pos] = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[pos]))
+			return -ERANGE;
+
+		bucket->h.weight += bucket->item_weights[pos];
+		pos++;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a tree bucket as the sum of its leaf node
+ * weights.  Child buckets (negative ids) are reweighted first and
+ * their weight copied into the matching leaf node.  Only leaf nodes
+ * and the bucket weight are written here.  Returns -ERANGE if the sum
+ * would overflow, 0 otherwise.
+ */
+static int crush_reweight_tree_bucket(struct crush_map *map, struct crush_bucket_tree *bucket)
+{
+	unsigned pos = 0;
+
+	bucket->h.weight = 0;
+	while (pos < bucket->h.size) {
+		int leaf = crush_calc_tree_node(pos);
+		int id = bucket->h.items[pos];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			bucket->node_weights[leaf] = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, bucket->node_weights[leaf]))
+			return -ERANGE;
+
+		bucket->h.weight += bucket->node_weights[leaf];
+		pos++;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of a straw bucket as the sum of its item
+ * weights, then recompute the straw lengths.
+ *
+ * Child buckets (negative ids) are reweighted first and their weight
+ * copied into item_weights[].  Returns -ERANGE if the sum would
+ * overflow, otherwise the result of crush_calc_straw() (0, or -ENOMEM
+ * when its scratch array cannot be allocated).
+ */
+static int crush_reweight_straw_bucket(struct crush_map *map, struct crush_bucket_straw *bucket)
+{
+	unsigned i;
+
+	bucket->h.weight = 0;
+	for (i = 0; i < bucket->h.size; i++) {
+		int id = bucket->h.items[i];
+		if (id < 0) {
+			struct crush_bucket *c = map->buckets[-1-id];
+			crush_reweight_bucket(map, c);
+			bucket->item_weights[i] = c->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i]))
+			return -ERANGE;
+
+		bucket->h.weight += bucket->item_weights[i];
+	}
+
+	/* report a failed straw recalculation instead of leaving stale straws behind a 0 */
+	return crush_calc_straw(map, bucket);
+}
+
+/*
+ * Recompute the weight of a straw2 bucket as the sum of its item
+ * weights.  Child buckets (negative ids) are reweighted first and
+ * their weight copied into item_weights[].  Returns -ERANGE if the sum
+ * would overflow, 0 otherwise.
+ */
+static int crush_reweight_straw2_bucket(struct crush_map *map, struct crush_bucket_straw2 *bucket)
+{
+	unsigned pos = 0;
+
+	bucket->h.weight = 0;
+	while (pos < bucket->h.size) {
+		int id = bucket->h.items[pos];
+
+		if (id < 0) {
+			struct crush_bucket *child = map->buckets[-1-id];
+
+			crush_reweight_bucket(map, child);
+			bucket->item_weights[pos] = child->weight;
+		}
+
+		if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[pos]))
+			return -ERANGE;
+
+		bucket->h.weight += bucket->item_weights[pos];
+		pos++;
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute the weight of bucket `b` (and, through the helpers, of the
+ * buckets below it), dispatching on b->alg.  Returns the helper's
+ * result, or -1 for an unrecognised algorithm.
+ */
+int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *b)
+{
+	int rc = -1;	/* stays -1 when the algorithm is unknown */
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		rc = crush_reweight_uniform_bucket(map, (struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		rc = crush_reweight_list_bucket(map, (struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		rc = crush_reweight_tree_bucket(map, (struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		rc = crush_reweight_straw_bucket(map, (struct crush_bucket_straw *)b);
+		break;
+	case CRUSH_BUCKET_STRAW2:
+		rc = crush_reweight_straw2_bucket(map, (struct crush_bucket_straw2 *)b);
+		break;
+	default:
+		break;
+	}
+	return rc;
+}
+
+/*
+ * Build a crush_choose_arg array with one entry per bucket slot of
+ * `map`, all carved out of a single malloc() block.
+ *
+ * The block is laid out as: max_buckets crush_choose_arg entries, then
+ * the crush_weight_set entries (num_positions per existing bucket),
+ * then the copied weights, then the copied item ids.  Entries for
+ * empty bucket slots are zeroed.  Every existing bucket is read
+ * through a crush_bucket_straw2 pointer.
+ *
+ * Returns the array, to be released with crush_destroy_choose_args(),
+ * or NULL if the allocation fails.
+ */
+struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions)
+{
+  int b;
+  int sum_bucket_size = 0;
+  int bucket_count = 0;
+  for (b = 0; b < map->max_buckets; b++) {
+    if (map->buckets[b] == 0)
+      continue;
+    sum_bucket_size += map->buckets[b]->size;
+    bucket_count++;
+  }
+  dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n",
+          sum_bucket_size, map->max_buckets, bucket_count);
+  int size = (sizeof(struct crush_choose_arg) * map->max_buckets +
+              sizeof(struct crush_weight_set) * bucket_count * num_positions +
+              sizeof(__u32) * sum_bucket_size * num_positions + // weights
+              sizeof(__s32) * sum_bucket_size); // ids
+  char *space = malloc(size);
+  /* bail out instead of carving sub-arrays out of a NULL block */
+  if (space == NULL)
+    return NULL;
+  struct crush_choose_arg *arg = (struct crush_choose_arg *)space;
+  struct crush_weight_set *weight_set = (struct crush_weight_set *)(arg + map->max_buckets);
+  __u32 *weights = (__u32 *)(weight_set + bucket_count * num_positions);
+  char *weight_set_ends __attribute__((unused)) = (char*)weights;
+  __s32 *ids = (__s32 *)(weights + sum_bucket_size * num_positions);
+  char *weights_end __attribute__((unused)) = (char *)ids;
+  char *ids_end __attribute__((unused)) = (char *)(ids + sum_bucket_size);
+  BUG_ON(space + size != ids_end);
+  for (b = 0; b < map->max_buckets; b++) {
+    if (map->buckets[b] == 0) {
+      memset(&arg[b], '\0', sizeof(struct crush_choose_arg));
+      continue;
+    }
+    struct crush_bucket_straw2 *bucket = (struct crush_bucket_straw2 *)map->buckets[b];
+
+    /* each position gets its own copy of the bucket's item weights */
+    int position;
+    for (position = 0; position < num_positions; position++) {
+      memcpy(weights, bucket->item_weights, sizeof(__u32) * bucket->h.size);
+      weight_set[position].weights = weights;
+      weight_set[position].size = bucket->h.size;
+      dprintk("moving weight %d bytes forward\n", (int)((weights + bucket->h.size) - weights));
+      weights += bucket->h.size;
+    }
+    arg[b].weight_set = weight_set;
+    arg[b].weight_set_positions = num_positions;
+    weight_set += position;
+
+    memcpy(ids, bucket->h.items, sizeof(__s32) * bucket->h.size);
+    arg[b].ids = ids;
+    arg[b].ids_size = bucket->h.size;
+    ids += bucket->h.size;
+  }
+  /* every cursor must have landed exactly on the end of its region */
+  BUG_ON((char*)weight_set_ends != (char*)weight_set);
+  BUG_ON((char*)weights_end != (char*)weights);
+  BUG_ON((char*)ids != (char*)ids_end);
+  return arg;
+}
+
+/*
+ * Release a choose_args array with free().  crush_make_choose_args()
+ * returns the start of its single malloc() block, so one free()
+ * releases the entries together with their weight sets, weights and ids.
+ */
+void crush_destroy_choose_args(struct crush_choose_arg *args)
+{
+  free(args);
+}
+
+/***************************/
+
+/* methods to check for safe arithmetic operations */
+
+/*
+ * Return 1 if a + b does not fit in a __u32, 0 if it does.
+ */
+int crush_addition_is_unsafe(__u32 a, __u32 b)
+{
+	/* largest value that can still be added to b without wrapping */
+	const __u32 headroom = ((__u32)(-1)) - b;
+
+	return (headroom < a) ? 1 : 0;
+}
+
+/*
+ * Return 1 if a * b is treated as unsafe, 0 otherwise.
+ *
+ * a == 0 is always reported safe.  A non-zero a with b == 0 is
+ * reported unsafe, which also keeps the division below from dividing
+ * by zero.  Otherwise the product is unsafe when it would not fit in a
+ * __u32.
+ */
+int crush_multiplication_is_unsafe(__u32  a, __u32 b)
+{
+	if (a == 0)
+		return 0;
+	/* prevent division by zero */
+	if (b == 0)
+		return 1;
+
+	return (a > (((__u32)(-1)) / b)) ? 1 : 0;
+}
+
+/***************************/
+
+/* methods to configure crush_map */
+
+/*
+ * Load the legacy tunable values into `map`, including
+ * straw_calc_version 0 and the legacy set of allowed bucket
+ * algorithms.
+ */
+void set_legacy_crush_map(struct crush_map *map) {
+  /* retry limits */
+  map->choose_total_tries = 19;
+  map->choose_local_tries = 2;
+  map->choose_local_fallback_tries = 5;
+
+  /* chooseleaf behaviour switches, all off */
+  map->chooseleaf_stable = 0;
+  map->chooseleaf_vary_r = 0;
+  map->chooseleaf_descend_once = 0;
+
+  map->straw_calc_version = 0;
+
+  // by default, use legacy types, and also exclude tree,
+  // since it was buggy.
+  map->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
+}
+
+/*
+ * Load the "optimal" tunable values into `map` and allow the uniform,
+ * list, straw and straw2 bucket algorithms.  straw_calc_version is not
+ * touched here.
+ */
+void set_optimal_crush_map(struct crush_map *map) {
+  /* retry limits */
+  map->choose_total_tries = 50;
+  map->choose_local_tries = 0;
+  map->choose_local_fallback_tries = 0;
+
+  /* chooseleaf behaviour switches, all on */
+  map->chooseleaf_stable = 1;
+  map->chooseleaf_vary_r = 1;
+  map->chooseleaf_descend_once = 1;
+
+  map->allowed_bucket_algs = (1 << CRUSH_BUCKET_UNIFORM)
+    | (1 << CRUSH_BUCKET_LIST)
+    | (1 << CRUSH_BUCKET_STRAW)
+    | (1 << CRUSH_BUCKET_STRAW2);
+}
diff --git a/src/crush/builder.h b/src/crush/builder.h
new file mode 100644
index 000000000..bdf0a4b9c
--- /dev/null
+++ b/src/crush/builder.h
@@ -0,0 +1,344 @@
+#ifndef CEPH_CRUSH_BUILDER_H
+#define CEPH_CRUSH_BUILDER_H
+
+#include "include/int_types.h"
+
+struct crush_bucket;
+struct crush_choose_arg;
+struct crush_map;
+struct crush_rule;
+
+/** @ingroup API
+ *
+ * Allocate a crush_map with __malloc(3)__ and initialize it. The
+ * caller is responsible for deallocating the crush_map with
+ * crush_destroy().
+ *
+ * The content of the allocated crush_map is set with
+ * set_optimal_crush_map(). The caller is responsible for setting each
+ * tunable in the __crush_map__ for backward compatibility or mapping
+ * stability.
+ *
+ * @returns a pointer to the newly created crush_map or NULL
+ */
+extern struct crush_map *crush_create(void);
+/** @ingroup API
+ *
+ * Analyze the content of __map__ and set the internal values required
+ * before it can be used to map values with crush_do_rule(). The caller
+ * must make sure it is run before crush_do_rule() and after any
+ * function that modifies the __map__ (crush_add_bucket(), etc.).
+ *
+ * @param map the crush_map
+ */
+extern void crush_finalize(struct crush_map *map);
+
+/* rules */
+/** @ingroup API
+ *
+ * Allocate an empty crush_rule structure large enough to store __len__ steps.
+ * Steps can be added to a rule via crush_rule_set_step(). The __ruleset__
+ * is a user defined integer, not used by __libcrush__ and stored in
+ * the allocated rule at __rule->mask.ruleset__.
+ *
+ * The rule is designed to allow crush_do_rule() to get at least __minsize__ items
+ * and at most __maxsize__ items.
+ *
+ * The __type__ is defined by the caller and will be used by
+ * crush_find_rule() when looking for a rule and by
+ * __CRUSH_RULE_CHOOSE*__ steps when looking for items.
+ *
+ * The caller is responsible for deallocating the returned pointer via
+ * crush_destroy_rule().
+ *
+ * If __malloc(3)__ fails, return NULL.
+ *
+ * @param len number of steps in the rule
+ * @param ruleset user defined value
+ * @param type user defined value
+ * @param minsize minimum number of items the rule can map
+ * @param maxsize maximum number of items the rule can map
+ *
+ * @returns a pointer to the newly created rule or NULL
+ */
+extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize);
+/** @ingroup API
+ *
+ * Set the __pos__ step of the __rule__ to an operand and up to two arguments.
+ * The value of the operand __op__ determines if the arguments are used and how:
+ *
+ * - __CRUSH_RULE_NOOP__ do nothing.
+ * - __CRUSH_RULE_TAKE__ select the __arg1__ item
+ * - __CRUSH_RULE_EMIT__ append the selection to the results and clear
+ *     the selection
+ *
+ * - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
+ *     recursively explore each bucket currently selected, looking for
+ *     __arg1__ items of type __arg2__ and select them.
+ * - __CRUSH_RULE_CHOOSELEAF_FIRSTN__ and __CRUSH_RULE_CHOOSELEAF_INDEP__
+ *     recursively explore each bucket currently selected, looking for
+ *     __arg1__ leaves within all the buckets of type __arg2__ and
+ *     select them.
+ *
+ * In all __CHOOSE__ steps, if __arg1__ is less than or equal to zero,
+ * the number of items to select is equal to the __max_result__ argument
+ * of crush_do_rule() plus __arg1__, i.e. reduced by the absolute value of
+ * __arg1__. Setting __arg1__ to zero selects __max_result__ items.
+ *
+ * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
+ *
+ *   The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of
+ *   a given type, randomly selecting them. If they are unlucky and
+ *   find the same bucket twice, they will try N+1 times (N being the
+ *   value of the choose_total_tries tunable). If there is a previous
+ *   SET_CHOOSE_TRIES step in the same rule, they will try C times
+ *   instead (C being the value of the argument of the
+ *   SET_CHOOSE_TRIES step).
+ *
+ *   Note: the __choose_total_tries__ tunable defined in crush_map is
+ *   the number of retries, not the number of tries. The number of tries
+ *   is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the
+ *   number of tries and does not need the + 1. This confusing
+ *   difference is inherited from an off-by-one bug from years ago.
+ *
+ *   The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same
+ *   as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
+ *   each bucket found, looking for a single device. The same device
+ *   may be found in two different buckets because the crush map is
+ *   not a strict hierarchy, it is a DAG. When such a collision
+ *   happens, they will try again. The number of times they try to
+ *   find a non colliding device is:
+ *
+ *   - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
+ *     step: try N + 1 times (N being the value of the
+ *     __choose_total_tries__ tunable defined in crush_map)
+ *
+ *   - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
+ *     step: try P times (P being the value of the argument of the
+ *     SET_CHOOSELEAF_TRIES rule step)
+ *
+ *   - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
+ *     step: try 1 time.
+ *
+ *   - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
+ *     P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
+ *     rule step)
+ *
+ * @param rule the rule in which the step is inserted
+ * @param pos the zero based step index
+ * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
+ * @param arg1 first argument for __op__
+ * @param arg2 second argument for __op__
+ */
+extern void crush_rule_set_step(struct crush_rule *rule, int pos, int op, int arg1, int arg2);
+/** @ingroup API
+ *
+ * Add the __rule__ into the crush __map__ and assign it the
+ * __ruleno__ unique identifier. If __ruleno__ is -1, the function will
+ * assign the lowest available identifier. Otherwise __ruleno__ must be
+ * a non-negative integer lower than __CRUSH_MAX_RULES__.
+ *
+ * - return -ENOSPC if the rule identifier is >= __CRUSH_MAX_RULES__
+ * - return -ENOMEM if __realloc(3)__ fails to expand the array of
+ *   rules in the __map__
+ *
+ * @param map the crush_map
+ * @param rule the rule to add to the __map__
+ * @param ruleno a non-negative integer < __CRUSH_MAX_RULES__ or -1
+ *
+ * @returns the rule unique identifier on success, < 0 on error
+ */
+extern int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno);
+
+/* buckets */
+extern int crush_get_next_bucket_id(struct crush_map *map);
+/** @ingroup API
+ *
+ * Add __bucket__ into the crush __map__ and assign it the
+ * __bucketno__ unique identifier. If __bucketno__ is 0, the function
+ * will assign the lowest available identifier.  The bucket identifier
+ * must be a negative integer. The bucket identifier is returned via
+ * __idout__.
+ *
+ * - return -ENOMEM if __realloc(3)__ fails to expand the array of
+ *   buckets in the __map__
+ * - return -EEXIST if the __bucketno__ identifier is already assigned
+ *   to another bucket.
+ *
+ * @param[in] map the crush_map
+ * @param[in] bucketno the bucket unique identifier or 0
+ * @param[in] bucket the bucket to add to the __map__
+ * @param[out] idout a pointer to the bucket identifier
+ *
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_add_bucket(struct crush_map *map,
+			    int bucketno,
+			    struct crush_bucket *bucket, int *idout);
+/** @ingroup API
+ *
+ * Allocate a crush_bucket with __malloc(3)__ and initialize it. The
+ * content of the bucket is filled with __size__ items from
+ * __items__. The item selection is set to use __alg__ which is one of
+ * ::CRUSH_BUCKET_UNIFORM , ::CRUSH_BUCKET_LIST or
+ * ::CRUSH_BUCKET_STRAW2. The initial __items__ are assigned a
+ * weight from the __weights__ array, depending on the value of
+ * __alg__. If __alg__ is ::CRUSH_BUCKET_UNIFORM, all items are set
+ * to have a weight equal to __weights[0]__, otherwise the weight of
+ * __items[x]__ is set to be the value of __weights[x]__.
+ *
+ * The caller is responsible for deallocating the returned pointer via
+ * crush_destroy_bucket().
+ *
+ * @param map __unused__
+ * @param alg algorithm for item selection
+ * @param hash always set to CRUSH_HASH_RJENKINS1
+ * @param type user defined bucket type
+ * @param size of the __items__ array
+ * @param items array of __size__ items
+ * @param weights the weight of each item in __items__, depending on __alg__
+ *
+ * @returns a pointer to the newly created bucket or NULL
+ */
+struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights);
+extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions);
+extern void crush_destroy_choose_args(struct crush_choose_arg *args);
+/** @ingroup API
+ *
+ * Add __item__ to __bucket__ with __weight__. The weight of the new
+ * item is added to the weight of the bucket so that it reflects
+ * the total weight of all items.
+ *
+ * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM, the value of __weight__ must be equal to
+ * __(struct crush_bucket_uniform *)bucket->item_weight__.
+ *
+ * - return -ENOMEM if the __bucket__ cannot be resized with __realloc(3)__.
+ * - return -ERANGE if adding __weight__ to the weight of the bucket overflows.
+ * - return -EINVAL if __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM and
+ *   the __weight__ is not equal to __(struct crush_bucket_uniform *)bucket->item_weight__.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
+/** @ingroup API
+ *
+ * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM,
+ * __(struct crush_bucket_uniform *)bucket->item_weight__ is set to __weight__ and the
+ * weight of the bucket is set to be the number of items in the bucket times the weight.
+ * The return value is the difference between the new bucket weight and the former
+ * bucket weight. The __item__ argument is ignored.
+ *
+ * If __bucket->alg__ is different from ::CRUSH_BUCKET_UNIFORM,
+ * set the  __weight__ of  __item__ in __bucket__. The former weight of the
+ * item is subtracted from the weight of the bucket and the new weight is added.
+ * The return value is the difference between the new item weight and the former
+ * item weight.
+ *
+ * @returns the difference between the new weight and the former weight
+ */
+extern int crush_bucket_adjust_item_weight(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
+/** @ingroup API
+ *
+ * Recursively update the weight of __bucket__ and its children, deep
+ * first. The __bucket__ weight is set to the sum of the weight of the
+ * items it contains.
+ *
+ * - return -ERANGE if the sum of the weight of the items in __bucket__ overflows.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @param map a crush_map containing __bucket__
+ * @param bucket the root of the tree to reweight
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *bucket);
+/** @ingroup API
+ *
+ * Remove __bucket__ from __map__ and deallocate it via crush_destroy_bucket().
+ * __assert(3)__ that __bucket__ is in __map__. The caller is responsible for
+ * making sure the bucket is not the child of any other bucket in the __map__.
+ *
+ * @param map a crush_map containing __bucket__
+ * @param bucket the bucket to remove from __map__
+ * @returns 0
+ */
+extern int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket);
+/** @ingroup API
+ *
+ * Remove __item__ from __bucket__ and subtract the item weight from
+ * the bucket weight. If the weight of the item is greater than the
+ * weight of the bucket, silently set the bucket weight to zero.
+ *
+ * - return -ENOMEM if the __bucket__ cannot be sized down with __realloc(3)__.
+ * - return -1 if the value of __bucket->alg__ is unknown.
+ *
+ * @param map __unused__
+ * @param bucket the bucket from which __item__ is removed
+ * @param item the item to remove from __bucket__
+ * @returns 0 on success, < 0 on error
+ */
+extern int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *bucket, int item);
+
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int hash, int type, int size,
+			  int *items,
+			  int item_weight);
+struct crush_bucket_list*
+crush_make_list_bucket(int hash, int type, int size,
+		       int *items,
+		       int *weights);
+struct crush_bucket_tree*
+crush_make_tree_bucket(int hash, int type, int size,
+		       int *items,    /* in leaf order */
+		       int *weights);
+struct crush_bucket_straw *
+crush_make_straw_bucket(struct crush_map *map,
+			int hash, int type, int size,
+			int *items,
+			int *weights);
+
+extern int crush_addition_is_unsafe(__u32 a, __u32 b);
+extern int crush_multiplication_is_unsafe(__u32  a, __u32 b);
+
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the most ancient behavior,
+ * for backward compatibility purposes only.
+ *
+ * - choose_local_tries == 2
+ * - choose_local_fallback_tries == 5
+ * - choose_total_tries == 19
+ * - chooseleaf_descend_once == 0
+ * - chooseleaf_vary_r == 0
+ * - straw_calc_version == 0
+ * - chooseleaf_stable == 0
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
+extern void set_legacy_crush_map(struct crush_map *map);
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the optimal behavior. These
+ * are the values set by crush_create(). It does not guarantee a
+ * stable mapping after an upgrade.
+ *
+ * For instance when a bug is fixed it may significantly change the
+ * mapping. In that case a new tunable (say tunable_new) is added so
+ * the caller can control when the bug fix is activated. The
+ * set_optimal_crush_map() function will always set all tunables,
+ * including tunable_new, to fix all bugs even if it means changing
+ * the mapping. If the caller needs fine grained control on the
+ * tunables to upgrade to a new version without changing the mapping,
+ * it needs to set the __crush_map__ tunables individually.
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
+extern void set_optimal_crush_map(struct crush_map *map);
+
+#endif
diff --git a/src/crush/crush.c b/src/crush/crush.c
new file mode 100644
index 000000000..5bf94c04f
--- /dev/null
+++ b/src/crush/crush.c
@@ -0,0 +1,137 @@
+#ifdef __KERNEL__
+# include <linux/slab.h>
+# include <linux/crush/crush.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+#endif
+
+const char *crush_bucket_alg_name(int alg)
+{
+	/* Names of the known algorithms, in CRUSH_BUCKET_* order (1..5). */
+	static const char *const names[] = {
+		"uniform", "list", "tree", "straw", "straw2"
+	};
+
+	if (alg < CRUSH_BUCKET_UNIFORM || alg > CRUSH_BUCKET_STRAW2)
+		return "unknown";
+	return names[alg - CRUSH_BUCKET_UNIFORM];
+}
+
+/**
+ * crush_get_bucket_item_weight - Look up the weight of one bucket item
+ * @b: bucket to inspect
+ * @p: position of the item in the bucket; 0 is returned when out of range
+ */
+int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
+{
+	/* The cast makes a negative position compare as a large unsigned value. */
+	if ((__u32)p >= b->size)
+		return 0;
+	if (b->alg == CRUSH_BUCKET_UNIFORM)	/* one weight shared by all items */
+		return ((const struct crush_bucket_uniform *)b)->item_weight;
+	if (b->alg == CRUSH_BUCKET_LIST)
+		return ((const struct crush_bucket_list *)b)->item_weights[p];
+	if (b->alg == CRUSH_BUCKET_TREE)	/* item weight is stored at its tree node */
+		return ((const struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
+	if (b->alg == CRUSH_BUCKET_STRAW)
+		return ((const struct crush_bucket_straw *)b)->item_weights[p];
+	if (b->alg == CRUSH_BUCKET_STRAW2)
+		return ((const struct crush_bucket_straw2 *)b)->item_weights[p];
+
+	/* Unknown algorithm. */
+	return 0;
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.items);	/* array of item ids */
+	kfree(b);		/* the bucket itself, released last */
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->h.items);		/* item ids */
+	kfree(b->sum_weights);		/* weight sums */
+	kfree(b->item_weights);		/* per-item weights */
+	kfree(b);			/* the bucket itself goes last */
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->node_weights);		/* per-node weights */
+	kfree(b->h.items);		/* item ids */
+	kfree(b);			/* the bucket itself goes last */
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->h.items);		/* item ids */
+	kfree(b->item_weights);		/* per-item weights */
+	kfree(b->straws);		/* per-item straw values */
+	kfree(b);			/* the bucket itself goes last */
+}
+
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+	kfree(b->h.items);		/* item ids */
+	kfree(b->item_weights);		/* per-item weights */
+	kfree(b);			/* the bucket itself goes last */
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	/*
+	 * Each bucket flavour has its own destructor; pick the one that
+	 * matches the algorithm tag and hand it the bucket, suitably cast.
+	 */
+	if (b->alg == CRUSH_BUCKET_UNIFORM)
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+	else if (b->alg == CRUSH_BUCKET_LIST)
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+	else if (b->alg == CRUSH_BUCKET_TREE)
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+	else if (b->alg == CRUSH_BUCKET_STRAW)
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+	else if (b->alg == CRUSH_BUCKET_STRAW2)
+		crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+	/*
+	 * Any other alg value matches no branch, so nothing is freed.
+	 */
+}
+
+/**
+ * crush_destroy - Release a crush_map and everything it owns
+ * @map: map to free; must not be used afterwards
+ */
+void crush_destroy(struct crush_map *map)
+{
+	__s32 bi;
+	__u32 ri;
+
+	/* Buckets: slots may be NULL, those are skipped. */
+	if (map->buckets) {
+		for (bi = 0; bi < map->max_buckets; bi++) {
+			if (map->buckets[bi])
+				crush_destroy_bucket(map->buckets[bi]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* Rules: every slot, NULL or not, is handed to crush_destroy_rule(). */
+	if (map->rules) {
+		for (ri = 0; ri < map->max_rules; ri++)
+			crush_destroy_rule(map->rules[ri]);
+		kfree(map->rules);
+	}
+
+#ifndef __KERNEL__
+	kfree(map->choose_tries);
+#endif
+	kfree(map);
+}
+
+void crush_destroy_rule(struct crush_rule *rule)
+{
+	kfree(rule);	/* steps[] is declared inside struct crush_rule, so there is no separate array to free */
+}
diff --git a/src/crush/crush.h b/src/crush/crush.h
new file mode 100644
index 000000000..91b78ad9c
--- /dev/null
+++ b/src/crush/crush.h
@@ -0,0 +1,549 @@
+#ifndef CEPH_CRUSH_CRUSH_H
+#define CEPH_CRUSH_CRUSH_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL-2.1 or LGPL-3.0
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
+
+#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
+#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
+
+#define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
+/** @ingroup API
+ * The equivalent of NULL for an item, i.e. the absence of an item.
+ */
+#define CRUSH_ITEM_NONE   0x7fffffff
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of a sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+	__u32 op;	/* operation to perform, one of the CRUSH_RULE_* opcodes */
+	__s32 arg1;	/* first operand; its meaning depends on op */
+	__s32 arg2;	/* second operand; its meaning depends on op */
+};
+
+/** @ingroup API
+ */
+enum crush_opcodes {
+        /*! do nothing
+         */
+	CRUSH_RULE_NOOP = 0,
+	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+				      /* arg2 = type */
+	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+	CRUSH_RULE_EMIT = 4,          /* no args */
+	CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, /* arg1 = num leaves to pick, arg2 = bucket type; value 5 is unassigned */
+	CRUSH_RULE_CHOOSELEAF_INDEP = 7,  /* same */
+
+	CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
+	CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
+	CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, /* NOTE(review): by its name, overrides choose_local_tries -- confirm */
+	CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, /* NOTE(review): by its name, overrides choose_local_fallback_tries -- confirm */
+	CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, /* NOTE(review): by its name, overrides chooseleaf_vary_r -- confirm */
+	CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 /* NOTE(review): by its name, overrides chooseleaf_stable -- confirm */
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+	__u8 ruleset;	/* ruleset the rule belongs to */
+	__u8 type;	/* rule type */
+	__u8 min_size;	/* smallest output set size the rule is meant for */
+	__u8 max_size;	/* largest output set size the rule is meant for */
+};
+
+struct crush_rule {
+	__u32 len;	/* number of entries in steps[], cf. crush_rule_size() */
+	struct crush_rule_mask mask;	/* what the rule is intended for */
+	struct crush_rule_step steps[0];	/* the steps, stored right after the fields above */
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+			      (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).
+ */
+
+/** @ingroup API
+ *
+ * Items within a bucket are chosen with crush_do_rule() using one of
+ * three algorithms representing a tradeoff between performance and
+ * reorganization efficiency. If you are unsure of which bucket type
+ * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
+ *
+ * The table summarizes how the speed of each option measures up
+ * against mapping stability when items are added or removed.
+ *
+ * 	Bucket Alg     Speed       Additions    Removals
+ * 	------------------------------------------------
+ * 	uniform         O(1)       poor         poor
+ * 	list            O(n)       optimal      poor
+ * 	straw2          O(n)       optimal      optimal
+ */
+enum crush_algorithm {
+       /*!
+        * Devices are rarely added individually in a large system.
+        * Instead, new storage is typically deployed in blocks of identical
+        * devices, often as an additional shelf in a server rack or perhaps
+        * an entire cabinet. Devices reaching their end of life are often
+        * similarly decommissioned as a set (individual failures aside),
+        * making it natural to treat them as a unit.  CRUSH uniform buckets
+        * are used to represent an identical set of devices in such
+        * circumstances. The key advantage in doing so is performance
+        * related: CRUSH can map replicas into uniform buckets in constant
+        * time. In cases where the uniformity restrictions are not
+        * appropriate, other bucket types can be used.  If the size of a
+        * uniform bucket changes, there is a complete reshuffling of data
+        * between devices, much like conventional hash-based distribution
+        * strategies.
+        */
+	CRUSH_BUCKET_UNIFORM = 1,
+        /*!
+         * List buckets structure their contents as a linked list, and
+         * can contain items with arbitrary weights.  To place a
+         * replica, CRUSH begins at the head of the list with the most
+         * recently added item and compares its weight to the sum of
+         * all remaining items' weights.  Depending on the value of
+         * hash( x , r , item), either the current item is chosen with
+         * the appropriate probability, or the process continues
+         * recursively down the list.  This is a natural and intuitive
+         * choice for an expanding cluster: either an object is
+         * relocated to the newest device with some appropriate
+         * probability, or it remains on the older devices as before.
+         * The result is optimal data migration when items are added
+         * to the bucket. Items removed from the middle or tail of the
+         * list, however, can result in a significant amount of
+         * unnecessary movement, making list buckets most suitable for
+         * circumstances in which they never (or very rarely) shrink.
+         */
+	CRUSH_BUCKET_LIST = 2,
+        /*! @cond INTERNAL */
+	CRUSH_BUCKET_TREE = 3,
+	CRUSH_BUCKET_STRAW = 4,
+	/*! @endcond */
+        /*!
+         * List and tree buckets are structured such that a limited
+         * number of hash values need to be calculated and compared to
+         * weights in order to select a bucket item.  In doing so,
+         * they divide and conquer in a way that either gives certain
+         * items precedence (e. g., those at the beginning of a list)
+         * or obviates the need to consider entire subtrees of items
+         * at all. That improves the performance of the replica
+         * placement process, but can also introduce suboptimal
+         * reorganization behavior when the contents of a bucket
+         * change due to an addition, removal, or re-weighting of an
+         * item.
+         *
+         * The straw2 bucket type allows all items to fairly "compete"
+         * against each other for replica placement through a process
+         * analogous to a draw of straws.  To place a replica, a straw
+         * of random length is drawn for each item in the bucket.  The
+         * item with the longest straw wins.  The length of each straw
+         * is initially a value in a fixed range.  Each straw length
+         * is scaled by a factor based on the item's weight so that
+         * heavily weighted items are more likely to win the draw.
+         * Although this process is almost twice as slow (on average)
+         * than a list bucket and even slower than a tree bucket
+         * (which scales logarithmically), straw2 buckets result in
+         * optimal data movement between nested items when modified.
+         */
+	CRUSH_BUCKET_STRAW2 = 5,
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+/*
+ * although tree was a legacy algorithm, it has been buggy, so
+ * exclude it.
+ */
+#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (	\
+		(1 << CRUSH_BUCKET_UNIFORM) |	\
+		(1 << CRUSH_BUCKET_LIST) |	\
+		(1 << CRUSH_BUCKET_STRAW))
+
+/** @ingroup API
+ *
+ * A bucket contains __size__ __items__ which are either positive
+ * numbers or negative numbers that reference other buckets and is
+ * uniquely identified with __id__ which is a negative number.  The
+ * __weight__ of a bucket is the cumulative weight of all its
+ * children.  A bucket is assigned a ::crush_algorithm that is used by
+ * crush_do_rule() to draw an item depending on its weight.  A bucket
+ * can be assigned a strictly positive (> 0) __type__ defined by the
+ * caller. The __type__ can be used by crush_do_rule(), when it is
+ * given as an argument of a rule step.
+ *
+ * A pointer to crush_bucket can safely be cast into the following
+ * structure, depending on the value of __alg__:
+ *
+ * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
+ * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
+ * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
+ *
+ * The weight of each item depends on the algorithm and the
+ * information about it is available in the corresponding structure
+ * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
+ *
+ * See crush_map for more information on how __id__ is used
+ * to reference the bucket.
+ */
+struct crush_bucket {
+	__s32 id;        /*!< bucket identifier, < 0 and unique within a crush_map */
+	__u16 type;      /*!< > 0 bucket type, defined by the caller */
+	__u8 alg;        /*!< the item selection ::crush_algorithm */
+        /*! @cond INTERNAL */
+	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
+	/*! @endcond */
+	__u32 weight;    /*!< 16.16 fixed point cumulated children weight */
+	__u32 size;      /*!< size of the __items__ array */
+        __s32 *items;    /*!< array of children: < 0 are buckets, >= 0 items */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights for each item in a bucket. The size of the
+ * array must be exactly the size of the straw2 bucket, just as the
+ * item_weights array.
+ *
+ */
+struct crush_weight_set {
+  __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
+  __u32 size;     /*!< size of the __weights__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for a given straw2 bucket, for
+ * placement purposes.
+ *
+ * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
+ * replacement weights found at __weight_set[N]__ are used instead of
+ * the weights from __item_weights__. If __N__ is greater than or equal to
+ * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
+ * used instead. For instance if __weight_set__ is:
+ *
+ *    [ [ 0x10000, 0x20000 ],   // position 0
+ *      [ 0x20000, 0x40000 ] ]  // position 1
+ *
+ * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
+ * choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ]
+ * choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ]
+ * etc.
+ *
+ */
+struct crush_choose_arg {
+  __s32 *ids;                           /*!< values to use instead of items */
+  __u32 ids_size;                       /*!< size of the __ids__ array */
+  struct crush_weight_set *weight_set;  /*!< weight replacements for a given position */
+  __u32 weight_set_positions;           /*!< size of the __weight_set__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for each bucket in the crushmap. The
+ * __size__ of the __args__ array must be exactly the same as the
+ * __map->max_buckets__.
+ *
+ * The __crush_choose_arg__ at index N will be used when choosing
+ * an item from the bucket __map->buckets[N]__ bucket, provided it
+ * is a straw2 bucket.
+ *
+ */
+struct crush_choose_arg_map {
+  struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
+  __u32 size;                    /*!< size of the __args__ array */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
+ */
+struct crush_bucket_uniform {
+       struct crush_bucket h; /*!< generic bucket information */
+	__u32 item_weight;  /*!< 16.16 fixed point weight for each item */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_LIST.
+ *
+ * The weight of __h.items[i]__ is __item_weights[i]__ for i in
+ * [0,__h.size__[. The __sum_weights[i]__ is the sum of the __item_weights[j]__
+ * for j in [0,i[.
+ *
+ */
+struct crush_bucket_list {
+        struct crush_bucket h; /*!< generic bucket information */
+	__u32 *item_weights;  /*!< 16.16 fixed point weight for each item */
+	__u32 *sum_weights;   /*!< 16.16 fixed point sum of the weights */
+};
+
+struct crush_bucket_tree {
+	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+				   actual items */
+	__u8 num_nodes;		/* presumably the length of node_weights -- TODO confirm */
+	__u32 *node_weights;	/* weight of item p is node_weights[crush_calc_tree_node(p)] */
+};
+
+struct crush_bucket_straw {
+	struct crush_bucket h;
+	__u32 *item_weights;   /* 16-bit fixed point */
+	__u32 *straws;         /* 16-bit fixed point */
+};
+
+/** @ingroup API
+ * The weight of each item in the bucket when
+ * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
+ *
+ * The weight of __h.items[i]__ is __item_weights[i]__ for i in
+ * [0,__h.size__[.
+ */
+struct crush_bucket_straw2 {
+        struct crush_bucket h; /*!< generic bucket information */
+	__u32 *item_weights;   /*!< 16.16 fixed point weight for each item */
+};
+
+
+
+/** @ingroup API
+ *
+ * A crush map defines a hierarchy of crush_bucket that end with leaves
+ * (buckets and leaves are called items) and a set of crush_rule to
+ * map an integer to items with the crush_do_rule() function.
+ *
+ */
+struct crush_map {
+        /*! An array of crush_bucket pointers of size __max_buckets__.
+         * An element of the array may be NULL if the bucket was removed with
+         * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
+         * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
+         */
+	struct crush_bucket **buckets;
+        /*! An array of crush_rule pointers of size __max_rules__.
+         * An element of the array may be NULL if the rule was removed (there is
+         * no API to do so but there may be one in the future). The rules must be added
+         * with crush_add_rule().
+         */
+	struct crush_rule **rules;
+        __s32 max_buckets; /*!< the size of __buckets__ */
+	__u32 max_rules; /*!< the size of __rules__ */
+        /*! The value of the highest item stored in the crush_map + 1
+         */
+	__s32 max_devices;
+
+	/*! Backward compatibility tunable. It implements a bad solution
+         * and must always be set to 0 except for backward compatibility
+         * purposes
+         */
+	__u32 choose_local_tries;
+	/*! Backward compatibility tunable. It implements a bad solution
+         * and must always be set to 0 except for backward compatibility
+         * purposes
+         */
+	__u32 choose_local_fallback_tries;
+	/*! Tunable. The default value when the CHOOSE_TRIES or
+         * CHOOSELEAF_TRIES steps are omitted in a rule. See the
+         * documentation for crush_rule_set_step() for more
+         * information
+         */
+	__u32 choose_total_tries;
+	/*! Backward compatibility tunable. It should always be set
+         *  to 1 except for backward compatibility. Implemented in 2012
+         *  it was generalized late 2013 and is mostly unused except
+         *  in one border case, reason why it must be set to 1.
+         *
+         *  Attempt chooseleaf inner descent once for firstn mode; on
+         *  reject retry outer descent.  Note that this does *not*
+         *  apply to a collision: in that case we will retry as we
+         *  used to.
+         */
+	__u32 chooseleaf_descend_once;
+	/*! Backward compatibility tunable. It is a fix for bad
+         *  mappings implemented in 2014 at
+         *  https://github.com/ceph/ceph/pull/1185. It should always
+         *  be set to 1 except for backward compatibility.
+         *
+         *  If non-zero, feed r into chooseleaf, bit-shifted right by
+	 *  (r-1) bits.  a value of 1 is best for new clusters.  for
+	 *  legacy clusters that want to limit reshuffling, a value of
+	 *  3 or 4 will make the mappings line up a bit better with
+	 *  previous mappings.
+         */
+	__u8 chooseleaf_vary_r;
+
+	/*! Backward compatibility tunable. It is an improvement that
+         *  avoids unnecessary mapping changes, implemented at
+         *  https://github.com/ceph/ceph/pull/6572 and explained in
+         *  this post: "chooseleaf may cause some unnecessary pg
+         *  migrations" in October 2015
+         *  https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
+         *  It should always be set to 1 except for backward compatibility.
+         */
+	__u8 chooseleaf_stable;
+
+        /*! @cond INTERNAL */
+	/* This value is calculated after decode or construction by
+	   the builder. It is exposed here (rather than having a
+	   'build CRUSH working space' function) so that callers can
+	   reserve a static buffer, allocate space on the stack, or
+	   otherwise avoid calling into the heap allocator if they
+	   want to. The size of the working space depends on the map,
+	   while the size of the scratch vector passed to the mapper
+	   depends on the size of the desired result set.
+
+	   Nothing stops the caller from allocating both in one swell
+	   foop and passing in two points, though. */
+	size_t working_size;
+
+#ifndef __KERNEL__
+	/*! @endcond */
+	/*! Backward compatibility tunable. It is a fix for the straw
+         *  scaler values for the straw algorithm which is deprecated
+         *  (straw2 replaces it) implemented at
+         *  https://github.com/ceph/ceph/pull/3057. It should always
+         *  be set to 1 except for backward compatibility.
+         *
+	 */
+	__u8 straw_calc_version;
+
+        /*! @cond INTERNAL */
+	/*
+	 * allowed bucket algs is a bitmask, here the bit positions
+	 * are CRUSH_BUCKET_*.  note that these are *bits* and
+	 * CRUSH_BUCKET_* values are not, so we need to or together (1
+	 * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
+	 * minimize confusion (bucket type values start at 1).
+	 */
+	__u32 allowed_bucket_algs;
+
+	__u32 *choose_tries;
+#endif
+	/*! @endcond */
+};
+
+
+/* crush.c */
+/** @ingroup API
+ *
+ * Return the 16.16 fixed point weight of the item at __pos__ (zero
+ * based index) within the bucket __b__. If __pos__ is negative or
+ * greater or equal to the number of items in the bucket, return 0.
+ *
+ * @param b the bucket containing items
+ * @param pos the zero based index of the item
+ *
+ * @returns the 16.16 fixed point item weight
+ */
+extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
+/*
+ * Per-algorithm bucket destructors, one for each concrete bucket
+ * struct declared in this header.
+ *
+ * NOTE(review): presumably crush_destroy_bucket() dispatches to these
+ * according to the bucket's algorithm, so most callers should use the
+ * generic entry point -- confirm in crush.c.
+ */
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
+/** @ingroup API
+ *
+ * Deallocate a bucket created via crush_add_bucket().
+ *
+ * @param b the bucket to deallocate
+ */
+extern void crush_destroy_bucket(struct crush_bucket *b);
+/** @ingroup API
+ *
+ * Deallocate a rule created via crush_add_rule().
+ *
+ * @param r the rule to deallocate
+ */
+extern void crush_destroy_rule(struct crush_rule *r);
+/** @ingroup API
+ *
+ * Deallocate the __map__, previously allocated with crush_create().
+ *
+ * @param map the crush map
+ */
+extern void crush_destroy(struct crush_map *map);
+
+/*
+ * Map a zero-based item index to its node number in a tree bucket:
+ * item i lives at node 2*(i+1) - 1, i.e. items occupy the odd nodes
+ * 1, 3, 5, ...
+ */
+static inline int crush_calc_tree_node(int i)
+{
+	const int one_based = i + 1;
+
+	return (one_based << 1) - 1;
+}
+
+/*
+ * Return the printable name of a bucket algorithm.
+ *
+ * @param alg one of the CRUSH_BUCKET_* values
+ * @returns a static string; "unknown" for any unrecognized value
+ */
+static inline const char *crush_alg_name(int alg)
+{
+	if (alg == CRUSH_BUCKET_UNIFORM)
+		return "uniform";
+	if (alg == CRUSH_BUCKET_LIST)
+		return "list";
+	if (alg == CRUSH_BUCKET_TREE)
+		return "tree";
+	if (alg == CRUSH_BUCKET_STRAW)
+		return "straw";
+	if (alg == CRUSH_BUCKET_STRAW2)
+		return "straw2";
+
+	/* anything else is not an algorithm this header knows about */
+	return "unknown";
+}
+
+/* ---------------------------------------------------------------------
+			       Private
+   --------------------------------------------------------------------- */
+
+/* These data structures are private to the CRUSH implementation. They
+   are exposed in this header file because builder needs their
+   definitions to calculate the total working size.
+
+   Moving this out of the crush map allows us to treat the CRUSH map as
+   immutable within the mapper and removes the requirement for a CRUSH
+   map lock. */
+
+/*
+ * Scratch state kept for a single bucket: a (possibly partial)
+ * permutation of the bucket's items together with the input value it
+ * was computed for and how many of its entries are valid.
+ *
+ * The struct is declared packed, so no padding is inserted between
+ * the members.
+ */
+struct crush_work_bucket {
+	__u32 perm_x; /* @x for which *perm is defined */
+	__u32 perm_n; /* num elements of *perm that are permuted/defined */
+	__u32 *perm;  /* Permutation of the bucket's items */
+} __attribute__ ((packed));
+
+/*
+ * Top-level working space: an array of pointers to the per-bucket
+ * scratch structures.
+ *
+ * NOTE(review): presumably indexed the same way as crush_map.buckets
+ * -- confirm against the code that initializes it.
+ */
+struct crush_work {
+	struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
+#endif
diff --git a/src/crush/crush_compat.h b/src/crush/crush_compat.h
new file mode 100644
index 000000000..08eb4eab9
--- /dev/null
+++ b/src/crush/crush_compat.h
@@ -0,0 +1,39 @@
+#ifndef CEPH_CRUSH_COMPAT_H
+#define CEPH_CRUSH_COMPAT_H
+
+#include "include/int_types.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* asm-generic/bug.h */
+
+/*
+ * Userspace stand-in for the kernel's BUG_ON(): aborts through
+ * assert() when x is true.  Because it is built on assert(), the
+ * condition is not evaluated at all when NDEBUG is defined, so x must
+ * not carry side effects the program relies on.
+ */
+#define BUG_ON(x) assert(!(x))
+
+/* linux/kernel.h */
+
+/*
+ * Limits of the fixed-width kernel integer types.
+ *
+ *   Un_MAX  all-ones pattern converted to the unsigned type
+ *   Sn_MAX  the unsigned maximum shifted right by one bit
+ *   Sn_MIN  -Sn_MAX - 1
+ */
+#define U8_MAX		((__u8)~0U)
+#define S8_MAX		((__s8)(U8_MAX>>1))
+#define S8_MIN		((__s8)(-S8_MAX - 1))
+#define U16_MAX		((__u16)~0U)
+#define S16_MAX		((__s16)(U16_MAX>>1))
+#define S16_MIN		((__s16)(-S16_MAX - 1))
+#define U32_MAX		((__u32)~0U)
+#define S32_MAX		((__s32)(U32_MAX>>1))
+#define S32_MIN		((__s32)(-S32_MAX - 1))
+#define U64_MAX		((__u64)~0ULL)
+#define S64_MAX		((__s64)(U64_MAX>>1))
+#define S64_MIN		((__s64)(-S64_MAX - 1))
+
+/* linux/math64.h */
+
+/*
+ * Userspace stand-in for the kernel helper: expands to a plain `/`
+ * on its two operands.  No check for a zero divisor is performed.
+ */
+#define div64_s64(dividend, divisor) ((dividend) / (divisor))
+
+/* linux/slab.h */
+
+/*
+ * Userspace stand-ins for the kernel allocator.
+ *
+ * kmalloc(): forwards to malloc(); the flags argument is dropped
+ * without being evaluated.
+ *
+ * kfree(): forwards to free().  free(NULL) is defined to do nothing,
+ * so no NULL guard is needed; omitting it also guarantees that the
+ * argument is evaluated exactly once.
+ */
+#define kmalloc(size, flags) malloc(size)
+#define kfree(x) free(x)
+
+#endif /* CEPH_CRUSH_COMPAT_H */
diff --git a/src/crush/crush_ln_table.h b/src/crush/crush_ln_table.h
new file mode 100644
index 000000000..aae534c90
--- /dev/null
+++ b/src/crush/crush_ln_table.h
@@ -0,0 +1,164 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+/*
+ * Interleaved reciprocal / log2 lookup table, 48-bit fixed point,
+ * for k in [0,128] (129 pairs, 258 entries):
+ *
+ * __RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * __RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ *
+ * The very last log2 entry is stored as 0x0000ffff00000000 rather
+ * than 2^48.  NOTE(review): presumably the values must stay
+ * bit-for-bit as they are so that computed mappings do not change --
+ * confirm before "correcting" any entry.
+ *
+ * The table is a non-const static defined in a header, so every
+ * translation unit that includes it gets its own copy.
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
+  0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+  0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+  0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+  0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+  0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+  0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+  0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+  0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+  0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+  0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+  0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+  0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+  0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+  0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+  0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+  0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+  0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+  0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+  0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+  0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+  0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+  0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+  0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+  0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+  0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+  0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+  0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+  0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+  0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+  0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+  0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+  0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+  0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+  0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+  0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+  0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+  0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+  0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+  0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+  0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+  0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+  0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+  0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+  0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+  0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+  0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+  0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+  0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+  0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+  0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+  0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+  0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+  0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+  0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+  0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+  0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+  0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+  0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+  0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+  0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+  0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+  0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+  0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+  0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+  0x0000800000000000ll, 0x0000ffff00000000ll,
+};
+
+/*
+ * Fine-grained log2 lookup table, 48-bit fixed point, for k in
+ * [0,255]:
+ *
+ * __LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ *
+ * NOTE(review): a number of entries do not look like exact values of
+ * that formula -- the step between neighbours is irregular in places
+ * and several entries share identical low 32 bits (for example
+ * ...13ee805b and ...b07f3458 each occur three times).  Presumably the
+ * table must nevertheless stay bit-for-bit as it is so that computed
+ * mappings do not change -- confirm before "correcting" any entry.
+ *
+ * The literals carry an unsigned (ull) suffix although the element
+ * type is __s64; every value fits in the signed range.
+ */
+static __s64 __LL_tbl[256] = {
+  0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+  0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+  0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+  0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+  0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+  0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+  0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+  0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+  0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+  0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+  0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+  0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+  0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+  0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+  0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+  0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+  0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+  0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+  0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+  0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+  0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+  0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+  0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+  0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+  0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+  0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+  0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+  0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+  0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+  0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+  0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+  0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+  0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+  0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+  0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+  0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+  0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+  0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+  0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+  0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+  0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+  0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+  0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+  0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+  0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+  0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+  0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+  0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+  0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+  0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+  0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+  0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+  0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+  0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+  0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+  0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+  0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+  0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+  0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+  0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+  0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+  0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+  0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+  0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+#endif
diff --git a/src/crush/grammar.h b/src/crush/grammar.h
new file mode 100644
index 000000000..582e502e6
--- /dev/null
+++ b/src/crush/grammar.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2008 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CRUSH_GRAMMAR_H
+#define CEPH_CRUSH_GRAMMAR_H
+
+//#define BOOST_SPIRIT_DEBUG
+
+#ifdef USE_BOOST_SPIRIT_OLD_HDR
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/tree/ast.hpp>
+#include <boost/spirit/tree/tree_to_xml.hpp>
+#else
+#define BOOST_SPIRIT_USE_OLD_NAMESPACE
+#include <boost/spirit/include/classic_core.hpp>
+#include <boost/spirit/include/classic_ast.hpp>
+#include <boost/spirit/include/classic_tree_to_xml.hpp>
+#endif
+using namespace boost::spirit;
+
+/*
+ * Boost.Spirit (classic) grammar for the textual crush map format.
+ *
+ * Every rule is tagged with one of the ids below through parser_tag<>,
+ * and the start rule is `crushmap`.
+ */
+struct crush_grammar : public boost::spirit::grammar<crush_grammar>
+{
+  // Ids carried by parser_tag<> on the rules declared in definition<>.
+  // Numbering starts at 1 and follows declaration order.
+  enum {
+    _int = 1,
+    _posint,
+    _negint,
+    _name,
+    _device,
+    _bucket_type,
+    _bucket_id,
+    _bucket_alg,
+    _bucket_hash,
+    _bucket_item,
+    _bucket,
+    _step_take,
+    _step_set_chooseleaf_tries,
+    _step_set_chooseleaf_vary_r,
+    _step_set_chooseleaf_stable,
+    _step_set_choose_tries,
+    _step_set_choose_local_tries,
+    _step_set_choose_local_fallback_tries,
+    _step_choose,
+    _step_chooseleaf,
+    _step_emit,
+    _step,
+    _crushrule,
+    _weight_set_weights,
+    _weight_set,
+    _choose_arg_ids,
+    _choose_arg,
+    _choose_args,
+    _crushmap,
+    _tunable,
+  };
+
+  template <typename ScannerT>
+  struct definition
+  {
+    // Shorthand for a rule over ScannerT whose tag is Tag.
+    template <int Tag>
+    using tagged_rule =
+      boost::spirit::rule<ScannerT,
+                          boost::spirit::parser_context<>,
+                          boost::spirit::parser_tag<Tag> >;
+
+    // base types
+    tagged_rule<_int>     integer;
+    tagged_rule<_posint>  posint;
+    tagged_rule<_negint>  negint;
+    tagged_rule<_name>    name;
+
+    // tunables, devices, bucket types
+    tagged_rule<_tunable>      tunable;
+    tagged_rule<_device>       device;
+    tagged_rule<_bucket_type>  bucket_type;
+
+    // buckets
+    tagged_rule<_bucket_id>    bucket_id;
+    tagged_rule<_bucket_alg>   bucket_alg;
+    tagged_rule<_bucket_hash>  bucket_hash;
+    tagged_rule<_bucket_item>  bucket_item;
+    tagged_rule<_bucket>       bucket;
+
+    // rule steps and rules
+    tagged_rule<_step_take>                             step_take;
+    tagged_rule<_step_set_choose_tries>                 step_set_choose_tries;
+    tagged_rule<_step_set_choose_local_tries>           step_set_choose_local_tries;
+    tagged_rule<_step_set_choose_local_fallback_tries>  step_set_choose_local_fallback_tries;
+    tagged_rule<_step_set_chooseleaf_tries>             step_set_chooseleaf_tries;
+    tagged_rule<_step_set_chooseleaf_vary_r>            step_set_chooseleaf_vary_r;
+    tagged_rule<_step_set_chooseleaf_stable>            step_set_chooseleaf_stable;
+    tagged_rule<_step_choose>                           step_choose;
+    tagged_rule<_step_chooseleaf>                       step_chooseleaf;
+    tagged_rule<_step_emit>                             step_emit;
+    tagged_rule<_step>                                  step;
+    tagged_rule<_crushrule>                             crushrule;
+
+    // choose_args
+    tagged_rule<_weight_set_weights>  weight_set_weights;
+    tagged_rule<_weight_set>          weight_set;
+    tagged_rule<_choose_arg_ids>      choose_arg_ids;
+    tagged_rule<_choose_arg>          choose_arg;
+    tagged_rule<_choose_args>         choose_args;
+
+    // the whole crush map (start rule)
+    tagged_rule<_crushmap>  crushmap;
+
+    definition(crush_grammar const& /*self*/)
+    {
+      using boost::spirit::leaf_node_d;
+      using boost::spirit::lexeme_d;
+      using boost::spirit::str_p;
+      using boost::spirit::ch_p;
+      using boost::spirit::digit_p;
+      using boost::spirit::alnum_p;
+      using boost::spirit::real_p;
+
+      // base types: each is collapsed into a single leaf node
+      integer = leaf_node_d[ lexeme_d[ (!ch_p('-') >> +digit_p) ] ];
+      posint  = leaf_node_d[ lexeme_d[ +digit_p ] ];
+      negint  = leaf_node_d[ lexeme_d[ ch_p('-') >> +digit_p ] ];
+      name    = leaf_node_d[ lexeme_d[
+                  +( alnum_p || ch_p('-') || ch_p('_') || ch_p('.'))
+                ] ];
+
+      // tunables
+      tunable = str_p("tunable") >> name >> posint;
+
+      // devices (the class suffix is optional)
+      device = str_p("device") >> posint >> name
+               >> !( str_p("class") >> name );
+
+      // bucket types
+      bucket_type = str_p("type") >> posint >> name;
+
+      // buckets
+      bucket_id   = str_p("id") >> negint
+                    >> !( str_p("class") >> name );
+      bucket_alg  = str_p("alg") >> name;
+      bucket_hash = str_p("hash") >> ( integer | str_p("rjenkins1") );
+      bucket_item = str_p("item") >> name
+                    >> !( str_p("weight") >> real_p )
+                    >> !( str_p("pos") >> posint );
+      bucket      = name >> name >> '{'
+                    >> *bucket_id
+                    >> bucket_alg
+                    >> *bucket_hash
+                    >> *bucket_item
+                    >> '}';
+
+      // rule steps
+      step_take = str_p("take") >> name
+                  >> !( str_p("class") >> name );
+      step_set_choose_tries =
+        str_p("set_choose_tries") >> posint;
+      step_set_choose_local_tries =
+        str_p("set_choose_local_tries") >> posint;
+      step_set_choose_local_fallback_tries =
+        str_p("set_choose_local_fallback_tries") >> posint;
+      step_set_chooseleaf_tries =
+        str_p("set_chooseleaf_tries") >> posint;
+      step_set_chooseleaf_vary_r =
+        str_p("set_chooseleaf_vary_r") >> posint;
+      step_set_chooseleaf_stable =
+        str_p("set_chooseleaf_stable") >> posint;
+      step_choose =
+        str_p("choose")
+        >> ( str_p("indep") | str_p("firstn") )
+        >> integer
+        >> str_p("type") >> name;
+      step_chooseleaf =
+        str_p("chooseleaf")
+        >> ( str_p("indep") | str_p("firstn") )
+        >> integer
+        >> str_p("type") >> name;
+      step_emit = str_p("emit");
+
+      // alternatives are tried in the order listed
+      step = str_p("step") >> ( step_take |
+                                step_set_choose_tries |
+                                step_set_choose_local_tries |
+                                step_set_choose_local_fallback_tries |
+                                step_set_chooseleaf_tries |
+                                step_set_chooseleaf_vary_r |
+                                step_set_chooseleaf_stable |
+                                step_choose |
+                                step_chooseleaf |
+                                step_emit );
+
+      // rules: optional name, then a fixed header followed by 1+ steps
+      crushrule = str_p("rule") >> !name >> '{'
+                  >> (str_p("id") | str_p("ruleset")) >> posint
+                  >> str_p("type") >> ( str_p("replicated") | str_p("erasure") )
+                  >> str_p("min_size") >> posint
+                  >> str_p("max_size") >> posint
+                  >> +step
+                  >> '}';
+
+      // choose_args
+      weight_set_weights = str_p("[") >> *real_p >> str_p("]");
+      weight_set = str_p("weight_set") >> str_p("[")
+                   >> *weight_set_weights
+                   >> str_p("]");
+      choose_arg_ids = str_p("ids") >> str_p("[") >> *integer >> str_p("]");
+      choose_arg = str_p("{") >> str_p("bucket_id") >> negint
+                   >> !weight_set
+                   >> !choose_arg_ids
+                   >> str_p("}");
+      choose_args = str_p("choose_args") >> posint
+                    >> str_p("{") >> *choose_arg >> str_p("}");
+
+      // the whole crush map
+      crushmap = *(tunable | device | bucket_type)
+                 >> *(bucket | crushrule)
+                 >> *choose_args;
+    }
+
+    // Entry point used by the Spirit framework.
+    tagged_rule<_crushmap> const&
+    start() const { return crushmap; }
+  };
+};
+
+#endif
diff --git a/src/crush/hash.c b/src/crush/hash.c
new file mode 100644
index 000000000..ed123af49
--- /dev/null
+++ b/src/crush/hash.c
@@ -0,0 +1,151 @@
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
+
+/*
+ * Robert Jenkins' 32-bit mix (http://burtleburtle.net/bob/hash/evahash.html).
+ * a, b = random bits, c = input and output.  All three arguments are
+ * assigned to and expanded many times over, so pass plain lvalues only.
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911	/* XORed with the inputs to start every hash below */
+
+/* One-input rjenkins1: mix a copy of @a, then @a itself, into the seed. */
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+	__u32 acc = crush_hash_seed ^ a;
+	__u32 a_copy = a, k1 = 231232, k2 = 1232;
+
+	crush_hashmix(a_copy, k1, acc);
+	crush_hashmix(k2, a, acc);
+	return acc;
+}
+
+/* Two-input rjenkins1: seed ^ a ^ b, then three crush_hashmix rounds. */
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 acc = crush_hash_seed ^ a ^ b;
+	crush_hashmix(a, b, acc);
+	crush_hashmix(k1, a, acc);
+	crush_hashmix(b, k2, acc);
+	return acc;
+}
+
+/* Three-input rjenkins1: seed ^ a ^ b ^ c, then five crush_hashmix rounds. */
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 acc = crush_hash_seed ^ a ^ b ^ c;
+	crush_hashmix(a, b, acc);
+	crush_hashmix(c, k1, acc);
+	crush_hashmix(k2, a, acc);
+	crush_hashmix(b, k1, acc);
+	crush_hashmix(k2, c, acc);
+	return acc;
+}
+
+/* Four-input rjenkins1: seed ^ a ^ b ^ c ^ d, then six crush_hashmix rounds. */
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 acc = crush_hash_seed ^ a ^ b ^ c ^ d;
+	crush_hashmix(a, b, acc);
+	crush_hashmix(c, d, acc);
+	crush_hashmix(a, k1, acc);
+	crush_hashmix(k2, b, acc);
+	crush_hashmix(c, k1, acc);
+	crush_hashmix(k2, d, acc);
+	return acc;
+}
+
+/* Five-input rjenkins1: seed ^ a ^ b ^ c ^ d ^ e, then eight mix rounds. */
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 acc = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	crush_hashmix(a, b, acc);
+	crush_hashmix(c, d, acc);
+	crush_hashmix(e, k1, acc);
+	crush_hashmix(k2, a, acc);
+	crush_hashmix(b, k1, acc);
+	crush_hashmix(k2, c, acc);
+	crush_hashmix(d, k1, acc);
+	crush_hashmix(k2, e, acc);
+	return acc;
+}
+
+
+/* Hash one value with algorithm @type; an unrecognized @type yields 0. */
+__u32 crush_hash32(int type, __u32 a)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1(a);
+
+	/* unrecognized hash type */
+	return 0;
+}
+
+/* Hash two values with algorithm @type; an unrecognized @type yields 0. */
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_2(a, b);
+
+	/* unrecognized hash type */
+	return 0;
+}
+
+/* Hash three values with algorithm @type; an unrecognized @type yields 0. */
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_3(a, b, c);
+
+	/* unrecognized hash type */
+	return 0;
+}
+
+/* Hash four values with algorithm @type; an unrecognized @type yields 0. */
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+
+	/* unrecognized hash type */
+	return 0;
+}
+
+/* Hash five values with algorithm @type; an unrecognized @type yields 0. */
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+
+	/* unrecognized hash type */
+	return 0;
+}
+
+/* Printable name of hash algorithm @type ("unknown" when unrecognized). */
+const char *crush_hash_name(int type)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return "rjenkins1";
+
+	/* any other value has no name */
+	return "unknown";
+}
diff --git a/src/crush/hash.h b/src/crush/hash.h
new file mode 100644
index 000000000..d1d902582
--- /dev/null
+++ b/src/crush/hash.h
@@ -0,0 +1,23 @@
+#ifndef CEPH_CRUSH_HASH_H
+#define CEPH_CRUSH_HASH_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
+#define CRUSH_HASH_RJENKINS1   0	/* Robert Jenkins' mix; the one type hash.c handles */
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);	/* "rjenkins1", or "unknown" */
+
+extern __u32 crush_hash32(int type, __u32 a);	/* each hasher yields 0 for an unknown type */
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+			    __u32 e);
+
+#endif
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
new file mode 100644
index 000000000..4ac572627
--- /dev/null
+++ b/src/crush/mapper.c
@@ -0,0 +1,1105 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
+#else
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
+#endif
+#include "crush_ln_table.h"
+#include "mapper.h"
+
+#define dprintk(args...) /* printf(args) */
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ * Returns the index of the first matching rule, or -1 when none matches.
+ */
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
+{
+	__u32 idx;
+
+	for (idx = 0; idx < map->max_rules; idx++) {
+		const struct crush_rule *r = map->rules[idx];
+		if (!r || r->mask.ruleset != ruleset || r->mask.type != type)
+			continue;	/* empty slot, or another ruleset/type */
+		if (r->mask.min_size <= size && r->mask.max_size >= size)
+			return idx;
+	}
+	return -1;
+}
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ * The permutation is cached in @work and only extended as far as needed.
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+			      struct crush_work_bucket *work,
+			      int x, int r)
+{
+	unsigned int pr = r % bucket->size;	/* permutation slot to return */
+	unsigned int i, s;
+
+	/* start a new permutation if @x has changed */
+	if (work->perm_x != (__u32)x || work->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		work->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+				bucket->size;
+			work->perm[0] = s;
+			work->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		for (i = 0; i < bucket->size; i++)
+			work->perm[i] = i;
+		work->perm_n = 0;
+	} else if (work->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			work->perm[i] = i;
+		work->perm[work->perm[0]] = 0;	/* 0 goes where slot 0's entry came from */
+		work->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < work->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+	while (work->perm_n <= pr) {
+		unsigned int p = work->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned int t = work->perm[p + i];
+				work->perm[p + i] = work->perm[p];
+				work->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		work->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
+
+	s = work->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];	/* s indexes the bucket's item array */
+}
+
+/* uniform: defers entirely to bucket_perm_choose() on the embedded header */
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+				 struct crush_work_bucket *work, int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, work, x, r);
+}
+
+/* list: scan from the last item down; an item is taken when the hash draw,
+ * scaled by sum_weights[i] and shifted down 16 bits, is below item_weights[i] */
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	const struct crush_bucket *hdr = &bucket->h;
+	int idx;
+
+	for (idx = hdr->size - 1; idx >= 0; idx--) {
+		__u64 draw = crush_hash32_4(hdr->hash, x, hdr->items[idx],
+					    r, hdr->id);
+		draw &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			idx, x, r, hdr->items[idx], bucket->item_weights[idx],
+			bucket->sum_weights[idx], draw);
+		draw = (draw * bucket->sum_weights[idx]) >> 16;
+		/*dprintk(" scaled %llx\n", draw);*/
+		if (draw < bucket->item_weights[idx])
+			return hdr->items[idx];
+	}
+
+	dprintk("bad list sums for bucket %d\n", hdr->id);
+	return hdr->items[0];
+}
+
+
+/* (binary) tree: a node's height is its count of trailing zero bits */
+static int height(int n)
+{
+	int h;
+
+	/* n must be non-zero, otherwise no set bit is ever reached */
+	for (h = 0; (n & 1) == 0; h++)
+		n = n >> 1;
+	return h;
+}
+
+static int left(int x)
+{
+	/* the left child lies 2^(height-1) below x */
+	return x - (1 << (height(x) - 1));
+}
+
+static int right(int x)
+{
+	/* the right child lies 2^(height-1) above x */
+	return x + (1 << (height(x) - 1));
+}
+
+static int terminal(int x)
+{
+	return (x & 1) != 0;	/* odd node numbers are the leaves */
+}
+
+/*
+ * tree: walk the implicit binary tree from the root down to a leaf.  At
+ * each inner node a hash of (x, node, r, bucket id), scaled by that
+ * node's weight, is compared with the left child's weight to pick a
+ * side; the leaf reached selects the returned item.
+ */
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	/* the root is the middle node */
+	int node = bucket->num_nodes >> 1;
+
+	while (!terminal(node)) {
+		/* scale the 32-bit hash to a point in [0, span) */
+		__u32 span = bucket->node_weights[node];
+		__u64 point = (__u64)crush_hash32_4(bucket->h.hash, x, node, r,
+						    bucket->h.id) * (__u64)span;
+		int lchild = left(node);
+
+		point >>= 32;
+		/* points below the left child's weight go left */
+		node = (point < bucket->node_weights[lchild]) ?
+			lchild : right(node);
+	}
+
+	/* leaf number n corresponds to item n/2 */
+	return bucket->h.items[node >> 1];
+}
+
+
+/* straw: every item draws a 16-bit hash scaled by its straws[] entry; the
+ * largest product wins, and the earliest item wins ties. */
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int winner = 0;
+	__u64 best = 0;
+	__u32 idx;
+
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		__u64 len = crush_hash32_3(bucket->h.hash, x,
+					   bucket->h.items[idx], r) & 0xffff;
+
+		len *= bucket->straws[idx];
+		if (idx != 0 && len <= best)
+			continue;
+		winner = idx;
+		best = len;
+	}
+	return bucket->h.items[winner];
+}
+
+/* compute 2^44*log2(input+1) in fixed point, via the __RH_LH_tbl and __LL_tbl lookup tables */
+static __u64 crush_ln(unsigned int xin)
+{
+	unsigned int x = xin;
+	int iexpon, index1, index2;
+	__u64 RH, LH, LL, xl64, result;
+
+	x++;	/* take the log of xin + 1 */
+
+	/* normalize input */
+	iexpon = 15;
+
+	// figure out number of bits we need to shift and
+	// do it in one step instead of iteratively
+	if (!(x & 0x18000)) {
+	  int bits = __builtin_clz(x & 0x1FFFF) - 16;	/* assumes 32-bit unsigned int - TODO confirm */
+	  x <<= bits;
+	  iexpon = 15 - bits;
+	}
+
+	index1 = (x >> 8) << 1;
+	/* RH ~ 2^56/index1 */
+	RH = __RH_LH_tbl[index1 - 256];
+	/* LH ~ 2^48 * log2(index1/256) */
+	LH = __RH_LH_tbl[index1 + 1 - 256];
+
+	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+	xl64 = (__s64)x * RH;
+	xl64 >>= 48;
+
+	result = iexpon;	/* integer part of the logarithm ... */
+	result <<= (12 + 32);	/* ... moved above the 44 fraction bits */
+
+	index2 = xl64 & 0xff;
+	/* LL ~ 2^48*log2(1.0+index2/2^15) */
+	LL = __LL_tbl[index2];
+
+	LH = LH + LL;
+
+	LH >>= (48 - 12 - 32);	/* rescale the table sum from 2^48 to 2^44 */
+	result += LH;
+
+	return result;
+}
+
+
+/*
+ * straw2
+ *
+ * Suppose we have two osds: osd.0 and osd.1, with weight 8 and 4 respectively. It means:
+ *   a). For osd.0, the time interval between io requests follows an exponential distribution
+ *       with lambda equal to 8
+ *   b). For osd.1, the time interval between io requests follows an exponential distribution
+ *       with lambda equal to 4
+ *   c). If we sample each osd's exponential random variable and take the smallest sample,
+ *       then the total pgs on each osd is proportional to its weight.
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ */
+
+static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+                                            const struct crush_choose_arg *arg,
+                                            int position)
+{
+	if (!arg || !arg->weight_set)
+		return bucket->item_weights;	/* no override supplied */
+	if (position >= arg->weight_set_positions)	/* past the end: use the last set */
+		position = arg->weight_set_positions - 1;
+	return arg->weight_set[position].weights;
+}
+
+static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+					const struct crush_choose_arg *arg)
+{
+	/* ids supplied through @arg, when present, stand in for the
+	 * bucket's own item ids */
+	return (arg && arg->ids) ? arg->ids : bucket->h.items;
+}
+
+/*
+ * Compute exponential random variable using inversion method.
+ * The low 16 bits of the hash of (@x, @y, @z) under hash @type go
+ * through crush_ln(), are offset by -2^48, and are divided by @weight.
+ *
+ * for reference, see the exponential distribution example at:
+ * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples
+ */
+static inline __s64 generate_exponential_distribution(int type, int x, int y, int z,
+                                                      int weight)
+{
+	unsigned int sample = crush_hash32_3(type, x, y, z) & 0xffff;
+	__s64 log_val;
+
+	/*
+	 * for some reason slightly less than 0x10000 produces
+	 * a slightly more accurate distribution... probably a
+	 * rounding effect.
+	 *
+	 * the natural log lookup table maps [0,0xffff]
+	 * (corresponding to real numbers [1/0x10000, 1]) to
+	 * [0, 0xffffffffffff] (corresponding to real numbers
+	 * [-11.090355,0]).
+	 */
+	log_val = crush_ln(sample) - 0x1000000000000ll;
+
+	/*
+	 * divide by the 16.16 fixed-point weight.  the ln value is
+	 * negative, so a larger weight gives a larger (less negative)
+	 * draw.
+	 */
+	return div64_s64(log_val, weight);
+}
+
+/* straw2: largest draw wins; override ids only feed the hash, items[] is returned */
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
+				int x, int r, const struct crush_choose_arg *arg,
+                                int position)
+{
+	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
+	__s32 *ids = get_choose_arg_ids(bucket, arg);
+	unsigned int idx, winner = 0;
+	__s64 best = 0;
+
+	for (idx = 0; idx < bucket->h.size; idx++) {
+		__s64 cur = S64_MIN;	/* what a zero-weight item draws */
+
+		dprintk("weight 0x%x item %d\n", weights[idx], ids[idx]);
+		if (weights[idx])
+			cur = generate_exponential_distribution(
+				bucket->h.hash, x, ids[idx], r, weights[idx]);
+		if (idx != 0 && cur <= best)
+			continue;
+		winner = idx;
+		best = cur;
+	}
+	return bucket->h.items[winner];
+}
+
+
+/*
+ * Dispatch on the bucket algorithm to the matching *_choose() helper.
+ * An unrecognized algorithm falls back to the bucket's first item.
+ */
+static int crush_bucket_choose(const struct crush_bucket *in,
+			       struct crush_work_bucket *work,
+			       int x, int r,
+                               const struct crush_choose_arg *arg,
+                               int position)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	BUG_ON(in->size == 0);
+
+	if (in->alg == CRUSH_BUCKET_UNIFORM)
+		return bucket_uniform_choose(
+			(const struct crush_bucket_uniform *)in, work, x, r);
+	if (in->alg == CRUSH_BUCKET_LIST)
+		return bucket_list_choose(
+			(const struct crush_bucket_list *)in, x, r);
+	if (in->alg == CRUSH_BUCKET_TREE)
+		return bucket_tree_choose(
+			(const struct crush_bucket_tree *)in, x, r);
+	if (in->alg == CRUSH_BUCKET_STRAW)
+		return bucket_straw_choose(
+			(const struct crush_bucket_straw *)in, x, r);
+	if (in->alg == CRUSH_BUCKET_STRAW2)
+		return bucket_straw2_choose(
+			(const struct crush_bucket_straw2 *)in, x, r, arg, position);
+
+	dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
+	return in->items[0];
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded) of the cluster
+ */
+static int is_out(const struct crush_map *map,
+		  const __u32 *weight, int weight_max,
+		  int item, int x)
+{
+	__u32 w;
+
+	if (item >= weight_max)
+		return 1;	/* beyond the weight vector */
+	w = weight[item];
+	if (w == 0)
+		return 1;	/* weight zero: always out */
+	if (w >= 0x10000)
+		return 0;	/* weight 0x10000 or more: never out */
+	/* in between: out unless the 16-bit hash of (x, item) is below w */
+	return (crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) >= w;
+}
+
+/**
+ * crush_choose_firstn - choose numrep distinct items of given type; returns the new outpos
+ * @map: the crush_map
+ * @bucket: the bucket we are choosing an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @out_size: size of the out vector
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
+ */
+static int crush_choose_firstn(const struct crush_map *map,
+			       struct crush_work *work,	/* scratch; work->work[-1-id] per bucket */
+			       const struct crush_bucket *bucket,
+			       const __u32 *weight, int weight_max,	/* device weights, handed to is_out() */
+			       int x, int numrep, int type,
+			       int *out, int outpos,
+			       int out_size,
+			       unsigned int tries,
+			       unsigned int recurse_tries,
+			       unsigned int local_retries,
+			       unsigned int local_fallback_retries,
+			       int recurse_to_leaf,
+			       unsigned int vary_r,
+			       unsigned int stable,
+			       int *out2,
+			       int parent_r,
+                               const struct crush_choose_arg *choose_args)	/* optional; indexed by -1-bucket id */
+{
+	int rep;
+	unsigned int ftotal, flocal;
+	int retry_descent, retry_bucket, skip_rep;
+	const struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide, reject;
+	int count = out_size;	/* output slots still available */
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \
+recurse_tries %d local_retries %d local_fallback_retries %d \
+parent_r %d stable %d\n",
+		recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep,
+		tries, recurse_tries, local_retries, local_fallback_retries,
+		parent_r, stable);
+
+	for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;              /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				collide = 0;
+				retry_bucket = 0;
+				r = rep + parent_r;
+				/* r' = r + f_total */
+				r += ftotal;
+
+				/* bucket choose */
+				if (in->size == 0) {
+					reject = 1;
+					goto reject;
+				}
+				if (local_fallback_retries > 0 &&
+				    flocal >= (in->size>>1) &&
+				    flocal > local_fallback_retries)
+					item = bucket_perm_choose(
+						in, work->work[-1-in->id],
+						x, r);
+				else
+					item = crush_bucket_choose(
+						in, work->work[-1-in->id],
+						x, r,
+                                                (choose_args ? &choose_args[-1-in->id] : 0),
+                                                outpos);
+				if (item >= map->max_devices) {
+					dprintk("   bad item %d\n", item);
+					skip_rep = 1;
+					break;
+				}
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					if (item >= 0 ||
+					    (-1-item) >= map->max_buckets) {
+						dprintk("   bad item type %d\n", type);
+						skip_rep = 1;
+						break;
+					}
+					in = map->buckets[-1-item];
+					retry_bucket = 1;
+					continue;
+				}
+
+				/* collision? */
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				reject = 0;
+				if (!collide && recurse_to_leaf) {
+					if (item < 0) {
+						int sub_r;
+						if (vary_r)
+							sub_r = r >> (vary_r-1);
+						else
+							sub_r = 0;
+						if (crush_choose_firstn(
+							    map,
+							    work,
+							    map->buckets[-1-item],
+							    weight, weight_max,
+							    x, stable ? 1 : outpos+1, 0,
+							    out2, outpos, count,
+							    recurse_tries, 0,
+							    local_retries,
+							    local_fallback_retries,
+							    0,
+							    vary_r,
+							    stable,
+							    NULL,
+							    sub_r,
+                                                            choose_args) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+		                        }
+				}
+
+				if (!reject && !collide) {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								weight_max,
+								item, x);
+				}
+
+reject:
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal <= local_retries)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (local_fallback_retries > 0 &&
+						 flocal <= in->size + local_fallback_retries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < tries)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %u  flocal %u\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("CHOOSE got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+		count--;
+#ifndef __KERNEL__
+		if (map->choose_tries && ftotal <= map->choose_total_tries)
+			map->choose_tries[ftotal]++;	/* tally placements by number of failed attempts */
+#endif
+	}
+
+	dprintk("CHOOSE returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ * Fills @left slots of @out from @outpos; unfilled slots end up CRUSH_ITEM_NONE.
+ */
+static void crush_choose_indep(const struct crush_map *map,
+			       struct crush_work *work,	/* scratch; work->work[-1-id] per bucket */
+			       const struct crush_bucket *bucket,
+			       const __u32 *weight, int weight_max,	/* device weights, handed to is_out() */
+			       int x, int left, int numrep, int type,
+			       int *out, int outpos,
+			       unsigned int tries,
+			       unsigned int recurse_tries,
+			       int recurse_to_leaf,
+			       int *out2,	/* leaf items when recursing; may be NULL */
+			       int parent_r,
+                               const struct crush_choose_arg *choose_args)	/* optional; indexed by -1-bucket id */
+{
+	const struct crush_bucket *in = bucket;
+	int endpos = outpos + left;	/* one past the last slot this call owns */
+	int rep;
+	unsigned int ftotal;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide;
+
+	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
+
+	/* initially my result is undefined */
+	for (rep = outpos; rep < endpos; rep++) {
+		out[rep] = CRUSH_ITEM_UNDEF;
+		if (out2)
+			out2[rep] = CRUSH_ITEM_UNDEF;
+	}
+
+	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+		if (out2 && ftotal) {
+			dprintk("%u %d a: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out[rep]);
+			}
+			dprintk("\n");
+			dprintk("%u %d b: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out2[rep]);
+			}
+			dprintk("\n");
+		}
+#endif
+		for (rep = outpos; rep < endpos; rep++) {
+			if (out[rep] != CRUSH_ITEM_UNDEF)
+				continue;
+
+			in = bucket;  /* initial bucket */
+
+			/* choose through intervening buckets */
+			for (;;) {
+				/* note: we base the choice on the position
+				 * even in the nested call.  that means that
+				 * if the first layer chooses the same bucket
+				 * in a different position, we will tend to
+				 * choose a different item in that bucket.
+				 * this will involve more devices in data
+				 * movement and tend to distribute the load.
+				 */
+				r = rep + parent_r;
+
+				/* be careful */
+				if (in->alg == CRUSH_BUCKET_UNIFORM &&
+				    in->size % numrep == 0)
+					/* r'=r+(n+1)*f_total */
+					r += (numrep+1) * ftotal;
+				else
+					/* r' = r + n*f_total */
+					r += numrep * ftotal;
+
+				/* bucket choose */
+				if (in->size == 0) {
+					dprintk("   empty bucket\n");
+					break;
+				}
+
+				item = crush_bucket_choose(
+					in, work->work[-1-in->id],
+					x, r,
+                                        (choose_args ? &choose_args[-1-in->id] : 0),
+                                        outpos);
+				if (item >= map->max_devices) {
+					dprintk("   bad item %d\n", item);
+					out[rep] = CRUSH_ITEM_NONE;
+					if (out2)
+						out2[rep] = CRUSH_ITEM_NONE;
+					left--;
+					break;
+				}
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					if (item >= 0 ||
+					    (-1-item) >= map->max_buckets) {
+						dprintk("   bad item type %d\n", type);
+						out[rep] = CRUSH_ITEM_NONE;
+						if (out2)
+							out2[rep] =
+								CRUSH_ITEM_NONE;
+						left--;
+						break;
+					}
+					in = map->buckets[-1-item];
+					continue;
+				}
+
+				/* collision? */
+				collide = 0;
+				for (i = outpos; i < endpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+				if (collide)
+					break;
+
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						crush_choose_indep(
+							map,
+							work,
+							map->buckets[-1-item],
+							weight, weight_max,
+							x, 1, numrep, 0,
+							out2, rep,
+							recurse_tries, 0,
+							0, NULL, r, choose_args);
+						if (out2 && out2[rep] == CRUSH_ITEM_NONE) {
+							/* placed nothing; no leaf */
+							break;
+						}
+					} else if (out2) {
+						/* we already have a leaf! */
+						out2[rep] = item;
+					}
+				}
+
+				/* out? */
+				if (itemtype == 0 &&
+				    is_out(map, weight, weight_max, item, x))
+					break;
+
+				/* yay! */
+				out[rep] = item;
+				left--;
+				break;
+			}
+		}
+	}
+	for (rep = outpos; rep < endpos; rep++) {
+		if (out[rep] == CRUSH_ITEM_UNDEF) {
+			out[rep] = CRUSH_ITEM_NONE;
+		}
+		if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+			out2[rep] = CRUSH_ITEM_NONE;
+		}
+	}
+#ifndef __KERNEL__
+	if (map->choose_tries && ftotal <= map->choose_total_tries)
+		map->choose_tries[ftotal]++;	/* tally calls by number of passes used */
+#endif
+#ifdef DEBUG_INDEP
+	if (out2) {
+		dprintk("%u %d a: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out[rep]);
+		}
+		dprintk("\n");
+		dprintk("%u %d b: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out2[rep]);
+		}
+		dprintk("\n");
+	}
+#endif
+}
+
+
+/* Lay out a caller-supplied chunk of memory as the working area for
+   a CRUSH placement computation. It must be called on any newly
+   allocated memory before that memory is passed to crush_do_rule.
+   The area may then be used repeatedly, so long as the map has not
+   changed. If the map /has/ changed, make sure the working size is
+   no smaller than what was allocated and re-run
+   crush_init_workspace.
+
+   If you do retain the working space between calls to crush, make it
+   thread-local. If you reinstitute the locking I've spent so much
+   time getting rid of, I will be very unhappy with you. */
+
+void crush_init_workspace(const struct crush_map *m, void *v) {
+	/* Carve the region up front to back: the crush_work header, the
+	   table of per-bucket pointers, then for each existing bucket a
+	   crush_work_bucket record followed by its permutation array. */
+	struct crush_work *ws = (struct crush_work *)v;
+	char *cursor = (char *)v + sizeof(struct crush_work);
+	__s32 idx;
+
+	ws->work = (struct crush_work_bucket **)cursor;
+	cursor += m->max_buckets * sizeof(struct crush_work_bucket *);
+
+	for (idx = 0; idx < m->max_buckets; ++idx) {
+		const struct crush_bucket *bkt = m->buckets[idx];
+		struct crush_work_bucket *slot;
+
+		if (bkt == 0)
+			continue;	/* no bucket here; ws->work[idx] stays unset */
+
+		/* fixed-size record (the same for every algorithm) ... */
+		slot = (struct crush_work_bucket *)cursor;
+		cursor += sizeof(struct crush_work_bucket);
+		slot->perm_x = 0;
+		slot->perm_n = 0;
+		/* ... immediately followed by one __u32 per bucket item */
+		slot->perm = (__u32 *)cursor;
+		cursor += bkt->size * sizeof(__u32);
+		ws->work[idx] = slot;
+	}
+	BUG_ON(cursor - (char *)ws != m->working_size);
+}
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @cwin: crush_init_workspace() memory plus room for 3*result_max ints
+ * @choose_args: optional per-bucket weight/id overrides, or NULL
+ * Returns the number of items stored in @result (0 for a bad @ruleno).
+ */
+int crush_do_rule(const struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  const __u32 *weight, int weight_max,
+		  void *cwin, const struct crush_choose_arg *choose_args)
+{
+	int result_len = 0;
+	struct crush_work *cw = cwin;
+	int *a = (int *)((char *)cw + map->working_size);
+	int *b = a + result_max;
+	int *c = b + result_max;
+	int *w = a;
+	int *o = b;
+	int recurse_to_leaf;
+	int wsize = 0, osize;
+	int *tmp;
+	const struct crush_rule *rule;
+	__u32 step;
+	int i, j;
+	int numrep, out_size;
+	/*
+	 * the original choose_total_tries value was off by one (it
+	 * counted "retries" and not "tries").  add one.
+	 */
+	int choose_tries = map->choose_total_tries + 1;
+	int choose_leaf_tries = 0;
+	/*
+	 * the local tries values were counted as "retries", though,
+	 * and need no adjustment
+	 */
+	int choose_local_retries = map->choose_local_tries;
+	int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+	int vary_r = map->chooseleaf_vary_r;
+	int stable = map->chooseleaf_stable;
+
+	/* reject ids outside the table and empty (NULL) rule slots alike */
+	if ((__u32)ruleno >= map->max_rules || !map->rules[ruleno]) {
+		dprintk(" bad ruleno %d\n", ruleno);
+		return 0;
+	}
+
+	rule = map->rules[ruleno];
+
+	for (step = 0; step < rule->len; step++) {
+		int firstn = 0;
+		const struct crush_rule_step *curstep = &rule->steps[step];
+
+		switch (curstep->op) {
+		case CRUSH_RULE_TAKE:
+			if ((curstep->arg1 >= 0 &&
+			     curstep->arg1 < map->max_devices) ||
+			    (-1-curstep->arg1 >= 0 &&
+			     -1-curstep->arg1 < map->max_buckets &&
+			     map->buckets[-1-curstep->arg1])) {
+				w[0] = curstep->arg1;
+				wsize = 1;
+			} else {
+				dprintk(" bad take value %d\n", curstep->arg1);
+			}
+			break;
+
+		case CRUSH_RULE_SET_CHOOSE_TRIES:
+			if (curstep->arg1 > 0)
+				choose_tries = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+			if (curstep->arg1 > 0)
+				choose_leaf_tries = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+			if (curstep->arg1 >= 0)
+				choose_local_retries = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+			if (curstep->arg1 >= 0)
+				choose_local_fallback_retries = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+			if (curstep->arg1 >= 0)
+				vary_r = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+			if (curstep->arg1 >= 0)
+				stable = curstep->arg1;
+			break;
+
+		case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+			/* fall through */
+		case CRUSH_RULE_CHOOSELEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			if (wsize == 0)
+				break;
+
+			recurse_to_leaf =
+				curstep->op ==
+				 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+				curstep->op ==
+				CRUSH_RULE_CHOOSELEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				int bno;
+				numrep = curstep->arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				/* make sure bucket id is valid */
+				bno = -1 - w[i];
+				if (bno < 0 || bno >= map->max_buckets) {
+					// w[i] is probably CRUSH_ITEM_NONE
+					dprintk("  bad w[i] %d\n", w[i]);
+					continue;
+				}
+				if (firstn) {
+					int recurse_tries;
+					if (choose_leaf_tries)
+						recurse_tries =
+							choose_leaf_tries;
+					else if (map->chooseleaf_descend_once)
+						recurse_tries = 1;
+					else
+						recurse_tries = choose_tries;
+					osize += crush_choose_firstn(
+						map,
+						cw,
+						map->buckets[bno],
+						weight, weight_max,
+						x, numrep,
+						curstep->arg2,
+						o+osize, j,
+						result_max-osize,
+						choose_tries,
+						recurse_tries,
+						choose_local_retries,
+						choose_local_fallback_retries,
+						recurse_to_leaf,
+						vary_r,
+						stable,
+						c+osize,
+						0,
+						choose_args);
+				} else {
+					out_size = ((numrep < (result_max-osize)) ?
+						    numrep : (result_max-osize));
+					crush_choose_indep(
+						map,
+						cw,
+						map->buckets[bno],
+						weight, weight_max,
+						x, out_size, numrep,
+						curstep->arg2,
+						o+osize, j,
+						choose_tries,
+						choose_leaf_tries ?
+						   choose_leaf_tries : 1,
+						recurse_to_leaf,
+						c+osize,
+						0,
+						choose_args);
+					osize += out_size;
+				}
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap o and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			dprintk(" unknown op %d at step %d\n",
+				curstep->op, step);
+			break;
+		}
+	}
+
+	return result_len;
+}
diff --git a/src/crush/mapper.h b/src/crush/mapper.h
new file mode 100644
index 000000000..2332d4b51
--- /dev/null
+++ b/src/crush/mapper.h
@@ -0,0 +1,93 @@
+#ifndef CEPH_CRUSH_MAPPER_H
+#define CEPH_CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for finding rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL-2.1 or LGPL-3.0
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
+/** @ingroup API
+ *
+ * Map __x__ to __result_max__ items and store them in the __result__
+ * array. The mapping is done by following each step of the rule
+ * __ruleno__. See crush_make_rule(), crush_rule_set_step() and
+ * crush_add_rule() for more information on how the rules are created,
+ * populated and added to the crush __map__.
+ *
+ * The return value is the number of items in the __result__
+ * array. If the caller asked for __result_max__ items and the return
+ * value is X where X < __result_max__, the content of __result[0,X[__
+ * is defined but the content of __result[X,result_max[__ is
+ * undefined. For example:
+ *
+ *     crush_do_rule(map, ruleno=1, x=1, result, result_max=3,...) == 1
+ *     result[0] is set
+ *     result[1] is undefined
+ *     result[2] is undefined
+ *
+ * An entry in the __result__ array is either an item in the crush
+ * __map__ or ::CRUSH_ITEM_NONE if no item was found. For example:
+ *
+ *     crush_do_rule(map, ruleno=1, x=1, result, result_max=4,...) == 2
+ *     result[0] is CRUSH_ITEM_NONE
+ *     result[1] is item number 5
+ *     result[2] is undefined
+ *     result[3] is undefined
+ *
+ * The __weight__ array contains the probabilities that a leaf is
+ * ignored even if it is selected. It is a 16.16 fixed point
+ * number in the range [0x00000,0x10000]. The lower the value, the
+ * more often the leaf is ignored. For instance:
+ *
+ * - weight[leaf] == 0x00000 == 0.0 always ignore
+ * - weight[leaf] == 0x10000 == 1.0 never ignore
+ * - weight[leaf] == 0x08000 == 0.5 ignore 50% of the time
+ * - weight[leaf] == 0x04000 == 0.25 ignore 75% of the time
+ * - etc.
+ *
+ * During mapping, each leaf is checked against the __weight__ array,
+ * using the leaf as an index. If there is no entry in __weight__ for
+ * the leaf, it is ignored. If there is an entry, the leaf will be
+ * ignored some of the time, depending on the probability.
+ *
+ * The __cwin__ argument must be set as follows:
+ *
+ *         char __cwin__[crush_work_size(__map__, __result_max__)];
+ *         crush_init_workspace(__map__, __cwin__);
+ *
+ * @param map the crush_map
+ * @param ruleno a positive integer < __CRUSH_MAX_RULES__
+ * @param x the value to map to __result_max__ items
+ * @param result an array of items of size __result_max__
+ * @param result_max the size of the __result__ array
+ * @param weights an array of weights of size __weight_max__
+ * @param weight_max the size of the __weights__ array
+ * @param cwin must be a char array initialized by crush_init_workspace
+ * @param choose_args weights and ids for each known bucket
+ *
+ * @return 0 on error or the size of __result__ on success
+ */
+extern int crush_do_rule(const struct crush_map *map,
+			 int ruleno,
+			 int x, int *result, int result_max,
+			 const __u32 *weights, int weight_max,
+			 void *cwin, const struct crush_choose_arg *choose_args);
+
+/* Returns the exact amount of workspace needed for a given combination
+   of crush_map and result_max: map->working_size plus room for
+   3 * result_max __u32 values. The caller allocates this much itself
+   (stack, per-thread long-lived buffer, ...) and passes it as cwin. */
+
+static inline size_t crush_work_size(const struct crush_map *map,
+				     int result_max) {
+	return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+extern void crush_init_workspace(const struct crush_map *m, void *v);
+
+#endif
diff --git a/src/crush/old_sample.txt b/src/crush/old_sample.txt
new file mode 100644
index 000000000..54cf06a7b
--- /dev/null
+++ b/src/crush/old_sample.txt
@@ -0,0 +1,82 @@
+
+# first define our types
+<types>
+	<type osd>
+	   type_id = 0
+	</type>
+	<type cab>
+	   type_id = 2
+	</type>
+	<type row>
+	   type_id = 3
+	</type>
+	<type pool>
+	   type_id = 10
+	</type>
+</types>
+
+# hierarchy
+<devices>
+	<osd osd001>
+		id 1
+		weight 500
+	</osd>
+	<osd osd002>
+		id 2
+		weight 500
+	</osd>
+	<osd osd003>
+		id 3
+		weight 500
+	</osd>
+	<osd osd004>
+		id 4
+		weight 500
+	</osd>
+	<osd osd005>
+		id 5
+		weight 500
+	</osd>
+</devices>
+
+<buckets>
+	<cab cab-d2>
+		alg straw
+		id   -12
+		<item osd001/>
+		<item osd002/>
+		<item osd003/>
+		<item osd004>
+		      weight 600
+		</item>
+	</cab>
+	
+#	<pool newlayout>
+#		<item satapool>
+#			weight 1.0
+#		</item>
+#		<item fcpool>
+#			weight 3.0
+#		</item>
+#	</pool>
+</buckets>
+
+<devices>
+	<osd osd006>
+		id 5
+		weight 500
+	</osd>
+</devices>
+
+# rules
+<rules>
+	<rule normal>
+		pool 0
+		type replicated
+		min_size 1
+		max_size 4
+		step take root
+		step choose_indep 0 osd
+		step emit
+	</rule>
+</rules>
diff --git a/src/crush/sample.txt b/src/crush/sample.txt
new file mode 100644
index 000000000..f7e0ac396
--- /dev/null
+++ b/src/crush/sample.txt
@@ -0,0 +1,47 @@
+
+# devices
+device 1 osd001
+device 2 osd002
+device 3 osd003 down   # same as offload 1.0
+device 4 osd004 offload 0       # 0.0 -> normal, 1.0 -> failed
+device 5 osd005 offload 0.1
+device 6 osd006 offload 0.1
+
+# hierarchy
+type 0 osd   # 'device' is actually the default for 0
+type 2 cab
+type 3 row
+type 10 pool
+
+cab root {
+       id -1         # optional
+       alg tree     # required
+       item osd001
+       item osd002 weight 600 pos 1
+       item osd003 weight 600 pos 0
+       item osd004 weight 600 pos 3
+       item osd005 weight 600 pos 4
+}
+
+# rules
+rule normal {
+     # these are required.
+     pool 0
+     type replicated 
+     min_size 1
+     max_size 4
+     # need 1 or more of these.
+     step take root
+     step choose firstn 0 type osd
+     step emit
+}
+
+rule {
+     pool 1
+     type erasure
+     min_size 3
+     max_size 6
+     step take root
+     step choose indep 0 type osd
+     step emit
+}
diff --git a/src/crush/types.h b/src/crush/types.h
new file mode 100644
index 000000000..919eed252
--- /dev/null
+++ b/src/crush/types.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_CRUSH_TYPES_H
+#define CEPH_CRUSH_TYPES_H
+
+#ifdef KERNEL
+# define free(x) kfree(x)
+#else
+# include <stdlib.h>
+#endif
+
+
+#include <linux/types.h>  /* just for int types */
+
+#ifndef BUG_ON
+# define BUG_ON(x) ceph_assert(!(x))
+#endif
+
+#endif