100 files changed, 32336 insertions, 0 deletions
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
new file mode 100644
index 00000000..39cdc6b2
--- /dev/null
+++ b/src/include/CMakeLists.txt
@@ -0,0 +1,35 @@
+install(FILES  # librados API headers plus the buffer/crc headers listed with them
+  rados/librados.h
+  rados/rados_types.h
+  rados/rados_types.hpp
+  rados/librados_fwd.hpp
+  rados/librados.hpp
+  buffer.h
+  buffer_fwd.h
+  inline_memory.h
+  page.h
+  crc32c.h
+  rados/objclass.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)  # every file above goes to this one directory
+if(WITH_LIBRADOSSTRIPER)  # striper headers are installed only when this option is on
+  install(FILES
+    radosstriper/libradosstriper.h
+    radosstriper/libradosstriper.hpp
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
+endif()
+
+if(WITH_RBD)  # librbd headers are installed only when this option is on
+  install(FILES
+    rbd/features.h
+    rbd/librbd.h
+    rbd/librbd.hpp
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
+endif()
+
+if(WITH_RADOSGW)  # rgw headers are installed only when this option is on
+  install(FILES
+    rados/librgw.h
+    rados/rgw_file.h
+    rgw/librgw_admin_user.h  # NOTE(review): also lands in .../rados, not .../rgw -- confirm intended
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+endif()
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
new file mode 100644
index 00000000..a9e15f76
--- /dev/null
+++ b/src/include/CompatSet.h
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_COMPATSET_H
+#define CEPH_COMPATSET_H
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+struct CompatSet {
+
+  /// A named feature; `id` is its bit position in a FeatureSet mask.
+  struct Feature {
+    uint64_t id;
+    std::string name;
+
+    Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {}
+  };
+
+  /**
+   * A set of features held both as a bitmask and as an id -> name map.  Bit 0 of
+   * `mask` is a marker: set in memory, cleared by encode() (see decode() for why).
+   */
+  class FeatureSet {
+    uint64_t mask;
+    std::map<uint64_t, std::string> names;
+
+  public:
+    friend struct CompatSet;
+    friend class CephCompatSet_AllSet_Test;
+    friend class CephCompatSet_other_Test;
+    friend class CephCompatSet_merge_Test;
+    friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs);
+    friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat);
+    FeatureSet() : mask(1), names() {}
+    /// Record feature `f`; asserts that its id is in 1..63.
+    void insert(const Feature& f) {
+      ceph_assert(f.id > 0);
+      ceph_assert(f.id < 64);
+      mask |= ((uint64_t)1 << f.id);
+      names[f.id] = f.name;
+    }
+
+    bool contains(const Feature& f) const { return contains(f.id); }
+    bool contains(uint64_t f) const { return names.count(f) != 0; }
+
+    /// Const-safe name lookup (names[] would insert); asserts `f` is present.
+    std::string get_name(uint64_t const f) const {
+      auto i = names.find(f);
+      ceph_assert(i != names.end());
+      return i->second;
+    }
+
+    /// Forget feature `f`; does nothing when it is not in the set.
+    void remove(uint64_t f) {
+      // ids >= 64 (decode() does not reject them) have no mask bit to clear
+      if (names.erase(f) && f < 64) {
+        mask &= ~((uint64_t)1 << f);
+      }
+    }
+    void remove(const Feature& f) { remove(f.id); }
+
+    void encode(bufferlist& bl) const {
+      using ceph::encode;
+      /* See below, mask always has the lowest bit set in memory, but
+       * unset in the encoding */
+      encode(mask & (~(uint64_t)1), bl);
+      encode(names, bl);
+    }
+
+    void decode(bufferlist::const_iterator& bl) {
+      using ceph::decode;
+      decode(mask, bl);
+      decode(names, bl);
+      /**
+       * Previously, there was a bug where insert did
+       * mask |= f.id rather than mask |= (1 << f.id).
+       * In FeatureSets from those versions, mask always
+       * has the lowest bit set.  Since then, masks always
+       * have the lowest bit unset.
+       *
+       * When we encounter such a FeatureSet, we have to
+       * reconstruct the mask from the names map.
+       */
+      if (mask & 1) {
+        mask = 1;
+        std::map<uint64_t, std::string> temp_names;
+        temp_names.swap(names);
+        for (const auto& p : temp_names) {
+          insert(Feature(p.first, p.second));
+        }
+      } else {
+        mask |= 1;
+      }
+    }
+
+    /// Emit one string entry per feature, keyed "feature_<id>".
+    void dump(Formatter *f) const {
+      for (const auto& p : names) {
+        // std::string key: the former char[18] buffer truncated ids over 9 digits
+        const std::string key = "feature_" + std::to_string(p.first);
+        f->dump_string(key.c_str(), p.second);
+      }
+    }
+  };
+
+  // These features have no impact on the read / write status
+  FeatureSet compat;
+  // If any of these features are missing, read is possible ( as long
+  // as no incompat feature is missing ) but it is not possible to write
+  FeatureSet ro_compat;
+  // If any of these features are missing, read or write is not possible
+  FeatureSet incompat;
+
+  CompatSet(const FeatureSet& _compat, const FeatureSet& _ro_compat,
+            const FeatureSet& _incompat) :
+    compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {}
+
+  CompatSet() : compat(), ro_compat(), incompat() { }
+
+  /// Mask bits set in `theirs` but not in `ours`, i.e. features we lack.
+  static uint64_t missing_bits(const FeatureSet& theirs, const FeatureSet& ours) {
+    return theirs.mask & ~ours.mask;
+  }
+  /// Name stored for `id` in `fs`, or "" if none; unlike names[id], never inserts.
+  static std::string name_or_empty(const FeatureSet& fs, uint64_t id) {
+    auto i = fs.names.find(id);
+    return i == fs.names.end() ? std::string() : i->second;
+  }
+
+  /* does this filesystem implementation have the
+     features required to read the other? */
+  bool readable(CompatSet const& other) const {
+    return missing_bits(other.incompat, incompat) == 0;
+  }
+
+  /* does this filesystem implementation have the
+     features required to write the other? */
+  bool writeable(CompatSet const& other) const {
+    return readable(other) && missing_bits(other.ro_compat, ro_compat) == 0;
+  }
+
+  /* Compare this CompatSet to another.
+   * CAREFULLY NOTE: This operation is NOT commutative.
+   * a > b DOES NOT imply that b < a.
+   * Returns:
+   * 0: The CompatSets have the same feature set.
+   * 1: This CompatSet's features are a strict superset of the other's.
+   * -1: This CompatSet is missing at least one feature
+   *     described in the other. It may still have more features, though.
+   */
+  int compare(const CompatSet& other) const {
+    if (other.compat.mask == compat.mask &&
+        other.ro_compat.mask == ro_compat.mask &&
+        other.incompat.mask == incompat.mask)
+      return 0;
+    // writeable() covers incompat and ro_compat; compat is checked the same way
+    if (writeable(other) && missing_bits(other.compat, compat) == 0)
+      return 1;
+    return -1;
+  }
+
+  /* Get the features supported by other CompatSet but not this one, as a
+   * CompatSet.  Neither *this nor other is modified. */
+  CompatSet unsupported(const CompatSet& other) const {
+    CompatSet diff;
+    const uint64_t other_compat = missing_bits(other.compat, compat);
+    const uint64_t other_ro_compat = missing_bits(other.ro_compat, ro_compat);
+    const uint64_t other_incompat = missing_bits(other.incompat, incompat);
+    for (int id = 1; id < 64; ++id) {
+      const uint64_t bit = (uint64_t)1 << id;
+      if (bit & other_compat) {
+        diff.compat.insert(Feature(id, name_or_empty(other.compat, id)));
+      }
+      if (bit & other_ro_compat) {
+        diff.ro_compat.insert(Feature(id, name_or_empty(other.ro_compat, id)));
+      }
+      if (bit & other_incompat) {
+        diff.incompat.insert(Feature(id, name_or_empty(other.incompat, id)));
+      }
+    }
+    return diff;
+  }
+
+  /* Merge features supported by other CompatSet into this one.
+   * Return: true if some features were merged */
+  bool merge(CompatSet const & other) {
+    const uint64_t other_compat = missing_bits(other.compat, compat);
+    const uint64_t other_ro_compat = missing_bits(other.ro_compat, ro_compat);
+    const uint64_t other_incompat = missing_bits(other.incompat, incompat);
+    if (!other_compat && !other_ro_compat && !other_incompat)
+      return false;
+    for (int id = 1; id < 64; ++id) {
+      const uint64_t bit = (uint64_t)1 << id;
+      if (bit & other_compat) {
+        compat.insert(Feature(id, other.compat.get_name(id)));
+      }
+      if (bit & other_ro_compat) {
+        ro_compat.insert(Feature(id, other.ro_compat.get_name(id)));
+      }
+      if (bit & other_incompat) {
+        incompat.insert(Feature(id, other.incompat.get_name(id)));
+      }
+    }
+    return true;
+  }
+
+  void encode(bufferlist& bl) const {
+    compat.encode(bl);
+    ro_compat.encode(bl);
+    incompat.encode(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    compat.decode(bl);
+    ro_compat.decode(bl);
+    incompat.decode(bl);
+  }
+
+  void dump(Formatter *f) const {
+    f->open_object_section("compat");
+    compat.dump(f);
+    f->close_section();
+    f->open_object_section("ro_compat");
+    ro_compat.dump(f);
+    f->close_section();
+    f->open_object_section("incompat");
+    incompat.dump(f);
+    f->close_section();
+  }
+
+  static void generate_test_instances(std::list<CompatSet*>& o) {
+    o.push_back(new CompatSet);
+    o.push_back(new CompatSet);
+    o.back()->compat.insert(Feature(1, "one"));
+    o.back()->compat.insert(Feature(2, "two"));
+    o.back()->ro_compat.insert(Feature(4, "four"));
+    o.back()->incompat.insert(Feature(3, "three"));
+  }
+};
+WRITE_CLASS_ENCODER(CompatSet)
+
+using ceph::operator <<;
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs)
+{
+  return out << fs.names;  // prints only the id -> name map; the mask is not shown
+}
+
+inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat)
+{
+  return out << "compat=" << compat.compat  // each member is streamed as a FeatureSet
+	     << ",rocompat=" << compat.ro_compat  // label is "rocompat", without the underscore
+	     << ",incompat=" << compat.incompat;
+}
+
+#endif
diff --git a/src/include/Context.h b/src/include/Context.h
new file mode 100644
index 00000000..b588b0f1
--- /dev/null
+++ b/src/include/Context.h
@@ -0,0 +1,502 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_CONTEXT_H
+#define CEPH_CONTEXT_H
+
+#include "common/dout.h"
+
+#include <boost/function.hpp>
+#include <list>
+#include <set>
+#include <memory>
+
+#include "include/ceph_assert.h"
+#include "common/Mutex.h"
+
+#define mydout(cct, v) lgeneric_subdout(cct, context, v)
+
+/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+ protected:
+  /// Callback body, supplied by the subclass; invoked by complete().
+  virtual void finish(T t) = 0;
+
+ public:
+  GenContext() = default;
+  // Non-copyable: `= delete` gives a compile-time error where the old
+  // declared-but-undefined private members could fail only at link time.
+  GenContext(const GenContext& other) = delete;
+  const GenContext& operator=(const GenContext& other) = delete;
+  virtual ~GenContext() = default;  // virtual: complete() deletes via the base
+
+  /// Run finish() with `t` forwarded to it, then destroy this object.
+  template <typename C>
+  void complete(C &&t) { finish(std::forward<C>(t)); delete this; }
+};
+
+template <typename T>
+using GenContextURef = std::unique_ptr<GenContext<T> >;
+
+/*
+ * Context - abstract callback class
+ */
+class Finisher;
+class Context {
+ protected:
+  /// Callback body, implemented by subclasses; `r` is the value given to complete().
+  virtual void finish(int r) = 0;
+
+  // variant of finish that is safe to call "synchronously."  override should
+  // return true.
+  virtual bool sync_finish(int r) { return false; }
+
+ public:
+  Context() = default;
+  // Non-copyable; `= delete` replaces the private, never-defined declarations.
+  Context(const Context& other) = delete;
+  const Context& operator=(const Context& other) = delete;
+  virtual ~Context() = default;
+  /// Run finish(r), then delete this context.
+  virtual void complete(int r) {
+    finish(r);
+    delete this;
+  }
+  /// Delete this context and return true only if sync_finish(r) returned true.
+  virtual bool sync_complete(int r) {
+    if (!sync_finish(r))
+      return false;
+    delete this;
+    return true;
+  }
+};
+
+/**
+ * Holds `obj` until the context is destroyed. The constructor forwards, so rvalues
+ * work too; an lvalue given to make_container_context() is held by reference. */
+template<class T>
+class ContainerContext : public Context {
+  T obj;
+public:
+  template <typename U> ContainerContext(U &&o) : obj(std::forward<U>(o)) {}
+  void finish(int r) override {}
+};
+template <typename T>
+ContainerContext<T> *make_container_context(T &&t) {
+  return new ContainerContext<T>(std::forward<T>(t));
+}
+
+template <class T>
+struct Wrapper : public Context {
+  Context *to_run;  // completed with our result; may be null
+  T val;            // carried along only; finish() never looks at it
+  Wrapper(Context *to_run, T val) : to_run(to_run), val(val) {}
+  void finish(int r) override {
+    if (!to_run) return;
+    to_run->complete(r);
+  }
+};
+struct RunOnDelete {
+  Context *to_run;  // completed with result 0 by the destructor; may be null
+  RunOnDelete(Context *to_run) : to_run(to_run) {}
+  ~RunOnDelete() {
+    if (!to_run) return;
+    to_run->complete(0);
+  }
+};
+typedef std::shared_ptr<RunOnDelete> RunOnDeleteRef;
+
+/// Context that calls a stored nullary callable on completion (result code ignored).
+template <typename T>
+struct LambdaContext : public Context {
+  T t;
+  LambdaContext(T &&t) : t(std::forward<T>(t)) {}
+  void finish(int) override { t(); }
+};
+/// Heap-allocates a LambdaContext, moving `t` into it.
+template <typename T>
+LambdaContext<T> *make_lambda_context(T &&t) {
+  return new LambdaContext<T>(std::move(t));
+}
+
+/// GenContext<T> that hands its argument to a stored callable `f`.
+template <typename F, typename T>
+struct LambdaGenContext : GenContext<T> {
+  F f;
+  LambdaGenContext(F &&f) : f(std::forward<F>(f)) {}
+  void finish(T t) override { f(std::forward<T>(t)); }
+};
+/// Wraps `f` (moved in) in a LambdaGenContext owned by the returned unique_ptr.
+template <typename T, typename F>
+GenContextURef<T> make_gen_lambda_context(F &&f) {
+  return GenContextURef<T>(new LambdaGenContext<F, T>(std::move(f)));
+}
+
+/*
+ * finish and destroy a list of Contexts: `finished` is left empty and each
+ * context that was in it is completed with `result` (logged when cct is set)
+ */
+template<class C>
+inline void finish_contexts(CephContext *cct, C& finished, int result = 0)
+{
+  if (finished.empty())
+    return;
+
+  C pending;
+  pending.swap(finished);  // swap out of place to avoid weird loops
+  if (cct)
+    mydout(cct,10) << pending.size() << " contexts to finish with " << result << dendl;
+  for (Context* ctx : pending) {
+    if (cct)
+      mydout(cct,10) << "---- " << ctx << dendl;
+    ctx->complete(result);
+  }
+}
+
+class C_NoopContext : public Context {
+public:
+  void finish(int r) override { }  // empty on purpose: completing it does no work
+};
+
+
+/// Completes `fin` while holding `*lock`; a `fin` that was never run is deleted with us.
+struct C_Lock : public Context {
+  Mutex *lock;
+  Context *fin;
+  C_Lock(Mutex *l, Context *c) : lock(l), fin(c) {}
+  ~C_Lock() override {
+    delete fin;  // null once finish() has run, so this is then a no-op
+  }
+  void finish(int r) override {
+    if (!fin)
+      return;
+    Mutex::Locker guard(*lock);  // held across the callback and the reset of fin
+    fin->complete(r);
+    fin = NULL;
+  }
+};
+
+/*
+ * C_Contexts - set of Contexts
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructable.
+ */
+template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>>
+class C_ContextsBase : public ContextInstanceType {
+public:
+  CephContext *cct;    // only handed to finish_contexts(); may be null
+  Container contexts;  // owned: completed by finish(), else deleted by the destructor
+
+  C_ContextsBase(CephContext *cct_) : cct(cct_) {}
+  ~C_ContextsBase() override {
+    for (auto c : contexts)  // whatever is still queued was never completed
+      delete c;
+  }
+  /// Queue `c`; this object now owns it.
+  void add(ContextType* c) { contexts.push_back(c); }
+  /// Move every context out of `ls` (leaving it empty) onto the end of our queue.
+  void take(Container& ls) {
+    Container c;
+    c.swap(ls);
+    if constexpr (std::is_same_v<Container, std::list<ContextType *>>) {
+      contexts.splice(contexts.end(), c);
+    } else {
+      contexts.insert(contexts.end(), c.begin(), c.end());
+    }
+  }
+  void complete(int r) override {
+    // Neuter any ContextInstanceType custom complete(), because although
+    // I want to look like it, I don't actually want to run its code.
+    Context::complete(r);
+  }
+  void finish(int r) override {
+    finish_contexts(cct, contexts, r);
+  }
+  bool empty() const { return contexts.empty(); }
+
+  /**
+   * Reduce `cs` to a single context: null if it is empty, its only element if
+   * it has one, otherwise a new C_ContextsBase (null cct) that takes them all.
+   * `cs` is left empty in every case.
+   */
+  template<class C>
+  static ContextType *list_to_context(C& cs) {
+    if (cs.empty())
+      return nullptr;
+    if (cs.size() == 1) {
+      ContextType *only = cs.front();
+      cs.clear();
+      return only;
+    }
+    auto *all = new C_ContextsBase<ContextType, ContextInstanceType>(nullptr);
+    all->take(cs);
+    return all;
+  }
+};
+
+typedef C_ContextsBase<Context, Context> C_Contexts;
+
+/*
+ * C_Gather
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructable.
+ *
+ * NOTE: only the first negative result returned by a sub is reported; later errors are dropped
+ */
+template <class ContextType, class ContextInstanceType>
+class C_GatherBase {
+private:
+  CephContext *cct;
+  int result;  // first negative result seen from a sub, otherwise 0
+  ContextType *onfinish;  // completed with `result` by delete_me()
+#ifdef DEBUG_GATHER
+  std::set<ContextType*> waitfor;
+#endif
+  int sub_created_count;  // subs ever handed out by new_sub()
+  int sub_existing_count;  // subs handed out and not yet finished
+  mutable Mutex lock;  // mutable so the const getters below can take it
+  bool activated;  // set by activate(); nothing is completed before that
+
+  void sub_finish(ContextType* sub, int r) {  // called by each sub as it finishes or is destroyed
+    lock.Lock();
+#ifdef DEBUG_GATHER
+    ceph_assert(waitfor.count(sub));
+    waitfor.erase(sub);
+#endif
+    --sub_existing_count;
+    mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub
+#ifdef DEBUG_GATHER
+		    << " (remaining " << waitfor << ")"
+#endif
+		    << dendl;
+    if (r < 0 && result == 0)  // keep only the first error
+      result = r;
+    if ((activated == false) || (sub_existing_count != 0)) {  // not activated yet, or subs still outstanding
+      lock.Unlock();
+      return;
+    }
+    lock.Unlock();  // released first: delete_me() destroys this object, lock included
+    delete_me();
+  }
+
+  void delete_me() {  // complete the finisher (if any), then destroy this gather
+    if (onfinish) {
+      onfinish->complete(result);
+      onfinish = 0;
+    }
+    delete this;
+  }
+
+  class C_GatherSub : public ContextInstanceType {
+    C_GatherBase *gather;  // zeroed once finish() has reported to the gather
+  public:
+    C_GatherSub(C_GatherBase *g) : gather(g) {}
+    void complete(int r) override {
+      // Cancel any customized complete() functionality
+      // from the Context subclass we're templated for,
+      // we only want to hit that in onfinish, not at each
+      // sub finish.  e.g. MDSInternalContext.
+      Context::complete(r);
+    }
+    void finish(int r) override {
+      gather->sub_finish(this, r);
+      gather = 0;  // so the destructor does not report a second time
+    }
+    ~C_GatherSub() override {
+      if (gather)  // destroyed without finish(): report it with result 0
+	gather->sub_finish(this, 0);
+    }
+  };
+
+public:
+  C_GatherBase(CephContext *cct_, ContextType *onfinish_)
+    : cct(cct_), result(0), onfinish(onfinish_),
+      sub_created_count(0), sub_existing_count(0),
+      lock("C_GatherBase::lock", true, false), //disable lockdep
+      activated(false)
+  {
+    mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl;
+  }
+  ~C_GatherBase() {
+    mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl;
+  }
+  void set_finisher(ContextType *onfinish_) {
+    Mutex::Locker l(lock);
+    ceph_assert(!onfinish);  // a finisher may be set only if none was given yet
+    onfinish = onfinish_;
+  }
+  void activate() {
+    lock.Lock();
+    ceph_assert(activated == false);
+    activated = true;
+    if (sub_existing_count != 0) {  // subs still running: the last sub_finish() completes us
+      lock.Unlock();
+      return;
+    }
+    lock.Unlock();
+    delete_me();  // no subs outstanding: finish right away
+  }
+  ContextType *new_sub() {
+    Mutex::Locker l(lock);
+    ceph_assert(activated == false);  // subs may only be created before activate()
+    sub_created_count++;
+    sub_existing_count++;
+    ContextType *s = new C_GatherSub(this);
+#ifdef DEBUG_GATHER
+    waitfor.insert(s);
+#endif
+    mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl;
+    return s;
+  }
+
+  inline int get_sub_existing_count() const {
+    Mutex::Locker l(lock);
+    return sub_existing_count;
+  }
+
+  inline int get_sub_created_count() const {
+    Mutex::Locker l(lock);
+    return sub_created_count;
+  }
+};
+
+/*
+ * The C_GatherBuilder remembers each C_Context created by
+ * C_GatherBuilder.new_sub() in a C_Gather.  When a C_Context created
+ * by new_sub() is complete(), C_Gather forgets about it.  When
+ * C_GatherBuilder notices that there are no C_Context left in
+ * C_Gather, it calls complete() on the C_Context provided as the
+ * second argument of the constructor (finisher).
+ *
+ * How to use C_GatherBuilder:
+ *
+ * 1. Create a C_GatherBuilder on the stack
+ * 2. Call gather_bld.new_sub() as many times as you want to create new subs
+ *    It is safe to call this 0 times, or 100, or anything in between.
+ * 3. If you didn't supply a finisher in the C_GatherBuilder constructor,
+ *    set one with gather_bld.set_finisher(my_finisher)
+ * 4. Call gather_bld.activate()
+ *
+ * Example:
+ *
+ * C_SaferCond all_done;
+ * C_GatherBuilder gb(g_ceph_context, &all_done);
+ * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * gb.activate(); // consume C_Context as soon as they complete()
+ * all_done.wait(); // all_done is complete() after all new_sub() are complete()
+ *
+ * The finisher may be called at any point after step 4, including immediately
+ * from the activate() function.
+ * The finisher will never be called before activate().
+ *
+ * Note: Currently, subs must be manually freed by the caller (for some reason.)
+ */
+template <class ContextType, class GatherType>
+class C_GatherBuilderBase
+{
+public:
+  /// Builder with no finisher yet; activate() asserts one was set if subs exist.
+  C_GatherBuilderBase(CephContext *cct_)
+    : cct(cct_), c_gather(nullptr), finisher(nullptr), activated(false)
+  {
+  }
+
+  C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_)
+    : cct(cct_), c_gather(nullptr), finisher(finisher_), activated(false)
+  {
+  }
+  /// Asserts that a gather with subs was activated; deletes a finisher no gather took.
+  ~C_GatherBuilderBase() {
+    if (!c_gather) {
+      delete finisher;
+      return;
+    }
+    ceph_assert(activated); // Don't forget to activate your C_Gather!
+  }
+
+  /// Create the gather on first use, then hand out a new sub context from it.
+  ContextType *new_sub() {
+    if (c_gather == nullptr)
+      c_gather = new GatherType(cct, finisher);
+    return c_gather->new_sub();
+  }
+  /// Activate the gather; does nothing if new_sub() was never called.
+  void activate() {
+    if (c_gather == nullptr)
+      return;
+    ceph_assert(finisher != nullptr);
+    activated = true;
+    c_gather->activate();
+  }
+  /// Set the finisher, passing it on to the gather when one already exists.
+  void set_finisher(ContextType *finisher_) {
+    finisher = finisher_;
+    if (c_gather != nullptr)
+      c_gather->set_finisher(finisher);
+  }
+  /// The underlying gather, or null while no sub has been created.
+  GatherType *get() const { return c_gather; }
+  bool has_subs() const { return c_gather != nullptr; }
+
+  // The two counters below assert that activate() has not run yet.
+  int num_subs_created() {
+    ceph_assert(!activated);
+    return c_gather ? c_gather->get_sub_created_count() : 0;
+  }
+  int num_subs_remaining() {
+    ceph_assert(!activated);
+    return c_gather ? c_gather->get_sub_existing_count() : 0;
+  }
+
+private:
+  CephContext *cct;
+  GatherType *c_gather;   // created lazily by new_sub()
+  ContextType *finisher;
+  bool activated;         // set by activate() once a gather exists
+};
+
+typedef C_GatherBase<Context, Context> C_Gather;
+typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder;
+
+/// Context that runs a stored boost::function with the completion result.
+class FunctionContext : public Context {
+public:
+  FunctionContext(boost::function<void(int)> &&callback)
+    : m_callback(std::move(callback))
+  {
+  }
+
+  void finish(int r) override { m_callback(r); }
+
+private:
+  boost::function<void(int)> m_callback;
+};
+
+template <class ContextType>
+class ContextFactory {
+public:
+  virtual ~ContextFactory() {}
+  virtual ContextType *build() = 0;  // implemented by subclasses; who owns the result is not stated here
+};
+
+#undef mydout
+
+#endif
diff --git a/src/include/Distribution.h b/src/include/Distribution.h
new file mode 100644
index 00000000..e4f0b30b
--- /dev/null
+++ b/src/include/Distribution.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_DISTRIBUTION_H
+#define CEPH_DISTRIBUTION_H
+
+#include <vector>
+
+class Distribution {
+  std::vector<float> p;  // weight/probability of each entry
+  std::vector<int> v;    // value of each entry, parallel to p
+
+ public:
+  /// Number of (value, probability) entries.
+  unsigned get_width() const {
+    return p.size();
+  }
+
+  void clear() { p.clear(); v.clear(); }
+
+  /// Append value `val` with probability `pr`; nothing is normalized here.
+  void add(int val, float pr) {
+    p.push_back(pr);
+    v.push_back(val);
+  }
+
+  /// Give every entry a rand()-derived weight, then normalize().
+  void random() {
+    for (float& pr : p)
+      pr = (float)(rand() % 10000);
+    normalize();
+  }
+
+  /// Draw a value: walk the entries, returning v[i] once the remaining draw is
+  /// below p[i].  Aborts if no entry is selected (also when there are none).
+  int sample() {
+    float s = (float)(rand() % 10000) / 10000.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      if (s < p[i]) return v[i];
+      s -= p[i];
+    }
+    ceph_abort();
+    return v[p.size() - 1];  // hmm.  :/
+  }
+
+  /// Scale the probabilities to sum to 1 and return the sum they had before.
+  /// A zero sum is returned as is, with the entries left untouched.
+  float normalize() {
+    float s = 0.0;
+    for (float pr : p)
+      s += pr;
+    if (s != 0.0f) {  // dividing by a zero sum would turn the entries into NaN
+      for (float& pr : p)
+        pr /= s;
+    }
+    return s;
+  }
+
+};
+
+#endif
diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h
new file mode 100644
index 00000000..c205ac75
--- /dev/null
+++ b/src/include/addr_parsing.h
@@ -0,0 +1,28 @@
+/*
+ * addr_parsing.h
+ *
+ *  Created on: Sep 14, 2010
+ *      Author: gregf
+ *      contains functions used by Ceph to convert named addresses
+ *      (e.g. ceph.com) into IP addresses (e.g. 127.0.0.1).
+ */
+
+#ifndef ADDR_PARSING_H_
+#define ADDR_PARSING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int safe_cat(char **pstr, int *plen, int pos, const char *str2);  /* NOTE(review): contract is not documented here; see the definition */
+
+/*
+ * Returns a string allocated with malloc(); the caller owns it and must free() it.
+ */
+char *resolve_addrs(const char *orig_str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ADDR_PARSING_H_ */
diff --git a/src/include/alloc_ptr.h b/src/include/alloc_ptr.h
new file mode 100644
index 00000000..258c5833
--- /dev/null
+++ b/src/include/alloc_ptr.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <memory>
+
+// Owning pointer whose accessors (get, operator*, operator->) allocate a
+// default-constructed pointee on first use instead of yielding null.  Move-only.
+template <class T>
+class alloc_ptr
+{
+public:
+    typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
+    typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;
+
+    alloc_ptr() : ptr() {}
+
+    // Builds the underlying unique_ptr from whatever it accepts, e.g. a raw T*.
+    template<class U>
+      alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}
+
+    // These used to name alloc_ptr<pointer>, i.e. alloc_ptr<T*> -- a different
+    // class -- so they were never this class's own move/copy operations.
+    alloc_ptr(alloc_ptr&& rhs) noexcept : ptr(std::move(rhs.ptr)) {}
+    alloc_ptr(const alloc_ptr& rhs) = delete;
+    alloc_ptr& operator=(alloc_ptr&& rhs) noexcept {
+        ptr = std::move(rhs.ptr);
+        return *this;  // the old operators had no return statement at all
+    }
+    alloc_ptr& operator=(const alloc_ptr& rhs) = delete;
+
+    void swap (alloc_ptr& rhs) noexcept { ptr.swap(rhs.ptr); }
+    element_type* release() {
+        return ptr.release();
+    }
+    void reset(element_type *p = nullptr) {
+        ptr.reset(p);
+    }
+    element_type* get() const { return ensure(); }
+    element_type& operator*() const { return *ensure(); }
+    element_type* operator->() const { return ensure(); }
+    // True only if a pointee currently exists; unlike get(), never allocates.
+    operator bool() const {
+        return !!ptr;
+    }
+
+    // All comparisons are on pointee values and allocate a missing pointee.
+    // The ordering ones used to read std::less<element_type>(*lhs, *rhs), which
+    // constructs a functor from two arguments and cannot compile when used.
+    friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs < *rhs;
+    }
+    friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs <= *rhs;
+    }
+    friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs > *rhs;
+    }
+    friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs >= *rhs;
+    }
+    friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs == *rhs;
+    }
+    friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
+        return *lhs != *rhs;
+    }
+private:
+    // Creates the pointee on demand; const because ptr is mutable.
+    element_type* ensure() const {
+        if (!ptr)
+          ptr.reset(new element_type);
+        return ptr.get();
+    }
+    mutable std::unique_ptr<element_type> ptr;
+};
+
+#endif
diff --git a/src/include/any.h b/src/include/any.h
new file mode 100644
index 00000000..da59c88f
--- /dev/null
+++ b/src/include/any.h
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef INCLUDE_STATIC_ANY
+#define INCLUDE_STATIC_ANY
+
+#include <any>
+#include <cstddef>
+#include <initializer_list>
+#include <memory>
+#include <typeinfo>
+#include <type_traits>
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include <boost/smart_ptr/make_shared.hpp>
+
+namespace ceph {
+
+namespace _any {
+
+// Shared Functionality
+// --------------------
+//
+// Common implementation details. Most functionality is here. We
+// assume that destructors do not throw. Some of them might and
+// they'll invoke terminate and that's fine.
+//
+// We are using the Curiously Recurring Template Pattern! We require
+// that all classes inheriting from us provide:
+//
+//   - `static constexpr size_t capacity`: Maximum capacity. No object
+//                                         larger than this may be
+//                                         stored. `dynamic` for dynamic.
+//   - `void* ptr() const noexcept`: returns a pointer to storage.
+//                                   (`alloc_storage` must have been called.
+//                                   `free_storage` must not have been called
+//                                   since.)
+//   - `void* alloc_storage(const std::size_t)`: allocate storage
+//   - `void free_storage() noexcept`: free storage. Must be idempotent.
+//
+// We provide most of the public interface, as well as the operator function,
+// cast_helper, and the type() call.
+
+// Set `capacity` to this value to indicate that there is no fixed
+// capacity.
+//
+inline constexpr std::size_t dynamic = ~std::size_t{0};
+
+// Driver Function
+// ---------------
+//
+// The usual type-erasure control function trick. This one is simpler
+// than usual since we punt on moving and copying. We could dispense
+// with this and just store a deleter and a pointer to a typeinfo, but
+// that would be twice the space.
+//
+// Moved out here so the type of `func_t` isn't dependent on the
+// enclosing class.
+//
+// Requests understood by a driver function.
+enum class op { type, destroy };
+
+// Driver for objects of type T.
+//
+//   op::type    -- `p` points at a `const std::type_info*`; it is set to
+//                  the address of typeid(T).
+//   op::destroy -- `p` points at a live T; its destructor is run. The
+//                  storage itself is not released here.
+template<typename T>
+inline void op_func(const op o, void* p) noexcept {
+  if (o == op::type) {
+    static const std::type_info& type = typeid(T);
+    auto out = reinterpret_cast<const std::type_info**>(p);
+    *out = &type;
+  } else if (o == op::destroy) {
+    auto victim = reinterpret_cast<T*>(p);
+    victim->~T();
+  }
+}
+
+// Type-erased handle to any op_func<T> instantiation.
+using func_t = void (*)(const op, void* p) noexcept;
+
+// The base class 
+// --------------
+//
+// The `storage_t` parameter gives the type of the value that manages
+// storage and allocation. We use it to create a protected data member
+// (named `storage`). This allows us to sidestep an
+// initialization-order problem, where exposed constructors were
+// trying to allocate or free storage *before* the data members of the
+// derived class were initialized.
+//
+// Making storage_t a member type of the derived class won't work, due
+// to C++'s rules for nested types being *horrible*. Just downright
+// *horrible*.
+//
+template<typename D, typename storage_t>
+class base {
+  // Make definitions from our derived class visible
+  // -----------------------------------------------
+  //
+  // And check that they fit the requirements. At least those that are
+  // statically checkable.
+  //
+  static constexpr std::size_t capacity = D::capacity;
+
+  void* ptr() const noexcept {
+    static_assert(
+      noexcept(static_cast<const D*>(this)->ptr()) &&
+      std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>,
+      "‘void* ptr() const noexcept’ missing from superclass");
+    return static_cast<const D*>(this)->ptr();
+  }
+
+  void* alloc_storage(const std::size_t z) {
+    static_assert(
+      std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>,
+      "‘void* alloc_storage(const size_t)’ missing from superclass.");
+    return static_cast<D*>(this)->alloc_storage(z);
+  }
+
+  void free_storage() noexcept {
+    static_assert(
+      noexcept(static_cast<D*>(this)->free_storage()) &&
+      std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>,
+      "‘void free_storage() noexcept’ missing from superclass.");
+    static_cast<D*>(this)->free_storage();
+  }
+
+
+  // Pile O' Templates
+  // -----------------
+  //
+  // These are just verbose and better typed once than twice. They're
+  // used for SFINAE and declaring noexcept.
+  //
+  template<class T>
+  struct is_in_place_type_helper : std::false_type {};
+  template<class T>
+  struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {};
+
+  template<class T>
+  static constexpr bool is_in_place_type_v =
+    is_in_place_type_helper<std::decay_t<T>>::value;
+
+  // SFINAE condition for value initialized
+  // constructors/assigners. This is analogous to the standard's
+  // requirement that this overload only participate in overload
+  // resolution if std::decay_t<T> is not the same type as the
+  // any-type, nor a specialization of std::in_place_type_t
+  //
+  template<typename T>
+  using value_condition_t = std::enable_if_t<
+    !std::is_same_v<std::decay_t<T>, D> &&
+    !is_in_place_type_v<std::decay_t<T>>>;
+
+  // This `noexcept` condition for value construction lets
+  // `immobile_any`'s value constructor/assigner be noexcept, so long
+  // as the type's copy or move constructor cooperates.
+  //
+  template<typename T>
+  static constexpr bool value_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic;
+
+  // SFINAE condition for in-place constructors/assigners
+  //
+  template<typename T, typename... Args>
+  using in_place_condition_t = std::enable_if_t<std::is_constructible_v<
+                                                  std::decay_t<T>, Args...>>;
+
+  // Analogous to the above. Give noexcept to immobile_any::emplace
+  // when possible.
+  //
+  template<typename T, typename... Args>
+  static constexpr bool in_place_noexcept_v =
+    std::is_nothrow_constructible_v<std::decay_t<T>, Args...> &&
+    capacity != dynamic;
+
+private:
+
+  // Functionality!
+  // --------------
+
+  // The driver function for the currently stored object. Whether this
+  // is null is the canonical way to know whether an instance has a
+  // value.
+  //
+  func_t func = nullptr;
+
+  // Construct an object within ourselves. Must only be called while
+  // we are empty. We give the weak exception safety guarantee: if
+  // allocation or the constructor throws, we are left empty with our
+  // storage released.
+  //
+  // The driver function is published only once the object actually
+  // exists, so a failed construction can never lead to a destructor
+  // being run on raw (or missing) storage.
+  //
+  template<typename T, typename ...Args>
+  std::decay_t<T>& construct(Args&& ...args) {
+    using Td = std::decay_t<T>;
+    static_assert(capacity == dynamic || sizeof(Td) <= capacity,
+                  "Supplied type is too large for this specialization.");
+    try {
+      Td* const obj = new (alloc_storage(sizeof(Td)))
+        Td(std::forward<Args>(args)...);
+      func = &op_func<Td>;
+      return *obj;
+    } catch (...) {
+      func = nullptr;
+      free_storage();
+      throw;
+    }
+  }
+
+protected:
+
+  // We hold the storage, even if the derived class manipulates it,
+  // so that its default initialization comes soon enough for us to
+  // use it in our constructors.
+  //
+  storage_t storage;
+
+public:
+
+  base() noexcept = default;
+  ~base() noexcept {
+    reset();
+  }
+
+protected:
+  // Since some of our derived classes /can/ be copied or moved.
+  //
+  // NOTE(review): copying duplicates `func` and copies `storage`, and
+  // every instance runs the stored object's destructor in reset(). If
+  // `storage_t` copies share one buffer, the object would be destroyed
+  // once per copy -- confirm this is acceptable for the types stored.
+  //
+  base(const base& rhs) noexcept : func(rhs.func) {
+    if constexpr (std::is_copy_assignable_v<storage_t>) {
+      storage = rhs.storage;
+    }
+  }
+  base& operator =(const base& rhs) noexcept {
+    // Self-assignment must not empty us.
+    if (this != &rhs) {
+      reset();
+      func = rhs.func;
+      if constexpr (std::is_copy_assignable_v<storage_t>) {
+        storage = rhs.storage;
+      }
+    }
+    return *this;
+  }
+
+  base(base&& rhs) noexcept : func(std::move(rhs.func)) {
+    if constexpr (std::is_move_assignable_v<storage_t>) {
+      storage = std::move(rhs.storage);
+    }
+    rhs.func = nullptr;
+  }
+  base& operator =(base&& rhs) noexcept {
+    // Self-assignment must not empty us.
+    if (this != &rhs) {
+      reset();
+      func = rhs.func;
+      if constexpr (std::is_move_assignable_v<storage_t>) {
+        storage = std::move(rhs.storage);
+      }
+      rhs.func = nullptr;
+    }
+    return *this;
+  }
+
+public:
+
+  // Value construct/assign
+  // ----------------------
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base(T&& t) noexcept(value_noexcept_v<T>) {
+    construct<T>(std::forward<T>(t));
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename = value_condition_t<T>>
+  base& operator =(T&& t) noexcept(value_noexcept_v<T>) {
+    reset();
+    construct<T>(std::forward<T>(t));
+    return *this;
+  }
+
+  // In-place construct/assign
+  // -------------------------
+  //
+  // I really hate the way the C++ standard library treats references
+  // as if they were stepchildren in a Charles Dickens novel. I am
+  // quite upset that std::optional lacks a specialization for
+  // references. There's no legitimate reason for it. The whole
+  // 're-seat or refuse' debate is simply a canard. The optional is
+  // effectively a container, so of course it can be emptied or
+  // reassigned. No, pointers are not an acceptable substitute. A
+  // pointer gives an address in memory which may be null and which
+  // may represent an object or may be a location in which an object
+  // is to be created. An optional reference, on the other hand, is a
+  // reference to an initialized, live object or /empty/. This is an
+  // obvious difference that should be communicable to any programmer
+  // reading the code through the type system.
+  //
+  // `std::any`, even in the case of in-place construction,
+  // only stores the decayed type. I suspect this was to get around
+  // the question of whether, for a std::any holding a T&,
+  // std::any_cast<T> should return a copy or throw
+  // std::bad_any_cast.
+  //
+  // I think the appropriate response in that case would be to make a
+  // copy if the type supports it and fail otherwise. Once a concrete
+  // type is known the problem solves itself.
+  //
+  // If one were inclined, one could easily load the driver function
+  // with a heavy subset of the type traits (those that depend only on
+  // the type in question) and simply /ask/ whether it's a reference.
+  //
+  // At the moment, I'm maintaining compatibility with the standard
+  // library except for copy/move semantics.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  base(std::in_place_type_t<T>,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) {
+    construct<T>(std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  // Participates in overload resolution only when std::decay_t<T> can
+  // be constructed from the supplied arguments.
+  //
+  template<typename T,
+           typename... Args,
+           typename = in_place_condition_t<T, Args...>>
+  std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v<
+                                                    T, Args...>) {
+    reset();
+    return construct<T>(std::forward<Args>(args)...);
+  }
+
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  base(std::in_place_type_t<T>,
+       std::initializer_list<U> i,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>,
+                                Args...>) {
+    construct<T>(i, std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+           typename U,
+           typename... Args,
+           typename = in_place_condition_t<T, std::initializer_list<U>,
+                                           Args...>>
+  std::decay_t<T>& emplace(std::initializer_list<U> i,
+                           Args&& ...args) noexcept(in_place_noexcept_v<T,
+                                                    std::initializer_list<U>,
+                                                    Args...>) {
+    reset();
+    return construct<T>(i,std::forward<Args>(args)...);
+  }
+
+  // Empty ourselves, using the derived class to free any storage.
+  //
+  void reset() noexcept {
+    if (has_value()) {
+      func(op::destroy, ptr());
+      func = nullptr;
+    }
+    free_storage();
+  }
+
+  // Exchange contents with `rhs`. Only available when the storage
+  // type itself is swappable; the check is made on the dependent
+  // parameter `U` so that it removes the overload rather than
+  // producing a hard error.
+  //
+  template<typename U = storage_t,
+           typename = std::enable_if_t<std::is_swappable_v<U>>>
+  void swap(base& rhs) {
+    using std::swap;
+    swap(func, rhs.func);
+    swap(storage, rhs.storage);
+  }
+
+  // All other functions should use this function to test emptiness
+  // rather than examining `func` directly.
+  //
+  bool has_value() const noexcept {
+    return !!func;
+  }
+
+  // Returns the type of the value stored, if any; typeid(void) when
+  // empty.
+  //
+  const std::type_info& type() const noexcept {
+    if (has_value()) {
+      const std::type_info* t;
+      func(op::type, reinterpret_cast<void*>(&t));
+      return *t;
+    } else {
+      return typeid(void);
+    }
+  }
+
+  template<typename T, typename U, typename V>
+  friend inline void* cast_helper(const base<U, V>& b) noexcept;
+};
+
+// Function used by all `any_cast` functions
+//
+// Returns a void* to the contents if they exist and match the
+// requested type, otherwise `nullptr`.
+//
+template<typename T, typename U, typename V>
+inline void* cast_helper(const base<U, V>& b) noexcept {
+  // Nothing stored: nothing to hand out.
+  if (!b.func) {
+    return nullptr;
+  }
+  // Cheap check first (same driver instantiation), then fall back to
+  // comparing type_info objects.
+  const bool matches = (b.func == &op_func<T>) || (b.type() == typeid(T));
+  return matches ? b.ptr() : nullptr;
+}
+}
+
+// `any_cast`
+// ==========
+//
+// Just the usual gamut of `any_cast` overloads. These get a bit
+// repetitive and it would be nice to think of a way to collapse them
+// down a bit.
+//
+
+// The pointer pair!
+//
+// Pointer form (mutable): yields a pointer to the stored object when
+// `a` is non-null and holds a std::decay_t<T>, otherwise nullptr.
+template<typename T, typename U, typename V>
+inline T* any_cast(_any::base<U, V>* a) noexcept {
+  if (a == nullptr) {
+    return nullptr;
+  }
+  void* const raw = _any::cast_helper<std::decay_t<T>>(*a);
+  return static_cast<T*>(raw);
+}
+
+// Pointer form (const): yields a pointer to the stored object when
+// `a` is non-null and holds a std::decay_t<T>, otherwise nullptr.
+template<typename T, typename U, typename V>
+inline const T* any_cast(const _any::base<U, V>* a) noexcept {
+  if (a == nullptr) {
+    return nullptr;
+  }
+  void* const raw = _any::cast_helper<std::decay_t<T>>(*a);
+  return static_cast<T*>(raw);
+}
+
+// While we disallow copying the immobile any itself, we can allow
+// anything with an extracted value that the type supports.
+//
+// Reference form (mutable lvalue): returns the stored object converted
+// to T, or throws std::bad_any_cast when the stored type does not
+// match.
+template<typename T, typename U, typename V>
+inline T any_cast(_any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// Reference form (const lvalue): returns the stored object converted
+// to T, or throws std::bad_any_cast when the stored type does not
+// match.
+template<typename T, typename U, typename V>
+inline T any_cast(const _any::base<U, V>& a) {
+  static_assert(std::is_reference_v<T> ||
+                std::is_copy_constructible_v<T>,
+                "The supplied type must be either a reference or "
+                "copy constructible.");
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// Rvalue form for T that is not an rvalue reference: the result is
+// initialized from the stored object cast to an rvalue. Throws
+// std::bad_any_cast when the stored type does not match.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<(std::is_move_constructible_v<T> ||
+                         std::is_copy_constructible_v<T>) &&
+                        !std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return std::move((*held));
+}
+
+// Rvalue form for T that is itself an rvalue reference: returns the
+// stored object cast to T. Throws std::bad_any_cast when the stored
+// type does not match.
+template<typename T, typename U, typename V>
+inline std::enable_if_t<std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+  auto held = any_cast<std::decay_t<T>>(&a);
+  if (!held) {
+    throw std::bad_any_cast();
+  }
+  return static_cast<T>(*held);
+}
+
+// `immobile_any`
+// ==============
+//
+// Sometimes, uncopyable objects exist and I want to do things with
+// them. The C++ standard library is really quite keen on insisting
+// things be copyable before it deigns to work. I find this annoying.
+//
+// Also, the allocator, while useful, is really not considerate of
+// other people's time. Every time we go to visit it, it takes us
+// quite an awfully long time to get away again. As such, I've been
+// trying to avoid its company whenever it is convenient and seemly.
+//
+// We accept any type that will fit in the declared capacity. You may
+// store types with throwing destructors, but terminate will be
+// invoked when they throw.
+//
+template<std::size_t S>
+class immobile_any : public _any::base<immobile_any<S>,
+                                       std::aligned_storage_t<S>> {
+  using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>;
+  friend base;
+
+  using _any::base<immobile_any<S>, std::aligned_storage_t<S>>::storage;
+
+  // Hooks required by the base class
+  // --------------------------------
+  //
+  // The storage is an inline buffer of S bytes that lives inside the
+  // object, so "allocating" just hands back its address and "freeing"
+  // is a no-op.
+  //
+  static constexpr std::size_t capacity = S;
+
+  static_assert(capacity != _any::dynamic,
+                "That is not a valid size for an immobile_any.");
+
+  void* ptr() const noexcept {
+    const void* const buffer = &storage;
+    return const_cast<void*>(buffer);
+  }
+  void* alloc_storage(std::size_t) noexcept {
+    return ptr();
+  }
+  void free_storage() noexcept {}
+
+public:
+
+  immobile_any() noexcept = default;
+
+  // Neither copyable nor movable: the value stays where it was built.
+  immobile_any(const immobile_any&) = delete;
+  immobile_any& operator =(const immobile_any&) = delete;
+  immobile_any(immobile_any&&) = delete;
+  immobile_any& operator =(immobile_any&&) = delete;
+
+  // Value / in-place constructors and assignment come from the base.
+  using base::base;
+  using base::operator =;
+
+  // Swapping two immobile_any objects is disabled as well.
+  void swap(immobile_any&) = delete;
+};
+
+// Build an immobile_any<S> holding a T constructed in place from
+// `args`.
+template<typename T, std::size_t S, typename... Args>
+inline immobile_any<S> make_immobile_any(Args&& ...args) {
+  using result_t = immobile_any<S>;
+  return result_t(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// Build an immobile_any<S> holding a T constructed in place from an
+// initializer list followed by `args`.
+template<typename T, std::size_t S, typename U, typename... Args>
+inline immobile_any<S> make_immobile_any(std::initializer_list<U> i, Args&& ...args) {
+  using result_t = immobile_any<S>;
+  return result_t(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `unique_any`
+// ============
+//
+// Oh dear. Now we're getting back into allocation. You don't think
+// the allocator noticed all those mean things we said about it, do
+// you?
+//
+// Well. Okay, allocator. Sometimes when it's the middle of the night
+// and you're writing template code you say things you don't exactly
+// mean. If it weren't for you, we wouldn't have any memory to run all
+// our programs in at all. Really, I'm just being considerate of
+// *your* needs, trying to avoid having to run to you every time we
+// instantiate a type, making a few that can be self-sufficient…uh…
+//
+// **Anyway**, this is movable but not copyable, as you should expect
+// from anything with ‘unique’ in the name.
+//
+class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> {
+  using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>;
+  friend base;
+
+  using base::storage;
+
+  // Hooks required by the base class
+  // --------------------------------
+  //
+  // Our storage is a single chunk of RAM owned by a
+  // `std::unique_ptr`.
+  //
+  static constexpr std::size_t capacity = _any::dynamic;
+
+  // Address of the current chunk (null when nothing is allocated).
+  void* ptr() const noexcept {
+    return static_cast<void*>(storage.get());
+  }
+
+  // Replace the current chunk with a fresh one of `z` bytes.
+  void* alloc_storage(const std::size_t z) {
+    storage.reset(new std::byte[z]);
+    return ptr();
+  }
+
+  // Release the chunk; harmless when there is none.
+  void free_storage() noexcept {
+    storage.reset();
+  }
+
+public:
+
+  unique_any() noexcept = default;
+  ~unique_any() noexcept = default;
+
+  unique_any(const unique_any&) = delete;
+  unique_any& operator =(const unique_any&) = delete;
+
+  // We can rely on the behavior of `unique_ptr` and the base class to
+  // give us default move operations that do the right thing.
+  //
+  unique_any(unique_any&& rhs) noexcept = default;
+  unique_any& operator =(unique_any&& rhs) noexcept = default;
+
+  using base::base;
+  using base::operator =;
+};
+
+// Non-member swap so that unqualified `swap(a, b)` finds unique_any.
+inline void swap(unique_any& lhs, unique_any& rhs) noexcept {
+  rhs.swap(lhs);
+}
+
+// Build a unique_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline unique_any make_unique_any(Args&& ...args) {
+  using result_t = unique_any;
+  return result_t(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// Build a unique_any holding a T constructed in place from an
+// initializer list followed by `args`.
+template<typename T, typename U, typename... Args>
+inline unique_any make_unique_any(std::initializer_list<U> i, Args&& ...args) {
+  using result_t = unique_any;
+  return result_t(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `shared_any`
+// ============
+//
+// Once more with feeling!
+//
+// This is both copyable *and* movable. In case you need that sort of
+// thing. It seemed a reasonable completion.
+//
+class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> {
+  using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>;
+  friend base;
+
+  using base::storage;
+
+  // Hooks required by the base class
+  // --------------------------------
+  //
+  // The storage is one heap chunk held through a `boost::shared_ptr`,
+  // created with `boost::make_shared_noinit` so that the bytes are not
+  // value-initialized on allocation.
+  //
+  static constexpr std::size_t capacity = _any::dynamic;
+
+  // Address of the current chunk (null when nothing is allocated).
+  void* ptr() const noexcept {
+    std::byte* const chunk = storage.get();
+    return static_cast<void*>(chunk);
+  }
+
+  // Replace the current chunk with a fresh one of `n` bytes.
+  void* alloc_storage(std::size_t n) {
+    storage = boost::make_shared_noinit<std::byte[]>(n);
+    return ptr();
+  }
+
+  // Drop our reference to the chunk.
+  void free_storage() noexcept {
+    storage.reset();
+  }
+
+public:
+
+  shared_any() noexcept = default;
+  ~shared_any() noexcept = default;
+
+  // NOTE(review): a copy refers to the same chunk as its source.
+  // Confirm that the stored object's destructor is run only once
+  // across all copies.
+  shared_any(const shared_any& rhs) noexcept = default;
+  shared_any& operator =(const shared_any&) noexcept = default;
+
+  shared_any(shared_any&& rhs) noexcept = default;
+  shared_any& operator =(shared_any&& rhs) noexcept = default;
+
+  using base::base;
+  using base::operator =;
+};
+
+// Non-member swap so that unqualified `swap(a, b)` finds shared_any.
+inline void swap(shared_any& lhs, shared_any& rhs) noexcept {
+  rhs.swap(lhs);
+}
+
+// Build a shared_any holding a T constructed in place from `args`.
+template<typename T, typename... Args>
+inline shared_any make_shared_any(Args&& ...args) {
+  using result_t = shared_any;
+  return result_t(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// Build a shared_any holding a T constructed in place from an
+// initializer list followed by `args`.
+template<typename T, typename U, typename... Args>
+inline shared_any make_shared_any(std::initializer_list<U> i, Args&& ...args) {
+  using result_t = shared_any;
+  return result_t(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+}
+
+#endif // INCLUDE_STATIC_ANY
diff --git a/src/include/bitmapper.h b/src/include/bitmapper.h
new file mode 100644
index 00000000..5a65cc20
--- /dev/null
+++ b/src/include/bitmapper.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_BITMAPPER_H
+#define CEPH_BITMAPPER_H
+
+// Bit-level view over a caller-supplied char buffer. The buffer is
+// only referenced, never copied or freed here, and bit indices are not
+// range-checked. Bit b lives in byte b/8 at position b%8, counting
+// from the least significant bit.
+class bitmapper {
+  char *_data;
+  int _len;
+
+  // Index of the byte containing bit b.
+  static int byte_of(int b) { return b >> 3; }
+  // Mask selecting bit b inside its byte.
+  static int mask_of(int b) { return 1 << (b & 7); }
+
+ public:
+  // An unattached mapper: no buffer, zero length.
+  bitmapper() : _data(nullptr), _len(0) { }
+  // Attach to `len` bytes starting at `data`.
+  bitmapper(char *data, int len) : _data(data), _len(len) { }
+
+  // Re-point the mapper at another buffer.
+  void set_data(char *data, int len) {
+    _data = data;
+    _len = len;
+  }
+
+  // Size of the attached buffer in bytes / in bits.
+  int bytes() const { return _len; }
+  int bits() const { return _len * 8; }
+
+  // Read bit b.
+  bool operator[](int b) const {
+    return get(b);
+  }
+  bool get(int b) const {
+    return (_data[byte_of(b)] & mask_of(b)) != 0;
+  }
+  // Turn bit b on.
+  void set(int b) {
+    _data[byte_of(b)] |= mask_of(b);
+  }
+  // Turn bit b off.
+  void clear(int b) {
+    _data[byte_of(b)] &= ~mask_of(b);
+  }
+  // Flip bit b.
+  void toggle(int b) {
+    _data[byte_of(b)] ^= mask_of(b);
+  }
+};
+
+#endif
diff --git a/src/include/blobhash.h b/src/include/blobhash.h
new file mode 100644
index 00000000..597884e4
--- /dev/null
+++ b/src/include/blobhash.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_BLOBHASH_H
+#define CEPH_BLOBHASH_H
+
+#include <cstring>
+
+#include "hash.h"
+
+/*
+- this is to make some of the STL types work with 64 bit values, string hash keys, etc.
+- added when i was using an old STL.. maybe try taking these out and see if things 
+  compile now?
+*/
+
+// Hashes an arbitrary byte range down to 32 bits: the input is XOR-folded
+// into a 32-bit accumulator, which is then passed through
+// rjhash<uint32_t>.
+class blobhash {
+public:
+  // p:   start of the bytes to hash
+  // len: number of bytes at p
+  // Returns the rjhash of the folded accumulator.
+  uint32_t operator()(const char *p, unsigned len) {
+    static rjhash<uint32_t> H;
+    uint32_t acc = 0;
+    // Fold in whole 32-bit words, in native byte order. The word is
+    // fetched with memcpy because p carries no alignment guarantee and
+    // does not point at uint32_t objects.
+    while (len >= sizeof(acc)) {
+      uint32_t word;
+      std::memcpy(&word, p, sizeof(word));
+      acc ^= word;
+      p += sizeof(word);
+      len -= sizeof(word);
+    }
+    // Fold in the remaining 0-3 bytes, each shifted one byte further
+    // up. The byte is converted straight from plain char, so where
+    // char is signed, values >= 0x80 sign-extend; that is kept as-is so
+    // hash values stay unchanged.
+    int sh = 0;
+    while (len) {
+      acc ^= static_cast<uint32_t>(*p) << sh;
+      sh += 8;
+      len--;
+      p++;
+    }
+    return H(acc);
+  }
+};
+
+
+#endif
diff --git a/src/include/btree_map.h b/src/include/btree_map.h
new file mode 100644
index 00000000..1f42ea41
--- /dev/null
+++ b/src/include/btree_map.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/ceph_assert.h"   // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+// Encode a btree_map as a 32-bit element count followed by each
+// key/value pair in iteration order.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware variant: same layout as the plain encoder, with
+// `features` passed on to the key and value encoders (the count itself
+// is encoded without it).
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl, uint64_t features)
+{
+  __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Decode a btree_map: read a 32-bit element count, discard the map's
+// previous contents, then read that many key/value pairs.
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, bufferlist::const_iterator& p)
+{
+  __u32 remaining;
+  decode(remaining, p);
+  m.clear();
+  for (; remaining != 0; --remaining) {
+    T key;
+    decode(key, p);
+    // operator[] creates the slot; the value is decoded straight into it.
+    decode(m[key], p);
+  }
+}
+// Encode only the key/value pairs, without the leading element count.
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Decode `n` key/value pairs that were written without a leading
+// count. The map's previous contents are discarded first.
+// NOTE(review): `n` is a signed int and is only compared against zero;
+// confirm callers never pass a negative count.
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, bufferlist::const_iterator& p)
+{
+  m.clear();
+  for (; n != 0; --n) {
+    T key;
+    decode(key, p);
+    // operator[] creates the slot; the value is decoded straight into it.
+    decode(m[key], p);
+  }
+}
+
+#endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
new file mode 100644
index 00000000..774ca052
--- /dev/null
+++ b/src/include/buffer.h
@@ -0,0 +1,1331 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_BUFFER_H
+#define CEPH_BUFFER_H
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <stdlib.h>
+#endif
+#include <limits.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600
+#endif
+
+#include <stdio.h>
+#include <sys/uio.h>
+
+#if defined(__linux__)	// For malloc(2).
+#include <malloc.h>
+#endif
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifndef __CYGWIN__
+# include <sys/mman.h>
+#endif
+
+#include <cstddef>
+#include <iosfwd>
+#include <iomanip>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <vector>
+#include <string>
+#if __cplusplus >= 201703L
+#include <string_view>
+#endif // __cplusplus >= 201703L
+
+#include <exception>
+#include <type_traits>
+#include <utility>
+
+#include "page.h"
+#include "crc32c.h"
+#include "buffer_fwd.h"
+
+#ifdef __CEPH__
+# include "include/ceph_assert.h"
+#else
+# include <assert.h>
+#endif
+
+#include "inline_memory.h"
+
+#define CEPH_BUFFER_API
+
+#if defined(HAVE_XIO)
+struct xio_reg_mem;
+class XioDispatchHook;
+#endif
+#ifdef HAVE_SEASTAR
+namespace seastar {
+template <typename T> class temporary_buffer;
+namespace net {
+class packet;
+}
+}
+#endif // HAVE_SEASTAR
+class deleter;
+template<uint8_t S>
+struct sha_digest_t;
+using sha1_digest_t = sha_digest_t<20>;
+
+namespace ceph {
+
+template <class T>
+struct nop_delete {
+  // Deleter that deliberately leaves the pointee untouched.
+  void operator()(T* /* not freed */) {
+  }
+};
+
+// This is not a unique_ptr-like smart pointer! It only signals ownership
+// but DOES NOT manage the resource. It WILL LEAK if not manually deleted.
+// It is a replacement for a raw pointer rather than for any other smart one.
+//
+// Considered options:
+//  * unique_ptr with custom deleter implemented in .cc (would provide
+//    the non-zero-cost resource management),
+//  * GSL's owner<T*> (pretty neat but would impose an extra dependency),
+//  * unique_ptr with nop deleter,
+//  * raw pointer (doesn't embed ownership enforcement - std::move).
+template <class T>
+struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> {
+  // inherit all constructors of the std::unique_ptr base unchanged
+  using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr;
+};
+
+namespace buffer CEPH_BUFFER_API {
+inline namespace v14_2_0 {
+
+  /*
+   * exceptions
+   */
+
+  /// Common base of the buffer exception types declared below.
+  struct error : public std::exception{
+    /// Message text; defined outside this header.
+    const char *what() const throw () override;
+  };
+  /// buffer::error subtype; by its name it reports an allocation failure
+  /// (the throwing sites are not in this header).
+  struct bad_alloc : public error {
+    const char *what() const throw () override;
+  };
+  /// buffer::error subtype; in this header it is thrown by
+  /// ptr::iterator_impl::advance() when the position passes the end.
+  struct end_of_buffer : public error {
+    const char *what() const throw () override;
+  };
+  struct malformed_input : public error {
+    /// Format "buffer::malformed_input: <w>" into the internal fixed-size
+    /// buffer; snprintf bounds the write to that buffer's size.
+    explicit malformed_input(const std::string& w) {
+      const char* const detail = w.c_str();
+      snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", detail);
+    }
+    const char *what() const throw () override;
+  private:
+    char buf[256];  ///< storage for the formatted message
+  };
+  /// malformed_input variant that additionally carries an integer code.
+  struct error_code : public malformed_input {
+    /// Defined outside this header; presumably stores `error` in `code` --
+    /// confirm against the implementation.
+    explicit error_code(int error);
+    int code;
+  };
+
+
+  /// count of cached crc hits (matching input)
+  int get_cached_crc();
+  /// count of cached crc hits (mismatching input, required adjustment)
+  int get_cached_crc_adjusted();
+  /// count of crc cache misses
+  int get_missed_crc();
+  /// enable/disable tracking of cached crcs
+  void track_cached_crc(bool b);
+
+  /*
+   * an abstract raw buffer.  with a reference count.
+   */
+  class raw;
+  class raw_malloc;
+  class raw_static;
+  class raw_posix_aligned;
+  class raw_hack_aligned;
+  class raw_char;
+  class raw_claimed_char;
+  class raw_unshareable; // diagnostic, unshareable char buffer
+  class raw_combined;
+  class raw_claim_buffer;
+
+
+  class xio_mempool;
+  class xio_msg_buffer;
+
+  /*
+   * named constructors
+   */
+  // Factories returning ceph::unique_leakable_ptr<raw> mark the caller as
+  // the owner of the new raw (nothing frees it automatically); the others
+  // hand back a plain raw*. All of them are defined outside this header.
+  ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len);
+  ceph::unique_leakable_ptr<raw> create(unsigned len);
+  ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool);
+  raw* claim_char(unsigned len, char *buf);
+  raw* create_malloc(unsigned len);
+  raw* claim_malloc(unsigned len, char *buf);
+  raw* create_static(unsigned len, char *buf);
+  ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align);
+  ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
+  ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len);
+  ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len);
+  raw* create_unshareable(unsigned len);
+  raw* claim_buffer(unsigned len, char *buf, deleter del);
+
+#ifdef HAVE_SEASTAR
+  /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to
+  /// make it safe to share between cpus
+  raw* create_foreign(seastar::temporary_buffer<char>&& buf);
+  /// create a raw buffer to wrap seastar cpu-local memory, without the safety
+  /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is
+  /// destructed on this cpu
+  raw* create(seastar::temporary_buffer<char>&& buf);
+#endif
+#if defined(HAVE_XIO)
+  raw* create_msg(unsigned len, char *buf, XioDispatchHook *m_hook);
+#endif
+
+  /*
+   * a buffer pointer.  references (a subsequence of) a raw buffer.
+   */
+  class CEPH_BUFFER_API ptr {
+    raw *_raw;   ///< referenced raw buffer; nullptr for a default-constructed ptr
+  public: // dirty hack for testing; if it works, this will be abstracted
+    unsigned _off, _len;   ///< values reported by offset() and length()
+  private:
+
+    void release();
+
+    /// Byte cursor over a single ptr, bounded by the ptr's end_c_str().
+    template<bool is_const>
+    class iterator_impl {
+      const ptr *bp;     ///< parent ptr
+      const char *start; ///< starting pointer into bp->c_str()
+      const char *pos;   ///< pointer into bp->c_str()
+      const char *end_ptr;   ///< pointer to bp->end_c_str()
+      const bool deep;   ///< if true, do not allow shallow ptr copies
+
+      iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p,
+		    size_t offset, bool d)
+	: bp(p),
+	  start(p->c_str() + offset),
+	  pos(start),
+	  end_ptr(p->end_c_str()),
+	  deep(d)
+      {}
+
+      friend class ptr;
+
+    public:
+      using pointer = typename std::conditional<is_const, const char*, char *>::type;
+
+      /// Return the current position, then move forward by n bytes.
+      /// Throws end_of_buffer (via advance()) if that passes the end.
+      pointer get_pos_add(size_t n) {
+	auto r = pos;
+	advance(n);
+	// pos is kept as const char*; the mutable iterator is only built
+	// from a non-const ptr, and without this cast the non-const
+	// instantiation could not return `char*` at all.
+	return const_cast<pointer>(r);
+      }
+      /// Return a ptr covering the next len bytes and advance past them:
+      /// a fresh copy when `deep`, otherwise a sub-range sharing bp's raw.
+      ptr get_ptr(size_t len) {
+	if (deep) {
+	  return buffer::copy(get_pos_add(len), len);
+	} else {
+	  size_t off = pos - bp->c_str();
+	  advance(len);
+	  return ptr(*bp, off, len);
+	}
+      }
+
+      /// Move forward by len bytes; throws end_of_buffer when the new
+      /// position lies beyond the end.
+      /// NOTE(review): pos is updated before the check, so after a throw
+      /// the iterator is left positioned past end_ptr.
+      void advance(size_t len) {
+	pos += len;
+	if (pos > end_ptr)
+	  throw end_of_buffer();
+      }
+
+      const char *get_pos() const {
+	return pos;
+      }
+      const char *get_end() const {
+	return end_ptr;
+      }
+
+      /// bytes consumed since the iterator's starting position
+      size_t get_offset() const {
+	return pos - start;
+      }
+
+      bool end() const {
+	return pos == end_ptr;
+      }
+    };
+
+  public:
+    using const_iterator = iterator_impl<true>;
+    using iterator = iterator_impl<false>;
+
+    ptr() : _raw(nullptr), _off(0), _len(0) {}
+    // cppcheck-suppress noExplicitConstructor
+    ptr(raw* r);
+    ptr(ceph::unique_leakable_ptr<raw> r);
+    // cppcheck-suppress noExplicitConstructor
+    ptr(unsigned l);
+    ptr(const char *d, unsigned l);
+    ptr(const ptr& p);
+    ptr(ptr&& p) noexcept;
+    ptr(const ptr& p, unsigned o, unsigned l);
+    ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r);
+    ptr& operator= (const ptr& p);
+    ptr& operator= (ptr&& p) noexcept;
+    ~ptr() {
+      // BE CAREFUL: this destructor is called also for hypercombined ptr_node.
+      // After freeing underlying raw, `*this` can become inaccessible as well!
+      release();
+    }
+
+    /// true when this ptr references a raw buffer
+    bool have_raw() const { return _raw != nullptr; }
+
+    ceph::unique_leakable_ptr<raw> clone();
+    void swap(ptr& other) noexcept;
+
+    // Iterators start `offset` bytes into this ptr. begin()/cbegin() yield
+    // shallow iterators, begin_deep() one whose get_ptr() copies the data.
+    iterator begin(size_t offset=0) {
+      return iterator(this, offset, false);
+    }
+    const_iterator begin(size_t offset=0) const {
+      return const_iterator(this, offset, false);
+    }
+    const_iterator cbegin() const {
+      return begin();
+    }
+    const_iterator begin_deep(size_t offset=0) const {
+      return const_iterator(this, offset, true);
+    }
+
+    // misc
+    /// Tests the low address bits of c_str() with the mask (align-1), so
+    /// `align` is expected to be a power of two.
+    bool is_aligned(unsigned align) const {
+      // uintptr_t can hold any object pointer; `long` is narrower than a
+      // pointer on LLP64 targets.
+      return (reinterpret_cast<uintptr_t>(c_str()) & (align-1)) == 0;
+    }
+    bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); }
+    /// true when length() is a multiple of `align`
+    bool is_n_align_sized(unsigned align) const
+    {
+      return (length() % align) == 0;
+    }
+    bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); }
+    /// true when a raw is referenced but this ptr does not span all of it
+    bool is_partial() const {
+      return have_raw() && (start() > 0 || end() < raw_length());
+    }
+
+    int get_mempool() const;
+    void reassign_to_mempool(int pool);
+    void try_assign_to_mempool(int pool);
+
+    // accessors
+    raw *get_raw() const { return _raw; }
+    const char *c_str() const;
+    char *c_str();
+    const char *end_c_str() const;
+    char *end_c_str();
+    unsigned length() const { return _len; }
+    unsigned offset() const { return _off; }
+    unsigned start() const { return _off; }
+    unsigned end() const { return _off + _len; }
+    unsigned unused_tail_length() const;
+    const char& operator[](unsigned n) const;
+    char& operator[](unsigned n);
+
+    const char *raw_c_str() const;
+    unsigned raw_length() const;
+    int raw_nref() const;
+
+    void copy_out(unsigned o, unsigned l, char *dest) const;
+
+    unsigned wasted() const;
+
+    int cmp(const ptr& o) const;
+    bool is_zero() const;
+
+    // modifiers
+    /// Set the offset; asserts that it does not exceed raw_length().
+    void set_offset(unsigned o) {
+#ifdef __CEPH__
+      ceph_assert(raw_length() >= o);
+#else
+      assert(raw_length() >= o);
+#endif
+      _off = o;
+    }
+    /// Set the length; asserts that it does not exceed raw_length().
+    void set_length(unsigned l) {
+#ifdef __CEPH__
+      ceph_assert(raw_length() >= l);
+#else
+      assert(raw_length() >= l);
+#endif
+      _len = l;
+    }
+
+    unsigned append(char c);
+    unsigned append(const char *p, unsigned l);
+#if __cplusplus >= 201703L
+    inline unsigned append(std::string_view s) {
+      return append(s.data(), s.length());
+    }
+#endif // __cplusplus >= 201703L
+    void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true);
+    void zero(bool crc_reset = true);
+    void zero(unsigned o, unsigned l, bool crc_reset = true);
+    unsigned append_zeros(unsigned l);
+
+#ifdef HAVE_SEASTAR
+    /// create a temporary_buffer, copying the ptr as its deleter
+    operator seastar::temporary_buffer<char>() &;
+    /// convert to temporary_buffer, stealing the ptr as its deleter
+    operator seastar::temporary_buffer<char>() &&;
+#endif // HAVE_SEASTAR
+
+  };
+
+
+  struct ptr_hook {
+    // Link to the following hook. It is mutable so that the chain can be
+    // rewired through const references and const iterators.
+    mutable ptr_hook* next;
+
+    // NB: the defaulted constructor leaves `next` uninitialized.
+    ptr_hook() = default;
+    ptr_hook(ptr_hook* const successor)
+      : next(successor) {
+    }
+  };
+
+  /// A ptr that can be linked into a list: it combines the ptr_hook link
+  /// with the ptr itself. Instances are obtained through the create()
+  /// factories and are handed back wrapped in a unique_ptr using `disposer`.
+  class ptr_node : public ptr_hook, public ptr {
+  public:
+    /// Functor producing a copy of a node (defined outside this header).
+    struct cloner {
+      ptr_node* operator()(const ptr_node& clone_this);
+    };
+    /// Functor destroying a node: plain `delete` is used only when
+    /// dispose_if_hypercombined() reports that it did not handle the node.
+    struct disposer {
+      void operator()(ptr_node* const delete_this) {
+	if (!dispose_if_hypercombined(delete_this)) {
+	  delete delete_this;
+	}
+      }
+    };
+
+    ~ptr_node() = default;
+
+    // Factories taking a raw (by unique_leakable_ptr or plain pointer) or a
+    // length go through create_hypercombined().
+    static std::unique_ptr<ptr_node, disposer>
+    create(ceph::unique_leakable_ptr<raw> r) {
+      return create_hypercombined(std::move(r));
+    }
+    static std::unique_ptr<ptr_node, disposer> create(raw* const r) {
+      return create_hypercombined(r);
+    }
+    static std::unique_ptr<ptr_node, disposer> create(const unsigned l) {
+      return create_hypercombined(buffer::create(l));
+    }
+    // Any other argument list is forwarded to the private constructors and
+    // the node is allocated with plain `new`.
+    template <class... Args>
+    static std::unique_ptr<ptr_node, disposer> create(Args&&... args) {
+      return std::unique_ptr<ptr_node, disposer>(
+	new ptr_node(std::forward<Args>(args)...));
+    }
+
+    static ptr_node* copy_hypercombined(const ptr_node& copy_this);
+
+  private:
+    // forwards its arguments to the matching ptr constructor
+    template <class... Args>
+    ptr_node(Args&&... args) : ptr(std::forward<Args>(args)...) {
+    }
+    ptr_node(const ptr_node&) = default;
+
+    // assignment and swap are disabled for nodes
+    ptr& operator= (const ptr& p) = delete;
+    ptr& operator= (ptr&& p) noexcept = delete;
+    ptr_node& operator= (const ptr_node& p) = delete;
+    ptr_node& operator= (ptr_node&& p) noexcept = delete;
+    void swap(ptr& other) noexcept = delete;
+    void swap(ptr_node& other) noexcept = delete;
+
+    // returns true when it disposed of the node itself (see disposer)
+    static bool dispose_if_hypercombined(ptr_node* delete_this);
+    static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+      buffer::raw* r);
+    static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+      ceph::unique_leakable_ptr<raw> r);
+  };
+  /*
+   * list - the useful bit!
+   */
+
+  class CEPH_BUFFER_API list {
+  public:
+    // this is the very low-level implementation of the singly linked list
+    // ceph::buffer::list is built on. We don't use the intrusive slist
+    // of Boost (or any other 3rd party) to save extra dependencies
+    // in our public headers.
+    /// Circular singly linked list of ptr_node with a sentinel: the last
+    /// node links back to `_root`, and an empty list has `_root` linking
+    /// to itself. The container only links nodes; they are freed through
+    /// ptr_node::disposer in the *_dispose members.
+    class buffers_t {
+      // _root.next can be thought as _head
+      ptr_hook _root;
+      ptr_hook* _tail;
+      std::size_t _size;
+
+    public:
+      /// Forward iterator over the hooks. Dereferencing reinterprets the
+      /// current hook as T, so before_begin()/end() must not be
+      /// dereferenced as nodes.
+      template <class T>
+      class buffers_iterator {
+	typename std::conditional<
+	  std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur;
+	template <class U> friend class buffers_iterator;
+      public:
+	using value_type = T;
+	using reference = typename std::add_lvalue_reference<T>::type;
+	using pointer = typename std::add_pointer<T>::type;
+	using difference_type = std::ptrdiff_t;
+	using iterator_category = std::forward_iterator_tag;
+
+	template <class U>
+	buffers_iterator(U* const p)
+	  : cur(p) {
+	}
+	template <class U>
+	buffers_iterator(const buffers_iterator<U>& other)
+	  : cur(other.cur) {
+	}
+	buffers_iterator() = default;
+
+	T& operator*() const {
+	  return *reinterpret_cast<T*>(cur);
+	}
+	T* operator->() const {
+	  return reinterpret_cast<T*>(cur);
+	}
+
+	buffers_iterator& operator++() {
+	  cur = cur->next;
+	  return *this;
+	}
+	buffers_iterator operator++(int) {
+	  const auto temp(*this);
+	  ++*this;
+	  return temp;
+	}
+
+	// Converting assignment. The source is taken by const reference so
+	// that const objects and temporaries are accepted as well.
+	template <class U>
+	buffers_iterator& operator=(const buffers_iterator<U>& other) {
+	  cur = other.cur;
+	  return *this;
+	}
+
+	bool operator==(const buffers_iterator& rhs) const {
+	  return cur == rhs.cur;
+	}
+	bool operator!=(const buffers_iterator& rhs) const {
+	  return !(*this==rhs);
+	}
+
+	using citer_t = buffers_iterator<typename std::add_const<T>::type>;
+	operator citer_t() const {
+	  return citer_t(cur);
+	}
+      };
+
+      typedef buffers_iterator<const ptr_node> const_iterator;
+      typedef buffers_iterator<ptr_node> iterator;
+
+      typedef const ptr_node& const_reference;
+      typedef ptr_node& reference;
+
+      buffers_t()
+        : _root(&_root),
+	  _tail(&_root),
+	  _size(0) {
+      }
+      buffers_t(const buffers_t&) = delete;
+      /// Take over other's chain and leave `other` empty. Only pointers are
+      /// reassigned, hence noexcept.
+      buffers_t(buffers_t&& other) noexcept
+	: _root(other._root.next == &other._root ? &_root : other._root.next),
+	  _tail(other._tail == &other._root ? &_root : other._tail),
+	  _size(other._size) {
+	other._root.next = &other._root;
+	other._tail = &other._root;
+	other._size = 0;
+
+	// re-close the ring on our own sentinel
+	_tail->next = &_root;
+      }
+      /// Dispose of the current nodes, then take over other's.
+      buffers_t& operator=(buffers_t&& other) {
+	if (&other != this) {
+	  clear_and_dispose();
+	  swap(other);
+	}
+	return *this;
+      }
+
+      void push_back(reference item) {
+	item.next = &_root;
+	// this updates _root.next when called on empty
+	_tail->next = &item;
+	_tail = &item;
+	_size++;
+      }
+
+      void push_front(reference item) {
+	item.next = _root.next;
+	_root.next = &item;
+	_tail = _tail == &_root ? &item : _tail;
+	_size++;
+      }
+
+      // *_after
+      /// Unlink (without disposing) the node following `it`; returns an
+      /// iterator to the node that now follows `it`.
+      iterator erase_after(const_iterator it) {
+	const auto* to_erase = it->next;
+
+	it->next = to_erase->next;
+	_root.next = _root.next == to_erase ? to_erase->next : _root.next;
+	_tail = _tail == to_erase ? (ptr_hook*)&*it : _tail;
+	_size--;
+	return it->next;
+      }
+
+      /// Link `item` right after `it`, updating head/tail as needed.
+      void insert_after(const_iterator it, reference item) {
+	item.next = it->next;
+	it->next = &item;
+	_root.next = it == end() ? &item : _root.next;
+	_tail = const_iterator(_tail) == it ? &item : _tail;
+	_size++;
+      }
+
+      /// Move all nodes of `other` to the end of this list; `other` ends
+      /// up empty.
+      void splice_back(buffers_t& other) {
+	if (other._size == 0) {
+	  return;
+	}
+
+	other._tail->next = &_root;
+	// will update root.next if empty() == true
+	_tail->next = other._root.next;
+	_tail = other._tail;
+	_size += other._size;
+
+	other._root.next = &other._root;
+	other._tail = &other._root;
+	other._size = 0;
+      }
+
+      std::size_t size() const { return _size; }
+      bool empty() const { return _tail == &_root; }
+
+      // before_begin() and end() both designate the sentinel
+      const_iterator begin() const {
+	return _root.next;
+      }
+      const_iterator before_begin() const {
+	return &_root;
+      }
+      const_iterator end() const {
+	return &_root;
+      }
+      iterator begin() {
+	return _root.next;
+      }
+      iterator before_begin() {
+	return &_root;
+      }
+      iterator end() {
+	return &_root;
+      }
+
+      // front()/back() reinterpret the head/tail hook as a node; on an
+      // empty list that hook is the sentinel, not a ptr_node.
+      reference front() {
+	return reinterpret_cast<reference>(*_root.next);
+      }
+      reference back() {
+	return reinterpret_cast<reference>(*_tail);
+      }
+      const_reference front() const {
+	return reinterpret_cast<const_reference>(*_root.next);
+      }
+      const_reference back() const {
+	return reinterpret_cast<const_reference>(*_tail);
+      }
+
+      /// Replace the content with clones (ptr_node::cloner) of other's nodes.
+      void clone_from(const buffers_t& other) {
+	clear_and_dispose();
+	for (auto& node : other) {
+	  ptr_node* clone = ptr_node::cloner()(node);
+	  push_back(*clone);
+	}
+      }
+      /// Dispose of every node and reset to the empty state.
+      void clear_and_dispose() {
+	for (auto it = begin(); it != end(); /* nop */) {
+	  auto& node = *it;
+	  // step first: the node is gone after the disposer has run
+	  it = it->next;
+	  ptr_node::disposer()(&node);
+	}
+	_root.next = &_root;
+	_tail = &_root;
+	_size = 0;
+      }
+      /// Unlink and dispose of the node following `it`.
+      iterator erase_after_and_dispose(iterator it) {
+	auto* to_dispose = &*std::next(it);
+	auto ret = erase_after(it);
+	ptr_node::disposer()(to_dispose);
+	return ret;
+      }
+
+      /// Exchange the chains of two lists, mapping each side's sentinel
+      /// links onto the other side's sentinel.
+      void swap(buffers_t& other) {
+	const auto copy_root = _root;
+	_root.next = \
+	  other._root.next == &other._root ? &this->_root : other._root.next;
+	other._root.next = \
+	  copy_root.next == &_root ? &other._root : copy_root.next;
+
+	const auto copy_tail = _tail;
+	_tail = other._tail == &other._root ? &this->_root : other._tail;
+	other._tail = copy_tail == &_root ? &other._root : copy_tail;
+
+	_tail->next = &_root;
+	other._tail->next = &other._root;
+	std::swap(_size, other._size);
+      }
+    };
+
+    class iterator;
+
+  private:
+    // my private bits
+    buffers_t _buffers;
+
+    // track bufferptr we can modify (especially ::append() to). Not all bptrs
+    // bufferlist holds have this trait -- if somebody ::push_back(const ptr&),
+    // he expects it won't change.
+    ptr* _carriage;
+    unsigned _len;
+    unsigned _memcopy_count; //the total of memcopy using rebuild().
+
+    /// Position inside a buffer::list, shared by the const and non-const
+    /// iterators. Tracks the list, the current node and two offsets.
+    template <bool is_const>
+    class CEPH_BUFFER_API iterator_impl {
+    protected:
+      using bl_t =
+	typename std::conditional<is_const, const list, list>::type;
+      using list_t =
+	typename std::conditional<is_const, const buffers_t, buffers_t>::type;
+      using list_iter_t =
+	typename std::conditional<is_const,
+				  typename buffers_t::const_iterator,
+				  typename buffers_t::iterator>::type;
+      bl_t* bl;
+      list_t* ls;  // meh.. just here to avoid an extra pointer dereference..
+      list_iter_t p;
+      unsigned off; // in bl
+      unsigned p_off;   // in *p
+      friend class iterator_impl<true>;
+
+    public:
+      using iterator_category = std::forward_iterator_tag;
+      using value_type = typename std::conditional<is_const, const char, char>::type;
+      using difference_type = std::ptrdiff_t;
+      using pointer = typename std::add_pointer<value_type>::type;
+      using reference = typename std::add_lvalue_reference<value_type>::type;
+
+      // constructor.  position.
+      iterator_impl()
+	: bl(nullptr), ls(nullptr), off(0), p_off(0) {}
+      iterator_impl(bl_t *l, unsigned o=0);
+      iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+	: bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {}
+      iterator_impl(const list::iterator& i);
+
+      /// current iterator offset within the buffer::list
+      unsigned get_off() const {
+	return off;
+      }
+
+      /// bytes left between the iterator position and the end of the
+      /// buffer::list
+      unsigned get_remaining() const {
+	return bl->length() - off;
+      }
+
+      /// true once the node iterator has reached the end of the node list
+      /// (this does not compare `off` with bl->length())
+      bool end() const {
+	return p == ls->end();
+      }
+
+      void advance(int o) = delete;
+      void advance(unsigned o);
+      void advance(size_t o) { advance(static_cast<unsigned>(o)); }
+      void seek(unsigned o);
+      char operator*() const;
+      iterator_impl& operator++();
+      ptr get_current_ptr() const;
+      bool is_pointing_same_raw(const ptr& other) const;
+
+      bl_t& get_bl() const {
+	return *bl;
+      }
+
+      // copy data out.
+      // note that these all _append_ to dest!
+      void copy(unsigned len, char *dest);
+      // deprecated, use copy_deep()
+      void copy(unsigned len, ptr &dest) __attribute__((deprecated));
+      void copy_deep(unsigned len, ptr &dest);
+      void copy_shallow(unsigned len, ptr &dest);
+      void copy(unsigned len, list &dest);
+      void copy(unsigned len, std::string &dest);
+      void copy_all(list &dest);
+
+      // get a pointer to the current iterator position, return the
+      // number of bytes we can read from that position (up to want),
+      // and advance the iterator by that amount.
+      size_t get_ptr_and_advance(size_t want, const char **p);
+
+      /// calculate crc from iterator position
+      uint32_t crc32c(size_t length, uint32_t crc);
+
+      // Two iterators are equal when they refer to the same list object
+      // and carry the same offset.
+      friend bool operator==(const iterator_impl& lhs,
+			     const iterator_impl& rhs) {
+	const bool same_list = &lhs.get_bl() == &rhs.get_bl();
+	return same_list && lhs.get_off() == rhs.get_off();
+      }
+      friend bool operator!=(const iterator_impl& lhs,
+			     const iterator_impl& rhs) {
+	const bool same_list = &lhs.get_bl() == &rhs.get_bl();
+	return !(same_list && lhs.get_off() == rhs.get_off());
+      }
+    };
+
+  public:
+    typedef iterator_impl<true> const_iterator;
+
+    /// Mutable list iterator: iterator_impl<false> plus the copy_in()
+    /// operations declared here.
+    class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+    public:
+      iterator() = default;
+      iterator(bl_t *l, unsigned o=0);
+      iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po);
+      // copy data in
+      void copy_in(unsigned len, const char *src, bool crc_reset = true);
+      void copy_in(unsigned len, const list& otherl);
+    };
+
+    /// Handle to reserved append space, as used by contiguous_appender:
+    /// bytes are written starting at bp_data, and committing them adds
+    /// the written count to both *bp_len and *bl_len.
+    struct reserve_t {
+      char* bp_data;     // where writing starts
+      unsigned* bp_len;  // length counter bumped on commit (presumably the backing ptr's)
+      unsigned* bl_len;  // length counter bumped on commit (presumably the list's)
+    };
+
+    /// Appends into contiguous space reserved from a bufferlist. Bytes are
+    /// written through `pos` and are accounted for in the list when the
+    /// appender flushes, which it also does on destruction.
+    class contiguous_appender {
+      ceph::bufferlist& bl;
+      ceph::bufferlist::reserve_t space;
+      char* pos;
+      bool deep;
+
+      /// running count of bytes appended that are not reflected by @pos
+      size_t out_of_band_offset = 0;
+
+      contiguous_appender(bufferlist& target, size_t len, bool d)
+	: bl(target),
+	  space(target.obtain_contiguous_space(len)),
+	  pos(space.bp_data),
+	  deep(d) {
+      }
+
+      // Commit everything written since the last flush and restart the
+      // window at the current position.
+      void flush_and_continue() {
+	const size_t written = pos - space.bp_data;
+	space.bp_data = pos;
+	*space.bp_len += written;
+	*space.bl_len += written;
+      }
+
+      friend class list;
+
+    public:
+      ~contiguous_appender() {
+	flush_and_continue();
+      }
+
+      size_t get_out_of_band_offset() const {
+	return out_of_band_offset;
+      }
+      /// copy l bytes from p to the current position and move past them
+      void append(const char* __restrict__ p, size_t l) {
+	maybe_inline_memcpy(pos, p, l, 16);
+	pos += l;
+      }
+      /// skip len bytes, returning where they start
+      char *get_pos_add(size_t len) {
+	char* const before = pos;
+	pos += len;
+	return before;
+      }
+      char *get_pos() {
+	return pos;
+      }
+
+      /// Append a bufferptr: copied inline when `deep`, otherwise added to
+      /// the list as its own buffer and counted as out-of-band bytes.
+      void append(const bufferptr& p) {
+	const auto plen = p.length();
+	if (plen == 0) {
+	  return;
+	}
+	if (!deep) {
+	  flush_and_continue();
+	  bl.append(p);
+	  space = bl.obtain_contiguous_space(0);
+	  out_of_band_offset += plen;
+	  return;
+	}
+	append(p.c_str(), plen);
+      }
+      /// Append a whole bufferlist, with the same deep/shallow distinction.
+      void append(const bufferlist& l) {
+	if (!deep) {
+	  flush_and_continue();
+	  bl.append(l);
+	  space = bl.obtain_contiguous_space(0);
+	  out_of_band_offset += l.length();
+	  return;
+	}
+	for (auto it = l._buffers.begin(); it != l._buffers.end(); ++it) {
+	  append(it->c_str(), it->length());
+	}
+      }
+
+      /// out-of-band bytes plus the bytes written since the last flush
+      size_t get_logical_offset() {
+	const size_t unflushed = pos - space.bp_data;
+	return out_of_band_offset + unflushed;
+      }
+    };
+
+    /// Create an appender on this list with `len` bytes of contiguous
+    /// space reserved up front.
+    contiguous_appender get_contiguous_appender(size_t len, bool deep=false) {
+      return {*this, len, deep};
+    }
+
+    /// Bare write cursor over an already reserved region; it performs no
+    /// bounds checking of its own.
+    class contiguous_filler {
+      friend buffer::list;
+      char* pos;
+
+      contiguous_filler(char* const pos) : pos(pos) {}
+
+    public:
+      /// skip len bytes
+      void advance(const unsigned len) { pos += len; }
+      /// copy len bytes from src to the cursor and move past them
+      void copy_in(const unsigned len, const char* const src) {
+	memcpy(pos, src, len);
+	pos += len;
+      }
+      char* c_str() { return pos; }
+    };
+    // The contiguous_filler is supposed to be not costlier than a single
+    // pointer. Keep it dumb, please.
+    static_assert(sizeof(contiguous_filler) == sizeof(char*),
+		  "contiguous_filler should be no costlier than pointer");
+
+    /// Appends data through page-aligned staging buffers of at least
+    /// min_pages pages; filled (or flushed) parts are appended to the list.
+    class page_aligned_appender {
+      bufferlist *pbl;
+      unsigned min_alloc;
+      ptr buffer;
+      char *pos, *end;
+
+      page_aligned_appender(list *l, unsigned min_pages)
+	: pbl(l),
+	  min_alloc(min_pages * CEPH_PAGE_SIZE),
+	  pos(nullptr), end(nullptr) {}
+
+      friend class list;
+
+    public:
+      ~page_aligned_appender() {
+	flush();
+      }
+
+      /// Hand the bytes staged so far to the list and shrink the staging
+      /// buffer to its still unused remainder.
+      void flush() {
+	if (!pos || pos == buffer.c_str()) {
+	  return;  // nothing staged
+	}
+	size_t staged = pos - buffer.c_str();
+	pbl->append(buffer, 0, staged);
+	buffer.set_length(buffer.length() - staged);
+	buffer.set_offset(buffer.offset() + staged);
+      }
+
+      /// Copy len bytes from buf, allocating new staging buffers as the
+      /// current one fills up.
+      void append(const char *buf, size_t len) {
+	while (len > 0) {
+	  if (!pos) {
+	    // round the request up to whole pages, but never allocate
+	    // less than min_alloc
+	    size_t alloc = (len + CEPH_PAGE_SIZE - 1) & CEPH_PAGE_MASK;
+	    alloc = (alloc < min_alloc) ? min_alloc : alloc;
+	    buffer = create_page_aligned(alloc);
+	    pos = buffer.c_str();
+	    end = buffer.end_c_str();
+	  }
+	  const size_t room = (size_t)(end - pos);
+	  const size_t chunk = (len > room) ? room : len;
+	  memcpy(pos, buf, chunk);
+	  pos += chunk;
+	  buf += chunk;
+	  len -= chunk;
+	  if (pos == end) {
+	    // staging buffer is full: pass it on and start afresh next round
+	    pbl->append(buffer, 0, buffer.length());
+	    pos = end = nullptr;
+	  }
+	}
+      }
+    };
+
+    /// Create a page_aligned_appender feeding this list.
+    page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) {
+      return {this, min_pages};
+    }
+
+  private:
+    mutable iterator last_p;
+
+    // always_empty_bptr has no underlying raw but its _len is always 0.
+    // This is useful for e.g. get_append_buffer_unused_tail_length() as
+    // it allows to avoid conditionals on hot paths.
+    static ptr always_empty_bptr;
+    ptr_node& refill_append_space(const unsigned len);
+
+  public:
+    // cons/des
+    /// Construct an empty list: zero length, zero memcopy count and the
+    /// shared always_empty_bptr as carriage.
+    list()
+      : _carriage(&always_empty_bptr),
+        _len(0),
+        _memcopy_count(0),
+        last_p(this) {
+    }
+    // cppcheck-suppress noExplicitConstructor
+    /// Construct an empty list, then call reserve(prealloc).
+    list(unsigned prealloc)
+      : _carriage(&always_empty_bptr),
+        _len(0),
+        _memcopy_count(0),
+	last_p(this) {
+      reserve(prealloc);
+    }
+
+    /// Copy-construct: clones other's nodes and takes over its length and
+    /// memcopy count. The carriage is not inherited; it starts out as
+    /// always_empty_bptr.
+    list(const list& other)
+      : _carriage(&always_empty_bptr),
+        _len(other._len),
+        _memcopy_count(other._memcopy_count),
+        last_p(this) {
+      _buffers.clone_from(other._buffers);
+    }
+    list(list&& other) noexcept;
+
+    /// Dispose of every node still linked into the list.
+    ~list() {
+      _buffers.clear_and_dispose();
+    }
+
+    /// Copy-assign: replace the content with clones of other's nodes.
+    /// The carriage is reset, and length and memcopy count are taken from
+    /// `other` -- the same state the copy constructor produces.
+    list& operator= (const list& other) {
+      if (this != &other) {
+        _carriage = &always_empty_bptr;
+        _buffers.clone_from(other._buffers);
+        _len = other._len;
+        // keep the statistic in step with the copy constructor and the
+        // move assignment, which both carry it over
+        _memcopy_count = other._memcopy_count;
+        last_p = begin();
+      }
+      return *this;
+    }
+    /// Move-assign: take over other's nodes, carriage, length and memcopy
+    /// count, and leave `other` cleared.
+    list& operator= (list&& other) noexcept {
+      // Without this guard a self-move would end in other.clear(), i.e.
+      // it would wipe the list's own content.
+      if (this != &other) {
+        _buffers = std::move(other._buffers);
+        _carriage = other._carriage;
+        _len = other._len;
+        _memcopy_count = other._memcopy_count;
+        last_p = begin();
+        other.clear();
+      }
+      return *this;
+    }
+
+    uint64_t get_wasted_space() const;
+    /// number of nodes currently linked into the list
+    unsigned get_num_buffers() const { return _buffers.size(); }
+    // first/last node; see buffers_t::front()/back() -- only meaningful
+    // when the list holds at least one node
+    const ptr_node& front() const { return _buffers.front(); }
+    const ptr_node& back() const { return _buffers.back(); }
+
+    int get_mempool() const;
+    void reassign_to_mempool(int pool);
+    void try_assign_to_mempool(int pool);
+
+    /// unused tail length of the current carriage
+    size_t get_append_buffer_unused_tail_length() const {
+      return _carriage->unused_tail_length();
+    }
+
+    unsigned get_memcopy_count() const {return _memcopy_count; }
+    /// read-only access to the underlying node list
+    const buffers_t& buffers() const { return _buffers; }
+    void swap(list& other) noexcept;
+    /// Total number of bytes in the list, as tracked in _len.
+    unsigned length() const {
+#if 0
+      // DEBUG: verify _len against the sum of the node lengths
+      unsigned len = 0;
+      for (const auto& node : _buffers) {
+	len += node.length();
+      }
+#ifdef __CEPH__
+      ceph_assert(len == _len);
+#else
+      assert(len == _len);
+#endif // __CEPH__
+#endif
+      return _len;
+    }
+
+    bool contents_equal(const buffer::list& other) const;
+
+    bool is_provided_buffer(const char *dst) const;
+    bool is_aligned(unsigned align) const;
+    bool is_page_aligned() const;
+    bool is_n_align_sized(unsigned align) const;
+    bool is_n_page_sized() const;
+    bool is_aligned_size_and_memory(unsigned align_size,
+				    unsigned align_memory) const;
+
+    bool is_zero() const;
+
+    // modifiers
+    /// Drop all content: dispose of every node and reset carriage, length,
+    /// memcopy count and the cached iterator.
+    void clear() noexcept {
+      _buffers.clear_and_dispose();
+      _carriage = &always_empty_bptr;
+      _memcopy_count = 0;
+      _len = 0;
+      last_p = begin();
+    }
+    /// Append a node created from `bp`; zero-length ptrs are ignored.
+    /// The carriage is left untouched by this overload.
+    void push_back(const ptr& bp) {
+      const unsigned added = bp.length();
+      if (added == 0) {
+	return;
+      }
+      ptr_node* const node = ptr_node::create(bp).release();
+      _buffers.push_back(*node);
+      _len += added;
+    }
+    /// Append a node move-constructed from `bp`; zero-length ptrs are
+    /// ignored. Resets the carriage to always_empty_bptr.
+    void push_back(ptr&& bp) {
+      // read the length before bp is moved from
+      const unsigned added = bp.length();
+      if (added == 0) {
+	return;
+      }
+      _len += added;
+      ptr_node* const node = ptr_node::create(std::move(bp)).release();
+      _buffers.push_back(*node);
+      _carriage = &always_empty_bptr;
+    }
+    void push_back(const ptr_node&) = delete;
+    void push_back(ptr_node&) = delete;
+    void push_back(ptr_node&&) = delete;
+    /// Link an already built node and make it the carriage. A zero-length
+    /// node is not linked; it is released by `bp` going out of scope.
+    void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) {
+      const unsigned added = bp->length();
+      if (added == 0) {
+	return;
+      }
+      _carriage = bp.get();
+      _len += added;
+      ptr_node* const node = bp.release();
+      _buffers.push_back(*node);
+    }
+    /// Wrap `r` in a new node, append it and make it the carriage. There
+    /// is no zero-length check in this overload.
+    void push_back(raw* const r) {
+      _buffers.push_back(*ptr_node::create(r).release());
+      ptr_node& tail = _buffers.back();
+      _carriage = &tail;
+      _len += tail.length();
+    }
+    /// Give up ownership of `r` and append it via the raw* overload.
+    void push_back(ceph::unique_leakable_ptr<raw> r) {
+      raw* const released = r.release();
+      push_back(released);
+    }
+
+    void zero();
+    void zero(unsigned o, unsigned l);
+
+    bool is_contiguous() const;
+    void rebuild();
+    void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb);
+    bool rebuild_aligned(unsigned align);
+    // max_buffers = 0 mean don't care _buffers.size(), other
+    // must make _buffers.size() <= max_buffers after rebuilding.
+    bool rebuild_aligned_size_and_memory(unsigned align_size,
+					 unsigned align_memory,
+					 unsigned max_buffers = 0);
+    bool rebuild_page_aligned();
+
+    void reserve(size_t prealloc);
+
+    // assignment-op with move semantics
+    const static unsigned int CLAIM_DEFAULT = 0;
+    const static unsigned int CLAIM_ALLOW_NONSHAREABLE = 1;
+
+    void claim(list& bl, unsigned int flags = CLAIM_DEFAULT);
+    void claim_append(list& bl, unsigned int flags = CLAIM_DEFAULT);
+    // only for bl is bufferlist::page_aligned_appender
+    void claim_append_piecewise(list& bl);
+
+    // copy with explicit volatile-sharing semantics
+    /// Replace the content with nodes created from bl's nodes and take
+    /// over bl's length. Sharing a list with itself is a no-op.
+    void share(const list& bl)
+    {
+      if (this == &bl) {
+	return;
+      }
+      clear();
+      for (auto it = bl._buffers.begin(); it != bl._buffers.end(); ++it) {
+	ptr_node* const shared = ptr_node::create(*it).release();
+	_buffers.push_back(*shared);
+      }
+      _len = bl._len;
+    }
+
+#ifdef HAVE_SEASTAR
+    /// convert the bufferlist into a network packet
+    operator seastar::net::packet() &&;
+#endif
+
+    iterator begin() {  // iterator positioned at offset 0
+      return iterator(this, 0);
+    }
+    iterator end() {  // iterator at offset _len, positioned on _buffers.end()
+      return iterator(this, _len, _buffers.end(), 0);
+    }
+
+    const_iterator begin() const {  // const counterpart of begin()
+      return const_iterator(this, 0);
+    }
+    const_iterator cbegin() const {  // same as the const begin()
+      return begin();
+    }
+    const_iterator end() const {  // const counterpart of end()
+      return const_iterator(this, _len, _buffers.end(), 0);
+    }
+
+    // crope lookalikes.
+    // **** WARNING: these are horribly inefficient for large bufferlists. ****
+    void copy(unsigned off, unsigned len, char *dest) const;
+    void copy(unsigned off, unsigned len, list &dest) const;
+    void copy(unsigned off, unsigned len, std::string& dest) const;
+    void copy_in(unsigned off, unsigned len, const char *src, bool crc_reset = true);
+    void copy_in(unsigned off, unsigned len, const list& src);
+
+    void append(char c);
+    void append(const char *data, unsigned len);
+    void append(std::string s) {  // appends s.length() bytes of s; s is taken by value
+      append(s.data(), s.length());
+    }
+#if __cplusplus >= 201703L
+    // To forcibly disambiguate between string and string_view in the
+    // case of arrays
+    template<std::size_t N>
+    void append(const char (&s)[N]) {
+      append(s, N);  // all N array elements are appended, including any terminating NUL
+    }
+    void append(const char* s) {
+      append(s, strlen(s));  // length taken with strlen, so s must be NUL-terminated
+    }
+    void append(std::string_view s) {
+      append(s.data(), s.length());
+    }
+#endif // __cplusplus >= 201703L
+    void append(const ptr& bp);
+    void append(ptr&& bp);
+    void append(const ptr& bp, unsigned off, unsigned len);
+    void append(const list& bl);
+    void append(std::istream& in);
+    contiguous_filler append_hole(unsigned len);
+    void append_zero(unsigned len);
+    void prepend_zero(unsigned len);
+
+    reserve_t obtain_contiguous_space(unsigned len);
+
+    /*
+     * get a char
+     */
+    const char& operator[](unsigned n) const;
+    char *c_str();
+    std::string to_str() const;
+
+    void substr_of(const list& other, unsigned off, unsigned len);
+
+    // funky modifier
+    void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */);
+    void write(int off, int len, std::ostream& out) const;
+
+    void encode_base64(list& o);
+    void decode_base64(list& o);
+
+    void write_stream(std::ostream &out) const;
+    void hexdump(std::ostream &out, bool trailing_newline = true) const;
+    int read_file(const char *fn, std::string *error);
+    ssize_t read_fd(int fd, size_t len);
+    int write_file(const char *fn, int mode=0644);
+    int write_fd(int fd) const;
+    int write_fd(int fd, uint64_t offset) const;
+    template<typename VectorT>
+    void prepare_iov(VectorT *piov) const {  // fill *piov with one (iov_base, iov_len) entry per buffer
+#ifdef __CEPH__
+      ceph_assert(_buffers.size() <= IOV_MAX);
+#else
+      assert(_buffers.size() <= IOV_MAX);
+#endif
+      piov->resize(_buffers.size());
+      unsigned n = 0;  // index of the entry being filled
+      for (auto& p : _buffers) {
+	(*piov)[n].iov_base = (void *)p.c_str();
+	(*piov)[n].iov_len = p.length();
+	++n;
+      }
+    }
+    uint32_t crc32c(uint32_t crc) const;
+    void invalidate_crc();
+    sha1_digest_t sha1();
+
+    // These functions return a bufferlist with a pointer to a single
+    // static buffer. They /must/ not outlive the memory they
+    // reference.
+    static list static_from_mem(char* c, size_t l);
+    static list static_from_cstring(char* c);
+    static list static_from_string(std::string& s);
+  };
+
+} // inline namespace v14_2_0
+
+  /*
+   * efficient hash of one or more bufferlists
+   */
+
+  class hash {
+    uint32_t crc;  // running crc32c value
+
+  public:
+    hash() : crc(0) { }  // start from a zero seed
+    // cppcheck-suppress noExplicitConstructor
+    hash(uint32_t init) : crc(init) { }  // start from a caller-supplied seed
+
+    void update(const buffer::list& bl) {  // fold bl into the running value
+      crc = bl.crc32c(crc);
+    }
+
+    uint32_t digest() const {  // current value; const so it can be read from a const hash
+      return crc;
+    }
+  };
+
+inline bool operator>(const bufferlist& l, const bufferlist& r) {  // bytewise l > r; a strict prefix sorts first
+  for (unsigned p = 0; ; p++) {
+    if (l.length() > p && r.length() == p) return true;   // r ran out first
+    if (l.length() == p) return false;                     // l ran out (or both did)
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+inline bool operator>=(const bufferlist& l, const bufferlist& r) {  // as operator>, but equal lists compare true
+  for (unsigned p = 0; ; p++) {
+    if (l.length() > p && r.length() == p) return true;   // r ran out first
+    if (r.length() == p && l.length() == p) return true;  // both ran out together: equal
+    if (l.length() == p && r.length() > p) return false;  // l ran out first
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+
+inline bool operator==(const bufferlist &l, const bufferlist &r) {  // same length and same byte at every offset
+  if (l.length() != r.length())
+    return false;
+  for (unsigned p = 0; p < l.length(); p++) {
+    if (l[p] != r[p])
+      return false;
+  }
+  return true;
+}
+inline bool operator<(const bufferlist& l, const bufferlist& r) {  // defined through operator> with swapped operands
+  return r > l;
+}
+inline bool operator<=(const bufferlist& l, const bufferlist& r) {  // defined through operator>= with swapped operands
+  return r >= l;
+}
+
+
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+std::ostream& operator<<(std::ostream& out, const buffer::raw &r);
+
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
+
+std::ostream& operator<<(std::ostream& out, const buffer::error& e);
+
+inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) {  // stream-style update; returns l for chaining
+  l.update(r);
+  return l;
+}
+
+} // namespace buffer
+
+#if defined(HAVE_XIO)
+xio_reg_mem* get_xio_mp(const buffer::ptr& bp);
+#endif
+
+} // namespace ceph
+
+#endif
diff --git a/src/include/buffer_fwd.h b/src/include/buffer_fwd.h
new file mode 100644
index 00000000..7fac5963
--- /dev/null
+++ b/src/include/buffer_fwd.h
@@ -0,0 +1,19 @@
+#ifndef BUFFER_FWD_H
+#define BUFFER_FWD_H
+
+namespace ceph {
+  namespace buffer {
+    inline namespace v14_2_0 {  // ptr and list are declared inside the inline versioned namespace
+      class ptr;
+      class list;
+    }
+    class hash;  // hash is declared directly in ceph::buffer
+  }
+
+  using bufferptr = buffer::ptr;  // short aliases in namespace ceph
+  using bufferlist = buffer::list;
+  using bufferhash = buffer::hash;
+}
+
+#endif
+
diff --git a/src/include/buffer_raw.h b/src/include/buffer_raw.h
new file mode 100644
index 00000000..7557795c
--- /dev/null
+++ b/src/include/buffer_raw.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 20127 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BUFFER_RAW_H
+#define CEPH_BUFFER_RAW_H
+
+#include <atomic>
+#include <map>
+#include <utility>
+#include <type_traits>
+#include "include/buffer.h"
+#include "include/mempool.h"
+#include "include/spinlock.h"
+
+namespace ceph::buffer {
+inline namespace v14_2_0 {
+
+  class raw {  // backing storage object: data pointer, length, reference count and mempool id
+  public:
+    // In the future we might want to have a slab allocator here with few
+    // embedded slots. This would allow to avoid the "if" in dtor of ptr_node.
+    std::aligned_storage<sizeof(ptr_node),
+			 alignof(ptr_node)>::type bptr_storage;
+    char *data;  // payload start; nullptr when built by the length-only constructor
+    unsigned len;  // payload length in bytes, as reported to the mempool
+    std::atomic<unsigned> nref { 0 };
+    int mempool;  // index of the mempool currently charged for this buffer
+
+    std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()};  // range of the cached crc; max/max is what invalidate_crc() stores
+    std::pair<uint32_t, uint32_t> last_crc_val;  // only handed out by get_crc() after last_crc_offset matched
+
+    mutable ceph::spinlock crc_spinlock;  // taken by get_crc(), set_crc() and invalidate_crc()
+
+    explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
+      : data(nullptr), len(l), nref(0), mempool(mempool) {
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);  // charge one item of len bytes
+    }
+    raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
+      : data(c), len(l), nref(0), mempool(mempool) {
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);  // charge one item of len bytes
+    }
+    virtual ~raw() {
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+	-1, -static_cast<int64_t>(len));  // widen before negating: -(int)len flips sign or overflows for len >= 2^31
+    }
+
+    void _set_len(unsigned l) {  // change len, moving the pool's byte count from the old to the new length
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+	-1, -static_cast<int64_t>(len));  // widened for the same reason as in the destructor
+      len = l;
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
+    }
+
+    void reassign_to_mempool(int pool) {  // move this buffer's accounting to another pool
+      if (pool == mempool) {
+	return;
+      }
+      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
+	-1, -static_cast<int64_t>(len));  // widened for the same reason as in the destructor
+      mempool = pool;
+      mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len);
+    }
+
+    void try_assign_to_mempool(int pool) {  // reassign only while still charged to mempool_buffer_anon
+      if (mempool == mempool::mempool_buffer_anon) {
+	reassign_to_mempool(pool);
+      }
+    }
+
+private:
+    // no copying.
+    // cppcheck-suppress noExplicitConstructor
+    raw(const raw &other) = delete;
+    const raw& operator=(const raw &other) = delete;
+public:
+    char *get_data() {
+      return data;
+    }
+    virtual raw* clone_empty() = 0;
+    ceph::unique_leakable_ptr<raw> clone() {  // clone_empty() followed by a copy of the len payload bytes
+      raw* const c = clone_empty();
+      memcpy(c->data, data, len);
+      return ceph::unique_leakable_ptr<raw>(c);
+    }
+    virtual bool is_shareable() const {
+      // true if safe to reference/share the existing buffer copy
+      // false if it is not safe to share the buffer, e.g., due to special
+      // and/or registered memory that is scarce
+      return true;
+    }
+    bool get_crc(const std::pair<size_t, size_t> &fromto,
+		 std::pair<uint32_t, uint32_t> *crc) const {  // true and *crc set only when fromto equals the cached range
+      std::lock_guard lg(crc_spinlock);
+      if (last_crc_offset == fromto) {
+        *crc = last_crc_val;
+        return true;
+      }
+      return false;
+    }
+    void set_crc(const std::pair<size_t, size_t> &fromto,
+		 const std::pair<uint32_t, uint32_t> &crc) {  // replace the single cached (range, value) entry
+      std::lock_guard lg(crc_spinlock);
+      last_crc_offset = fromto;
+      last_crc_val = crc;
+    }
+    void invalidate_crc() {  // reset the cached range to max/max
+      std::lock_guard lg(crc_spinlock);
+      last_crc_offset.first = std::numeric_limits<size_t>::max();
+      last_crc_offset.second = std::numeric_limits<size_t>::max();
+    }
+  };
+
+} // inline namespace v14_2_0
+} // namespace ceph::buffer
+
+#endif // CEPH_BUFFER_RAW_H
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
new file mode 100644
index 00000000..85268543
--- /dev/null
+++ b/src/include/byteorder.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <type_traits>
+#include "acconfig.h"
+#include "int_types.h"
+
+
+#ifdef __GNUC__
+template<typename T>  // 2-byte values: compiler builtin
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  return __builtin_bswap16(val);
+}
+template<typename T>  // 4-byte values: compiler builtin
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  return __builtin_bswap32(val);
+}
+template<typename T>  // 8-byte values: compiler builtin
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  return __builtin_bswap64(val);
+}
+#else
+template<typename T>  // portable fallback, 2-byte values
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+  return static_cast<T>((static_cast<uint16_t>(val) >> 8) | (static_cast<uint16_t>(val) << 8));  // shift an unsigned copy so a signed T is not sign-extended
+}
+template<typename T>  // portable fallback, 4-byte values
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+  const uint32_t u = static_cast<uint32_t>(val);  // unsigned copy: no sign extension on >>, no signed overflow on <<
+  return ((u >> 24) |
+	  ((u >> 8)  & 0xff00) |
+	  ((u << 8)  & 0xff0000) | (u << 24));
+}
+template<typename T>  // portable fallback, 8-byte values
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+  const uint64_t u = static_cast<uint64_t>(val);  // unsigned copy: no sign extension on >>, no signed overflow on <<
+  return ((u >> 56) |
+	  ((u >> 40) & 0xff00ull) |
+	  ((u >> 24) & 0xff0000ull) |
+	  ((u >> 8)  & 0xff000000ull) |
+	  ((u << 8)  & 0xff00000000ull) |
+	  ((u << 24) & 0xff0000000000ull) |
+	  ((u << 40) & 0xff000000000000ull) | (u << 56));
+}
+#endif
+
+// mswab == maybe swab (if not LE)
+#ifdef CEPH_BIG_ENDIAN
+template<typename T>
+inline T mswab(T val) {  // CEPH_BIG_ENDIAN build: byte-swap via swab()
+  return swab(val);
+}
+#else
+template<typename T>
+inline T mswab(T val) {  // otherwise: the value is returned unchanged
+  return val;
+}
+#endif
+
+template<typename T>
+struct ceph_le {  // packed wrapper that keeps a T in mswab()-converted form
+  T v;  // stored representation, i.e. mswab(logical value)
+  ceph_le<T>& operator=(T nv) {  // store nv, converting with mswab
+    v = mswab(nv);
+    return *this;
+  }
+  operator T() const { return mswab(v); }  // read back, applying mswab again
+} __attribute__ ((packed));
+
+template<typename T>
+inline bool operator==(ceph_le<T> a, ceph_le<T> b) {  // compares the stored v members directly, without mswab
+  return a.v == b.v;
+}
+
+using ceph_le64 = ceph_le<__u64>;
+using ceph_le32 = ceph_le<__u32>;
+using ceph_le16 = ceph_le<__u16>;
+
+inline ceph_le64 init_le64(__u64 x) {  // build a ceph_le64 holding x
+  ceph_le64 v;
+  v = x;  // goes through ceph_le::operator=, which applies mswab
+  return v;
+}
+inline ceph_le32 init_le32(__u32 x) {  // build a ceph_le32 holding x
+  ceph_le32 v;
+  v = x;  // goes through ceph_le::operator=, which applies mswab
+  return v;
+}
+inline ceph_le16 init_le16(__u16 x) {  // build a ceph_le16 holding x
+  ceph_le16 v;
+  v = x;  // goes through ceph_le::operator=, which applies mswab
+  return v;
+}
+
+  /*
+#define cpu_to_le64(x) (x)
+#define cpu_to_le32(x) (x)
+#define cpu_to_le16(x) (x)
+  */
+#define le64_to_cpu(x) ((uint64_t)(x))  // argument parenthesized so the cast covers the whole expression
+#define le32_to_cpu(x) ((__u32)(x))
+#define le16_to_cpu(x) ((__u16)(x))
diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h
new file mode 100644
index 00000000..36d6c430
--- /dev/null
+++ b/src/include/ceph_assert.h
@@ -0,0 +1,147 @@
+#ifndef CEPH_ASSERT_H
+#define CEPH_ASSERT_H
+
+#include <cstdlib>
+#include <string>
+
+#if defined(__linux__)
+#include <features.h>
+
+#ifndef __STRING
+# define __STRING(x) #x
+#endif
+
+#elif defined(__FreeBSD__)
+#include <sys/cdefs.h>
+#define	__GNUC_PREREQ(major, minor)	__GNUC_PREREQ__(major, minor)
+#elif defined(__sun) || defined(_AIX)
+#include "include/compat.h"
+#include <assert.h>
+#endif
+
+#ifdef __CEPH__
+# include "acconfig.h"
+#endif
+
+class CephContext;
+
+namespace ceph {
+
+struct BackTrace;
+
+/*
+ * Select a function-name variable based on compiler tests, and any compiler
+ * specific overrides.
+ */
+#if defined(HAVE_PRETTY_FUNC)
+# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(HAVE_FUNC)
+# define __CEPH_ASSERT_FUNCTION __func__
+#else
+# define __CEPH_ASSERT_FUNCTION ((__const char *) 0)
+#endif
+
+extern void register_assert_context(CephContext *cct);
+
+struct assert_data {  // the details of one assertion site, bundled for __ceph_assert_fail(const assert_data&)
+  const char *assertion;  // stringified expression (ceph_assert passes __STRING(expr))
+  const char *file;  // ceph_assert passes __FILE__
+  const int line;  // ceph_assert passes __LINE__
+  const char *function;  // ceph_assert passes __CEPH_ASSERT_FUNCTION
+};
+
+extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function)
+  __attribute__ ((__noreturn__));
+extern void __ceph_assert_fail(const assert_data &ctx)
+  __attribute__ ((__noreturn__));
+
+extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...)
+  __attribute__ ((__noreturn__));
+extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function);
+
+[[noreturn]] void __ceph_abort(const char *file, int line, const char *func,
+                               const std::string& msg);
+
+[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func,
+                                const char* msg, ...);
+
+#define _CEPH_ASSERT_VOID_CAST static_cast<void>
+
+#define assert_warn(expr)							\
+  ((expr)								\
+   ? _CEPH_ASSERT_VOID_CAST (0)					\
+   : __ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
+
+}
+
+using namespace ceph;
+
+
+/*
+ * ceph_abort aborts the program with a nice backtrace.  Whatever arguments
+ * it is given are discarded; the message is always "abort() called".
+ * Currently, it's the same as assert(0), but we may one day make assert a
+ * debug-only thing, like it is in many projects.
+ */
+#define ceph_abort(msg, ...)                                            \
+  __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called")
+
+#define ceph_abort_msg(msg)                                             \
+  __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg) 
+
+#define ceph_abort_msgf(...)                                             \
+  __ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)
+
+#ifdef __SANITIZE_ADDRESS__  // ASan build: pass the site details as plain arguments
+#define ceph_assert(expr)                           \
+  do {                                              \
+    ((expr))                                        \
+    ? _CEPH_ASSERT_VOID_CAST (0)                    \
+    : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+  } while (false)
+#else  // otherwise: keep the site details in a function-local static assert_data
+#define ceph_assert(expr)							\
+  do { static const ceph::assert_data assert_data_ctx = \
+   {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+   ((expr) \
+   ? _CEPH_ASSERT_VOID_CAST (0) \
+   : __ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assert currently doesn't either, but in the future it might.)
+#ifdef __SANITIZE_ADDRESS__  // ASan build: pass the site details as plain arguments
+#define ceph_assert_always(expr)                    \
+  do {                                              \
+    ((expr))                                        \
+    ? _CEPH_ASSERT_VOID_CAST (0)                    \
+    : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+  } while(false)
+#else  // otherwise: keep the site details in a function-local static assert_data
+#define ceph_assert_always(expr)							\
+  do { static const ceph::assert_data assert_data_ctx = \
+   {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+   ((expr) \
+   ? _CEPH_ASSERT_VOID_CAST (0) \
+   : __ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// Named by analogy with printf.  Along with an expression, takes a format
+// string and parameters which are printed if the assertion fails.
+#define assertf(expr, ...)                  \
+  ((expr)								\
+   ? _CEPH_ASSERT_VOID_CAST (0)					\
+   : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+#define ceph_assertf(expr, ...)                  \
+  ((expr)								\
+   ? _CEPH_ASSERT_VOID_CAST (0)					\
+   : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assertf currently doesn't either, but in the future it might.)
+#define ceph_assertf_always(expr, ...)                  \
+  ((expr)								\
+   ? _CEPH_ASSERT_VOID_CAST (0)					\
+   : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+#endif
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
new file mode 100644
index 00000000..6fec3a0c
--- /dev/null
+++ b/src/include/ceph_features.h
@@ -0,0 +1,279 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+#include "sys/types.h"
+
+/*
+ * Each time we reclaim bits for reuse we need to specify another
+ * bitmask that, if all bits are set, indicates we have the new
+ * incarnation of that feature.  Base case is 1 (first use)
+ */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57)              // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name)			\
+	const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit);		\
+	const static uint64_t CEPH_FEATUREMASK_##name =			\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored but still advertised by release *when*
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+	const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+	const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name =		\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored by release *unused* and not advertised by
+// release *unadvertised*
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+// test for a feature.  this test is safer than a typical mask against
+// the bit because it ensures that we have the bit AND the marker for the
+// bit's incarnation.  this must be used in any case where the features
+// bits may include an old meaning of the bit.
+#define HAVE_FEATURE(x, name)				\
+	(((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel).  For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ *  - In the first phase we indicate that a feature is DEPRECATED as of
+ *    a particular release.  This is the first major release X (say,
+ *    jewel) that does not depend on its peers advertising the feature.
+ *    That is, it safely assumes its peers all have the feature.  We
+ *    indicate this with the DEPRECATED macro.  For example,
+ *
+ *      DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ *    because 10.2.z (jewel) did not care if its peers advertised this
+ *    feature bit.
+ *
+ *  - In the second phase we stop advertising the bit and call it
+ *    RETIRED.  This can normally be done in the *next* major release
+ *    following the one in which we marked the feature DEPRECATED.  In
+ *    the above example, for 12.0.z (luminous) we can say:
+ *
+ *      DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+ *
+ *  - The bit can be reused in the first post-luminous release, 13.0.z
+ *    (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+/*
+ * Notes on the kernel client:
+ *
+ * - "X" means that the feature bit has been advertised and supported
+ *   since kernel X
+ *
+ * - "X req" means that the feature bit has been advertised and required
+ *   since kernel X
+ *
+ * The remaining feature bits are not and have never been used by the
+ * kernel client.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)        // 2.6.35 req
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)            // 2.6.36
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)       // 4.6 req
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)    // 3.10 req
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)    // 2.6.38
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)           // 3.9 req
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)          // 3.9 req
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)           // 3.9 req
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(16, 3, SERVER_O)
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS)
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)   // 3.6
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)  // 4.13
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT)  // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF)    // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP)  // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST)
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)         // 3.19 req (unless nocephx_require_signatures)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2)
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)  // 3.9
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)           // 4.7
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)    // 3.9
+DEFINE_CEPH_FEATURE_DEPRECATED(31, 1, MON_SINGLE_PAXOS, NAUTILUS)
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)    // 3.14
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)         // 3.14
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)      // 3.14
+DEFINE_CEPH_FEATURE_DEPRECATED(38, 1, OSD_ERASURE_CODES, MIMIC)
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)       // 3.15
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)  // 3.19
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)  // 3.15
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)  // 4.3 (for consistency)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)   // 4.13
+DEFINE_CEPH_FEATURE_DEPRECATED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)        // 4.17
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)         // 4.1
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE_DEPRECATED(50, 1, MON_METADATA, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(54, 1, OSD_HITSET_GMT, MIMIC)
+DEFINE_CEPH_FEATURE_DEPRECATED(55, 1, HAMMER_0_94_4, MIMIC)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13
+DEFINE_CEPH_FEATURE_DEPRECATED(57, 1, MON_ROUTE_OSDMAP, MIMIC) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)  // 4.5
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2)         // 4.19, *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED)           // do not use; used as a sentinel
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
+
+/*
+ * Features supported.  Should be everything above.
+ */
+#define CEPH_FEATURES_ALL		 \
+	(CEPH_FEATURE_UID |		 \
+	 CEPH_FEATURE_NOSRCADDR |	 \
+	 CEPH_FEATURE_FLOCK |		 \
+	 CEPH_FEATURE_SUBSCRIBE2 |	 \
+	 CEPH_FEATURE_MONNAMES |	 \
+	 CEPH_FEATURE_RECONNECT_SEQ |	 \
+	 CEPH_FEATURE_DIRLAYOUTHASH |	 \
+	 CEPH_FEATURE_OBJECTLOCATOR |	 \
+	 CEPH_FEATURE_PGID64 |		 \
+	 CEPH_FEATURE_INCSUBOSDMAP |	 \
+	 CEPH_FEATURE_PGPOOL3 |		 \
+	 CEPH_FEATURE_OSDREPLYMUX |	 \
+	 CEPH_FEATURE_OSDENC |		 \
+	 CEPH_FEATURE_MONENC |		 \
+	 CEPH_FEATURE_CRUSH_TUNABLES |	 \
+	 CEPH_FEATURE_MSG_AUTH |	     \
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |	     \
+	 CEPH_FEATURE_CREATEPOOLID |	     \
+	 CEPH_FEATURE_REPLY_CREATE_INODE |   \
+	 CEPH_FEATURE_MDSENC |			\
+	 CEPH_FEATURE_OSDHASHPSPOOL |       \
+	 CEPH_FEATURE_NEW_OSDOP_ENCODING |        \
+         CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+	 DEPRECATED_CEPH_FEATURE_MON_SINGLE_PAXOS |    \
+	 CEPH_FEATURE_OSD_CACHEPOOL |	    \
+	 CEPH_FEATURE_CRUSH_V2 |	    \
+	 CEPH_FEATURE_EXPORT_PEER |	    \
+         DEPRECATED_CEPH_FEATURE_OSD_ERASURE_CODES |   \
+	 CEPH_FEATURE_OSDMAP_ENC |          \
+	 CEPH_FEATURE_MDS_INLINE_DATA |	    \
+	 CEPH_FEATURE_CRUSH_TUNABLES3 |	    \
+	 CEPH_FEATURE_OSD_PRIMARY_AFFINITY |	\
+	 CEPH_FEATURE_MSGR_KEEPALIVE2 |	\
+	 CEPH_FEATURE_OSD_POOLRESEND |	\
+         DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 |	\
+	 CEPH_FEATURE_OSD_FADVISE_FLAGS |     \
+	 CEPH_FEATURE_MDS_QUOTA | \
+         CEPH_FEATURE_CRUSH_V4 |	     \
+	 DEPRECATED_CEPH_FEATURE_MON_METADATA |			 \
+	 DEPRECATED_CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT |		 \
+         DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 |		 \
+         DEPRECATED_CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES |			\
+	 DEPRECATED_CEPH_FEATURE_OSD_HITSET_GMT |				 \
+	 DEPRECATED_CEPH_FEATURE_HAMMER_0_94_4 |		 \
+	 CEPH_FEATURE_MON_STATEFUL_SUB |	 \
+	 DEPRECATED_CEPH_FEATURE_MON_ROUTE_OSDMAP |	 \
+	 CEPH_FEATURE_CRUSH_TUNABLES5 |	    \
+	 CEPH_FEATURE_SERVER_JEWEL |  \
+	 CEPH_FEATURE_FS_FILE_LAYOUT_V2 |		 \
+	 CEPH_FEATURE_SERVER_KRAKEN |	\
+	 CEPH_FEATURE_FS_BTIME |			 \
+	 CEPH_FEATURE_FS_CHANGE_ATTR |			 \
+	 CEPH_FEATURE_MSG_ADDR2 | \
+	 CEPH_FEATURE_SERVER_LUMINOUS |		\
+	 CEPH_FEATURE_RESEND_ON_SPLIT |		\
+	 CEPH_FEATURE_RADOS_BACKOFF |		\
+	 CEPH_FEATURE_OSD_RECOVERY_DELETES |	\
+	 CEPH_FEATURE_SERVER_MIMIC |		\
+	 CEPH_FEATURE_RECOVERY_RESERVATION_2 |	\
+	 CEPH_FEATURE_SERVER_NAUTILUS |		\
+	 CEPH_FEATURE_CEPHX_V2 | \
+	 CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
+	 CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \
+	 0ULL)
+
+#define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
+
+/*
+ * crush related features
+ */
+#define CEPH_FEATURES_CRUSH			\
+	(CEPH_FEATURE_CRUSH_TUNABLES |		\
+	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
+	 CEPH_FEATURE_CRUSH_TUNABLES3 |		\
+	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
+	 CEPH_FEATURE_CRUSH_V2 |		\
+	 CEPH_FEATURE_CRUSH_V4 |		\
+	 CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS)
+
+/*
+ * make sure we don't try to use the reserved features
+ */
+#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 1]))  /* size 1 when x holds: a zero-length array is a GNU extension, not ISO C/C++ */
+
+static inline void ____build_time_check_for_reserved_bits(void) {
+	CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL &
+			    (CEPH_FEATURE_RESERVED |
+			     DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN)) == 0);
+}
+
+#endif
diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h
new file mode 100644
index 00000000..5babb8e9
--- /dev/null
+++ b/src/include/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+	__u32 keep = (0xffffffu << (24-b)) & 0xffffffu;	/* top b bits of the 24-bit value */
+	return (b << 24) | (v & keep);	/* "bits" in the upper 8 bits, masked value below */
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+	return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+	return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+	return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+	/* is sub as specific as us, and contained by us? */
+	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{	/* parent = one fewer bit; (mask << 1) clears the child's lowest value bit */
+	return ceph_frag_make(ceph_frag_bits(f) - 1,	/* NOTE(review): unsigned wrap if bits == 0 (root) */
+			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{	/* 1 iff f is not the root (bits > 0) and its lowest frag bit is set */
+	return ceph_frag_bits(f) > 0 &&	/* masked bit is 0 or 2^k, so test != 0, not == 1 */
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1,
+	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+	int newbits = ceph_frag_bits(f) + by;
+	return ceph_frag_make(newbits,
+			 ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+	return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+	return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
new file mode 100644
index 00000000..1c73ff37
--- /dev/null
+++ b/src/include/ceph_fs.h
@@ -0,0 +1,982 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2.1
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * The data structures defined here are shared between Linux kernel and
+ * user space.  Also, those data structures are maintained always in
+ * little-endian byte order, even on big-endian systems.  This is handled
+ * differently in kernel vs. user space.  For use as kernel headers, the
+ * little-endian fields need to use the __le16/__le32/__le64 types.  These
+ * are markers that indicate endian conversion routines must be used
+ * whenever such fields are accessed, which can be verified by checker
+ * tools like "sparse".  For use as user-space headers, the little-endian
+ * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++
+ * classes that implement automatic endian conversion on every access.
+ * To still allow for header sharing, this file uses the __le types, but
+ * redefines those to the ceph_ types when compiled in user space.
+ */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * subprotocol versions.  when specific message types or high-level
+ * protocols change, bump the affected components.  we version
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
+
+
+#define CEPH_INO_ROOT   1
+#define CEPH_INO_CEPH   2       /* hidden .ceph dir */
+#define CEPH_INO_LOST_AND_FOUND 4	/* reserved ino for use in recovery */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+	/* file -> object mapping */
+	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+				      of page size. */
+	__le32 fl_stripe_count;    /* over this many objects */
+	__le32 fl_object_size;     /* until objects are this big, then move to
+				      new objects */
+	__le32 fl_cas_hash;        /* UNUSED.  0 = none; 1 = sha256 */
+
+	/* pg -> disk layout */
+	__le32 fl_object_stripe_unit;  /* UNUSED.  for per-object parity, if any */
+
+	/* object -> pg layout */
+	__le32 fl_unused;       /* unused; used to be preferred primary for pg (-1 for none) */
+	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+struct ceph_dir_layout {
+	__u8   dl_dir_hash;   /* see ceph_hash.h for ids */
+	__u8   dl_unused1;
+	__u16  dl_unused2;
+	__u32  dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN	0x0
+#define CEPH_AUTH_NONE	 	0x1
+#define CEPH_AUTH_CEPHX	 	0x2
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC     0x1
+#define CEPH_CON_MODE_SECURE  0x2
+
+extern const char *ceph_con_mode_name(int con_mode);
+
+/*  For options with "_", like: GSS_GSS
+    which means: Mode/Protocol to validate "authentication_authorization",
+    where:
+      - Authentication: Verifying the identity of an entity.
+      - Authorization:  Verifying that an authenticated entity has
+                        the right to access a particular resource.
+*/ 
+#define CEPH_AUTH_GSS     0x4
+#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_MON_GET_OSDMAP         6
+#define CEPH_MSG_MON_METADATA           7
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH			17
+#define CEPH_MSG_AUTH_REPLY		18
+#define CEPH_MSG_MON_GET_VERSION        19
+#define CEPH_MSG_MON_GET_VERSION_REPLY  20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_RECLAIM		27
+#define CEPH_MSG_CLIENT_RECLAIM_REPLY   28
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+#define CEPH_MSG_CLIENT_QUOTA           0x314
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP                41
+#define CEPH_MSG_OSD_OP                 42
+#define CEPH_MSG_OSD_OPREPLY            43
+#define CEPH_MSG_WATCH_NOTIFY           44
+#define CEPH_MSG_OSD_BACKOFF            61
+
+/* FSMap subscribers (see all MDS clusters at once) */
+#define CEPH_MSG_FS_MAP                 45
+/* FSMapUser subscribers (get MDS clusters name->ID mapping) */
+#define CEPH_MSG_FS_MAP_USER		103
+
+/* watch-notify operations */
+enum {
+	CEPH_WATCH_EVENT_NOTIFY		  = 1, /* notifying watcher */
+	CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
+	CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
+};
+
+const char *ceph_watch_event_name(int o);
+
+/* pool operations */
+enum {
+  POOL_OP_CREATE			= 0x01,
+  POOL_OP_DELETE			= 0x02,
+  POOL_OP_AUID_CHANGE			= 0x03,
+  POOL_OP_CREATE_SNAP			= 0x11,
+  POOL_OP_DELETE_SNAP			= 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
+};
+
+struct ceph_mon_request_header {
+	__le64 have_version;
+	__le16 session_mon;
+	__le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+	__le64 kb, kb_used, kb_avail;
+	__le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+	struct ceph_fsid fsid;
+	__le64 version;
+	struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 __old_auid;  // obsolete
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+	struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME    1  /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+	__le64 start;
+	__u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+	__le32 duration;         /* seconds */
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_NOT_JOINABLE                 (1<<0)  /* standbys cannot join */
+#define CEPH_MDSMAP_DOWN                         (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */
+#define CEPH_MDSMAP_ALLOW_SNAPS                  (1<<1)  /* cluster allowed to create snapshots */
+/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */
+/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */
+#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS	     (1<<4)  /* cluster allowed to enable MULTIMDS
+                                                            and SNAPS at the same time */
+#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY         (1<<5)  /* cluster allowed to enable STANDBY_REPLAY */
+
+#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \
+			      CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+					  empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE   -9 /* Legacy, unused */
+#define CEPH_MDS_STATE_NULL         -10
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+					  operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+#define CEPH_MDS_STATE_DAMAGED      15 /* rank not replayable, need repair */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_IVERSION    16    /* mds internal */
+#define CEPH_LOCK_ISNAP       32
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY     16384 /* policy lock on dirs. MDS internal */
+
+/* client_session ops */
+enum {
+	CEPH_SESSION_REQUEST_OPEN,
+	CEPH_SESSION_OPEN,
+	CEPH_SESSION_REQUEST_CLOSE,
+	CEPH_SESSION_CLOSE,
+	CEPH_SESSION_REQUEST_RENEWCAPS,
+	CEPH_SESSION_RENEWCAPS,
+	CEPH_SESSION_STALE,
+	CEPH_SESSION_RECALL_STATE,
+	CEPH_SESSION_FLUSHMSG,
+	CEPH_SESSION_FLUSHMSG_ACK,
+	CEPH_SESSION_FORCE_RO,
+    // A response to REQUEST_OPEN indicating that the client should
+    // permanently desist from contacting the MDS
+	CEPH_SESSION_REJECT,
+        CEPH_SESSION_REQUEST_FLUSH_MDLOG
+};
+
+// flags for state reclaim
+#define CEPH_RECLAIM_RESET	1
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+	__le32 op;
+	__le64 seq;
+	struct ceph_timespec stamp;
+	__le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ *  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+	CEPH_MDS_OP_LOOKUP     = 0x00100,
+	CEPH_MDS_OP_GETATTR    = 0x00101,
+	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+	CEPH_MDS_OP_LOOKUPINO  = 0x00104,
+	CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+
+	CEPH_MDS_OP_SETXATTR   = 0x01105,
+	CEPH_MDS_OP_RMXATTR    = 0x01106,
+	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+	CEPH_MDS_OP_SETATTR    = 0x01108,
+	CEPH_MDS_OP_SETFILELOCK= 0x01109,
+	CEPH_MDS_OP_GETFILELOCK= 0x00110,
+	CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+	CEPH_MDS_OP_MKNOD      = 0x01201,
+	CEPH_MDS_OP_LINK       = 0x01202,
+	CEPH_MDS_OP_UNLINK     = 0x01203,
+	CEPH_MDS_OP_RENAME     = 0x01204,
+	CEPH_MDS_OP_MKDIR      = 0x01220,
+	CEPH_MDS_OP_RMDIR      = 0x01221,
+	CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+	CEPH_MDS_OP_CREATE     = 0x01301,
+	CEPH_MDS_OP_OPEN       = 0x00302,
+	CEPH_MDS_OP_READDIR    = 0x00305,
+
+	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+	CEPH_MDS_OP_MKSNAP     = 0x01400,
+	CEPH_MDS_OP_RMSNAP     = 0x01401,
+	CEPH_MDS_OP_LSSNAP     = 0x00402,
+	CEPH_MDS_OP_RENAMESNAP = 0x01403,
+
+	// internal op
+	CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
+	CEPH_MDS_OP_EXPORTDIR  = 0x01501,
+	CEPH_MDS_OP_FLUSH      = 0x01502,
+	CEPH_MDS_OP_ENQUEUE_SCRUB  = 0x01503,
+	CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
+	CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
+	CEPH_MDS_OP_UPGRADE_SNAPREALM = 0x01506
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+#ifndef CEPH_SETATTR_MODE
+#define CEPH_SETATTR_MODE	(1 << 0)
+#define CEPH_SETATTR_UID	(1 << 1)
+#define CEPH_SETATTR_GID	(1 << 2)
+#define CEPH_SETATTR_MTIME	(1 << 3)
+#define CEPH_SETATTR_ATIME	(1 << 4)
+#define CEPH_SETATTR_SIZE	(1 << 5)
+#define CEPH_SETATTR_CTIME	(1 << 6)
+#define CEPH_SETATTR_MTIME_NOW	(1 << 7)
+#define CEPH_SETATTR_ATIME_NOW	(1 << 8)
+#define CEPH_SETATTR_BTIME	(1 << 9)
+#endif
+#define CEPH_SETATTR_KILL_SGUID	(1 << 10)
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY          00000000
+#define CEPH_O_WRONLY          00000001
+#define CEPH_O_RDWR            00000002
+#define CEPH_O_CREAT           00000100
+#define CEPH_O_EXCL            00000200
+#define CEPH_O_TRUNC           00001000
+#define CEPH_O_LAZY            00020000
+#define CEPH_O_DIRECTORY       00200000
+#define CEPH_O_NOFOLLOW        00400000
+
+int ceph_flags_sys2wire(int flags);
+
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE  (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE  (1 << 31)
+
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS	(1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END		(1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE	(1<<8)
+#define CEPH_READDIR_HASH_ORDER		(1<<9)
+#define CEPH_READDIR_OFFSET_HASH       (1<<10)
+
+/* Note that this is embedded within ceph_mds_request_head_legacy. */
+union ceph_mds_request_args_legacy {
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+	} __attribute__ ((packed)) getattr;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+	} __attribute__ ((packed)) setattr;
+	struct {
+		__le32 frag;                 /* which dir fragment */
+		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
+		__le16 flags;
+               __le32 offset_hash;
+	} __attribute__ ((packed)) readdir;
+	struct {
+		__le32 mode;
+		__le32 rdev;
+	} __attribute__ ((packed)) mknod;
+	struct {
+		__le32 mode;
+	} __attribute__ ((packed)) mkdir;
+	struct {
+		__le32 flags;
+		__le32 mode;
+		__le32 stripe_unit;          /* layout for newly created file */
+		__le32 stripe_count;         /* ... */
+		__le32 object_size;
+		__le32 pool;                 /* if >= 0 and CREATEPOOLID feature */
+		__le32 mask;                 /* CEPH_CAP_* */
+		__le64 old_size;             /* if O_TRUNC */
+	} __attribute__ ((packed)) open;
+	struct {
+		__le32 flags;
+		__le32 osdmap_epoch; 	    /* use for set file/dir layout */
+	} __attribute__ ((packed)) setxattr;
+	struct {
+		struct ceph_file_layout layout;
+	} __attribute__ ((packed)) setlayout;
+	struct {
+		__u8 rule; /* currently fcntl or flock */
+		__u8 type; /* shared, exclusive, remove*/
+		__le64 owner; /* who requests/holds the lock */
+		__le64 pid; /* process id requesting the lock */
+		__le64 start; /* initial location to lock */
+		__le64 length; /* num bytes to lock from start */
+		__u8 wait; /* will caller wait for lock to become available? */
+	} __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head_legacy {
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args_legacy args;
+} __attribute__ ((packed));
+
+/*
+ * Note that this is embedded within ceph_mds_request_head. Also, compatibility
+ * with the ceph_mds_request_args_legacy must be maintained!
+ */
+union ceph_mds_request_args {
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+	} __attribute__ ((packed)) getattr;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+		struct ceph_timespec btime;
+	} __attribute__ ((packed)) setattr;
+	struct {
+		__le32 frag;                 /* which dir fragment */
+		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
+		__le16 flags;
+               __le32 offset_hash;
+	} __attribute__ ((packed)) readdir;
+	struct {
+		__le32 mode;
+		__le32 rdev;
+	} __attribute__ ((packed)) mknod;
+	struct {
+		__le32 mode;
+	} __attribute__ ((packed)) mkdir;
+	struct {
+		__le32 flags;
+		__le32 mode;
+		__le32 stripe_unit;          /* layout for newly created file */
+		__le32 stripe_count;         /* ... */
+		__le32 object_size;
+		__le32 pool;                 /* if >= 0 and CREATEPOOLID feature */
+		__le32 mask;                 /* CEPH_CAP_* */
+		__le64 old_size;             /* if O_TRUNC */
+	} __attribute__ ((packed)) open;
+	struct {
+		__le32 flags;
+		__le32 osdmap_epoch; 	    /* use for set file/dir layout */
+	} __attribute__ ((packed)) setxattr;
+	struct {
+		struct ceph_file_layout layout;
+	} __attribute__ ((packed)) setlayout;
+	struct {
+		__u8 rule; /* currently fcntl or flock */
+		__u8 type; /* shared, exclusive, remove*/
+		__le64 owner; /* who requests/holds the lock */
+		__le64 pid; /* process id requesting the lock */
+		__le64 start; /* initial location to lock */
+		__le64 length; /* num bytes to lock from start */
+		__u8 wait; /* will caller wait for lock to become available? */
+	} __attribute__ ((packed)) filelock_change;
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+		__le64 snapid;
+		__le64 parent;
+		__le32 hash;
+	} __attribute__ ((packed)) lookupino;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_REQUEST_HEAD_VERSION	1
+
+/*
+ * Note that any change to this structure must ensure that it is compatible
+ * with ceph_mds_request_head_legacy.
+ */
+struct ceph_mds_request_head {
+	__le16 version;
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+	__le64 ino, cap_id;            /* ino and unique cap id */
+	__le32 caps, wanted;           /* new issued, wanted */
+	__le32 seq, issue_seq, mseq;
+	__le32 dname_seq;              /* if releasing a dentry lease, a */
+	__le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+static inline void
+copy_from_legacy_head(struct ceph_mds_request_head *head,
+			struct ceph_mds_request_head_legacy *legacy)
+{	/* copy all of *legacy into *head, starting at oldest_client_tid (the field after version) */
+	memcpy(&(head->oldest_client_tid), legacy, sizeof(*legacy));	/* head->version is not written here */
+}
+
+static inline void
+copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy,
+			struct ceph_mds_request_head *head)
+{	/* fill *legacy from the bytes of *head that begin at oldest_client_tid (version is skipped) */
+	memcpy(legacy, &(head->oldest_client_tid), sizeof(*legacy));	/* copies sizeof(*legacy) bytes only */
+}
+
+/* client reply */
+struct ceph_mds_reply_head {
+	__le32 op;
+	__le32 result;
+	__le32 mdsmap_epoch;
+	__u8 safe;                     /* true if committed to disk */
+	__u8 is_dentry, is_target;     /* true if dentry, target inode records
+					  are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+	__le32 frag;                   /* this frag splits... */
+	__le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+	__le32 nsplits;                /* num ceph_frag_tree_split records */
+	struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+	__le32 caps, wanted;           /* caps issued, wanted */
+	__le64 cap_id;
+	__le32 seq, mseq;
+	__le64 realm;                  /* snap realm */
+	__u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH	(1 << 0)	/* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE	(1 << 1)        /* ask client to release the cap */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+	__le16 mask;            /* lease type(s) */
+	__le32 duration_ms;     /* lease duration */
+	__le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+	__le32 frag;            /* fragment */
+	__le32 auth;            /* auth mds, if this is a delegation point */
+	__le32 ndist;           /* number of mds' this is replicated on */
+	__le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL		1
+#define CEPH_LOCK_FLOCK		2
+#define CEPH_LOCK_FCNTL_INTR	3
+#define CEPH_LOCK_FLOCK_INTR	4
+
+#define CEPH_LOCK_SHARED   1
+#define CEPH_LOCK_EXCL     2
+#define CEPH_LOCK_UNLOCK   4
+
+struct ceph_filelock {
+	__le64 start;/* file offset to start lock at */
+	__le64 length; /* num bytes to lock; 0 for all following start */
+	__le64 client; /* which client holds the lock */
+	__le64 owner; /* who requests/holds the lock */
+	__le64 pid; /* process id holding the lock on the client */
+	__u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+/* inline data state */
+#define CEPH_INLINE_NONE	((__u64)-1)
+#define CEPH_INLINE_MAX_SIZE	CEPH_MIN_STRIPE_UNIT
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+/* note: these definitions are duplicated in mds/locks.c */
+#define CEPH_CAP_GSHARED     1  /* client can read */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS  2
+#define CEPH_CAP_FILE_BITS    8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    ((x) << CEPH_CAP_SFILE)  /* shift generic cap bits into the file field; (x) so a | b shifts as a unit */
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
+				 CEPH_CAP_AUTH_SHARED |	\
+				 CEPH_CAP_LINK_SHARED |	\
+				 CEPH_CAP_FILE_SHARED |	\
+				 CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+				   CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT        CEPH_CAP_FILE_WREXTEND
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
+			      CEPH_CAP_LINK_SHARED |			\
+			      CEPH_CAP_XATTR_SHARED |			\
+			      CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
+			   CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
+			   CEPH_CAP_LINK_EXCL |		\
+			   CEPH_CAP_XATTR_EXCL |	\
+			   CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+                              CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
+			      CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+			   CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+			CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+	CEPH_CAP_OP_GRANT,         /* mds->client grant */
+	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+	CEPH_CAP_OP_UPDATE,        /* client->mds update */
+	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/* extra info for cap import/export */
+struct ceph_mds_cap_peer {
+	__le64 cap_id;
+	__le32 seq;
+	__le32 mseq;
+	__le32 mds;
+	__u8   flags;
+} __attribute__ ((packed));
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps_head {
+	__le32 op;                  /* CEPH_CAP_OP_* */
+	__le64 ino, realm;
+	__le64 cap_id;
+	__le32 seq, issue_seq;
+	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+	__le32 migrate_seq;
+	__le64 snap_follows;
+	__le32 snap_trace_len;
+
+	/* authlock */
+	__le32 uid, gid, mode;
+
+	/* linklock */
+	__le32 nlink;
+
+	/* xattrlock */
+	__le32 xattr_len;
+	__le64 xattr_version;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_body_legacy {
+	union {
+		/* all except export */
+		struct {
+			/* filelock */
+			__le64 size, max_size, truncate_size;
+			__le32 truncate_seq;
+			struct ceph_timespec mtime, atime, ctime;
+			struct ceph_file_layout layout;
+			__le32 time_warp_seq;
+		};
+		/* export message */
+		struct ceph_mds_cap_peer peer;
+	};
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+	__le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+	__le64 ino;
+	__le64 cap_id;
+	__le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+	__u8 action;            /* CEPH_MDS_LEASE_* */
+	__le16 mask;            /* which lease */
+	__le64 ino;
+	__le64 first, last;     /* snap range */
+	__le32 seq;
+	__le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+	__le32 flock_len;       /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 size;
+	struct ceph_timespec mtime, atime;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+	__le64 ino;     /* snap realm base */
+	__le64 seq;     /* snap seq for this snap realm */
+	__le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+	CEPH_SNAP_OP_CREATE,
+	CEPH_SNAP_OP_DESTROY,
+	CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+	__le32 op;                /* CEPH_SNAP_OP_* */
+	__le64 split;             /* ino to split off, if any */
+	__le32 num_split_inos;    /* # inos belonging to new child realm */
+	__le32 num_split_realms;  /* # child realms under new child realm */
+	__le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+	__le64 ino;           /* ino */
+	__le64 created;       /* snap: when created */
+	__le64 parent;        /* ino: parent realm */
+	__le64 parent_since;  /* snap: same parent since */
+	__le64 seq;           /* snap: version */
+	__le32 num_snaps;
+	__le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h
new file mode 100644
index 00000000..45881930
--- /dev/null
+++ b/src/include/ceph_fuse.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ */
+#ifndef CEPH_FUSE_H
+#define CEPH_FUSE_H
+
+#define FUSE_USE_VERSION 30
+#include "acconfig.h"
+#include <fuse.h>
+
+/* Invoke a fuse_fill_dir_t callback, appending zero flags when FUSE_VERSION >= 3.0. */
+static inline int filler_compat(fuse_fill_dir_t filler, void *buf,
+                                const char *name, const struct stat *stbuf,
+                                off_t off)
+{
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+  return filler(buf, name, stbuf, off, static_cast<enum fuse_fill_dir_flags>(0));
+#else
+  return filler(buf, name, stbuf, off);
+#endif
+}
+#endif /* CEPH_FUSE_H */
diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h
new file mode 100644
index 00000000..f9d80ac3
--- /dev/null
+++ b/src/include/ceph_hash.h
@@ -0,0 +1,14 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+extern bool ceph_str_hash_valid(int type);
+
+#endif
diff --git a/src/include/cephfs/ceph_ll_client.h b/src/include/cephfs/ceph_ll_client.h
new file mode 100644
index 00000000..4f3d4235
--- /dev/null
+++ b/src/include/cephfs/ceph_ll_client.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * scalable distributed file system
+ *
+ * Copyright (C) Jeff Layton <jlayton@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_CEPH_LL_CLIENT_H
+#define CEPH_CEPH_LL_CLIENT_H
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+class Fh;
+
+struct inodeno_t;
+struct vinodeno_t;
+typedef struct vinodeno_t vinodeno;
+
+#else /* __cplusplus */
+
+typedef struct Fh Fh;
+
+typedef struct inodeno_t {
+  uint64_t val;
+} inodeno_t;
+
+typedef struct _snapid_t {
+  uint64_t val;
+} snapid_t;
+
+typedef struct vinodeno_t {
+  inodeno_t ino;
+  snapid_t snapid;
+} vinodeno_t;
+
+#endif /* __cplusplus */
+
+/*
+ * Heavily borrowed from David Howells' draft statx patchset.
+ *
+ * Since the xstat patches are still a work in progress, we borrow its data
+ * structures and #defines to implement ceph_getattrx. Once the xstat stuff
+ * has been merged we should drop this and switch over to using that instead.
+ */
+struct ceph_statx {
+	uint32_t	stx_mask;
+	uint32_t	stx_blksize;
+	uint32_t	stx_nlink;
+	uint32_t	stx_uid;
+	uint32_t	stx_gid;
+	uint16_t	stx_mode;
+	uint64_t	stx_ino;
+	uint64_t	stx_size;
+	uint64_t	stx_blocks;
+	dev_t		stx_dev;
+	dev_t		stx_rdev;
+	struct timespec	stx_atime;
+	struct timespec	stx_ctime;
+	struct timespec	stx_mtime;
+	struct timespec	stx_btime;
+	uint64_t	stx_version;
+};
+
+#define CEPH_STATX_MODE		0x00000001U     /* Want/got stx_mode */
+#define CEPH_STATX_NLINK	0x00000002U     /* Want/got stx_nlink */
+#define CEPH_STATX_UID		0x00000004U     /* Want/got stx_uid */
+#define CEPH_STATX_GID		0x00000008U     /* Want/got stx_gid */
+#define CEPH_STATX_RDEV		0x00000010U     /* Want/got stx_rdev */
+#define CEPH_STATX_ATIME	0x00000020U     /* Want/got stx_atime */
+#define CEPH_STATX_MTIME	0x00000040U     /* Want/got stx_mtime */
+#define CEPH_STATX_CTIME	0x00000080U     /* Want/got stx_ctime */
+#define CEPH_STATX_INO		0x00000100U     /* Want/got stx_ino */
+#define CEPH_STATX_SIZE		0x00000200U     /* Want/got stx_size */
+#define CEPH_STATX_BLOCKS	0x00000400U     /* Want/got stx_blocks */
+#define CEPH_STATX_BASIC_STATS	0x000007ffU     /* The stuff in the normal stat struct */
+#define CEPH_STATX_BTIME	0x00000800U     /* Want/got stx_btime */
+#define CEPH_STATX_VERSION	0x00001000U     /* Want/got stx_version */
+#define CEPH_STATX_ALL_STATS	0x00001fffU     /* All supported stats */
+
+/*
+ * Compatibility macros until these defines make their way into glibc
+ */
+#ifndef AT_NO_ATTR_SYNC
+#define AT_NO_ATTR_SYNC		0x4000 /* Don't sync attributes with the server */
+#endif
+
+/*
+ * The statx interfaces only allow these flags. In order to allow us to add
+ * others in the future, we disallow setting any that aren't recognized.
+ */
+#define CEPH_REQ_FLAG_MASK		(AT_SYMLINK_NOFOLLOW|AT_NO_ATTR_SYNC)
+
+/* delegation recalls */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
+/* inode data/metadata invalidation */
+typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino,
+	      int64_t off, int64_t len);
+
+/* dentry invalidation */
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+					 vinodeno_t ino, const char *name,
+					 size_t len);
+
+/* remount entire fs */
+typedef int (*client_remount_callback_t)(void *handle);
+
+/* lock request interrupted */
+typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data);
+
+/* fetch umask of actor */
+typedef mode_t (*client_umask_callback_t)(void *handle);
+
+/* request that application release Inode references */
+typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino);
+
+/*
+ * The handle is an opaque value that gets passed to some callbacks. Any fields
+ * set to NULL will be left alone. There is no way to unregister callbacks.
+ */
+struct ceph_client_callback_args {
+  void *handle;
+  client_ino_callback_t ino_cb;
+  client_dentry_callback_t dentry_cb;
+  client_switch_interrupt_callback_t switch_intr_cb;
+  client_remount_callback_t remount_cb;
+  client_umask_callback_t umask_cb;
+  client_ino_release_t ino_release_cb;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_CEPH_LL_CLIENT_H */
+
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
new file mode 100755
index 00000000..c1668769
--- /dev/null
+++ b/src/include/cephfs/libcephfs.h
@@ -0,0 +1,1869 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIB_H
+#define CEPH_LIB_H
+
+#if defined(__linux__)
+#include <features.h>
+#endif
+#include <utime.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/socket.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#include "ceph_ll_client.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBCEPHFS_VER_MAJOR 10
+#define LIBCEPHFS_VER_MINOR 0
+#define LIBCEPHFS_VER_EXTRA 2
+/* Pack (maj, min, extra) into one integer; args are parenthesized for safe expansion. */
+#define LIBCEPHFS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
+
+/*
+ * If using glibc check that file offset is 64-bit.
+ */
+#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64)
+# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted
+#endif
+
+/*
+ * XXXX redeclarations from ceph_fs.h, rados.h, etc.  We need more of this
+ * in the interface, but shouldn't be re-typing it (and using different
+ * C data types).
+ */
+#ifndef __cplusplus
+
+#define CEPH_INO_ROOT  1
+#define CEPH_NOSNAP  ((uint64_t)(-2))
+
+struct ceph_file_layout {
+	/* file -> object mapping */
+	uint32_t fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+				      of page size. */
+	uint32_t fl_stripe_count;    /* over this many objects */
+	uint32_t fl_object_size;     /* until objects are this big, then move to
+				      new objects */
+	uint32_t fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+	/* pg -> disk layout */
+	uint32_t fl_object_stripe_unit;  /* for per-object parity, if any */
+
+	/* object -> pg layout */
+	uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+	uint32_t fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#endif /* ! __cplusplus */
+
+struct UserPerm;
+typedef struct UserPerm UserPerm;
+
+struct Inode;
+typedef struct Inode Inode;
+
+struct ceph_mount_info;
+struct ceph_dir_result;
+struct CephContext;
+
+/* setattr mask bits */
+#ifndef CEPH_SETATTR_MODE
+# define CEPH_SETATTR_MODE	1
+# define CEPH_SETATTR_UID	2
+# define CEPH_SETATTR_GID	4
+# define CEPH_SETATTR_MTIME	8
+# define CEPH_SETATTR_ATIME	16
+# define CEPH_SETATTR_SIZE	32
+# define CEPH_SETATTR_CTIME	64
+# define CEPH_SETATTR_MTIME_NOW	128
+# define CEPH_SETATTR_ATIME_NOW	256
+# define CEPH_SETATTR_BTIME	512
+#endif
+
+/* define error codes for the mount function */
+# define CEPHFS_ERROR_MON_MAP_BUILD 1000
+# define CEPHFS_ERROR_NEW_CLIENT 1002
+# define CEPHFS_ERROR_MESSENGER_START 1003
+
+/**
+ * Create a UserPerm credential object.
+ *
+ * Some calls (most notably, the ceph_ll_* ones), take a credential object
+ * that represents the credentials that the calling program is using. This
+ * function creates a new credential object for this purpose. Returns a
+ * pointer to the object, or NULL if it can't be allocated.
+ *
+ * Note that the gidlist array is used directly and is not copied. It must
+ * remain valid over the lifetime of the created UserPerm object.
+ *
+ * @param uid uid to be used
+ * @param gid gid to be used
+ * @param ngids number of gids in supplemental grouplist
+ * @param gidlist array of gid_t's in the list of groups
+ */
+UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist);
+
+/**
+ * Destroy a UserPerm credential object.
+ *
+ * @param perm pointer to object to be destroyed
+ *
+ * Currently this just frees the object. Note that the gidlist array is not
+ * freed. The caller must do so if it's necessary.
+ */
+void ceph_userperm_destroy(UserPerm *perm);
+
+/**
+ * Get a pointer to the default UserPerm object for the mount.
+ *
+ * @param cmount the mount info handle
+ *
+ * Every cmount has a default set of credentials. This returns a pointer to
+ * that object.
+ *
+ * Unlike with ceph_userperm_new, this object should not be freed.
+ */
+struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount);
+
+/**
+ * Set cmount's default permissions
+ *
+ * @param cmount the mount info handle
+ * @param perm permissions to set to default for mount
+ *
+ * Every cmount has a default set of credentials. This does a deep copy of
+ * the given permissions to the ones in the cmount. Must be done after
+ * ceph_init but before ceph_mount.
+ *
+ * Returns 0 on success, and -EISCONN if the cmount is already mounted.
+ */
+int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm);
+
+/**
+ * @defgroup libcephfs_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libcephfs.
+ *
+ * @{
+ */
+
+/**
+ * Get the version of libcephfs.
+ *
+ * The version number is major.minor.patch.
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param patch where to store the extra version number
+ */
+const char *ceph_version(int *major, int *minor, int *patch);
+
+/**
+ * Create a mount handle for interacting with Ceph.  All libcephfs
+ * functions operate on a mount info handle.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param id the id of the client.  This can be a unique id that identifies
+ *           this client, and will get appended onto "client.".  Callers can
+ *           pass in NULL, and the id will be the process id of the client.
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create(struct ceph_mount_info **cmount, const char * const id);
+
+/**
+ * Create a mount handle from a CephContext, which holds the configuration
+ * for the ceph cluster.  A CephContext can be acquired from an existing ceph_mount_info
+ * handle, using the @ref ceph_get_mount_context call.  Note that using the same CephContext
+ * for two different mount handles results in the same client entity id being used.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param conf reuse this pre-existing CephContext config
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf);
+
+
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif // VOIDPTR_RADOS_T
+
+/**
+ * Create a mount handle from a rados_t, for using libcephfs in the
+ * same process as librados.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param cluster reference to already-initialized librados handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster);
+
+/**
+ * Initialize the filesystem client (but do not mount the filesystem yet)
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_init(struct ceph_mount_info *cmount);
+
+/**
+ * Optionally set which filesystem to mount, before calling mount.
+ *
+ * An error will be returned if this libcephfs instance is already
+ * mounted. This function is an alternative to setting the global
+ * client_mds_namespace setting.  Using this function enables multiple
+ * libcephfs instances in the same process to mount different filesystems.
+ *
+ * The filesystem name is *not* validated here; that happens during mount(),
+ * which fails with ENOENT if a non-existent filesystem was specified here.
+ *
+ * @param cmount the mount info handle
+ * @param fs_name name of the filesystem to mount
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name);
+
+
+/**
+ * Perform a mount using the path for the root of the mount.
+ *
+ * It is optional to call ceph_init before this.  If ceph_init has
+ * not already been called, it will be called in the course of this operation.
+ *
+ * @param cmount the mount info handle
+ * @param root the path for the root of the mount.  This can be an existing
+ *	       directory within the ceph cluster, but most likely it will
+ * 	       be "/".  Passing in NULL is equivalent to "/".
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_mount(struct ceph_mount_info *cmount, const char *root);
+
+/**
+ * Return cluster ID for a mounted ceph filesystem
+ *
+ * Every ceph filesystem has a filesystem ID associated with it. This
+ * function returns that value. If the ceph_mount_info does not refer to a
+ * mounted filesystem, this returns a negative error code.
+ */
+int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount);
+
+/**
+ * Execute a management command remotely on an MDS.
+ *
+ * Must have called ceph_init or ceph_mount before calling this.
+ *
+ * @param cmount the mount handle
+ * @param mds_spec string representing rank, MDS name, GID or '*'
+ * @param cmd array of null-terminated strings
+ * @param cmdlen length of cmd array
+ * @param inbuf non-null-terminated input data to command
+ * @param inbuflen length in octets of inbuf
+ * @param outbuf populated with pointer to buffer (command output data)
+ * @param outbuflen length of allocated outbuf
+ * @param outs populated with pointer to buffer (command error strings)
+ * @param outslen length of allocated outs
+ *
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_mds_command(struct ceph_mount_info *cmount,
+    const char *mds_spec,
+    const char **cmd,
+    size_t cmdlen,
+    const char *inbuf, size_t inbuflen,
+    char **outbuf, size_t *outbuflen,
+    char **outs, size_t *outslen);
+
+/**
+ * Free a buffer, such as those used for output arrays from ceph_mds_command
+ */
+void ceph_buffer_free(char *buf);
+
+/**
+ * Unmount a mount handle.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_unmount(struct ceph_mount_info *cmount);
+
+/**
+ * Abort mds connections
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_abort_conn(struct ceph_mount_info *cmount);
+
+/**
+ * Destroy the mount handle.
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure.
+ */
+int ceph_release(struct ceph_mount_info *cmount);
+
+/**
+ * Deprecated. Unmount and destroy the ceph mount handle. This should be
+ * called on completion of all libcephfs functions.
+ *
+ * Equivalent to ceph_unmount() + ceph_release() without error handling.
+ *
+ * @param cmount the mount handle to shutdown
+ */
+void ceph_shutdown(struct ceph_mount_info *cmount);
+
+/**
+ * Get a global id for current instance
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @returns instance global id
+ */
+uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount);
+
+/**
+ * Extract the CephContext from the mount point handle.
+ *
+ * @param cmount the ceph mount handle to get the context from.
+ * @returns the CephContext associated with the mount handle.
+ */
+struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+
+/**
+ * Check mount status.
+ *
+ * Return non-zero value if mounted. Otherwise, zero.
+ */
+int ceph_is_mounted(struct ceph_mount_info *cmount);
+
+/** @} init */
+
+/**
+ * @defgroup libcephfs_h_config Config
+ * Functions for manipulating the Ceph configuration at runtime.
+ *
+ * @{
+ */
+
+/**
+ * Load the ceph configuration from the specified config file.
+ *
+ * @param cmount the mount handle to load the configuration into.
+ * @param path_list the configuration file path
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list);
+
+/**
+ * Parse the command line arguments and load the configuration parameters.
+ *
+ * @param cmount the mount handle to load the configuration parameters into.
+ * @param argc count of the arguments in argv
+ * @param argv the argument list
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv);
+
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre ceph_mount() has not been called on the handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cmount handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var);
+
+/** Sets a configuration value from a string.
+ *
+ * @param cmount the mount handle to set the configuration value on
+ * @param option the configuration option to set
+ * @param value the value of the configuration option to set
+ * 
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value);
+
+/**
+ * Gets the configuration value as a string.
+ *
+ * @param cmount the mount handle to get the configuration value from
+ * @param option the config option to get
+ * @param buf the buffer to fill with the value
+ * @param len the length of the buffer.
+ * @returns the size of the buffer filled in with the value, or negative error code on failure
+ */
+int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * @defgroup libcephfs_h_fsops File System Operations.
+ * Functions for getting/setting file system wide information specific to a particular
+ * mount handle.
+ *
+ * @{
+ */
+
+/**
+ * Perform a statfs on the ceph file system.  This call fills in file system wide statistics
+ * into the passed in buffer.
+ *
+ * @param cmount the ceph mount handle to use for performing the statfs.
+ * @param path can be any path within the mounted filesystem
+ * @param stbuf the file system statistics filled in by this function.
+ * @return 0 on success, negative error code otherwise.
+ */
+int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf);
+
+/**
+ * Synchronize all filesystem data to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the sync_fs.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_sync_fs(struct ceph_mount_info *cmount);
+
+/**
+ * Get the current working directory.
+ *
+ * @param cmount the ceph mount to get the current working directory for.
+ * @returns the path to the current working directory
+ */
+const char* ceph_getcwd(struct ceph_mount_info *cmount);
+
+/**
+ * Change the current working directory.
+ *
+ * @param cmount the ceph mount to change the current working directory for.
+ * @param path the path to the working directory to change into.
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_chdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} fsops */
+
+/**
+ * @defgroup libcephfs_h_dir Directory Operations.
+ * Functions for manipulating and listing directories.
+ *
+ * @{
+ */
+
+/**
+ * Open the given directory.
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param name the path name of the directory to open.  Must be either an absolute path
+ *        or a path relative to the current working directory.
+ * @param dirpp the directory result pointer structure to fill in.
+ * @returns 0 on success or negative error code otherwise.
+ */
+int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp);
+
+/**
+ * Close the open directory.
+ *
+ * @param cmount the ceph mount handle to use for closing the directory
+ * @param dirp the directory result pointer (set by ceph_opendir) to close
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the next entry in an open directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ *        next entry to return.
+ * @returns the next directory entry or NULL if at the end of the directory (or the directory
+ *          is empty).  This pointer should not be freed by the caller, and is only safe to
+ *          access between return and the next call to ceph_readdir or ceph_closedir.
+ */
+struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ *        next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ *          and a negative error code on failure.
+ */
+int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de);
+
+/**
+ * A safe version of ceph_readdir that also returns the file statistics (readdir+stat).
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir_plus_r.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ *        next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @param stx the stats of the file/directory of the entry returned
+ * @param want mask showing desired inode attrs for returned entry
+ * @param flags bitmask of flags to use when filling out attributes
+ * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on
+ *            the inode and the pointer set on success.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ *          and a negative error code on failure.
+ */
+int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de,
+		       struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out);
+
+/**
+ * Gets multiple directory entries.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdents.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ *        next entry/entries to return.
+ * @param name a buffer that gets filled in with an array of struct dirent holding the returned directory entries.
+ * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent).
+ * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a
+ *          negative error code.  If the buffer is not large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Gets multiple directory names.
+ * 
+ * @param cmount the ceph mount handle to use for performing the getdnames.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ *        next entry/entries to return.
+ * @param name a buffer to fill in with directory entry names.
+ * @param buflen the length of the buffer that can be filled in.
+ * @returns the length of the buffer filled in with entry names, or a negative error code on failure.
+ *          If the buffer isn't large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Rewind the directory stream to the beginning of the directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rewinddir.
+ * @param dirp the directory stream pointer to rewind.
+ */
+void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the current position of a directory stream.
+ *
+ * @param cmount the ceph mount handle to use for performing the telldir.
+ * @param dirp the directory stream pointer to get the current position of.
+ * @returns the position of the directory stream.  Note that the offsets returned
+ *          by ceph_telldir do not have a particular order (cannot be compared with
+ *          inequality).
+ */
+int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Move the directory stream to a position specified by the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the seekdir.
+ * @param dirp the directory stream pointer to move.
+ * @param offset the position to move the directory stream to.  This offset should be
+ *        a value returned by ceph_telldir.  Note that this value does not refer to the nth
+ *        entry in a directory, and can not be manipulated with plus or minus.
+ */
+void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset);
+
+/**
+ * Create a directory.
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param path the path of the directory to create.  This must be either an
+ *        absolute path or a relative path off of the current working directory.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Create multiple directories at once.
+ *
+ * @param cmount the ceph mount handle to use for making the directories.
+ * @param path the full path of directories and sub-directories that should
+ *        be created.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Remove a directory.
+ *
+ * @param cmount the ceph mount handle to use for removing directories.
+ * @param path the path of the directory to remove.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} dir */
+
+/**
+ * @defgroup libcephfs_h_links Links and Link Handling.
+ * Functions for creating and manipulating hard links and symbolic links.
+ *
+ * @{
+ */
+
+/**
+ * Create a link.
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Read a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for reading the link.
+ * @param path the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size);
+
+/**
+ * Creates a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/** @} links */
+
+/**
+ * @defgroup libcephfs_h_files File manipulation and handling.
+ * Functions for creating and manipulating files.
+ *
+ * @{
+ */
+
+/**
+ * Removes a file, link, or symbolic link.  If the file/link has multiple links to it, the
+ * file will not disappear from the namespace until all references to it are removed.
+ * 
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param path the path of the file or link to unlink.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlink(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Rename a file or directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rename.
+ * @param from the path to the existing file or directory.
+ * @param to the new name of the file or directory
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to);
+
+/**
+ * Get an open file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx,
+		unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx,
+	       unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf);
+
+/**
+ * Get a file's statistics and attributes, without following symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf);
+
+/**
+ * Get the open file's statistics.
+ *
+ * @param cmount the ceph mount handle to use for performing the fstat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stbuf the stat struct of the file's statistics, filled in by the
+ *    function.
+ * @returns 0 on success or a negative error code on failure
+ */
+int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf);
+
+/**
+ * Set a file's attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param relpath the path to the file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @param flags mask of AT_* flags (only AT_SYMLINK_NOFOLLOW is respected for now)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags);
+
+/**
+ * Set a file's attributes (extended version).
+ * 
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param fd the fd of the open file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask);
+
+/**
+ * Change the mode bits (permissions) of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path to the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of an open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param fd the open file descriptor to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode);
+
+/**
+ * Change the ownership of a file/directory.
+ * 
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param fd the fd of the open file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory, don't follow symlinks.
+ * 
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]);
+
+/**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+	       uint64_t owner);
+
+/**
+ * Truncate the file to the given size.  If this operation causes the
+ * file to expand, the empty bytes will be filled in with zeros.
+ *
+ * @param cmount the ceph mount handle to use for performing the truncate.
+ * @param path the path to the file to truncate.
+ * @param size the new size of the file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size);
+
+/**
+ * Make a block or character special file.
+ *
+ * @param cmount the ceph mount handle to use for performing the mknod.
+ * @param path the path to the special file.
+ * @param mode the permissions to use and the type of special file.  The type can be
+ *        one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO.
+ * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the
+ *        major and minor numbers of the newly created device special file.  Otherwise, 
+ *        it is ignored.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev);
+/**
+ * Create and/or open a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open.  If the flags parameter includes O_CREAT,
+ *        the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ *        is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file with a specific file layout.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open.  If the flags parameter includes O_CREAT,
+ *        the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ *        is specified in the flags.
+ * @param stripe_unit the stripe unit size (optional, 0 for default)
+ * @param stripe_count the stripe count (optional, 0 for default)
+ * @param object_size the object size (optional, 0 for default)
+ * @param data_pool name of the target data pool (optional, NULL or empty string for default)
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags,
+ 		     mode_t mode, int stripe_unit, int stripe_count, int object_size,
+ 		     const char *data_pool);
+
+/**
+ * Close the open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the close.
+ * @param fd the file descriptor referring to the open file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_close(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Reposition the open file stream based on the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the lseek.
+ * @param fd the open file descriptor referring to the open file and holding the
+ *        current position of the stream.
+ * @param offset the offset to set the stream to
+ * @param whence the flag to indicate what type of seeking to perform:
+ *	SEEK_SET: the offset is set to the given offset in the file.
+ *      SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ *      SEEK_END: the offset is set to the end of the file plus @e offset bytes.
+ * @returns the resulting offset from the beginning of the file on success or a negative error code on failure.
+ */
+int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
+/**
+ * Read data from the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param buf the buffer to read data into
+ * @param size the initial size of the buffer
+ * @param offset the offset in the file to read from.  If this value is negative, the
+ *        function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
+
+/**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from.  If this value is negative, the
+ *        function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into iov, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+           int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param buf the bytes to write to the file
+ * @param size the size of the buf array
+ * @param offset the offset in the file to write to.  If this value is negative, the
+ *        function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size,
+	       int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure holding the data to write
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to write to.  If this value is negative, the
+ *        function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+           int64_t offset);
+
+/**
+ * Truncate a file to the given size.
+ *
+ * @param cmount the ceph mount handle to use for performing the ftruncate.
+ * @param fd the file descriptor of the file to truncate
+ * @param size the new size of the file
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size);
+
+/**
+ * Synchronize an open file to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param syncdataonly a boolean whether to synchronize metadata and data (0)
+ *        or just data (1).
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly);
+
+/**
+ * Preallocate or release disk space for the file for the byte range.
+ *
+ * @param cmount the ceph mount handle to use for performing the fallocate.
+ * @param fd the file descriptor of the file to fallocate.
+ * @param mode the flags that determine the operation to be performed on the given range.
+ *        The default operation (0) allocates and initializes to zero the file in the byte range,
+ *        and the file size will be changed if offset + length is greater than
+ *        the file size.  If the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ *        the file size will not be changed.  If the FALLOC_FL_PUNCH_HOLE flag is
+ *        specified in the mode, the operation deallocates space and zeroes the byte range.
+ * @param offset the start of the byte range.
+ * @param length the length of the range.
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode,
+	                      int64_t offset, int64_t length);
+
+/**
+ * Enable/disable lazyio for the file.
+ *
+ * @param cmount the ceph mount handle to use for enabling or disabling lazyio.
+ * @param fd the file descriptor of the file to enable or disable lazyio on.
+ * @param enable a boolean to enable lazyio or disable lazyio.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable);
+
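+/**
+ * Flushes the write buffer for the file thereby propagating the buffered write to the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the propagate.
+ * @param fd the file descriptor of the file whose buffered writes are propagated.
+ * @param offset presumably the start of the byte range to propagate (TODO: confirm against the implementation).
+ * @param count presumably the length in bytes of the range to propagate (TODO: confirm against the implementation).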
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/**
+ * Flushes the write buffer for the file and invalidates the read cache.  This allows a subsequent read operation to read and cache data directly from the file, and hence everyone's propagated writes would be visible.
+ *
+ * @param cmount the ceph mount handle to use for performing the synchronize.
+ * @param fd the file descriptor of the file to synchronize.
+ * @param offset presumably the start of the byte range to synchronize (TODO: confirm against the implementation).
+ * @param count presumably the length in bytes of the range to synchronize (TODO: confirm against the implementation).
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/** @} file */
+
+/**
+ * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling.
+ * Functions for creating and manipulating extended attributes on files.
+ *
+ * @{
+ */
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name, 
+	void *value, size_t size);
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+	void *value, size_t size);
+
+/**
+ * Get an extended attribute without following symbolic links.  This function is
+ * identical to ceph_getxattr, but if the path refers to a symbolic link,
+ * we get the extended attributes of the symlink itself rather than the attributes
+ * of the file it points to.
+ *
+ * @param cmount the ceph mount handle to use for performing the lgetxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, 
+	void *value, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param fd the open file descriptor referring to the file to list extended attributes on.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size);
+
+/**
+ * Get the list of extended attribute keys on a file, but do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the llistxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param fd the open file descriptor referring to the file to remove extended attribute from.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name);
+
+/**
+ * Remove the extended attribute from a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lremovexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ *	CEPH_XATTR_CREATE: create the extended attribute.  Must not exist.
+ *      CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name, 
+	const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param fd the open file descriptor referring to the file to set extended attribute on.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ *	CEPH_XATTR_CREATE: create the extended attribute.  Must not exist.
+ *      CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+	const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lsetxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ *	CEPH_XATTR_CREATE: create the extended attribute.  Must not exist.
+ *      CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, 
+	const void *value, size_t size, int flags);
+
+/** @} xattr */
+
+/**
+ * @defgroup libcephfs_h_filelayout Control File Layout.
+ * Functions for setting and getting the file layout of existing files.
+ *
+ * @{
+ */
+
+/**
+ * Get the file striping unit from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping unit.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file striping count from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping count.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file object size from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file object size.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file pool information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an open file is stored in.
+ *
+ * Write the name of the file's pool to the buffer.  If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
+
+/**
+ * Get the name of a pool by id.
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer.  If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen);
+
+/**
+ * Get the default pool name of cephfs
+ * Write the name of the default pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ * @param cmount the ceph mount handle to use.
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen);
+
+/**
+ * Get the file layout from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file layout.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file replication information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file replication information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the id of the named pool.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_name the name of the pool.
+ * @returns the pool id, or a negative error code on failure.
+ */
+int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name);
+
+/**
+ * Get the pool replication factor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_id the pool id to look up
+ * @returns the replication factor, or a negative error code on failure.
+ */
+int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id);
+
+/**
+ * Get the OSD address where the primary copy of a file stripe is located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file whose stripe is being located.
+ * @param offset the offset into the file to specify the stripe.  The offset can be
+ *	anywhere within the stripe unit.
+ * @param addr the address of the OSD holding that stripe
+ * @param naddr the capacity of the address passed in.
+ * @returns the size of the address filled into the @e addr parameter, or a negative
+ *	error code on failure.
+ */
+int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
+				 struct sockaddr_storage *addr, int naddr);
+
+/**
+ * Get the list of OSDs where the objects containing a file offset are located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file.
+ * @param offset the offset within the file.
+ * @param length return the number of bytes between the offset and the end of
+ * the stripe unit (optional).
+ * @param osds an integer array to hold the OSD ids.
+ * @param nosds the size of the integer array.
+ * @returns the number of items stored in the output array, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
+                              int64_t offset, int64_t *length, int *osds, int nosds);
+
+/**
+ * Get the fully qualified CRUSH location of an OSD.
+ *
+ * Returns (type, name) string pairs for each device in the CRUSH bucket
+ * hierarchy starting from the given osd to the root. Each pair element is
+ * separated by a NUL character.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param osd the OSD id.
+ * @param path buffer to store location.
+ * @param len size of buffer.
+ * @returns the number of bytes written into the buffer, or -ERANGE if the
+ * buffer is not large enough.
+ */
+int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
+    int osd, char *path, size_t len);
+
+/**
+ * Get the network address of an OSD.
+ *
+ * @param cmount the ceph mount handle.
+ * @param osd the OSD id.
+ * @param addr the OSD network address.
+ * @returns zero on success, otherwise a negative error code.
+ */
+int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd,
+    struct sockaddr_storage *addr);
+
+/**
+ * Get the file layout stripe unit granularity.
+ * @param cmount the ceph mount handle.
+ * @returns the stripe unit granularity or a negative error code on failure.
+ */
+int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount);
+
+/** @} filelayout */
+
+/**
+ * No longer available.  Do not use.
+ * These functions will return -EOPNOTSUPP.
+ */
+int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe);
+int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count);
+int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size);
+int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd);
+int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication);
+
+/**
+ * Read from local replicas when possible.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param val a boolean to set (1) or clear (0) the option to favor local objects
+ *     for reads.
+ * @returns 0
+ */
+int ceph_localize_reads(struct ceph_mount_info *cmount, int val);
+
+/**
+ * Get the osd id of the local osd (if any)
+ *
+ * @param cmount the ceph mount handle to use.
+ * @returns the osd (if any) local to the node where this call is made, otherwise
+ *	-1 is returned.
+ */
+int ceph_get_local_osd(struct ceph_mount_info *cmount);
+
+/** @} default_filelayout */
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the file descriptor to get issued
+ * @returns the current capabilities issued to this client
+ *       for the open file
+ */
+int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path to the file
+ * @returns the current capabilities issued to this client
+ *       for the file
+ */
+int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path);
+
+/* Low Level */
+struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount,
+				vinodeno_t vino);
+int ceph_ll_lookup_inode(
+    struct ceph_mount_info *cmount,
+    struct inodeno_t ino,
+    Inode **inode);
+
+/**
+ * Get the root inode of the FS. This increases the reference count of the root Inode; you must call ceph_ll_forget for it!
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param parent pointer to pointer to Inode struct. A pointer to the root inode is returned through it.
+ * @returns 0 on success.
+ */
+int ceph_ll_lookup_root(struct ceph_mount_info *cmount,
+                  Inode **parent);
+int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent,
+		   const char *name, Inode **out, struct ceph_statx *stx,
+		   unsigned want, unsigned flags, const UserPerm *perms);
+int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in);
+int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in,
+		   int count);
+int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i,
+		 struct ceph_statx *stx, unsigned int want, unsigned int flags,
+		 const UserPerm *perms);
+int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in,
+		    struct ceph_statx *stx, unsigned int want, unsigned int flags,
+		    const UserPerm *perms);
+int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in,
+		    struct ceph_statx *stx, int mask, const UserPerm *perms);
+int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags,
+		 struct Fh **fh, const UserPerm *perms);
+off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
+		     off_t offset, int whence);
+int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
+		 int64_t off, uint64_t len, char* buf);
+int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
+		  int syncdataonly);
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+		  int syncdataonly);
+int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh,
+		      int mode, int64_t offset, int64_t length);
+int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
+		  int64_t off, uint64_t len, const char *data);
+int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
+		      const struct iovec *iov, int iovcnt, int64_t off);
+int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
+		       const struct iovec *iov, int iovcnt, int64_t off);
+int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle);
+int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode);
+/**
+ * Get xattr value by xattr name.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param in the inode to read the extended attribute from
+ * @param name name of attribute
+ * @param value pointer to begin buffer
+ * @param size buffer size
+ * @param perms pointer to the UserPerm object
+ * @returns the size of the returned buffer, or a negative number on error
+ */
+int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in,
+		     const char *name, void *value, size_t size,
+		     const UserPerm *perms);
+int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in,
+		     const char *name, const void *value, size_t size,
+		     int flags, const UserPerm *perms);
+int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in,
+                      char *list, size_t buf_size, size_t *list_size,
+		      const UserPerm *perms);
+int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in,
+			const char *name, const UserPerm *perms);
+int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent,
+		   const char *name, mode_t mode, int oflags, Inode **outp,
+		   Fh **fhp, struct ceph_statx *stx, unsigned want,
+		   unsigned lflags, const UserPerm *perms);
+int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent,
+		  const char *name, mode_t mode, dev_t rdev, Inode **out,
+		  struct ceph_statx *stx, unsigned want, unsigned flags,
+		  const UserPerm *perms);
+int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent,
+		  const char *name, mode_t mode, Inode **out,
+		  struct ceph_statx *stx, unsigned want,
+		  unsigned flags, const UserPerm *perms);
+int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in,
+		 struct Inode *newparent, const char *name,
+		 const UserPerm *perms);
+int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in,
+		    struct ceph_dir_result **dirpp, const UserPerm *perms);
+int ceph_ll_releasedir(struct ceph_mount_info *cmount,
+		       struct ceph_dir_result* dir);
+int ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent,
+		   const char *name, struct Inode *newparent,
+		   const char *newname, const UserPerm *perms);
+int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in,
+		   const char *name, const UserPerm *perms);
+int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in,
+		   struct statvfs *stbuf);
+int ceph_ll_readlink(struct ceph_mount_info *cmount, struct Inode *in,
+		     char *buf, size_t bufsize, const UserPerm *perms);
+int ceph_ll_symlink(struct ceph_mount_info *cmount,
+		    Inode *in, const char *name, const char *value,
+		    Inode **out, struct ceph_statx *stx,
+		    unsigned want, unsigned flags,
+		    const UserPerm *perms);
+int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in,
+		  const char *name, const UserPerm *perms);
+uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount,
+			     struct Inode *in);
+uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount,
+			     struct Inode *in,
+			     struct ceph_file_layout *layout);
+uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount,
+			  struct Inode *in);
+int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount,
+			   struct Inode *in,
+			   uint64_t blockno,
+			   struct ceph_file_layout* layout);
+int ceph_ll_num_osds(struct ceph_mount_info *cmount);
+int ceph_ll_osdaddr(struct ceph_mount_info *cmount,
+		    int osd, uint32_t *addr);
+uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount,
+				     struct Inode *in, uint64_t blockno);
+int ceph_ll_read_block(struct ceph_mount_info *cmount,
+		       struct Inode *in, uint64_t blockid,
+		       char* bl, uint64_t offset, uint64_t length,
+		       struct ceph_file_layout* layout);
+int ceph_ll_write_block(struct ceph_mount_info *cmount,
+			struct Inode *in, uint64_t blockid,
+			char* buf, uint64_t offset,
+			uint64_t length, struct ceph_file_layout* layout,
+			uint64_t snapseq, uint32_t sync);
+int ceph_ll_commit_blocks(struct ceph_mount_info *cmount,
+			  struct Inode *in, uint64_t offset, uint64_t range);
+
+
+int ceph_ll_getlk(struct ceph_mount_info *cmount,
+		  Fh *fh, struct flock *fl, uint64_t owner);
+int ceph_ll_setlk(struct ceph_mount_info *cmount,
+		  Fh *fh, struct flock *fl, uint64_t owner, int sleep);
+
+int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable);
+
+/*
+ * Delegation support
+ *
+ * Delegations are a way for an application to request exclusive or
+ * semi-exclusive access to an Inode. The client requests the delegation and
+ * if it's successful it can reliably cache file data and metadata until the
+ * delegation is recalled.
+ *
+ * Recalls are issued via a callback function, provided by the application.
+ * Callback functions should act something like signal handlers.  You want to
+ * do as little as possible in the callback. Any major work should be deferred
+ * in some fashion as it's difficult to predict the context in which this
+ * function will be called.
+ *
+ * Once the delegation has been recalled, the application should return it as
+ * soon as possible. The application has client_deleg_timeout seconds to
+ * return it, after which the cmount structure is forcibly unmounted and
+ * further calls into it fail.
+ *
+ * The application can set the client_deleg_timeout config option to suit its
+ * needs, but it should take care to choose a value that allows it to avoid
+ * forcible eviction from the cluster in the event of an application bug.
+ */
+
+/* Commands for manipulating delegation state */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE	0
+# define CEPH_DELEGATION_RD	1
+# define CEPH_DELEGATION_WR	2
+#endif
+
+/**
+ * Get the amount of time that the client has to return caps
+ * @param cmount the ceph mount handle to use.
+ *
+ * In the event that a client does not return its caps, the MDS may blacklist
+ * it after this timeout. Applications should check this value and ensure
+ * that they set the delegation timeout to a value lower than this.
+ *
+ * This call returns the cap return timeout (in seconds) for this cmount, or
+ * zero if it's not mounted.
+ */
+uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount);
+
+/**
+ * Set the delegation timeout for the mount (thereby enabling delegations)
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the delegation timeout (in seconds)
+ *
+ * Since the client could end up blacklisted if it doesn't return delegations
+ * in time, we mandate that any application wanting to use delegations
+ * explicitly set the timeout beforehand. Until this call is done on the
+ * mount, attempts to set a delegation will return -ETIME.
+ *
+ * Once a delegation is recalled, if it is not returned in this amount of
+ * time, the cmount will be forcibly unmounted and further access attempts
+ * will fail (usually with -ENOTCONN errors).
+ *
+ * This value is further vetted against the cap return timeout, and this call
+ * can fail with -EINVAL if the timeout value is too long. Delegations can be
+ * disabled again by setting the timeout to 0.
+ */
+int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Request a delegation on an open Fh
+ * @param cmount the ceph mount handle to use.
+ * @param fh file handle
+ * @param cmd CEPH_DELEGATION_* command
+ * @param cb callback function for recalling delegation
+ * @param priv opaque token passed back during recalls
+ *
+ * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict
+ * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM,
+ * -ETIME)
+ */
+int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+		       unsigned int cmd, ceph_deleg_cb_t cb, void *priv);
+
+mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode);
+
+/* state reclaim */
+#define CEPH_RECLAIM_RESET 	1
+
+/**
+ * Set ceph client uuid
+ * @param cmount the ceph mount handle to use.
+ * @param uuid the uuid to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid);
+
+/**
+ * Set ceph client session timeout
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the timeout to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout);
+
+/**
+ * Start to reclaim states of other client
+ * @param cmount the ceph mount handle to use.
+ * @param uuid uuid of client whose states need to be reclaimed
+ * @param flags flags that control how states get reclaimed
+ *
+ * Returns 0 on success, -EOPNOTSUPP if mds does not support the operation,
+ * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client
+ * with the given uuid, -ENOTRECOVERABLE in all other error cases.
+ */
+int ceph_start_reclaim(struct ceph_mount_info *cmount,
+		       const char *uuid, unsigned flags);
+
+/**
+ * Finish reclaiming states of other client.
+ * @param cmount the ceph mount handle to use.
+ */
+void ceph_finish_reclaim(struct ceph_mount_info *cmount);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args   callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ */
+void ceph_ll_register_callbacks(struct ceph_mount_info *cmount,
+				struct ceph_client_callback_args *args);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/cmp.h b/src/include/cmp.h
new file mode 100644
index 00000000..79372fde
--- /dev/null
+++ b/src/include/cmp.h
@@ -0,0 +1,205 @@
+#ifndef __CEPH_CMP_H
+#define __CEPH_CMP_H
+
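+/* Macros to define comparison operators for classes with small numbers of
+ * members.  WRITE_EQ_OPERATORS_N emits ==/!= over the N listed members;
+ * WRITE_CMP_OPERATORS_N emits <,<=,>,>= ordering them lexicographically. */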
+
+#define WRITE_EQ_OPERATORS_1(type, a)	/* free == and != comparing member a */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a;							\
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a;							\
+  }
+
+#define WRITE_CMP_OPERATORS_1(type, a)	/* free >, <, >=, <= ordering by member a */	\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a;							\
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a;							\
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a >= r.a;							\
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a <= r.a;							\
+  }
+
+#define WRITE_EQ_OPERATORS_2(type, a, b)	/* free == (all members equal) and != (any member differs) over a, b */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b;					\
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b;					\
+  }
+
+#define WRITE_CMP_OPERATORS_2(type, a, b)	/* free >, <, >=, <= : lexicographic order on (a, b) */	\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||		/* a decides unless it ties ... */	\
+      (l.a == r.a && (l.b > r.b));	/* ... then b breaks the tie */	\
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b));					\
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||		/* strict on a; only the last member admits equality */	\
+      (l.a == r.a && (l.b >= r.b));					\
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b <= r.b));					\
+  }
+
+
+#define WRITE_EQ_OPERATORS_3(type, a, b, c)	/* free == (all members equal) and != (any member differs) over a, b, c */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b && l.c == r.c;			\
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b || l.c != r.c;			\
+  }
+
+#define WRITE_CMP_OPERATORS_3(type, a, b, c)	/* free >, <, >=, <= : lexicographic order on (a, b, c) */	\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||		/* each later member is consulted only when all earlier ones tie */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c))));			\
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c))));			\
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||		/* strict on a and b; only the last member admits equality */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c >= r.c))));			\
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c <= r.c))));			\
+  }
+
+#define WRITE_EQ_OPERATORS_4(type, a, b, c, d)	/* free == (all members equal) and != (any member differs) over a..d */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d;	\
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d;	\
+  }
+
+#define WRITE_CMP_OPERATORS_4(type, a, b, c, d)	/* free >, <, >=, <= : lexicographic order on (a, b, c, d) */	\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||		/* each later member is consulted only when all earlier ones tie */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d))))));	\
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d))))));	\
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||		/* strict on a..c; only the last member admits equality */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d >= r.d))))));	\
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d <= r.d)))))); \
+  }
+
+
+
+#define WRITE_EQ_OPERATORS_5(type, a, b, c, d, e)	/* free == (all members equal) and != (any member differs) over a..e */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e; \
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e; \
+  }
+
+#define WRITE_CMP_OPERATORS_5(type, a, b, c, d, e)	/* free >, <, >=, <= : lexicographic order on (a, b, c, d, e) */	\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||		/* each later member is consulted only when all earlier ones tie */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d ||	\
+						      (l.d == r.d && l.e > r.e))))))); \
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d ||	\
+						      (l.d == r.d && (l.e < r.e)))))))); \
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||		/* strict on a..d; only the last member admits equality */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d ||	\
+						      (l.d == r.d && l.e >= r.e))))))); \
+  }									\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d ||	\
+						      (l.d == r.d && l.e <= r.e))))))); \
+  }
+
+#define WRITE_EQ_OPERATORS_7(type, a, b, c, d, e, f, g)	/* free == (all members equal) and != (any member differs) over a..g */	\
+  inline bool operator==(const type &l, const type &r) {		\
+    return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e && l.f == r.f && l.g == r.g; \
+  }									\
+  inline bool operator!=(const type &l, const type &r) {		\
+    return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e || l.f != r.f || l.g != r.g; \
+  }
+#define WRITE_CMP_OPERATORS_7(type, a, b, c, d, e, f, g)	/* free <=, >=, >, < : lexicographic order on (a, b, c, d, e, f, g) */	\
+  inline bool operator<=(const type &l, const type &r) {		\
+    return l.a < r.a ||		/* strict on a..f; only the last member admits equality */	\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d ||	\
+						      (l.d == r.d && (l.e < r.e || \
+								      (l.e == r.e && (l.f < r.f || \
+										      (l.f == r.f && l.g <= r.g))))))))))); \
+  }									\
+  inline bool operator>=(const type &l, const type &r) {		\
+    return l.a > r.a ||							\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d ||	\
+						      (l.d == r.d && (l.e > r.e || \
+								      (l.e == r.e && (l.f > r.f || \
+										      (l.f == r.f && l.g >= r.g))))))))))); \
+  }									\
+  inline bool operator>(const type &l, const type &r) {			\
+    return l.a > r.a ||		/* each later member is consulted only when all earlier ones tie */	\
+      (l.a == r.a && (l.b > r.b ||					\
+		      (l.b == r.b && (l.c > r.c ||			\
+				      (l.c == r.c && (l.d > r.d ||	\
+						      (l.d == r.d && (l.e > r.e || \
+								      (l.e == r.e && (l.f > r.f || \
+										      (l.f == r.f && l.g > r.g))))))))))); \
+  }									\
+  inline bool operator<(const type &l, const type &r) {			\
+    return l.a < r.a ||							\
+      (l.a == r.a && (l.b < r.b ||					\
+		      (l.b == r.b && (l.c < r.c ||			\
+				      (l.c == r.c && (l.d < r.d ||	\
+						      (l.d == r.d && (l.e < r.e || \
+								      (l.e == r.e && (l.f < r.f || \
+										      (l.f == r.f && l.g < r.g))))))))))); \
+  }
+#endif
diff --git a/src/include/color.h b/src/include/color.h
new file mode 100644
index 00000000..6c8df40e
--- /dev/null
+++ b/src/include/color.h
@@ -0,0 +1,13 @@
+#ifndef CEPH_COLOR_H
+#define CEPH_COLOR_H
+
+#define TEXT_NORMAL	"\033[0m"	/* ANSI SGR escape sequence: reset all attributes */
+/*#define TEXT_HAZARD	"\033[5;31m"*/
+#define TEXT_RED	"\033[0;31m"	/* "0;3x" below: reset, then select foreground color x */
+#define TEXT_GREEN	"\033[0;32m"
+#define TEXT_YELLOW	"\033[0;33m"
+#define TEXT_BLUE	"\033[0;34m"
+#define TEXT_MAGENTA	"\033[0;35m"
+#define TEXT_CYAN	"\033[0;36m"
+
+#endif
diff --git a/src/include/compact_map.h b/src/include/compact_map.h
new file mode 100644
index 00000000..3ccb7982
--- /dev/null
+++ b/src/include/compact_map.h
@@ -0,0 +1,383 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_MAP_H
+#define CEPH_COMPACT_MAP_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <map>
+#include <memory>
+
+#include "include/encoding.h"
+
+template <class Key, class T, class Map>
+class compact_map_base {
+protected:
+  std::unique_ptr<Map> map;
+  void alloc_internal() {
+    if (!map)
+      map.reset(new Map);
+  }
+  void free_internal() {
+    map.reset();
+  }
+  /**
+   * Read-only iterator wrapper: remembers the owning compact_map_base
+   * together with an iterator of type It into its underlying Map.
+   */
+  template <class It>
+  class const_iterator_base {
+    const compact_map_base *map;  // owning container; null when default-constructed
+    It it;                        // position; only compared while map->map is allocated
+    const_iterator_base() : map(nullptr) { }
+    const_iterator_base(const compact_map_base* m) : map(m) { }
+    const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+    friend class iterator_base;
+  public:
+    const_iterator_base(const const_iterator_base& o) : map(o.map), it(o.it) { }
+    /**
+     * Two iterators are equal when they belong to the same container and
+     * either that container has no allocated Map (every position is "end")
+     * or the wrapped iterators match.
+     */
+    bool operator==(const const_iterator_base& o) const {
+      // Default-constructed iterators carry a null container pointer, so
+      // check it before looking at map->map.
+      return (map == o.map) && (!map || !map->map || it == o.it);
+    }
+    bool operator!=(const const_iterator_base& o) const {
+      return !(*this == o);
+    }
+    const_iterator_base& operator=(const const_iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    const_iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    const std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+  };
+  /**
+   * Mutable iterator wrapper: remembers the owning compact_map_base together
+   * with an iterator of type It into its underlying Map.
+   */
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_map_base* map;  // owning container; null when default-constructed
+    It it;                        // position; only compared while map->map is allocated
+    iterator_base() : map(nullptr) { }
+    iterator_base(compact_map_base* m) : map(m) { }
+    iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { }
+    friend class compact_map_base;
+  public:
+    iterator_base(const iterator_base& o) : map(o.map), it(o.it) { }
+    /**
+     * Two iterators are equal when they belong to the same container and
+     * either that container has no allocated Map (every position is "end")
+     * or the wrapped iterators match.
+     */
+    bool operator==(const iterator_base& o) const {
+      // Default-constructed iterators carry a null container pointer, so
+      // check it before looking at map->map.
+      return (map == o.map) && (!map || !map->map || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      map = o.map;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    std::pair<const Key,T>& operator*() {
+      return *it;
+    }
+    std::pair<const Key,T>* operator->() {
+      return it.operator->();
+    }
+    /// Converts to the read-only wrapper over the same iterator type.
+    operator const_iterator_base<It>() const {
+      return const_iterator_base<It>(map, it);
+    }
+  };
+
+public:
+  class iterator : public iterator_base<typename Map::iterator> {
+    public:
+      iterator() { }
+      iterator(const iterator_base<typename Map::iterator>& o)
+	: iterator_base<typename Map::iterator>(o) { }
+      iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { }
+      iterator(compact_map_base* m, const typename Map::iterator& i)
+	: iterator_base<typename Map::iterator>(m, i) { }
+  };
+  class const_iterator : public const_iterator_base<typename Map::const_iterator> {
+    public:
+      const_iterator() { }
+      const_iterator(const iterator_base<typename Map::const_iterator>& o)
+	: const_iterator_base<typename Map::const_iterator>(o) { }
+      const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { }
+      const_iterator(const compact_map_base* m, const typename Map::const_iterator& i)
+	: const_iterator_base<typename Map::const_iterator>(m, i) { }
+  };
+  class reverse_iterator : public iterator_base<typename Map::reverse_iterator> {
+    public:
+      reverse_iterator() { }
+      reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o)
+	: iterator_base<typename Map::reverse_iterator>(o) { }
+      reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { }
+      reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i)
+	: iterator_base<typename Map::reverse_iterator>(m, i) { }
+  };
+  /// Read-only reverse iterator over the underlying Map.
+  class const_reverse_iterator : public const_iterator_base<typename Map::const_reverse_iterator> {
+    public:
+      const_reverse_iterator() { }
+      // Built from a mutable wrapper over the same iterator type. The
+      // mem-initializer must name this class's actual base
+      // (const_iterator_base); the argument reaches it through
+      // iterator_base's conversion operator, as in const_iterator.
+      const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o)
+        : const_iterator_base<typename Map::const_reverse_iterator>(o) { }
+      const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { }
+      const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
+        : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
+  };
+  compact_map_base(const compact_map_base& o) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    }
+  }
+  compact_map_base() {}
+  ~compact_map_base() {}
+
+  bool empty() const {
+    return !map || map->empty();
+  }
+  size_t size() const {
+    return map ? map->size() : 0;
+  }
+  bool operator==(const compact_map_base& o) const {
+    return (empty() && o.empty()) || (map && o.map && *map == *o.map);
+  }
+  bool operator!=(const compact_map_base& o) const {
+    return !(*this == o);
+  }
+  size_t count (const Key& k) const {
+    return map ? map->count(k) : 0;
+  }
+  /**
+   * Erase the element at p and return an iterator to the element after it.
+   * When the last element goes away the backing Map is released and an
+   * iterator with no position is returned; the same is returned when no Map
+   * is allocated at all.
+   */
+  iterator erase (iterator p) {
+    if (!map)
+      return iterator(this);
+    ceph_assert(this == p.map);
+    auto next = map->erase(p.it);
+    if (!map->empty())
+      return iterator(this, next);
+    // container became empty: drop the allocation
+    free_internal();
+    return iterator(this);
+  }
+  size_t erase (const Key& k) {
+    if (!map)
+      return 0;
+    size_t r = map->erase(k);
+    if (map->empty())
+	free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_map_base& o) {
+    map.swap(o.map);
+  }
+  compact_map_base& operator=(const compact_map_base& o) {
+    if (o.map) {
+      alloc_internal();
+      *map = *o.map;
+    } else
+      free_internal();
+    return *this;
+  }
+  /**
+   * Insert val, allocating the backing Map on first use, and return an
+   * iterator built from the result of Map::insert.
+   *
+   * NOTE(review): std::map::insert(const value_type&) returns
+   * std::pair<iterator,bool>, so this appears to compile only when
+   * Map::insert returns a plain iterator (e.g. std::multimap) -- confirm the
+   * intended interface for compact_map.
+   */
+  iterator insert(const std::pair<const Key, T>& val) {
+    alloc_internal();
+    auto pos = map->insert(val);
+    return iterator(this, pos);
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = map->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+  iterator begin() {
+   if (!map)
+     return iterator(this);
+   return iterator(this, map->begin());
+  }
+  iterator end() {
+   if (!map)
+     return iterator(this);
+   return iterator(this, map->end());
+  }
+  reverse_iterator rbegin() {
+   if (!map)
+     return reverse_iterator(this);
+   return reverse_iterator(this, map->rbegin());
+  }
+  reverse_iterator rend() {
+   if (!map)
+     return reverse_iterator(this);
+   return reverse_iterator(this, map->rend());
+  }
+  iterator find(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->find(k));
+  }
+  iterator lower_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->lower_bound(k));
+  }
+  iterator upper_bound(const Key& k) {
+    if (!map)
+      return iterator(this);
+    return iterator(this, map->upper_bound(k));
+  }
+  const_iterator begin() const {
+   if (!map)
+     return const_iterator(this);
+   return const_iterator(this, map->begin());
+  }
+  const_iterator end() const {
+   if (!map)
+     return const_iterator(this);
+   return const_iterator(this, map->end());
+  }
+  const_reverse_iterator rbegin() const {
+   if (!map)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, map->rbegin());
+  }
+  const_reverse_iterator rend() const {
+   if (!map)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, map->rend());
+  }
+  const_iterator find(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->find(k));
+  }
+  const_iterator lower_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->lower_bound(k));
+  }
+  const_iterator upper_bound(const Key& k) const {
+    if (!map)
+      return const_iterator(this);
+    return const_iterator(this, map->upper_bound(k));
+  }
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    if (map)
+      encode(*map, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  void encode(bufferlist &bl, uint64_t features) const {
+    using ceph::encode;
+    if (map)
+      encode(*map, bl, features);
+    else
+      encode((uint32_t)0, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    using ceph::decode_nohead;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      decode_nohead(n, *map, p);
+    } else
+      free_internal();
+  }
+};
+
+// Free-function encode for compact_map_base: forwards to the member encode(bl).
+template<class Key, class T, class Map>
+inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl) {
+  m.encode(bl);
+}
+// Feature-aware variant: passes the feature bits on to the member encode.
+template<class Key, class T, class Map>
+inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl,
+		   uint64_t features) {
+  m.encode(bl, features);
+}
+// Free-function decode for compact_map_base: forwards to the member decode(p).
+template<class Key, class T, class Map>
+inline void decode(compact_map_base<Key, T, Map>& m, bufferlist::const_iterator& p) {
+  m.decode(p);
+}
+
+/**
+ * compact_map_base specialised for std::map. Adds operator[], which
+ * allocates the backing map on first use.
+ */
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > {
+public:
+  /// Returns a reference to the value mapped to k, inserting it if absent.
+  T& operator[](const Key& k) {
+    this->alloc_internal();
+    auto& storage = *this->map;
+    return storage[k];
+  }
+};
+
+/// Prints a compact_map as {k1=v1,k2=v2,...}.
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m)
+{
+  out << "{";
+  bool need_sep = false;
+  for (const auto &entry : m) {
+    // a comma goes before every element except the first
+    if (need_sep)
+      out << ",";
+    need_sep = true;
+    out << entry.first << "=" << entry.second;
+  }
+  out << "}";
+  return out;
+}
+
+// compact_map_base specialised for std::multimap; adds no members of its own.
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > {
+};
+
+/// Prints a compact_multimap as {{k1=v1,k2=v2,...}}.
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m)
+{
+  out << "{{";
+  bool need_sep = false;
+  for (const auto &entry : m) {
+    // a comma goes before every element except the first
+    if (need_sep)
+      out << ",";
+    need_sep = true;
+    out << entry.first << "=" << entry.second;
+  }
+  out << "}}";
+  return out;
+}
+#endif
diff --git a/src/include/compact_set.h b/src/include/compact_set.h
new file mode 100644
index 00000000..ba743fb0
--- /dev/null
+++ b/src/include/compact_set.h
@@ -0,0 +1,305 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_SET_H
+#define CEPH_COMPACT_SET_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <memory>
+#include <set>
+
+template <class T, class Set>
+class compact_set_base {
+protected:
+  std::unique_ptr<Set> set;
+  void alloc_internal() {
+    if (!set)
+      set.reset(new Set);
+  }
+  void free_internal() {
+    set.reset();
+  }
+  /**
+   * Iterator wrapper: remembers the owning compact_set_base together with an
+   * iterator of type It into its underlying Set.
+   */
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_set_base* set;  // owning container; null when default-constructed
+    It it;                        // position; only compared while set->set is allocated
+    iterator_base() : set(nullptr) { }
+    iterator_base(const compact_set_base* s) : set(s) { }
+    iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { }
+    friend class compact_set_base;
+  public:
+    iterator_base(const iterator_base& o) : set(o.set), it(o.it) { }
+    /**
+     * Two iterators are equal when they belong to the same container and
+     * either that container has no allocated Set (every position is "end")
+     * or the wrapped iterators match.
+     */
+    bool operator==(const iterator_base& o) const {
+      // Default-constructed iterators carry a null container pointer, so
+      // check it before looking at set->set.
+      return (set == o.set) && (!set || !set->set || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      // Rebind this iterator to o's container; the container's own storage
+      // (set->set) must not be touched by an iterator assignment.
+      set = o.set;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const T& operator*() {
+      return *it;
+    }
+  };
+public:
+  class const_iterator : public iterator_base<typename Set::const_iterator> {
+    public:
+      const_iterator() { }
+      const_iterator(const iterator_base<typename Set::const_iterator>& o)
+	: iterator_base<typename Set::const_iterator>(o) { }
+      const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { }
+      const_iterator(const compact_set_base* s, const typename Set::const_iterator& i)
+	: iterator_base<typename Set::const_iterator>(s, i) { }
+  };
+  class iterator : public iterator_base<typename Set::iterator> {
+    public:
+      iterator() { }
+      iterator(const iterator_base<typename Set::iterator>& o)
+	: iterator_base<typename Set::iterator>(o) { }
+      iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { }
+      iterator(compact_set_base* s, const typename Set::iterator& i)
+	: iterator_base<typename Set::iterator>(s, i) { }
+      operator const_iterator() const {
+	return const_iterator(this->set, this->it);
+      }
+  };
+  class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> {
+    public:
+      const_reverse_iterator() { }
+      const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o)
+	: iterator_base<typename Set::const_reverse_iterator>(o) { }
+      const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { }
+      const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i)
+	: iterator_base<typename Set::const_reverse_iterator>(s, i) { }
+  };
+  /// Reverse iterator over the underlying Set.
+  class reverse_iterator : public iterator_base<typename Set::reverse_iterator> {
+    public:
+      reverse_iterator() { }
+      reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o)
+        : iterator_base<typename Set::reverse_iterator>(o) { }
+      reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { }
+      reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i)
+        : iterator_base<typename Set::reverse_iterator>(s, i) { }
+      // A reverse iterator converts to its read-only *reverse* counterpart;
+      // a forward const_iterator cannot be built from a reverse position.
+      operator const_reverse_iterator() const {
+        return const_reverse_iterator(this->set, this->it);
+      }
+  };
+
+  compact_set_base() {}
+  compact_set_base(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    }
+  }
+  ~compact_set_base() {}
+
+
+  bool empty() const {
+    return !set || set->empty();
+  }
+  size_t size() const {
+    return set ? set->size() : 0;
+  }
+  bool operator==(const compact_set_base& o) const {
+    return (empty() && o.empty()) || (set && o.set && *set == *o.set);
+  }
+  bool operator!=(const compact_set_base& o) const {
+    return !(*this == o);
+  }
+  size_t count(const T& t) const {
+    return set ? set->count(t) : 0;
+  }
+  /**
+   * Erase the element at p and return an iterator to the element after it.
+   * When the last element goes away the backing Set is released and an
+   * iterator with no position is returned; the same is returned when no Set
+   * is allocated at all.
+   */
+  iterator erase (iterator p) {
+    if (!set)
+      return iterator(this);
+    ceph_assert(this == p.set);
+    auto next = set->erase(p.it);
+    if (!set->empty())
+      return iterator(this, next);
+    // container became empty: drop the allocation
+    free_internal();
+    return iterator(this);
+  }
+  size_t erase (const T& t) {
+    if (!set)
+      return 0;
+    size_t r = set->erase(t);
+    if (set->empty())
+      free_internal();
+    return r;
+  }
+  void clear() {
+    free_internal();
+  }
+  void swap(compact_set_base& o) {
+    set.swap(o.set);
+  }
+  compact_set_base& operator=(const compact_set_base& o) {
+    if (o.set) {
+      alloc_internal();
+      *set = *o.set;
+    } else
+      free_internal();
+    return *this;
+  }
+  std::pair<iterator,bool> insert(const T& t) {
+    alloc_internal();
+    std::pair<typename Set::iterator,bool> r = set->insert(t);
+    return std::make_pair(iterator(this, r.first), r.second);
+  }
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = set->emplace(std::forward<Args>(args)...);
+    return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+  }
+
+  iterator begin() {
+   if (!set)
+     return iterator(this);
+   return iterator(this, set->begin());
+  }
+  iterator end() {
+   if (!set)
+     return iterator(this);
+   return iterator(this, set->end());
+  }
+  reverse_iterator rbegin() {
+   if (!set)
+     return reverse_iterator(this);
+   return reverse_iterator(this, set->rbegin());
+  }
+  reverse_iterator rend() {
+   if (!set)
+     return reverse_iterator(this);
+   return reverse_iterator(this, set->rend());
+  }
+  iterator find(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->find(t));
+  }
+  iterator lower_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->lower_bound(t));
+  }
+  iterator upper_bound(const T& t) {
+    if (!set)
+      return iterator(this);
+    return iterator(this, set->upper_bound(t));
+  }
+  const_iterator begin() const {
+   if (!set)
+     return const_iterator(this);
+   return const_iterator(this, set->begin());
+  }
+  const_iterator end() const {
+   if (!set)
+     return const_iterator(this);
+   return const_iterator(this, set->end());
+  }
+  const_reverse_iterator rbegin() const {
+   if (!set)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, set->rbegin());
+  }
+  const_reverse_iterator rend() const {
+   if (!set)
+     return const_reverse_iterator(this);
+   return const_reverse_iterator(this, set->rend());
+  }
+  const_iterator find(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->find(t));
+  }
+  const_iterator lower_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->lower_bound(t));
+  }
+  const_iterator upper_bound(const T& t) const {
+    if (!set)
+      return const_iterator(this);
+    return const_iterator(this, set->upper_bound(t));
+  }
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    if (set)
+      encode(*set, bl);
+    else
+      encode((uint32_t)0, bl);
+  }
+  /**
+   * Decode the set from p: read the element count, then either decode that
+   * many elements into the (allocated on demand) backing Set, or release the
+   * backing Set when the count is zero.
+   *
+   * NOTE(review): existing elements are not cleared here before
+   * decode_nohead runs -- confirm decode_nohead resets the container.
+   */
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    // Name decode_nohead explicitly, as compact_map_base::decode does,
+    // instead of relying on it being found some other way.
+    using ceph::decode_nohead;
+    uint32_t n;
+    decode(n, p);
+    if (n > 0) {
+      alloc_internal();
+      decode_nohead(n, *set, p);
+    } else
+      free_internal();
+  }
+};
+
+// Free-function encode for compact_set_base: forwards to the member encode(bl).
+template<class T, class Set>
+inline void encode(const compact_set_base<T, Set>& m, bufferlist& bl) {
+  m.encode(bl);
+}
+// Free-function decode for compact_set_base: forwards to the member decode(p).
+template<class T, class Set>
+inline void decode(compact_set_base<T, Set>& m, bufferlist::const_iterator& p) {
+  m.decode(p);
+}
+
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > {
+};
+
+/// Prints the elements of a compact_set separated by commas, with no brackets.
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s)
+{
+  bool need_sep = false;
+  for (auto &elem : s) {
+    // a comma goes before every element except the first
+    if (need_sep)
+      out << ",";
+    need_sep = true;
+    out << elem;
+  }
+  return out;
+}
+#endif
diff --git a/src/include/compat.h b/src/include/compat.h
new file mode 100644
index 00000000..7c75dac2
--- /dev/null
+++ b/src/include/compat.h
@@ -0,0 +1,198 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_COMPAT_H
+#define CEPH_COMPAT_H
+
+#include "acconfig.h"
+#include <sys/types.h>
+
+#if defined(__linux__)
+#define PROCPREFIX
+#endif
+
+#include <sys/stat.h>
+#ifndef ACCESSPERMS
+#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+#if defined(__FreeBSD__)
+
+// FreeBSD supports Linux procfs through its Linux compatibility module,
+// and all of the compatibility files are mounted under this prefix by default.
+#define PROCPREFIX "/compat/linux"
+
+#ifndef MSG_MORE
+#define MSG_MORE 0
+#endif
+
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* And include the extra required include file */
+#include <pthread_np.h>
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#define cpu_set_t cpuset_t
+int sched_setaffinity(pid_t pid, size_t cpusetsize,
+                      cpu_set_t *mask);
+
+#endif /* __FreeBSD__ */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+/* Make sure that ENODATA is defined in the correct way */
+#ifdef ENODATA
+#if (ENODATA == 9919)
+// #warning ENODATA already defined to be 9919, redefining to fix
+// Silencing this warning because it fires at all files where compat.h
+// is included after boost files.
+//
+// This value stems from the definition in the boost library
+// And when this case occurs it is due to the fact that boost files
+// are included before this file. Redefinition might not help in this
+// case since already parsed code has evaluated to the wrong value.
+// This would warrant a definition that would actually be evaluated
+// at the location of usage and report a possible conflict.
+// This is left up to a future improvement
+#elif (ENODATA != 87)
+// #warning ENODATA already defined to a value different from 87 (ENOATTR), redefining to fix
+#endif
+#undef ENODATA
+#endif
+#define ENODATA ENOATTR
+
+// Fix clock accuracy: map the *_COARSE clock ids onto the *_FAST ids when
+// those exist, otherwise onto the regular clocks.
+#if !defined(CLOCK_MONOTONIC_COARSE)
+#if defined(CLOCK_MONOTONIC_FAST)
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
+#else
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+#endif
+#if !defined(CLOCK_REALTIME_COARSE)
+#if defined(CLOCK_REALTIME_FAST)
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST
+#else
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+#endif
+
+/* get PATH_MAX */
+#include <limits.h>
+
+/* Fallback values for error codes and flags that may be missing here. */
+#ifndef EUCLEAN
+#define EUCLEAN 117
+#endif
+#ifndef EREMOTEIO
+#define EREMOTEIO 121
+#endif
+#ifndef EKEYREJECTED
+#define EKEYREJECTED 129
+#endif
+#ifndef XATTR_CREATE
+#define XATTR_CREATE 1
+#endif
+
+/* Prefer MAXHOSTNAMELEN when available; otherwise fall back to 255. */
+#ifndef HOST_NAME_MAX
+#ifdef MAXHOSTNAMELEN 
+#define HOST_NAME_MAX MAXHOSTNAMELEN 
+#else
+#define HOST_NAME_MAX 255
+#endif
+#endif
+
+#endif /* __APPLE__ || __FreeBSD__ */
+
+/* O_LARGEFILE is not defined/required on OSX/FreeBSD */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Could be relevant for other platforms */
+#ifndef ERESTART
+#define ERESTART EINTR
+#endif
+
+/*
+ * Fallback used when TEMP_FAILURE_RETRY is not already defined: evaluates
+ * `expression` repeatedly for as long as it yields -1 with errno == EINTR,
+ * and produces the last result as the value of the macro. Written as a
+ * statement expression with __typeof (compiler extensions).
+ * NOTE(review): relies on errno/EINTR being declared at the point of use.
+ */
+#ifndef TEMP_FAILURE_RETRY
+#define TEMP_FAILURE_RETRY(expression) ({     \
+  __typeof(expression) __result;              \
+  do {                                        \
+    __result = (expression);                  \
+  } while (__result == -1 && errno == EINTR); \
+  __result; })
+#endif
+
+#ifdef __cplusplus
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+   static_cast<void>(TEMP_FAILURE_RETRY(expression))
+#else
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+   do { (void)TEMP_FAILURE_RETRY(expression); } while (0)
+#endif
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define lseek64(fd, offset, whence) lseek(fd, offset, whence)
+#endif
+
+#if defined(__sun) || defined(_AIX)
+/* Definitions supplied here for Solaris and AIX builds. */
+#define LOG_AUTHPRIV    (10<<3)
+#define LOG_FTP         (11<<3)
+/* Stringize the argument; a quoted "x" would always expand to the literal "x". */
+#define __STRING(x)     #x
+/* Shift the file-type bits of a mode (mask 0170000) down by 12. */
+#define IFTODT(mode)   (((mode) & 0170000) >> 12)
+#endif
+
+#if defined(_AIX)
+#define MSG_DONTWAIT MSG_NONBLOCK
+#endif
+
+#if defined(HAVE_PTHREAD_SETNAME_NP)
+  #if defined(__APPLE__)
+    #define ceph_pthread_setname(thread, name) ({ \
+      int __result = 0;                         \
+      if (thread == pthread_self())             \
+        __result = pthread_setname_np(name);    \
+      __result; })
+  #else
+    #define ceph_pthread_setname pthread_setname_np
+  #endif
+#elif defined(HAVE_PTHREAD_SET_NAME_NP)
+  /* Fix a small name diff and return 0 */
+  #define ceph_pthread_setname(thread, name) ({ \
+    pthread_set_name_np(thread, name);          \
+    0; })
+#else
+  /* compiler warning free success noop */
+  #define ceph_pthread_setname(thread, name) ({ \
+    int __i = 0;                              \
+    __i; })
+#endif
+
+#if defined(HAVE_PTHREAD_GETNAME_NP)
+  #define ceph_pthread_getname pthread_getname_np
+#elif defined(HAVE_PTHREAD_GET_NAME_NP)
+  #define ceph_pthread_getname(thread, name, len) ({ \
+    pthread_get_name_np(thread, name, len);          \
+    0; })
+#else
+  /* compiler warning free success noop */
+  #define ceph_pthread_getname(thread, name, len) ({ \
+    if (name != NULL)                              \
+      *name = '\0';                                \
+    0; })
+#endif
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
+int pipe_cloexec(int pipefd[2]);
+
+#endif /* !CEPH_COMPAT_H */
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
new file mode 100644
index 00000000..acced696
--- /dev/null
+++ b/src/include/config-h.in.cmake
@@ -0,0 +1,366 @@
+/* config.h file expanded by Cmake for build */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* fallocate(2) is supported */
+#cmakedefine CEPH_HAVE_FALLOCATE
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#cmakedefine HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#cmakedefine HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `syncfs' function. */
+#cmakedefine HAVE_SYS_SYNCFS 1
+
+/* sync_file_range(2) is supported */
+#cmakedefine HAVE_SYNC_FILE_RANGE
+
+/* Define if you have mallinfo */
+#cmakedefine HAVE_MALLINFO
+
+/* Define to 1 if you have the `pwritev' function. */
+#cmakedefine HAVE_PWRITEV 1
+
+/* Define to 1 if you have the <sys/mount.h> header file. */
+#cmakedefine HAVE_SYS_MOUNT_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#cmakedefine HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/vfs.h> header file. */
+#cmakedefine HAVE_SYS_VFS_H 1
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#cmakedefine HAVE_EXECINFO_H 1
+
+/* Define to 1 if the system has the type `__be16'. */
+#cmakedefine HAVE___BE16 1
+
+/* Define to 1 if the system has the type `__be32'. */
+#cmakedefine HAVE___BE32 1
+
+/* Define to 1 if the system has the type `__be64'. */
+#cmakedefine HAVE___BE64 1
+
+/* Define to 1 if the system has the type `__le16'. */
+#cmakedefine HAVE___LE16 1
+
+/* Define to 1 if the system has the type `__le32'. */
+#cmakedefine HAVE___LE32 1
+
+/* Define to 1 if the system has the type `__le64'. */
+#cmakedefine HAVE___LE64 1
+
+/* Define to 1 if the system has the type `__s16'. */
+#cmakedefine HAVE___S16 1
+
+/* Define to 1 if the system has the type `__s32'. */
+#cmakedefine HAVE___S32 1
+
+/* Define to 1 if the system has the type `__s64'. */
+#cmakedefine HAVE___S64 1
+
+/* Define to 1 if the system has the type `__s8'. */
+#cmakedefine HAVE___S8 1
+
+/* Define to 1 if the system has the type `__u16'. */
+#cmakedefine HAVE___U16 1
+
+/* Define to 1 if the system has the type `__u32'. */
+#cmakedefine HAVE___U32 1
+
+/* Define to 1 if the system has the type `__u64'. */
+#cmakedefine HAVE___U64 1
+
+/* Define to 1 if the system has the type `__u8'. */
+#cmakedefine HAVE___U8 1
+
+/* Define if you have res_nquery */
+#cmakedefine HAVE_RES_NQUERY
+
+/* Defined if you have LZ4 */
+#cmakedefine HAVE_LZ4
+
+/* Defined if you have BROTLI */
+#cmakedefine HAVE_BROTLI
+
+/* Defined if you have libaio */
+#cmakedefine HAVE_LIBAIO
+
+/* Defined if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
+/* Defined if OpenLDAP enabled */
+#cmakedefine HAVE_OPENLDAP
+
+/* Define if you have fuse */
+#cmakedefine HAVE_LIBFUSE
+
+/* Define to 1 if you have libxfs */
+#cmakedefine HAVE_LIBXFS 1
+
+/* SPDK conditional compilation */
+#cmakedefine HAVE_SPDK
+
+/* DPDK conditional compilation */
+#cmakedefine HAVE_DPDK
+
+/* PMEM conditional compilation */
+#cmakedefine HAVE_PMEM
+
+/* Defined if LevelDB supports bloom filters */
+#cmakedefine HAVE_LEVELDB_FILTER_POLICY
+
+/* Define if you have tcmalloc */
+#cmakedefine HAVE_LIBTCMALLOC
+
+/* Define if have curl_multi_wait() */
+#cmakedefine HAVE_CURL_MULTI_WAIT 1
+
+/* Define if using NSS. */
+#cmakedefine USE_NSS
+
+/* Define if using OpenSSL. */
+#cmakedefine USE_OPENSSL
+
+/* Accelio conditional compilation */
+#cmakedefine HAVE_XIO
+
+
+/* AsyncMessenger RDMA conditional compilation */
+#cmakedefine HAVE_RDMA
+
+/* ibverbs experimental conditional compilation */
+#cmakedefine HAVE_IBV_EXP
+
+/* define if bluestore enabled */
+#cmakedefine WITH_BLUESTORE
+
+/* define if cephfs enabled */
+#cmakedefine WITH_CEPHFS
+
+/* define if GSSAPI/KRB5 enabled */
+#cmakedefine HAVE_GSSAPI
+
+/* define if rbd enabled */
+#cmakedefine WITH_RBD
+
+/* define if kernel rbd enabled */
+#cmakedefine WITH_KRBD
+
+/* define if key-value-store is enabled */
+#cmakedefine WITH_KVS
+
+/* define if radosgw enabled */
+#cmakedefine WITH_RADOSGW
+
+/* define if radosgw's fcgi frontend enabled */
+#cmakedefine WITH_RADOSGW_FCGI_FRONTEND
+
+/* define if leveldb is enabled */
+#cmakedefine WITH_LEVELDB
+
+/* define if radosgw's beast frontend enabled */
+#cmakedefine WITH_RADOSGW_BEAST_FRONTEND
+
+/* define if radosgw has openssl support */
+#cmakedefine WITH_CURL_OPENSSL
+
+/* define if HAVE_THREAD_SAFE_RES_QUERY */
+#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
+
+/* define if HAVE_REENTRANT_STRSIGNAL */
+#cmakedefine HAVE_REENTRANT_STRSIGNAL
+
+/* Define if you want to use LTTng */
+#cmakedefine WITH_LTTNG
+
+/* Define if you want OSD function instrumentation */
+#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS
+
+/* Define if you want to use Babeltrace */
+#cmakedefine WITH_BABELTRACE
+
+/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */
+#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1
+
+/* Define to 1 if you have the <arpa/nameser_compat.h> header file. */
+#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1
+
+/* FastCGI headers are in /usr/include/fastcgi */
+#cmakedefine FASTCGI_INCLUDE_DIR
+
+/* splice(2) is supported */
+#cmakedefine CEPH_HAVE_SPLICE
+
+/* Define if you want C_Gather debugging */
+#cmakedefine DEBUG_GATHER
+
+/* Define to 1 if you have the `getgrouplist' function. */
+#cmakedefine HAVE_GETGROUPLIST 1
+
+/* LTTng is disabled, so define this macro to be nothing. */
+#cmakedefine tracepoint
+
+/* Define to 1 if you have fdatasync. */
+#cmakedefine HAVE_FDATASYNC 1
+
+/* Defined if you have librocksdb enabled */
+#cmakedefine HAVE_LIBROCKSDB
+
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#cmakedefine HAVE_VALGRIND_HELGRIND_H 1
+
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+#cmakedefine HAVE_SYS_PRCTL_H 1
+
+/* Define to 1 if you have the <linux/types.h> header file. */
+#cmakedefine HAVE_LINUX_TYPES_H 1
+
+/* Define to 1 if you have the <linux/version.h> header file. */
+#cmakedefine HAVE_LINUX_VERSION_H 1
+
+/* Define to 1 if you have sched.h. */
+#cmakedefine HAVE_SCHED 1
+
+/* Define to 1 if you have sigdescr_np. */
+#cmakedefine HAVE_SIGDESCR_NP 1
+
+/* Support SSE (Streaming SIMD Extensions) instructions */
+#cmakedefine HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#cmakedefine HAVE_SSE2
+
+/* Define to 1 if you have the `pipe2' function. */
+#cmakedefine HAVE_PIPE2 1
+
+/* Support NEON instructions */
+#cmakedefine HAVE_NEON
+
+/* Define if you have pthread_spin_init */
+#cmakedefine HAVE_PTHREAD_SPINLOCK
+
+/* name_to_handle_at exists */
+#cmakedefine HAVE_NAME_TO_HANDLE_AT
+
+/* we have a recent yasm and are x86_64 */
+#cmakedefine HAVE_GOOD_YASM_ELF64 
+
+/* yasm can also build the isa-l */
+#cmakedefine HAVE_BETTER_YASM_ELF64
+
+/* Define to 1 if strerror_r returns char *. */
+#cmakedefine STRERROR_R_CHAR_P 1
+
+/* Defined if you have libzfs enabled */
+#cmakedefine HAVE_LIBZFS
+
+/* Define if the C compiler supports __func__ */
+#cmakedefine HAVE_FUNC
+
+/* Define if the C compiler supports __PRETTY_FUNCTION__ */
+#cmakedefine HAVE_PRETTY_FUNC
+
+/* Have eventfd extension. */
+#cmakedefine HAVE_EVENTFD
+
+/* Define if enabling coverage. */
+#cmakedefine ENABLE_COVERAGE
+
+/* Defined if you want pg ref debugging */
+#cmakedefine PG_DEBUG_REFS
+
+/* Support ARMv8 CRC instructions */
+#cmakedefine HAVE_ARMV8_CRC
+
+/* Support ARMv8 CRYPTO instructions */
+#cmakedefine HAVE_ARMV8_CRYPTO
+
+/* Support ARMv8 CRC and CRYPTO intrinsics */
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Define if you have struct stat.st_mtimespec.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC
+
+/* Define if you have struct stat.st_mtim.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC
+
+/* Define if compiler supports static_cast<> */
+#cmakedefine HAVE_STATIC_CAST
+
+/* Version number of package */
+#cmakedefine VERSION "@VERSION@"
+
+/* Defined if pthread_setname_np() is available */
+#cmakedefine HAVE_PTHREAD_SETNAME_NP 1
+
+/* Defined if pthread_rwlockattr_setkind_np() is available */
+#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+
+/* Defined if blkin enabled */
+#cmakedefine WITH_BLKIN
+
+/* Defined if pthread_set_name_np() is available */
+#cmakedefine HAVE_PTHREAD_SET_NAME_NP
+
+/* Defined if pthread_getname_np() is available */
+#cmakedefine HAVE_PTHREAD_GETNAME_NP 1
+
+/* Support POWER8 instructions */
+#cmakedefine HAVE_POWER8
+
+/* Define if endian type is big endian */
+#cmakedefine CEPH_BIG_ENDIAN
+
+/* Define if endian type is little endian */
+#cmakedefine CEPH_LITTLE_ENDIAN
+
+#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@"
+
+/* Define to 1 if you have the `getprogname' function. */
+#cmakedefine HAVE_GETPROGNAME 1
+
+/* Defined if getentropy() is available */
+#cmakedefine HAVE_GETENTROPY
+
+/* Defined if boost::context is available */
+#cmakedefine HAVE_BOOST_CONTEXT
+
+/* Defined if libradosstriper is enabled: */
+#cmakedefine WITH_LIBRADOSSTRIPER
+
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
+/* Defined if rabbitmq-c is available for rgw amqp push endpoint */
+#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT
+
+/* Defined if librdkafka is available for rgw kafka push endpoint */
+#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT
+
+/* Defined if std::map::merge() is supported */
+#cmakedefine HAVE_STDLIB_MAP_SPLICING
+
+/* Defined if Intel QAT compress/decompress is supported */
+#cmakedefine HAVE_QATZIP
+
+/* Define if seastar is available. */
+#cmakedefine HAVE_SEASTAR
+
+/* Define if unit tests are built. */
+#cmakedefine UNIT_TESTS_BUILT
+
+#endif /* CONFIG_H */
diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h
new file mode 100644
index 00000000..60fab432
--- /dev/null
+++ b/src/include/coredumpctl.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <iostream>
+#include <sys/prctl.h>
+#include "common/errno.h"
+
+/**
+ * Scoped guard around the process "dumpable" flag (see prctl(2)).
+ *
+ * The constructor switches the flag to the requested state and remembers the
+ * previous value; the destructor puts the previous value back. Every prctl()
+ * failure is reported as a warning on std::cerr and is otherwise non-fatal.
+ */
+class PrCtl {
+  // Flag value to reinstate in the destructor. Negative means "nothing to
+  // restore": the flag already had the requested value, or a prctl() call
+  // failed.
+  int saved_state = -1;
+
+  // Returns the current dumpable flag (>= 0), or -errno after printing a
+  // warning if it cannot be queried.
+  static int get_dumpable() {
+    int r = prctl(PR_GET_DUMPABLE);
+    if (r == -1) {
+      const int err = errno;
+      std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(err)
+                << std::endl;
+      // Report failure as a negative value so that the caller can never
+      // mistake an errno code for a valid (non-negative) flag value.
+      r = -err;
+    }
+    return r;
+  }
+
+  // Sets the dumpable flag. Returns 0 on success, or -errno after printing a
+  // warning on failure.
+  static int set_dumpable(bool new_state) {
+    // prctl() is variadic; pass the argument with the width it reads.
+    int r = prctl(PR_SET_DUMPABLE, static_cast<unsigned long>(new_state));
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
+                << " dumpable flag: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+public:
+  // @param new_state  non-zero to make the process dumpable, 0 (the default)
+  //                   to make it non-dumpable for the guard's lifetime.
+  PrCtl(int new_state = 0) {
+    int r = get_dumpable();
+    if (r < 0) {
+      // Could not read the current state, so there is nothing safe to restore.
+      return;
+    }
+    if (r != new_state) {
+      if (!set_dumpable(new_state)) {
+        saved_state = r;
+      }
+    }
+  }
+  ~PrCtl() {
+    if (saved_state < 0) {
+      return;
+    }
+    set_dumpable(saved_state);
+  }
+};
+
+#else
+#include <sys/resource.h>
+#ifdef RLIMIT_CORE
+#include <iostream>
+#include <sys/resource.h>
+#include "common/errno.h"
+
+/**
+ * Scoped guard around the core-dump size limit (RLIMIT_CORE), used where
+ * prctl() is not available.
+ *
+ * The constructor raises or zeroes the soft limit and remembers the previous
+ * limits; the destructor restores them if they were changed. Failures of
+ * getrlimit()/setrlimit() are reported as warnings on std::cerr and are
+ * otherwise non-fatal.
+ */
+class PrCtl {
+  // Limits that were in effect when the guard was created.
+  rlimit saved_lim{};
+  // True only if the constructor actually installed new limits, i.e. there is
+  // something for the destructor to undo.
+  bool changed = false;
+
+  // Reads the current RLIMIT_CORE into *saved. Returns 0 on success, or
+  // -errno after printing a warning on failure.
+  static int get_dumpable(rlimit* saved) {
+    int r = getrlimit(RLIMIT_CORE, saved);
+    if (r) {
+      const int err = errno;
+      std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(err)
+                << std::endl;
+      r = -err;
+    }
+    return r;
+  }
+
+  // Installs rlim as RLIMIT_CORE. Returns 0 on success, or -errno after
+  // printing a warning on failure.
+  static int set_dumpable(const rlimit& rlim) {
+    int r = setrlimit(RLIMIT_CORE, &rlim);
+    if (r) {
+      r = -errno;
+      std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+public:
+  // @param new_state  non-zero to allow core dumps up to the hard limit, 0
+  //                   (the default) to suppress them for the guard's lifetime.
+  PrCtl(int new_state = 0) {
+    if (get_dumpable(&saved_lim) < 0) {
+      // saved_lim is not trustworthy; leave the limits alone.
+      return;
+    }
+    // Start from the current limits so that rlim_max is always initialised.
+    // Only the soft limit is adjusted: an unprivileged process cannot raise
+    // its hard limit again, so lowering it would make the restore in the
+    // destructor fail.
+    rlimit new_lim = saved_lim;
+    new_lim.rlim_cur = new_state ? saved_lim.rlim_max : 0;
+    if (new_lim.rlim_cur == saved_lim.rlim_cur) {
+      return;
+    }
+    changed = (set_dumpable(new_lim) == 0);
+  }
+  ~PrCtl() {
+    if (changed) {
+      set_dumpable(saved_lim);
+    }
+  }
+};
+#else
+// No-op stand-in used when neither prctl() nor RLIMIT_CORE is available.
+struct PrCtl {
+  // to silence the Wunused-variable warning
+  // Accepts (and ignores) the same optional argument as the functional
+  // variants so that callers compile unchanged on every platform.
+  PrCtl(int /* new_state */ = 0) {}
+};
+
+#endif  // RLIMIT_CORE
+#endif
diff --git a/src/include/counter.h b/src/include/counter.h
new file mode 100644
index 00000000..61ed7409
--- /dev/null
+++ b/src/include/counter.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COUNTER_H
+#define CEPH_COUNTER_H
+
+#include <atomic>
+
+// Per-type instance counter. Each distinct T gets its own pair of atomic
+// counters: the number of Counter<T> objects currently alive and the total
+// number ever constructed.
+template <typename T>
+class Counter {
+public:
+  Counter() {
+    _count()++;
+    _increments()++;
+  }
+  Counter(const Counter &) {
+    _count()++;
+    _increments()++;
+  }
+  // A move-constructed object is a new object whose destructor will run in
+  // addition to the moved-from object's destructor, so it must be counted
+  // just like a copy; otherwise count() underflows.
+  Counter(Counter &&) {
+    _count()++;
+    _increments()++;
+  }
+  ~Counter() {
+    _count()--;
+  }
+  // Number of Counter<T> objects currently alive.
+  static uint64_t count() {
+    return _count();
+  }
+  // Total number of Counter<T> objects constructed so far.
+  static uint64_t increments() {
+    return _increments();
+  }
+  // Total number of Counter<T> objects destroyed so far.
+  static uint64_t decrements() {
+    return increments()-count();
+  }
+
+private:
+  // Function-local statics give one zero-initialised counter per T without
+  // needing an out-of-line definition.
+  static std::atomic<uint64_t> &_count() {
+    static std::atomic<uint64_t> c;
+    return c;
+  }
+  static std::atomic<uint64_t> &_increments() {
+    static std::atomic<uint64_t> i;
+    return i;
+  }
+};
+
+#endif
diff --git a/src/include/cpp-btree/btree.h b/src/include/cpp-btree/btree.h
new file mode 100644
index 00000000..0a40e0e1
--- /dev/null
+++ b/src/include/cpp-btree/btree.h
@@ -0,0 +1,2396 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree implementation of the STL set and map interfaces. A btree is both
+// smaller and faster than STL set/map. The red-black tree implementation of
+// STL set/map has an overhead of 3 pointers (left, right and parent) plus the
+// node color information for each stored value. So a set<int32> consumes 20
+// bytes for each value stored. This btree implementation stores multiple
+// values on fixed size nodes (usually 256 bytes) and doesn't store child
+// pointers for leaf nodes. The result is that a btree_set<int32> may use much
+// less memory per stored value. For the random insertion benchmark in
+// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per
+// stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. This is
+// notably different from STL set/map which takes care to not invalidate
+// iterators on insert/erase except, of course, for iterators pointing to the
+// value being erased.  A partial workaround when erasing is available:
+// erase() returns an iterator pointing to the item just after the one that was
+// erased (or end() if none exists).  See also safe_btree.
+
+// PERFORMANCE
+//
+//   btree_bench --benchmarks=. 2>&1 | ./benchmarks.awk
+//
+// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06
+// Benchmark                 STL(ns) B-Tree(ns) @    <size>
+// --------------------------------------------------------
+// BM_set_int32_insert        1516      608  +59.89%  <256>    [40.0,  5.2]
+// BM_set_int32_lookup        1160      414  +64.31%  <256>    [40.0,  5.2]
+// BM_set_int32_fulllookup     960      410  +57.29%  <256>    [40.0,  4.4]
+// BM_set_int32_delete        1741      528  +69.67%  <256>    [40.0,  5.2]
+// BM_set_int32_queueaddrem   3078     1046  +66.02%  <256>    [40.0,  5.5]
+// BM_set_int32_mixedaddrem   3600     1384  +61.56%  <256>    [40.0,  5.3]
+// BM_set_int32_fifo           227      113  +50.22%  <256>    [40.0,  4.4]
+// BM_set_int32_fwditer        158       26  +83.54%  <256>    [40.0,  5.2]
+// BM_map_int32_insert        1551      636  +58.99%  <256>    [48.0, 10.5]
+// BM_map_int32_lookup        1200      508  +57.67%  <256>    [48.0, 10.5]
+// BM_map_int32_fulllookup     989      487  +50.76%  <256>    [48.0,  8.8]
+// BM_map_int32_delete        1794      628  +64.99%  <256>    [48.0, 10.5]
+// BM_map_int32_queueaddrem   3189     1266  +60.30%  <256>    [48.0, 11.6]
+// BM_map_int32_mixedaddrem   3822     1623  +57.54%  <256>    [48.0, 10.9]
+// BM_map_int32_fifo           151      134  +11.26%  <256>    [48.0,  8.8]
+// BM_map_int32_fwditer        161       32  +80.12%  <256>    [48.0, 10.5]
+// BM_set_int64_insert        1546      636  +58.86%  <256>    [40.0, 10.5]
+// BM_set_int64_lookup        1200      512  +57.33%  <256>    [40.0, 10.5]
+// BM_set_int64_fulllookup     971      487  +49.85%  <256>    [40.0,  8.8]
+// BM_set_int64_delete        1745      616  +64.70%  <256>    [40.0, 10.5]
+// BM_set_int64_queueaddrem   3163     1195  +62.22%  <256>    [40.0, 11.6]
+// BM_set_int64_mixedaddrem   3760     1564  +58.40%  <256>    [40.0, 10.9]
+// BM_set_int64_fifo           146      103  +29.45%  <256>    [40.0,  8.8]
+// BM_set_int64_fwditer        162       31  +80.86%  <256>    [40.0, 10.5]
+// BM_map_int64_insert        1551      720  +53.58%  <256>    [48.0, 20.7]
+// BM_map_int64_lookup        1214      612  +49.59%  <256>    [48.0, 20.7]
+// BM_map_int64_fulllookup     994      592  +40.44%  <256>    [48.0, 17.2]
+// BM_map_int64_delete        1778      764  +57.03%  <256>    [48.0, 20.7]
+// BM_map_int64_queueaddrem   3189     1547  +51.49%  <256>    [48.0, 20.9]
+// BM_map_int64_mixedaddrem   3779     1887  +50.07%  <256>    [48.0, 21.6]
+// BM_map_int64_fifo           147      145   +1.36%  <256>    [48.0, 17.2]
+// BM_map_int64_fwditer        162       41  +74.69%  <256>    [48.0, 20.7]
+// BM_set_string_insert       1989     1966   +1.16%  <256>    [64.0, 44.5]
+// BM_set_string_lookup       1709     1600   +6.38%  <256>    [64.0, 44.5]
+// BM_set_string_fulllookup   1573     1529   +2.80%  <256>    [64.0, 35.4]
+// BM_set_string_delete       2520     1920  +23.81%  <256>    [64.0, 44.5]
+// BM_set_string_queueaddrem  4706     4309   +8.44%  <256>    [64.0, 48.3]
+// BM_set_string_mixedaddrem  5080     4654   +8.39%  <256>    [64.0, 46.7]
+// BM_set_string_fifo          318      512  -61.01%  <256>    [64.0, 35.4]
+// BM_set_string_fwditer       182       93  +48.90%  <256>    [64.0, 44.5]
+// BM_map_string_insert       2600     2227  +14.35%  <256>    [72.0, 55.8]
+// BM_map_string_lookup       2068     1730  +16.34%  <256>    [72.0, 55.8]
+// BM_map_string_fulllookup   1859     1618  +12.96%  <256>    [72.0, 44.0]
+// BM_map_string_delete       3168     2080  +34.34%  <256>    [72.0, 55.8]
+// BM_map_string_queueaddrem  5840     4701  +19.50%  <256>    [72.0, 59.4]
+// BM_map_string_mixedaddrem  6400     5200  +18.75%  <256>    [72.0, 57.8]
+// BM_map_string_fifo          398      596  -49.75%  <256>    [72.0, 44.0]
+// BM_map_string_fwditer       243      113  +53.50%  <256>    [72.0, 55.8]
+
+#ifndef UTIL_BTREE_BTREE_H__
+#define UTIL_BTREE_BTREE_H__
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "include/ceph_assert.h"
+
+namespace btree {
+
+// Swap helper whose name cannot collide with the swap functions defined by
+// the btree classes. Three simpler spellings do not work inside a btree
+// method:
+//   - plain swap() would select btree::swap;
+//   - ::swap would keep MSVC from picking up std::swap() implementations;
+//   - std::swap() would bypass specializations for types outside namespace
+//     std.
+// So std::swap is brought into scope and the call is left unqualified.
+template <typename T>
+inline void btree_swap_helper(T &lhs, T &rhs) {
+  using std::swap;
+  swap(lhs, rhs);
+}
+
+// Compile-time type selection: if_<cond, A, B>::type is A when cond is true
+// and B when cond is false.
+template<bool cond, typename A, typename B>
+struct if_ {
+  using type = A;
+};
+
+// Partial specialization for a false condition: selects B.
+template<typename A, typename B>
+struct if_<false, A, B> {
+  using type = B;
+};
+
+// Types small_ and big_ come with the promise that
+// sizeof(small_) < sizeof(big_).
+using small_ = char;
+
+struct big_ {
+  char dummy[2];
+};
+
+// A compile-time assertion.
+//
+// COMPILE_ASSERT(expr, msg) stops compilation when expr is false. msg is an
+// identifier naming the violated condition; it is shown in the compiler
+// diagnostic. The macro may be used at namespace, class or block scope and is
+// followed by a semicolon.
+//
+// CompileAssert is kept for source compatibility; the macro itself expands to
+// the language's static_assert, which gives a readable diagnostic and does
+// not leave an unused typedef behind.
+template <bool>
+struct CompileAssert {
+};
+
+#define COMPILE_ASSERT(expr, msg) \
+  static_assert(bool(expr), #msg)
+
+// A helper type used to indicate that a key-compare-to functor has been
+// provided. A user can specify a key-compare-to functor by deriving the
+// functor from this tag:
+//
+//  struct MyStringComparer
+//      : public btree::btree_key_compare_to_tag {
+//    int operator()(const string &a, const string &b) const {
+//      return a.compare(b);
+//    }
+//  };
+//
+// Note that the return type is an int and not a bool. There is a
+// COMPILE_ASSERT which enforces this return type.
+struct btree_key_compare_to_tag {
+};
+
+// A helper class that indicates if the Compare parameter is derived from
+// btree_key_compare_to_tag. The answer is exposed as the nested ::value
+// constant inherited from std::is_convertible, i.e. it is true exactly when a
+// Compare object converts to btree_key_compare_to_tag.
+template <typename Compare>
+struct btree_is_key_compare_to
+    : public std::is_convertible<Compare, btree_key_compare_to_tag> {
+};
+
+// Adapter that lets btree substitute a three-way "compare-to" comparison
+// (negative for less-than, zero for equality, positive for greater-than) for
+// a boolean comparison functor. Specializations exist for less<string> and
+// greater<string>, so btree users automatically get the more efficient
+// compare-to code for common string types with common comparison functors.
+//
+// This primary template adds nothing: it simply inherits the behaviour of
+// Compare.
+template <typename Compare>
+struct btree_key_compare_to_adapter : Compare {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const Compare &cmp) : Compare(cmp) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<Compare> &other)
+      : Compare(other) {}
+};
+
+// Specialization for std::less<std::string>: a three-way comparison built on
+// std::string::compare. Deriving from btree_key_compare_to_tag marks it as a
+// compare-to functor.
+template <>
+struct btree_key_compare_to_adapter<std::less<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::less<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::less<std::string> >&) {}
+
+  // Negative if lhs sorts before rhs, zero if equal, positive otherwise.
+  int operator()(const std::string &lhs, const std::string &rhs) const {
+    return lhs.compare(rhs);
+  }
+};
+
+// Specialization for std::greater<std::string>: the same three-way comparison
+// as the std::less<std::string> case but with the operands exchanged, which
+// yields descending order.
+template <>
+struct btree_key_compare_to_adapter<std::greater<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::greater<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::greater<std::string> >&) {}
+
+  // Negative if lhs sorts after rhs, zero if equal, positive otherwise.
+  int operator()(const std::string &lhs, const std::string &rhs) const {
+    return rhs.compare(lhs);
+  }
+};
+
+// Presents a uniform boolean "less-than" interface over the user's
+// comparator. This primary template is the case where no compare-to functor
+// is present: the wrapped comparator already returns a boolean, so its result
+// is forwarded unchanged.
+template <typename Key, typename Compare, bool HaveCompareTo>
+struct btree_key_comparer {
+  // The wrapped comparator.
+  Compare comp;
+
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+
+  // Stateless entry point: applies cmp to (lhs, rhs).
+  static bool bool_compare(const Compare &cmp, const Key &lhs, const Key &rhs) {
+    return cmp(lhs, rhs);
+  }
+
+  bool operator()(const Key &lhs, const Key &rhs) const {
+    return bool_compare(comp, lhs, rhs);
+  }
+};
+
+// Specialization of btree_key_comparer for a compare-to functor. Some parts
+// of the btree code, such as insert-with-hint, need a plain (boolean)
+// comparison; it is derived here by testing whether the three-way result is
+// negative.
+template <typename Key, typename Compare>
+struct btree_key_comparer<Key, Compare, true> {
+  // The wrapped compare-to functor.
+  Compare comp;
+
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+
+  // Stateless entry point: true when cmp orders lhs before rhs.
+  static bool bool_compare(const Compare &cmp, const Key &lhs, const Key &rhs) {
+    return cmp(lhs, rhs) < 0;
+  }
+
+  bool operator()(const Key &lhs, const Key &rhs) const {
+    return bool_compare(comp, lhs, rhs);
+  }
+};
+
+// A helper function to compare two keys using the specified compare functor.
+// It dispatches to the matching btree_key_comparer::bool_compare, chosen by
+// whether Compare is a compare-to functor (i.e. whether it is derived from
+// btree_key_compare_to_tag), and returns the boolean "x orders before y".
+template <typename Key, typename Compare>
+static bool btree_compare_keys(
+    const Compare &comp, const Key &x, const Key &y) {
+  return btree_key_comparer<
+      Key, Compare,
+      btree_is_key_compare_to<Compare>::value>::bool_compare(comp, x, y);
+}
+
+// Type parameters shared by the map and set flavours of the btree.
+template <typename Key, typename Compare,
+          typename Alloc, int TargetNodeSize, int ValueSize>
+struct btree_common_params {
+  // Compare itself when it is derived from btree_key_compare_to_tag;
+  // otherwise btree_key_compare_to_adapter<Compare>, which falls back to
+  // Compare unless an appropriate specialization exists.
+  using key_compare = typename if_<
+    btree_is_key_compare_to<Compare>::value,
+    Compare, btree_key_compare_to_adapter<Compare> >::type;
+  // Indicates whether key_compare is a key-compare-to functor or a plain old
+  // key-compare functor.
+  using is_key_compare_to = btree_is_key_compare_to<key_compare>;
+
+  using allocator_type = Alloc;
+  using key_type = Key;
+  using size_type = ssize_t;
+  using difference_type = ptrdiff_t;
+
+  enum {
+    kTargetNodeSize = TargetNodeSize,
+
+    // Space available for values. Leaf nodes have the most; their overhead
+    // is no fewer than two pointers.
+    kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*),
+  };
+
+  // An integral type large enough to count as many ValueSize-sized values as
+  // fit in a node of TargetNodeSize bytes.
+  using node_count_type = typename if_<
+    (kNodeValueSpace / ValueSize) >= 256,
+    uint16_t,
+    uint8_t>::type;
+};
+
+// Type parameters for a btree_map: values are key/data pairs.
+template <typename Key, typename Data, typename Compare,
+          typename Alloc, int TargetNodeSize>
+struct btree_map_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key) + sizeof(Data)> {
+  using data_type = Data;
+  using mapped_type = Data;
+  // What users see: the key half is const.
+  using value_type = std::pair<const Key, data_type>;
+  // Same pair with a non-const key.
+  using mutable_value_type = std::pair<Key, data_type>;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+
+  enum {
+    kValueSize = sizeof(Key) + sizeof(data_type),
+  };
+
+  // Extracts the key from either flavour of value.
+  static const Key& key(const value_type &v) { return v.first; }
+  static const Key& key(const mutable_value_type &v) { return v.first; }
+
+  // Swaps two stored values member by member.
+  static void swap(mutable_value_type *lhs, mutable_value_type *rhs) {
+    btree_swap_helper(lhs->first, rhs->first);
+    btree_swap_helper(lhs->second, rhs->second);
+  }
+};
+
+// Type parameters for a btree_set: the stored value is the key itself.
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize>
+struct btree_set_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key)> {
+  // A set carries no mapped data; std::false_type stands in for it.
+  using data_type = std::false_type;
+  using mapped_type = std::false_type;
+  using value_type = Key;
+  using mutable_value_type = value_type;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+
+  enum {
+    kValueSize = sizeof(Key),
+  };
+
+  // The value is its own key.
+  static const Key& key(const value_type &v) { return v; }
+
+  // Swaps two stored values.
+  static void swap(mutable_value_type *lhs, mutable_value_type *rhs) {
+    btree_swap_helper<mutable_value_type>(*lhs, *rhs);
+  }
+};
+
+// An adapter class that converts a lower-bound compare into an upper-bound
+// compare: adapter(a, b) is the negation of Compare(b, a).
+template <typename Key, typename Compare>
+struct btree_upper_bound_adapter : public Compare {
+  btree_upper_bound_adapter(Compare c) : Compare(c) {}
+  bool operator()(const Key &a, const Key &b) const {
+    const Compare &base = *this;
+    return !base(b, a);
+  }
+};
+
+// Compare-to counterpart of btree_upper_bound_adapter: forwards to CompareTo
+// with the two operands exchanged and returns its three-way result.
+template <typename Key, typename CompareTo>
+struct btree_upper_bound_compare_to_adapter : public CompareTo {
+  btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {}
+  int operator()(const Key &a, const Key &b) const {
+    const CompareTo &base = *this;
+    return base(b, a);
+  }
+};
+
+// Dispatch helper class for using linear search with plain compare. Both
+// functions scan all n.count() positions of node n.
+template <typename K, typename N, typename Compare>
+struct btree_linear_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    const int end = n.count();
+    return n.linear_search_plain_compare(k, 0, end, comp);
+  }
+  // Same scan, with the comparator wrapped in btree_upper_bound_adapter.
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    btree_upper_bound_adapter<K, Compare> upper(comp);
+    const int end = n.count();
+    return n.linear_search_plain_compare(k, 0, end, upper);
+  }
+};
+
+// Dispatch helper class for using linear search with compare-to.
+template <typename K, typename N, typename CompareTo>
+struct btree_linear_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    const int end = n.count();
+    return n.linear_search_compare_to(k, 0, end, comp);
+  }
+  // The upper bound is found with a plain (boolean) scan: the compare-to
+  // functor is first reduced to a boolean comparer and then wrapped in
+  // btree_upper_bound_adapter.
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_key_comparer<K, CompareTo, true> bool_comparer;
+    btree_upper_bound_adapter<K, bool_comparer> upper(comp);
+    const int end = n.count();
+    return n.linear_search_plain_compare(k, 0, end, upper);
+  }
+};
+
+// Dispatch helper class for using binary search with plain compare. Both
+// functions search all n.count() positions of node n.
+template <typename K, typename N, typename Compare>
+struct btree_binary_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    const int end = n.count();
+    return n.binary_search_plain_compare(k, 0, end, comp);
+  }
+  // Same search, with the comparator wrapped in btree_upper_bound_adapter.
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    btree_upper_bound_adapter<K, Compare> upper(comp);
+    const int end = n.count();
+    return n.binary_search_plain_compare(k, 0, end, upper);
+  }
+};
+
+// Dispatch helper class for using binary search with compare-to.
+template <typename K, typename N, typename CompareTo>
+struct btree_binary_search_compare_to {
+  // Searches all n.count() positions of node n with the caller's comparator.
+  // The comparator instance that was passed in must be used (not a freshly
+  // default-constructed one) so that stateful comparators work and CompareTo
+  // does not have to be default-constructible.
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    return n.binary_search_compare_to(k, 0, n.count(), comp);
+  }
+  // The compare-to functor is reduced to a boolean comparer and wrapped in
+  // btree_upper_bound_adapter.
+  // NOTE(review): this performs a linear scan even though this is the
+  // binary-search dispatcher; left as is to keep behaviour unchanged.
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// A node in the btree holding values. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
+template <typename Params>
+class btree_node {
+ public:
+  typedef Params params_type;
+  typedef btree_node<Params> self_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::mutable_value_type mutable_value_type;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  // Typedefs for the various types of node searches.
+  typedef btree_linear_search_plain_compare<
+    key_type, self_type, key_compare> linear_search_plain_compare_type;
+  typedef btree_linear_search_compare_to<
+    key_type, self_type, key_compare> linear_search_compare_to_type;
+  typedef btree_binary_search_plain_compare<
+    key_type, self_type, key_compare> binary_search_plain_compare_type;
+  typedef btree_binary_search_compare_to<
+    key_type, self_type, key_compare> binary_search_compare_to_type;
+  // If we have a valid key-compare-to type, use linear_search_compare_to,
+  // otherwise use linear_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    linear_search_compare_to_type,
+    linear_search_plain_compare_type>::type linear_search_type;
+  // If we have a valid key-compare-to type, use binary_search_compare_to,
+  // otherwise use binary_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    binary_search_compare_to_type,
+    binary_search_plain_compare_type>::type binary_search_type;
+  // If the key is an integral or floating point type, use linear search which
+  // is faster than binary search for such types. Might be wise to also
+  // configure linear search based on node-size.
+  typedef typename if_<
+    std::is_integral<key_type>::value ||
+    std::is_floating_point<key_type>::value,
+    linear_search_type, binary_search_type>::type search_type;
+
+  struct base_fields {
+    typedef typename Params::node_count_type field_type;
+
+    // A boolean indicating whether the node is a leaf or not.
+    bool leaf;
+    // The position of the node in the node's parent.
+    field_type position;
+    // The maximum number of values the node can hold.
+    field_type max_count;
+    // The count of the number of values in the node.
+    field_type count;
+    // A pointer to the node's parent.
+    btree_node *parent;
+  };
+
+  enum {
+    kValueSize = params_type::kValueSize,
+    kTargetNodeSize = params_type::kTargetNodeSize,
+
+    // Compute how many values we can fit onto a leaf node.
+    kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize,
+    // We need a minimum of 3 values per internal node in order to perform
+    // splitting (1 value for the two nodes involved in the split and 1 value
+    // propagated to the parent as the delimiter for the split).
+    kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3,
+
+    kExactMatch = 1 << 30,
+    kMatchMask = kExactMatch - 1,
+  };
+
+  struct leaf_fields : public base_fields {
+    // The array of values. Only the first count of these values have been
+    // constructed and are valid.
+    mutable_value_type values[kNodeValues];
+  };
+
+  struct internal_fields : public leaf_fields {
+    // The array of child pointers. The keys in children_[i] are all less than
+    // key(i). The keys in children_[i + 1] are all greater than key(i). There
+    // are always count + 1 children.
+    btree_node *children[kNodeValues + 1];
+  };
+
+  struct root_fields : public internal_fields {
+    btree_node *rightmost;
+    size_type size;
+  };
+
+ public:
+  // Getter for whether this is a leaf node or not. There is no setter: this
+  // value doesn't change after the node is created.
+  bool leaf() const { return fields_.leaf; }
+
+  // Getter for the position of this node in its parent.
+  int position() const { return fields_.position; }
+  void set_position(int v) { fields_.position = v; }
+
+  // Getter/setter for the number of values stored in this node.
+  int count() const { return fields_.count; }
+  void set_count(int v) { fields_.count = v; }
+  int max_count() const { return fields_.max_count; }
+
+  // Getter for the parent of this node.
+  btree_node* parent() const { return fields_.parent; }
+  // Getter for whether the node is the root of the tree. The parent of the
+  // root of the tree is the leftmost node in the tree which is guaranteed to
+  // be a leaf.
+  bool is_root() const { return parent()->leaf(); }
+  void make_root() {
+    ceph_assert(parent()->is_root());
+    fields_.parent = fields_.parent->parent();
+  }
+
+  // Getter for the rightmost root node field. Only valid on the root node.
+  btree_node* rightmost() const { return fields_.rightmost; }
+  btree_node** mutable_rightmost() { return &fields_.rightmost; }
+
+  // Getter for the size root node field. Only valid on the root node.
+  size_type size() const { return fields_.size; }
+  size_type* mutable_size() { return &fields_.size; }
+
+  // Accessors for slot i of this node: the key extracted by params_type, the
+  // stored value viewed through the public reference types, and a pointer to
+  // the underlying mutable storage.
+  const key_type& key(int i) const {
+    const mutable_value_type &slot = fields_.values[i];
+    return params_type::key(slot);
+  }
+  reference value(int i) {
+    mutable_value_type &slot = fields_.values[i];
+    return reinterpret_cast<reference>(slot);
+  }
+  const_reference value(int i) const {
+    const mutable_value_type &slot = fields_.values[i];
+    return reinterpret_cast<const_reference>(slot);
+  }
+  mutable_value_type* mutable_value(int i) {
+    return fields_.values + i;
+  }
+
+  // Exchanges slot i of this node with slot j of node x (x may be this node).
+  void value_swap(int i, btree_node *x, int j) {
+    mutable_value_type *const mine = mutable_value(i);
+    mutable_value_type *const theirs = x->mutable_value(j);
+    params_type::swap(mine, theirs);
+  }
+
+  // Child-pointer accessors for slot i. set_child() additionally updates the
+  // child's back-links (parent and position) to match its new slot.
+  btree_node* child(int i) const {
+    return fields_.children[i];
+  }
+  btree_node** mutable_child(int i) {
+    return &fields_.children[i];
+  }
+  void set_child(int i, btree_node *c) {
+    fields_.children[i] = c;
+    c->fields_.position = i;
+    c->fields_.parent = this;
+  }
+
+  // Position of the first value whose key is not less than k. The actual
+  // search strategy is delegated to search_type.
+  template <typename Compare>
+  int lower_bound(const key_type &k, const Compare &comp) const {
+    const btree_node &self = *this;
+    return search_type::lower_bound(k, self, comp);
+  }
+  // Position of the first value whose key is greater than k. The actual
+  // search strategy is delegated to search_type.
+  template <typename Compare>
+  int upper_bound(const key_type &k, const Compare &comp) const {
+    const btree_node &self = *this;
+    return search_type::upper_bound(k, self, comp);
+  }
+
+  // Linear scan over [s, e) with a plain (boolean) comparator. Returns the
+  // position of the first value whose key is not less than k, or e if every
+  // key in the range is less than k.
+  template <typename Compare>
+  int linear_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    for (; s < e; ++s) {
+      const bool key_is_less = btree_compare_keys(comp, key(s), k);
+      if (!key_is_less) {
+        return s;
+      }
+    }
+    return s;
+  }
+
+  // Linear scan over [s, e) with a three-way (compare-to) comparator. Returns
+  // the position of the first value whose key is not less than k; when that
+  // key compares equal to k the result has kExactMatch OR-ed in.
+  template <typename Compare>
+  int linear_search_compare_to(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    for (; s < e; ++s) {
+      const int c = comp(key(s), k);
+      if (c == 0) {
+        return s | kExactMatch;
+      }
+      if (c > 0) {
+        // key(s) is already past k: s is the insertion point.
+        return s;
+      }
+    }
+    return s;
+  }
+
+  // Binary search over [s, e) with a plain (boolean) comparator. Returns the
+  // position of the first value whose key is not less than k.
+  template <typename Compare>
+  int binary_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    while (s != e) {
+      const int mid = (s + e) / 2;
+      const bool mid_is_less = btree_compare_keys(comp, key(mid), k);
+      if (!mid_is_less) {
+        // key(mid) >= k: the answer is at mid or to its left.
+        e = mid;
+        continue;
+      }
+      s = mid + 1;
+    }
+    return s;
+  }
+
+  // Binary search over [s, e) with a three-way (compare-to) comparator.
+  // Returns the position of the first value whose key is not less than k; when
+  // an equal key is found the result has kExactMatch OR-ed in.
+  template <typename CompareTo>
+  int binary_search_compare_to(
+      const key_type &k, int s, int e, const CompareTo &comp) const {
+    while (s != e) {
+      const int mid = (s + e) / 2;
+      const int c = comp(key(mid), k);
+      if (c == 0) {
+        // An equal key exists at mid, but we must report the *first* position
+        // whose key is not less than k, so keep searching in [s, mid). The
+        // result is guaranteed to be an exact match: if key(mid - 1) < k the
+        // nested search returns mid itself.
+        const int first = binary_search_compare_to(k, s, mid, comp);
+        return first | kExactMatch;
+      }
+      if (c < 0) {
+        s = mid + 1;
+      } else {
+        e = mid;
+      }
+    }
+    return s;
+  }
+
+  // Inserts the value x at position i, shifting existing values at positions
+  // >= i to the right by 1. On an internal node the children at positions > i
+  // are shifted right as well and child slot i + 1 is left NULL.
+  void insert_value(int i, const value_type &x);
+
+  // Removes the value at position i, shifting existing values at positions
+  // > i to the left by 1. On an internal node child i + 1 must already be
+  // empty (count() == 0); it is overwritten as the children to its right are
+  // shifted left by 1.
+  void remove_value(int i);
+
+  // Rebalances a node with its right sibling. rebalance_right_to_left() moves
+  // to_move values from the right sibling into this node;
+  // rebalance_left_to_right() moves to_move values from this node into the
+  // right sibling. Both rotate the values through the delimiting slot in the
+  // parent.
+  void rebalance_right_to_left(btree_node *sibling, int to_move);
+  void rebalance_left_to_right(btree_node *sibling, int to_move);
+
+  // Splits a node, moving a portion of the node's values to its right sibling.
+  void split(btree_node *sibling, int insert_position);
+
+  // Merges a node with its right sibling, moving all of the values and the
+  // delimiting key in the parent node onto itself.
+  void merge(btree_node *sibling);
+
+  // Swap the contents of "this" and "src".
+  void swap(btree_node *src);
+
+  // Compile-time switch: false in builds without NDEBUG, where the init_*
+  // routines zero freshly initialized node storage; true otherwise.
+#ifndef NDEBUG
+  static constexpr bool no_debug = false;
+#else
+  static constexpr bool no_debug = true;
+#endif
+  // Node allocation/deletion routines.
+  //
+  // Initializes the storage at f as an empty leaf node with the given parent
+  // and capacity, and returns it viewed as a btree_node. When no_debug is
+  // false the value slots are zero-filled as well.
+  static btree_node* init_leaf(
+      leaf_fields *f, btree_node *parent, int max_count) {
+    f->parent = parent;
+    f->leaf = 1;
+    f->position = 0;
+    f->count = 0;
+    f->max_count = max_count;
+    if (!no_debug) {
+      memset(&f->values, 0, max_count * sizeof(value_type));
+    }
+    return reinterpret_cast<btree_node*>(f);
+  }
+  // Initializes the storage at f as an empty internal node: set it up like a
+  // full-capacity leaf, then clear the leaf flag. When no_debug is false the
+  // child pointer array is zero-filled as well.
+  static btree_node* init_internal(internal_fields *f, btree_node *parent) {
+    btree_node *const node = init_leaf(f, parent, kNodeValues);
+    f->leaf = 0;
+    if (no_debug) {
+      return node;
+    }
+    memset(f->children, 0, sizeof(f->children));
+    return node;
+  }
+  // Initializes the storage at f as an internal root node. The root-only
+  // fields are seeded from parent: size from its value count and rightmost
+  // from the node itself.
+  static btree_node* init_root(root_fields *f, btree_node *parent) {
+    btree_node *const node = init_internal(f, parent);
+    f->size = parent->count();
+    f->rightmost = parent;
+    return node;
+  }
+  // Runs the destructor of every value stored in this node. Neither the count
+  // nor the node's memory is touched.
+  void destroy() {
+    const int stored = count();
+    for (int slot = 0; slot != stored; ++slot) {
+      value_destroy(slot);
+    }
+  }
+
+ private:
+  // Placement-constructs slot i: default-initialized, or as a copy of x.
+  void value_init(int i) {
+    new (mutable_value(i)) mutable_value_type;
+  }
+  void value_init(int i, const value_type &x) {
+    new (mutable_value(i)) mutable_value_type(x);
+  }
+  // Runs the destructor of the value in slot i without releasing its storage.
+  void value_destroy(int i) {
+    mutable_value(i)->~mutable_value_type();
+  }
+
+ private:
+  root_fields fields_;
+
+ private:
+  // Nodes are set up by init_leaf()/init_internal()/init_root() over
+  // pre-allocated field storage; copying a btree_node is disallowed. The
+  // operations are deleted so that misuse fails at compile time instead of at
+  // link time.
+  btree_node(const btree_node&) = delete;
+  void operator=(const btree_node&) = delete;
+};
+
+// Bidirectional iterator over a btree. An iterator is a (node, position)
+// pair; Node may be const-qualified, and Reference/Pointer select the types
+// returned by dereferencing.
+template <typename Node, typename Reference, typename Pointer>
+struct btree_iterator {
+  using key_type = typename Node::key_type;
+  using size_type = typename Node::size_type;
+  using difference_type = typename Node::difference_type;
+  using params_type = typename Node::params_type;
+
+  using node_type = Node;
+  using normal_node = typename std::remove_const<Node>::type;
+  using const_node = const Node;
+  using value_type = typename params_type::value_type;
+  using normal_pointer = typename params_type::pointer;
+  using normal_reference = typename params_type::reference;
+  using const_pointer = typename params_type::const_pointer;
+  using const_reference = typename params_type::const_reference;
+
+  using pointer = Pointer;
+  using reference = Reference;
+  using iterator_category = std::bidirectional_iterator_tag;
+
+  using iterator =
+      btree_iterator<normal_node, normal_reference, normal_pointer>;
+  using const_iterator =
+      btree_iterator<const_node, const_reference, const_pointer>;
+  using self_type = btree_iterator<Node, Reference, Pointer>;
+
+  // A default-constructed iterator refers to no node.
+  btree_iterator() : node(nullptr), position(-1) {}
+  btree_iterator(Node *n, int p) : node(n), position(p) {}
+  // Construction from the mutable iterator type; for const_iterator this is
+  // the iterator -> const_iterator conversion.
+  btree_iterator(const iterator &x) : node(x.node), position(x.position) {}
+
+  // Increment/decrement the iterator. Moving within a leaf is handled inline;
+  // everything else is left to the *_slow() routines.
+  void increment() {
+    if (node->leaf()) {
+      ++position;
+      if (position < node->count()) {
+        return;
+      }
+    }
+    increment_slow();
+  }
+  void increment_by(int count);
+  void increment_slow();
+
+  void decrement() {
+    if (node->leaf()) {
+      --position;
+      if (position >= 0) {
+        return;
+      }
+    }
+    decrement_slow();
+  }
+  void decrement_slow();
+
+  // Two iterators are equal when they name the same slot of the same node.
+  bool operator==(const const_iterator &x) const {
+    return position == x.position && node == x.node;
+  }
+  bool operator!=(const const_iterator &x) const {
+    return !(position == x.position && node == x.node);
+  }
+
+  // Accessors for the key/value the iterator is pointing at.
+  const key_type& key() const {
+    return node->key(position);
+  }
+  reference operator*() const {
+    return node->value(position);
+  }
+  pointer operator->() const {
+    return &node->value(position);
+  }
+
+  // Pre-increment/decrement return the iterator itself.
+  self_type& operator++() {
+    increment();
+    return *this;
+  }
+  self_type& operator--() {
+    decrement();
+    return *this;
+  }
+  // Post-increment/decrement return a copy taken before the move.
+  self_type operator++(int) {
+    self_type before(*this);
+    increment();
+    return before;
+  }
+  self_type operator--(int) {
+    self_type before(*this);
+    decrement();
+    return before;
+  }
+
+  // The node in the tree the iterator is pointing at.
+  Node *node;
+  // The position within the node of the tree the iterator is pointing at.
+  int position;
+};
+
+// Dispatch helper that routes btree::internal_locate to the tree's
+// plain-compare implementation.
+struct btree_internal_locate_plain_compare {
+  template <typename Key, typename Tree, typename Iterator>
+  static std::pair<Iterator, int> dispatch(
+      const Key &key, const Tree &tree, Iterator iter) {
+    return tree.internal_locate_plain_compare(key, iter);
+  }
+};
+
+// Dispatch helper that routes btree::internal_locate to the tree's
+// compare-to implementation.
+struct btree_internal_locate_compare_to {
+  template <typename Key, typename Tree, typename Iterator>
+  static std::pair<Iterator, int> dispatch(
+      const Key &key, const Tree &tree, Iterator iter) {
+    return tree.internal_locate_compare_to(key, iter);
+  }
+};
+
+// The btree container itself. All types and tuning constants come from
+// Params. The comparator is stored as a base class (key_comp() simply returns
+// *this); the allocator is stored as the base of root_, next to the root
+// pointer. An empty tree is represented by a NULL root.
+template <typename Params>
+class btree : public Params::key_compare {
+  typedef btree<Params> self_type;
+  typedef btree_node<Params> node_type;
+  typedef typename node_type::base_fields base_fields;
+  typedef typename node_type::leaf_fields leaf_fields;
+  typedef typename node_type::internal_fields internal_fields;
+  typedef typename node_type::root_fields root_fields;
+  typedef typename Params::is_key_compare_to is_key_compare_to;
+
+  // The dispatch helpers are declared as structs, so befriend them with the
+  // matching class-key.
+  friend struct btree_internal_locate_plain_compare;
+  friend struct btree_internal_locate_compare_to;
+  typedef typename if_<
+    is_key_compare_to::value,
+    btree_internal_locate_compare_to,
+    btree_internal_locate_plain_compare>::type internal_locate_type;
+
+  enum {
+    kNodeValues = node_type::kNodeValues,
+    kMinNodeValues = kNodeValues / 2,
+    kValueSize = node_type::kValueSize,
+    kExactMatch = node_type::kExactMatch,
+    kMatchMask = node_type::kMatchMask,
+  };
+
+  // A helper class to get the empty base class optimization for 0-size
+  // allocators. Base is internal_allocator_type.
+  // (e.g. empty_base_handle<internal_allocator_type, node_type*>). If Base is
+  // 0-size, the compiler doesn't have to reserve any space for it and
+  // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
+  // class optimization] for more details.
+  template <typename Base, typename Data>
+  struct empty_base_handle : public Base {
+    empty_base_handle(const Base &b, const Data &d)
+        : Base(b),
+          data(d) {
+    }
+    Data data;
+  };
+
+  // Leaf/internal node counts accumulated by internal_stats().
+  struct node_stats {
+    node_stats(ssize_t l, ssize_t i)
+        : leaf_nodes(l),
+          internal_nodes(i) {
+    }
+
+    node_stats& operator+=(const node_stats &x) {
+      leaf_nodes += x.leaf_nodes;
+      internal_nodes += x.internal_nodes;
+      return *this;
+    }
+
+    ssize_t leaf_nodes;
+    ssize_t internal_nodes;
+  };
+
+ public:
+  typedef Params params_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::mapped_type mapped_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  typedef btree_iterator<node_type, reference, pointer> iterator;
+  typedef typename iterator::const_iterator const_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+  typedef typename Params::allocator_type allocator_type;
+  typedef typename allocator_type::template rebind<char>::other
+    internal_allocator_type;
+
+ public:
+  // Default constructor.
+  btree(const key_compare &comp, const allocator_type &alloc);
+
+  // Copy constructor.
+  btree(const self_type &x);
+
+  // Destructor.
+  ~btree() {
+    clear();
+  }
+
+  // Iterator routines. For an empty tree begin() and end() both yield an
+  // iterator with a NULL node and position 0.
+  iterator begin() {
+    return iterator(leftmost(), 0);
+  }
+  const_iterator begin() const {
+    return const_iterator(leftmost(), 0);
+  }
+  iterator end() {
+    return iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  const_iterator end() const {
+    return const_iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  reverse_iterator rbegin() {
+    return reverse_iterator(end());
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  reverse_iterator rend() {
+    return reverse_iterator(begin());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Finds the first element whose key is not less than key.
+  iterator lower_bound(const key_type &key) {
+    return internal_end(
+        internal_lower_bound(key, iterator(root(), 0)));
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return internal_end(
+        internal_lower_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the first element whose key is greater than key.
+  iterator upper_bound(const key_type &key) {
+    return internal_end(
+        internal_upper_bound(key, iterator(root(), 0)));
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return internal_end(
+        internal_upper_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the range of values which compare equal to key. The first member of
+  // the returned pair is equal to lower_bound(key). The second member of the
+  // pair is equal to upper_bound(key).
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed. The
+  // ValuePointer type is used to avoid instantiating the value unless the key
+  // is being inserted. Value is not dereferenced if the key already exists in
+  // the btree. See btree_map::operator[].
+  template <typename ValuePointer>
+  std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed.
+  std::pair<iterator,bool> insert_unique(const value_type &v) {
+    return insert_unique(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_unique(v) were made.
+  iterator insert_unique(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_unique(InputIterator b, InputIterator e);
+
+  // Inserts a value into the btree. The ValuePointer type is used to avoid
+  // instantiating the value unless the key is being inserted. Value is not
+  // dereferenced if the key already exists in the btree. See
+  // btree_map::operator[].
+  template <typename ValuePointer>
+  iterator insert_multi(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree.
+  iterator insert_multi(const value_type &v) {
+    return insert_multi(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_multi(v) were made.
+  iterator insert_multi(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_multi(InputIterator b, InputIterator e);
+
+  // Implements copy assignment; see operator=().
+  void assign(const self_type &x);
+
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(iterator iter);
+
+  // Erases range. Returns the number of keys erased.
+  int erase(iterator begin, iterator end);
+
+  // Erases the specified key from the btree. Returns 1 if an element was
+  // erased and 0 otherwise.
+  int erase_unique(const key_type &key);
+
+  // Erases all of the entries matching the specified key from the
+  // btree. Returns the number of elements erased.
+  int erase_multi(const key_type &key);
+
+  // Finds the iterator corresponding to a key or returns end() if the key is
+  // not present.
+  iterator find_unique(const key_type &key) {
+    return internal_end(
+        internal_find_unique(key, iterator(root(), 0)));
+  }
+  const_iterator find_unique(const key_type &key) const {
+    return internal_end(
+        internal_find_unique(key, const_iterator(root(), 0)));
+  }
+  iterator find_multi(const key_type &key) {
+    return internal_end(
+        internal_find_multi(key, iterator(root(), 0)));
+  }
+  const_iterator find_multi(const key_type &key) const {
+    return internal_end(
+        internal_find_multi(key, const_iterator(root(), 0)));
+  }
+
+  // Returns a count of the number of times the key appears in the btree
+  // (0 or 1 for a tree holding unique keys).
+  size_type count_unique(const key_type &key) const {
+    const_iterator begin = internal_find_unique(
+        key, const_iterator(root(), 0));
+    if (!begin.node) {
+      // The key doesn't exist in the tree.
+      return 0;
+    }
+    return 1;
+  }
+  // Returns a count of the number of times the key appears in the btree.
+  size_type count_multi(const key_type &key) const {
+    // Qualified call: do not depend on argument-dependent lookup happening to
+    // reach namespace std through the iterator's template arguments.
+    return std::distance(lower_bound(key), upper_bound(key));
+  }
+
+  // Clear the btree, deleting all of the values it contains.
+  void clear();
+
+  // Swap the contents of *this and x.
+  void swap(self_type &x);
+
+  // Assign the contents of x to *this.
+  self_type& operator=(const self_type &x) {
+    if (&x == this) {
+      // Don't copy onto ourselves.
+      return *this;
+    }
+    assign(x);
+    return *this;
+  }
+
+  // Comparator access. The comparator is this object's base class.
+  key_compare* mutable_key_comp() {
+    return this;
+  }
+  const key_compare& key_comp() const {
+    return *this;
+  }
+  bool compare_keys(const key_type &x, const key_type &y) const {
+    return btree_compare_keys(key_comp(), x, y);
+  }
+
+  // Dump the btree to the specified ostream. Requires that operator<< is
+  // defined for Key and Value.
+  void dump(std::ostream &os) const {
+    if (root() != NULL) {
+      internal_dump(os, root(), 0);
+    }
+  }
+
+  // Verifies the structure of the btree.
+  void verify() const;
+
+  // Size routines. Note that empty() is slightly faster than doing size()==0.
+  size_type size() const {
+    if (empty()) return 0;
+    if (root()->leaf()) return root()->count();
+    return root()->size();
+  }
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+  bool empty() const { return root() == NULL; }
+
+  // The height of the btree. An empty tree will have height 0.
+  size_type height() const {
+    size_type h = 0;
+    if (root()) {
+      // Count the length of the chain from the leftmost node up to the
+      // root. We actually count from the root back around to the level below
+      // the root, but the calculation is the same because of the circularity
+      // of that traversal.
+      const node_type *n = root();
+      do {
+        ++h;
+        n = n->parent();
+      } while (n != root());
+    }
+    return h;
+  }
+
+  // The number of internal, leaf and total nodes used by the btree.
+  size_type leaf_nodes() const {
+    return internal_stats(root()).leaf_nodes;
+  }
+  size_type internal_nodes() const {
+    return internal_stats(root()).internal_nodes;
+  }
+  size_type nodes() const {
+    node_stats stats = internal_stats(root());
+    return stats.leaf_nodes + stats.internal_nodes;
+  }
+
+  // The total number of bytes used by the btree. Note that an empty tree takes
+  // the second branch and is therefore reported as sizeof(*this) plus the
+  // root-field overhead.
+  size_type bytes_used() const {
+    node_stats stats = internal_stats(root());
+    if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
+      // Single leaf root: it may have been allocated with a reduced capacity.
+      return sizeof(*this) +
+          sizeof(base_fields) + root()->max_count() * sizeof(value_type);
+    } else {
+      return sizeof(*this) +
+          sizeof(root_fields) - sizeof(internal_fields) +
+          stats.leaf_nodes * sizeof(leaf_fields) +
+          stats.internal_nodes * sizeof(internal_fields);
+    }
+  }
+
+  // The average number of bytes used per value stored in the btree.
+  static double average_bytes_per_value() {
+    // Returns the number of bytes per value on a leaf node that is 75%
+    // full. Experimentally, this matches up nicely with the computed number of
+    // bytes per value in trees that had their values inserted in random order.
+    return sizeof(leaf_fields) / (kNodeValues * 0.75);
+  }
+
+  // The fullness of the btree. Computed as the number of elements in the btree
+  // divided by the maximum number of elements a tree with the current number
+  // of nodes could hold. A value of 1 indicates perfect space
+  // utilization. Smaller values indicate space wastage. An empty tree has no
+  // nodes and reports 0 (rather than the NaN that 0/0 would produce).
+  double fullness() const {
+    if (empty()) {
+      return 0.0;
+    }
+    return double(size()) / (nodes() * kNodeValues);
+  }
+  // The overhead of the btree structure in bytes per element. Computed as the
+  // total number of bytes used by the btree minus the number of bytes used for
+  // storing elements divided by the number of elements. An empty tree reports
+  // 0.
+  double overhead() const {
+    if (empty()) {
+      return 0.0;
+    }
+    return (bytes_used() - size() * kValueSize) / double(size());
+  }
+
+ private:
+  // Internal accessor routines.
+  node_type* root() { return root_.data; }
+  const node_type* root() const { return root_.data; }
+  node_type** mutable_root() { return &root_.data; }
+
+  // The rightmost node is stored in the root node. When the root is itself a
+  // leaf (or the tree is empty) the root pointer is the answer.
+  node_type* rightmost() {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  const node_type* rightmost() const {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  node_type** mutable_rightmost() { return root()->mutable_rightmost(); }
+
+  // The leftmost node is stored as the parent of the root node.
+  node_type* leftmost() { return root() ? root()->parent() : NULL; }
+  const node_type* leftmost() const { return root() ? root()->parent() : NULL; }
+
+  // The size of the tree is stored in the root node.
+  size_type* mutable_size() { return root()->mutable_size(); }
+
+  // Allocator routines. The allocator is the base class of root_.
+  internal_allocator_type* mutable_internal_allocator() {
+    return static_cast<internal_allocator_type*>(&root_);
+  }
+  const internal_allocator_type& internal_allocator() const {
+    return *static_cast<const internal_allocator_type*>(&root_);
+  }
+
+  // Node creation/deletion routines. Storage is obtained from the char
+  // allocator and sized by node kind; the node_type::init_*() routines then
+  // set up the fields in place.
+  node_type* new_internal_node(node_type *parent) {
+    internal_fields *p = reinterpret_cast<internal_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(internal_fields)));
+    return node_type::init_internal(p, parent);
+  }
+  node_type* new_internal_root_node() {
+    root_fields *p = reinterpret_cast<root_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(root_fields)));
+    return node_type::init_root(p, root()->parent());
+  }
+  node_type* new_leaf_node(node_type *parent) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(leaf_fields)));
+    return node_type::init_leaf(p, parent, kNodeValues);
+  }
+  // A leaf root may be smaller than a regular leaf: only max_count value slots
+  // are allocated, and the node is its own parent.
+  node_type* new_leaf_root_node(int max_count) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(
+            sizeof(base_fields) + max_count * sizeof(value_type)));
+    return node_type::init_leaf(p, reinterpret_cast<node_type*>(p), max_count);
+  }
+  void delete_internal_node(node_type *node) {
+    // Check the precondition before touching the node: the root was allocated
+    // with sizeof(root_fields) and must go through
+    // delete_internal_root_node().
+    ceph_assert(node != root());
+    node->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node), sizeof(internal_fields));
+  }
+  void delete_internal_root_node() {
+    root()->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(root()), sizeof(root_fields));
+  }
+  void delete_leaf_node(node_type *node) {
+    node->destroy();
+    // Leaves are sized by their own max_count(), which also covers a
+    // reduced-capacity leaf root.
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node),
+        sizeof(base_fields) + node->max_count() * sizeof(value_type));
+  }
+
+  // Rebalances or splits the node iter points to.
+  void rebalance_or_split(iterator *iter);
+
+  // Merges the values of left, right and the delimiting key on their parent
+  // onto left, removing the delimiting key and deleting right.
+  void merge_nodes(node_type *left, node_type *right);
+
+  // Tries to merge node with its left or right sibling, and failing that,
+  // rebalance with its left or right sibling. Returns true if a merge
+  // occurred, at which point it is no longer valid to access node. Returns
+  // false if no merging took place.
+  bool try_merge_or_rebalance(iterator *iter);
+
+  // Tries to shrink the height of the tree by 1.
+  void try_shrink();
+
+  // Maps the "not found" result of the internal routines (an iterator with a
+  // NULL node) to end(); any other iterator is returned unchanged.
+  iterator internal_end(iterator iter) {
+    return iter.node ? iter : end();
+  }
+  const_iterator internal_end(const_iterator iter) const {
+    return iter.node ? iter : end();
+  }
+
+  // Inserts a value into the btree immediately before iter. Requires that
+  // key(v) <= iter.key() and (--iter).key() <= key(v).
+  iterator internal_insert(iterator iter, const value_type &v);
+
+  // Returns an iterator pointing to the first value >= the value "iter" is
+  // pointing at. Note that "iter" might be pointing to an invalid location as
+  // iter.position == iter.node->count(). This routine simply moves iter up in
+  // the tree to a valid location.
+  template <typename IterType>
+  static IterType internal_last(IterType iter);
+
+  // Returns an iterator pointing to the leaf position at which key would
+  // reside in the tree. We provide 2 versions of internal_locate. The first
+  // version (internal_locate_plain_compare) always returns 0 for the second
+  // field of the pair. The second version (internal_locate_compare_to) is for
+  // the key-compare-to specialization and returns either kExactMatch (if the
+  // key was found in the tree) or -kExactMatch (if it wasn't) in the second
+  // field of the pair. The compare_to specialization allows the caller to
+  // avoid a subsequent comparison to determine if an exact match was made,
+  // speeding up string keys.
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_plain_compare(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_compare_to(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements lower_bound().
+  template <typename IterType>
+  IterType internal_lower_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements upper_bound().
+  template <typename IterType>
+  IterType internal_upper_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_unique().
+  template <typename IterType>
+  IterType internal_find_unique(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_multi().
+  template <typename IterType>
+  IterType internal_find_multi(
+      const key_type &key, IterType iter) const;
+
+  // Deletes a node and all of its children.
+  void internal_clear(node_type *node);
+
+  // Dumps a node and all of its children to the specified ostream.
+  void internal_dump(std::ostream &os, const node_type *node, int level) const;
+
+  // Verifies the tree structure of node.
+  int internal_verify(const node_type *node,
+                      const key_type *lo, const key_type *hi) const;
+
+  // Recursively counts the leaf and internal nodes in the subtree rooted at
+  // node. A NULL node contributes nothing.
+  node_stats internal_stats(const node_type *node) const {
+    if (!node) {
+      return node_stats(0, 0);
+    }
+    if (node->leaf()) {
+      return node_stats(1, 0);
+    }
+    node_stats res(0, 1);
+    // An internal node with count() values has count() + 1 children.
+    for (int i = 0; i <= node->count(); ++i) {
+      res += internal_stats(node->child(i));
+    }
+    return res;
+  }
+
+ private:
+  // The root pointer, bundled with the (possibly empty) allocator base.
+  empty_base_handle<internal_allocator_type, node_type*> root_;
+
+ private:
+  // A never instantiated helper function that returns big_ if R is the result
+  // type expected of the comparator -- int for a key-compare-to functor, bool
+  // for a plain compare functor -- and small_ otherwise.
+  template <typename R>
+  static typename if_<
+   if_<is_key_compare_to::value,
+             std::is_same<R, int>,
+             std::is_same<R, bool> >::type::value,
+   big_, small_>::type key_compare_checker(R);
+
+  // A never instantiated helper function that returns the key comparison
+  // functor.
+  static key_compare key_compare_helper();
+
+  // Verify that key_compare returns the expected type (int for compare-to
+  // functors, bool otherwise). This is similar to the way is_convertible in
+  // base/type_traits.h works. Note that key_compare_checker is never actually
+  // invoked. The compiler will select which key_compare_checker() to
+  // instantiate and then figure out the size of the return type of
+  // key_compare_checker() at compile time which we then check against the
+  // sizeof of big_.
+  COMPILE_ASSERT(
+      sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) ==
+      sizeof(big_),
+      key_comparison_function_must_return_bool);
+
+  // Note: We insist that kNodeValues, which is computed from
+  // Params::kTargetNodeSize, must fit the base_fields::field_type.
+  COMPILE_ASSERT(kNodeValues <
+                 (1 << (8 * sizeof(typename base_fields::field_type))),
+                 target_node_size_too_large);
+
+  // Test the assumption made in setting kNodeValueSpace.
+  COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*),
+                 node_space_assumption_incorrect);
+};
+
+////
+// btree_node methods
+// Inserts x at position i. The new value is constructed in the first free
+// slot and then swapped leftwards into place. On an internal node the
+// children to the right of the insertion point are shifted right by one and
+// child slot i + 1 is left NULL.
+template <typename P>
+inline void btree_node<P>::insert_value(int i, const value_type &x) {
+  const int old_count = count();
+  ceph_assert(i <= old_count);
+
+  // Construct at the end, then bubble the new value down to slot i.
+  value_init(old_count, x);
+  for (int j = old_count; j > i; --j) {
+    value_swap(j, this, j - 1);
+  }
+  set_count(old_count + 1);
+
+  if (leaf()) {
+    return;
+  }
+
+  // Open up child slot i + 1 by moving the children after it one to the right.
+  const int hole = i + 1;
+  for (int j = count(); j > hole; --j) {
+    btree_node *const moved = child(j - 1);
+    *mutable_child(j) = moved;
+    moved->set_position(j);
+  }
+  *mutable_child(hole) = NULL;
+}
+
+// Removes the value at position i. On an internal node child i + 1 must
+// already be empty; the children after it are shifted left over it. The
+// removed value is swapped to the end of the node and destroyed there.
+template <typename P>
+inline void btree_node<P>::remove_value(int i) {
+  if (!leaf()) {
+    ceph_assert(child(i + 1)->count() == 0);
+    const int last_child = count();
+    for (int j = i + 1; j < last_child; ++j) {
+      btree_node *const moved = child(j + 1);
+      *mutable_child(j) = moved;
+      moved->set_position(j);
+    }
+    *mutable_child(last_child) = NULL;
+  }
+
+  set_count(count() - 1);
+  // Bubble the doomed value up to the (now out-of-range) last slot.
+  int pos = i;
+  while (pos < count()) {
+    value_swap(pos, this, pos + 1);
+    ++pos;
+  }
+  value_destroy(pos);
+}
+
+// Moves to_move values from the right sibling src into this node, rotating
+// them through the delimiting slot of the shared parent. On internal nodes
+// the matching child pointers move along. Preconditions (asserted): src is
+// the immediate right sibling under the same parent, src holds at least as
+// many values as this node, and 1 <= to_move <= src->count().
+template <typename P>
+void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) {
+  ceph_assert(parent() == src->parent());
+  ceph_assert(position() + 1 == src->position());
+  ceph_assert(src->count() >= count());
+  ceph_assert(to_move >= 1);
+  ceph_assert(to_move <= src->count());
+
+  // Make room in the left node for the new values.
+  // (Default-constructs slots count() .. count() + to_move - 1.)
+  for (int i = 0; i < to_move; ++i) {
+    value_init(i + count());
+  }
+
+  // Move the delimiting value to the left node and the new delimiting value
+  // from the right node.
+  // After these two swaps src slot to_move - 1 holds the default-constructed
+  // value that was created above.
+  value_swap(count(), parent(), position());
+  parent()->value_swap(position(), src, to_move - 1);
+
+  // Move the values from the right to the left node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() + i, src, i - 1);
+  }
+  // Shift the values in the right node to their correct position.
+  for (int i = to_move; i < src->count(); ++i) {
+    src->value_swap(i - to_move, src, i);
+  }
+  // Destroy the last to_move slots of the right node, which are no longer in
+  // use after the shift.
+  for (int i = 1; i <= to_move; ++i) {
+    src->value_destroy(src->count() - i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i < to_move; ++i) {
+      set_child(1 + count() + i, src->child(i));
+    }
+    // Slide the remaining children of the right node down to the front and
+    // clear the slots they vacated.
+    for (int i = 0; i <= src->count() - to_move; ++i) {
+      ceph_assert(i + to_move <= src->max_count());
+      src->set_child(i, src->child(i + to_move));
+      *src->mutable_child(i + to_move) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() + to_move);
+  src->set_count(src->count() - to_move);
+}
+
+// Moves to_move values from the back of this (left) node to the front of the
+// right sibling dest, rotating them through the delimiting value stored in
+// the shared parent. On internal nodes the matching child pointers move too.
+//
+// Preconditions (asserted): both nodes have the same parent, dest sits
+// immediately to the right of this node, this node holds at least as many
+// values as dest, and 1 <= to_move <= count().
+template <typename P>
+void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) {
+  ceph_assert(parent() == dest->parent());
+  ceph_assert(position() + 1 == dest->position());
+  ceph_assert(count() >= dest->count());
+  ceph_assert(to_move >= 1);
+  ceph_assert(to_move <= count());
+
+  // Make room in the right node for the new values.
+  for (int i = 0; i < to_move; ++i) {
+    dest->value_init(i + dest->count());
+  }
+  // Shift dest's existing values right by to_move, working from the back so
+  // that nothing is overwritten.
+  for (int i = dest->count() - 1; i >= 0; --i) {
+    dest->value_swap(i, dest, i + to_move);
+  }
+
+  // Move the delimiting value to the right node and the new delimiting value
+  // from the left node.
+  // The old delimiter lands in dest slot to_move - 1; this node's value at
+  // count() - to_move becomes the new delimiter and its slot is destroyed.
+  dest->value_swap(to_move - 1, parent(), position());
+  parent()->value_swap(position(), this, count() - to_move);
+  value_destroy(count() - to_move);
+
+  // Move the values from the left to the right node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() - to_move + i, dest, i - 1);
+    value_destroy(count() - to_move + i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the left to the right node.
+    // First shift dest's own children right by to_move (back to front) ...
+    for (int i = dest->count(); i >= 0; --i) {
+      dest->set_child(i + to_move, dest->child(i));
+      *dest->mutable_child(i) = NULL;
+    }
+    // ... then hand over this node's last to_move children.
+    for (int i = 1; i <= to_move; ++i) {
+      dest->set_child(i - 1, child(count() - to_move + i));
+      *mutable_child(count() - to_move + i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() - to_move);
+  dest->set_count(dest->count() + to_move);
+}
+
+// Splits this node: the upper values move into the empty node dest, the
+// largest value remaining on this node is pushed up into the parent as the
+// new delimiter, and dest is linked in as the parent's child immediately to
+// the right of this node. insert_position only influences how many values go
+// to each side. The parent receives one extra value, so it needs a free slot.
+template <typename P>
+void btree_node<P>::split(btree_node *dest, int insert_position) {
+  ceph_assert(dest->count() == 0);
+
+  // We bias the split based on the position being inserted. If we're
+  // inserting at the beginning of the left node then bias the split to put
+  // more values on the right node. If we're inserting at the end of the
+  // right node then bias the split to put more values on the left node.
+  if (insert_position == 0) {
+    dest->set_count(count() - 1);
+  } else if (insert_position == max_count()) {
+    dest->set_count(0);
+  } else {
+    dest->set_count(count() / 2);
+  }
+  // count() still includes the value that will become the split key.
+  set_count(count() - dest->count());
+  ceph_assert(count() >= 1);
+
+  // Move values from the left sibling to the right sibling.
+  for (int i = 0; i < dest->count(); ++i) {
+    dest->value_init(i);
+    value_swap(count() + i, dest, i);
+    value_destroy(count() + i);
+  }
+
+  // The split key is the largest value in the left sibling.
+  // A default-constructed placeholder is inserted into the parent and then
+  // swapped with that value, after which the vacated slot is destroyed.
+  set_count(count() - 1);
+  parent()->insert_value(position(), value_type());
+  value_swap(count(), parent(), position());
+  value_destroy(count());
+  parent()->set_child(position() + 1, dest);
+
+  if (!leaf()) {
+    // Hand the children to the right of the split key over to dest.
+    for (int i = 0; i <= dest->count(); ++i) {
+      ceph_assert(child(count() + i + 1) != NULL);
+      dest->set_child(i, child(count() + i + 1));
+      *mutable_child(count() + i + 1) = NULL;
+    }
+  }
+}
+
+// Merges the right sibling src into this node: the delimiting value from the
+// parent and then all of src's values (and children, on internal nodes) are
+// appended to this node, src is left with a count of zero, and the delimiter
+// is removed from the parent. src itself is not freed here.
+//
+// Preconditions (asserted): both nodes have the same parent and src sits
+// immediately to the right of this node.
+template <typename P>
+void btree_node<P>::merge(btree_node *src) {
+  ceph_assert(parent() == src->parent());
+  ceph_assert(position() + 1 == src->position());
+
+  // Move the delimiting value to the left node.
+  value_init(count());
+  value_swap(count(), parent(), position());
+
+  // Move the values from the right to the left node.
+  // (count() has not been updated yet, hence the 1 + count() offset that
+  // skips over the delimiter just placed at slot count().)
+  for (int i = 0; i < src->count(); ++i) {
+    value_init(1 + count() + i);
+    value_swap(1 + count() + i, src, i);
+    src->value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i <= src->count(); ++i) {
+      set_child(1 + count() + i, src->child(i));
+      *src->mutable_child(i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(1 + count() + src->count());
+  src->set_count(0);
+
+  // Remove the value on the parent node.
+  // (remove_value() asserts that the child to the right of the removed value,
+  // i.e. src, is empty, which the set_count(0) above guarantees.)
+  parent()->remove_value(position());
+}
+
+// Exchanges the values, child pointers and counts of this node and x. Both
+// nodes must be of the same kind (leaf or internal, asserted). The children's
+// parent links are re-pointed at their new owner; the parent and position
+// fields of this node and x themselves are not touched here.
+template <typename P>
+void btree_node<P>::swap(btree_node *x) {
+  ceph_assert(leaf() == x->leaf());
+
+  // Swap the values.
+  // The node holding fewer values first gets extra slots initialised so that
+  // every index below n is a live value on both sides.
+  for (int i = count(); i < x->count(); ++i) {
+    value_init(i);
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    x->value_init(i);
+  }
+  int n = std::max(count(), x->count());
+  for (int i = 0; i < n; ++i) {
+    value_swap(i, x, i);
+  }
+  // The counts have not been exchanged yet, so these loops destroy the slots
+  // that became surplus on whichever node used to hold more values.
+  for (int i = count(); i < x->count(); ++i) {
+    x->value_destroy(i);
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Swap the child pointers.
+    for (int i = 0; i <= n; ++i) {
+      btree_swap_helper(*mutable_child(i), *x->mutable_child(i));
+    }
+    // x now owns this node's former children (count() + 1 of them) and vice
+    // versa; fix up their parent links accordingly.
+    for (int i = 0; i <= count(); ++i) {
+      x->child(i)->fields_.parent = x;
+    }
+    for (int i = 0; i <= x->count(); ++i) {
+      child(i)->fields_.parent = this;
+    }
+  }
+
+  // Swap the counts.
+  btree_swap_helper(fields_.count, x->fields_.count);
+}
+
+////
+// btree_iterator methods
+// Slow path of iterator increment.
+//
+// On an internal node the successor is the first value of the leftmost leaf
+// below child(position + 1). On a leaf whose values are exhausted the iterator
+// climbs towards the root until it reaches a node that still has a value at
+// the position it came up through; if no such node exists the iterator is
+// restored to where it started.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_slow() {
+  if (!node->leaf()) {
+    ceph_assert(position < node->count());
+    // Step into the right-hand subtree and run down its left edge.
+    for (node = node->child(position + 1); !node->leaf();
+         node = node->child(0)) {
+    }
+    position = 0;
+    return;
+  }
+
+  ceph_assert(position >= node->count());
+  self_type origin(*this);
+  for (;;) {
+    if (position != node->count() || node->is_root()) {
+      break;
+    }
+    ceph_assert(node->parent()->child(node->position()) == node);
+    position = node->position();
+    node = node->parent();
+  }
+  if (position == node->count()) {
+    // Ran off the end of every ancestor: stay where we were.
+    *this = origin;
+  }
+}
+
+// Advances the iterator by count values. Inside a leaf the position is moved
+// in bulk; each crossing of a node boundary (and each step taken from an
+// internal node) goes through increment_slow().
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_by(int count) {
+  for (; count > 0; increment_slow()) {
+    if (!node->leaf()) {
+      // The increment_slow() in the loop header consumes this one step.
+      --count;
+      continue;
+    }
+    int available = node->count() - position;
+    position += std::min(available, count);
+    count -= available;
+    if (position < node->count()) {
+      // Landed on a value inside this leaf; nothing more to do.
+      return;
+    }
+  }
+}
+
+// Slow path of iterator decrement.
+//
+// On an internal node the predecessor is the last value of the rightmost leaf
+// below child(position). On a leaf whose position has dropped below zero the
+// iterator climbs towards the root until it reaches a node with a value to
+// the left of the child it came up through; if no such node exists the
+// iterator is restored to where it started.
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::decrement_slow() {
+  if (!node->leaf()) {
+    ceph_assert(position >= 0);
+    // Step into the left-hand subtree and run down its right edge.
+    for (node = node->child(position); !node->leaf();
+         node = node->child(node->count())) {
+    }
+    position = node->count() - 1;
+    return;
+  }
+
+  ceph_assert(position <= -1);
+  self_type origin(*this);
+  for (;;) {
+    if (position >= 0 || node->is_root()) {
+      break;
+    }
+    ceph_assert(node->parent()->child(node->position()) == node);
+    position = node->position() - 1;
+    node = node->parent();
+  }
+  if (position < 0) {
+    // Ran off the front of every ancestor: stay where we were.
+    *this = origin;
+  }
+}
+
+////
+// btree methods
+// Constructs an empty tree. The comparator base sub-object is copied from
+// comp and root_ is initialised from alloc together with a NULL pointer, so
+// no node is allocated here.
+template <typename P>
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
+    : key_compare(comp),
+      root_(alloc, NULL) {
+}
+
+// Copy constructor: starts out as an empty tree using x's comparator and
+// allocator, then copies x's values via assign().
+template <typename P>
+btree<P>::btree(const self_type &x)
+    : key_compare(x.key_comp()),
+      root_(x.internal_allocator(), NULL) {
+  assign(x);
+}
+
+// Inserts *value under key unless an equivalent key is already stored.
+// Returns the iterator of the new (or pre-existing) element and a flag that
+// is true only when an insertion took place. An empty tree first gets a
+// one-slot leaf root.
+template <typename P> template <typename ValuePointer>
+std::pair<typename btree<P>::iterator, bool>
+btree<P>::insert_unique(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  std::pair<iterator, int> located =
+      internal_locate(key, iterator(root(), 0));
+  iterator &where = located.first;
+  const int match = located.second;
+
+  if (match == kExactMatch) {
+    // The locate step already proved that the key is present.
+    return std::make_pair(internal_last(where), false);
+  }
+  if (match == 0) {
+    // The locate step could not tell; compare against the element at the
+    // located position to find out whether the key is already there.
+    iterator candidate = internal_last(where);
+    if (candidate.node && !compare_keys(key, candidate.key())) {
+      return std::make_pair(candidate, false);
+    }
+  }
+
+  return std::make_pair(internal_insert(where, *value), true);
+}
+
+// Hinted unique insert. If v belongs directly before position, or directly
+// after it, the value is inserted there without a full lookup; if position
+// already holds an equivalent key it is returned unchanged. In every other
+// case (including an empty tree) the un-hinted insert_unique(v) is used.
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::insert_unique(iterator position, const value_type &v) {
+  if (empty()) {
+    return insert_unique(v).first;
+  }
+
+  const key_type &new_key = params_type::key(v);
+  if (position == end() || compare_keys(new_key, position.key())) {
+    iterator before = position;
+    if (position == begin() || compare_keys((--before).key(), new_key)) {
+      // before.key() < new_key < position.key()
+      return internal_insert(position, v);
+    }
+  } else if (compare_keys(position.key(), new_key)) {
+    iterator after = position;
+    ++after;
+    if (after == end() || compare_keys(new_key, after.key())) {
+      // position.key() < new_key < after.key()
+      return internal_insert(after, v);
+    }
+  } else {
+    // position.key() == new_key
+    return position;
+  }
+
+  // The hint was wrong; fall back to a regular lookup.
+  return insert_unique(v).first;
+}
+
+// Inserts every element of [b, e) with unique-key semantics, using end() as
+// the insertion hint for each one.
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_unique(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_unique(end(), *b);
+    ++b;
+  }
+}
+
+// Inserts *value under key, allowing duplicates. The insertion point is the
+// upper bound of key, or end() when the upper-bound search yields no node.
+// An empty tree first gets a one-slot leaf root.
+template <typename P> template <typename ValuePointer>
+typename btree<P>::iterator
+btree<P>::insert_multi(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  iterator where = internal_upper_bound(key, iterator(root(), 0));
+  return internal_insert(where.node ? where : end(), *value);
+}
+
+// Hinted multi insert. If v may legally be placed directly before position,
+// or directly after it, the value is inserted there without a full lookup;
+// otherwise (including for an empty tree) the un-hinted insert_multi(v) is
+// used.
+template <typename P>
+typename btree<P>::iterator
+btree<P>::insert_multi(iterator position, const value_type &v) {
+  if (empty()) {
+    return insert_multi(v);
+  }
+
+  const key_type &new_key = params_type::key(v);
+  if (position == end() || !compare_keys(position.key(), new_key)) {
+    iterator before = position;
+    if (position == begin() || !compare_keys(new_key, (--before).key())) {
+      // before.key() <= new_key <= position.key()
+      return internal_insert(position, v);
+    }
+  } else {
+    iterator after = position;
+    ++after;
+    if (after == end() || !compare_keys(after.key(), new_key)) {
+      // position.key() < new_key <= after.key()
+      return internal_insert(after, v);
+    }
+  }
+
+  // The hint was wrong; fall back to a regular lookup.
+  return insert_multi(v);
+}
+
+// Inserts every element of [b, e) with multi-key semantics, using end() as
+// the insertion hint for each one.
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_multi(InputIterator b, InputIterator e) {
+  while (b != e) {
+    insert_multi(end(), *b);
+    ++b;
+  }
+}
+
+// Replaces the contents of this tree with a copy of x, adopting x's key
+// comparator and allocator as well. Assigning a tree to itself is a no-op.
+template <typename P>
+void btree<P>::assign(const self_type &x) {
+  if (this == &x) {
+    // Without this guard the clear() below would destroy the very values we
+    // are about to copy, leaving the tree empty.
+    return;
+  }
+
+  clear();
+
+  *mutable_key_comp() = x.key_comp();
+  *mutable_internal_allocator() = x.internal_allocator();
+
+  // Assignment can avoid key comparisons because we know the order of the
+  // values is the same order we'll store them in.
+  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
+    if (empty()) {
+      // The first value goes through the regular insert path, which is what
+      // allocates the root node.
+      insert_multi(*iter);
+    } else {
+      // If the btree is not empty, we can just insert the new value at the end
+      // of the tree!
+      internal_insert(end(), *iter);
+    }
+  }
+}
+
+// Erases the element at iter and returns an iterator to the element that
+// followed it (end() if the tree became empty). Values are only ever removed
+// from leaves: a value on an internal node is first swapped with its in-order
+// predecessor, which lives on a leaf. Under-full nodes are then merged or
+// rebalanced on the way back up to the root.
+template <typename P>
+typename btree<P>::iterator btree<P>::erase(iterator iter) {
+  bool internal_delete = false;
+  if (!iter.node->leaf()) {
+    // Deletion of a value on an internal node. Swap the key with the largest
+    // value of our left child. This is easy, we just decrement iter.
+    // (tmp_iter keeps the original internal position; iter now refers to the
+    // predecessor on a leaf.)
+    iterator tmp_iter(iter--);
+    ceph_assert(iter.node->leaf());
+    ceph_assert(!compare_keys(tmp_iter.key(), iter.key()));
+    iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
+    internal_delete = true;
+    --*mutable_size();
+  } else if (!root()->leaf()) {
+    // NOTE(review): the size counter is only adjusted when the root is an
+    // internal node; presumably a leaf root's size is derived from its count
+    // -- confirm against size()/mutable_size().
+    --*mutable_size();
+  }
+
+  // Delete the key from the leaf.
+  iter.node->remove_value(iter.position);
+
+  // We want to return the next value after the one we just erased. If we
+  // erased from an internal node (internal_delete == true), then the next
+  // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+  // false) then the next value is ++iter. Note that ++iter may point to an
+  // internal node and the value in the internal node may move to a leaf node
+  // (iter.node) when rebalancing is performed at the leaf level.
+
+  // Merge/rebalance as we walk back up the tree.
+  iterator res(iter);
+  for (;;) {
+    if (iter.node == root()) {
+      try_shrink();
+      if (empty()) {
+        return end();
+      }
+      break;
+    }
+    if (iter.node->count() >= kMinNodeValues) {
+      break;
+    }
+    bool merged = try_merge_or_rebalance(&iter);
+    // Only the leaf-level pass can move the element we want to return, so
+    // res is refreshed only while iter is still on a leaf.
+    if (iter.node->leaf()) {
+      res = iter;
+    }
+    if (!merged) {
+      break;
+    }
+    // A merge removed a value from the parent, which may now be under-full.
+    iter.node = iter.node->parent();
+  }
+
+  // Adjust our return value. If we're pointing at the end of a node, advance
+  // the iterator.
+  if (res.position == res.node->count()) {
+    res.position = res.node->count() - 1;
+    ++res;
+  }
+  // If we erased from an internal node, advance the iterator.
+  if (internal_delete) {
+    ++res;
+  }
+  return res;
+}
+
+// Erases the elements in [begin, end) and returns how many were removed.
+// The number of elements is measured up front; each erase() call yields the
+// iterator to continue from.
+template <typename P>
+int btree<P>::erase(iterator begin, iterator end) {
+  const int erased = distance(begin, end);
+  for (int remaining = erased; remaining > 0; --remaining) {
+    begin = erase(begin);
+  }
+  return erased;
+}
+
+// Erases the element whose key is equivalent to key, if there is one.
+// Returns the number of elements removed (0 or 1).
+template <typename P>
+int btree<P>::erase_unique(const key_type &key) {
+  iterator target = internal_find_unique(key, iterator(root(), 0));
+  if (target.node) {
+    erase(target);
+    return 1;
+  }
+  // The key doesn't exist in the tree, nothing was done.
+  return 0;
+}
+
+// Erases every element in [lower_bound(key), upper_bound(key)) and returns
+// how many were removed. Returns 0 without touching the tree when the
+// lower-bound search yields no node.
+template <typename P>
+int btree<P>::erase_multi(const key_type &key) {
+  iterator first = internal_lower_bound(key, iterator(root(), 0));
+  if (!first.node) {
+    // The key doesn't exist in the tree, nothing was done.
+    return 0;
+  }
+  iterator last =
+      internal_end(internal_upper_bound(key, iterator(root(), 0)));
+  return erase(first, last);
+}
+
+// Frees every node of the tree (if there are any) and resets the root
+// pointer to NULL.
+template <typename P>
+void btree<P>::clear() {
+  if (root()) {
+    internal_clear(root());
+  }
+  *mutable_root() = NULL;
+}
+
+// Exchanges the complete state of this tree and x: first the key_compare
+// base sub-objects, then the root_ members.
+template <typename P>
+void btree<P>::swap(self_type &x) {
+  key_compare &my_compare = static_cast<key_compare&>(*this);
+  key_compare &their_compare = static_cast<key_compare&>(x);
+  std::swap(my_compare, their_compare);
+
+  std::swap(root_, x.root_);
+}
+
+// Consistency check that aborts via ceph_assert on the first violation.
+// For an empty tree the size and both end-leaf pointers must be zero/NULL.
+// Otherwise the recursive node check must account for exactly size() values
+// and the cached leftmost/rightmost nodes must be the leaves that iteration
+// from either end of the root arrives at.
+template <typename P>
+void btree<P>::verify() const {
+  if (root() == NULL) {
+    ceph_assert(size() == 0);
+    ceph_assert(leftmost() == NULL);
+    ceph_assert(rightmost() == NULL);
+    return;
+  }
+
+  ceph_assert(size() == internal_verify(root(), NULL, NULL));
+  ceph_assert(leftmost() == (++const_iterator(root(), -1)).node);
+  ceph_assert(rightmost() == (--const_iterator(root(), root()->count())).node);
+  ceph_assert(leftmost()->leaf());
+  ceph_assert(rightmost()->leaf());
+}
+
+// Makes room for one more value on the full node referenced by *iter.
+// The function first tries to shift values into a non-full left or right
+// sibling; if that is not possible the node is split (after recursively
+// making room in the parent if the parent is full too, or growing the tree
+// by one level if the node is the root). iter->node and iter->position are
+// updated in place so that on return they still denote the slot where the
+// caller's value should be inserted.
+template <typename P>
+void btree<P>::rebalance_or_split(iterator *iter) {
+  node_type *&node = iter->node;
+  int &insert_position = iter->position;
+  ceph_assert(node->count() == node->max_count());
+
+  // First try to make room on the node by rebalancing.
+  node_type *parent = node->parent();
+  if (node != root()) {
+    if (node->position() > 0) {
+      // Try rebalancing with our left sibling.
+      node_type *left = parent->child(node->position() - 1);
+      if (left->count() < left->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the end of the right node then we bias rebalancing to
+        // fill up the left node.
+        int to_move = (left->max_count() - left->count()) /
+            (1 + (insert_position < left->max_count()));
+        to_move = std::max(1, to_move);
+
+        // Only rebalance if the insert position stays on this node, or if
+        // the left sibling will still have a free slot for the new value.
+        if (((insert_position - to_move) >= 0) ||
+            ((left->count() + to_move) < left->max_count())) {
+          left->rebalance_right_to_left(node, to_move);
+
+          ceph_assert(node->max_count() - node->count() == to_move);
+          insert_position = insert_position - to_move;
+          if (insert_position < 0) {
+            // The insert position moved over to the left sibling.
+            insert_position = insert_position + left->count() + 1;
+            node = left;
+          }
+
+          ceph_assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    if (node->position() < parent->count()) {
+      // Try rebalancing with our right sibling.
+      node_type *right = parent->child(node->position() + 1);
+      if (right->count() < right->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the beginning of the left node then we bias rebalancing
+        // to fill up the right node.
+        int to_move = (right->max_count() - right->count()) /
+            (1 + (insert_position > 0));
+        to_move = std::max(1, to_move);
+
+        // Only rebalance if the insert position stays on this node, or if
+        // the right sibling will still have a free slot for the new value.
+        if ((insert_position <= (node->count() - to_move)) ||
+            ((right->count() + to_move) < right->max_count())) {
+          node->rebalance_left_to_right(right, to_move);
+
+          if (insert_position > node->count()) {
+            // The insert position moved over to the right sibling.
+            insert_position = insert_position - node->count() - 1;
+            node = right;
+          }
+
+          ceph_assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    // Rebalancing failed, make sure there is room on the parent node for a new
+    // value.
+    if (parent->count() == parent->max_count()) {
+      iterator parent_iter(node->parent(), node->position());
+      rebalance_or_split(&parent_iter);
+    }
+  } else {
+    // Rebalancing not possible because this is the root node.
+    if (root()->leaf()) {
+      // The root node is currently a leaf node: create a new root node and set
+      // the current root node as the child of the new root.
+      parent = new_internal_root_node();
+      parent->set_child(0, root());
+      *mutable_root() = parent;
+      ceph_assert(*mutable_rightmost() == parent->child(0));
+    } else {
+      // The root node is an internal node. We do not want to create a new root
+      // node because the root node is special and holds the size of the tree
+      // and a pointer to the rightmost node. So we create a new internal node
+      // and move all of the items on the current root into the new node.
+      // The new node's child 0 is pointed at itself so that, once the swap
+      // has exchanged the child arrays, the (now empty) root has the new node
+      // as its only child.
+      parent = new_internal_node(parent);
+      parent->set_child(0, parent);
+      parent->swap(root());
+      node = parent;
+    }
+  }
+
+  // Split the node.
+  node_type *split_node;
+  if (node->leaf()) {
+    split_node = new_leaf_node(parent);
+    node->split(split_node, insert_position);
+    // Keep the cached rightmost leaf up to date.
+    if (rightmost() == node) {
+      *mutable_rightmost() = split_node;
+    }
+  } else {
+    split_node = new_internal_node(parent);
+    node->split(split_node, insert_position);
+  }
+
+  // If the insert position now lies beyond the left half, re-express it
+  // relative to the new right-hand node.
+  if (insert_position > node->count()) {
+    insert_position = insert_position - node->count() - 1;
+    node = split_node;
+  }
+}
+
+// Merges right into its left sibling and frees right. When right was the
+// cached rightmost leaf, that pointer is redirected to left first.
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {
+  left->merge(right);
+
+  if (!right->leaf()) {
+    delete_internal_node(right);
+    return;
+  }
+
+  if (rightmost() == right) {
+    *mutable_rightmost() = left;
+  }
+  delete_leaf_node(right);
+}
+
+// Tries to repair the under-full node iter->node, in this order: merge with
+// the left sibling, merge with the right sibling, borrow from the right
+// sibling, borrow from the left sibling. iter is adjusted so that it keeps
+// referring to the same logical slot. Returns true only when a merge took
+// place (the parent lost a value); returns false after a rebalance or when
+// nothing could be done.
+template <typename P>
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {
+  node_type *parent = iter->node->parent();
+  if (iter->node->position() > 0) {
+    // Try merging with our left sibling.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((1 + left->count() + iter->node->count()) <= left->max_count()) {
+      // Our values end up after left's values and the delimiter.
+      iter->position += 1 + left->count();
+      merge_nodes(left, iter->node);
+      iter->node = left;
+      return true;
+    }
+  }
+  if (iter->node->position() < parent->count()) {
+    // Try merging with our right sibling.
+    node_type *right = parent->child(iter->node->position() + 1);
+    if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
+      merge_nodes(iter->node, right);
+      return true;
+    }
+    // Try rebalancing with our right sibling. We don't perform rebalancing if
+    // we deleted the first element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the front of the tree.
+    if ((right->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position > 0))) {
+      // Even out the two nodes, but always leave at least one value behind.
+      int to_move = (right->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, right->count() - 1);
+      iter->node->rebalance_right_to_left(right, to_move);
+      return false;
+    }
+  }
+  if (iter->node->position() > 0) {
+    // Try rebalancing with our left sibling. We don't perform rebalancing if
+    // we deleted the last element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the back of the tree.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((left->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position < iter->node->count()))) {
+      // Even out the two nodes, but always leave at least one value behind.
+      int to_move = (left->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, left->count() - 1);
+      left->rebalance_left_to_right(iter->node, to_move);
+      // to_move values were prepended to our node.
+      iter->position += to_move;
+      return false;
+    }
+  }
+  return false;
+}
+
+// Reduces the height of the tree by one level when the root holds no values.
+// An empty leaf root is freed and the tree becomes empty. An empty internal
+// root is replaced by its only child: a leaf child is promoted to root, while
+// the contents of an internal child are swapped into the existing root node
+// and the child is freed.
+template <typename P>
+void btree<P>::try_shrink() {
+  if (root()->count() > 0) {
+    // The root still holds values; nothing to shrink.
+    return;
+  }
+
+  // Deleted the last item on the root node, shrink the height of the tree.
+  if (root()->leaf()) {
+    ceph_assert(size() == 0);
+    delete_leaf_node(root());
+    *mutable_root() = NULL;
+    return;
+  }
+
+  node_type *only_child = root()->child(0);
+  if (!only_child->leaf()) {
+    // Keep the existing root node: pull the internal child's values and
+    // children up into it and drop the child.
+    only_child->swap(root());
+    delete_internal_node(only_child);
+    return;
+  }
+
+  // The child is a leaf, so it simply becomes the new root.
+  only_child->make_root();
+  delete_internal_root_node();
+  *mutable_root() = only_child;
+}
+
+// Normalises an iterator that may sit one past the last value of its node.
+// While that is the case the iterator is moved to the corresponding slot of
+// the parent node. If the node reached by following parent() is a leaf, the
+// iterator's node is set to NULL and the walk stops.
+// NOTE(review): a leaf "parent" presumably signals that the walk stepped
+// above the root -- confirm against btree_node::parent()/make_root().
+template <typename P> template <typename IterType>
+inline IterType btree<P>::internal_last(IterType iter) {
+  for (;;) {
+    if (!iter.node || iter.position != iter.node->count()) {
+      break;
+    }
+    iter.position = iter.node->position();
+    iter.node = iter.node->parent();
+    if (iter.node->leaf()) {
+      iter.node = NULL;
+    }
+  }
+  return iter;
+}
+
+// Inserts v immediately before the position denoted by iter and returns an
+// iterator to the inserted value. The value always ends up on a leaf; a full
+// leaf is first given room, either by growing an undersized root leaf or via
+// rebalance_or_split().
+template <typename P>
+inline typename btree<P>::iterator
+btree<P>::internal_insert(iterator iter, const value_type &v) {
+  if (!iter.node->leaf()) {
+    // We can't insert on an internal node. Instead, we'll insert after the
+    // previous value which is guaranteed to be on a leaf node.
+    --iter;
+    ++iter.position;
+  }
+  if (iter.node->count() == iter.node->max_count()) {
+    // Make room in the leaf for the new item.
+    if (iter.node->max_count() < kNodeValues) {
+      // Insertion into the root where the root is smaller than the full node
+      // size. Simply grow the size of the root node.
+      // (The capacity is doubled, capped at kNodeValues; the old root's
+      // contents are swapped into the new node before the old one is freed.)
+      ceph_assert(iter.node == root());
+      iter.node = new_leaf_root_node(
+          std::min<int>(kNodeValues, 2 * iter.node->max_count()));
+      iter.node->swap(root());
+      delete_leaf_node(root());
+      *mutable_root() = iter.node;
+    } else {
+      rebalance_or_split(&iter);
+      ++*mutable_size();
+    }
+  } else if (!root()->leaf()) {
+    // NOTE(review): the size counter is only bumped on paths where the root
+    // is (or has just become) an internal node; presumably a leaf root's size
+    // is derived from its count -- confirm against size()/mutable_size().
+    ++*mutable_size();
+  }
+  iter.node->insert_value(iter.position, v);
+  return iter;
+}
+
+// Locates key starting from iter by forwarding to
+// internal_locate_type::dispatch(). Returns the iterator reached together
+// with an int match indicator produced by the selected implementation.
+// NOTE(review): dispatch presumably selects between
+// internal_locate_plain_compare() and internal_locate_compare_to() -- confirm
+// against the definition of internal_locate_type.
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate(
+    const key_type &key, IterType iter) const {
+  return internal_locate_type::dispatch(key, *this, iter);
+}
+
+// Descends from iter.node to a leaf, taking the lower_bound() position of
+// key in every node visited. The returned match indicator is always 0, so the
+// caller has to compare keys itself to detect an exact match.
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(
+    const key_type &key, IterType iter) const {
+  iter.position = iter.node->lower_bound(key, key_comp());
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->lower_bound(key, key_comp());
+  }
+  return std::make_pair(iter, 0);
+}
+
+// Descends from iter.node towards a leaf using a lower_bound() whose result
+// carries the slot index in the kMatchMask bits and an exact-match flag in the
+// kExactMatch bit. Stops early with kExactMatch as soon as a node reports an
+// exact hit; otherwise returns the leaf position with -kExactMatch.
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(
+    const key_type &key, IterType iter) const {
+  for (;; iter.node = iter.node->child(iter.position)) {
+    const int found = iter.node->lower_bound(key, key_comp());
+    iter.position = found & kMatchMask;
+    if (found & kExactMatch) {
+      return std::make_pair(iter, static_cast<int>(kExactMatch));
+    }
+    if (iter.node->leaf()) {
+      return std::make_pair(iter, -kExactMatch);
+    }
+  }
+}
+
+// Descends from iter.node to a leaf along the lower_bound() position of key
+// (masked with kMatchMask) and normalises the result with internal_last().
+// An iterator with a NULL node is returned unchanged.
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_lower_bound(
+    const key_type &key, IterType iter) const {
+  if (!iter.node) {
+    return iter;
+  }
+
+  iter.position = iter.node->lower_bound(key, key_comp()) & kMatchMask;
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->lower_bound(key, key_comp()) & kMatchMask;
+  }
+  return internal_last(iter);
+}
+
+// Descends from iter.node to a leaf along the upper_bound() position of key
+// and normalises the result with internal_last(). An iterator with a NULL
+// node is returned unchanged.
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_upper_bound(
+    const key_type &key, IterType iter) const {
+  if (!iter.node) {
+    return iter;
+  }
+
+  iter.position = iter.node->upper_bound(key, key_comp());
+  while (!iter.node->leaf()) {
+    iter.node = iter.node->child(iter.position);
+    iter.position = iter.node->upper_bound(key, key_comp());
+  }
+  return internal_last(iter);
+}
+
+// Looks up key in a tree with unique keys, starting from iter. Returns an
+// iterator to the matching element, or IterType(NULL, 0) when the key is not
+// present or iter has no node.
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_unique(
+    const key_type &key, IterType iter) const {
+  if (!iter.node) {
+    return IterType(NULL, 0);
+  }
+
+  std::pair<IterType, int> located = internal_locate(key, iter);
+  if (located.second == kExactMatch) {
+    return located.first;
+  }
+  if (located.second == 0) {
+    // The locate step gave no verdict; check the element at the located
+    // position ourselves.
+    iter = internal_last(located.first);
+    if (iter.node && !compare_keys(key, iter.key())) {
+      return iter;
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+// Looks up key in a tree that may hold duplicates, starting from iter.
+// Returns an iterator to the lower-bound element if key does not compare less
+// than it, and IterType(NULL, 0) otherwise (or when iter has no node).
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_multi(
+    const key_type &key, IterType iter) const {
+  if (!iter.node) {
+    return IterType(NULL, 0);
+  }
+
+  iter = internal_lower_bound(key, iter);
+  if (!iter.node) {
+    return IterType(NULL, 0);
+  }
+
+  iter = internal_last(iter);
+  if (iter.node && !compare_keys(key, iter.key())) {
+    return iter;
+  }
+  return IterType(NULL, 0);
+}
+
+// Recursively frees the subtree rooted at node. Children are released before
+// their parent, and the deletion routine is picked according to the kind of
+// node (leaf, internal root, or ordinary internal node).
+template <typename P>
+void btree<P>::internal_clear(node_type *node) {
+  if (node->leaf()) {
+    delete_leaf_node(node);
+    return;
+  }
+
+  for (int i = 0; i <= node->count(); ++i) {
+    internal_clear(node->child(i));
+  }
+
+  if (node == root()) {
+    delete_internal_root_node();
+  } else {
+    delete_internal_node(node);
+  }
+}
+
+// Writes the keys of the subtree rooted at node to os in order, one per
+// line. Each key is indented by two spaces per level and followed by its
+// level in square brackets.
+template <typename P>
+void btree<P>::internal_dump(
+    std::ostream &os, const node_type *node, int level) const {
+  const bool has_children = !node->leaf();
+
+  for (int i = 0; i < node->count(); ++i) {
+    if (has_children) {
+      // Keys in child(i) sort before key(i).
+      internal_dump(os, node->child(i), level + 1);
+    }
+    int pad = 0;
+    while (pad < level) {
+      os << "  ";
+      ++pad;
+    }
+    os << node->key(i) << " [" << level << "]\n";
+  }
+
+  if (has_children) {
+    // The last child holds the keys after the final key of this node.
+    internal_dump(os, node->child(node->count()), level + 1);
+  }
+}
+
+// Recursively checks the subtree rooted at node and returns the number of
+// values it contains. Asserted per node: it is non-empty and not over-full,
+// its keys are in non-decreasing order and lie within [*lo, *hi] (a NULL
+// bound means unbounded), and every child is non-NULL and carries the correct
+// parent pointer and position.
+template <typename P>
+int btree<P>::internal_verify(
+    const node_type *node, const key_type *lo, const key_type *hi) const {
+  ceph_assert(node->count() > 0);
+  ceph_assert(node->count() <= node->max_count());
+  if (lo) {
+    ceph_assert(!compare_keys(node->key(0), *lo));
+  }
+  if (hi) {
+    ceph_assert(!compare_keys(*hi, node->key(node->count() - 1)));
+  }
+  for (int i = 1; i < node->count(); ++i) {
+    ceph_assert(!compare_keys(node->key(i), node->key(i - 1)));
+  }
+
+  int total = node->count();
+  if (node->leaf()) {
+    return total;
+  }
+
+  for (int i = 0; i <= node->count(); ++i) {
+    ceph_assert(node->child(i) != NULL);
+    ceph_assert(node->child(i)->parent() == node);
+    ceph_assert(node->child(i)->position() == i);
+    // Each child is bounded by the keys on either side of it; the outermost
+    // children inherit this node's own bounds.
+    const key_type *child_lo = (i == 0) ? lo : &node->key(i - 1);
+    const key_type *child_hi = (i == node->count()) ? hi : &node->key(i);
+    total += internal_verify(node->child(i), child_lo, child_hi);
+  }
+  return total;
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_H__
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 00000000..fb617abe
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,349 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
+#define UTIL_BTREE_BTREE_CONTAINER_H__
+
+#include <iosfwd>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree {
+
+// A common base class for btree_set, btree_map, btree_multiset and
+// btree_multimap.  Most members forward directly to the wrapped Tree (tree_).
+template <typename Tree>
+class btree_container {
+  typedef btree_container<Tree> self_type;
+
+ public:
+  typedef typename Tree::params_type params_type;
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::pointer pointer;
+  typedef typename Tree::const_pointer const_pointer;
+  typedef typename Tree::reference reference;
+  typedef typename Tree::const_reference const_reference;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::difference_type difference_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+  typedef typename Tree::reverse_iterator reverse_iterator;
+  typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ public:
+  // Constructs the wrapped tree from a comparator and an allocator (no defaults here).
+  btree_container(const key_compare &comp, const allocator_type &alloc)
+      : tree_(comp, alloc) {
+  }
+
+  // Copy constructor: copy-constructs the wrapped tree from x's tree.
+  btree_container(const self_type &x)
+      : tree_(x.tree_) {
+  }
+
+  // Iterator routines.
+  iterator begin() { return tree_.begin(); }
+  const_iterator begin() const { return tree_.begin(); }
+  iterator end() { return tree_.end(); }
+  const_iterator end() const { return tree_.end(); }
+  reverse_iterator rbegin() { return tree_.rbegin(); }
+  const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+  reverse_iterator rend() { return tree_.rend(); }
+  const_reverse_iterator rend() const { return tree_.rend(); }
+
+  // Lookup routines.
+  iterator lower_bound(const key_type &key) {
+    return tree_.lower_bound(key);
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return tree_.lower_bound(key);
+  }
+  iterator upper_bound(const key_type &key) {
+    return tree_.upper_bound(key);
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return tree_.upper_bound(key);
+  }
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return tree_.equal_range(key);
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return tree_.equal_range(key);
+  }
+
+  // Utility routines.
+  void clear() {
+    tree_.clear();
+  }
+  void swap(self_type &x) {
+    tree_.swap(x.tree_);
+  }
+  void dump(std::ostream &os) const {
+    tree_.dump(os);
+  }
+  void verify() const {
+    tree_.verify();
+  }
+
+  // Size and statistics routines.
+  size_type size() const { return tree_.size(); }
+  size_type max_size() const { return tree_.max_size(); }
+  bool empty() const { return tree_.empty(); }
+  size_type height() const { return tree_.height(); }
+  size_type internal_nodes() const { return tree_.internal_nodes(); }
+  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
+  size_type nodes() const { return tree_.nodes(); }
+  size_type bytes_used() const { return tree_.bytes_used(); }
+  static double average_bytes_per_value() {
+    return Tree::average_bytes_per_value();
+  }
+  double fullness() const { return tree_.fullness(); }
+  double overhead() const { return tree_.overhead(); }
+
+  bool operator==(const self_type& x) const {  // equal sizes and pairwise-equal values in iteration order
+    if (size() != x.size()) {
+      return false;
+    }
+    for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi) {  // sizes match here, so xi stays in range
+      if (*i != *xi) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool operator!=(const self_type& other) const {
+    return !operator==(other);
+  }
+
+
+ protected:
+  Tree tree_;  // the wrapped btree; derived containers reach it as this->tree_
+};
+
+template <typename T>  // stream insertion for any btree container
+inline std::ostream& operator<<(std::ostream &out, const btree_container<T> &container) {
+  container.dump(out);  // dump() produces the text
+  return out;
+}
+
+// A common base class for unique-key containers such as btree_set and safe_btree_set.
+template <typename Tree>
+class btree_unique_container : public btree_container<Tree> {
+  typedef btree_unique_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor.
+  btree_unique_container(const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_unique_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_unique_container(InputIterator b, InputIterator e,
+                         const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);  // the range overload below, i.e. tree_.insert_unique(b, e)
+  }
+
+  // Lookup routines (forwarded to the tree's *_unique variants).
+  iterator find(const key_type &key) {
+    return this->tree_.find_unique(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_unique(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_unique(key);
+  }
+
+  // Insertion routines.
+  std::pair<iterator,bool> insert(const value_type &x) {
+    return this->tree_.insert_unique(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_unique(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_unique(b, e);
+  }
+
+  // Deletion routines.
+  int erase(const key_type &key) {  // returns erase_unique()'s result (presumably the number erased; confirm in btree.h)
+    return this->tree_.erase_unique(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+// A common base class for btree_map and safe_btree_map.
+template <typename Tree>
+class btree_map_container : public btree_unique_container<Tree> {
+  typedef btree_map_container<Tree> self_type;
+  typedef btree_unique_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::data_type data_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::mapped_type mapped_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+
+ private:
+  // A pointer-like object which only generates its value when
+  // dereferenced. Used by operator[] to avoid constructing an empty data_type
+  // if the key already exists in the map.
+  struct generate_value {
+    generate_value(const key_type &k)
+        : key(k) {
+    }
+    value_type operator*() const {
+      return std::make_pair(key, data_type());  // pairs the key with a value-initialized data_type
+    }
+    const key_type &key;  // held by reference: this object must not outlive the key it was built from
+  };
+
+ public:
+  // Default constructor.
+  btree_map_container(const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_map_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_map_container(InputIterator b, InputIterator e,
+                      const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+
+  // Element access: goes through insert_unique() and returns the mapped value it reports.
+  data_type& operator[](const key_type &key) {
+    return this->tree_.insert_unique(key, generate_value(key)).first->second;
+  }
+};
+
+// A common base class for btree_multiset and btree_multimap (duplicate keys allowed).
+template <typename Tree>
+class btree_multi_container : public btree_container<Tree> {
+  typedef btree_multi_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor.
+  btree_multi_container(const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multi_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor.
+  template <class InputIterator>
+  btree_multi_container(InputIterator b, InputIterator e,
+                        const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);  // the range overload below, i.e. tree_.insert_multi(b, e)
+  }
+
+  // Lookup routines (forwarded to the tree's *_multi variants).
+  iterator find(const key_type &key) {
+    return this->tree_.find_multi(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_multi(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_multi(key);
+  }
+
+  // Insertion routines.
+  iterator insert(const value_type &x) {  // returns only an iterator, unlike the unique containers' pair<iterator,bool>
+    return this->tree_.insert_multi(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_multi(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_multi(b, e);
+  }
+
+  // Deletion routines.
+  int erase(const key_type &key) {  // returns erase_multi()'s result (presumably the number erased; confirm in btree.h)
+    return this->tree_.erase_multi(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_CONTAINER_H__
diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h
new file mode 100644
index 00000000..b83489f0
--- /dev/null
+++ b/src/include/cpp-btree/btree_map.h
@@ -0,0 +1,130 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_map<> implements the STL unique sorted associative container
+// interface and the pair associative container interface (a.k.a map<>) using a
+// btree. A btree_multimap<> implements the STL multiple sorted associative
+// container interface and the pair associative container interface (a.k.a
+// multimap<>) using a btree. See btree.h for details of the btree
+// implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_MAP_H__
+#define UTIL_BTREE_BTREE_MAP_H__
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree_map adds only constructors on top of btree_map_container.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>
+class btree_map : public btree_map_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  using self_type = btree_map<Key, Value, Compare, Alloc, TargetNodeSize>;
+  using params_type =
+    btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_map_container<btree_type>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+
+ public:
+  // Construct from a comparator and an allocator, both defaulted.
+  btree_map(const key_compare &cmp = key_compare(),
+            const allocator_type &a = allocator_type())
+      : super_type(cmp, a) {
+  }
+
+  // Copy another btree_map.
+  btree_map(const self_type &other)
+      : super_type(other) {
+  }
+
+  // Construct, then insert every element of [first, last).
+  template <class InputIterator>
+  btree_map(InputIterator first, InputIterator last,
+            const key_compare &cmp = key_compare(),
+            const allocator_type &a = allocator_type())
+      : super_type(first, last, cmp, a) {
+  }
+};
+
+// Non-member swap for btree_map; forwards to the member swap().
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_map<K, V, C, A, N> &lhs, btree_map<K, V, C, A, N> &rhs) {
+  lhs.swap(rhs);
+}
+
+// btree_multimap adds only constructors on top of btree_multi_container.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>
+class btree_multimap : public btree_multi_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  using self_type = btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize>;
+  using params_type =
+    btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_multi_container<btree_type>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+  using data_type = typename btree_type::data_type;
+  using mapped_type = typename btree_type::mapped_type;
+
+ public:
+  // Construct from a comparator and an allocator, both defaulted.
+  btree_multimap(const key_compare &cmp = key_compare(),
+                 const allocator_type &a = allocator_type())
+      : super_type(cmp, a) {
+  }
+
+  // Copy another btree_multimap.
+  btree_multimap(const self_type &other)
+      : super_type(other) {
+  }
+
+  // Construct, then insert every element of [first, last).
+  template <class InputIterator>
+  btree_multimap(InputIterator first, InputIterator last,
+                 const key_compare &cmp = key_compare(),
+                 const allocator_type &a = allocator_type())
+      : super_type(first, last, cmp, a) {
+  }
+};
+
+// Non-member swap for btree_multimap; forwards to the member swap().
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_multimap<K, V, C, A, N> &lhs, btree_multimap<K, V, C, A, N> &rhs) {
+  lhs.swap(rhs);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_MAP_H__
diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h
new file mode 100644
index 00000000..f9b2e75d
--- /dev/null
+++ b/src/include/cpp-btree/btree_set.h
@@ -0,0 +1,121 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_set<> implements the STL unique sorted associative container
+// interface (a.k.a set<>) using a btree. A btree_multiset<> implements the STL
+// multiple sorted associative container interface (a.k.a multiset<>) using a
+// btree. See btree.h for details of the btree implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_SET_H__
+#define UTIL_BTREE_BTREE_SET_H__
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree_set adds only constructors on top of btree_unique_container.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_set : public btree_unique_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  using self_type = btree_set<Key, Compare, Alloc, TargetNodeSize>;
+  using params_type = btree_set_params<Key, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_unique_container<btree_type>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+
+ public:
+  // Construct from a comparator and an allocator, both defaulted.
+  btree_set(const key_compare &cmp = key_compare(),
+            const allocator_type &a = allocator_type())
+      : super_type(cmp, a) {
+  }
+
+  // Copy another btree_set.
+  btree_set(const self_type &other)
+      : super_type(other) {
+  }
+
+  // Construct, then insert every element of [first, last).
+  template <class InputIterator>
+  btree_set(InputIterator first, InputIterator last,
+            const key_compare &cmp = key_compare(),
+            const allocator_type &a = allocator_type())
+      : super_type(first, last, cmp, a) {
+  }
+};
+
+template <typename K, typename C, typename A, int N>  // non-member swap for btree_set
+inline void swap(btree_set<K, C, A, N> &lhs, btree_set<K, C, A, N> &rhs) {
+  lhs.swap(rhs);  // forwards to the member swap()
+}
+
+// btree_multiset adds only constructors on top of btree_multi_container.
+template <typename Key,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<Key>,
+          int TargetNodeSize = 256>
+class btree_multiset : public btree_multi_container<
+  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
+
+  using self_type = btree_multiset<Key, Compare, Alloc, TargetNodeSize>;
+  using params_type = btree_set_params<Key, Compare, Alloc, TargetNodeSize>;
+  using btree_type = btree<params_type>;
+  using super_type = btree_multi_container<btree_type>;
+
+ public:
+  using key_compare = typename btree_type::key_compare;
+  using allocator_type = typename btree_type::allocator_type;
+
+ public:
+  // Construct from a comparator and an allocator, both defaulted.
+  btree_multiset(const key_compare &cmp = key_compare(),
+                 const allocator_type &a = allocator_type())
+      : super_type(cmp, a) {
+  }
+
+  // Copy another btree_multiset.
+  btree_multiset(const self_type &other)
+      : super_type(other) {
+  }
+
+  // Construct, then insert every element of [first, last).
+  template <class InputIterator>
+  btree_multiset(InputIterator first, InputIterator last,
+                 const key_compare &cmp = key_compare(),
+                 const allocator_type &a = allocator_type())
+      : super_type(first, last, cmp, a) {
+  }
+};
+
+// Non-member swap for btree_multiset; forwards to the member swap().
+template <typename K, typename C, typename A, int N>
+inline void swap(btree_multiset<K, C, A, N> &lhs, btree_multiset<K, C, A, N> &rhs) {
+  lhs.swap(rhs);
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_SET_H__
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
new file mode 100644
index 00000000..dd4ede66
--- /dev/null
+++ b/src/include/crc32c.h
@@ -0,0 +1,57 @@
+#ifndef CEPH_CRC32C_H
+#define CEPH_CRC32C_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
+
+/*
+ * this is a static global with the chosen crc32c implementation for
+ * the given architecture.
+ */
+extern ceph_crc32c_func_t ceph_crc32c_func;
+
+extern ceph_crc32c_func_t ceph_choose_crc32(void);
+
+/**
+ * calculate crc32c for data that is entirely 0 (ZERO)
+ *
+ * Note: works the same as ceph_crc32c_func for data == nullptr, 
+ * but faster than the optimized assembly on certain architectures.
+ * This is faster than intel optimized assembly, but not as fast as 
+ * ppc64le optimized assembly.  
+ *
+ * @param crc initial value
+ * @param length length of buffer
+ */
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
+
+/**
+ * calculate crc32c
+ *
+ * Note: if the data pointer is NULL, we calculate a crc value as if
+ * it were zero-filled.
+ *
+ * @param crc initial value
+ * @param data pointer to data buffer
+ * @param length length of buffer
+ */
+static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
+{
+#ifndef HAVE_POWER8
+  /* a NULL buffer longer than 16 bytes goes to the dedicated all-zeros routine */
+  const int use_zeros_path = !data && length > 16;
+  if (use_zeros_path) return ceph_crc32c_zeros(crc, length);
+#endif /* HAVE_POWER8 */
+  return ceph_crc32c_func(crc, data, length);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/demangle.h b/src/include/demangle.h
new file mode 100644
index 00000000..9e46d952
--- /dev/null
+++ b/src/include/demangle.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INCLUDE_DEMANGLE
+#define CEPH_INCLUDE_DEMANGLE
+
+//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+#ifdef __GNUG__
+#include <cstdlib>
+#include <memory>
+#include <cxxabi.h>
+
+static std::string ceph_demangle(const char* name)
+{
+  // Arbitrary initial value; it only silences an uninitialized-variable warning.
+  int rc = -4;
+  // The unique_ptr hands the buffer from __cxa_demangle to std::free on exit.
+  std::unique_ptr<char, void(*)(void*)> demangled(
+    abi::__cxa_demangle(name, NULL, NULL, &rc), std::free);
+  if (rc != 0) {
+    return name;  // demangling failed: return the input unchanged
+  }
+  return demangled.get();
+}
+
+#else
+
+// Fallback when not g++: no demangler is used, so the name is returned unchanged.
+static std::string ceph_demangle(const char* name)
+{
+  return name;
+}
+
+#endif
+
+
+#endif
diff --git a/src/include/denc.h b/src/include/denc.h
new file mode 100644
index 00000000..a6a0fcaa
--- /dev/null
+++ b/src/include/denc.h
@@ -0,0 +1,1724 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+// If you #include "include/encoding.h" you get the old-style *and*
+// the new-style definitions.  (The old-style needs denc_traits<> in
+// order to disable the container helpers when new-style traits are
+// present.)
+
+// You can also just #include "include/denc.h" and get only the
+// new-style helpers.  The eventual goal is to drop the legacy
+// definitions.
+
+#ifndef _ENC_DEC_H
+#define _ENC_DEC_H
+
+#include <array>
+#include <cstring>
+#include <map>
+#include <optional>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/optional.hpp>
+
+#include "include/ceph_assert.h"	// boost clobbers this
+#include "include/intarith.h"
+#include "include/int_types.h"
+
+#include "buffer.h"
+#include "byteorder.h"
+
+#include "common/convenience.h"
+
+template<typename T, typename=void>  // primary template: used when no specialization matches T
+struct denc_traits {
+  static constexpr bool supported = false;  // T has no denc encoding by default
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = true;
+};
+
+template<typename T>  // shorthand for denc_traits<T>::supported
+inline constexpr bool denc_supported = denc_traits<T>::supported;
+
+
+// hack for debug only; FIXME
+//#include <iostream>
+//using std::cout;
+
+// Define this to compile in a dump of all encoded objects to disk to
+// populate ceph-object-corpus.  Note that there is an almost
+// identical implementation in encoding.h, but you only need to define
+// ENCODE_DUMP_PATH here.
+//
+// See src/test/encoding/generate-corpus-objects.sh.
+//
+//#define ENCODE_DUMP_PATH /tmp/something
+
+#ifdef ENCODE_DUMP_PATH
+# include <cstdio>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# define ENCODE_STR(x) #x
+# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
+# define DENC_DUMP_PRE(Type)			\
+  char *__denc_dump_pre = p.get_pos();
+  // The bit counting below dumps only those calls whose running counter has
+  // at most two bits set, which spreads the samples across time in a roughly
+  // (but not exactly) exponential fashion.
+# define DENC_DUMP_POST(Type)			\
+  do {									\
+    static int i = 0;							\
+    i++;								\
+    int bits = 0;							\
+    for (unsigned t = i; t; bits++)					\
+      t &= t - 1;							\
+    if (bits > 2)							\
+      break;								\
+    char fn[PATH_MAX];							\
+    snprintf(fn, sizeof(fn),						\
+	     ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #Type,		\
+	     getpid(), i++);						\
+    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644);		\
+    if (fd >= 0) {							\
+      size_t len = p.get_pos() - __denc_dump_pre;			\
+      int r = ::write(fd, __denc_dump_pre, len);			\
+      (void)r;								\
+      ::close(fd);							\
+    }									\
+  } while (0)
+#else
+# define DENC_DUMP_PRE(Type)
+# define DENC_DUMP_POST(Type)
+#endif
+
+
+/*
+
+  top level functions look like so
+  ======================================
+
+    inline void denc(const T& o, size_t& p, uint64_t features=0);
+    inline void denc(const T& o, buffer::list::contiguous_appender& p,
+                     uint64_t features=0);
+    inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features=0);
+
+  or (for featured objects)
+
+    inline void denc(const T& o, size_t& p, uint64_t features);
+    inline void denc(const T& o, buffer::list::contiguous_appender& p,
+                     uint64_t features);
+    inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features);
+
+  - These are symmetrical, so that they can be used from the magic DENC
+  method of writing the bound_encode/encode/decode methods all in one go;
+  they differ only in the type of p.
+
+  - These are automatically fabricated via a template that calls into
+  the denc_traits<> methods (see below), provided denc_traits<T>::supported
+  is defined and true.  They never need to be written explicitly.
+
+
+  static denc_traits<> definitions look like so
+  =============================================
+
+    template<>
+    struct denc_traits<T> {
+      static constexpr bool supported = true;
+      static constexpr bool bounded = false;
+      static constexpr bool featured = false;
+      static constexpr bool need_contiguous = true;
+      static void bound_encode(const T &o, size_t& p, uint64_t f=0);
+      static void encode(const T &o, buffer::list::contiguous_appender& p,
+		         uint64_t f=0);
+      static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
+    };
+
+  or (for featured objects)
+
+    template<>
+    struct denc_traits<T> {
+      static constexpr bool supported = true;
+      static constexpr bool bounded = false;
+      static constexpr bool featured = true;
+      static constexpr bool need_contiguous = true;
+      static void bound_encode(const T &o, size_t& p, uint64_t f);
+      static void encode(const T &o, buffer::list::contiguous_appender& p,
+		         uint64_t f);
+      static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
+    };
+
+  - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro,
+  which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro.
+  There are _FEATURED and _BOUNDED variants.  The class traits simply call
+  into class methods of the same name (see below).
+
+  - denc_traits<T> can also be written explicitly for some type to indicate
+  how it should be encoded.  This is the "source of truth" for how a type
+  is encoded.
+
+  - denc_traits<T> are declared for the base integer types, string, bufferptr,
+  and bufferlist base types.
+
+  - denc_traits<std::foo<T>>-like traits are declared for standard container
+  types.
+
+
+  class methods look like so
+  ==========================
+
+    void bound_encode(size_t& p) const;
+    void encode(buffer::list::contiguous_appender& p) const;
+    void decode(buffer::ptr::const_iterator &p);
+
+  or (for featured objects)
+
+    void bound_encode(size_t& p, uint64_t f) const;
+    void encode(buffer::list::contiguous_appender& p, uint64_t f) const;
+    void decode(buffer::ptr::const_iterator &p);
+
+  - These are normally invoked by the denc_traits<> methods that are
+  declared via WRITE_CLASS_DENC, although you can also invoke them explicitly
+  in your code.
+
+  - These methods are optimised for a contiguous buffer, but denc() will try to
+    rebuild a contiguous one if the decoded bufferlist is segmented. If you are
+    concerned about the cost, you might want to define yet another method:
+
+    void decode(buffer::list::iterator &p);
+
+  - These can be defined either explicitly (as above), or can be "magically"
+  defined all in one go using the DENC macro and DENC_{START,FINISH} helpers
+  (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros):
+
+    class foo_t {
+      ...
+      DENC(foo_t, v, p) {
+        DENC_START(1, 1, p);
+        denc(v.foo, p);
+        denc(v.bar, p);
+        denc(v.baz, p);
+        DENC_FINISH(p);
+      }
+      ...
+    };
+    WRITE_CLASS_DENC(foo_t)
+
+  */
+
+// ---------------------------------------------------------------------
+// raw types
+namespace _denc {
+// is_any_of<T, Us...>: true exactly when T is the same type as one of Us.
+template<typename T, typename... Us>
+inline constexpr bool is_any_of = std::disjunction_v<std::is_same<T, Us>...>;
+
+// underlying_type<T>: T itself, or for an enum its underlying integer type.
+template<typename T, typename=void> struct underlying_type { using type = T; };
+template<typename T>
+struct underlying_type<T, std::enable_if_t<std::is_enum_v<T>>> {
+  using type = std::underlying_type_t<T>;
+};
+template<typename T>
+using underlying_type_t = typename underlying_type<T>::type;
+}
+
+// is_const_iterator<It>: true when It::pointer points at a const-qualified type.
+template<class It>
+struct is_const_iterator
+  : std::bool_constant<
+      std::is_const_v<std::remove_pointer_t<typename It::pointer>>>
+{};
+template<>  // a plain size_t (bound_encode's accumulator) is never a const iterator
+struct is_const_iterator<size_t> : std::false_type {};
+template<>
+struct is_const_iterator<buffer::list::contiguous_appender> : std::false_type {
+  // an appender exists to *modify* the buffer, so it is never const
+};
+template<class It>
+inline constexpr bool is_const_iterator_v = is_const_iterator<It>::value;
+
+// Read-only flavour: advances a const iterator by sizeof(T) bytes and returns
+// the bytes it skipped over, viewed as a const T.
+template<typename T, class It>
+std::enable_if_t<is_const_iterator_v<It>, const T&>
+get_pos_add(It& i) {
+  auto raw = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<const T*>(raw);
+}
+
+// Writable flavour: advances a non-const iterator by sizeof(T) bytes and
+// returns the bytes it skipped over, viewed as a mutable T.
+template<typename T, class It>
+std::enable_if_t<!is_const_iterator_v<It>, T&>
+get_pos_add(It& i) {
+  auto raw = i.get_pos_add(sizeof(T));
+  return *reinterpret_cast<T*>(raw);
+}
+
+// denc_traits for the "raw" types: ceph_le64/32/16, uint8_t (plus int8_t
+// unless _CHAR_IS_SIGNED is defined) and enums whose underlying type is one
+// of those.  They are encoded and decoded as a verbatim copy of sizeof(T)
+// bytes.
+template<typename T>
+struct denc_traits<
+  T,
+  std::enable_if_t<
+    _denc::is_any_of<_denc::underlying_type_t<T>,
+		     ceph_le64, ceph_le32, ceph_le16, uint8_t
+#ifndef _CHAR_IS_SIGNED
+		       , int8_t
+#endif
+		     >>> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  // The encoded size is always sizeof(T); the value itself is not inspected.
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(T);
+  }
+  // Store o into the next sizeof(T) bytes of a writable iterator.
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    get_pos_add<T>(p) = o;
+  }
+  // Load o from the next sizeof(T) bytes of a read-only iterator.
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It& p, uint64_t f=0) {
+    o = get_pos_add<T>(p);
+  }
+  // bufferlist-iterator variant: copy sizeof(T) bytes straight into o.
+  static void decode(T& o, buffer::list::const_iterator &p) {
+    p.copy(sizeof(T), reinterpret_cast<char*>(&o));
+  }
+};
+
+
+// -----------------------------------------------------------------------
+// integer types
+
+// itype == internal type
+// otype == external type, i.e., the type on the wire
+
+// NOTE: the overload resolution ensures that the legacy encode/decode methods
+// defined for int types are preferred to the ones defined using the specialized
+// template, and hence get selected. This machinery prevents these from
+// getting glued into the legacy encode/decode methods; the overhead of setting
+// up a contiguous_appender etc is likely to be slower.
+namespace _denc {
+
+// ExtType<T>::type is the on-the-wire representation of the internal type T,
+// or void when T has no such mapping.
+template<typename T, typename=void> struct ExtType {
+  using type = void;
+};
+
+// 16-bit integers
+template<> struct ExtType<int16_t> { using type = ceph_le16; };
+template<> struct ExtType<uint16_t> { using type = ceph_le16; };
+
+// 32-bit integers
+template<> struct ExtType<int32_t> { using type = ceph_le32; };
+template<> struct ExtType<uint32_t> { using type = ceph_le32; };
+
+// 64-bit integers
+template<> struct ExtType<int64_t> { using type = ceph_le64; };
+template<> struct ExtType<uint64_t> { using type = ceph_le64; };
+
+// bool travels as a single byte
+template<> struct ExtType<bool> { using type = uint8_t; };
+
+template<typename T>
+using ExtType_t = typename ExtType<T>::type;
+} // namespace _denc
+
+// denc_traits for every internal type T that _denc::ExtType maps to a
+// non-void external type (etype).  Values are converted to etype when written
+// and converted back from etype when read.
+template<typename T>
+struct denc_traits<T, std::enable_if_t<!std::is_void_v<_denc::ExtType_t<T>>>>
+{
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  using etype = _denc::ExtType_t<T>;
+  // The encoded size is that of the external type, not of T.
+  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
+    p += sizeof(etype);
+  }
+  // Write o, converted to etype, into the next sizeof(etype) bytes.
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const T &o, It& p, uint64_t f=0) {
+    get_pos_add<etype>(p) = o;
+  }
+  // Read an etype from the next sizeof(etype) bytes and convert it to T.
+  template<class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(T& o, It &p, uint64_t f=0) {
+    o = get_pos_add<etype>(p);
+  }
+  // bufferlist-iterator variant: copy into a local etype first, then convert.
+  static void decode(T& o, buffer::list::const_iterator &p) {
+    etype e;
+    p.copy(sizeof(etype), reinterpret_cast<char*>(&e));
+    o = e;
+  }
+};
+
+// varint
+//
+// high bit of each byte indicates another byte follows.
+// Upper bound on the encoded size of a varint holding a T.
+//
+// Every output byte carries 7 payload bits, so sizeof(T) * 8 value bits need
+// ceil(bits / 7) bytes: 2, 3, 5 and 10 bytes for 8-, 16-, 32- and 64-bit
+// types.  (The simpler sizeof(T) + 1 is one byte short for 64-bit values that
+// have bit 63 set.)  The value itself is not inspected.
+template<typename T>
+inline void denc_varint(T v, size_t& p) {
+  constexpr size_t payload_bits_per_byte = 7;
+  p += (sizeof(T) * 8 + payload_bits_per_byte - 1) / payload_bits_per_byte;
+}
+
+// Encode v as a varint: emit 7 bits at a time, least-significant group
+// first, setting the high bit on every byte except the last one.
+template<typename T>
+inline void denc_varint(T v, bufferlist::contiguous_appender& p) {
+  for (;;) {
+    uint8_t chunk = v & 0x7f;
+    v >>= 7;
+    if (!v) {
+      // final byte: no continuation bit
+      get_pos_add<__u8>(p) = chunk;
+      return;
+    }
+    chunk |= 0x80;
+    get_pos_add<__u8>(p) = chunk;
+  }
+}
+
+// Decode a varint from a contiguous buffer into v: accumulate 7 payload bits
+// per byte, least-significant group first, until a byte without the 0x80
+// continuation bit has been consumed.
+// NOTE(review): `shift` is not bounded by the width of T, so an input with
+// more continuation bytes than T can hold would shift past the type's width
+// -- confirm callers only feed well-formed encodings.
+template<typename T>
+inline void denc_varint(T& v, bufferptr::const_iterator& p) {
+  uint8_t byte = *(__u8*)p.get_pos_add(1);
+  v = byte & 0x7f;
+  int shift = 7;
+  while (byte & 0x80) {
+    byte = get_pos_add<__u8>(p);
+    v |= (T)(byte & 0x7f) << shift;
+    shift += 7;
+  }
+}
+
+
+// signed varint encoding
+//
+// low bit = 1 = negative, 0 = positive
+// high bit of every byte indicates whether another byte follows.
+// Bound for a signed varint: the size of an int64_t plus two bytes,
+// regardless of the value being encoded.
+inline void denc_signed_varint(int64_t v, size_t& p) {
+  p += sizeof(int64_t) + 2;
+}
+// Encode a signed value: the magnitude is shifted up by one bit and the low
+// bit carries the sign (1 = negative); the result is written as a varint.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint(int64_t v, It& p) {
+  v = (v < 0) ? ((-v << 1) | 1) : (v << 1);
+  denc_varint(v, p);
+}
+
+// Decode a signed varint: read the raw varint, take the low bit as the sign
+// and the remaining bits as the magnitude.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint(T& v, It& p)
+{
+  int64_t raw = 0;
+  denc_varint(raw, p);
+  const int64_t magnitude = raw >> 1;
+  v = (raw & 1) ? -magnitude : magnitude;
+}
+
+// varint + lowz encoding
+//
+// first(low) 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 5 bits data in first byte, 7 bits data thereafter)
+// Bound for the varint + lowz encoding: the size of a uint64_t plus two
+// bytes, regardless of the value being encoded.
+inline void denc_varint_lowz(uint64_t v, size_t& p) {
+  p += sizeof(uint64_t) + 2;
+}
+// Encode v with its trailing zero nibbles stripped: up to three low zero
+// nibbles are shifted out, their count is stored in the two lowest bits, and
+// the result is written as a varint.
+inline void denc_varint_lowz(uint64_t v, bufferlist::contiguous_appender& p) {
+  int nibbles = 0;
+  if (v) {
+    nibbles = ctz(v) / 4;
+    if (nibbles > 3)
+      nibbles = 3;
+  }
+  v = ((v >> (nibbles * 4)) << 2) | nibbles;
+  denc_varint(v, p);
+}
+
+// Decode the varint + lowz encoding: the two lowest bits of the raw varint
+// give the number of zero nibbles to restore below the remaining bits.
+template<typename T>
+inline void denc_varint_lowz(T& v, bufferptr::const_iterator& p)
+{
+  uint64_t raw = 0;
+  denc_varint(raw, p);
+  const int nibbles = (raw & 3);
+  v = (raw >> 2) << (nibbles * 4);
+}
+
+// signed varint + lowz encoding
+//
+// first low bit = 1 for negative, 0 for positive
+// next 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 4 bits data in first byte, 7 bits data thereafter)
+// Bound for the signed varint + lowz encoding: the size of an int64_t plus
+// two bytes, regardless of the value being encoded.
+inline void denc_signed_varint_lowz(int64_t v, size_t& p) {
+  p += sizeof(int64_t) + 2;
+}
+// Encode a signed value with its trailing zero nibbles stripped.  Layout of
+// the varint that is written: bit 0 = sign (1 = negative), bits 1-2 = number
+// of stripped zero nibbles (at most 3), remaining bits = shifted magnitude.
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_signed_varint_lowz(int64_t v, It& p) {
+  const bool negative = v < 0;
+  if (negative) {
+    v = -v;
+  }
+  unsigned lowznib = v ? (ctz(v) / 4) : 0u;
+  if (lowznib > 3)
+    lowznib = 3;
+  v = ((v >> (lowznib * 4)) << 3) | (lowznib << 1) | (int)negative;
+  denc_varint(v, p);
+}
+
+// Decode the signed varint + lowz encoding: bit 0 of the raw varint is the
+// sign, bits 1-2 the number of zero nibbles to restore, and the rest the
+// shifted magnitude.
+template<typename T, class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_signed_varint_lowz(T& v, It& p)
+{
+  int64_t raw = 0;
+  denc_varint(raw, p);
+  const int lowznib = (raw & 6) >> 1;
+  const bool negative = raw & 1;
+  int64_t magnitude = raw >> 3;
+  magnitude <<= lowznib * 4;
+  v = negative ? -magnitude : magnitude;
+}
+
+
+// LBA
+//
+// first 1-3 bits = how many low zero bits
+//     *0 = 12 (common 4 K alignment case)
+//    *01 = 16
+//   *011 = 20
+//   *111 = byte
+// then 28-30 bits of data
+// then last bit = another byte follows
+// high bit of each subsequent byte = another byte follows
+// Bound for the LBA encoding: the size of a uint64_t plus two bytes,
+// regardless of the value being encoded.
+inline void denc_lba(uint64_t v, size_t& p) {
+  p += sizeof(uint64_t) + 2;
+}
+
+// Encode v in the LBA format: a 32-bit word holding a 1-3 bit tag (which
+// says how many low zero bits were dropped), as many value bits as fit, and
+// a continuation flag in bit 31; any remaining value bits follow as
+// varint-style bytes (7 bits each, high bit = another byte follows).
+template<class It>
+inline std::enable_if_t<!is_const_iterator_v<It>>
+denc_lba(uint64_t v, It& p) {
+  int low_zero_nibbles = v ? (int)(ctz(v) / 4) : 0;
+  int pos;          // width of the tag == bit position where the value starts
+  uint32_t word;    // first 32-bit word; starts out holding just the tag
+  int t = low_zero_nibbles - 3;
+  if (t < 0) {
+    // fewer than 3 zero nibbles: tag 111, value stored unshifted
+    pos = 3;
+    word = 0x7;
+  } else if (t < 3) {
+    // 3, 4 or 5 zero nibbles (12/16/20 bits) dropped: the tag is t one-bits
+    // followed by a zero bit, i.e. 0, 01 or 011
+    v >>= (low_zero_nibbles * 4);
+    pos = t + 1;
+    word = (1 << t) - 1;
+  } else {
+    // 6 or more zero nibbles: only 20 bits are dropped, tag 011
+    v >>= 20;
+    pos = 3;
+    word = 0x3;
+  }
+  // value bits go above the tag, limited to bits 0..30 of the word
+  word |= (v << pos) & 0x7fffffff;
+  v >>= 31 - pos;
+  if (!v) {
+    // everything fit into the first word: no continuation bit
+    *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+    return;
+  }
+  // bit 31 flags that more bytes follow
+  word |= 0x80000000;
+  *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
+  // emit the remaining bits 7 at a time, high bit set on all but the last
+  uint8_t byte = v & 0x7f;
+  v >>= 7;
+  while (v) {
+    byte |= 0x80;
+    *(__u8*)p.get_pos_add(1) = byte;
+    byte = (v & 0x7f);
+    v >>= 7;
+  }
+  *(__u8*)p.get_pos_add(1) = byte;
+}
+
+// Decode the LBA format: read the first 32-bit word, use its low bits to
+// determine how far the stored value has to be shifted back up, then fold in
+// any continuation bytes.
+template<class It>
+inline std::enable_if_t<is_const_iterator_v<It>>
+denc_lba(uint64_t& v, It& p) {
+  uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t));
+  int shift;   // bit position in v where the first continuation byte lands
+  switch (word & 7) {
+  case 0:
+  case 2:
+  case 4:
+  case 6:
+    // low bit 0: 1-bit tag, 30 value bits in the word, 12 low zero bits
+    v = (uint64_t)(word & 0x7ffffffe) << (12 - 1);
+    shift = 12 + 30;
+    break;
+  case 1:
+  case 5:
+    // low bits 01: 2-bit tag, 29 value bits in the word, 16 low zero bits
+    v = (uint64_t)(word & 0x7ffffffc) << (16 - 2);
+    shift = 16 + 29;
+    break;
+  case 3:
+    // low bits 011: 3-bit tag, 28 value bits in the word, 20 low zero bits
+    v = (uint64_t)(word & 0x7ffffff8) << (20 - 3);
+    shift = 20 + 28;
+    break;
+  case 7:
+    // low bits 111: 3-bit tag, 28 value bits in the word, no shift
+    v = (uint64_t)(word & 0x7ffffff8) >> 3;
+    shift = 28;
+  }
+  // bit 31 of the word (the high bit of this byte) says whether more bytes
+  // follow; each following byte contributes 7 bits and its own high bit
+  // continues the chain
+  uint8_t byte = word >> 24;
+  while (byte & 0x80) {
+    byte = *(__u8*)p.get_pos_add(1);
+    v |= (uint64_t)(byte & 0x7f) << shift;
+    shift += 7;
+  }
+}
+
+
+// ---------------------------------------------------------------------
+// denc top-level methods that call into denc_traits<T> methods
+
+// Bound flavour of denc(): lets denc_traits<T> add its size bound for o to
+// p.  The feature bits are forwarded only to traits that declare themselves
+// `featured`.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported> denc(
+  const T& o,
+  size_t& p,
+  uint64_t f=0)
+{
+  if constexpr (!traits::featured) {
+    traits::bound_encode(o, p);
+  } else {
+    traits::bound_encode(o, p, f);
+  }
+}
+
+// Encode flavour of denc(): selected for writable (non-const) iterators and
+// dispatches to denc_traits<T>::encode.  The feature bits are forwarded only
+// to traits that declare themselves `featured`.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !is_const_iterator_v<It>>
+denc(const T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::encode(o, p);
+  } else {
+    traits::encode(o, p, features);
+  }
+}
+
+// Decode flavour of denc(): selected for read-only (const) iterators and
+// dispatches to denc_traits<T>::decode.  The feature bits are forwarded only
+// to traits that declare themselves `featured`.
+template<typename T, class It, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && is_const_iterator_v<It>>
+denc(T& o,
+     It& p,
+     uint64_t features=0)
+{
+  if constexpr (!traits::featured) {
+    traits::decode(o, p);
+  } else {
+    traits::decode(o, p, features);
+  }
+}
+
+namespace _denc {
+// Detects whether T can be decoded directly from a bufferlist iterator, and
+// if so provides a decode() that does it.
+template<typename T, typename = void>
+struct has_legacy_denc : std::false_type {};
+// Case 1: T has a member decode(bufferlist::const_iterator&) whose return
+// type is void (the decltype has to match the defaulted second parameter).
+template<typename T>
+struct has_legacy_denc<T, decltype(std::declval<T&>()
+				   .decode(std::declval<
+					   bufferlist::const_iterator&>()))>
+  : std::true_type {
+  static void decode(T& v, bufferlist::const_iterator& p) {
+    v.decode(p);
+  }
+};
+// Case 2: denc_traits<T> does not require a contiguous buffer; forward to
+// its bufferlist-iterator decode.
+template<typename T>
+struct has_legacy_denc<T,
+		       std::enable_if_t<
+			 !denc_traits<T>::need_contiguous>> : std::true_type {
+  static void decode(T& v, bufferlist::const_iterator& p) {
+    denc_traits<T>::decode(v, p);
+  }
+};
+}
+
+// Decode flavour of denc() for a bufferlist iterator.  Only participates in
+// overload resolution when _denc::has_legacy_denc<T> found a way to decode T
+// from such an iterator, and delegates to it.
+template<typename T,
+	 typename traits=denc_traits<T>,
+	 typename has_legacy_denc=_denc::has_legacy_denc<T>>
+inline std::enable_if_t<traits::supported &&
+			has_legacy_denc::value> denc(
+  T& o,
+  buffer::list::const_iterator& p)
+{
+  has_legacy_denc::decode(o, p);
+}
+
+// ---------------------------------------------------------------------
+// base types and containers
+
+//
+// std::string
+//
+// denc_traits for std::basic_string<char> with any allocator.  The encoding
+// is a uint32_t length followed by the raw characters; the *_nohead variants
+// read/write only the characters.
+template<typename A>
+struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> {
+private:
+  using value_type = std::basic_string<char,std::char_traits<char>,A>;
+
+public:
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+
+  // length prefix + one byte per character
+  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) {
+    p += sizeof(uint32_t) + s.size();
+  }
+  // Write the length (truncated to uint32_t) and then the characters.
+  template<class It>
+  static void encode(const value_type& s,
+		     It& p,
+                     uint64_t f=0) {
+    denc((uint32_t)s.size(), p);
+    memcpy(p.get_pos_add(s.size()), s.data(), s.size());
+  }
+  // Read the length, then the characters, from a generic iterator.
+  template<class It>
+  static void decode(value_type& s,
+		     It& p,
+		     uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    decode_nohead(len, s, p);
+  }
+  // Same, for a bufferlist iterator.
+  static void decode(value_type& s, buffer::list::const_iterator& p)
+  {
+    uint32_t len;
+    denc(len, p);
+    decode_nohead(len, s, p);
+  }
+  // Replace s with the next len bytes of the iterator.
+  template<class It>
+  static void decode_nohead(size_t len, value_type& s, It& p) {
+    s.clear();
+    if (len) {
+      s.append(p.get_pos_add(len), len);
+    }
+  }
+  // bufferlist-iterator variant.  For plain std::string the iterator's
+  // copy(len, s) is used; for any other allocator the string is resized and
+  // filled in place.
+  static void decode_nohead(size_t len, value_type& s,
+                            buffer::list::const_iterator& p) {
+    if (len) {
+      if constexpr (std::is_same_v<value_type, std::string>) {
+        s.clear();
+        p.copy(len, s);
+      } else {
+        s.resize(len);
+        p.copy(len, s.data());
+      }
+    } else {
+      s.clear();
+    }
+  }
+  // Write only the characters, without a length prefix.
+  template<class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode_nohead(const value_type& s, It& p) {
+    auto len = s.length();
+    maybe_inline_memcpy(p.get_pos_add(len), s.data(), len, 16);
+  }
+};
+
+//
+// bufferptr
+//
+// denc_traits for bufferptr.  The encoding is a uint32_t length followed by
+// the buffer contents.
+template<>
+struct denc_traits<bufferptr> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+  // length prefix + payload
+  static void bound_encode(const bufferptr& v, size_t& p, uint64_t f=0) {
+    p += sizeof(uint32_t) + v.length();
+  }
+  // Write the length, then append the buffer itself.
+  template <class It>
+  static std::enable_if_t<!is_const_iterator_v<It>>
+  encode(const bufferptr& v, It& p, uint64_t f=0) {
+    denc((uint32_t)v.length(), p);
+    p.append(v);
+  }
+  // Contiguous decode: v is always replaced by the next len bytes.
+  template <class It>
+  static std::enable_if_t<is_const_iterator_v<It>>
+  decode(bufferptr& v, It& p, uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    v = p.get_ptr(len);
+  }
+  // bufferlist-iterator decode: copy len bytes into a temporary list and
+  // take its single buffer, or a flattened copy if it spans several.
+  static void decode(bufferptr& v, buffer::list::const_iterator& p) {
+    uint32_t len;
+    denc(len, p);
+    bufferlist s;
+    p.copy(len, s);
+    if (len) {
+      if (s.get_num_buffers() == 1)
+	v = s.front();
+      else
+	v = buffer::copy(s.c_str(), s.length());
+    } else {
+      // An empty payload must still overwrite v, as the contiguous decode
+      // does; otherwise v would keep whatever it referenced before the call.
+      v = bufferptr();
+    }
+  }
+};
+
+//
+// bufferlist
+//
+// denc_traits for bufferlist.  The encoding is a uint32_t length followed by
+// the list contents; the *_nohead variants read/write only the contents.
+template<>
+struct denc_traits<bufferlist> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = false;
+  // length prefix + payload
+  static void bound_encode(const bufferlist& v, size_t& p, uint64_t f=0) {
+    p += sizeof(uint32_t) + v.length();
+  }
+  // Write the length, then append the list.
+  static void encode(const bufferlist& v, buffer::list::contiguous_appender& p,
+	      uint64_t f=0) {
+    denc((uint32_t)v.length(), p);
+    p.append(v);
+  }
+  // Contiguous decode: v becomes a list holding the next len bytes.
+  static void decode(bufferlist& v, buffer::ptr::const_iterator& p, uint64_t f=0) {
+    uint32_t len;
+    denc(len, p);
+    v.clear();
+    v.push_back(p.get_ptr(len));
+  }
+  // bufferlist-iterator decode: copy the next len bytes into v.
+  static void decode(bufferlist& v, buffer::list::const_iterator& p) {
+    uint32_t len;
+    denc(len, p);
+    v.clear();
+    p.copy(len, v);
+  }
+  // Append the list without a length prefix.
+  static void encode_nohead(const bufferlist& v,
+			    buffer::list::contiguous_appender& p) {
+    p.append(v);
+  }
+  // Contiguous decode of len bytes whose length was obtained elsewhere.
+  static void decode_nohead(size_t len, bufferlist& v,
+			    buffer::ptr::const_iterator& p) {
+    v.clear();
+    if (len) {
+      v.append(p.get_ptr(len));
+    }
+  }
+  // bufferlist-iterator decode of len bytes whose length was obtained
+  // elsewhere.
+  static void decode_nohead(size_t len, bufferlist& v,
+			    buffer::list::const_iterator& p) {
+    v.clear();
+    p.copy(len, v);
+  }
+};
+
+//
+// std::pair<A, B>
+//
+// denc_traits for std::pair<A, B>: `first` is handled before `second`, and
+// the pair's properties are derived from those of its two halves.
+template<typename A, typename B>
+struct denc_traits<
+  std::pair<A, B>,
+  std::enable_if_t<denc_supported<A> && denc_supported<B>>> {
+  using a_traits = denc_traits<A>;
+  using b_traits = denc_traits<B>;
+
+  static constexpr bool supported = true;
+  // featured / contiguous if either half is; bounded only if both are
+  static constexpr bool featured = a_traits::featured || b_traits::featured ;
+  static constexpr bool bounded = a_traits::bounded && b_traits::bounded;
+  static constexpr bool need_contiguous = (a_traits::need_contiguous ||
+					   b_traits::need_contiguous);
+
+  // Sum of the bounds of both halves.
+  static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  // Encode both halves back to back.
+  static void encode(const std::pair<A,B>& v, bufferlist::contiguous_appender& p,
+		     uint64_t f = 0) {
+    if constexpr (!featured) {
+      denc(v.first, p);
+      denc(v.second, p);
+    } else {
+      denc(v.first, p, f);
+      denc(v.second, p, f);
+    }
+  }
+
+  // Contiguous decode; the feature bits are always passed along.
+  static void decode(std::pair<A,B>& v, buffer::ptr::const_iterator& p, uint64_t f=0) {
+    denc(v.first, p, f);
+    denc(v.second, p, f);
+  }
+  // bufferlist-iterator decode; only offered when neither half needs a
+  // contiguous buffer.
+  template<typename AA=A>
+  static std::enable_if_t<!!sizeof(AA) && !need_contiguous>
+    decode(std::pair<A,B>& v, buffer::list::const_iterator& p,
+	    uint64_t f = 0) {
+    denc(v.first, p);
+    denc(v.second, p);
+  }
+};
+
+namespace _denc {
+  // Shared denc_traits implementation for containers C<Ts...>.  `Details`
+  // supplies the element type T that is actually encoded, plus reserve() and
+  // insert() helpers used while decoding.  The encoding is a uint32_t element
+  // count followed by the elements; the *_nohead variants read/write only the
+  // elements.
+  template<template<class...> class C, typename Details, typename ...Ts>
+  struct container_base {
+  private:
+    using container = C<Ts...>;
+    using T = typename Details::T;
+
+  public:
+    using traits = denc_traits<T>;
+
+    static constexpr bool supported = true;
+    static constexpr bool featured = traits::featured;
+    static constexpr bool bounded = false;
+    static constexpr bool need_contiguous = traits::need_contiguous;
+
+    // Bound = element count + per-element bounds.  When the element traits
+    // are bounded, only the first element is measured and its bound is
+    // multiplied by the element count; otherwise every element is visited.
+    // NOTE(review): on the bounded, non-empty path sizeof(uint32_t) is added
+    // a second time on top of the one added unconditionally at the top --
+    // confirm whether the extra slack is intentional.
+    template<typename U=T>
+    static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+      p += sizeof(uint32_t);
+      if constexpr (traits::bounded) {
+        if (!s.empty()) {
+          // STL containers use weird element types like std::pair<const K, V>;
+          // cast to something we have denc_traits for.
+          size_t elem_size = 0;
+          if constexpr (traits::featured) {
+            denc(static_cast<const T&>(*s.begin()), elem_size, f);
+          } else {
+            denc(static_cast<const T&>(*s.begin()), elem_size);
+          }
+          p += sizeof(uint32_t) + elem_size * s.size();
+        }
+      } else {
+        for (const T& e : s) {
+          if constexpr (traits::featured) {
+            denc(e, p, f);
+          } else {
+            denc(e, p);
+          }
+        }
+      }
+    }
+
+    // Write the element count (truncated to uint32_t), then the elements.
+    template<typename U=T>
+    static void encode(const container& s, buffer::list::contiguous_appender& p,
+	   uint64_t f = 0) {
+      denc((uint32_t)s.size(), p);
+      if constexpr (traits::featured) {
+        encode_nohead(s, p, f);
+      } else {
+        encode_nohead(s, p);
+      }
+    }
+    // Contiguous decode: read the count, then that many elements.
+    static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+      uint32_t num;
+      denc(num, p);
+      decode_nohead(num, s, p, f);
+    }
+    // bufferlist-iterator decode; only offered when the element type does not
+    // need a contiguous buffer.
+    template<typename U=T>
+    static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+    decode(container& s, buffer::list::const_iterator& p) {
+      uint32_t num;
+      denc(num, p);
+      decode_nohead(num, s, p);
+    }
+
+    // nohead
+    // Write every element in iteration order, without the count.
+    static void encode_nohead(const container& s, buffer::list::contiguous_appender& p,
+			      uint64_t f = 0) {
+      for (const T& e : s) {
+        if constexpr (traits::featured) {
+          denc(e, p, f);
+        } else {
+          denc(e, p);
+        }
+      }
+    }
+    // Replace the contents of s with num elements decoded from a contiguous
+    // buffer; each element is default-constructed, decoded and moved in.
+    static void decode_nohead(size_t num, container& s,
+			      buffer::ptr::const_iterator& p, uint64_t f=0) {
+      s.clear();
+      Details::reserve(s, num);
+      while (num--) {
+	T t;
+	denc(t, p, f);
+	Details::insert(s, std::move(t));
+      }
+    }
+    // Same, decoding from a bufferlist iterator; only offered when the
+    // element type does not need a contiguous buffer.
+    template<typename U=T>
+    static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+    decode_nohead(size_t num, container& s,
+		  buffer::list::const_iterator& p) {
+      s.clear();
+      Details::reserve(s, num);
+      while (num--) {
+	T t;
+	denc(t, p);
+	Details::insert(s, std::move(t));
+      }
+    }
+  };
+
+  // Detects whether container type T has a member reserve() callable with a
+  // T::size_type argument.
+  //
+  // The probe must look at T itself: testing denc_traits<T> for a free
+  // function named reserve never matches, which silently disables the
+  // reserve() call made while decoding.
+  template<typename T>
+  class container_has_reserve {
+    // Preferred overload: only viable when u.reserve(size_type) is a valid
+    // expression for an lvalue u of type U.
+    template<typename U>
+    static auto test(int) -> decltype(
+      (void)std::declval<U&>().reserve(
+	std::declval<typename U::size_type>()),
+      std::true_type{});
+
+    // Fallback for everything else.
+    template<typename U>
+    static std::false_type test(...);
+
+  public:
+    static constexpr bool value = decltype(test<T>(0))::value;
+  };
+  // Convenience variable template for container_has_reserve<T>::value.
+  template<typename T>
+  inline constexpr bool container_has_reserve_v =
+    container_has_reserve<T>::value;
+
+
+  // Common part of the per-container "details" helpers: names the element
+  // type and offers a reserve() that only forwards to the container when
+  // container_has_reserve_v says it has one.
+  template<typename Container>
+  struct container_details_base {
+    using T = typename Container::value_type;
+    static void reserve(Container& c, size_t s) {
+      if constexpr (container_has_reserve_v<Container>) {
+        c.reserve(s);
+      }
+    }
+  };
+
+  // Details for sequence containers: decoded elements are added at the back
+  // with emplace_back().
+  template<typename Container>
+  struct pushback_details : public container_details_base<Container> {
+    template<typename ...Args>
+    static void insert(Container& c, Args&& ...args) {
+      c.emplace_back(std::forward<Args>(args)...);
+    }
+  };
+}
+
+// std::list<T>: supported whenever T is; decoded elements are appended at the
+// back.
+template<typename T, typename ...Ts>
+struct denc_traits<
+  std::list<T, Ts...>,
+  std::enable_if_t<denc_traits<T>::supported>>
+  : public _denc::container_base<
+      std::list,
+      _denc::pushback_details<std::list<T, Ts...>>,
+      T, Ts...> {};
+
+// std::vector<T>: supported whenever T is; decoded elements are appended at
+// the back.
+template<typename T, typename ...Ts>
+struct denc_traits<
+  std::vector<T, Ts...>,
+  std::enable_if_t<denc_traits<T>::supported>>
+  : public _denc::container_base<
+      std::vector,
+      _denc::pushback_details<std::vector<T, Ts...>>,
+      T, Ts...> {};
+
+namespace _denc {
+  // Details for set-like containers: decoded elements are inserted with
+  // emplace_hint(), using the end of the container as the hint.
+  template<typename Container>
+  struct setlike_details : public container_details_base<Container> {
+    using T = typename Container::value_type;
+    template<typename ...Args>
+    static void insert(Container& c, Args&& ...args) {
+      c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+    }
+  };
+}
+
+// std::set<T>: supported whenever T is; uses the set-like insertion helper.
+template<typename T, typename ...Ts>
+struct denc_traits<
+  std::set<T, Ts...>,
+  std::enable_if_t<denc_traits<T>::supported>>
+  : public _denc::container_base<std::set,
+				 _denc::setlike_details<std::set<T, Ts...>>,
+				 T, Ts...> {};
+
+// boost::container::flat_set<T>: supported whenever T is; uses the set-like
+// insertion helper.
+template<typename T, typename ...Ts>
+struct denc_traits<
+  boost::container::flat_set<T, Ts...>,
+  std::enable_if_t<denc_traits<T>::supported>>
+  : public _denc::container_base<
+  boost::container::flat_set,
+  _denc::setlike_details<boost::container::flat_set<T, Ts...>>,
+  T, Ts...> {};
+
+namespace _denc {
+  // Details for map-like containers.  The encoded element type is
+  // std::pair<key_type, mapped_type> (non-const key) rather than the
+  // container's value_type; decoded pairs are inserted with emplace_hint(),
+  // using the end of the container as the hint.
+  template<typename Container>
+  struct maplike_details : public container_details_base<Container> {
+    using T = std::pair<typename Container::key_type,
+			typename Container::mapped_type>;
+    template<typename ...Args>
+    static void insert(Container& c, Args&& ...args) {
+      c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+    }
+  };
+}
+
+// std::map<A, B>: supported when both the key and the mapped type are; uses
+// the map-like insertion helper.
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+  std::map<A, B, Ts...>,
+  std::enable_if_t<denc_traits<A>::supported &&
+		   denc_traits<B>::supported>>
+  : public _denc::container_base<std::map,
+				 _denc::maplike_details<std::map<A, B, Ts...>>,
+				 A, B, Ts...> {};
+
+// boost::container::flat_map<A, B>: supported when both the key and the
+// mapped type are; uses the map-like insertion helper.
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+  boost::container::flat_map<A, B, Ts...>,
+  std::enable_if_t<denc_traits<A>::supported &&
+		   denc_traits<B>::supported>>
+  : public _denc::container_base<
+  boost::container::flat_map,
+  _denc::maplike_details<boost::container::flat_map<
+			   A, B, Ts...>>,
+  A, B, Ts...> {};
+
+// denc_traits for std::array<T, N>.  The element count is part of the type,
+// so the encoding is just the N elements back to back, without a count.
+template<typename T, size_t N>
+struct denc_traits<
+  std::array<T, N>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+private:
+  using container = std::array<T, N>;
+public:
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = traits::bounded;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // Bound = sum of the element bounds.  When the element traits are bounded,
+  // only the first element is measured and its bound is multiplied by N;
+  // otherwise every element is visited.
+  static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+    if constexpr (traits::bounded) {
+      // A zero-length array has no first element to measure (dereferencing
+      // its begin() is undefined) and contributes nothing to the bound.
+      if constexpr (N > 0) {
+        size_t elem_size = 0;
+        if constexpr (traits::featured) {
+          denc(*s.begin(), elem_size, f);
+        } else {
+          denc(*s.begin(), elem_size);
+        }
+        p += elem_size * N;
+      }
+    } else {
+      for (const auto& e : s) {
+        if constexpr (traits::featured) {
+          denc(e, p, f);
+        } else {
+          denc(e, p);
+        }
+      }
+    }
+  }
+
+  // Encode every element in index order.
+  static void encode(const container& s, buffer::list::contiguous_appender& p,
+	 uint64_t f = 0) {
+    for (const auto& e : s) {
+      if constexpr (traits::featured) {
+        denc(e, p, f);
+      } else {
+        denc(e, p);
+      }
+    }
+  }
+  // Contiguous decode: fill every slot in index order.
+  static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    for (auto& e : s)
+      denc(e, p, f);
+  }
+  // bufferlist-iterator decode; only offered when the element type does not
+  // need a contiguous buffer.
+  template<typename U=T>
+  static std::enable_if_t<!!sizeof(U) &&
+			  !need_contiguous>
+  decode(container& s, buffer::list::const_iterator& p) {
+    for (auto& e : s) {
+      denc(e, p);
+    }
+  }
+};
+
+// denc_traits for std::tuple<Ts...>: the members are handled in order, and
+// the tuple's properties are derived from those of its members.
+template<typename... Ts>
+struct denc_traits<
+  std::tuple<Ts...>,
+  std::enable_if_t<(denc_traits<Ts>::supported && ...)>> {
+
+private:
+  static_assert(sizeof...(Ts) > 0,
+		"Zero-length tuples are not supported.");
+  using container = std::tuple<Ts...>;
+
+public:
+
+  static constexpr bool supported = true;
+  // featured / contiguous if any member is; bounded only if all are
+  static constexpr bool featured = (denc_traits<Ts>::featured || ...);
+  static constexpr bool bounded = (denc_traits<Ts>::bounded && ...);
+  static constexpr bool need_contiguous =
+      (denc_traits<Ts>::need_contiguous || ...);
+
+  // Featured bound: the feature bits go only to members whose own traits are
+  // featured.
+  template<typename U = container>
+  static std::enable_if_t<denc_traits<U>::featured>
+  bound_encode(const container& s, size_t& p, uint64_t f) {
+    ceph::for_each(s, [&p, f] (const auto& e) {
+	if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+	  denc(e, p, f);
+	} else {
+	  denc(e, p);
+	}
+      });
+  }
+  // Unfeatured bound.
+  template<typename U = container>
+  static std::enable_if_t<!denc_traits<U>::featured>
+  bound_encode(const container& s, size_t& p) {
+    ceph::for_each(s, [&p] (const auto& e) {
+	denc(e, p);
+      });
+  }
+
+  // Featured encode: the feature bits go only to members whose own traits
+  // are featured.
+  template<typename U = container>
+  static std::enable_if_t<denc_traits<U>::featured>
+  encode(const container& s, buffer::list::contiguous_appender& p, uint64_t f) {
+    ceph::for_each(s, [&p, f] (const auto& e) {
+	if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+	  denc(e, p, f);
+	} else {
+	  denc(e, p);
+	}
+      });
+  }
+  // Unfeatured encode.
+  template<typename U = container>
+  static std::enable_if_t<!denc_traits<U>::featured>
+  encode(const container& s, buffer::list::contiguous_appender& p) {
+    ceph::for_each(s, [&p] (const auto& e) {
+	denc(e, p);
+      });
+  }
+
+  // Contiguous decode.  The feature bits are handed to denc() for every
+  // member (as the pair and array decoders do); denc() itself drops them for
+  // members whose traits are not featured.
+  static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    ceph::for_each(s, [&p, f] (auto& e) {
+	denc(e, p, f);
+      });
+  }
+
+  // bufferlist-iterator decode; only offered when no member needs a
+  // contiguous buffer.  f is accepted but not forwarded.
+  template<typename U = container>
+  static std::enable_if_t<!denc_traits<U>::need_contiguous>
+  decode(container& s, buffer::list::const_iterator& p, uint64_t f = 0) {
+    ceph::for_each(s, [&p] (auto& e) {
+	denc(e, p);
+      });
+  }
+};
+
+//
+// boost::optional<T>
+//
+// denc_traits for boost::optional<T>.  The encoding is a bool "has value"
+// flag followed by the value when the flag is set; the *_nohead variants
+// read/write only the value.
+template<typename T>
+struct denc_traits<
+  boost::optional<T>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // flag + bound of the contained value, if any
+  static void bound_encode(const boost::optional<T>& v, size_t& p,
+			   uint64_t f = 0) {
+    p += sizeof(bool);
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Write the flag, then the contained value if there is one.
+  static void encode(const boost::optional<T>& v,
+		     bufferlist::contiguous_appender& p,
+		     uint64_t f = 0) {
+    denc((bool)v, p);
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Contiguous decode: read the flag; if set, value-initialize v's content
+  // and decode into it, otherwise reset v.
+  static void decode(boost::optional<T>& v, buffer::ptr::const_iterator& p,
+		     uint64_t f = 0) {
+    bool x;
+    denc(x, p, f);
+    if (x) {
+      v = T{};
+      denc(*v, p, f);
+    } else {
+      v = boost::none;
+    }
+  }
+
+  // bufferlist-iterator decode; only offered when T does not need a
+  // contiguous buffer.
+  template<typename U = T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode(boost::optional<T>& v, buffer::list::const_iterator& p) {
+    bool x;
+    denc(x, p);
+    if (x) {
+      v = T{};
+      denc(*v, p);
+    } else {
+      v = boost::none;
+    }
+  }
+
+  // Write only the contained value (nothing at all when v is empty).
+  template<typename U = T>
+  static void encode_nohead(const boost::optional<T>& v,
+			    bufferlist::contiguous_appender& p,
+			    uint64_t f = 0) {
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Contiguous decode where the "has value" flag (num) was obtained
+  // elsewhere.
+  static void decode_nohead(bool num, boost::optional<T>& v,
+			    buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    if (num) {
+      v = T();
+      denc(*v, p, f);
+    } else {
+      v = boost::none;
+    }
+  }
+};
+
+// denc_traits for boost::none_t: encodes as a single `false` flag, i.e. the
+// same bytes the boost::optional encoder writes for an empty optional.
+// Encode-only: no decode is provided.
+template<>
+struct denc_traits<boost::none_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+
+  static void bound_encode(const boost::none_t& v, size_t& p) {
+    p += sizeof(bool);
+  }
+
+  static void encode(const boost::none_t& v,
+		     bufferlist::contiguous_appender& p) {
+    denc(false, p);
+  }
+};
+
+//
+// std::optional<T>
+//
+// denc_traits for std::optional<T>.  The encoding is a bool "has value" flag
+// followed by the value when the flag is set; the *_nohead variants
+// read/write only the value.
+template<typename T>
+struct denc_traits<
+  std::optional<T>,
+  std::enable_if_t<denc_traits<T>::supported>> {
+  using traits = denc_traits<T>;
+
+  static constexpr bool supported = true;
+  static constexpr bool featured = traits::featured;
+  static constexpr bool bounded = false;
+  static constexpr bool need_contiguous = traits::need_contiguous;
+
+  // flag + bound of the contained value, if any
+  static void bound_encode(const std::optional<T>& v, size_t& p,
+			   uint64_t f = 0) {
+    p += sizeof(bool);
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Write the flag, then the contained value if there is one.
+  static void encode(const std::optional<T>& v,
+		     bufferlist::contiguous_appender& p,
+		     uint64_t f = 0) {
+    denc((bool)v, p);
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Contiguous decode: read the flag; if set, value-initialize v's content
+  // and decode into it, otherwise reset v.
+  static void decode(std::optional<T>& v, buffer::ptr::const_iterator& p,
+		     uint64_t f = 0) {
+    bool x;
+    denc(x, p, f);
+    if (x) {
+      v = T{};
+      denc(*v, p, f);
+    } else {
+      v = std::nullopt;
+    }
+  }
+
+  // bufferlist-iterator decode; only offered when T does not need a
+  // contiguous buffer.
+  template<typename U = T>
+  static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+  decode(std::optional<T>& v, buffer::list::const_iterator& p) {
+    bool x;
+    denc(x, p);
+    if (x) {
+      v = T{};
+      denc(*v, p);
+    } else {
+      v = std::nullopt;
+    }
+  }
+
+  // Write only the contained value (nothing at all when v is empty).
+  static void encode_nohead(const std::optional<T>& v,
+			    bufferlist::contiguous_appender& p,
+			    uint64_t f = 0) {
+    if (v) {
+      if constexpr (featured) {
+        denc(*v, p, f);
+      } else {
+        denc(*v, p);
+      }
+    }
+  }
+
+  // Contiguous decode where the "has value" flag (num) was obtained
+  // elsewhere.
+  static void decode_nohead(bool num, std::optional<T>& v,
+			    buffer::ptr::const_iterator& p, uint64_t f = 0) {
+    if (num) {
+      v = T();
+      denc(*v, p, f);
+    } else {
+      v = std::nullopt;
+    }
+  }
+};
+
+// denc_traits for std::nullopt_t: encodes as a single `false` flag, i.e. the
+// same bytes the std::optional encoder writes for an empty optional.
+// Encode-only: no decode is provided.
+template<>
+struct denc_traits<std::nullopt_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+
+  static void bound_encode(const std::nullopt_t& v, size_t& p) {
+    p += sizeof(bool);
+  }
+
+  static void encode(const std::nullopt_t& v,
+		     bufferlist::contiguous_appender& p) {
+    denc(false, p);
+  }
+};
+
+// ----------------------------------------------------------------------
+// class helpers
+
+// Write denc_traits<> for a class that defines bound_encode/encode/decode
+// methods.
+
+// WRITE_CLASS_DENC(T) / WRITE_CLASS_DENC_BOUNDED(T) specialize denc_traits<T>
+// for a class without feature bits; they differ only in the value of
+// `bounded`.  The generated traits forward to T's member bound_encode(p),
+// encode(p) and decode(p); the feature argument is accepted but not passed
+// on.  need_contiguous is true unless _denc::has_legacy_denc<T> found a way
+// to decode T from a bufferlist iterator.
+#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false)
+#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true)
+#define _DECLARE_CLASS_DENC(T, b)					\
+  template<> struct denc_traits<T> {					\
+    static constexpr bool supported = true;				\
+    static constexpr bool featured = false;				\
+    static constexpr bool bounded = b;					\
+    static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+    static void bound_encode(const T& v, size_t& p, uint64_t f=0) {	\
+      v.bound_encode(p);						\
+    }									\
+    static void encode(const T& v, buffer::list::contiguous_appender& p, \
+		       uint64_t f=0) {					\
+      v.encode(p);							\
+    }									\
+    static void decode(T& v, buffer::ptr::const_iterator& p, uint64_t f=0) {	\
+      v.decode(p);							\
+    }									\
+  };
+
+// Feature-aware counterparts of WRITE_CLASS_DENC: the generated
+// denc_traits<T> has featured = true and forwards the features value `f`
+// to T::bound_encode(p, f), T::encode(p, f) and T::decode(p, f).
+// bound_encode and encode require `f` explicitly; only decode defaults it
+// to 0.
+#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false)
+#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true)
+#define _DECLARE_CLASS_DENC_FEATURED(T, b)				\
+  template<> struct denc_traits<T> {					\
+    static constexpr bool supported = true;				\
+    static constexpr bool featured = true;				\
+    static constexpr bool bounded = b;					\
+    static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+    static void bound_encode(const T& v, size_t& p, uint64_t f) {	\
+      v.bound_encode(p, f);						\
+    }									\
+    static void encode(const T& v, buffer::list::contiguous_appender& p, \
+		       uint64_t f) {					\
+      v.encode(p, f);							\
+    }									\
+    static void decode(T& v, buffer::ptr::const_iterator& p, uint64_t f=0) {	\
+      v.decode(p, f);							\
+    }									\
+  };
+
+
+// ----------------------------------------------------------------------
+// encode/decode wrappers
+
+// These glue the new-style denc world into old-style calls to encode
+// and decode by calling into denc_traits<> methods (when present).
+
+namespace ceph {
+// Old-style encode() entry point for denc-supported, non-featured types:
+// ask the traits for a size bound, obtain a contiguous appender of that
+// size from the bufferlist, and encode through it.  The features argument
+// is accepted for call-site compatibility and is not used.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> encode(
+  const T& o,
+  bufferlist& bl,
+  uint64_t features_unused=0)
+{
+  size_t bound = 0;
+  traits::bound_encode(o, bound);
+  auto appender = bl.get_contiguous_appender(bound);
+  traits::encode(o, appender);
+}
+
+// Old-style encode() entry point for denc-supported, featured types.  The
+// caller must supply `features`; it is passed to both the bound
+// computation and the encode itself.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::featured> encode(
+  const T& o, bufferlist& bl,
+  uint64_t features)
+{
+  size_t bound = 0;
+  traits::bound_encode(o, bound, features);
+  auto appender = bl.get_contiguous_appender(bound);
+  traits::encode(o, appender, features);
+}
+
+// Old-style decode() entry point for denc-supported types that do NOT
+// require a contiguous buffer.
+//
+// Throws buffer::end_of_buffer if the iterator is already at the end.
+// Otherwise one of two paths is taken:
+//  - if the iterator is not in the same raw buffer as the last segment of
+//    the bufferlist and more than CEPH_PAGE_SIZE bytes remain, decode
+//    directly from the bufferlist iterator;
+//  - otherwise take a shallow copy of the remainder into a bufferptr,
+//    decode from that, and advance `p` by the number of bytes consumed.
+template<typename T,
+	 typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode(
+  T& o,
+  bufferlist::const_iterator& p)
+{
+  if (p.end())
+    throw buffer::end_of_buffer();
+  const auto& bl = p.get_bl();
+  const auto remaining = bl.length() - p.get_off();
+  // it is expensive to rebuild a contiguous buffer and drop it, so avoid this.
+  if (!p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE) {
+    traits::decode(o, p);
+  } else {
+    // ensure we get a contiguous buffer... until the end of the
+    // bufferlist.  we don't really know how much we'll need here,
+    // unfortunately.  hopefully it is already contiguous and we're just
+    // bumping the raw ref and initializing the ptr tmp fields.
+    bufferptr tmp;
+    // work on a copy of the iterator so `p` only moves by what was decoded
+    auto t = p;
+    t.copy_shallow(remaining, tmp);
+    auto cp = std::cbegin(tmp);
+    traits::decode(o, cp);
+    p.advance(cp.get_offset());
+  }
+}
+
+// Old-style decode() entry point for denc-supported types that require a
+// contiguous buffer.  Throws buffer::end_of_buffer if the iterator is
+// already at the end.
+template<typename T,
+	 typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::need_contiguous> decode(
+  T& o,
+  bufferlist::const_iterator& p)
+{
+  if (p.end())
+    throw buffer::end_of_buffer();
+  // We cannot know in advance how many bytes the decode will consume, so
+  // take a shallow copy of everything up to the end of the bufferlist.
+  // Hopefully that range is already contiguous and this only bumps the raw
+  // ref and initializes the fields of `contig`.
+  const auto remaining = p.get_bl().length() - p.get_off();
+  bufferptr contig;
+  auto scratch = p;
+  scratch.copy_shallow(remaining, contig);
+  auto cursor = std::cbegin(contig);
+  traits::decode(o, cursor);
+  // move the caller's iterator past exactly what was decoded
+  p.advance(cursor.get_offset());
+}
+
+// nohead variants
+// Old-style encode_nohead() for denc-supported, non-featured types: same
+// bound/appender sequence as encode(), but calls traits::encode_nohead.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported &&
+			!traits::featured> encode_nohead(
+  const T& o,
+  bufferlist& bl)
+{
+  size_t bound = 0;
+  traits::bound_encode(o, bound);
+  auto appender = bl.get_contiguous_appender(bound);
+  traits::encode_nohead(o, appender);
+}
+
+// Old-style decode_nohead() for denc-supported, non-featured types; `num`
+// is the element count supplied by the caller.
+//
+// Returns immediately when num is 0; throws buffer::end_of_buffer if the
+// iterator is already at the end.  When the traits need a contiguous
+// buffer, a shallow copy is taken first: num * (bound of one
+// default-constructed element) bytes if the element type is bounded,
+// otherwise everything up to the end of the bufferlist.  `p` is then
+// advanced by the bytes actually consumed.  Otherwise the traits decode
+// directly from `p`.
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead(
+  size_t num,
+  T& o,
+  bufferlist::const_iterator& p)
+{
+  if (!num)
+    return;
+  if (p.end())
+    throw buffer::end_of_buffer();
+  if constexpr (traits::need_contiguous) {
+    bufferptr tmp;
+    // work on a copy of the iterator so `p` only moves by what was decoded
+    auto t = p;
+    if constexpr (denc_traits<typename T::value_type>::bounded) {
+      // a default-constructed element is used to obtain the per-element bound
+      size_t element_size = 0;
+      typename T::value_type v;
+      denc_traits<typename T::value_type>::bound_encode(v, element_size);
+      t.copy_shallow(num * element_size, tmp);
+    } else {
+      t.copy_shallow(p.get_bl().length() - p.get_off(), tmp);
+    }
+    auto cp = std::cbegin(tmp);
+    traits::decode_nohead(num, o, cp);
+    p.advance(cp.get_offset());
+  } else {
+    traits::decode_nohead(num, o, p);
+  }
+}
+}
+
+
+// ----------------------------------------------------------------
+// DENC
+
+// These are some class methods we need to do the version and length
+// wrappers for DENC_{START,FINISH} for inter-version
+// interoperability.
+
+// DENC_HELPERS expands to three overload pairs of _denc_start/_denc_finish,
+// selected by the type of the first argument:
+//  - size_t& (bound_encode): start adds 2 + 4 bytes to the bound (version
+//    byte, compat byte, 32-bit length); finish does nothing.
+//  - contiguous_appender& (encode): start writes struct_v and
+//    struct_compat, reserves 4 bytes for the length and records the
+//    appender's out-of-band offset; finish stores into the reserved slot
+//    the bytes written after it plus the out-of-band bytes added since
+//    start.
+//  - ptr::const_iterator& (decode): start reads struct_v, struct_compat
+//    and the length and records the position after them; finish asserts
+//    that decoding did not run past start + length and skips any bytes
+//    that were left unread.
+// The last two pointer parameters are scratch slots whose meaning depends
+// on the overload (length position / start position, out-of-band offset /
+// structure length).
+#define DENC_HELPERS							\
+  /* bound_encode */							\
+  static void _denc_start(size_t& p,					\
+			  __u8 *struct_v,				\
+			  __u8 *struct_compat,				\
+			  char **, uint32_t *) {			\
+    p += 2 + 4;								\
+  }									\
+  static void _denc_finish(size_t& p,					\
+			   __u8 *struct_v,				\
+			   __u8 *struct_compat,				\
+			   char **, uint32_t *) { }			\
+  /* encode */								\
+  static void _denc_start(bufferlist::contiguous_appender& p,		\
+			  __u8 *struct_v,				\
+			  __u8 *struct_compat,				\
+			  char **len_pos,				\
+			  uint32_t *start_oob_off) {			\
+    denc(*struct_v, p);							\
+    denc(*struct_compat, p);						\
+    *len_pos = p.get_pos_add(4);					\
+    *start_oob_off = p.get_out_of_band_offset();			\
+  }									\
+  static void _denc_finish(bufferlist::contiguous_appender& p,		\
+			   __u8 *struct_v,				\
+			   __u8 *struct_compat,				\
+			   char **len_pos,				\
+			   uint32_t *start_oob_off) {			\
+    *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) +	\
+      p.get_out_of_band_offset() - *start_oob_off;			\
+  }									\
+  /* decode */								\
+  static void _denc_start(buffer::ptr::const_iterator& p,		\
+			  __u8 *struct_v,				\
+			  __u8 *struct_compat,				\
+			  char **start_pos,				\
+			  uint32_t *struct_len) {			\
+    denc(*struct_v, p);							\
+    denc(*struct_compat, p);						\
+    denc(*struct_len, p);						\
+    *start_pos = const_cast<char*>(p.get_pos());			\
+  }									\
+  static void _denc_finish(buffer::ptr::const_iterator& p,		\
+			   __u8 *struct_v, __u8 *struct_compat,		\
+			   char **start_pos,				\
+			   uint32_t *struct_len) {			\
+    const char *pos = p.get_pos();					\
+    char *end = *start_pos + *struct_len;				\
+    ceph_assert(pos <= end);							\
+    if (pos < end) {							\
+      p.advance(end - pos);						\
+    }									\
+  }
+
+// Helpers for versioning the encoding.  These correspond to the
+// {ENCODE,DECODE}_{START,FINISH} macros.
+
+// DENC_START declares the locals struct_v and struct_compat (initialized
+// from `v` and `compat`) plus two scratch variables, calls _denc_start on
+// `p`, and opens a `do {` block.  DENC_FINISH closes that block with
+// `} while (false);` and calls _denc_finish with the same locals, so the
+// two macros must be used as a pair within the same scope.
+#define DENC_START(v, compat, p)					\
+  __u8 struct_v = v;							\
+  __u8 struct_compat = compat;						\
+  char *_denc_pchar;							\
+  uint32_t _denc_u32;							\
+  _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32);	\
+  do {
+
+#define DENC_FINISH(p)							\
+  } while (false);							\
+  _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32);
+
+
+// ----------------------------------------------------------------------
+
+// Helpers for writing a unified bound_encode/encode/decode
+// implementation that won't screw up buffer size estimations.
+
+// DENC(Type, v, p) is placed inside the definition of Type.  It pulls in
+// DENC_HELPERS, defines the member functions bound_encode(size_t&),
+// encode(contiguous_appender&) and decode(ptr::const_iterator&), each of
+// which forwards to _denc_friend(*this, p), and ends with the declarator
+// of that friend template.  The macro user supplies the function body
+// immediately after the macro invocation; inside it the object is named by
+// the macro argument `v` and the size/appender/iterator by `p`.  The
+// enable_if restricts T to Type or const Type, so one body serves
+// bound_encode, encode and decode.
+#define DENC(Type, v, p)						\
+  DENC_HELPERS								\
+  void bound_encode(size_t& p) const {					\
+    _denc_friend(*this, p);						\
+  }									\
+  void encode(bufferlist::contiguous_appender& p) const {		\
+    DENC_DUMP_PRE(Type);						\
+    _denc_friend(*this, p);						\
+    DENC_DUMP_POST(Type);						\
+  }									\
+  void decode(buffer::ptr::const_iterator& p) {				\
+    _denc_friend(*this, p);						\
+  }									\
+  template<typename T, typename P>					\
+  friend std::enable_if_t<std::is_same_v<T, Type> ||			\
+			  std::is_same_v<T, const Type>>		\
+  _denc_friend(T& v, P& p)
+
+// Feature-aware variant of DENC: the generated bound_encode/encode/decode
+// members take a features value and pass it to _denc_friend(*this, p, f).
+// Only decode gives the features argument a default (0).  As with DENC,
+// the macro ends with the declarator of the friend template and the user
+// supplies its body right after the macro invocation.
+#define DENC_FEATURED(Type, v, p, f)					\
+  DENC_HELPERS								\
+  void bound_encode(size_t& p, uint64_t f) const {			\
+    _denc_friend(*this, p, f);						\
+  }									\
+  void encode(bufferlist::contiguous_appender& p, uint64_t f) const {	\
+    DENC_DUMP_PRE(Type);						\
+    _denc_friend(*this, p, f);						\
+    DENC_DUMP_POST(Type);						\
+  }									\
+  void decode(buffer::ptr::const_iterator& p, uint64_t f=0) {		\
+    _denc_friend(*this, p, f);						\
+  }									\
+  template<typename T, typename P>					\
+  friend std::enable_if_t<std::is_same_v<T, Type> ||			\
+			  std::is_same_v<T, const Type>>			\
+  _denc_friend(T& v, P& p, uint64_t f)
+
+#endif
diff --git a/src/include/elist.h b/src/include/elist.h
new file mode 100644
index 00000000..38be35db
--- /dev/null
+++ b/src/include/elist.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_ELIST_H
+#define CEPH_ELIST_H
+
+/*
+ * elist: embedded list.
+ *
+ * requirements:
+ *   - elist<T>::item be embedded in the parent class
+ *   - items are _always_ added to the list via the same elist<T>::item at the same
+ *     fixed offset in the class.
+ *   - begin(), front(), back() methods take the member offset as an argument for traversal.
+ *
+ */
+
+// Byte offset of `member` inside `cls`.  The address of the member is taken
+// on a pretend object located at address 1, and the 1 is subtracted again.
+#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1)
+
+/*
+ * elist<T>: intrusive circular doubly-linked list.
+ *
+ * The links live in an elist<T>::item embedded in each listed object; the
+ * list itself holds only a sentinel item (_head) and the byte offset of
+ * the embedded item inside the containing object.  T is the handle type
+ * returned to callers: it is obtained by casting
+ * (address of item - offset) to T.
+ *
+ * Neither the list nor its items are copyable.
+ */
+template<typename T>
+class elist {
+public:
+  // Link node embedded in the containing object.  A node that is on no
+  // list points at itself in both directions.
+  struct item {
+    item *_prev, *_next;
+
+    // The argument is ignored; a new item always starts out self-linked.
+    item(T i=0) : _prev(this), _next(this) {}
+    // An item must have been unlinked before it is destroyed.
+    ~item() {
+      ceph_assert(!is_on_list());
+    }
+
+    item(const item& other) = delete;
+    const item& operator= (const item& right) = delete;
+
+    // True when this node is self-linked.  On the sentinel this means
+    // "the list is empty"; on an element it means "not on a list".
+    bool empty() const { return _prev == this; }
+    bool is_on_list() const { return !empty(); }
+
+    // Unlink this node from whatever list it is on.
+    // Returns false (and changes nothing) if it was not linked.
+    bool remove_myself() {
+      if (_next == this) {
+	ceph_assert(_prev == this);
+	return false;
+      }
+      _next->_prev = _prev;
+      _prev->_next = _next;
+      _prev = _next = this;
+      return true;
+    }
+
+    // Link `other`, which must currently be unlinked, directly after
+    // this node.
+    void insert_after(item *other) {
+      ceph_assert(other->empty());
+      other->_prev = this;
+      other->_next = _next;
+      _next->_prev = other;
+      _next = other;
+    }
+    // Link `other`, which must currently be unlinked, directly before
+    // this node.
+    void insert_before(item *other) {
+      ceph_assert(other->empty());
+      other->_next = this;
+      other->_prev = _prev;
+      _prev->_next = other;
+      _prev = other;
+    }
+
+    // Recover the containing-object handle from this node; `offset` is
+    // the (non-zero) byte offset of the node within the object.
+    T get_item(size_t offset) {
+      ceph_assert(offset);
+      return (T)(((char *)this) - offset);
+    }
+  };
+
+private:
+  item _head;           // sentinel; _head._next is the front, _head._prev the back
+  size_t item_offset;   // default offset used when a caller passes 0
+
+public:
+  // Not copyable.  These used to be declared without a definition, which
+  // deferred the error to link time; deleting them reports it at compile
+  // time, as elist::item already does.
+  elist(const elist& other) = delete;
+  const elist& operator=(const elist& other) = delete;
+
+  // `o` is the byte offset of the embedded item within the containing
+  // object (see member_offset).
+  elist(size_t o) : _head(NULL), item_offset(o) {}
+  // The list must be empty when it is destroyed.
+  ~elist() {
+    ceph_assert(_head.empty());
+  }
+
+  bool empty() const {
+    return _head.empty();
+  }
+
+  // Unlink every element (the elements themselves are not destroyed).
+  void clear() {
+    while (!_head.empty())
+      pop_front();
+  }
+
+  // Insert `i` at the front; if it is already linked it is unlinked first.
+  void push_front(item *i) {
+    if (!i->empty())
+      i->remove_myself();
+    _head.insert_after(i);
+  }
+  // Insert `i` at the back; if it is already linked it is unlinked first.
+  void push_back(item *i) {
+    if (!i->empty())
+      i->remove_myself();
+    _head.insert_before(i);
+  }
+
+  // First / last element.  The list must not be empty.  A non-zero `o`
+  // overrides the offset given at construction.
+  T front(size_t o=0) {
+    ceph_assert(!_head.empty());
+    return _head._next->get_item(o ? o : item_offset);
+  }
+  T back(size_t o=0) {
+    ceph_assert(!_head.empty());
+    return _head._prev->get_item(o ? o : item_offset);
+  }
+
+  // Unlink the first / last element.  The list must not be empty.
+  void pop_front() {
+    ceph_assert(!empty());
+    _head._next->remove_myself();
+  }
+  void pop_back() {
+    ceph_assert(!empty());
+    _head._prev->remove_myself();
+  }
+
+  // Same operation as clear(); kept as a separate name for existing callers.
+  void clear_list() {
+    clear();
+  }
+
+  // How iterator::operator++ picks the next node:
+  //   MAGIC      - use the cached successor if the current node has been
+  //                unlinked (is self-linked), else the current node's _next
+  //   CURRENT    - always the current node's _next
+  //   CACHE_NEXT - always the successor cached at the previous step
+  enum mode_t {
+    MAGIC, CURRENT, CACHE_NEXT
+  };
+
+  class iterator {
+  private:
+    item *head;         // the list's sentinel
+    item *cur, *next;   // current node and its successor cached at the last step
+    size_t item_offset;
+    mode_t mode;
+  public:
+    // Starts at the node after `h`; the offset `o` must be non-zero.
+    iterator(item *h, size_t o, mode_t m) :
+      head(h), cur(h->_next), next(cur->_next), item_offset(o),
+      mode(m) {
+      ceph_assert(item_offset > 0);
+    }
+    T operator*() {
+      return cur->get_item(item_offset);
+    }
+    // Must not be called once end() is true.
+    iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur != head);
+      if (mode == MAGIC) {
+	// if 'cur' appears to be valid, use that.  otherwise,
+	// use cached 'next'.
+	// this is a bit magic, and probably a bad idea... :/
+	if (cur->empty())
+	  cur = next;
+	else
+	  cur = cur->_next;
+      } else if (mode == CURRENT)
+	cur = cur->_next;
+      else if (mode == CACHE_NEXT)
+	cur = next;
+      else
+	ceph_abort();
+      next = cur->_next;
+      return *this;
+    }
+    // True once the iterator has come back around to the sentinel.
+    bool end() const {
+      return cur == head;
+    }
+  };
+
+  // Iterators starting at the front, one per traversal mode.  A non-zero
+  // `o` overrides the offset given at construction.
+  iterator begin(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, MAGIC);
+  }
+  iterator begin_use_current(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, CURRENT);
+  }
+  iterator begin_cache_next(size_t o=0) {
+    return iterator(&_head, o ? o : item_offset, CACHE_NEXT);
+  }
+};
+
+
+#endif
diff --git a/src/include/encoding.h b/src/include/encoding.h
new file mode 100644
index 00000000..61219024
--- /dev/null
+++ b/src/include/encoding.h
@@ -0,0 +1,1505 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_ENCODING_H
+#define CEPH_ENCODING_H
+
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+#include <boost/container/small_vector.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+#include "common/ceph_time.h"
+
+#include "include/int_types.h"
+
+#include "common/convenience.h"
+
+#include "byteorder.h"
+#include "buffer.h"
+
+// pull in the new-style encoding so that we get the denc_traits<> definition.
+#include "denc.h"
+
+#include "assert.h"
+
+using namespace ceph;
+
+namespace ceph {
+
+/*
+ * Notes on feature encoding:
+ *
+ * - The default encode() methods have a features argument with a default parameter
+ *   (which goes to zero).
+ * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default.
+ * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which
+ *   does not define the default.  Any caller must explicitly pass it in.
+ * - STL container macros have two encode variants: one with a features arg, and one
+ *   without.
+ *
+ * The result:
+ * - A feature encode() method will fail to compile if a value is not
+ *   passed in.
+ * - The feature variant of the STL templates will be used when the feature arg is
+ *   provided.  It will be passed through to any template arg types, but it will be
+ *   ignored when not needed.
+ */
+
+// --------------------------------------
+// base types
+
+// Append the in-memory bytes of `t` to `bl` unchanged (no byte swapping).
+template<class T>
+inline void encode_raw(const T& t, bufferlist& bl)
+{
+  const char *bytes = reinterpret_cast<const char*>(&t);
+  bl.append(bytes, sizeof(t));
+}
+// Copy sizeof(t) bytes from the iterator straight over the storage of `t`
+// (no byte swapping).
+template<class T>
+inline void decode_raw(T& t, bufferlist::const_iterator &p)
+{
+  char *bytes = reinterpret_cast<char*>(&t);
+  p.copy(sizeof(t), bytes);
+}
+
+// WRITE_RAW_ENCODER(type) defines encode()/decode() overloads for `type`
+// that copy the object's bytes as-is via encode_raw/decode_raw.  The
+// features argument of encode() is accepted and ignored.  It is
+// instantiated below for the single-byte types and for the ceph_le*
+// wrapper types.
+#define WRITE_RAW_ENCODER(type)						\
+  inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); }
+
+WRITE_RAW_ENCODER(__u8)
+// __s8 gets its own overloads only when _CHAR_IS_SIGNED is not defined
+#ifndef _CHAR_IS_SIGNED
+WRITE_RAW_ENCODER(__s8)
+#endif
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(ceph_le64)
+WRITE_RAW_ENCODER(ceph_le32)
+WRITE_RAW_ENCODER(ceph_le16)
+
+// A bool is written as one byte holding 0 or 1.
+inline void encode(const bool &v, bufferlist& bl) {
+  const __u8 byte = v ? 1 : 0;
+  encode_raw(byte, bl);
+}
+// Reads one byte; any non-zero value decodes as true.
+inline void decode(bool &v, bufferlist::const_iterator& p) {
+  __u8 byte;
+  decode_raw(byte, p);
+  v = (byte != 0);
+}
+
+
+// -----------------------------------
+// int types
+
+// WRITE_INTTYPE_ENCODER(type, etype) defines encode()/decode() overloads
+// for the integer `type`.  The value is converted to/from the matching
+// ceph_<etype> wrapper (ceph_le16/32/64) and that wrapper is written or
+// read with encode_raw/decode_raw.  The features argument of encode() is
+// accepted and ignored.
+#define WRITE_INTTYPE_ENCODER(type, etype)				\
+  inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+    ceph_##etype e;					                \
+    e = v;                                                              \
+    ::ceph::encode_raw(e, bl);						\
+  }									\
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) {	\
+    ceph_##etype e;							\
+    ::ceph::decode_raw(e, p);						\
+    v = e;								\
+  }
+
+WRITE_INTTYPE_ENCODER(uint64_t, le64)
+WRITE_INTTYPE_ENCODER(int64_t, le64)
+WRITE_INTTYPE_ENCODER(uint32_t, le32)
+WRITE_INTTYPE_ENCODER(int32_t, le32)
+WRITE_INTTYPE_ENCODER(uint16_t, le16)
+WRITE_INTTYPE_ENCODER(int16_t, le16)
+
+// -----------------------------------
+// float types
+//
+// NOTE: The following code assumes all supported platforms use IEEE binary32
+// as float and IEEE binary64 as double floating-point format.  The assumption
+// is verified by the assertions below.
+//
+// Under this assumption, we can use raw encoding of floating-point types
+// on little-endian machines, but we still need to perform a byte swap
+// on big-endian machines to ensure cross-architecture compatibility.
+// To achieve that, we reinterpret the values as integers first, which are
+// byte-swapped via the ceph_le types as above.  The extra conversions
+// are optimized away on little-endian machines by the compiler.
+// WRITE_FLTTYPE_ENCODER(type, itype, etype) defines encode()/decode()
+// overloads for the floating-point `type`.  The value's bit pattern is
+// moved into / out of the same-sized unsigned integer `itype`, which is
+// then written or read through the ceph_<etype> wrapper exactly like the
+// integer encoders.  The static_asserts check that the two types have the
+// same size and that `type` is an IEEE 754 type.
+//
+// The bit pattern is transferred with std::memcpy rather than by
+// dereferencing a reinterpret_cast'ed pointer: reading a float through a
+// uint32_t lvalue (or vice versa) violates the strict-aliasing rule and is
+// undefined behaviour.  The encoded bytes are unchanged.
+#define WRITE_FLTTYPE_ENCODER(type, itype, etype)			\
+  static_assert(sizeof(type) == sizeof(itype));				\
+  static_assert(std::numeric_limits<type>::is_iec559,			\
+	      "floating-point type not using IEEE754 format");		\
+  inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+    itype bits;								\
+    std::memcpy(&bits, &v, sizeof(bits));				\
+    ceph_##etype e;							\
+    e = bits;								\
+    ::ceph::encode_raw(e, bl);						\
+  }									\
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) {	\
+    ceph_##etype e;							\
+    ::ceph::decode_raw(e, p);						\
+    const itype bits = e;						\
+    std::memcpy(&v, &bits, sizeof(v));					\
+  }
+
+WRITE_FLTTYPE_ENCODER(float, uint32_t, le32)
+WRITE_FLTTYPE_ENCODER(double, uint64_t, le64)
+
+// see denc.h for ENCODE_DUMP_PATH discussion and definition.
+// When ENCODE_DUMP_PATH is defined:
+//  - ENCODE_DUMP_PRE() records the current length of `bl` in `pre_off`.
+//  - ENCODE_DUMP_POST(cl) keeps a per-expansion static counter `i`,
+//    increments it, and counts the set bits of its value; if more than two
+//    bits are set nothing is dumped.  Otherwise it builds the file name
+//    "<ENCODE_DUMP_PATH>/<cl>__<pid>.<i in hex>" (incrementing `i` once
+//    more), creates/truncates that file, and writes the bytes appended to
+//    `bl` since ENCODE_DUMP_PRE().  A failed open is silently skipped.
+// Both macros expect a bufferlist named `bl` in the enclosing scope.
+// When ENCODE_DUMP_PATH is not defined both macros expand to nothing.
+#ifdef ENCODE_DUMP_PATH
+# define ENCODE_DUMP_PRE()			\
+  unsigned pre_off = bl.length()
+# define ENCODE_DUMP_POST(cl)						\
+  do {									\
+    static int i = 0;							\
+    i++;								\
+    int bits = 0;							\
+    for (unsigned t = i; t; bits++)					\
+      t &= t - 1;							\
+    if (bits > 2)							\
+      break;								\
+    char fn[PATH_MAX];							\
+    snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \
+    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644);		\
+    if (fd >= 0) {							\
+      ::ceph::bufferlist sub;						\
+      sub.substr_of(bl, pre_off, bl.length() - pre_off);		\
+      sub.write_fd(fd);							\
+      ::close(fd);							\
+    }									\
+  } while (0)
+#else
+# define ENCODE_DUMP_PRE()
+# define ENCODE_DUMP_POST(cl)
+#endif
+
+
+// Defines free encode()/decode() overloads for class `cl` that forward to
+// cl::encode(bl) and cl::decode(p).  encode() accepts a features argument
+// (default 0) but does not pass it on.
+#define WRITE_CLASS_ENCODER(cl)						\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features=0) { \
+    ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); }		\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+// Like WRITE_CLASS_ENCODER, but encode() is const-qualified and has no
+// features parameter.  The const qualifier means the expansion is only
+// valid inside a class definition (as member functions).
+#define WRITE_CLASS_MEMBER_ENCODER(cl)					\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl) const {	\
+    ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); }		\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+// Defines free encode()/decode() overloads for a class whose encode()
+// requires features: the features argument has no default and is passed
+// to cl::encode(bl, features).  decode() forwards to cl::decode(p).
+#define WRITE_CLASS_ENCODER_FEATURES(cl)				\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \
+    ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); }	\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+// Same as WRITE_CLASS_ENCODER_FEATURES, except that the features argument
+// defaults to 0 so callers may omit it.
+#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl)				\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \
+    ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); }	\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+
+// string
+// Strings are written as a 32-bit length followed by the raw bytes; an
+// empty string is just the length.  `features` is unused.
+inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0)
+{
+  const __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  if (nbytes != 0)
+    bl.append(s.data(), nbytes);
+}
+// std::string overload: delegates to the string_view encoder.
+inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+{
+  const std::string_view view(s);
+  encode(view, bl, features);
+}
+// Reads a 32-bit length, then replaces the contents of `s` with that many
+// bytes taken from the iterator.
+inline void decode(std::string& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Writes only the string's bytes, without a length prefix.
+inline void encode_nohead(std::string_view s, bufferlist& bl)
+{
+  const auto nbytes = s.length();
+  bl.append(s.data(), nbytes);
+}
+// std::string overload: delegates to the string_view encode_nohead.
+inline void encode_nohead(const std::string& s, bufferlist& bl)
+{
+  const std::string_view view(s);
+  encode_nohead(view, bl);
+}
+// Replaces the contents of `s` with `len` bytes taken from the iterator;
+// the length comes from the caller rather than from the stream.
+inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// const char* (encode only, string compatible)
+// Encodes a NUL-terminated C string in the same format as a std::string
+// (the terminator is not written).
+inline void encode(const char *s, bufferlist& bl)
+{
+  const std::string_view view(s, strlen(s));
+  encode(view, bl);
+}
+
+
+// -----------------------------
+// buffers
+
+// bufferptr (encapsulated)
+// A bufferptr is written as a 32-bit length followed by its contents;
+// nothing is appended after the length when it is empty.
+inline void encode(const buffer::ptr& bp, bufferlist& bl)
+{
+  const __u32 nbytes = bp.length();
+  encode(nbytes, bl);
+  if (nbytes != 0)
+    bl.append(bp);
+}
+// Reads a 32-bit length and that many bytes.  If the bytes come back as a
+// single buffer, `bp` is assigned that buffer; otherwise they are copied
+// into a new one.  A zero length leaves `bp` as it was.
+inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+
+  bufferlist payload;
+  p.copy(nbytes, payload);
+
+  if (nbytes == 0)
+    return;
+
+  if (payload.get_num_buffers() == 1) {
+    bp = payload.front();
+  } else {
+    bp = buffer::copy(payload.c_str(), payload.length());
+  }
+}
+
+// bufferlist (encapsulated)
+// A nested bufferlist is written as a 32-bit length followed by its
+// contents.
+inline void encode(const bufferlist& s, bufferlist& bl)
+{
+  const __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  bl.append(s);
+}
+// Same wire format as encode(const bufferlist&), but the contents of `s`
+// are handed over to `bl` with claim_append() instead of append().
+inline void encode_destructively(bufferlist& s, bufferlist& bl)
+{
+  const __u32 nbytes = s.length();
+  encode(nbytes, bl);
+  bl.claim_append(s);
+}
+// Reads a 32-bit length, then replaces the contents of `s` with that many
+// bytes taken from the iterator.
+inline void decode(bufferlist& s, bufferlist::const_iterator& p)
+{
+  __u32 nbytes;
+  decode(nbytes, p);
+  s.clear();
+  p.copy(nbytes, s);
+}
+
+// Appends the contents of `s` to `bl` with no length prefix.
+inline void encode_nohead(const bufferlist& s, bufferlist& bl)
+{
+  const bufferlist& payload = s;
+  bl.append(payload);
+}
+// Replaces the contents of `s` with `len` bytes taken from the iterator;
+// the length comes from the caller rather than from the stream.
+inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// Time, since the templates are defined in std::chrono
+
+// Encodes a time_point of any clock that converts to timespec as two
+// 32-bit fields: seconds, then nanoseconds.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void encode(const std::chrono::time_point<Clock, Duration>& t,
+	    ceph::bufferlist &bl) {
+  const auto spec = Clock::to_timespec(t);
+  // Note: tv_sec is narrowed to 32 bits here.
+  uint32_t secs = spec.tv_sec;
+  uint32_t nsecs = spec.tv_nsec;
+  encode(secs, bl);
+  encode(nsecs, bl);
+}
+
+// Decodes the two 32-bit fields written by the time_point encoder
+// (seconds, nanoseconds) and rebuilds the time_point via
+// Clock::from_timespec.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void decode(std::chrono::time_point<Clock, Duration>& t,
+	    bufferlist::const_iterator& p) {
+  uint32_t secs;
+  decode(secs, p);
+  uint32_t nsecs;
+  decode(nsecs, p);
+
+  struct timespec spec = {
+    static_cast<time_t>(secs),
+    static_cast<long int>(nsecs)};
+  t = Clock::from_timespec(spec);
+}
+
+// Encodes an integral-rep duration as two 32-bit fields: whole seconds,
+// then the sub-second remainder in nanoseconds.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void encode(const std::chrono::duration<Rep, Period>& d,
+	    ceph::bufferlist &bl) {
+  namespace chr = std::chrono;
+  uint32_t secs = chr::duration_cast<chr::seconds>(d).count();
+  const auto subsecond = chr::duration_cast<chr::nanoseconds>(d) % chr::seconds(1);
+  uint32_t nsecs = subsecond.count();
+  encode(secs, bl);
+  encode(nsecs, bl);
+}
+
+// Decodes the two 32-bit fields written by the duration encoder (whole
+// seconds, nanosecond remainder) into `d`.
+//
+// The sum is converted with duration_cast so that this compiles for every
+// integral-rep duration the encoder accepts, including ones coarser than
+// nanoseconds (e.g. std::chrono::seconds or milliseconds); a plain
+// assignment only compiles when nanoseconds converts implicitly to the
+// target type.  For those target types the result is unchanged.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void decode(std::chrono::duration<Rep, Period>& d,
+	    bufferlist::const_iterator& p) {
+  uint32_t s;
+  uint32_t ns;
+  decode(s, p);
+  decode(ns, p);
+  d = std::chrono::duration_cast<std::chrono::duration<Rep, Period>>(
+    std::chrono::seconds(s) + std::chrono::nanoseconds(ns));
+}
+
+// -----------------------------
+// STL container types
+
+// Forward declarations of the container encoders/decoders, so that they
+// can refer to one another regardless of definition order.  Overloads
+// constrained with enable_if_t<!...::supported> apply only when the
+// element type(s) are NOT handled by denc_traits.
+
+// boost::optional, boost::tuple, std::pair
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp);
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl);
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp);
+template<class A, class B,
+	 typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features);
+template<class A, class B,
+	 typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl);
+template<class A, class B,
+	 typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+decode(std::pair<A,B> &pa, bufferlist::const_iterator &p);
+// std::list (element types not supported by denc_traits), followed by the
+// overloads for lists of std::shared_ptr<T>
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T, Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist::const_iterator& p);
+// std::set
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+// NOTE(review): this declaration takes bufferlist::iterator&, whereas every
+// other decode/decode_nohead declared here takes bufferlist::const_iterator&.
+// Confirm it matches the definition; a mismatch would declare a separate
+// overload that is never defined.
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::iterator& p);
+// boost::container::flat_set
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist& bl);
+// NOTE(review): as with std::set above, this one takes bufferlist::iterator&
+// rather than const_iterator& -- confirm against the definition.
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist::iterator& p);
+// std::multiset (declared without a denc_traits constraint)
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+// std::vector (element types not supported by denc_traits), followed by
+// the overloads for vectors of std::shared_ptr<T>
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist& bl,
+		   uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist::const_iterator& p);
+// small_vector
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+// std::map
+// Forward declarations; the bodies appear further down in this header.
+// The enable_if'd overloads take part only when the key type or the mapped
+// type lacks denc support.  decode_noclear is declared unconditionally.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported ||
+			!u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+// boost::container::flat_map
+// Forward declarations mirroring the std::map set above; the bodies appear
+// further down in this header.  The enable_if'd overloads take part only
+// when the key type or the mapped type lacks denc support.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+       uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+			   bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+	      bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+	      bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+	      bufferlist::const_iterator& p);
+// Forward declarations for the remaining containers; the bodies appear
+// further down in this header.
+// std::multimap
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+// unordered_map
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+		   uint64_t features);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+// ceph::unordered_set
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+// std::deque
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p);
+// std::array<T, N> -- enabled only when denc_traits<T>::supported is false
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p);
+
+// Whole-bufferlist decoder.
+// Decodes `o` starting at the beginning of `bl` and asserts that the
+// decode consumed the bufferlist exactly to its end.
+template<class T>
+inline void decode(T &o, const bufferlist& bl)
+{
+  auto cursor = bl.begin();
+  decode(o, cursor);
+  ceph_assert(cursor.end());
+}
+
+// boost::optional<T>
+// Wire format: one presence byte (0 or 1), followed by the encoded value
+// only when the optional is engaged.
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl)
+{
+  const bool engaged = static_cast<bool>(p);
+  __u8 present = engaged;
+  encode(present, bl);
+  if (engaged) {
+    encode(p.get(), bl);
+  }
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+// Decode a boost::optional<T>: read the presence byte, then either reset the
+// optional to boost::none or value-initialise it and decode into it in place.
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 present;
+  decode(present, bp);
+  if (!present) {
+    p = boost::none;
+    return;
+  }
+  p = T{};
+  decode(p.get(), bp);
+}
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std::tuple
+// Encodes every element of the tuple, in order, with no header.
+template<typename... Ts>
+inline void encode(const std::tuple<Ts...> &t, bufferlist& bl)
+{
+  auto encode_element = [&bl](const auto& element) {
+    encode(element, bl);
+  };
+  ceph::for_each(t, encode_element);
+}
+// Decodes every element of the tuple in place, in order.
+template<typename... Ts>
+inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp)
+{
+  auto decode_element = [&bp](auto& element) {
+    decode(element, bp);
+  };
+  ceph::for_each(t, decode_element);
+}
+
+// three-element boost::tuple
+// Encodes the three members in index order, with no header.
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl)
+{
+  const auto& e0 = boost::get<0>(t);
+  const auto& e1 = boost::get<1>(t);
+  const auto& e2 = boost::get<2>(t);
+  encode(e0, bl);
+  encode(e1, bl);
+  encode(e2, bl);
+}
+// Decodes the three members of a boost::tuple in index order.
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp)
+{
+  auto& e0 = boost::get<0>(t);
+  auto& e1 = boost::get<1>(t);
+  auto& e2 = boost::get<2>(t);
+  decode(e0, bp);
+  decode(e1, bp);
+  decode(e2, bp);
+}
+
+// std::pair<A,B>
+// Feature-aware encoder: writes `first` then `second`, handing `features`
+// to both element encoders.  Enabled only when at least one of the two
+// element types lacks denc support.
+template<class A, class B,
+	 typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features)
+{
+  const auto& head = p.first;
+  const auto& tail = p.second;
+  encode(head, bl, features);
+  encode(tail, bl, features);
+}
+// Encoder for std::pair without feature bits: `first` followed by `second`.
+template<class A, class B,
+	 typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl)
+{
+  const auto& head = p.first;
+  const auto& tail = p.second;
+  encode(head, bl);
+  encode(tail, bl);
+}
+// Decoder for std::pair: fills `first` and then `second` in place.
+template<class A, class B, typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+  decode(std::pair<A,B> &pa, bufferlist::const_iterator &p)
+{
+  auto& head = pa.first;
+  auto& tail = pa.second;
+  decode(head, p);
+  decode(tail, p);
+}
+
+// std::list<T>
+// Wire format: 32-bit element count, then each element in list order.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T, Alloc>& ls, bufferlist& bl)
+{
+  // std::list::size() is constant time since C++11
+  const __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl);
+  }
+}
+// Feature-aware std::list<T> encoder.
+// Wire format: 32-bit element count, then each element in list order, with
+// `features` forwarded to every element encoder.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  // std::list::size() is O(1) since C++11, so the count is written up
+  // front.  The previous code emitted a placeholder for non-empty lists and
+  // patched it with copy_in() after counting by hand; the bytes produced
+  // are the same.
+  __u32 n = (__u32)(ls.size());
+  encode(n, bl);
+  for (const auto& e : ls)
+    encode(e, bl, features);
+}
+// std::list<T> decoder: reads the 32-bit count, empties the list, then
+// appends a default-constructed element and decodes into it, count times.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    ls.emplace_back();
+    decode(ls.back(), p);
+  }
+}
+
+// std::list<std::shared_ptr<T>>
+// Wire format: 32-bit element count, then the pointee of each element.
+// Every element must be non-null: unlike the vector<shared_ptr<T>> encoder,
+// no substitute value is written for an empty pointer.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl)
+{
+  __u32 n = (__u32)(ls.size());  // c++11 std::list::size() is O(1)
+  encode(n, bl);
+  for (const auto& ref : ls) {
+    // dereferencing an empty shared_ptr is undefined behaviour; fail loudly
+    ceph_assert(ref);
+    encode(*ref, bl);
+  }
+}
+// Feature-aware encoder for std::list<std::shared_ptr<T>>.
+// Wire format: 32-bit element count, then the pointee of each element with
+// `features` forwarded.  Every element must be non-null: unlike the
+// vector<shared_ptr<T>> encoder, no substitute value is written for an
+// empty pointer.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl, uint64_t features)
+{
+  __u32 n = (__u32)(ls.size());  // c++11 std::list::size() is O(1)
+  encode(n, bl);
+  for (const auto& ref : ls) {
+    // dereferencing an empty shared_ptr is undefined behaviour; fail loudly
+    ceph_assert(ref);
+    encode(*ref, bl, features);
+  }
+}
+// Decoder for std::list<std::shared_ptr<T>>: reads the 32-bit count, empties
+// the list, then allocates a fresh T per entry, decodes into it and appends.
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    ls.emplace_back(std::move(item));
+  }
+}
+
+// std::set<T>
+// Wire format: 32-bit element count, then each element in iteration order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& member : s) {
+    encode(member, bl);
+  }
+}
+// std::set<T> decoder: reads the 32-bit count, clears the set, then decodes
+// and inserts that many values.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again, so hand it over instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// Headerless std::set<T> encoder: writes the elements only, no count.
+template<class T, class Comp, class Alloc, typename traits>
+inline typename std::enable_if<!traits::supported>::type
+  encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  for (const auto& member : s) {
+    encode(member, bl);
+  }
+}
+// Headerless std::set<T> decoder: decodes `len` values and inserts them.
+// The set is not cleared first, so existing members are kept.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    // v is not used again, so hand it over instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// boost::container::flat_set<T>
+// Wire format: 32-bit element count, then each element in iteration order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// flat_set<T> decoder: reads the 32-bit count, clears the set, reserves
+// room for the announced number of elements, then decodes and inserts them.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  s.reserve(n);
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again, so hand it over instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// Headerless flat_set<T> encoder: writes the elements only, no count.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist& bl)
+{
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Headerless flat_set<T> decoder: reserves `len` slots, then decodes `len`
+// values and inserts them.  The set is not cleared first.
+// NOTE(review): this overload takes bufferlist::iterator& while the sibling
+// decoders take bufferlist::const_iterator& -- confirm whether that is
+// intentional; the signature has to stay in step with its forward
+// declaration.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist::iterator& p)
+{
+  s.reserve(len);
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    // v is not used again, so hand it over instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// std::multiset<T>
+// Wire format: 32-bit element count, then each element in iteration order.
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(s.size());
+  encode(count, bl);
+  for (const auto& member : s) {
+    encode(member, bl);
+  }
+}
+// std::multiset<T> decoder: reads the 32-bit count, clears the container,
+// then decodes and inserts that many values.
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  while (n--) {
+    T v;
+    decode(v, p);
+    // v is not used again, so hand it over instead of copying it
+    s.insert(std::move(v));
+  }
+}
+
+// std::vector<T>, feature-aware encoder.
+// Wire format: 32-bit element count, then each element with `features`
+// forwarded to the element encoder.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v) {
+    encode(item, bl, features);
+  }
+}
+// std::vector<T> encoder: 32-bit element count, then each element.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& item : v) {
+    encode(item, bl);
+  }
+}
+// std::vector<T> decoder: reads the 32-bit count, resizes the vector to
+// that length, then decodes into every slot in order.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto& slot : v) {
+    decode(slot, p);
+  }
+}
+
+// Headerless std::vector<T> encoder: writes the elements only, no count.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  for (const auto& item : v) {
+    encode(item, bl);
+  }
+}
+// Headerless std::vector<T> decoder: resizes the vector to `len` and then
+// decodes into every slot in order.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto& slot : v) {
+    decode(slot, p);
+  }
+}
+
+// boost::container::small_vector<T,N>, feature-aware encoder.
+// Wire format: 32-bit element count, then each element with `features`
+// forwarded to the element encoder.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl, features);
+  }
+}
+// small_vector<T,N> encoder: 32-bit element count, then each element.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// small_vector<T,N> decoder: reads the 32-bit count, resizes the vector to
+// that length, then decodes into every slot in order.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.resize(count);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+// Headerless small_vector<T,N> encoder: writes the elements only, no count.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// Headerless small_vector<T,N> decoder: resizes the vector to `len` and
+// then decodes into every slot in order.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+
+
+// std::vector<std::shared_ptr<T>>, feature-aware encoder.
+// Wire format: 32-bit element count, then one encoded T per slot.  An empty
+// pointer is written as a default-constructed T.
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist& bl,
+		   uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& slot : v) {
+    if (!slot) {
+      encode(T(), bl, features);
+      continue;
+    }
+    encode(*slot, bl, features);
+  }
+}
+// std::vector<std::shared_ptr<T>> encoder: 32-bit element count, then one
+// encoded T per slot.  An empty pointer is written as a default-constructed T.
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(v.size());
+  encode(count, bl);
+  for (const auto& slot : v) {
+    if (!slot) {
+      encode(T(), bl);
+      continue;
+    }
+    encode(*slot, bl);
+  }
+}
+// std::vector<std::shared_ptr<T>> decoder: reads the 32-bit count, clears
+// the vector and reserves room, then allocates a fresh T per entry, decodes
+// into it and appends the pointer.
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+		   bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  v.clear();
+  v.reserve(count);
+  for (__u32 i = 0; i < count; ++i) {
+    auto item = std::make_shared<T>();
+    decode(*item, p);
+    v.emplace_back(std::move(item));
+  }
+}
+
+// std::map<T,U>
+// Wire format: 32-bit entry count, then key and value of every entry in
+// iteration order.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported ||
+			!u_traits::supported>
+  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware std::map<T,U> encoder: 32-bit entry count, then key and
+// value of every entry, with `features` forwarded to both encoders.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// std::map<T,U> decoder: reads the 32-bit entry count, clears the map, then
+// for each entry decodes the key and decodes the value directly into the
+// slot operator[] yields for that key.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new node does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+// Like decode() for std::map, but the map is not cleared first: decoded
+// entries are merged in, and a value decoded for an existing key is decoded
+// into the existing mapped object.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new node does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+// Headerless std::map<T,U> encoder: key and value of every entry, no count.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Headerless, feature-aware std::map<T,U> encoder: key and value of every
+// entry with `features` forwarded, no count.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Headerless std::map<T,U> decoder: clears the map, then decodes `n`
+// key/value entries supplied by the caller rather than read from the stream.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new node does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+
+// boost::container::flat_map<T,U>
+// Wire format: 32-bit entry count, then key and value of every entry in
+// iteration order.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  // Let the compiler deduce the iterator type.  Spelling it out as
+  // flat_map<T,U,Comp>::const_iterator dropped Alloc and therefore named the
+  // iterator of a different container type for non-default allocators.
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Feature-aware flat_map<T,U> encoder: 32-bit entry count, then key and
+// value of every entry, with `features` forwarded to both encoders.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+	 uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// flat_map<T,U> decoder: reads the 32-bit entry count, clears the map and
+// reserves room, then for each entry decodes the key and decodes the value
+// directly into the slot operator[] yields for that key.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  m.reserve(n);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new entry does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+// Like decode() for flat_map, but the map is not cleared first: capacity is
+// grown by the announced entry count and decoded entries are merged in.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+			   bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.reserve(m.size() + n);
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new entry does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+// Headerless flat_map<T,U> encoder: key and value of every entry, no count.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+		bufferlist& bl)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// Headerless, feature-aware flat_map<T,U> encoder: key and value of every
+// entry with `features` forwarded, no count.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+  inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+		bufferlist& bl, uint64_t features)
+{
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// Headerless flat_map<T,U> decoder: clears the map, then decodes `n`
+// key/value entries supplied by the caller rather than read from the stream.
+template<class T, class U, class Comp, class Alloc,
+	 typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+  decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+		bufferlist::const_iterator& p)
+{
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new entry does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+
+// std::multimap<T,U>
+// Wire format: 32-bit entry count, then key and value of every entry in
+// iteration order.
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// std::multimap<T,U> decoder: reads the 32-bit entry count, clears the
+// container, then for each entry decodes the key, inserts a node with a
+// value-initialised mapped value and decodes the value into that node.
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    std::pair<T,U> tu{};
+    decode(tu.first, p);
+    // tu is not used again; move it into the node instead of copying it
+    auto it = m.insert(std::move(tu));
+    decode(it->second, p);
+  }
+}
+
+// ceph::unordered_map<T,U>, feature-aware encoder.
+// Wire format: 32-bit entry count, then key and value of every entry in
+// the container's iteration order, with `features` forwarded.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+		   uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl, features);
+    encode(entry.second, bl, features);
+  }
+}
+// unordered_map<T,U> encoder: 32-bit entry count, then key and value of
+// every entry in the container's iteration order.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& entry : m) {
+    encode(entry.first, bl);
+    encode(entry.second, bl);
+  }
+}
+// unordered_map<T,U> decoder: reads the 32-bit entry count, clears the map,
+// then for each entry decodes the key and decodes the value directly into
+// the slot operator[] yields for that key.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again; move it so a new node does not copy the key
+    decode(m[std::move(k)], p);
+  }
+}
+
+// ceph::unordered_set<T>
+// Wire format: 32-bit element count, then each element in the container's
+// iteration order.
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(m.size());
+  encode(count, bl);
+  for (const auto& member : m) {
+    encode(member, bl);
+  }
+}
+// ceph::unordered_set<T> decoder: reads the 32-bit count, clears the set,
+// then decodes and inserts that many values.
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    // k is not used again, so hand it over instead of copying it
+    m.insert(std::move(k));
+  }
+}
+
+// std::deque<T>, feature-aware encoder.
+// Wire format: 32-bit element count, then each element with `features`
+// forwarded to the element encoder.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  const __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl, features);
+  }
+}
+// std::deque<T> encoder: 32-bit element count, then each element.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl)
+{
+  const __u32 count = static_cast<__u32>(ls.size());
+  encode(count, bl);
+  for (const auto& item : ls) {
+    encode(item, bl);
+  }
+}
+// std::deque<T> decoder: reads the 32-bit count, empties the deque, then
+// appends a default-constructed element and decodes into it, count times.
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  ls.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    ls.emplace_back();
+    decode(ls.back(), p);
+  }
+}
+
+// std::array<T, N>, feature-aware encoder.
+// The length is part of the type, so no count is written: just the N
+// elements in index order, with `features` forwarded.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features)
+{
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl, features);
+  }
+}
+// std::array<T, N> encoder: the N elements in index order, no count.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl)
+{
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    encode(*it, bl);
+  }
+}
+// std::array<T, N> decoder: decodes into each of the N slots in index order.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p)
+{
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    decode(*it, p);
+  }
+}
+}
+
+/*
+ * guards
+ */
+
+/**
+ * start encoding block
+ *
+ * Declares the locals struct_v, struct_compat, struct_len, filler and
+ * starting_bl_len in the calling scope, reserves a hole in bl large enough
+ * for the three header fields, and opens a `do {` block that
+ * ENCODE_FINISH / ENCODE_FINISH_NEW_COMPAT closes.
+ *
+ * @param v current (code) version of the encoding
+ * @param compat oldest code version that can decode it
+ * @param bl bufferlist to encode to
+ *
+ */
+#define ENCODE_START(v, compat, bl)			     \
+  __u8 struct_v = v;                                         \
+  __u8 struct_compat = compat;		                     \
+  ceph_le32 struct_len;				             \
+  auto filler = (bl).append_hole(sizeof(struct_v) + 	     \
+    sizeof(struct_compat) + sizeof(struct_len));	     \
+  const auto starting_bl_len = (bl).length();		     \
+  using ::ceph::encode;					     \
+  do {
+
+/**
+ * finish encoding block
+ *
+ * Closes the `do {` opened by ENCODE_START, computes struct_len as the
+ * number of bytes appended to bl since ENCODE_START, and copies struct_v,
+ * struct_compat and struct_len, in that order, into the reserved hole.
+ *
+ * @param bl bufferlist we were encoding to
+ * @param new_struct_compat struct-compat value to use; a zero value leaves
+ *        the compat value given to ENCODE_START unchanged
+ */
+#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat)      \
+  } while (false);                                           \
+  if (new_struct_compat) {                                   \
+    struct_compat = new_struct_compat;                       \
+  }                                                          \
+  struct_len = (bl).length() - starting_bl_len;              \
+  filler.copy_in(sizeof(struct_v), (char *)&struct_v);       \
+  filler.copy_in(sizeof(struct_compat),			     \
+    (char *)&struct_compat);				     \
+  filler.copy_in(sizeof(struct_len), (char *)&struct_len);
+
+/* finish encoding block, keeping the compat version given to ENCODE_START */
+#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
+
+/* Build the std::string message used when an encoding is too old to decode.
+ * `v` is stringified by the preprocessor (#v), so the text shows the token
+ * that was passed, not a runtime value; `compatv` is formatted at runtime
+ * with std::to_string. */
+#define DECODE_ERR_OLDVERSION(func, v, compatv)					\
+  (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv))
+
+/* Build the std::string message used when decoding ran past the end of the
+ * struct's encoded length. */
+#define DECODE_ERR_PAST(func) \
+  (std::string(func) + " decode past end of struct encoding")
+
+/**
+ * check for very old encoding
+ *
+ * If the encoded data is older than oldestv, raise an exception.
+ * Reads struct_v, so it must be used after one of the DECODE_START macros
+ * has declared it.
+ *
+ * The message is built here rather than through DECODE_ERR_OLDVERSION:
+ * that macro stringifies its version argument, and this macro has no `v`
+ * parameter, so the text used to contain the literal letter "v" instead of
+ * the version that was actually decoded.
+ *
+ * @param oldestv oldest version of the code we can successfully decode.
+ */
+#define DECODE_OLDEST(oldestv)						\
+  if (struct_v < oldestv)						\
+    throw ::ceph::buffer::malformed_input(				\
+      std::string(__PRETTY_FUNCTION__) +				\
+      " no longer understand old encoding version " +			\
+      std::to_string(struct_v) + " < " + std::to_string(oldestv));
+
+/**
+ * start a decoding block
+ *
+ * Declares struct_v, struct_compat, struct_len and struct_end in the
+ * calling scope, decodes the three header fields, and opens a `do {` block
+ * that DECODE_FINISH closes.  Throws malformed_input when the code version
+ * v is older than the encoded struct_compat, or when the encoded length
+ * exceeds what remains in the iterator.
+ *
+ * Both throws name the exception as ::ceph::buffer::malformed_input, so the
+ * macro resolves it the same way at both sites regardless of the
+ * namespace it is expanded in.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param bl bufferlist::iterator for the encoded data
+ */
+#define DECODE_START(v, bl)						\
+  __u8 struct_v, struct_compat;						\
+  using ::ceph::decode;							\
+  decode(struct_v, bl);							\
+  decode(struct_compat, bl);						\
+  if (v < struct_compat)						\
+    throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+  __u32 struct_len;							\
+  decode(struct_len, bl);						\
+  if (struct_len > bl.get_remaining())					\
+    throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+  unsigned struct_end = bl.get_off() + struct_len;			\
+  do {
+
+/* BEWARE: any change to this macro MUST also be reflected in the duplicate
+ * DECODE_START_LEGACY_COMPAT_LEN!
+ *
+ * Shared body of the _16 and _32 legacy decode-start macros.  Decodes a
+ * one-byte struct_v.  When struct_v >= compatv a struct_compat byte follows
+ * and is checked against v; otherwise, if skip_v is non-zero, skip_v bytes
+ * are skipped.  When struct_v >= lenv a 32-bit length follows and
+ * struct_end is set from it; otherwise struct_end stays 0.  Opens a `do {`
+ * block that DECODE_FINISH closes. */
+#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl)	\
+  using ::ceph::decode;							\
+  __u8 struct_v;							\
+  decode(struct_v, bl);						\
+  if (struct_v >= compatv) {						\
+    __u8 struct_compat;							\
+    decode(struct_compat, bl);					\
+    if (v < struct_compat)						\
+      throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+  } else if (skip_v) {							\
+    if (bl.get_remaining() < skip_v)					\
+      throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    bl.advance(skip_v);							\
+  }									\
+  unsigned struct_end = 0;						\
+  if (struct_v >= lenv) {						\
+    __u32 struct_len;							\
+    decode(struct_len, bl);						\
+    if (struct_len > bl.get_remaining())				\
+      throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    struct_end = bl.get_off() + struct_len;				\
+  }									\
+  do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * The old encoding schemes have a __u8 struct_v only, or lack either
+ * the compat version or the length.  Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+
+/* BEWARE: this duplicates __DECODE_START_LEGACY_COMPAT_LEN (minus its
+ * skip_v branch); the two MUST be changed together. For the rationale
+ * behind the code duplication, please `git blame` and refer to the commit
+ * message. */
+#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl)		\
+  using ::ceph::decode;							\
+  __u8 struct_v;							\
+  decode(struct_v, bl);							\
+  if (struct_v >= compatv) {						\
+    __u8 struct_compat;							\
+    decode(struct_compat, bl);						\
+    if (v < struct_compat)						\
+      throw buffer::malformed_input(DECODE_ERR_OLDVERSION(		\
+	__PRETTY_FUNCTION__, v, struct_compat));			\
+  }									\
+  unsigned struct_end = 0;						\
+  if (struct_v >= lenv) {						\
+    __u32 struct_len;							\
+    decode(struct_len, bl);						\
+    if (struct_len > bl.get_remaining())				\
+      throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    struct_end = bl.get_off() + struct_len;				\
+  }									\
+  do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * This version of the macro assumes the legacy encoding had a 32 bit
+ * version: after the first version byte has been decoded, the remaining
+ * 3 bytes are skipped when struct_v is older than compatv.
+ *
+ * The old encoding schemes have a __u8 struct_v only, or lack either
+ * the compat version or the length.  Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl)		\
+  __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl)
+
+/* Same as DECODE_START_LEGACY_COMPAT_LEN_32, but skips 1 byte after the
+ * first version byte instead of 3 (presumably for a legacy 16 bit version
+ * field -- confirm against the structures that use it). */
+#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl)		\
+  __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl)
+
+/**
+ * finish decode block
+ *
+ * Closes the `do {` opened by one of the DECODE_START macros.  When
+ * struct_end is non-zero (a length field was decoded), throws
+ * malformed_input if the iterator has moved past struct_end, and otherwise
+ * advances the iterator to struct_end so that trailing fields this code
+ * did not decode are skipped.
+ *
+ * @param bl bufferlist::iterator we were decoding from
+ */
+#define DECODE_FINISH(bl)						\
+  } while (false);							\
+  if (struct_end) {							\
+    if (bl.get_off() > struct_end)					\
+      throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+    if (bl.get_off() < struct_end)					\
+      bl.advance(struct_end - bl.get_off());				\
+  }
+
+namespace ceph {
+
+/*
+ * Encoders/decoders to read from current offset in a file handle and
+ * encode/decode the data according to argument types.
+ */
+// Reads a 32-bit length from fd, then that many further bytes, and decodes
+// the accumulated buffer (length prefix included) as a std::string.
+// Returns the total number of bytes held in the buffer.
+// NOTE(review): the results of read_fd() are not checked here; presumably a
+// failed or short read surfaces through decode() -- confirm.
+inline ssize_t decode_file(int fd, std::string &str)
+{
+  bufferlist buf;
+  __u32 payload_len = 0;
+  buf.read_fd(fd, sizeof(payload_len));
+  // peek at the length; the prefix stays in buf for the string decode below
+  decode(payload_len, buf);
+  buf.read_fd(fd, payload_len);
+  decode(str, buf);
+  return buf.length();
+}
+
+// Reads a 32-bit length from fd, then that many further bytes, and decodes
+// the accumulated buffer (length prefix included) into a bufferptr.
+// Returns the total number of bytes held in the buffer.
+// NOTE(review): the results of read_fd() are not checked here; presumably a
+// failed or short read surfaces through decode() -- confirm.
+inline ssize_t decode_file(int fd, bufferptr &bp)
+{
+  bufferlist buf;
+  __u32 payload_len = 0;
+  buf.read_fd(fd, sizeof(payload_len));
+  // peek at the length; the prefix stays in buf for the decode below
+  decode(payload_len, buf);
+  buf.read_fd(fd, payload_len);
+
+  auto cursor = std::cbegin(buf);
+  decode(bp, cursor);
+  return buf.length();
+}
+}
+
+#endif
diff --git a/src/include/err.h b/src/include/err.h
new file mode 100644
index 00000000..ba4b32ae
--- /dev/null
+++ b/src/include/err.h
@@ -0,0 +1,29 @@
+#ifndef CEPH_ERR_H
+#define CEPH_ERR_H
+
+/*
+ * adapted from linux 2.6.24 include/linux/err.h
+ */
+#define MAX_ERRNO 4095
+#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)
+
+#include <errno.h>
+
+/* this generates a warning in c++; caller can do the cast manually
+static inline void *ERR_PTR(long error)
+{
+  return (void *) error;
+}
+*/
+
+static inline long PTR_ERR(const void *ptr)
+{
+  return (long) ptr;
+}
+
+static inline long IS_ERR(const void *ptr)
+{
+  return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+#endif
diff --git a/src/include/error.h b/src/include/error.h
new file mode 100644
index 00000000..a548d975
--- /dev/null
+++ b/src/include/error.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#include <stdarg.h>
+
+#ifdef    __cplusplus
+extern "C" {
+#endif
+
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__)
+
+#define ASSERT(c) \
+  ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1))
+
+/* print usage error message and exit */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message */
+extern void error(const char *fmt, ...);
+
+#ifdef    __cplusplus
+} // extern "C"
+#endif
diff --git a/src/include/event_type.h b/src/include/event_type.h
new file mode 100644
index 00000000..aa6ddedb
--- /dev/null
+++ b/src/include/event_type.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_TYPE_H
+#define CEPH_COMMON_EVENT_TYPE_H
+
+#define EVENT_SOCKET_TYPE_NONE 0
+#define EVENT_SOCKET_TYPE_PIPE 1
+#define EVENT_SOCKET_TYPE_EVENTFD 2
+
+#endif
diff --git a/src/include/filepath.h b/src/include/filepath.h
new file mode 100644
index 00000000..832016ac
--- /dev/null
+++ b/src/include/filepath.h
@@ -0,0 +1,247 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_FILEPATH_H
+#define CEPH_FILEPATH_H
+
+/*
+ * BUG:  /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string.
+ *   -> should it be different?  how?  should this[0] be "", with depth 4?
+ *
+ */
+
+
+#include <iosfwd>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "buffer.h"
+#include "encoding.h"
+#include "include/types.h"
+#include "include/fs_types.h"
+
+#include "common/Formatter.h"
+
+
+class filepath {
+  inodeno_t ino;   // base inode.  ino=0 implies pure relative path.
+  string path;     // relative path.
+
+  /** bits - path segments
+   * this is ['a', 'b', 'c'] for both the absolute and relative case.
+   *
+   * NOTE: this value is LAZILY maintained... i.e. it's a cache
+   */
+  mutable vector<string> bits;
+  bool encoded;
+
+  void rebuild_path() {
+    path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+  void parse_bits() const {
+    bits.clear();
+    int off = 0;
+    while (off < (int)path.length()) {
+      int nextslash = path.find('/', off);
+      if (nextslash < 0) 
+        nextslash = path.length();  // no more slashes
+      if (((nextslash - off) > 0) || encoded) {
+        // skip empty components unless they were introduced deliberately
+        // see commit message for more detail
+        bits.push_back( path.substr(off,nextslash-off) );
+      }
+      off = nextslash+1;
+    }
+  }
+
+ public:
+  filepath() : ino(0), encoded(false) { }
+  filepath(std::string_view s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const string& s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const char* s, inodeno_t i) : ino(i), path(s), encoded(false) { }
+  filepath(const filepath& o) {
+    ino = o.ino;
+    path = o.path;
+    bits = o.bits;
+    encoded = o.encoded;
+  }
+  filepath(inodeno_t i) : ino(i), encoded(false) { }
+  
+  /*
+   * if we are fed a relative path as a string, either set ino=0 (strictly
+   * relative) or 1 (absolute).  throw out any leading '/'.
+   */
+  filepath(std::string_view s) : encoded(false) {
+    set_path(s);
+  }
+  filepath(const char *s) : encoded(false) {
+    set_path(std::string_view(s));
+  }
+
+  // replace the path with @a s, relative to base inode @a b
+  void set_path(std::string_view s, inodeno_t b) {
+    path = s;
+    ino = b;
+    bits.clear();  // cached segments belong to the old path
+  }
+  // a leading '/' makes the path absolute (ino=1); anything else, including
+  // an empty string, is pure relative (ino=0)
+  void set_path(std::string_view s) {
+    const bool is_abs = !s.empty() && s[0] == '/';  // s[0] on an empty view is UB
+    path = is_abs ? s.substr(1) : s;
+    ino = is_abs ? 1 : 0;
+    bits.clear();
+  }
+
+
+  // accessors
+  inodeno_t get_ino() const { return ino; }
+  const string& get_path() const { return path; }
+  const char *c_str() const { return path.c_str(); }
+
+  int length() const { return path.length(); }
+  unsigned depth() const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    return bits.size();
+  }
+  bool empty() const { return path.length() == 0 && ino == 0; }
+
+  bool absolute() const { return ino == 1; }
+  bool pure_relative() const { return ino == 0; }
+  bool ino_relative() const { return ino > 0; }
+  
+  const string& operator[](int i) const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    return bits[i];
+  }
+
+  const string& last_dentry() const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    ceph_assert(!bits.empty());
+    return bits[ bits.size()-1 ];
+  }
+
+  // path made of the first @a s dentries, keeping the base ino
+  filepath prefixpath(int s) const {
+    filepath t(ino);
+    for (int i=0; i<s; i++) t.push_dentry((*this)[i]);  // operator[] parses bits lazily
+    return t;
+  }
+  // pure relative path made of the dentries from index @a s onward
+  filepath postfixpath(int s) const {
+    filepath t;
+    for (unsigned i=s; i<depth(); i++) t.push_dentry(bits[i]);  // depth() parses bits lazily
+    return t;
+  }
+
+
+  // modifiers
+  //  string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1)
+  void _set_ino(inodeno_t i) { ino = i; }
+  void clear() {
+    ino = 0;
+    path = "";
+    bits.clear();
+  }
+
+  void pop_dentry() {
+    if (bits.empty() && path.length() > 0) 
+      parse_bits();
+    bits.pop_back();
+    rebuild_path();
+  }    
+  void push_dentry(std::string_view s) {
+    if (bits.empty() && path.length() > 0) 
+      parse_bits();
+    if (!bits.empty())
+      path += "/";
+    path += s;
+    bits.emplace_back(s);
+  }
+  void push_dentry(const string& s) {
+    push_dentry(std::string_view(s));
+  }
+  void push_dentry(const char *cs) {
+    push_dentry(std::string_view(cs, strlen(cs)));
+  }
+  void push_front_dentry(const string& s) {
+    bits.insert(bits.begin(), s);
+    rebuild_path();
+  }
+  void append(const filepath& a) {
+    ceph_assert(a.pure_relative());
+    for (unsigned i=0; i<a.depth(); i++) 
+      push_dentry(a[i]);
+  }
+
+  // encoding
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    __u8 struct_v = 1;
+    encode(struct_v, bl);
+    encode(ino, bl);
+    encode(path, bl);
+  }
+  void decode(bufferlist::const_iterator& blp) {
+    using ceph::decode;
+    bits.clear();
+    __u8 struct_v;
+    decode(struct_v, blp);
+    decode(ino, blp);
+    decode(path, blp);
+    encoded = true;
+  }
+  void dump(Formatter *f) const {
+    f->dump_unsigned("base_ino", ino);
+    f->dump_string("relative_path", path);
+  }
+  static void generate_test_instances(list<filepath*>& o) {
+    o.push_back(new filepath);
+    o.push_back(new filepath("/usr/bin", 0));
+    o.push_back(new filepath("/usr/sbin", 1));
+    o.push_back(new filepath("var/log", 1));
+    o.push_back(new filepath("foo/bar", 101));
+  }
+
+  bool is_last_dot_or_dotdot() const {
+    if (depth() > 0) {
+      std::string dname = last_dentry();
+      if (dname == "." || dname == "..") {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+WRITE_CLASS_ENCODER(filepath)
+
+inline ostream& operator<<(ostream& out, const filepath& path)
+{
+  if (path.get_ino()) {
+    out << '#' << path.get_ino();
+    if (path.length())
+      out << '/';
+  }
+  return out << path.get_path();
+}
+
+#endif
diff --git a/src/include/frag.h b/src/include/frag.h
new file mode 100644
index 00000000..5e8b154f
--- /dev/null
+++ b/src/include/frag.h
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_FRAG_H
+#define CEPH_FRAG_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "buffer.h"
+#include "compact_map.h"
+
+#include "ceph_frag.h"
+#include "include/encoding.h"
+#include "include/ceph_assert.h"
+
+#include "common/dout.h"
+
+/*
+ * 
+ * the goal here is to use a binary split strategy to partition a namespace.  
+ * frag_t represents a particular fragment.  bits() tells you the size of the
+ * fragment, and value() its name.  this is roughly analogous to an ip address
+ * and netmask.
+ * 
+ * fragtree_t represents an entire namespace and its partition.  it essentially
+ * tells you where fragments are split into other fragments, and by how much 
+ * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
+ * 
+ * this vaguely resembles a btree, in that when a fragment becomes large or small
+ * we can split or merge, except that there is no guarantee of being balanced.
+ *
+ * presumably we are partitioning the output of a (perhaps specialized) hash 
+ * function.
+ */
+
+/**
+ * frag_t
+ *
+ * description of an individual fragment.  that is, a particular piece
+ * of the overall namespace.
+ *
+ * this is conceptually analogous to an ip address and netmask.
+ *
+ * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
+ *
+ * we write it as v/b, where v is a value and b is the number of bits.
+ * 0/0 (bits==0) corresponds to the entire namespace.  if we bisect that,
+ * we get 0/1 and 1/1.  quartering gives us 0/2, 1/2, 2/2, 3/2.  and so on.
+ *
+ * this makes the right most bit of v the "most significant", which is the 
+ * opposite of what we usually see.
+ */
+
+/*
+ * TODO:
+ *  - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) 
+ *    iteration efficient (see, e.g., try_assimilate_children())
+ *  - rework frag_t so that we mask the left-most (most significant) bits instead of
+ *    the right-most (least significant) bits.  just because it's more intuitive, and
+ *    matches the network/netmask concept.
+ */
+
+class frag_t {
+  /*
+   * encoding is dictated by frag_* functions in ceph_fs.h.  use those
+   * helpers _exclusively_.
+   */
+public:
+  using _frag_t = uint32_t;
+  
+  frag_t() = default;
+  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+  frag_t(_frag_t e) : _enc(e) { }
+
+  // constructors
+  void from_unsigned(unsigned e) { _enc = e; }
+  
+  // accessors
+  unsigned value() const { return ceph_frag_value(_enc); }
+  unsigned bits() const { return ceph_frag_bits(_enc); }
+  unsigned mask() const { return ceph_frag_mask(_enc); }
+  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+  operator _frag_t() const { return _enc; }
+
+  // tests
+  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+  bool is_root() const { return bits() == 0; }
+  frag_t parent() const {
+    ceph_assert(bits() > 0);
+    return frag_t(ceph_frag_parent(_enc));
+  }
+
+  // splitting
+  frag_t make_child(int i, int nb) const {
+    ceph_assert(i < (1<<nb));
+    return frag_t(ceph_frag_make_child(_enc, nb, i));
+  }
+  template<typename T>
+  void split(int nb, T& fragments) const {
+    ceph_assert(nb > 0);
+    unsigned nway = 1 << nb;
+    for (unsigned i=0; i<nway; i++) 
+      fragments.push_back(make_child(i, nb));
+  }
+
+  // binary splitting
+  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+  bool is_left() const { return ceph_frag_is_left_child(_enc); }
+  bool is_right() const { return ceph_frag_is_right_child(_enc); }
+  frag_t get_sibling() const {
+    ceph_assert(!is_root());
+    return frag_t(ceph_frag_sibling(_enc));
+  }
+
+  // sequencing
+  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+  frag_t next() const {
+    ceph_assert(!is_rightmost());
+    return frag_t(ceph_frag_next(_enc));
+  }
+
+  // parse
+  bool parse(const char *s) {
+    // parse "<hex value>/<decimal bits>"; *this is left untouched on failure
+    unsigned pvalue = 0;  // %x stores through an unsigned*, not an int*
+    int pbits = 0;
+    if (sscanf(s, "%x/%d", &pvalue, &pbits) != 2)
+      return false;
+    *this = frag_t(pvalue, pbits);
+    return true;
+  }
+
+  void encode(bufferlist& bl) const {
+    encode_raw(_enc, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    __u32 v;
+    decode_raw(v, p);
+    _enc = v;
+  }
+
+private:
+  _frag_t _enc = 0;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
+{
+  //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
+  unsigned num = hb.bits();
+  if (num) {
+    unsigned val = hb.value();
+    for (unsigned bit = 23; num; num--, bit--) 
+      out << ((val & (1<<bit)) ? '1':'0');
+  }
+  return out << '*';
+}
+
+inline void encode(const frag_t &f, bufferlist& bl) { f.encode(bl); }
+inline void decode(frag_t &f, bufferlist::const_iterator& p) { f.decode(p); }
+
+using frag_vec_t = boost::container::small_vector<frag_t, 4>;
+
+/**
+ * fragtree_t -- partition an entire namespace into one or more frag_t's. 
+ */
+class fragtree_t {
+  // pairs <f, b>:
+  //  frag_t f is split by b bits.
+  //  if child frag_t does not appear, it is not split.
+public:
+  compact_map<frag_t,int32_t> _splits;
+
+public:
+  // -------------
+  // basics
+  void swap(fragtree_t& other) {
+    _splits.swap(other._splits);
+  }
+  void clear() {
+    _splits.clear();
+  }
+
+  // -------------
+  // accessors
+  bool empty() const { 
+    return _splits.empty();
+  }
+  int get_split(const frag_t hb) const {
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    if (p == _splits.end())
+      return 0;
+    else
+      return p->second;
+  }
+
+  
+  bool is_leaf(frag_t x) const {
+    frag_vec_t s;
+    get_leaves_under(x, s);
+    //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
+    return s.size() == 1 && s.front() == x;
+  }
+
+  /**
+   * get_leaves -- list all leaves
+   */
+  template<typename T>
+  void get_leaves(T& c) const {
+    return get_leaves_under_split(frag_t(), c);
+  }
+
+  /**
+   * get_leaves_under_split -- list all leaves under a known split point (or root)
+   */
+  template<typename T>
+  void get_leaves_under_split(frag_t under, T& c) const {
+    frag_vec_t s;         // stack of frags still to visit
+    s.push_back(under);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);  // 0 when t has no entry in _splits
+      if (nb) 
+	t.split(nb, s);   // queue up children
+      else
+	c.push_back(t);  // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * get_branch -- get branch point at OR above frag @a x
+   *  - may be @a x itself, if @a x is a split
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      if (get_split(x)) return x;   // found it!
+      x = x.parent();
+    }
+  }
+
+  /**
+   * get_branch_above -- get a branch point above frag @a x
+   *  - may be root (frag_t())
+   *  - may NOT be @a x, even if @a x is a split.
+   */
+  frag_t get_branch_above(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      x = x.parent();
+      if (get_split(x)) return x;   // found it!
+    }
+  }
+
+
+  /**
+   * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
+   *  - may be @a x itself, if @a x is a split or leaf
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch_or_leaf(frag_t x) const {
+    frag_t branch = get_branch(x);
+    int nb = get_split(branch);
+    if (nb > 0 &&                                  // if branch is a split, and
+	branch.bits() + nb <= x.bits())            // one of the children is or contains x 
+      return frag_t(x.value(), branch.bits()+nb);  // then return that child (it's a leaf)
+    else
+      return branch;
+  }
+
+  /**
+   * get_leaves_under(x, ls) -- search for any leaves fully contained by x
+   */
+  template<typename T>
+  void get_leaves_under(frag_t x, T& c) const {
+    frag_vec_t s;         // stack of frags still to visit
+    s.push_back(get_branch_or_leaf(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&    // if t is at least as specific as x, and
+	  !x.contains(t))            // x does not contain t,
+	continue;         // then skip
+      int nb = get_split(t);
+      if (nb) 
+	t.split(nb, s);   // queue up children
+      else if (x.contains(t))
+	c.push_back(t);  // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * contains(fg) -- does fragtree contain the specific frag @a x
+   */
+  bool contains(frag_t x) const {
+    frag_vec_t s;
+    s.push_back(get_branch(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
+	  !x.contains(t))          // x does not contain t,
+	continue;         // then skip 
+      int nb = get_split(t);
+      if (nb) {
+	if (t == x) return false;  // it's split.
+	t.split(nb, s);   // queue up children
+      } else {
+	if (t == x) return true;   // it's there.
+      }
+    }
+    return false;
+  }
+
+  /** 
+   * operator[] -- map a (hash?) value to a frag
+   */
+  frag_t operator[](unsigned v) const {
+    frag_t t;
+    while (1) {
+      ceph_assert(t.contains(v));
+      int nb = get_split(t);
+
+      // is this a leaf?
+      if (nb == 0) return t;  // done.
+      
+      // pick appropriate child fragment.
+      unsigned nway = 1 << nb;
+      unsigned i;
+      for (i=0; i<nway; i++) {
+	frag_t n = t.make_child(i, nb);
+	if (n.contains(v)) {
+	  t = n;
+	  break;
+	}
+      }
+      ceph_assert(i < nway);
+    }
+  }
+
+
+  // ---------------
+  // modifiers
+  void split(frag_t x, int b, bool simplify=true) {
+    ceph_assert(is_leaf(x));
+    _splits[x] = b;
+    
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+  void merge(frag_t x, int b, bool simplify=true) {
+    ceph_assert(!is_leaf(x));
+    ceph_assert(_splits[x] == b);
+    _splits.erase(x);
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+
+  /*
+   * if all of a given split's children are identically split,
+   * then the children can be assimilated.
+   */
+  void try_assimilate_children(frag_t x) {
+    int nb = get_split(x);
+    if (!nb) return;
+    frag_vec_t children;
+    x.split(nb, children);
+    int childbits = 0;
+    for (auto& frag : children) {
+      int cb = get_split(frag);
+      if (!cb) return;  // nope.
+      if (childbits && cb != childbits) return;  // not the same
+      childbits = cb;
+    }
+    // all children are split with childbits!
+    for (auto& frag : children)
+      _splits.erase(frag);
+    _splits[x] += childbits;
+  }
+
+  bool force_to_leaf(CephContext *cct, frag_t x) {
+    if (is_leaf(x))
+      return false;
+
+    lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
+
+    frag_t parent = get_branch_or_leaf(x);
+    ceph_assert(parent.bits() <= x.bits());
+    lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
+
+    // do we need to split from parent to x?
+    if (parent.bits() < x.bits()) {
+      int spread = x.bits() - parent.bits();
+      int nb = get_split(parent);
+      lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
+      if (nb == 0) {
+	// easy: split parent (a leaf) by the difference
+	lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
+	split(parent, spread);
+	ceph_assert(is_leaf(x));
+	return true;
+      }
+      ceph_assert(nb > spread);
+      
+      // add an intermediary split
+      merge(parent, nb, false);
+      split(parent, spread, false);
+
+      frag_vec_t subs;
+      parent.split(spread, subs);
+      for (auto& frag : subs) {
+	lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
+	split(frag, nb - spread, false);
+      }
+    }
+
+    // x is now a leaf or split.  
+    // hoover up any children.
+    frag_vec_t s;
+    s.push_back(x);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb) {
+	lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
+	merge(t, nb, false);    // merge this point, and
+	t.split(nb, s);         // queue up children
+      }
+    }
+
+    lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
+    ceph_assert(is_leaf(x));
+    return true;
+  }
+
+  // encoding
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(_splits, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    decode(_splits, p);
+  }
+  void encode_nohead(bufferlist& bl) const {
+    using ceph::encode;
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+	 p != _splits.end();
+	 ++p) {
+      encode(p->first, bl);
+      encode(p->second, bl);
+    }
+  }
+  void decode_nohead(int n, bufferlist::const_iterator& p) {
+    using ceph::decode;
+    _splits.clear();
+    while (n-- > 0) {
+      frag_t f;
+      decode(f, p);
+      decode(_splits[f], p);
+    }
+  }
+
+  // dump the tree depth-first, one frag per line, indented by its bit depth
+  void print(std::ostream& out) const {  // const: only reads _splits
+    out << "fragtree_t(";
+    frag_vec_t s;
+    s.push_back(frag_t());
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      // every frag except the root starts a new, indented line
+      if (t.bits()) {
+	out << std::endl;
+	for (unsigned i=0; i<t.bits(); i++) out << ' ';
+      }
+      int nb = get_split(t);
+      out << t;
+      if (nb) {
+	out << " %" << nb;
+	t.split(nb, s);   // queue up children
+      }
+    }
+    out << ")";
+  }
+
+  void dump(Formatter *f) const {
+    f->open_array_section("splits");
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+         p != _splits.end();
+         ++p) {
+      f->open_object_section("split");
+      std::ostringstream frag_str;
+      frag_str << p->first;
+      f->dump_string("frag", frag_str.str());
+      f->dump_int("children", p->second);
+      f->close_section(); // split
+    }
+    f->close_section(); // splits
+  }
+};
+WRITE_CLASS_ENCODER(fragtree_t)
+
+inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
+  return l._splits == r._splits;
+}
+inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
+  return l._splits != r._splits;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
+{
+  out << "fragtree_t(";
+  
+  for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
+       p != ft._splits.end();
+       ++p) {
+    if (p != ft._splits.begin())
+      out << " ";
+    out << p->first << "^" << p->second;
+  }
+  return out << ")";
+}
+
+
+/**
+ * fragset_t -- a set of fragments
+ */
+class fragset_t {
+  std::set<frag_t> _set;
+
+public:
+  const std::set<frag_t> &get() const { return _set; }
+  std::set<frag_t>::iterator begin() { return _set.begin(); }
+  std::set<frag_t>::iterator end() { return _set.end(); }
+
+  bool empty() const { return _set.empty(); }
+
+  bool contains(frag_t f) const {
+    while (1) {
+      if (_set.count(f)) return true;
+      if (f.bits() == 0) return false;
+      f = f.parent();
+    }
+  }
+  
+  void insert(frag_t f) {
+    _set.insert(f);
+    simplify();
+  }
+
+  void simplify() {
+    while (1) {
+      bool clean = true;
+      std::set<frag_t>::iterator p = _set.begin();
+      while (p != _set.end()) {
+	if (!p->is_root() &&
+	    _set.count(p->get_sibling())) {
+	  _set.erase(p->get_sibling());
+	  _set.insert(p->parent());
+	  _set.erase(p++);
+	  clean = false;
+	} else {
+	  p++;
+	}
+      }
+      if (clean)
+	break;
+    }
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) 
+{
+  return out << "fragset_t(" << fs.get() << ")";
+}
+
+#endif
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
new file mode 100644
index 00000000..2132db9a
--- /dev/null
+++ b/src/include/fs_types.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INCLUDE_FS_TYPES_H
+#define CEPH_INCLUDE_FS_TYPES_H
+
+#include "types.h"
+
+// --------------------------------------
+// ino
+
+typedef uint64_t _inodeno_t;
+
+struct inodeno_t {
+  _inodeno_t val;  // raw inode number
+  inodeno_t() : val(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  inodeno_t(_inodeno_t v) : val(v) {}
+  inodeno_t& operator+=(inodeno_t o) { val += o.val; return *this; }  // by reference, so chaining updates this object
+  operator _inodeno_t() const { return val; }
+  // encode/decode forward the raw value to the ceph helpers
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(val, bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    decode(val, p);
+  }
+} __attribute__ ((__may_alias__));
+WRITE_CLASS_ENCODER(inodeno_t)
+
+template<>
+struct denc_traits<inodeno_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = true;
+  static void bound_encode(const inodeno_t &o, size_t& p) {
+    denc(o.val, p);
+  }
+  static void encode(const inodeno_t &o, buffer::list::contiguous_appender& p) {
+    denc(o.val, p);
+  }
+  static void decode(inodeno_t& o, buffer::ptr::const_iterator &p) {
+    denc(o.val, p);
+  }
+};
+
+inline ostream& operator<<(ostream& out, const inodeno_t& ino) {
+  return out << hex << "0x" << ino.val << dec;
+}
+
+namespace std {
+  template<> struct hash< inodeno_t >
+  {
+    size_t operator()( const inodeno_t& x ) const
+    {
+      static rjhash<uint64_t> H;
+      return H(x.val);
+    }
+  };
+} // namespace std
+
+
+// file modes
+
+inline bool file_mode_is_readonly(int mode) {
+  return (mode & CEPH_FILE_MODE_WR) == 0;
+}
+
+
+// dentries
+#define MAX_DENTRY_LEN 255
+
+// --
+namespace ceph {
+  class Formatter;
+}
+void dump(const ceph_file_layout& l, ceph::Formatter *f);
+void dump(const ceph_dir_layout& l, ceph::Formatter *f);
+
+
+
+// file_layout_t
+
+struct file_layout_t {
+  // file -> object mapping
+  uint32_t stripe_unit;   ///< stripe unit, in bytes,
+  uint32_t stripe_count;  ///< over this many objects
+  uint32_t object_size;   ///< until objects are this big
+
+  int64_t pool_id;        ///< rados pool id
+  string pool_ns;         ///< rados pool namespace
+
+  file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0)
+    : stripe_unit(su),
+      stripe_count(sc),
+      object_size(os),
+      pool_id(-1) {
+  }
+
+  static file_layout_t get_default() {
+    return file_layout_t(1<<22, 1, 1<<22);
+  }
+
+  uint64_t get_period() const {
+    return static_cast<uint64_t>(stripe_count) * object_size;
+  }
+
+  void from_legacy(const ceph_file_layout& fl);
+  void to_legacy(ceph_file_layout *fl) const;
+
+  bool is_valid() const;
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<file_layout_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(file_layout_t)
+
+WRITE_EQ_OPERATORS_5(file_layout_t, stripe_unit, stripe_count, object_size, pool_id, pool_ns);
+
+ostream& operator<<(ostream& out, const file_layout_t &layout);
+
+#endif
diff --git a/src/include/hash.h b/src/include/hash.h
new file mode 100644
index 00000000..2ab95448
--- /dev/null
+++ b/src/include/hash.h
@@ -0,0 +1,64 @@
+#ifndef CEPH_HASH_H
+#define CEPH_HASH_H
+
+#include "acconfig.h"
+
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+
+#define hashmix(a,b,c) \
+	a=a-b;  a=a-c;  a=a^(c>>13); \
+	b=b-c;  b=b-a;  b=b^(a<<8);  \
+	c=c-a;  c=c-b;  c=c^(b>>13); \
+	a=a-b;  a=a-c;  a=a^(c>>12); \
+	b=b-c;  b=b-a;  b=b^(a<<16); \
+	c=c-a;  c=c-b;  c=c^(b>>5);  \
+	a=a-b;  a=a-c;  a=a^(c>>3); \
+	b=b-c;  b=b-a;  b=b^(a<<10); \
+	c=c-a;  c=c-b;  c=c^(b>>15);
+
+
+//namespace ceph {
+
+template <class _Key> struct rjhash { };
+
+inline uint64_t rjhash64(uint64_t key) {
+  key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+  key = key ^ (key >> 24);
+  key = (key + (key << 3)) + (key << 8); // key * 265
+  key = key ^ (key >> 14);
+  key = (key + (key << 2)) + (key << 4); // key * 21
+  key = key ^ (key >> 28);
+  key = key + (key << 31);
+  return key;
+}
+
+inline uint32_t rjhash32(uint32_t a) {
+  a = (a+0x7ed55d16) + (a<<12);
+  a = (a^0xc761c23c) ^ (a>>19);
+  a = (a+0x165667b1) + (a<<5);
+  a = (a+0xd3a2646c) ^ (a<<9);
+  a = (a+0xfd7046c5) + (a<<3);
+  a = (a^0xb55a4f09) ^ (a>>16);
+  return a;
+}
+
+
+template<> struct rjhash<uint32_t> {
+  inline size_t operator()(const uint32_t x) const {
+    return rjhash32(x);
+  }
+};
+
+template<> struct rjhash<uint64_t> {
+  inline size_t operator()(const uint64_t x) const {
+    return rjhash64(x);
+  }
+};
+
+//}
+
+
+
+#endif
diff --git a/src/include/health.h b/src/include/health.h
new file mode 100644
index 00000000..5c00225e
--- /dev/null
+++ b/src/include/health.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "include/encoding.h"
+
+// health_status_t
+enum health_status_t {
+  HEALTH_ERR = 0,
+  HEALTH_WARN = 1,
+  HEALTH_OK = 2,
+};
+
+inline void encode(health_status_t hs, bufferlist& bl) {
+  using ceph::encode;
+  uint8_t v = hs;
+  encode(v, bl);
+}
+inline void decode(health_status_t& hs, bufferlist::const_iterator& p) {
+  using ceph::decode;
+  uint8_t v;
+  decode(v, p);
+  hs = health_status_t(v);
+}
+template<>
+struct denc_traits<health_status_t> {  // denc support: the status travels as one uint8_t
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = false;
+  static void bound_encode(const health_status_t& v, size_t& p, uint64_t f=0) {
+    p++;  // one byte, matching the uint8_t written by encode()
+  }
+  static void encode(const health_status_t& v,
+		     buffer::list::contiguous_appender& p,
+		     uint64_t f=0) {
+    ::denc((uint8_t)v, p);
+  }
+  static void decode(health_status_t& v, buffer::ptr::const_iterator& p,
+		     uint64_t f=0) {
+    uint8_t tmp;
+    ::denc(tmp, p);
+    v = health_status_t(tmp);  // no range check on the decoded byte
+  }
+  static void decode(health_status_t& v, buffer::list::const_iterator& p,
+		     uint64_t f=0) {
+    uint8_t tmp;
+    ::denc(tmp, p);
+    v = health_status_t(tmp);  // no range check on the decoded byte
+  }
+};
+
+inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) {
+  switch (status) {
+    case HEALTH_ERR:
+      oss << "HEALTH_ERR";
+      break;
+    case HEALTH_WARN:
+      oss << "HEALTH_WARN";
+      break;
+    case HEALTH_OK:
+      oss << "HEALTH_OK";
+      break;
+  }
+  return oss;
+}
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 00000000..48d88976
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+#if defined(__GNUC__)
+
+// Copy l bytes from src to dest and return dest.  Copies longer than
+// inline_len are passed to memcpy(); anything shorter is expanded in
+// place, since the common case is a very small copy.
+static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                                        size_t inline_len)
+  __attribute__((always_inline));
+
+void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+                          size_t inline_len)
+{
+  if (l > inline_len)
+    return memcpy(dest, src, l);
+
+  // Lengths that map onto one fixed-size copy.
+  switch (l) {
+  case 8: return __builtin_memcpy(dest, src, 8);
+  case 4: return __builtin_memcpy(dest, src, 4);
+  case 3: return __builtin_memcpy(dest, src, 3);
+  case 2: return __builtin_memcpy(dest, src, 2);
+  case 1: return __builtin_memcpy(dest, src, 1);
+  default: break;
+  }
+
+  // Every other length (including 0, which copies nothing): 8-byte
+  // chunks first, then 4-byte chunks, then byte by byte.
+  char *out = (char*)dest;
+  const char *in = (const char*)src;
+  for (; l >= sizeof(uint64_t); l -= sizeof(uint64_t)) {
+    __builtin_memcpy(out, in, sizeof(uint64_t));
+    out += sizeof(uint64_t);
+    in += sizeof(uint64_t);
+  }
+  for (; l >= sizeof(uint32_t); l -= sizeof(uint32_t)) {
+    __builtin_memcpy(out, in, sizeof(uint32_t));
+    out += sizeof(uint32_t);
+    in += sizeof(uint32_t);
+  }
+  for (; l > 0; --l) {
+    *out = *in;
+    ++out;
+    ++in;
+  }
+  return dest;
+}
+
+#else
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+#endif
+
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+namespace ceph {
+typedef unsigned uint128_t __attribute__ ((mode (TI)));  // GCC mode(TI): 128-bit unsigned integer
+}
+using ceph::uint128_t;
+
+// Return true iff each of the len bytes starting at data is zero.
+static inline bool mem_is_zero(const char *data, size_t len)
+  __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+  // x86-64 always has XMM registers, so whenever at least 16 bytes need
+  // checking, compare 16 bytes at a time.
+  if (len >= sizeof(uint128_t)) {
+    // First walk byte-wise (at most 15 bytes) up to a 16-byte boundary.
+    // Without this the 128-bit loads below can segfault, because (at
+    // least some) GCC versions emit MOVAPS instead of MOVUPS for them.
+    for (; ((unsigned long long)data) & 15; ++data, --len) {
+      if (*(uint8_t*)data != 0) {
+        return false;
+      }
+    }
+
+    // Aligned 16-byte chunks.
+    const char* const stop128 =
+      data + (len / sizeof(uint128_t))*sizeof(uint128_t);
+    for (; data < stop128; data += sizeof(uint128_t)) {
+      if (*(uint128_t*)data != 0) {
+        return false;
+      }
+    }
+    len %= sizeof(uint128_t);   // bytes left over after the chunks
+  }
+
+  // Whatever remains (the whole buffer when it is shorter than 16 bytes):
+  // 4-byte words first, then single bytes.
+  const char* const stop = data + len;
+  const char* const stop32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t);
+  for (; data < stop32; data += sizeof(uint32_t)) {
+    if (*(uint32_t*)data != 0) {
+      return false;
+    }
+  }
+
+  for (; data < stop; ++data) {
+    if (*(uint8_t*)data != 0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+#else  // gcc and x86_64
+
+// Portable fallback: true iff all len bytes at data are zero.
+static inline bool mem_is_zero(const char *data, size_t len) {
+  const char *end = data + len;
+  const char *end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t);
+  // Word-at-a-time scan.  data need not be 8-byte aligned, so each word is
+  // loaded with memcpy rather than by dereferencing a cast pointer.
+  for (; data < end64; data += sizeof(uint64_t)) {
+    uint64_t word;
+    memcpy(&word, data, sizeof(word));
+    if (word != 0)
+      return false;
+  }
+  // trailing bytes (fewer than 8)
+  for (; data < end; ++data) {
+    if (*data != 0)
+      return false;
+  }
+  return true;
+}
+
+#endif  // !x86_64
+
+#endif
diff --git a/src/include/int_types.h b/src/include/int_types.h
new file mode 100644
index 00000000..56b2723f
--- /dev/null
+++ b/src/include/int_types.h
@@ -0,0 +1,65 @@
+#ifndef CEPH_INTTYPES_H
+#define CEPH_INTTYPES_H
+
+#include "acconfig.h"
+
+#include <inttypes.h>
+
+#ifdef HAVE_LINUX_TYPES_H
+#include <linux/types.h>
+#else
+#ifndef HAVE___U8
+typedef uint8_t __u8;
+#endif
+
+#ifndef HAVE___S8
+typedef int8_t __s8;
+#endif
+
+#ifndef HAVE___U16
+typedef uint16_t __u16;
+#endif
+
+#ifndef HAVE___S16
+typedef int16_t __s16;
+#endif
+
+#ifndef HAVE___U32
+typedef uint32_t __u32;
+#endif
+
+#ifndef HAVE___S32
+typedef int32_t __s32;
+#endif
+
+#ifndef HAVE___U64
+typedef uint64_t __u64;
+#endif
+
+#ifndef HAVE___S64
+typedef int64_t __s64;
+#endif
+#endif /* !HAVE_LINUX_TYPES_H */
+
+#define __bitwise__  /* defined empty: a no-op marker in the typedefs below */
+
+typedef __u16 __bitwise__ __le16;
+typedef __u16 __bitwise__ __be16;
+typedef __u32 __bitwise__ __le32;
+typedef __u32 __bitwise__ __be32;
+typedef __u64 __bitwise__ __le64;
+typedef __u64 __bitwise__ __be64;
+
+#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#endif
+
+#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE
+#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need
+#endif
+
+#ifndef BOOST_MPL_LIMIT_MAP_SIZE
+#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need
+#endif
+
+#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
new file mode 100644
index 00000000..e912cbe7
--- /dev/null
+++ b/src/include/intarith.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INTARITH_H
+#define CEPH_INTARITH_H
+
+#include <type_traits>
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) {
+  return (n + d - 1) / d;  // n / d rounded up (for n >= 0, d > 0); n + d - 1 must not overflow
+}
+
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) {
+  return (n % d ? (n + d - n % d) : n);  // n itself if a multiple of d, else the next multiple up
+}
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) {
+  return (x + (T(1) << y) - 1) >> y;  // x / 2^y rounded up; T(1) keeps the shift in T's width for y >= 31
+}
+
+/*
+ * True when x has at most one bit set: x is a power of 2, or x is zero.
+ */
+template<typename T>
+constexpr inline bool isp2(T x) {
+  return !(x & (x - 1));
+}
+
+/*
+ * Wrappers for various sorts of alignment and rounding.  The "align" must
+ * be a power of 2.  Often times it is a block, sector, or page.
+ */
+
+/* Round x down to the nearest multiple of align (align is a power of 2).
+ * eg, p2align(1200, 1024) == 1024 (1*align)
+ * eg, p2align(1024, 1024) == 1024 (1*align)
+ * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2align(T x, T align) {
+  const auto keep_mask = -align;   // every bit from the align bit upwards
+  return x & keep_mask;
+}
+
+/* Offset of x within its align-sized block, i.e. x % align.
+ * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+template<typename T>
+constexpr inline T p2phase(T x, T align) {
+  const auto low_bits = align - 1;
+  return x & low_bits;
+}
+
+/* Distance from x up to the next align boundary; 0 when x is already
+ * aligned.
+ * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+template<typename T>
+constexpr inline T p2nphase(T x, T align) {
+  const auto low_bits = align - 1;
+  return -x & low_bits;
+}
+
+/* Round x up to the nearest multiple of align (align is a power of 2).
+ * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2roundup(T x, T align) {
+  const auto neg_floor = -x & -align;   // -x rounded down to a multiple of align
+  return -neg_floor;
+}
+
+// Count trailing zero bits of v.
+// v == 0 returns the bit width of T and never reaches the compiler
+// builtin, whose result is undefined for a 0 argument.
+//
+// One overload per builtin, selected by sizeof(T).  This one covers
+// integers no wider than unsigned int.
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> ctz(T v) {
+  return v == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_ctz(v));
+}
+
+// wider than unsigned int, no wider than unsigned long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> ctz(T v) {
+  return v == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_ctzl(v));
+}
+
+// wider than unsigned long, no wider than unsigned long long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> ctz(T v) {
+  return v == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_ctzll(v));
+}
+
+// Count leading zero bits of v within T's own width; v == 0 returns that
+// width (the compiler builtins are undefined for a 0 argument).
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> clz(T v) {
+  // __builtin_clz works on unsigned int: strip sign-extension and discount
+  // the pad bits of a narrower T, so clz(uint8_t(1)) is 7 rather than 31.
+  constexpr unsigned pad = (sizeof(unsigned) - sizeof(T)) * 8;
+  const unsigned bits = static_cast<unsigned>(v) & (~0u >> pad);
+  return bits == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_clz(bits)) - pad;
+}
+
+// wider than unsigned int, no wider than unsigned long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> clz(T v) {
+  return v == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_clzl(v));
+}
+
+// wider than unsigned long, no wider than unsigned long long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> clz(T v) {
+  return v == 0 ? unsigned(sizeof(v) * 8) : unsigned(__builtin_clzll(v));
+}
+
+// Count bits (set + any 0's that follow): index of the top set bit plus one; 0 for 0.
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) <= sizeof(unsigned)),
+  unsigned> cbits(T v) {
+  // Work on the unsigned int image of v (sign-extension masked off) and
+  // count against unsigned int's width, which is what __builtin_clz uses.
+  constexpr unsigned pad = (sizeof(unsigned) - sizeof(T)) * 8;
+  const unsigned bits = static_cast<unsigned>(v) & (~0u >> pad);
+  return bits == 0 ? 0u : unsigned(sizeof(unsigned) * 8) - unsigned(__builtin_clz(bits));
+}
+
+// wider than unsigned int, no wider than unsigned long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned int) &&
+   sizeof(T) <= sizeof(unsigned long)),
+  unsigned> cbits(T v) {
+  return v == 0 ? 0u : unsigned(sizeof(v) * 8) - unsigned(__builtin_clzl(v));
+}
+
+// wider than unsigned long, no wider than unsigned long long
+template<class T>
+inline std::enable_if_t<
+  (std::is_integral<T>::value &&
+   sizeof(T) > sizeof(unsigned long) &&
+   sizeof(T) <= sizeof(unsigned long long)),
+  unsigned> cbits(T v) {
+  return v == 0 ? 0u : unsigned(sizeof(v) * 8) - unsigned(__builtin_clzll(v));
+}
+
+#endif
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
new file mode 100644
index 00000000..4fb6be45
--- /dev/null
+++ b/src/include/interval_set.h
@@ -0,0 +1,783 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_INTERVAL_SET_H
+#define CEPH_INTERVAL_SET_H
+
+#include <iterator>
+#include <map>
+#include <ostream>
+
+#include "encoding.h"
+
+/*
+ * *** NOTE ***
+ *
+ * This class is written to work with a variety of map-like containers,
+ * *including* ones that invalidate iterators when they are modified (e.g.,
+ * flat_map and btree_map).
+ */
+
+template<typename T, typename Map = std::map<T,T>>
+class interval_set {
+ public:
+  using value_type = T;
+
+  class const_iterator;
+
+  // Mutable forward iterator over the stored (start, length) entries.
+  class iterator : public std::iterator <std::forward_iterator_tag, T>
+  {
+    public:
+        explicit iterator(typename Map::iterator iter)
+          : _iter(iter)
+        { }
+
+        bool operator==(const iterator& rhs) const {
+          return (_iter == rhs._iter);
+        }
+
+        bool operator!=(const iterator& rhs) const {
+          return (_iter != rhs._iter);
+        }
+
+        // Dereference to the underlying map entry.  The return type is the
+        // map iterator's own reference type: a std::map entry is a
+        // std::pair<const T, T>, which cannot bind to std::pair<T, T>&.
+        typename std::iterator_traits<typename Map::iterator>::reference
+        operator*() {
+                return *_iter;
+        }
+
+        // Return the interval start.
+        T get_start() const {
+                return _iter->first;
+        }
+
+        // Return the interval length.
+        T get_len() const {
+                return _iter->second;
+        }
+        // Return the interval end (start + length).
+        T get_end() const {
+                return _iter->first + _iter->second;
+        }
+
+        // Set the interval length (the owning set's _size is not adjusted).
+        void set_len(T len) {
+                _iter->second = len;
+        }
+
+        // Preincrement
+        iterator &operator++() {
+                ++_iter;
+                return *this;
+        }
+
+        // Postincrement
+        iterator operator++(int) {
+                iterator prev(_iter);
+                ++_iter;
+                return prev;
+        }
+
+    friend class interval_set<T,Map>::const_iterator;
+
+    protected:
+        typename Map::iterator _iter;
+    friend class interval_set<T,Map>;
+  };
+
+  // Read-only counterpart of iterator, wrapping Map::const_iterator.
+  class const_iterator : public std::iterator <std::forward_iterator_tag, T>
+  {
+    public:
+        explicit const_iterator(typename Map::const_iterator iter)
+          : _iter(iter)
+        { }
+
+        // Implicit conversion from the mutable iterator.
+        const_iterator(const iterator &i)
+          : _iter(i._iter)
+        { }
+
+        // Copy construction and assignment are compiler-generated.
+
+        bool operator==(const const_iterator& other) const {
+          return _iter == other._iter;
+        }
+
+        bool operator!=(const const_iterator& other) const {
+          return _iter != other._iter;
+        }
+
+        // Dereference: a (start, length) pair returned by value.
+        std::pair < T, T > operator*() const {
+                return *_iter;
+        }
+
+        // Interval start.
+        T get_start() const {
+                return _iter->first;
+        }
+        // Interval end (start + length).
+        T get_end() const {
+                return _iter->first + _iter->second;
+        }
+
+        // Interval length.
+        T get_len() const {
+                return _iter->second;
+        }
+
+        // Preincrement
+        const_iterator &operator++() {
+                ++_iter;
+                return *this;
+        }
+
+        // Postincrement
+        const_iterator operator++(int) {
+                const_iterator before(_iter);
+                ++_iter;
+                return before;
+        }
+
+    protected:
+        typename Map::const_iterator _iter;
+  };
+
+  interval_set() : _size(0) {}
+  // Take over the contents of other (which is left holding this set's
+  // initial, empty map) and total up the interval lengths.
+  interval_set(Map& other) : _size(0) {
+    m.swap(other);
+    for (const auto& iv : m)
+      _size += iv.second;
+  }
+
+  // Number of intervals (map entries) currently stored.
+  int num_intervals() const {
+    return m.size();
+  }
+
+  // Mutable iteration over the intervals.
+  // lower_bound(start) is the interval containing start or, when there is
+  // none, the first interval beginning after it (end() if there is none).
+  iterator begin() {
+    return iterator(m.begin());
+  }
+  iterator lower_bound(T start) {
+    return iterator(find_inc_m(start));
+  }
+  iterator end() {
+    return iterator(m.end());
+  }
+
+  // Read-only equivalents.
+  const_iterator begin() const {
+    return const_iterator(m.begin());
+  }
+  const_iterator lower_bound(T start) const {
+    return const_iterator(find_inc(start));
+  }
+  const_iterator end() const {
+    return const_iterator(m.end());
+  }
+
+  // helpers
+ private:
+  // Entry containing start, else the first entry beginning after it (or end).
+  typename Map::const_iterator find_inc(T start) const {
+    auto p = m.lower_bound(start);  // p->first >= start
+    if (p == m.begin() || (p != m.end() && p->first <= start))
+      return p;
+    --p;       // predecessor might overlap start
+    if (p->first + p->second <= start)
+      ++p;     // it doesn't
+    return p;
+  }
+
+  // Non-const twin of find_inc().
+  typename Map::iterator find_inc_m(T start) {
+    auto p = m.lower_bound(start);
+    if (p == m.begin() || (p != m.end() && p->first <= start))
+      return p;
+    --p;       // predecessor might overlap start
+    if (p->first + p->second <= start)
+      ++p;     // it doesn't
+    return p;
+  }
+
+  // Like find_inc(), but an entry that ends exactly at start also counts.
+  typename Map::const_iterator find_adj(T start) const {
+    auto p = m.lower_bound(start);
+    if (p == m.begin() || (p != m.end() && p->first <= start))
+      return p;
+    --p;       // predecessor might touch start
+    if (p->first + p->second < start)
+      ++p;     // it doesn't
+    return p;
+  }
+
+  // Non-const twin of find_adj().
+  typename Map::iterator find_adj_m(T start) {
+    auto p = m.lower_bound(start);
+    if (p == m.begin() || (p != m.end() && p->first <= start))
+      return p;
+    --p;       // predecessor might touch start
+    if (p->first + p->second < start)
+      ++p;     // it doesn't
+    return p;
+  }
+
+  // Add the intersection of s and l to this set (m and _size); nothing is
+  // cleared here.  Positions in l are located with find_inc(), i.e. by
+  // lower_bound lookup rather than a linear walk.  s must not be empty.
+  void intersection_size_asym(const interval_set &s, const interval_set &l) {
+    typename decltype(m)::const_iterator ps = s.m.begin(), pl;
+    ceph_assert(ps != s.m.end());
+    T offset = ps->first;
+    typename decltype(m)::iterator mi = m.begin();   // insertion hint
+
+    while (1) {
+      pl = l.find_inc(offset);
+      if (pl == l.m.end())
+        break;
+      while (ps != s.m.end() && ps->first + ps->second <= pl->first)
+        ++ps;
+      if (ps == s.m.end())
+        break;
+      offset = pl->first + pl->second;
+      if (offset <= ps->first) {
+        offset = ps->first;
+        continue;
+      }
+
+      if (*ps == *pl) {
+        do {
+          mi = m.insert(mi, *ps);
+          _size += ps->second;
+          ++ps;
+          ++pl;
+        } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl);
+        if (ps == s.m.end())
+          break;
+        offset = ps->first;
+        continue;
+      }
+
+      T start = std::max<T>(ps->first, pl->first);
+      T en = std::min<T>(ps->first + ps->second, offset);
+      ceph_assert(en > start);
+      typename decltype(m)::value_type i{start, en - start};
+      mi = m.insert(mi, i);
+      _size += i.second;
+      if (ps->first + ps->second <= offset) {
+        ++ps;
+        if (ps == s.m.end())
+          break;
+        offset = ps->first;
+      }
+    }
+  }
+
+  // True iff every interval of this set lies within a single interval of
+  // b.  Both maps are walked in step.
+  bool subset_size_sym(const interval_set &b) const {
+    auto mine = m.begin(), theirs = b.m.begin();
+    const auto mine_end = m.end(), theirs_end = b.m.end();
+
+    while (mine != mine_end && theirs != theirs_end) {
+      // skip intervals of b that end at or before the start of ours
+      while (theirs->first + theirs->second <= mine->first) {
+        if (++theirs == theirs_end)
+          return false;
+      }
+
+      // a run of identical intervals advances both sides together
+      if (*mine == *theirs) {
+        do {
+          ++mine;
+          ++theirs;
+        } while (mine != mine_end && theirs != theirs_end && *mine == *theirs);
+        continue;
+      }
+
+      // ours must neither begin before nor reach past the one in b
+      if (mine->first < theirs->first ||
+          mine->first + mine->second > theirs->first + theirs->second)
+        return false;
+      ++mine;
+    }
+
+    return mine == mine_end;
+  }
+  
+ public:
+  bool operator==(const interval_set& other) const {
+    return _size == other._size && m == other.m;  // cached totals first, then the maps
+  }
+
+  int64_t size() const {
+    return _size;  // cached sum of all interval lengths (not the interval count)
+  }
+
+  // denc hooks.  Only the map m is encoded; after every decode _size is
+  // recomputed as the sum of the decoded interval lengths.
+  void bound_encode(size_t& p) const {
+    denc_traits<Map>::bound_encode(m, p);
+  }
+  void encode(bufferlist::contiguous_appender& p) const {
+    denc(m, p);
+  }
+  void decode(bufferptr::const_iterator& p) {
+    denc(m, p);
+    _size = 0;
+    for (const auto& iv : m)
+      _size += iv.second;
+  }
+  void decode(bufferlist::iterator& p) {
+    denc(m, p);
+    _size = 0;
+    for (const auto& iv : m)
+      _size += iv.second;
+  }
+
+  // _nohead variants; n is passed through to the map's decode_nohead.
+  void encode_nohead(bufferlist::contiguous_appender& p) const {
+    denc_traits<Map>::encode_nohead(m, p);
+  }
+  void decode_nohead(int n, bufferptr::const_iterator& p) {
+    denc_traits<Map>::decode_nohead(n, m, p);
+    _size = 0;
+    for (const auto& iv : m)
+      _size += iv.second;
+  }
+
+  void clear() {
+    m.clear();
+    _size = 0;  // keep the cached total in step with the now-empty map
+  }
+
+  // Is i in the set?  If so, *pstart / *plen (when non-null) get its interval.
+  bool contains(T i, T *pstart=0, T *plen=0) const {
+    const auto p = find_inc(i);
+    if (p == m.end() || p->first > i || p->first+p->second <= i)
+      return false;
+    ceph_assert(p->first <= i && p->first+p->second > i);
+    if (pstart)
+      *pstart = p->first;
+    if (plen)
+      *plen = p->second;
+    return true;
+  }
+  // Is the whole range [start, start+len) inside one stored interval?
+  // The interval holding start must reach at least as far as start+len.
+  bool contains(T start, T len) const {
+    const auto p = find_inc(start);
+    if (p == m.end() || p->first > start || p->first+p->second <= start)
+      return false;
+    ceph_assert(p->first <= start && p->first+p->second > start);
+    return !(p->first+p->second < start+len);
+  }
+  // Does [start, start+len) overlap any interval?  len must be > 0.
+  // find_inc() yields the interval containing start or, failing that, the
+  // first one beginning after it; only that one can be the first overlap.
+  bool intersects(T start, T len) const {
+    ceph_assert(len > 0);
+    const auto p = find_inc(start);
+    return p != m.end() && p->first < start + len;
+  }
+
+  // outer range of set
+  bool empty() const {
+    return m.empty();
+  }
+  // Start of the first interval; the set must not be empty.
+  T range_start() const {
+    ceph_assert(!empty());
+    return m.begin()->first;
+  }
+  // End (start + length) of the last interval; the set must not be empty.
+  T range_end() const {
+    ceph_assert(!empty());
+    const auto last = std::prev(m.end());
+    return last->first + last->second;
+  }
+
+  // Is there an interval beginning after i?  (i must not be in the set)
+  bool starts_after(T i) const {
+    ceph_assert(!contains(i));
+    return find_inc(i) != m.end();
+  }
+  // Start of the first interval after i (asserts that one exists).
+  T start_after(T i) const {
+    ceph_assert(!contains(i));
+    const auto p = find_inc(i);
+    ceph_assert(p != m.end());
+    return p->first;
+  }
+
+  // End of the interval that contains start (start must be in the set).
+  T end_after(T start) const {
+    ceph_assert(contains(start));
+    const auto hit = find_inc(start);
+    return hit->first + hit->second;
+  }
+  
+  void insert(T val) {
+    insert(val, 1);  // a single value is an interval of length 1
+  }
+
+  // Add [start, start+len) to the set, merging it with an existing interval
+  // that ends exactly at start and/or one that begins exactly at start+len.
+  // It must not overlap anything already present (ceph_abort / assert).
+  // pstart / plen, when non-null, receive the resulting interval.
+  void insert(T start, T len, T *pstart=0, T *plen=0) {
+    ceph_assert(len > 0);
+    _size += len;
+    typename Map::iterator p = find_adj_m(start);
+    if (p == m.end()) {
+      // nothing touches start or lies after it: new interval
+      m[start] = len;
+      if (pstart)
+        *pstart = start;
+      if (plen)
+        *plen = len;
+      return;
+    }
+    if (p->first < start) {
+      // p begins before start, so it has to end exactly at start
+      if (p->first + p->second != start)
+        ceph_abort();
+      p->second += len;               // append to end
+      typename Map::iterator n = p;
+      n++;
+      if (pstart)
+        *pstart = p->first;
+      if (n != m.end() && start+len == n->first) {
+        // combine with next, too!
+        p->second += n->second;
+        if (plen)
+          *plen = p->second;
+        m.erase(n);
+      } else if (plen) {
+        *plen = p->second;
+      }
+      return;
+    }
+    if (start+len == p->first) {
+      // p begins exactly where the new range ends: prepend to it
+      if (pstart)
+        *pstart = start;
+      if (plen)
+        *plen = len + p->second;
+      T psecond = p->second;
+      m.erase(p);
+      m[start] = len + psecond;
+    } else {
+      ceph_assert(p->first > start+len);
+      if (pstart)
+        *pstart = start;
+      if (plen)
+        *plen = len;
+      m[start] = len;                 // new interval
+    }
+  }
+
+  void swap(interval_set<T,Map>& other) {
+    m.swap(other.m);
+    std::swap(_size, other._size);  // the cached total travels with the map
+  }    
+  
+  void erase(iterator &i) {
+    _size -= i.get_len();  // the whole interval i points at goes away
+    ceph_assert(_size >= 0);
+    m.erase(i._iter);
+  }
+
+  void erase(T val) {
+    erase(val, 1);  // a single value is an interval of length 1
+  }
+
+  // Remove [start, start+len), which must lie within a single interval.
+  // The parts of that interval before and after the removed range stay in
+  // the set, unless claim is set and claim(offset, length) returns true
+  // for a part -- then that part is dropped from the set (and _size) too.
+  void erase(T start, T len,
+    std::function<bool(T, T)> claim = {}) {
+    typename Map::iterator p = find_inc_m(start);
+
+    _size -= len;
+    ceph_assert(_size >= 0);
+    ceph_assert(p != m.end());
+    ceph_assert(p->first <= start);
+
+    T before = start - p->first;
+    ceph_assert(p->second >= before+len);
+    T after = p->second - before - len;
+    if (!before) {
+      m.erase(p);
+    } else if (claim && claim(p->first, before)) {
+      _size -= before;
+      m.erase(p);
+    } else {
+      p->second = before;        // shorten bit before
+    }
+    if (after) {
+      if (claim && claim(start + len, after))
+        _size -= after;
+      else
+        m[start + len] = after;
+    }
+  }
+
+  // Erase every interval of a from this set.  Each one goes through
+  // erase(), so it must lie within a single interval of this set.
+  void subtract(const interval_set &a) {
+    for (const auto& iv : a.m)
+      erase(iv.first, iv.second);
+  }
+
+  // Insert every interval of a.  Each one goes through insert(), so it may
+  // touch, but must not overlap, what is already present.
+  void insert(const interval_set &a) {
+    for (const auto& iv : a.m)
+      insert(iv.first, iv.second);
+  }
+
+
+  // Replace this set with the intersection of a and b.  Neither argument
+  // may be this set itself.  Depending on the size ratio of the operands
+  // the work is delegated to intersection_size_asym() or done by a merge.
+  void intersection_of(const interval_set &a, const interval_set &b) {
+    ceph_assert(&a != this);
+    ceph_assert(&b != this);
+    clear();
+
+    // s: the operand with the smaller size(), l: the other one
+    const bool a_smaller = a.size() < b.size();
+    const interval_set *s = a_smaller ? &a : &b;
+    const interval_set *l = a_smaller ? &b : &a;
+
+    if (!s->size())
+      return;
+
+    /*
+     * Use the lower_bound algorithm for larger size ratios
+     * where it performs better, but not for smaller size
+     * ratios where sequential search performs better.
+     */
+    if (l->size() / s->size() >= 10) {
+      intersection_size_asym(*s, *l);
+      return;
+    }
+
+    // walk both maps in step
+    auto pa = a.m.begin();
+    auto pb = b.m.begin();
+    auto mi = m.begin();   // insertion hint
+
+    while (pa != a.m.end() && pb != b.m.end()) {
+      // one interval lies entirely before the other: advance it
+      if (pa->first + pa->second <= pb->first) { ++pa; continue; }
+      if (pb->first + pb->second <= pa->first) { ++pb; continue; }
+
+      // identical intervals are copied as a run
+      if (*pa == *pb) {
+        do {
+          mi = m.insert(mi, *pa);
+          _size += pa->second;
+          ++pa;
+          ++pb;
+        } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+        continue;
+      }
+
+      // partial overlap: emit the common part, then advance whichever
+      // interval ends first
+      const T start = std::max(pa->first, pb->first);
+      const T en = std::min(pa->first+pa->second, pb->first+pb->second);
+      ceph_assert(en > start);
+      typename decltype(m)::value_type i{start, en - start};
+      mi = m.insert(mi, i);
+      _size += i.second;
+      if (pa->first+pa->second > pb->first+pb->second)
+        ++pb;
+      else
+        ++pa;
+    }
+  }
+  void intersection_of(const interval_set& b) {
+    interval_set a;
+    swap(a);  // a takes the current contents; this set becomes empty
+    intersection_of(a, b);
+  }
+
+  // Replace this set with the union of a and b.  Neither argument may be
+  // this set itself.  Computed as a - (a*b) + b, so that the final insert
+  // of b never meets an overlapping interval.
+  void union_of(const interval_set &a, const interval_set &b) {
+    ceph_assert(&a != this);
+    ceph_assert(&b != this);
+    clear();
+
+    // start from a copy of a
+    m = a.m;
+    _size = a._size;
+
+    // take out what a and b have in common
+    interval_set common;
+    common.intersection_of(a, b);
+    subtract(common);
+
+    // and add all of b
+    insert(b);
+  }
+  void union_of(const interval_set &b) {
+    interval_set a;
+    swap(a);    // a takes the current contents; this set becomes empty
+    union_of(a, b);
+  }
+  void union_insert(T off, T len) {
+    interval_set a;
+    a.insert(off, len);  // one-interval set for [off, off+len); insert() requires len > 0
+    union_of(a);         // unlike insert(), overlap with existing intervals is allowed
+  }
+
+  // Is every value of this set also in big?
+  bool subset_of(const interval_set &big) const {
+    if (!size())
+      return true;
+    if (size() > big.size())
+      return false;
+    if (range_end() > big.range_end())
+      return false;
+
+    /*
+     * Use the lower_bound algorithm for larger size ratios
+     * where it performs better, but not for smaller size
+     * ratios where sequential search performs better.
+     */
+    if (big.size() / size() < 10)
+      return subset_size_sym(big);
+
+    for (const auto& iv : m)
+      if (!big.contains(iv.first, iv.second))
+        return false;
+    return true;
+  }
+
+  /*
+   * build a subset of @other, starting at or after @start, and including
+   * @len worth of values, skipping holes.  e.g.,
+   *  span_of([5~5,20~5], 8, 5) -> [8~2,20~3]
+   */
+  void span_of(const interval_set &other, T start, T len) {
+    clear();
+    typename Map::const_iterator p = other.find_inc(start);
+    if (p == other.m.end())
+      return;
+    // first interval starts before @start: take only its tail
+    if (p->first < start) {
+      if (p->first + p->second < start)
+        return;
+      if (!(p->first + p->second < start + len)) {
+        insert(start, len);       // the whole span fits in this interval
+        return;
+      }
+      T howmuch = p->second - (start - p->first);
+      insert(start, howmuch);
+      len -= howmuch;
+      p++;
+    }
+
+    // then whole intervals, until @len values have been collected
+    for (; p != other.m.end() && len > 0; p++) {
+      if (!(p->second < len)) {
+        insert(p->first, len);
+        return;
+      }
+      insert(p->first, p->second);
+      len -= p->second;
+    }
+  }
+
+  /* Move the contents of m into another Map (use this instead of encoding
+   * the interval_set into a bufferlist and decoding that back into a Map).
+   * This set is left empty, so size() stays consistent with m. */
+  void move_into(Map& other) {
+    other = std::move(m);
+    clear();  // m is moved-from; reset it and _size together
+  }
+
+private:
+  // data
+  int64_t _size;
+  Map m;   // map start -> len
+};
+
+// declare traits explicitly because (1) it's templatized, and (2) we
+// want to include _nohead variants.  Every hook forwards to the member
+template<typename T, typename Map>
+struct denc_traits<interval_set<T,Map>> {
+  static constexpr bool supported = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool need_contiguous = denc_traits<T,Map>::need_contiguous;  // NOTE(review): Map is passed as denc_traits' second argument here; confirm denc_traits<Map> was not intended
+  static void bound_encode(const interval_set<T,Map>& v, size_t& p) {
+    v.bound_encode(p);
+  }
+  static void encode(const interval_set<T,Map>& v,
+		     bufferlist::contiguous_appender& p) {
+    v.encode(p);
+  }
+  static void decode(interval_set<T,Map>& v, bufferptr::const_iterator& p) {
+    v.decode(p);
+  }
+  template<typename U=T>
+    static typename std::enable_if<sizeof(U) && !need_contiguous>::type  // overload exists only when !need_contiguous
+    decode(interval_set<T,Map>& v, bufferlist::iterator& p) {
+    v.decode(p);
+  }
+  static void encode_nohead(const interval_set<T,Map>& v,
+			    bufferlist::contiguous_appender& p) {
+    v.encode_nohead(p);
+  }
+  static void decode_nohead(size_t n, interval_set<T,Map>& v,
+			    bufferptr::const_iterator& p) {
+    v.decode_nohead(n, p);  // n is narrowed to int by interval_set::decode_nohead
+  }
+};
+
+
+// Print s as "[start~len,start~len,...]".  An empty set prints "[]".
+// Intervals appear in the order the set's const_iterator yields them.
+template<class T, typename Map>
+inline std::ostream& operator<<(std::ostream& out, const interval_set<T,Map> &s) {
+  out << "[";
+  const char *sep = "";          // nothing before the first interval
+  auto i = s.begin();
+  while (i != s.end()) {
+    out << sep << i.get_start() << "~" << i.get_len();
+    sep = ",";
+    ++i;
+  }
+  return out << "]";
+}
+
+
+#endif
diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h
new file mode 100644
index 00000000..e8bed829
--- /dev/null
+++ b/src/include/ipaddr.h
@@ -0,0 +1,48 @@
+#ifndef CEPH_IPADDR_H
+#define CEPH_IPADDR_H
+
+class entity_addr_t;
+
+/*
+ * Find an IP address that is in the wanted subnet.
+ *
+ * If there are multiple matches, the first one is returned; this order
+ * is system-dependent and should not be relied on.
+ */
+const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
+					const struct sockaddr *net,
+					unsigned int prefix_len,
+					int numa_node = -1);
+
+/*
+ * Validate and parse IPv4 or IPv6 network
+ *
+ * Given a network (e.g. "192.168.0.0/24") and pointers to a sockaddr_storage
+ * struct and an unsigned int:
+ *
+ * if the network string is valid, return true and populate sockaddr_storage
+ * and prefix_len;
+ *
+ * if the network string is invalid, return false.
+ */
+bool parse_network(const char *s,
+		   struct sockaddr_storage *network,
+		   unsigned int *prefix_len);
+bool parse_network(const char *s,
+		   entity_addr_t *network,
+		   unsigned int *prefix_len);
+
+void netmask_ipv6(const struct in6_addr *addr,
+		  unsigned int prefix_len,
+		  struct in6_addr *out);
+
+void netmask_ipv4(const struct in_addr *addr,
+		  unsigned int prefix_len,
+		  struct in_addr *out);
+
+bool network_contains(
+	const struct entity_addr_t& network,
+	unsigned int prefix_len,
+	const struct entity_addr_t& addr);
+
+#endif
diff --git a/src/include/krbd.h b/src/include/krbd.h
new file mode 100644
index 00000000..977d45fe
--- /dev/null
+++ b/src/include/krbd.h
@@ -0,0 +1,97 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_KRBD_H
+#define CEPH_KRBD_H
+
+#include "rados/librados.h"
+
+/*
+ * Don't wait for udev add uevents in krbd_map() and udev remove
+ * uevents in krbd_unmap*().  Instead, make do with the respective
+ * kernel uevents and return as soon as they are received.
+ *
+ * systemd-udevd sends out udev uevents after it finishes processing
+ * the respective kernel uevents, which mostly boils down to executing
+ * all matching udev rules.  With this flag set, on return from
+ * krbd_map() systemd-udevd may still be poking at the device: it
+ * may still be open with tools such as blkid and various ioctls to
+ * be run against it, none of the persistent symlinks to the device
+ * node may be there, etc.  udev used to be responsible for creating
+ * the device node as well, but that has been handled by devtmpfs in
+ * the kernel for many years now, so the device node (as returned
+ * through @pdevnode) is guaranteed to be there.
+ *
+ * If set, krbd_map() and krbd_unmap*() can be invoked from any
+ * network namespace that is owned by the initial user namespace
+ * (which is a formality because things like loading kernel modules
+ * and creating block devices are not namespaced and require global
+ * privileges, i.e. capabilities in the initial user namespace).
+ * Otherwise, krbd_map() and krbd_unmap*() must be invoked from
+ * the initial network namespace.
+ *
+ * If set, krbd_unmap*() doesn't attempt to settle the udev queue
+ * before retrying unmap for the last time.  Some EBUSY errors due
+ * to systemd-udevd poking at the device at the time krbd_unmap*()
+ * is invoked that are otherwise covered by the retry logic may be
+ * returned.
+ */
+#define KRBD_CTX_F_NOUDEV       (1U << 0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct krbd_ctx;
+
+int krbd_create_from_context(rados_config_t cct, uint32_t flags,
+                             struct krbd_ctx **pctx);
+void krbd_destroy(struct krbd_ctx *ctx);
+
+int krbd_map(struct krbd_ctx *ctx,
+             const char *pool_name,
+             const char *nspace_name,
+             const char *image_name,
+             const char *snap_name,
+             const char *options,
+             char **pdevnode);
+int krbd_is_mapped(struct krbd_ctx *ctx,
+                   const char *pool_name,
+                   const char *nspace_name,
+                   const char *image_name,
+                   const char *snap_name,
+                   char **pdevnode);
+
+int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
+               const char *options);
+int krbd_unmap_by_spec(struct krbd_ctx *ctx,
+                       const char *pool_name,
+                       const char *nspace_name,
+                       const char *image_name,
+                       const char *snap_name,
+                       const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+namespace ceph {
+  class Formatter;
+}
+
+int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f);
+
+#endif /* __cplusplus */
+
+#endif /* CEPH_KRBD_H */
diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h
new file mode 100644
index 00000000..36046b5c
--- /dev/null
+++ b/src/include/linux_fiemap.h
@@ -0,0 +1,73 @@
+/*
+ * FS_IOC_FIEMAP ioctl infrastructure.
+ *
+ * Some portions copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Authors: Mark Fasheh <mfasheh@suse.com>
+ *          Kalpak Shah <kalpak.shah@sun.com>
+ *          Andreas Dilger <adilger@sun.com>
+ */
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)	/* predefined macro ends in two underscores */
+#include <sys/types.h>
+#endif
+
+#include "include/int_types.h"
+
+struct fiemap_extent {
+	__u64 fe_logical;  /* logical offset in bytes for the start of
+			    * the extent from the beginning of the file */
+	__u64 fe_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 fe_length;   /* length in bytes for this extent */
+	__u64 fe_reserved64[2];
+	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	__u32 fe_reserved[3];
+};
+
+struct fiemap {
+	__u64 fm_start;		/* logical offset (inclusive) at
+				 * which to start mapping (in) */
+	__u64 fm_length;	/* logical length of mapping which
+				 * userspace wants (in) */
+	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
+	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 fm_extent_count;  /* size of fm_extents array (in) */
+	__u32 fm_reserved;
+	struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET	(~0ULL)
+
+#define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_FLAGS_COMPAT	(FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+#define FIEMAP_EXTENT_LAST		0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN		0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC		0x00000004 /* Location still pending.
+						    * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED		0x00000008 /* Data can not be read
+						    * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED	0x00000080 /* Data is encrypted by fs.
+						    * Sets EXTENT_NO_BYPASS. */
+#define FIEMAP_EXTENT_NOT_ALIGNED	0x00000100 /* Extent offsets may not be
+						    * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE	0x00000200 /* Data mixed with metadata.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL		0x00000400 /* Multiple files in block.
+						    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN		0x00000800 /* Space allocated, but
+						    * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED		0x00001000 /* File does not natively
+						    * support extents. Result
+						    * merged for efficiency. */
+#define FIEMAP_EXTENT_SHARED		0x00002000 /* Space shared with other
+						    * files. */
+
+#endif /* _LINUX_FIEMAP_H */
diff --git a/src/include/lru.h b/src/include/lru.h
new file mode 100644
index 00000000..1e30cdfe
--- /dev/null
+++ b/src/include/lru.h
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef CEPH_LRU_H
+#define CEPH_LRU_H
+
+#include <math.h>
+#include <stdint.h>
+
+#include "common/config.h"
+#include "xlist.h"
+
+class LRUObject {
+public:
+  LRUObject() : lru(), lru_link(this), lru_pinned(false) { }  // starts unowned and unpinned
+  ~LRUObject();  // removes itself from the owning LRU, if any
+
+  // pin/unpin item in cache; pinned items are passed over by LRU::lru_get_next_expire()
+  void lru_pin();
+  void lru_unpin();
+  bool lru_is_expireable() const { return !lru_pinned; }
+
+  friend class LRU;
+private:
+  class LRU *lru;  // LRU this object currently belongs to; null when in none
+  xlist<LRUObject *>::item lru_link;  // list hook; sits on one of the LRU's top/bottom/pintail lists
+  bool lru_pinned;
+};
+
+class LRU {
+public:
+  LRU() : num_pinned(0), midpoint(0.6) {}
+
+  uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); }
+  uint64_t lru_get_top() const { return top.size(); }
+  uint64_t lru_get_bot() const{ return bottom.size(); }
+  uint64_t lru_get_pintail() const { return pintail.size(); }
+  uint64_t lru_get_num_pinned() const { return num_pinned; }
+
+  void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); }
+  
+  void lru_clear() {
+    while (!top.empty()) {
+      lru_remove(top.front());
+    }
+    while (!bottom.empty()) {
+      lru_remove(bottom.front());
+    }
+    while (!pintail.empty()) {
+      lru_remove(pintail.front());
+    }
+    ceph_assert(num_pinned == 0);
+  }
+
+  // insert at top of lru
+  void lru_insert_top(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    top.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // insert at mid point in lru
+  void lru_insert_mid(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    bottom.push_front(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // insert at bottom of lru
+  void lru_insert_bot(LRUObject *o) {
+    ceph_assert(!o->lru);
+    o->lru = this;
+    bottom.push_back(&o->lru_link);
+    if (o->lru_pinned) num_pinned++;
+    adjust();
+  }
+
+  // remove an item
+  LRUObject *lru_remove(LRUObject *o) {
+    if (!o->lru) return o;
+    auto list = o->lru_link.get_list();
+    ceph_assert(list == &top || list == &bottom || list == &pintail);
+    o->lru_link.remove_myself();
+    if (o->lru_pinned) num_pinned--;
+    o->lru = nullptr;
+    adjust();
+    return o;
+  }
+
+  // touch item -- move to head of lru
+  bool lru_touch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_top(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      top.push_front(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  // touch item -- move to midpoint (unless already higher)
+  bool lru_midtouch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_mid(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      if (list == &top) return false;
+      bottom.push_front(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  // touch item -- move to bottom
+  bool lru_bottouch(LRUObject *o) {
+    if (!o->lru) {
+      lru_insert_bot(o);
+    } else {
+      ceph_assert(o->lru == this);
+      auto list = o->lru_link.get_list();
+      ceph_assert(list == &top || list == &bottom || list == &pintail);
+      bottom.push_back(&o->lru_link);
+      adjust();
+    }
+    return true;
+  }
+
+  void lru_touch_entire_pintail() {
+    // promote entire pintail to the top lru
+    while (pintail.size() > 0) {
+      top.push_back(&pintail.front()->lru_link);
+      adjust();
+    }
+  }
+
+  // expire -- find the next expirable (unpinned) item.  The item is
+  // returned still linked into the LRU; nullptr is returned if there is
+  // none.  Pinned items met along the way are parked on the pintail.
+  LRUObject *lru_get_next_expire() {
+    adjust();
+    // look through tail of bot
+    while (bottom.size()) {
+      LRUObject *p = bottom.back();
+      if (!p->lru_pinned) return p;
+      // move to pintail
+      pintail.push_front(&p->lru_link);
+    }
+
+    // ok, try head then
+    while (top.size()) {
+      LRUObject *p = top.back();
+      if (!p->lru_pinned) return p;
+      // move to pintail
+      pintail.push_front(&p->lru_link);
+    }
+
+    // no luck!
+    return nullptr;
+  }
+  
+  // expire -- unlink and return the next expirable item, or nullptr if none
+  LRUObject *lru_expire() {
+    if (LRUObject *p = lru_get_next_expire())
+      return lru_remove(p);
+    return nullptr;
+  }
+
+  void lru_status() {
+    //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
+  }
+
+protected:
+  // adjust top/bot balance so top holds about `midpoint` of (total items - pinned count)
+  void adjust() {
+    uint64_t toplen = top.size();
+    uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned));  // truncated toward zero
+    /* move items from below midpoint (bottom) to top: move midpoint forward */
+    for (uint64_t i = toplen; i < topwant; i++) {
+      top.push_back(&bottom.front()->lru_link);  // head of bottom goes to the tail of top
+    }
+    /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
+    for (uint64_t i = toplen; i > topwant; i--) {
+      bottom.push_front(&top.back()->lru_link);  // tail of top goes to the head of bottom
+    }
+  }
+
+  uint64_t num_pinned;
+  double midpoint;
+
+  friend class LRUObject;
+private:
+  typedef xlist<LRUObject *> LRUList;
+  LRUList top, bottom, pintail;
+};
+
+// Detach from the owning LRU (if any) before the object goes away.
+inline LRUObject::~LRUObject() {
+  if (lru != nullptr)
+    lru->lru_remove(this);
+}
+
+// Mark pinned; bump the owning LRU's pinned count on an unpinned->pinned edge.
+inline void LRUObject::lru_pin() {
+  const bool newly_pinned = !lru_pinned;
+  lru_pinned = true;
+  if (lru && newly_pinned) ++lru->num_pinned;
+}
+
+// Clear the pin.  If owned by an LRU and currently pinned, drop its pinned
+// count and, when parked on the pintail, move the object back to the bottom.
+inline void LRUObject::lru_unpin() {
+  const bool counted = lru && lru_pinned;
+  if (counted) {
+    --lru->num_pinned;
+    const bool on_pintail = (lru_link.get_list() == &lru->pintail);
+    if (on_pintail) lru->lru_bottouch(this);
+  }
+  lru_pinned = false;
+}
+
+#endif
diff --git a/src/include/mempool.h b/src/include/mempool.h
new file mode 100644
index 00000000..9cee3825
--- /dev/null
+++ b/src/include/mempool.h
@@ -0,0 +1,547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef _CEPH_INCLUDE_MEMPOOL_H
+#define _CEPH_INCLUDE_MEMPOOL_H
+
+#include <cstddef>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <list>
+#include <mutex>
+#include <atomic>
+#include <typeinfo>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include <common/Formatter.h>
+#include "include/ceph_assert.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
+
+
+/*
+
+Memory Pools
+============
+
+A memory pool is a method for accounting the consumption of memory of
+a set of containers.
+
+Memory pools are statically declared (see pool_index_t).
+
+Each memory pool tracks the number of bytes and items it contains.
+
+Allocators can be declared and associated with a type so that they are
+tracked independently of the pool total.  This additional accounting
+is optional and only incurs an overhead if the debugging is enabled at
+runtime.  This allows developers to see what types are consuming the
+pool resources.
+
+
+Declaring
+---------
+
+Using memory pools is very easy.
+
+To create a new memory pool, simply add a new name into the list of
+memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER".  That's
+it.  :)
+
+For each memory pool that's created a C++ namespace is also
+automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER).
+That namespace contains a set of common STL containers that are predefined
+with the appropriate allocators.
+
+Thus for mempool "osd" we have automatically available to us:
+
+   mempool::osd::map
+   mempool::osd::multimap
+   mempool::osd::set
+   mempool::osd::multiset
+   mempool::osd::list
+   mempool::osd::vector
+   mempool::osd::unordered_map
+
+
+Putting objects in a mempool
+----------------------------
+
+In order to use a memory pool with a particular type, a few additional
+declarations are needed.
+
+For a class:
+
+  struct Foo {
+    MEMPOOL_CLASS_HELPERS();
+    ...
+  };
+
+Then, in an appropriate .cc file,
+
+  MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd);
+
+The second argument can generally be identical to the first, except
+when the type contains a nested scope.  For example, for
+BlueStore::Onode, we need to do
+
+  MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+                                bluestore_meta);
+
+(This is just because we need to name some static variables and we
+can't use :: in a variable name.)
+
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for the child class as well (as the base class is usually smaller than the
+child class).
+
+In order to use the STL containers, simply use the namespaced variant
+of the container type.  For example,
+
+  mempool::osd::vector<int> myvec;
+
+Introspection
+-------------
+
+The simplest way to interrogate the process is with
+
+  Formatter *f = ...
+  mempool::dump(f);
+
+This will dump information about *all* memory pools.  When debug mode
+is enabled, the runtime complexity of dump is O(num_shards *
+num_types).  When debug mode is disabled it is O(num_shards).
+
+You can also interrogate a specific pool programmatically with
+
+  size_t bytes = mempool::unittest_2::allocated_bytes();
+  size_t items = mempool::unittest_2::allocated_items();
+
+The runtime complexity is O(num_shards).
+
+Note that you cannot easily query per-type, primarily because debug
+mode is optional and you should not rely on that information being
+available.
+
+*/
+
+namespace mempool {
+
+// --------------------------------------------------------------
+// define memory pools
+
+#define DEFINE_MEMORY_POOLS_HELPER(f) \
+  f(bloom_filter)		      \
+  f(bluestore_alloc)		      \
+  f(bluestore_cache_data)	      \
+  f(bluestore_cache_onode)	      \
+  f(bluestore_cache_meta)	      \
+  f(bluestore_cache_other)	      \
+  f(bluestore_Buffer)		      \
+  f(bluestore_Extent)		      \
+  f(bluestore_Blob)		      \
+  f(bluestore_SharedBlob)	      \
+  f(bluestore_inline_bl)	      \
+  f(bluestore_fsck)		      \
+  f(bluestore_txc)		      \
+  f(bluestore_writing_deferred)      \
+  f(bluestore_writing)		      \
+  f(bluefs)			      \
+  f(bluefs_file_reader)              \
+  f(bluefs_file_writer)              \
+  f(buffer_anon)		      \
+  f(buffer_meta)		      \
+  f(osd)			      \
+  f(osd_mapbl)			      \
+  f(osd_pglog)			      \
+  f(osdmap)			      \
+  f(osdmap_mapping)		      \
+  f(pgmap)			      \
+  f(mds_co)			      \
+  f(unittest_1)			      \
+  f(unittest_2)
+
+
+// give them integer ids
+#define P(x) mempool_##x,
+enum pool_index_t {
+  DEFINE_MEMORY_POOLS_HELPER(P)
+  num_pools        // Must be last.
+};
+#undef P
+
+extern bool debug_mode;
+extern void set_debug_mode(bool d);
+
+// --------------------------------------------------------------
+class pool_t;
+
+// we shard pool stats across many shard_t's to reduce the amount
+// of cacheline ping pong.
+enum {
+  num_shard_bits = 5
+};
+enum {
+  num_shards = 1 << num_shard_bits
+};
+
+// align shard to a cacheline
+struct shard_t {
+  std::atomic<size_t> bytes = {0};
+  std::atomic<size_t> items = {0};
+  char __padding[128 - sizeof(std::atomic<size_t>)*2];
+} __attribute__ ((aligned (128)));
+
+static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized");
+
+struct stats_t {
+  ssize_t items = 0;
+  ssize_t bytes = 0;
+  void dump(ceph::Formatter *f) const {
+    f->dump_int("items", items);
+    f->dump_int("bytes", bytes);
+  }
+
+  stats_t& operator+=(const stats_t& o) {
+    items += o.items;
+    bytes += o.bytes;
+    return *this;
+  }
+};
+
+pool_t& get_pool(pool_index_t ix);
+const char *get_pool_name(pool_index_t ix);
+
+struct type_t {
+  const char *type_name;
+  size_t item_size;
+  std::atomic<ssize_t> items = {0};  // signed
+};
+
+struct type_info_hash {
+  std::size_t operator()(const std::type_info& k) const {
+    return k.hash_code();
+  }
+};
+
+class pool_t {
+  shard_t shard[num_shards];
+
+  mutable std::mutex lock;  // only used for types list
+  std::unordered_map<const char *, type_t> type_map;
+
+public:
+  //
+  // How much this pool consumes. O(<num_shards>)
+  //
+  size_t allocated_bytes() const;
+  size_t allocated_items() const;
+
+  void adjust_count(ssize_t items, ssize_t bytes);
+
+  static size_t pick_a_shard_int() {
+    // Dirt cheap, see:
+    //   https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html
+    size_t me = (size_t)pthread_self();
+    size_t i = (me >> 12) & ((1 << num_shard_bits) - 1);
+    return i;
+  }
+
+  shard_t* pick_a_shard() {
+    size_t i = pick_a_shard_int();
+    return &shard[i];
+  }
+
+  type_t *get_type(const std::type_info& ti, size_t size) {
+    std::lock_guard<std::mutex> l(lock);
+    auto p = type_map.find(ti.name());
+    if (p != type_map.end()) {
+      return &p->second;
+    }
+    type_t &t = type_map[ti.name()];
+    t.type_name = ti.name();
+    t.item_size = size;
+    return &t;
+  }
+
+  // get pool stats.  by_type is not populated if !debug
+  void get_stats(stats_t *total,
+		 std::map<std::string, stats_t> *by_type) const;
+
+  void dump(ceph::Formatter *f, stats_t *ptotal=0) const;
+};
+
+void dump(ceph::Formatter *f);
+
+
+// STL allocator for use with containers.  All actual state
+// is stored in the static pool_allocator_base_t, which saves us from
+// passing the allocator to container constructors.
+
+template<pool_index_t pool_ix, typename T>
+class pool_allocator {
+  pool_t *pool;
+  type_t *type = nullptr;
+
+public:
+  typedef pool_allocator<pool_ix, T> allocator_type;
+  typedef T value_type;
+  typedef value_type *pointer;
+  typedef const value_type * const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  template<typename U> struct rebind {
+    typedef pool_allocator<pool_ix,U> other;
+  };
+
+  void init(bool force_register) {
+    pool = &get_pool(pool_ix);
+    if (debug_mode || force_register) {
+      type = pool->get_type(typeid(T), sizeof(T));
+    }
+  }
+
+  pool_allocator(bool force_register=false) {
+    init(force_register);
+  }
+  template<typename U>
+  pool_allocator(const pool_allocator<pool_ix,U>&) {
+    init(false);
+  }
+
+  // Allocate raw storage for n objects of T (hint p is ignored).  Memory is
+  // obtained first so a throwing new[] cannot leave the pool stats inflated.
+  T* allocate(size_t n, void *p = nullptr) {
+    const size_t total = sizeof(T) * n;
+    T* r = reinterpret_cast<T*>(new char[total]);
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes += total;
+    shard->items += n;
+    if (type) type->items += n;
+    return r;
+  }
+
+  void deallocate(T* p, size_t n) {
+    size_t total = sizeof(T) * n;
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes -= total;
+    shard->items -= n;
+    if (type) {
+      type->items -= n;
+    }
+    delete[] reinterpret_cast<char*>(p);
+  }
+
+  // Like allocate(), but the storage comes from posix_memalign() with the
+  // requested alignment (hint p is ignored).  Throws std::bad_alloc on failure.
+  T* allocate_aligned(size_t n, size_t align, void *p = nullptr) {
+    const size_t total = sizeof(T) * n;
+    void *ptr = nullptr;
+    if (::posix_memalign(&ptr, align, total) != 0)
+      throw std::bad_alloc();
+    // charge the pool only once the memory is in hand, so a failed
+    // allocation cannot leave the stats inflated
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes += total;
+    shard->items += n;
+    if (type) type->items += n;
+    return static_cast<T*>(ptr);
+  }
+
+  void deallocate_aligned(T* p, size_t n) {
+    size_t total = sizeof(T) * n;
+    shard_t *shard = pool->pick_a_shard();
+    shard->bytes -= total;
+    shard->items -= n;
+    if (type) {
+      type->items -= n;
+    }
+    ::free(p);
+  }
+
+  void destroy(T* p) {
+    p->~T();
+  }
+
+  template<class U>
+  void destroy(U *p) {
+    p->~U();
+  }
+
+  void construct(T* p, const T& val) {
+    ::new ((void *)p) T(val);
+  }
+
+  template<class U, class... Args> void construct(U* p,Args&&... args) {
+    ::new((void *)p) U(std::forward<Args>(args)...);
+  }
+
+  bool operator==(const pool_allocator&) const { return true; }
+  bool operator!=(const pool_allocator&) const { return false; }
+};
+
+
+// Namespace mempool
+
+#define P(x)								\
+  namespace x {								\
+    static const mempool::pool_index_t id = mempool::mempool_##x;	\
+    template<typename v>						\
+    using pool_allocator = mempool::pool_allocator<id,v>;		\
+                                                                        \
+    using string = std::basic_string<char,std::char_traits<char>,       \
+                                     pool_allocator<char>>;             \
+                                                                        \
+    template<typename k,typename v, typename cmp = std::less<k> >	\
+    using map = std::map<k, v, cmp,					\
+			 pool_allocator<std::pair<const k,v>>>;		\
+                                                                        \
+    template<typename k,typename v, typename cmp = std::less<k> >       \
+    using compact_map = compact_map<k, v, cmp,                          \
+			 pool_allocator<std::pair<const k,v>>>;         \
+                                                                        \
+    template<typename k,typename v, typename cmp = std::less<k> >       \
+    using compact_multimap = compact_multimap<k, v, cmp,                \
+			 pool_allocator<std::pair<const k,v>>>;         \
+                                                                        \
+    template<typename k, typename cmp = std::less<k> >                  \
+    using compact_set = compact_set<k, cmp, pool_allocator<k>>;         \
+                                                                        \
+    template<typename k,typename v, typename cmp = std::less<k> >	\
+    using multimap = std::multimap<k,v,cmp,				\
+				   pool_allocator<std::pair<const k,	\
+							    v>>>;	\
+                                                                        \
+    template<typename k, typename cmp = std::less<k> >			\
+    using set = std::set<k,cmp,pool_allocator<k>>;			\
+                                                                        \
+    template<typename k, typename cmp = std::less<k> >			\
+    using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
+									\
+    template<typename k, typename v, typename cmp = std::less<k> >	\
+    using flat_map = boost::container::flat_map<k,v,cmp,		\
+						pool_allocator<std::pair<k,v>>>; \
+                                                                        \
+    template<typename v>						\
+    using list = std::list<v,pool_allocator<v>>;			\
+                                                                        \
+    template<typename v>						\
+    using vector = std::vector<v,pool_allocator<v>>;			\
+                                                                        \
+    template<typename k, typename v,					\
+	     typename h=std::hash<k>,					\
+	     typename eq = std::equal_to<k>>				\
+    using unordered_map =						\
+      std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
+                                                                        \
+    inline size_t allocated_bytes() {					\
+      return mempool::get_pool(id).allocated_bytes();			\
+    }									\
+    inline size_t allocated_items() {					\
+      return mempool::get_pool(id).allocated_items();			\
+    }									\
+  };
+
+DEFINE_MEMORY_POOLS_HELPER(P)
+
+#undef P
+
+};
+
+// the elements allocated by mempool are in the same memory space as the ones
+// allocated by the default allocator, so compare them in an efficient way:
+// libstdc++'s std::equal is specialized to use memcmp if T is an integer or
+// pointer type. this is good enough for our use case. use
+// std::is_trivially_copyable<T> to expand the support to more types if
+// necessary.
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, std::allocator<T>>& lhs,
+		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{
+  return (lhs.size() == rhs.size() &&
+	  std::equal(lhs.begin(), lhs.end(), rhs.begin()));
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, std::allocator<T>>& lhs,
+		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
+{
+  return !(lhs == rhs);
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+		const std::vector<T, std::allocator<T>>& rhs)
+{
+  return rhs == lhs;
+}
+
+template<typename T, mempool::pool_index_t pool_index>
+bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
+		const std::vector<T, std::allocator<T>>& rhs)
+{
+  return !(lhs == rhs);
+}
+
+// Use this for any type that is contained by a container (unless it
+// is a class you defined; see below).
+#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool)			\
+  namespace mempool {							\
+    namespace pool {							\
+      extern pool_allocator<obj> alloc_##factoryname;			\
+    }									\
+  }
+
+#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool)			\
+  namespace mempool {							\
+    namespace pool {							\
+      pool_allocator<obj> alloc_##factoryname = {true};			\
+    }									\
+  }
+
+// Use this for each class that belongs to a mempool.  For example,
+//
+//   class T {
+//     MEMPOOL_CLASS_HELPERS();
+//     ...
+//   };
+//
+#define MEMPOOL_CLASS_HELPERS()						\
+  void *operator new(size_t size);					\
+  void *operator new[](size_t size) noexcept {				\
+    ceph_abort_msg("no array new");					\
+    return nullptr; }							\
+  void  operator delete(void *);					\
+  void  operator delete[](void *) { ceph_abort_msg("no array delete"); }
+
+
+// Use this in some particular .cc file to match each class with a
+// MEMPOOL_CLASS_HELPERS().
+#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool)		\
+  MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool)			\
+  void *obj::operator new(size_t size) {				\
+    return mempool::pool::alloc_##factoryname.allocate(1); \
+  }									\
+  void obj::operator delete(void *p)  {					\
+    return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1);	\
+  }
+
+#endif
diff --git a/src/include/msgr.h b/src/include/msgr.h
new file mode 100644
index 00000000..f7b2a078
--- /dev/null
+++ b/src/include/msgr.h
@@ -0,0 +1,254 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+#ifndef __KERNEL__
+#include <sys/socket.h> // for struct sockaddr_storage
+#endif
+
+#include "include/int_types.h"
+
+/* See comment in ceph_fs.h.  */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT_LEGACY    6789  /* legacy default monitor port */
+#define CEPH_MON_PORT_IANA      3300  /* IANA monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST  6789
+
+/*
+ * tcp connection banner.  it includes a protocol version; adjust it
+ * whenever the wire protocol changes.  try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+
+
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2_PREFIX "ceph v2\n"
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+/* defines CEPH_MSGR2_FEATURE_<name> (1 << bit) and CEPH_MSGR2_FEATUREMASK_<name> (that bit | incarnation value) */
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name)               \
+	const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
+	const static uint64_t CEPH_MSGR2_FEATUREMASK_##name =            \
+			(1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);
+/* true iff every bit of the feature's FEATUREMASK is set in x */
+#define HAVE_MSGR2_FEATURE(x, name) \
+	(((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1)   // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES  (0ull)
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns the signed distance a - b: <0, 0 or >0 (not only -1, 0 or 1).
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+       return (__s32)(a - b); /* subtract as unsigned (wraps), then reinterpret: no signed overflow */
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+	__u8 type;      /* CEPH_ENTITY_TYPE_* */
+	__le64 num;     /* presumably the numeric part of the name (the 0 in 'mds0') -- confirm */
+} __attribute__ ((packed)); /* packed: no padding between the 1-byte type and num */
+
+#define CEPH_ENTITY_TYPE_MON    0x01
+#define CEPH_ENTITY_TYPE_MDS    0x02
+#define CEPH_ENTITY_TYPE_OSD    0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_MGR    0x10
+#define CEPH_ENTITY_TYPE_AUTH   0x20
+
+#define CEPH_ENTITY_TYPE_ANY    0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+	__le32 type;   /* address type; no values for it are defined in this header */
+	__le32 nonce;  /* unique id for process (e.g. pid) */
+	struct sockaddr_storage in_addr; /* the socket address itself (needs <sys/socket.h> outside the kernel) */
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+	struct ceph_entity_name name;
+	struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
+					  incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
+					  with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
+					  with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
+#define CEPH_MSGR_TAG_MSG           7  /* message */
+#define CEPH_MSGR_TAG_ACK           8  /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
+#define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2     14
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15  /* keepalive reply */
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16  /* ceph v2 doing server challenge */
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+	__le64 features;     /* supported feature bits */
+	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
+	__le32 global_seq;   /* count connections initiated by this host */
+	__le32 connect_seq;  /* count connections initiated in this session */
+	__le32 protocol_version;
+	__le32 authorizer_protocol;
+	__le32 authorizer_len;
+	__u8  flags;         /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+	__u8 tag;            /* presumably one of CEPH_MSGR_TAG_* -- confirm */
+	__le64 features;     /* feature bits for this session */
+	__le32 global_seq;   /* NOTE(review): looks like the counterpart of ceph_msg_connect.global_seq -- confirm */
+	__le32 connect_seq;  /* NOTE(review): looks like the counterpart of ceph_msg_connect.connect_seq -- confirm */
+	__le32 protocol_version;
+	__le32 authorizer_len; /* presumably the length of authorizer data that follows -- confirm */
+	__u8 flags;          /* presumably CEPH_MSG_CONNECT_*, as in ceph_msg_connect -- confirm */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_inst src, orig_src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_name src;
+
+	/* oldest code we think can decode this.  unknown if zero. */
+	__le16 compat_version;
+	__le16 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header2 {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 data_pre_padding_len;
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	__le64 ack_seq;
+	__u8 flags;
+	/* oldest code we think can decode this.  unknown if zero. */
+	__le16 compat_version;
+	__le16 reserved;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW     64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH    196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ * ceph_msg_footer_old does not support digital signatures on messages PLR
+ */
+
+struct ceph_msg_footer_old {
+	__le32 front_crc, middle_crc, data_crc; /* presumably crcs of the front/middle/data payloads -- confirm */
+	__u8 flags;                             /* presumably CEPH_MSG_FOOTER_* bits -- confirm */
+} __attribute__ ((packed));
+
+struct ceph_msg_footer {
+	__le32 front_crc, middle_crc, data_crc; /* presumably crcs of the front/middle/data payloads -- confirm */
+	// sig holds the 64 bits of the digital signature for the message PLR
+	__le64  sig;   /* the only field ceph_msg_footer_old lacks */
+	__u8 flags;    /* presumably CEPH_MSG_FOOTER_* bits -- confirm */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED	  (1<<2)   /* msg was signed */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/object.h b/src/include/object.h
new file mode 100644
index 00000000..99ca58f9
--- /dev/null
+++ b/src/include/object.h
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_OBJECT_H
+#define CEPH_OBJECT_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <iosfwd>
+#include <iomanip>
+
+#include "include/rados.h"
+#include "include/unordered_map.h"
+
+#include "hash.h"
+#include "encoding.h"
+#include "ceph_hash.h"
+#include "cmp.h"
+
+using namespace std;
+
+struct object_t {
+  string name;  // the only state: construction, swap, clear and encoding all act on it
+
+  object_t() {}  // leaves name empty
+  // cppcheck-suppress noExplicitConstructor
+  object_t(const char *s) : name(s) {}  // implicit conversion is intended (see suppression)
+  // cppcheck-suppress noExplicitConstructor
+  object_t(const string& s) : name(s) {}  // implicit conversion is intended (see suppression)
+
+  void swap(object_t& o) {
+    name.swap(o.name);  // exchanges the two names
+  }
+  void clear() {
+    name.clear();  // back to the empty name
+  }
+  
+  void encode(bufferlist &bl) const {
+    using ceph::encode;  // block-scope using: the member named encode would otherwise hide ceph::encode
+    encode(name, bl);    // the encoded form is just the name
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    using ceph::decode;  // same reason as in encode()
+    decode(name, bl);    // reads the name back from the iterator
+  }
+};
+WRITE_CLASS_ENCODER(object_t)
+
+inline bool operator==(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) == 0;  // all six relations are those of the name strings
+}
+inline bool operator!=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) != 0;
+}
+inline bool operator>(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) > 0;
+}
+inline bool operator<(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) < 0;
+}
+inline bool operator>=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) >= 0;
+}
+inline bool operator<=(const object_t& l, const object_t& r) {
+  return l.name.compare(r.name) <= 0;
+}
+inline ostream& operator<<(ostream& out, const object_t& o) {
+  return (out << o.name);  // prints the bare name, nothing else
+}
+
+namespace std {
+  // Hashes an object_t by passing its name bytes and length to ceph_str_hash_linux.
+  template<> struct hash<object_t> {
+    size_t operator()(const object_t& r) const {
+      const string& n = r.name;
+      return ceph_str_hash_linux(n.data(), n.size());
+    }
+  };
+} // namespace std
+
+
+struct file_object_t {
+  uint64_t ino, bno;
+  mutable char buf[34];  // cached "<ino hex>.<bno hex>" text: up to 16 + 1 + 16 chars plus NUL
+  // buf[0] == 0 means "not formatted yet"; later changes to ino/bno do not refresh the cache.
+  file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) {
+    buf[0] = 0;
+  }
+  // Formats the name on first use and returns the cached buffer afterwards.
+  const char *c_str() const {
+    if (!buf[0])
+      snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)bno);
+    return buf;
+  }
+  // const so that const file_object_t values convert too; it only touches the mutable cache.
+  operator object_t() const {
+    return object_t(c_str());
+  }
+};
+
+
+// ---------------------------
+// snaps
+
+struct snapid_t {
+  uint64_t val;  // raw 64-bit snapshot id
+  // cppcheck-suppress noExplicitConstructor
+  snapid_t(uint64_t v=0) : val(v) {}
+  snapid_t operator+=(snapid_t o) { val = val + o.val; return *this; }  // returns a copy, not a reference
+  snapid_t operator++() { val += 1; return *this; }                     // pre-increment; returns a copy
+  operator uint64_t() const { return val; }                             // implicit read access to val
+};
+
+inline void encode(snapid_t i, bufferlist &bl) { encode(i.val, bl); }  // a snapid_t is encoded as its uint64_t val
+inline void decode(snapid_t &i, bufferlist::const_iterator &p) { decode(i.val, p); }  // decodes straight into i.val
+
+template<>
+struct denc_traits<snapid_t> {  // denc hooks for snapid_t: all three functions forward to denc() on o.val
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;
+  static constexpr bool need_contiguous = true;
+  static void bound_encode(const snapid_t& o, size_t& p) {  // p is a size_t here, not a buffer position
+    denc(o.val, p);
+  }
+  static void encode(const snapid_t &o, buffer::list::contiguous_appender& p) {  // appender overload
+    denc(o.val, p);
+  }
+  static void decode(snapid_t& o, buffer::ptr::const_iterator &p) {  // iterator overload; fills o.val
+    denc(o.val, p);
+  }
+};
+
+inline ostream& operator<<(ostream& out, const snapid_t& s) {
+  // The two reserved ids print as words; every other id prints as bare hex.
+  if (s.val == CEPH_NOSNAP)
+    return out << "head";
+  if (s.val == CEPH_SNAPDIR)
+    return out << "snapdir";
+  return out << hex << s.val << dec;
+}
+
+
+struct sobject_t {
+  object_t oid;   // the object
+  snapid_t snap;  // the snapshot id paired with it
+
+  sobject_t() : snap(0) {}
+  sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {}
+
+  void swap(sobject_t& o) {
+    oid.swap(o.oid);
+    snapid_t t = snap;  // three-step exchange of the snap ids through a temporary
+    snap = o.snap;
+    o.snap = t;
+  }
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;  // block-scope using: the member named encode would otherwise hide ceph::encode
+    encode(oid, bl);     // order on the wire: oid first, then snap
+    encode(snap, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;  // same reason as in encode()
+    decode(oid, bl);     // same field order as encode()
+    decode(snap, bl);
+  }
+};
+WRITE_CLASS_ENCODER(sobject_t)
+
+inline bool operator==(const sobject_t &l, const sobject_t &r) {
+  return !(l.oid != r.oid || l.snap != r.snap);  // equal only when both members match
+}
+inline bool operator!=(const sobject_t &l, const sobject_t &r) {
+  return !(l.oid == r.oid && l.snap == r.snap);
+}
+inline bool operator>(const sobject_t &l, const sobject_t &r) {
+  return (l.oid == r.oid) ? (l.snap > r.snap) : (l.oid > r.oid);  // oid decides; snap breaks ties
+}
+inline bool operator<(const sobject_t &l, const sobject_t &r) {
+  return (l.oid == r.oid) ? (l.snap < r.snap) : (l.oid < r.oid);
+}
+inline bool operator>=(const sobject_t &l, const sobject_t &r) {
+  return (l.oid == r.oid) ? (l.snap >= r.snap) : (l.oid > r.oid);
+}
+inline bool operator<=(const sobject_t &l, const sobject_t &r) {
+  return (l.oid == r.oid) ? (l.snap <= r.snap) : (l.oid < r.oid);
+}
+inline ostream& operator<<(ostream& out, const sobject_t &o) {
+  return (out << o.oid << "/" << o.snap);  // "<oid>/<snap>", each part using its own operator<<
+}
+namespace std {
+  template<> struct hash<sobject_t> {
+    size_t operator()(const sobject_t &r) const {
+      static hash<object_t> H;      // hasher for the oid
+      static rjhash<uint64_t> I;    // hasher for the snap; presumably fed through snapid_t's uint64_t conversion
+      return H(r.oid) ^ I(r.snap);  // the two hashes are combined with XOR
+    }
+  };
+} // namespace std
+
+#endif
diff --git a/src/include/on_exit.h b/src/include/on_exit.h
new file mode 100644
index 00000000..c412ab33
--- /dev/null
+++ b/src/include/on_exit.h
@@ -0,0 +1,49 @@
+#ifndef CEPH_ON_EXIT_H
+#define CEPH_ON_EXIT_H
+
+#include <pthread.h>
+#include <vector>
+
+#include "include/ceph_assert.h"
+/*
+ * Create a static instance at the file level to get callbacks called when the
+ * process exits via main() or exit().
+ */
+
+class OnExitManager {
+  public:
+    typedef void (*callback_t)(void *arg);
+
+    OnExitManager() {
+      int ret = pthread_mutex_init(&lock_, NULL);
+      ceph_assert(ret == 0);
+    }
+    // Invokes every registered callback in registration order, then destroys the mutex.
+    ~OnExitManager() {
+      pthread_mutex_lock(&lock_);  // held across the callbacks: they must not call add_callback()
+      for (std::vector<struct cb>::iterator it = funcs_.begin(); it != funcs_.end(); ++it) {
+        it->func(it->arg);
+      }
+      funcs_.clear();
+      pthread_mutex_unlock(&lock_);
+      pthread_mutex_destroy(&lock_);  // pairs with pthread_mutex_init() in the constructor
+    }
+    // Registers func(arg) to be invoked when this manager is destroyed.
+    void add_callback(callback_t func, void *arg) {
+      pthread_mutex_lock(&lock_);
+      struct cb callback = { func, arg };
+      funcs_.push_back(callback);
+      pthread_mutex_unlock(&lock_);
+    }
+
+  private:
+    struct cb {
+      callback_t func;  // function to invoke
+      void *arg;        // passed to func unchanged
+    };
+
+    std::vector<struct cb> funcs_;  // callbacks in registration order
+    pthread_mutex_t lock_;          // guards funcs_
+};
+
+#endif
diff --git a/src/include/page.h b/src/include/page.h
new file mode 100644
index 00000000..db6e2058
--- /dev/null
+++ b/src/include/page.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_PAGE_H
+#define CEPH_PAGE_H
+
+namespace ceph {
+  // these are in common/page.cc
+  extern unsigned _page_size;       // exposed through the CEPH_PAGE_SIZE macro
+  extern unsigned long _page_mask;  // exposed through the CEPH_PAGE_MASK macro
+  extern unsigned _page_shift;      // exposed through the CEPH_PAGE_SHIFT macro
+}
+
+#endif
+
+
+#define CEPH_PAGE_SIZE ceph::_page_size
+#define CEPH_PAGE_MASK ceph::_page_mask
+#define CEPH_PAGE_SHIFT ceph::_page_shift
+
+
diff --git a/src/include/rados.h b/src/include/rados.h
new file mode 100644
index 00000000..bbcf0867
--- /dev/null
+++ b/src/include/rados.h
@@ -0,0 +1,681 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include "msgr.h"
+
+/* See comment in ceph_fs.h.  */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * fs id: 16 raw bytes
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{	/* bytewise comparison: <0, 0 or >0, exactly as memcmp reports it */
+	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+	__le32 tv_sec;   /* presumably whole seconds -- confirm; note the field is only 32 bits wide */
+	__le32 tv_nsec;  /* presumably nanoseconds within the second -- confirm */
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg pool types
+ *
+ * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values.  They are
+ * duplicated here only for CrushCompiler's benefit.
+ */
+#define CEPH_PG_TYPE_REPLICATED 1
+/* #define CEPH_PG_TYPE_RAID4   2   never implemented */
+#define CEPH_PG_TYPE_ERASURE 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	const int masked = x & bmask;	/* x reduced with the full mask */
+	if (masked >= b)		/* outside the b bins: retry with the half-size mask */
+		return x & (bmask >> 1);
+	return masked;
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS       (1<<0)
+#define CEPH_OSD_UP           (1<<1)
+#define CEPH_OSD_AUTOOUT      (1<<2)  /* osd was automatically marked out */
+#define CEPH_OSD_NEW          (1<<3)  /* osd is new, never marked in */
+#define CEPH_OSD_FULL         (1<<4)  /* osd is at or above full threshold */
+#define CEPH_OSD_NEARFULL     (1<<5)  /* osd is at or above nearfull threshold */
+#define CEPH_OSD_BACKFILLFULL (1<<6)  /* osd is at or above backfillfull threshold */
+#define CEPH_OSD_DESTROYED    (1<<7)  /* osd has been destroyed */
+#define CEPH_OSD_NOUP         (1<<8)  /* osd can not be marked up */
+#define CEPH_OSD_NODOWN       (1<<9)  /* osd can not be marked down */
+#define CEPH_OSD_NOIN         (1<<10) /* osd can not be marked in */
+#define CEPH_OSD_NOOUT        (1<<11) /* osd can not be marked out */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL         (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL             (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD          (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR          (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC         (1<<4)  /* pause recovery */
+#define CEPH_OSDMAP_NOUP             (1<<5)  /* block osd boot */
+#define CEPH_OSDMAP_NODOWN           (1<<6)  /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT            (1<<7)  /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN             (1<<8)  /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL       (1<<9)  /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER        (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB          (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB     (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT      (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE      (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE      (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL    (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN   (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS  (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_NOSNAPTRIM       (1<<21) /* disable snap trimming */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT  (1<<22) /* put a hard limit on pg log length */
+
+/* these are hidden in 'ceph status' view */
+#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL|	\
+				      CEPH_OSDMAP_REQUIRE_KRAKEN |	\
+				      CEPH_OSDMAP_REQUIRE_LUMINOUS |	\
+				      CEPH_OSDMAP_RECOVERY_DELETES |	\
+				      CEPH_OSDMAP_SORTBITWISE |		\
+				      CEPH_OSDMAP_PURGED_SNAPDIRS |     \
+                                      CEPH_OSDMAP_PGLOG_HARDLIMIT)
+#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL |	\
+					  CEPH_OSDMAP_REQUIRE_KRAKEN |	\
+					  CEPH_OSDMAP_REQUIRE_LUMINOUS)
+
+/*
+ * major ceph release numbers
+ */
+#define CEPH_RELEASE_ARGONAUT    1
+#define CEPH_RELEASE_BOBTAIL     2
+#define CEPH_RELEASE_CUTTLEFISH  3
+#define CEPH_RELEASE_DUMPLING    4
+#define CEPH_RELEASE_EMPEROR     5
+#define CEPH_RELEASE_FIREFLY     6
+#define CEPH_RELEASE_GIANT       7
+#define CEPH_RELEASE_HAMMER      8
+#define CEPH_RELEASE_INFERNALIS  9
+#define CEPH_RELEASE_JEWEL      10
+#define CEPH_RELEASE_KRAKEN     11
+#define CEPH_RELEASE_LUMINOUS   12
+#define CEPH_RELEASE_MIMIC      13
+#define CEPH_RELEASE_NAUTILUS   14
+#define CEPH_RELEASE_MAX        15  /* highest + 1 */
+
+extern const char *ceph_release_name(int r);
+extern int ceph_release_from_name(const char *s);
+extern uint64_t ceph_release_features(int r);
+extern int ceph_release_from_features(uint64_t features);
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly.  Use the helpers
+ * defined below instead.  In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+//      LEAVE UNUSED           0x0600 used to be multiobject ops
+
+#define __CEPH_OSD_OP1(mode, nr) \
+	(CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+	(CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f)					    \
+	/** data **/							    \
+	/* read */							    \
+	f(READ,		__CEPH_OSD_OP(RD, DATA, 1),	"read")		    \
+	f(STAT,		__CEPH_OSD_OP(RD, DATA, 2),	"stat")		    \
+	f(MAPEXT,	__CEPH_OSD_OP(RD, DATA, 3),	"mapext")	    \
+	f(CHECKSUM,	__CEPH_OSD_OP(RD, DATA, 31),	"checksum")	    \
+									    \
+	/* fancy read */						    \
+	f(MASKTRUNC,	__CEPH_OSD_OP(RD, DATA, 4),	"masktrunc")	    \
+	f(SPARSE_READ,	__CEPH_OSD_OP(RD, DATA, 5),	"sparse-read")	    \
+									    \
+	f(NOTIFY,	__CEPH_OSD_OP(RD, DATA, 6),	"notify")	    \
+	f(NOTIFY_ACK,	__CEPH_OSD_OP(RD, DATA, 7),	"notify-ack")	    \
+									    \
+	/* versioning */						    \
+	f(ASSERT_VER,	__CEPH_OSD_OP(RD, DATA, 8),	"assert-version")   \
+									    \
+	f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9),	"list-watchers")    \
+									    \
+	f(LIST_SNAPS,	__CEPH_OSD_OP(RD, DATA, 10),	"list-snaps")	    \
+									    \
+	/* sync */							    \
+	f(SYNC_READ,	__CEPH_OSD_OP(RD, DATA, 11),	"sync_read")	    \
+									    \
+	/* write */							    \
+	f(WRITE,	__CEPH_OSD_OP(WR, DATA, 1),	"write")	    \
+	f(WRITEFULL,	__CEPH_OSD_OP(WR, DATA, 2),	"writefull")	    \
+	f(TRUNCATE,	__CEPH_OSD_OP(WR, DATA, 3),	"truncate")	    \
+	f(ZERO,		__CEPH_OSD_OP(WR, DATA, 4),	"zero")		    \
+	f(DELETE,	__CEPH_OSD_OP(WR, DATA, 5),	"delete")	    \
+									    \
+	/* fancy write */						    \
+	f(APPEND,	__CEPH_OSD_OP(WR, DATA, 6),	"append")	    \
+	f(STARTSYNC,	__CEPH_OSD_OP(WR, DATA, 7),	"startsync")	    \
+	f(SETTRUNC,	__CEPH_OSD_OP(WR, DATA, 8),	"settrunc")	    \
+	f(TRIMTRUNC,	__CEPH_OSD_OP(WR, DATA, 9),	"trimtrunc")	    \
+									    \
+	f(TMAPUP,	__CEPH_OSD_OP(RMW, DATA, 10),	"tmapup")	    \
+	f(TMAPPUT,	__CEPH_OSD_OP(WR, DATA, 11),	"tmapput")	    \
+	f(TMAPGET,	__CEPH_OSD_OP(RD, DATA, 12),	"tmapget")	    \
+									    \
+	f(CREATE,	__CEPH_OSD_OP(WR, DATA, 13),	"create")	    \
+	f(ROLLBACK,	__CEPH_OSD_OP(WR, DATA, 14),	"rollback")	    \
+									    \
+	f(WATCH,	__CEPH_OSD_OP(WR, DATA, 15),	"watch")	    \
+									    \
+	/* omap */							    \
+	f(OMAPGETKEYS,	__CEPH_OSD_OP(RD, DATA, 17),	"omap-get-keys")    \
+	f(OMAPGETVALS,	__CEPH_OSD_OP(RD, DATA, 18),	"omap-get-vals")    \
+	f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19),	"omap-get-header")  \
+	f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+	f(OMAPSETVALS,	__CEPH_OSD_OP(WR, DATA, 21),	"omap-set-vals")    \
+	f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22),	"omap-set-header")  \
+	f(OMAPCLEAR,	__CEPH_OSD_OP(WR, DATA, 23),	"omap-clear")	    \
+	f(OMAPRMKEYS,	__CEPH_OSD_OP(WR, DATA, 24),	"omap-rm-keys")	    \
+	f(OMAP_CMP,	__CEPH_OSD_OP(RD, DATA, 25),	"omap-cmp")	    \
+									    \
+	/* tiering */							    \
+	f(COPY_FROM,	__CEPH_OSD_OP(WR, DATA, 26),	"copy-from")	    \
+	/* was copy-get-classic */					\
+	f(UNDIRTY,	__CEPH_OSD_OP(WR, DATA, 28),	"undirty")	    \
+	f(ISDIRTY,	__CEPH_OSD_OP(RD, DATA, 29),	"isdirty")	    \
+	f(COPY_GET,	__CEPH_OSD_OP(RD, DATA, 30),	"copy-get")	    \
+	f(CACHE_FLUSH,	__CEPH_OSD_OP(CACHE, DATA, 31),	"cache-flush")	    \
+	f(CACHE_EVICT,	__CEPH_OSD_OP(CACHE, DATA, 32),	"cache-evict")	    \
+	f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+									    \
+	/* convert tmap to omap */					    \
+	f(TMAP2OMAP,	__CEPH_OSD_OP(RMW, DATA, 34),	"tmap2omap")	    \
+									    \
+	/* hints */							    \
+	f(SETALLOCHINT,	__CEPH_OSD_OP(WR, DATA, 35),	"set-alloc-hint")   \
+                                                                            \
+	/* cache pin/unpin */						    \
+	f(CACHE_PIN,	__CEPH_OSD_OP(WR, DATA, 36),	"cache-pin")        \
+	f(CACHE_UNPIN,	__CEPH_OSD_OP(WR, DATA, 37),	"cache-unpin")      \
+									    \
+	/* ESX/SCSI */							    \
+	f(WRITESAME,	__CEPH_OSD_OP(WR, DATA, 38),	"write-same")	    \
+	f(CMPEXT,	__CEPH_OSD_OP(RD, DATA, 32),	"cmpext")	    \
+									    \
+	/* Extensible */						    \
+	f(SET_REDIRECT,	__CEPH_OSD_OP(WR, DATA, 39),	"set-redirect")	    \
+	f(SET_CHUNK,	__CEPH_OSD_OP(WR, DATA, 40),	"set-chunk")	    \
+	f(TIER_PROMOTE,	__CEPH_OSD_OP(WR, DATA, 41),	"tier-promote")	    \
+	f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42),	"unset-manifest")   \
+									    \
+	/** attrs **/							    \
+	/* read */							    \
+	f(GETXATTR,	__CEPH_OSD_OP(RD, ATTR, 1),	"getxattr")	    \
+	f(GETXATTRS,	__CEPH_OSD_OP(RD, ATTR, 2),	"getxattrs")	    \
+	f(CMPXATTR,	__CEPH_OSD_OP(RD, ATTR, 3),	"cmpxattr")	    \
+									    \
+	/* write */							    \
+	f(SETXATTR,	__CEPH_OSD_OP(WR, ATTR, 1),	"setxattr")	    \
+	f(SETXATTRS,	__CEPH_OSD_OP(WR, ATTR, 2),	"setxattrs")	    \
+	f(RESETXATTRS,	__CEPH_OSD_OP(WR, ATTR, 3),	"resetxattrs")	    \
+	f(RMXATTR,	__CEPH_OSD_OP(WR, ATTR, 4),	"rmxattr")	    \
+									    \
+	/** subop **/							    \
+	f(PULL,		__CEPH_OSD_OP1(SUB, 1),		"pull")		    \
+	f(PUSH,		__CEPH_OSD_OP1(SUB, 2),		"push")		    \
+	f(BALANCEREADS,	__CEPH_OSD_OP1(SUB, 3),		"balance-reads")    \
+	f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4),	"unbalance-reads")  \
+	f(SCRUB,	__CEPH_OSD_OP1(SUB, 5),		"scrub")	    \
+	f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6),	"scrub-reserve")    \
+	f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7),	"scrub-unreserve")  \
+	/* 8 used to be scrub-stop */					\
+	f(SCRUB_MAP,	__CEPH_OSD_OP1(SUB, 9),		"scrub-map")	    \
+									    \
+	/** exec **/							    \
+	/* note: the RD bit here is wrong; see special-case below in helper */ \
+	f(CALL,		__CEPH_OSD_OP(RD, EXEC, 1),	"call")		    \
+									    \
+	/** pg **/							    \
+	f(PGLS,		__CEPH_OSD_OP(RD, PG, 1),	"pgls")		    \
+	f(PGLS_FILTER,	__CEPH_OSD_OP(RD, PG, 2),	"pgls-filter")	    \
+	f(PG_HITSET_LS,	__CEPH_OSD_OP(RD, PG, 3),	"pg-hitset-ls")	    \
+	f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4),	"pg-hitset-get")    \
+	f(PGNLS,	__CEPH_OSD_OP(RD, PG, 5),	"pgnls")	    \
+	f(PGNLS_FILTER,	__CEPH_OSD_OP(RD, PG, 6),	"pgnls-filter")     \
+	f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls")
+
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str)	CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+static inline int ceph_osd_op_type_data(int op)
+{	/* 1 when the bits selected by CEPH_OSD_OP_TYPE equal TYPE_DATA, else 0 */
+	return CEPH_OSD_OP_TYPE_DATA == (op & CEPH_OSD_OP_TYPE);
+}
+static inline int ceph_osd_op_type_attr(int op)
+{	/* same test against TYPE_ATTR */
+	return CEPH_OSD_OP_TYPE_ATTR == (op & CEPH_OSD_OP_TYPE);
+}
+static inline int ceph_osd_op_type_exec(int op)
+{	/* same test against TYPE_EXEC */
+	return CEPH_OSD_OP_TYPE_EXEC == (op & CEPH_OSD_OP_TYPE);
+}
+static inline int ceph_osd_op_type_pg(int op)
+{	/* same test against TYPE_PG */
+	return CEPH_OSD_OP_TYPE_PG == (op & CEPH_OSD_OP_TYPE);
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{	/* exact match on all the bits selected by CEPH_OSD_OP_MODE */
+	return CEPH_OSD_OP_MODE_SUB == (op & CEPH_OSD_OP_MODE);
+}
+static inline int ceph_osd_op_mode_read(int op)
+{	/* RD bit set (RMW has it too); CALL is excluded because its RD bit is wrong */
+	return op != CEPH_OSD_OP_CALL &&
+		(op & CEPH_OSD_OP_MODE_RD);
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{	/* yields the masked WR bit itself (0 or the bit value), not a normalized 0/1 */
+	return CEPH_OSD_OP_MODE_WR & op;
+}
+static inline int ceph_osd_op_mode_cache(int op)
+{	/* yields the masked CACHE bit itself (0 or the bit value) */
+	return CEPH_OSD_OP_MODE_CACHE & op;
+}
+static inline bool ceph_osd_op_uses_extent(int op)
+{
+	bool uses_extent = false;	/* answer for every op not listed below */
+	switch(op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_MAPEXT:
+	case CEPH_OSD_OP_MASKTRUNC:
+	case CEPH_OSD_OP_SPARSE_READ:
+	case CEPH_OSD_OP_SYNC_READ:
+	case CEPH_OSD_OP_WRITE:
+	case CEPH_OSD_OP_WRITEFULL:
+	case CEPH_OSD_OP_TRUNCATE:
+	case CEPH_OSD_OP_ZERO:
+	case CEPH_OSD_OP_APPEND:
+	case CEPH_OSD_OP_TRIMTRUNC:
+	case CEPH_OSD_OP_CMPEXT:
+		uses_extent = true;	/* last statement of the switch, so no break is needed */
+	}
+	return uses_extent;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * and objclass.h. Any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM  'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK =            0x0001,  /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM =        0x0002,  /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK =         0x0004,  /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY =          0x0008,  /* resend attempt */
+	CEPH_OSD_FLAG_READ =           0x0010,  /* op may read */
+	CEPH_OSD_FLAG_WRITE =          0x0020,  /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP =      0x0040,  /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT_OLD =   0x0080,  /* DEPRECATED msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS =  0x0100,
+	CEPH_OSD_FLAG_PARALLELEXEC =   0x0200,  /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP =           0x0400,  /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC =           0x0800,  /* op may exec */
+	CEPH_OSD_FLAG_EXEC_PUBLIC =    0x1000,  /* DEPRECATED op may exec (public) */
+	CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000,  /* read from nearby replica, if any */
+	CEPH_OSD_FLAG_RWORDERED =      0x4000,  /* order wrt concurrent reads */
+	CEPH_OSD_FLAG_IGNORE_CACHE =   0x8000,  /* ignore cache logic */
+	CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
+	CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000,  /* ignore pool overlay */
+	CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
+	CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000,  /* map snap direct to clone id
+						 */
+	CEPH_OSD_FLAG_ENFORCE_SNAPC    =0x100000,  /* use snapc provided even if
+						      pool uses pool snaps */
+	CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
+	CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
+	CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
+	CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
+	CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000,  /* ignore redirection */
+};
+
+enum {	/* per-op flags; struct ceph_osd_op::flags is documented as CEPH_OSD_OP_FLAG_* */
+	CEPH_OSD_OP_FLAG_EXCL = 0x1,      /* EXCL object create */
+	CEPH_OSD_OP_FLAG_FAILOK = 0x2,    /* continue despite failure */
+	CEPH_OSD_OP_FLAG_FADVISE_RANDOM     = 0x4, /* the op is random */
+	CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+	CEPH_OSD_OP_FLAG_FADVISE_WILLNEED   = 0x10,/* data will be accessed in the near future */
+	CEPH_OSD_OP_FLAG_FADVISE_DONTNEED   = 0x20,/* data will not be accessed in the near future */
+	CEPH_OSD_OP_FLAG_FADVISE_NOCACHE   = 0x40, /* data will be accessed only once by this client */
+	CEPH_OSD_OP_FLAG_WITH_REFERENCE   = 0x80, /* need reference counting */
+	CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
+};
+
+#define EOLDSNAPC    85  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED 108 /* blacklisted */
+
+/* xattr comparison */
+enum {
+	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+	CEPH_OSD_CMPXATTR_OP_NE  = 2,
+	CEPH_OSD_CMPXATTR_OP_GT  = 3,
+	CEPH_OSD_CMPXATTR_OP_GTE = 4,
+	CEPH_OSD_CMPXATTR_OP_LT  = 5,
+	CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+	CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
+enum {
+	CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1,     /* part of a flush operation */
+	CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2,  /* ignore pool overlay */
+	CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+	CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+						     * cloneid */
+	CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+};
+
+enum {
+	CEPH_OSD_TMAP2OMAP_NULLOK = 1,
+};
+
+enum {
+	CEPH_OSD_WATCH_OP_UNWATCH = 0,
+	CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+	/* note: use only ODD ids to prevent pre-giant code from
+	   interpreting the op as UNWATCH */
+	CEPH_OSD_WATCH_OP_WATCH = 3,
+	CEPH_OSD_WATCH_OP_RECONNECT = 5,
+	CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+enum {
+	CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0,
+	CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1,
+	CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C   = 2
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+	CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+	CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+	CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+	CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+	CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+	CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+	CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+	CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+	CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+	CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+const char *ceph_osd_alloc_hint_flag_name(int f);
+
+enum {
+	CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+	CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+	CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+const char *ceph_osd_backoff_op_name(int op);
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_OP_FLAG_* */
+	union {	/* per-op arguments; presumably the member in use is selected by op */
+		struct {
+			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) extent; /* 2*8+8+4 bytes; the size check after this struct counts this member as the union's size */
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} __attribute__ ((packed)) xattr;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;
+		struct {
+			__le64 count;
+			__le32 start_epoch; /* for the pgls sequence */
+		} __attribute__ ((packed)) pgls;
+	        struct {
+		        __le64 snapid;
+	        } __attribute__ ((packed)) snap;
+		struct {
+			__le64 cookie;
+			__le64 ver;     /* no longer used */
+			__u8 op;	/* CEPH_OSD_WATCH_OP_* */
+			__u32 gen;      /* registration generation */
+			__u32 timeout; /* connection timeout */
+		} __attribute__ ((packed)) watch;
+		struct {
+			__le64 cookie;
+		} __attribute__ ((packed)) notify;
+		struct {
+			__le64 unused;
+			__le64 ver;
+		} __attribute__ ((packed)) assert_ver;
+		struct {
+			__le64 offset, length;
+			__le64 src_offset;
+		} __attribute__ ((packed)) clonerange;
+		struct {
+			__le64 max;     /* max data in reply */
+		} __attribute__ ((packed)) copy_get;
+		struct {
+			__le64 snapid;
+			__le64 src_version;
+			__u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+			/*
+			 * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+			 * for src object, flags for dest object are in
+			 * ceph_osd_op::flags.
+			 */
+			__le32 src_fadvise_flags;
+		} __attribute__ ((packed)) copy_from;
+		struct {
+			struct ceph_timespec stamp;
+		} __attribute__ ((packed)) hit_set_get;
+		struct {
+			__u8 flags;
+		} __attribute__ ((packed)) tmap2omap;
+		struct {
+			__le64 expected_object_size;
+			__le64 expected_write_size;
+			__le32 flags;  /* CEPH_OSD_ALLOC_HINT_FLAG_* */
+		} __attribute__ ((packed)) alloc_hint;
+		struct {
+			__le64 offset;
+			__le64 length;
+			__le64 data_length;
+		} __attribute__ ((packed)) writesame;
+		struct {
+			__le64 offset;
+			__le64 length;
+			__le32 chunk_size;
+			__u8 type;              /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
+		} __attribute__ ((packed)) checksum;
+	};
+	__le32 payload_len;  /* presumably the length of the data payload accompanying this op -- confirm */
+} __attribute__ ((packed));
+
+/*
+ * Check the compatibility of struct ceph_osd_op
+ *  (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) +
+ *                     sizeof(ceph_osd_op::flags) +
+ *                     sizeof(ceph_osd_op::extent) +
+ *                     sizeof(ceph_osd_op::payload_len))
+ */
+#ifdef __cplusplus
+static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4),
+              "sizeof(ceph_osd_op) breaks the compatibility");
+#endif
+
+struct ceph_osd_reply_head {
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;                     /* presumably CEPH_OSD_FLAG_* -- confirm */
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;                   /* presumably the number of entries in ops[] -- confirm */
+	struct ceph_osd_op ops[0];  /* zero-length trailing array: ops[], object */
+} __attribute__ ((packed));
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
new file mode 120000
index 00000000..51fc03be
--- /dev/null
+++ b/src/include/rados/buffer.h
@@ -0,0 +1 @@
+../buffer.h
+\ No newline at end of file
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 120000
index 00000000..bd1f6f1b
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h
+\ No newline at end of file
diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h
new file mode 120000
index 00000000..19ef4317
--- /dev/null
+++ b/src/include/rados/crc32c.h
@@ -0,0 +1 @@
+../crc32c.h
+\ No newline at end of file
diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h
new file mode 120000
index 00000000..48f0d443
--- /dev/null
+++ b/src/include/rados/inline_memory.h
@@ -0,0 +1 @@
+../inline_memory.h
+\ No newline at end of file
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
new file mode 100644
index 00000000..58a65afa
--- /dev/null
+++ b/src/include/rados/librados.h
@@ -0,0 +1,4015 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOS_H
+#define CEPH_LIBRADOS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include "rados_types.h"
+
+#include <sys/time.h>
+
+#ifndef CEPH_OSD_TMAP_SET
+/* These are also defined in rados.h and objclass.h. Keep them in sync! */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c'
+#define CEPH_OSD_TMAP_RM  'r'
+#endif
+
+#define LIBRADOS_VER_MAJOR 3
+#define LIBRADOS_VER_MINOR 0
+#define LIBRADOS_VER_EXTRA 0
+/* Pack major/minor/extra into one integer; each argument is parenthesized so any expression expands correctly. */
+#define LIBRADOS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
+
+#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
+#define LIBRADOS_SUPPORTS_GETADDRS 1
+#define LIBRADOS_SUPPORTS_APP_METADATA 1
+
+/* RADOS lock flags
+ * They are also defined in cls_lock_types.h. Keep them in sync!
+ */
+#define LIBRADOS_LOCK_FLAG_RENEW 0x1
+
+/*
+ * Constants for rados_write_op_create().
+ */
+#define LIBRADOS_CREATE_EXCLUSIVE 1
+#define LIBRADOS_CREATE_IDEMPOTENT 0
+
+/*
+ * Flags that can be set on a per-op basis via
+ * rados_read_op_set_flags() and rados_write_op_set_flags().
+ */
+enum {
+  // fail a create operation if the object already exists
+  LIBRADOS_OP_FLAG_EXCL               =  0x1,
+  // allow the transaction to succeed even if the flagged op fails
+  LIBRADOS_OP_FLAG_FAILOK 	      = 0x2,
+  // indicate read/write op random
+  LIBRADOS_OP_FLAG_FADVISE_RANDOM     = 0x4,
+  // indicate read/write op sequential
+  LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8,
+  // indicate read/write data will be accessed in the near future (by someone)
+  LIBRADOS_OP_FLAG_FADVISE_WILLNEED   = 0x10,
+  // indicate read/write data will not be accessed in the near future (by anyone)
+  LIBRADOS_OP_FLAG_FADVISE_DONTNEED   = 0x20,
+  // indicate read/write data will not be accessed again (by *this* client)
+  LIBRADOS_OP_FLAG_FADVISE_NOCACHE    = 0x40,
+  // optionally support FUA (force unit access) on write requests
+  LIBRADOS_OP_FLAG_FADVISE_FUA        = 0x80,
+};
+
+#define CEPH_RADOS_API
+
+/**
+ * @name xattr comparison operations
+ * Operators for comparing xattrs on objects, and aborting the
+ * rados_read_op or rados_write_op transaction if the comparison
+ * fails.
+ *
+ * @{
+ */
+enum {
+	LIBRADOS_CMPXATTR_OP_EQ  = 1,
+	LIBRADOS_CMPXATTR_OP_NE  = 2,
+	LIBRADOS_CMPXATTR_OP_GT  = 3,
+	LIBRADOS_CMPXATTR_OP_GTE = 4,
+	LIBRADOS_CMPXATTR_OP_LT  = 5,
+	LIBRADOS_CMPXATTR_OP_LTE = 6
+};
+/** @} */
+
+/**
+ * @name Operation Flags
+ * Flags for rados_read_op_operate(), rados_write_op_operate(),
+ * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
+ * See librados.hpp for details.
+ * @{
+ */
+enum {
+  LIBRADOS_OPERATION_NOFLAG             = 0,
+  LIBRADOS_OPERATION_BALANCE_READS      = 1,
+  LIBRADOS_OPERATION_LOCALIZE_READS     = 2,
+  LIBRADOS_OPERATION_ORDER_READS_WRITES = 4,
+  LIBRADOS_OPERATION_IGNORE_CACHE       = 8,
+  LIBRADOS_OPERATION_SKIPRWLOCKS        = 16,
+  LIBRADOS_OPERATION_IGNORE_OVERLAY     = 32,
+  /* send requests to cluster despite the cluster or pool being marked
+     full; ops will either succeed (e.g., delete) or return EDQUOT or
+     ENOSPC. */
+  LIBRADOS_OPERATION_FULL_TRY           = 64,
+  /*
+   * Mainly for delete op
+   */
+  LIBRADOS_OPERATION_FULL_FORCE		= 128,
+  LIBRADOS_OPERATION_IGNORE_REDIRECT	= 256,
+  LIBRADOS_OPERATION_ORDERSNAP          = 512,
+};
+/** @} */
+
+/**
+ * @name Alloc hint flags
+ * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2()
+ * indicating future IO patterns.
+ * @{
+ */
+enum {
+  LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+  LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+  LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+  LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+  LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+  LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+  LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+  LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128,
+  LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+  LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+/** @} */
+
+typedef enum {
+	LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0,
+	LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1,
+	LIBRADOS_CHECKSUM_TYPE_CRC32C   = 2
+} rados_checksum_type_t;
+
+/*
+ * snap id constants
+ */
+#define LIBRADOS_SNAP_HEAD  ((uint64_t)(-2))
+#define LIBRADOS_SNAP_DIR   ((uint64_t)(-1))
+
+/**
+ * @typedef rados_t
+ *
+ * A handle for interacting with a RADOS cluster. It encapsulates all
+ * RADOS client configuration, including username, key for
+ * authentication, logging, and debugging. Talking to different clusters
+ * -- or to the same cluster with different users -- requires
+ * different cluster handles.
+ */
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif //VOIDPTR_RADOS_T
+
+/**
+ * @typedef rados_config_t
+ *
+ * A handle for the ceph configuration context for the rados_t cluster
+ * instance.  This can be used to share configuration context/state
+ * (e.g., logging configuration) between librados instances.
+ *
+ * @warning The config context does not have independent reference
+ * counting.  As such, a rados_config_t handle retrieved from a given
+ * rados_t is only valid as long as that rados_t.
+ */
+typedef void *rados_config_t;
+
+/**
+ * @typedef rados_ioctx_t
+ *
+ * An io context encapsulates a few settings for all I/O operations
+ * done on it:
+ * - pool - set when the io context is created (see rados_ioctx_create())
+ * - snapshot context for writes (see
+ *   rados_ioctx_selfmanaged_snap_set_write_ctx())
+ * - snapshot id to read from (see rados_ioctx_snap_set_read())
+ * - object locator for all single-object operations (see
+ *   rados_ioctx_locator_set_key())
+ * - namespace for all single-object operations (see
+ *   rados_ioctx_set_namespace()).  Set to LIBRADOS_ALL_NSPACES
+ *   before rados_nobjects_list_open() will list all objects in all
+ *   namespaces.
+ *
+ * @warning Changing any of these settings is not thread-safe -
+ * librados users must synchronize any of these changes on their own,
+ * or use separate io contexts for each thread
+ */
+typedef void *rados_ioctx_t;
+
+/**
+ * @typedef rados_list_ctx_t
+ *
+ * An iterator for listing the objects in a pool.
+ * Used with rados_nobjects_list_open(),
+ * rados_nobjects_list_next(), rados_nobjects_list_next2(), and
+ * rados_nobjects_list_close().
+ */
+typedef void *rados_list_ctx_t;
+
+/**
+ * @typedef rados_object_list_cursor
+ *
+ * The cursor used with rados_enumerate_objects
+ * and accompanying methods.
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct rados_object_list_item {
+
+  /// oid length
+  size_t oid_length;
+  /// name of the object
+  char *oid;
+  /// namespace length
+  size_t nspace_length;
+  /// the object namespace
+  char *nspace;
+  /// locator length
+  size_t locator_length;
+  /// object locator
+  char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool.
+ */
+struct rados_pool_stat_t {
+  /// space used in bytes
+  uint64_t num_bytes;
+  /// space used in KB
+  uint64_t num_kb;
+  /// number of objects in the pool
+  uint64_t num_objects;
+  /// number of clones of objects
+  uint64_t num_object_clones;
+  /// num_objects * num_replicas
+  uint64_t num_object_copies;
+  /// number of objects missing on primary
+  uint64_t num_objects_missing_on_primary;
+  /// number of objects found on no OSDs
+  uint64_t num_objects_unfound;
+  /// number of objects replicated fewer times than they should be
+  /// (but found on at least one OSD)
+  uint64_t num_objects_degraded;
+  /// number of objects read
+  uint64_t num_rd;
+  /// objects read in KB
+  uint64_t num_rd_kb;
+  /// number of objects written
+  uint64_t num_wr;
+  /// objects written in KB
+  uint64_t num_wr_kb;
+  /// bytes originally provided by user
+  uint64_t num_user_bytes;
+  /// bytes passed compression
+  uint64_t compressed_bytes_orig;
+  /// bytes resulted after compression
+  uint64_t compressed_bytes;
+  /// bytes allocated at storage
+  uint64_t compressed_bytes_alloc;
+};
+
+/**
+ * @struct rados_cluster_stat_t
+ * Cluster-wide usage information
+ */
+struct rados_cluster_stat_t {
+  /// total device size
+  uint64_t kb;
+  /// total used
+  uint64_t kb_used;
+  /// total available/free
+  uint64_t kb_avail;
+  /// number of objects
+  uint64_t num_objects;
+};
+
+/**
+ * @typedef rados_write_op_t
+ *
+ * An object write operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_write_op() rados_release_write_op()
+ * - Extended attribute manipulation: rados_write_op_cmpxattr(),
+ *   rados_write_op_setxattr(),
+ *   rados_write_op_rmxattr()
+ * - Object map key/value pairs: rados_write_op_omap_set(),
+ *   rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(),
+ *   rados_write_op_omap_cmp()
+ * - Object properties: rados_write_op_assert_exists(),
+ *   rados_write_op_assert_version()
+ * - Creating objects: rados_write_op_create()
+ * - IO on objects: rados_write_op_append(), rados_write_op_write(),
+ *   rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove(),
+ *   rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
+ * - Hints: rados_write_op_set_alloc_hint()
+ * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
+ */
+typedef void *rados_write_op_t;
+
+/**
+ * @typedef rados_read_op_t
+ *
+ * An object read operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ *   rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ *   rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ *   rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ *   rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ *   rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ *   rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using librados.
+ *
+ * @{
+ */
+
+/**
+ * Create a handle for communicating with a RADOS cluster.
+ *
+ * Ceph environment variables are read when this is called, so if
+ * $CEPH_ARGS specifies everything you need to connect, no further
+ * configuration is necessary.
+ *
+ * @param cluster where to store the handle
+ * @param id the user to connect as (i.e. admin, not client.admin)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id);
+
+/**
+ * Extended version of rados_create.
+ *
+ * Like rados_create, but 
+ * 1) don't assume 'client\.'+id; allow full specification of name
+ * 2) allow specification of cluster name
+ * 3) flags for future expansion
+ */
+CEPH_RADOS_API int rados_create2(rados_t *pcluster,
+                                 const char *const clustername,
+                                 const char * const name, uint64_t flags);
+
+/**
+ * Initialize a cluster handle from an existing configuration.
+ *
+ * Share configuration state with another rados_t instance.
+ *
+ * @param cluster where to store the handle
+ * @param cct the existing configuration to use
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
+                                             rados_config_t cct);
+
+/**
+ * Ping the monitor with ID mon_id, storing the resulting reply in
+ * outstr (if specified) and its length in outstrlen.
+ *
+ * The result buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free().  The
+ * buffer and length pointers can be NULL, in which case they are
+ * not filled in.
+ *
+ * @param      cluster    cluster handle
+ * @param[in]  mon_id     ID of the monitor to ping
+ * @param[out] outstr     double pointer with the resulting reply
+ * @param[out] outstrlen  pointer with the size of the reply in outstr
+ */
+CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id,
+                                      char **outstr, size_t *outstrlen);
+
+/**
+ * Connect to the cluster.
+ *
+ * @note BUG: Before calling this, calling a function that communicates with the
+ * cluster will crash.
+ *
+ * @pre The cluster handle is configured with at least a monitor
+ * address. If cephx is enabled, a client name and secret must also be
+ * set.
+ *
+ * @post If this succeeds, any function in librados may be used
+ *
+ * @param cluster The cluster to connect to.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_connect(rados_t cluster);
+
+/**
+ * Disconnects from the cluster.
+ *
+ * For clean up, this is only necessary after rados_connect() has
+ * succeeded.
+ *
+ * @warning This does not guarantee any asynchronous writes have
+ * completed. To do that, you must call rados_aio_flush() on all open
+ * io contexts.
+ *
+ * @warning We implicitly call rados_watch_flush() on shutdown.  If
+ * there are watches being used, this should be done explicitly before
+ * destroying the relevant IoCtx.  We do it here as a safety measure.
+ *
+ * @post the cluster handle cannot be used again
+ *
+ * @param cluster the cluster to shutdown
+ */
+CEPH_RADOS_API void rados_shutdown(rados_t cluster);
+
+/** @} init */
+
+/**
+ * @name Configuration
+ * These functions read and update Ceph configuration for a cluster
+ * handle. Any configuration changes must be done before connecting to
+ * the cluster.
+ *
+ * Options that librados users might want to set include:
+ * - mon_host
+ * - auth_supported
+ * - key, keyfile, or keyring when using cephx
+ * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
+ * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
+ *
+ * See docs.ceph.com for information about available configuration options.
+ *
+ * @{
+ */
+
+/**
+ * Configure the cluster handle using a Ceph config file
+ *
+ * If path is NULL, the default locations are searched, and the first
+ * found is used. The locations are:
+ * - $CEPH_CONF (environment variable)
+ * - /etc/ceph/ceph.conf
+ * - ~/.ceph/config
+ * - ceph.conf (in the current working directory)
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param path path to a Ceph configuration file
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path);
+
+/**
+ * Configure the cluster handle with command line arguments
+ *
+ * argv can contain any common Ceph command line option, including any
+ * configuration parameter prefixed by '--' and replacing spaces with
+ * dashes or underscores. For example, the following options are equivalent:
+ * - --mon-host 10.0.0.1:6789
+ * - --mon_host 10.0.0.1:6789
+ * - -m 10.0.0.1:6789
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc,
+                                         const char **argv);
+
+
+/**
+ * Configure the cluster handle with command line arguments, returning
+ * any remainders.  Same as rados_conf_parse_argv, except for an extra
+ * remargv argument that holds the returned unrecognized arguments.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @param remargv char* array for returned unrecognized arguments
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc,
+				                   const char **argv,
+                                                   const char **remargv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cluster cluster handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var);
+
+/**
+ * Set a configuration option
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param option option to set
+ * @param value value of the option
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when the option is not a Ceph configuration option
+ */
+CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option,
+                                  const char *value);
+
+/**
+ * Get the value of a configuration option
+ *
+ * @param cluster configuration to read
+ * @param option which option to read
+ * @param buf where to write the configuration value
+ * @param len the size of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENAMETOOLONG if the buffer is too short to contain the
+ * requested value
+ */
+CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option,
+                                  char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+                                      struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ * 
+ * @param cluster cluster handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings.  The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
+ * @param cluster cluster handle
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len);
+
+/**
+ * List inconsistent placement groups of the given pool
+ *
+ * Gets a list of inconsistent placement groups as NULL-terminated strings.
+ * The placement group names will be placed in the supplied buffer one after
+ * another. After the last name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the placement group entries we need, we will
+ * fill as much as we can.
+ *
+ * @param cluster cluster handle
+ * @param pool pool ID
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all inconsistent pgs
+ */
+CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool,
+					      char *buf, size_t len);
+
+/**
+ * Get a configuration handle for a rados cluster handle
+ *
+ * This handle is valid only as long as the cluster handle is valid.
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param[out] require_osd_release minimum compatible OSD version
+ *  based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+                                                int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param[out] min_compat_client minimum compatible client version
+ *  based upon the current features
+ * @param[out] require_min_compat_client required minimum client version
+ *  based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+                                                   int8_t* min_compat_client,
+                                                   int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+                                      rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+                                       rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous
+ * requests on it, but you should not use an io context again after
+ * calling this function on it.
+ *
+ * @warning This does not guarantee any asynchronous
+ * writes have completed. You must call rados_aio_flush()
+ * on the io context before destroying it to do that.
+ *
+ * @warning If this ioctx is used by rados_watch, the caller needs to
+ * be sure that all registered watches are disconnected via
+ * rados_unwatch() and that rados_watch_flush() is called.  This
+ * ensures that a racing watch callback does not make use of a
+ * destroyed ioctx.
+ *
+ * @param io the io context to dispose of
+ */
+CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io);
+
+/**
+ * Get configuration handle for a pool handle
+ *
+ * @param io pool handle
+ * @returns rados_config_t for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io);
+
+/**
+ * Get the cluster handle used by this rados_ioctx_t
+ * Note that this is a weak reference, and should not
+ * be destroyed via rados_shutdown().
+ *
+ * @param io the io context
+ * @returns the cluster handle for this io context
+ */
+CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io);
+
+/**
+ * Get pool usage statistics
+ *
+ * Fills in a rados_pool_stat_t after querying the cluster.
+ *
+ * @param io determines which pool to query
+ * @param stats where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io,
+                                         struct rados_pool_stat_t *stats);
+
+/**
+ * Get the id of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name which pool to look up
+ * @returns id of the pool
+ * @returns -ENOENT if the pool is not found
+ */
+CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster,
+                                         const char *pool_name);
+
+/**
+ * Get the name of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param id the id of the pool
+ * @param buf where to store the pool name
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id,
+                                             char *buf, size_t maxlen);
+
+/**
+ * Create a pool with default settings
+ *
+ * The default crush rule is rule 0.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name);
+
+/**
+ * Create a pool owned by a specific auid.
+ *
+ * DEPRECATED: auid support has been removed, and this call will be removed in a future
+ * release.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+                                               const char *pool_name,
+                                               uint64_t auid)
+  __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+                                                     const char *pool_name,
+				                     uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+                                              const char *pool_name,
+                                              uint64_t auid,
+			                      uint8_t crush_rule_num)
+  __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool ID of the pool to query
+ * @param[out] base_tier base tier, or \c pool if tiering is not configured
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
+                                            int64_t* base_tier);
+
+/**
+ * Delete a pool and all data inside it
+ *
+ * The pool is removed from the cluster immediately,
+ * but the actual data is deleted in the background.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool_name which pool to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
+
+/**
+ * Attempt to change an io context's associated auid "owner"
+ *
+ * DEPRECATED: auid support has been removed and this call has no effect.
+ *
+ * Requires that you have write permission on both the current and new
+ * auid.
+ *
+ * @param io reference to the pool to change.
+ * @param auid the auid you wish the io to have.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid)
+  __attribute__((deprecated));
+
+
+/**
+ * Get the auid of a pool
+ *
+ * DEPRECATED: auid support has been removed and this call always reports
+ * CEPH_AUTH_UID_DEFAULT (-1).
+ *
+ * @param io pool to query
+ * @param auid where to store the auid
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid)
+  __attribute__((deprecated));
+
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+  __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param[out] req set to 1 if the pool requires alignment, 0 if not
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+  int *req);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+  __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+  uint64_t *alignment);
+
+/**
+ * Get the pool id of the io context
+ *
+ * @param io the io context to query
+ * @returns the id of the pool the io context uses
+ */
+CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io);
+
+/**
+ * Get the pool name of the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
+                                             unsigned maxlen);
+
+/** @} pools */
+
+/**
+ * @name Object Locators
+ *
+ * @{
+ */
+
+/**
+ * Set the key for mapping objects to pgs within an io context.
+ *
+ * The key is used instead of the object name to determine which
+ * placement groups an object is put in. This affects all subsequent
+ * operations of the io context - until a different locator key is
+ * set, all objects in this io context will be placed in the same pg.
+ *
+ * @param io the io context to change
+ * @param key the key to use as the object locator, or NULL to discard
+ * any previously set key
+ */
+CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io,
+                                                const char *key);
+
+/**
+ * Set the namespace for objects within an io context
+ *
+ * The namespace specification further refines a pool into different
+ * domains.  The mapping of objects to pgs is also based on this
+ * value.
+ *
+ * @param io the io context to change
+ * @param nspace the name to use as the namespace, or NULL to use the
+ * default namespace
+ */
+CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
+                                              const char *nspace);
+
+/**
+ * Get the namespace for objects within the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf,
+                                             unsigned maxlen);
+
+/** @} obj_loc */
+
+/**
+ * @name Listing Objects
+ * @{
+ */
+/**
+ * Start listing objects in a pool
+ *
+ * @param io the pool to list from
+ * @param ctx the handle to store list context in
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io,
+                                            rados_list_ctx_t *ctx);
+
+/**
+ * Return hash position of iterator, rounded to the current PG
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @returns current hash position, rounded to the current pg
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx);
+
+/**
+ * Reposition object iterator to a different hash position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param pos hash position to move to
+ * @returns actual (rounded) position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek(rados_list_ctx_t ctx,
+                                                 uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+                                                        rados_object_list_cursor cursor);
+
+/**
+ * Get a cursor marking the object iterator's current position
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+                                                  rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+                                            const char **entry,
+	                                    const char **key,
+                                            const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes make it possible to list objects with \0 (the NUL character)
+ * in e.g. *entry. It is unusual to see such object names, but a bug
+ * in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+                                             const char **entry,
+                                             const char **key,
+                                             const char **nspace,
+                                             size_t *entry_size,
+                                             size_t *key_size,
+                                             size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool.  It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+  rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool.  It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io);
+
+/**
+ * Check if a cursor has reached the end of a pool
+ *
+ * @param io ioctx
+ * @param cur cursor
+ * @returns 1 if the cursor has reached the end of the pool, 0 otherwise
+ */
+CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io,
+    rados_object_list_cursor cur);
+
+/**
+ * Release a cursor
+ *
+ * Release a cursor.  The handle may not be used after this point.
+ *
+ * @param io ioctx
+ * @param cur cursor
+ */
+CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io,
+    rados_object_list_cursor cur);
+
+/**
+ * Compare two cursor positions
+ *
+ * Compare two cursors, and indicate whether the first cursor precedes,
+ * matches, or follows the second.
+ *
+ * @param io ioctx
+ * @param lhs first cursor
+ * @param rhs second cursor
+ * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs
+ */
+CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io,
+    rados_object_list_cursor lhs, rados_object_list_cursor rhs);
+
+/**
+ * @return the number of items set in the results array
+ */
+CEPH_RADOS_API int rados_object_list(rados_ioctx_t io,
+    const rados_object_list_cursor start,
+    const rados_object_list_cursor finish,
+    const size_t result_size,
+    const char *filter_buf,
+    const size_t filter_buf_len,
+    rados_object_list_item *results,
+    rados_object_list_cursor *next);
+
+CEPH_RADOS_API void rados_object_list_free(
+    const size_t result_size,
+    rados_object_list_item *results);
+
+/**
+ * Obtain cursors delineating a subset of a range.  Use this
+ * when you want to split up the work of iterating over the
+ * global namespace.  Expected use case is when you are iterating
+ * in parallel, with `m` workers, and each worker taking an id `n`.
+ *
+ * @param io ioctx
+ * @param start start of the range to be sliced up (inclusive)
+ * @param finish end of the range to be sliced up (exclusive)
+ * @param n which of the m chunks you would like to get cursors for
+ * @param m how many chunks to divide start-finish into
+ * @param split_start cursor populated with start of the subrange (inclusive)
+ * @param split_finish cursor populated with end of the subrange (exclusive)
+ */
+CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io,
+    const rados_object_list_cursor start,
+    const rados_object_list_cursor finish,
+    const size_t n,
+    const size_t m,
+    rados_object_list_cursor *split_start,
+    rados_object_list_cursor *split_finish);
+
+
+/** @} Listing Objects */
+
+/**
+ * @name Snapshots
+ *
+ * RADOS snapshots are based upon sequence numbers that form a
+ * snapshot context. They are pool-specific. The snapshot context
+ * consists of the current snapshot sequence number for a pool, and an
+ * array of sequence numbers at which snapshots were taken, in
+ * descending order. Whenever a snapshot is created or deleted, the
+ * snapshot sequence number for the pool is increased. To add a new
+ * snapshot, the new snapshot sequence number must be increased and
+ * added to the snapshot context.
+ *
+ * There are two ways to manage these snapshot contexts:
+ * -# within the RADOS cluster
+ *    These are called pool snapshots, and store the snapshot context
+ *    in the OSDMap. These represent a snapshot of all the objects in
+ *    a pool.
+ * -# within the RADOS clients
+ *    These are called self-managed snapshots, and push the
+ *    responsibility for keeping track of the snapshot context to the
+ *    clients. For every write, the client must send the snapshot
+ *    context. In librados, this is accomplished with
+ *    rados_selfmanaged_snap_set_write_ctx(). These are more
+ *    difficult to manage, but are restricted to specific objects
+ *    instead of applying to an entire pool.
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+                                           const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+                                           const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+		                             const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+				  const char *snapname)
+  __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ * snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+                                              rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+ *
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+                                                       rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+                                        rados_snap_t *snapid,
+                                        rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot exists
+ * @param snapid the ID of the snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+                                                       rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+                                        rados_snap_t snapid,
+                                        rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+                                                         const char *oid,
+                                                         rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+                                                              rados_snap_t seq,
+                                                              rados_snap_t *snaps,
+                                                              int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+                                         int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+                                           rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+                                             char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
+                                              time_t *t);
+
+/** @} Snapshots */
+
+/**
+ * @name Synchronous I/O
+ * Writes are replicated to a number of OSDs based on the
+ * configuration of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_aio_wait_for_complete().  For greater data safety, use the
+ * asynchronous functions and rados_aio_wait_for_safe().
+ *
+ * @{
+ */
+
+/**
+ * Return the version of the last object read or written to.
+ *
+ * This exposes the internal version number of the last object read or
+ * written via this io context
+ *
+ * @param io the io context to check
+ * @returns last read or written object version
+ */
+CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object, starting at
+ * offset *off*. The value of *len* must be <= UINT_MAX/2.
+ *
+ * @note This will never return a positive value not equal to len.
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
+                               const char *buf, size_t len, uint64_t off);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid,
+                                    const char *buf, size_t len);
+
+/**
+ * Write the same *data_len* bytes from *buf* multiple times into the
+ * *oid* object. *write_len* bytes are written in total, which must be
+ * a multiple of *data_len*. The value of *write_len* and *data_len*
+ * must be <= UINT_MAX/2.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid,
+                                   const char *buf, size_t data_len,
+                                   size_t write_len, uint64_t off);
+
+/**
+ * Append *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid,
+                                const char *buf, size_t len);
+
+/**
+ * Read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf,
+                              size_t len, uint64_t off);
+
+/**
+ * Compute checksum from object data
+ *
+ * The io context determines the snapshot to checksum, if any was set
+ * by rados_ioctx_snap_set_read(). The length of the init_value and
+ * resulting checksum are dependent upon the checksum type:
+ *
+ *    XXHASH64: le64
+ *    XXHASH32: le32
+ *    CRC32C:	le32
+ *
+ * The checksum result is encoded in the following manner:
+ *
+ *    le32 num_checksum_chunks
+ *    {
+ *      leXX checksum for chunk (where XX = appropriate size for the checksum type)
+ *    } * num_checksum_chunks
+ *
+ * @param io the context in which to perform the checksum
+ * @param oid the name of the object to checksum
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param len the number of bytes to checksum
+ * @param off the offset to start checksumming in the object
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result
+ * @param checksum_len the number of bytes available for the result
+ * @return negative error code on failure
+ */
+CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid,
+				  rados_checksum_type_t type,
+				  const char *init_value, size_t init_value_len,
+				  size_t len, uint64_t off, size_t chunk_size,
+				  char *pchecksum, size_t checksum_len);
+
+/**
+ * Delete an object
+ *
+ * @note This does not delete any snapshots of the object.
+ *
+ * @param io the pool to delete the object from
+ * @param oid the name of the object to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @param io the context in which to truncate
+ * @param oid the name of the object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
+                               uint64_t size);
+
+/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ *  (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+                                const char *cmp_buf, size_t cmp_len,
+                                uint64_t off);
+
+/**
+ * @name Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o,
+                                  const char *name, char *buf, size_t len);
+
+/**
+ * Set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o,
+                                  const char *name, const char *buf,
+                                  size_t len);
+
+/**
+ * Delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o,
+                                 const char *name);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid,
+                                   rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the null-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter,
+                                        const char **name, const char **val,
+                                        size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Get the next omap key/value pair on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key is
+ * null-terminated, and val has length len. If the end of the list has
+ * been reached, key and val are NULL, and len is 0. key and val will
+ * not be accessible after rados_omap_get_end() is called on iter, so
+ * if they are needed after that they should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter,
+                                       char **key,
+                                       char **val,
+                                       size_t *len);
+
+/**
+ * Get the next omap key/value pair on the object. Note that it's
+ * perfectly safe to mix calls to rados_omap_get_next and
+ * rados_omap_get_next2.
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key has length
+ * key_len and val has length val_len. If the end of the list has
+ * been reached, key and val are NULL, and key_len and val_len are 0.
+ * key and val will not be accessible after rados_omap_get_end()
+ * is called on iter, so if they are needed after that they
+ * should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param key_len where to store the number of bytes in key
+ * @param val_len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter,
+                                       char **key,
+                                       char **val,
+                                       size_t *key_len,
+                                       size_t *val_len);
+
+/**
+ * Return number of elements in the iterator
+ *
+ * @param iter the iterator of which to return the size
+ */
+CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter);
+
+/**
+ * Close the omap iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter);
+
+/**
+ * Get object stats (size/mtime)
+ *
+ * TODO: when are these set, and by whom? can they be out of date?
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize,
+                              time_t *pmtime);
+/**
+ * Execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns for methods that return data, the length of the output, or
+ * -ERANGE if buf does not have enough space to store it. For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+                              const char *cls, const char *method,
+	                      const char *in_buf, size_t in_len, char *buf,
+                              size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg,
+                                               rados_callback_t cb_complete,
+                                               rados_callback_t cb_safe,
+				               rados_completion_t *pc);
+
+/**
+ * Block until an operation completes
+ *
+ * This means it is in memory on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c);
+
+/**
+ * Block until an operation is safe
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c);
+
+/**
+ * Has an asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c);
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+		                   rados_completion_t completion,
+		                   const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the append is safe and complete
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid,
+		                    rados_completion_t completion,
+		                    const char *buf, size_t len);
+
+/**
+ * Asynchronously write an entire object
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ * Queues the write_full and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write_full is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid,
+			                rados_completion_t completion,
+			                const char *buf, size_t len);
+
+/**
+ * Asynchronously write the same buffer multiple times
+ *
+ * Queues the writesame and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the writesame is safe and complete
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid,
+			               rados_completion_t completion,
+			               const char *buf, size_t data_len,
+				       size_t write_len, uint64_t off);
+
+/**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid,
+		                    rados_completion_t completion);
+
+/**
+ * Asynchronously read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @note only the 'complete' callback of the completion will be called.
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param completion what to do when the read is complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid,
+		                  rados_completion_t completion,
+		                  char *buf, size_t len, uint64_t off);
+
+/**
+ * Block until all pending writes in an io context are safe
+ *
+ * This is not equivalent to calling rados_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @note BUG: always returns 0, should be void or accept a timeout
+ *
+ * @param io the context to flush
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io);
+
+
+/**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * rados_aio_flush().
+ *
+ * @param io the context to flush
+ * @param completion what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io,
+                                         rados_completion_t completion);
+
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
+		                  rados_completion_t completion,
+		                  uint64_t *psize, time_t *pmtime);
+
+/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ *  (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+                                    rados_completion_t completion,
+                                    const char *cmp_buf,
+                                    size_t cmp_len,
+                                    uint64_t off);
+
+/**
+ * Cancel async operation
+ *
+ * @param io ioctx
+ * @param completion completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
+                                    rados_completion_t completion);
+
+/**
+ * Asynchronously execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param o name of the object
+ * @param completion what to do when the exec completes
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o,
+				  rados_completion_t completion,
+				  const char *cls, const char *method,
+				  const char *in_buf, size_t in_len,
+				  char *buf, size_t out_len);
+
+/** @} Asynchronous I/O */
+
+/**
+ * @name Asynchronous Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Asynchronously get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param completion what to do when the getxattr completes
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o,
+				      rados_completion_t completion,
+				      const char *name, char *buf, size_t len);
+
+/**
+ * Asynchronously set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+				      rados_completion_t completion,
+				      const char *name, const char *buf,
+				      size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+				     rados_completion_t completion,
+				     const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+				       rados_completion_t completion,
+				       rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ *
+ * See rados_watch() and rados_notify() for more details.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_watchcb_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param opcode undefined
+ * @param ver version of the watched object
+ * @param arg application-specific data
+ *
+ * @note BUG: opcode is an internal detail that shouldn't be exposed
+ * @note BUG: ver is unused
+ */
+typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg);
+
+/**
+ * @typedef rados_watchcb2_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param arg opaque user-defined value provided to rados_watch2()
+ * @param notify_id an id for this notify event
+ * @param handle the watcher handle we are notifying
+ * @param notifier_id the unique client id for the notifier
+ * @param data payload from the notifier
+ * @param data_len length of payload buffer
+ */
+typedef void (*rados_watchcb2_t)(void *arg,
+				 uint64_t notify_id,
+				 uint64_t handle,
+				 uint64_t notifier_id,
+				 void *data,
+				 size_t data_len);
+
+/**
+ * @typedef rados_watcherrcb_t
+ *
+ * Callback activated when we encounter an error with the watch session.
+ * This can happen when the location of the object moves within the
+ * cluster and we fail to register our watch with the new object location,
+ * or when our connection with the object OSD is otherwise interrupted and
+ * we may have missed notify events.
+ * @param pre opaque user-defined value provided to rados_watch2()
+ * @param cookie id of the affected watch (presumably from rados_watch2(); confirm)
+ * @param err error code
+ */
+  typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @note BUG: librados should provide a way for watchers to notice connection resets
+ * @note BUG: the ver parameter does not work, and -ERANGE will never be returned
+ *            (See URL tracker.ceph.com/issues/2592)
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param ver expected version of the object
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param arg application defined data to pass when watchcb is called
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the version of the object is greater than ver
+ */
+CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
+			       uint64_t *cookie,
+			       rados_watchcb_t watchcb, void *arg)
+  __attribute__((deprecated));
+
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to the
+ * primary OSD for a watched object, the watch will be removed after
+ * a timeout configured with osd_client_watch_timeout.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie,
+				rados_watchcb2_t watchcb,
+				rados_watcherrcb_t watcherrcb,
+				void *arg);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout seconds to keep the watch after the connection is lost
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie,
+        rados_watchcb2_t watchcb,
+        rados_watcherrcb_t watcherrcb,
+        uint32_t timeout,
+        void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o,
+				   rados_completion_t completion, uint64_t *handle,
+				   rados_watchcb2_t watchcb,
+				   rados_watcherrcb_t watcherrcb,
+				   void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after the number of seconds configured in the timeout parameter.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the watch is kept after the connection is lost
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+           rados_completion_t completion, uint64_t *handle,
+           rados_watchcb2_t watchcb,
+           rados_watcherrcb_t watcherrcb,
+           uint32_t timeout,
+           void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2().  If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+  __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronously unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+                                     rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+				const char *buf, int buf_len)
+  __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional.  If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any).  Clients that timed out are not included.  Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them.  The format:
+ *
+ *    le32 num_acks
+ *    {
+ *      le64 gid     global id for the client (for client.1234 that's 1234)
+ *      le64 cookie  cookie for the client
+ *      le32 buflen  length of reply message buffer
+ *      u8 * buflen  payload
+ *    } * num_acks
+ *    le32 num_timeouts
+ *    {
+ *      le64 gid     global id for the client
+ *      le64 cookie  cookie for the client
+ *    } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify2() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param completion what to do when operation has been attempted (rados_aio_notify only)
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+				    rados_completion_t completion,
+				    const char *buf, int buf_len,
+				    uint64_t timeout_ms, char **reply_buffer,
+				    size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+				 const char *buf, int buf_len,
+				 uint64_t timeout_ms,
+				 char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Acknowledge receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+				    uint64_t notify_id, uint64_t cookie,
+				    const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty.  It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Asynchronously flush watch/notify callbacks
+ *
+ * This call is non-blocking; the completion will be called once
+ * all pending watch/notify callbacks have been executed and
+ * the queue is empty.  It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
+ * @name Hints
+ *
+ * @{
+ */
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
+                                        uint64_t expected_object_size,
+                                        uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+					 uint64_t expected_object_size,
+					 uint64_t expected_write_size,
+					 uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op,
+                                             int flags);
+
+/**
+ * Ensure that the object exists before writing
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before writing. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ *   then rados_write_op_operate will return -ERANGE instead of
+ *   executing the op.
+ * - If the object's version is less than the asserted version
+ *   then rados_write_op_operate will return -EOVERFLOW instead
+ *   of executing the op.
+ * @param write_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ *  on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+                                          const char *cmp_buf,
+                                          size_t cmp_len,
+                                          uint64_t off,
+                                          int *prval);
+
+/**
+ * Ensure that given xattr satisfies comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+                                            const char *name,
+                                            uint8_t comparison_operator,
+                                            const char *value,
+                                            size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ *   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ *   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t key_len,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+                                            const char *name,
+                                            const char *value,
+                                            size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+                                           const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or
+ *   LIBRADOS_CREATE_IDEMPOTENT; LIBRADOS_CREATE_EXCLUSIVE
+ *   will error if the object already exists.
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+                                          int exclusive,
+                                          const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+                                         const char *buffer,
+                                         size_t len,
+                                         uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op,
+                                              const char *buffer,
+                                              size_t len);
+
+/**
+ * Write the same buffer multiple times
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param data_len length of buffer
+ * @param write_len total number of bytes to write, as a multiple of @c data_len
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op,
+                                             const char *buffer,
+                                             size_t data_len,
+                                             size_t write_len,
+                                             uint64_t offset);
+
+/**
+ * Append to end of object.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op,
+                                          const char *buffer,
+                                          size_t len);
+/**
+ * Remove object
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
+
+/**
+ * Truncate an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to truncate to
+ */
+CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
+                                            uint64_t offset);
+
+/**
+ * Zero part of an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to zero
+ * @param len length to zero
+ */
+CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
+			                uint64_t offset,
+			                uint64_t len);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * @param write_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op,
+			                const char *cls,
+			                const char *method,
+			                const char *in_buf,
+			                size_t in_len,
+			                int *prval);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op,
+                                            char const* const* keys,
+                                            char const* const* vals,
+                                            const size_t *lens,
+                                            size_t num);
+
+/**
+ * Set key/value pairs on an object, with explicit key lengths
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param key_lens array of lengths corresponding to each key
+ * @param val_lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op,
+                                            char const* const* keys,
+                                            char const* const* vals,
+                                            const size_t *key_lens,
+                                            const size_t *val_lens,
+                                            size_t num);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to remove
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op,
+                                                char const* const* keys,
+                                                size_t keys_len);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to remove
+ * @param key_lens array of size_t values representing length of each key
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op,
+                                                char const* const* keys,
+                                                const size_t* key_lens,
+                                                size_t keys_len);
+
+/**
+ * Remove all key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op,
+                                                  uint64_t expected_object_size,
+                                                  uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op,
+						   uint64_t expected_object_size,
+						   uint64_t expected_write_size,
+						   uint32_t flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op,
+			                  rados_ioctx_t io,
+			                  const char *oid,
+			                  time_t *mtime,
+			                  int flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to (a struct timespec), NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op,
+                                           rados_ioctx_t io,
+                                           const char *oid,
+                                           struct timespec *mtime,
+                                           int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op,
+                                              rados_ioctx_t io,
+                                              rados_completion_t completion,
+                                              const char *oid,
+                                              time_t *mtime,
+			                      int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ *   then rados_read_op_operate will return -ERANGE instead of
+ *   executing the op.
+ * - If the object's version is less than the asserted version
+ *   then rados_read_op_operate will return -EOVERFLOW instead
+ *   of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ *  on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+                                         const char *cmp_buf,
+                                         size_t cmp_len,
+                                         uint64_t off,
+                                         int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+			                   const char *name,
+			                   uint8_t comparison_operator,
+			                   const char *value,
+			                   size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+			                    rados_xattrs_iter_t *iter,
+			                    int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ *   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+                                           const char *key,
+                                           uint8_t comparison_operator,
+                                           const char *val,
+                                           size_t val_len,
+                                           int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+                                           const char *key,
+                                           uint8_t comparison_operator,
+                                           const char *val,
+                                           size_t key_len,
+                                           size_t val_len,
+                                           int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+			               uint64_t *psize,
+			               time_t *pmtime,
+			               int *prval);
+
+/**
+ * Read bytes from offset into buffer.
+ *
+ * bytes_read will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the
+ * object.
+ *
+ * @param read_op operation to add this action to
+ * @param offset offset to read from
+ * @param len length of buffer
+ * @param buffer where to put the data
+ * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
+			               uint64_t offset,
+			               size_t len,
+			               char *buffer,
+			               size_t *bytes_read,
+			               int *prval);
+
+/**
+ * Compute checksum from object data
+ *
+ * @param read_op operation to add this action to
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param offset the offset to start checksumming in the object
+ * @param len the number of bytes to checksum
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result for this action
+ * @param checksum_len the number of bytes available for the result
+ * @param prval where to store the return value for this action
+ */
+CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op,
+					   rados_checksum_type_t type,
+					   const char *init_value,
+					   size_t init_value_len,
+					   uint64_t offset, size_t len,
+					   size_t chunk_size, char *pchecksum,
+					   size_t checksum_len, int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * The output buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf where to put librados-allocated output buffer
+ * @param out_len length of out_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op,
+			               const char *cls,
+			               const char *method,
+			               const char *in_buf,
+			               size_t in_len,
+			               char **out_buf,
+			               size_t *out_len,
+			               int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * If the output buffer is too small, prval will
+ * be set to -ERANGE and used_len will be 0.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf user-provided buffer to read into
+ * @param out_len length of out_buf in bytes
+ * @param used_len where to store the number of bytes read into out_buf
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
+				                const char *cls,
+				                const char *method,
+				                const char *in_buf,
+				                size_t in_len,
+				                char *out_buf,
+				                size_t out_len,
+				                size_t *used_len,
+				                int *prval);
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
+				                const char *start_after,
+				                const char *filter_prefix,
+				                uint64_t max_return,
+				                rados_omap_iter_t *iter,
+				                int *prval)
+  __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op,
+						 const char *start_after,
+						 const char *filter_prefix,
+						 uint64_t max_return,
+						 rados_omap_iter_t *iter,
+						 unsigned char *pmore,
+						 int *prval);
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op,
+				                const char *start_after,
+				                uint64_t max_return,
+				                rados_omap_iter_t *iter,
+				                int *prval)
+  __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op,
+						 const char *start_after,
+						 uint64_t max_return,
+						 rados_omap_iter_t *iter,
+						 unsigned char *pmore,
+						 int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to null-terminated keys to get
+ * @param keys_len the number of strings in keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
+                                                        char const* const* keys,
+                                                        size_t keys_len,
+                                                        rados_omap_iter_t *iter,
+                                                        int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to keys to get
+ * @param num_keys the number of strings in keys
+ * @param key_lens array of size_t's describing each key len (in bytes)
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op,
+                                                        char const* const* keys,
+                                                        size_t num_keys,
+                                                        const size_t* key_lens,
+                                                        rados_omap_iter_t *iter,
+                                                        int *prval);
+
+/**
+ * Perform a read operation synchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
+			                 rados_ioctx_t io,
+			                 const char *oid,
+			                 int flags);
+
+/**
+ * Perform a read operation asynchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
+			                     rados_ioctx_t io,
+			                     rados_completion_t completion,
+			                     const char *oid,
+			                     int flags);
+
+/** @} Object Operations */
+
+/**
+ * Take an exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
+                                        const char * name, const char * cookie,
+                                        const char * desc,
+                                        struct timeval * duration,
+                                        uint8_t flags);
+
+/**
+ * Take a shared lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag The tag of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o,
+                                     const char * name, const char * cookie,
+                                     const char * tag, const char * desc,
+	                             struct timeval * duration, uint8_t flags);
+
+/**
+ * Release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o,
+                                const char *name, const char *cookie);
+
+/**
+ * Asynchronously release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+                                    const char *name, const char *cookie,
+			            rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+			                  const char *name, int *exclusive,
+			                  char *tag, size_t *tag_len,
+			                  char *clients, size_t *clients_len,
+			                  char *cookies, size_t *cookies_len,
+			                  char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified client.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param client the client currently holding the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ * @returns -EINVAL if the client cannot be parsed
+ */
+CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o,
+                                    const char *name, const char *client,
+                                    const char *cookie);
+
+/**
+ * Blacklists the specified client from the OSDs
+ *
+ * @param cluster cluster handle
+ * @param client_address client address
+ * @param expire_seconds number of seconds to blacklist (0 for default)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
+				       char *client_address,
+				       uint32_t expire_seconds);
+
+/**
+ * Gets addresses of the RADOS session, suitable for blacklisting.
+ *
+ * @param cluster cluster handle
+ * @param addrs the output string.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs);
+
+CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io);
+
+CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io);
+
+/**
+ * Enable an application on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+                                            const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffers are filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+                                          size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+                                                  const char *app_name,
+                                                  const char *key, char *value,
+                                                  size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+                                                  const char *app_name,
+                                                  const char *key,
+                                                  const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+                                                     const char *app_name,
+                                                     const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+                                                   const char *app_name,
+                                                   char *keys, size_t *key_len,
+                                                   char *values,
+                                                   size_t *vals_len);
+
+/**
+ * @name Mon/OSD/PG Commands
+ *
+ * These interfaces send commands relating to the monitor, OSD, or PGs.
+ *
+ * @{
+ */
+
+/**
+ * Send monitor command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free().  The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd,
+                                     size_t cmdlen, const char *inbuf,
+                                     size_t inbuflen, char **outbuf,
+                                     size_t *outbuflen, char **outs,
+                                     size_t *outslen);
+
+/**
+ * Send ceph-mgr command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free().  The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd,
+                                     size_t cmdlen, const char *inbuf,
+                                     size_t inbuflen, char **outbuf,
+                                     size_t *outbuflen, char **outs,
+                                     size_t *outslen);
+
+/**
+ * Send monitor command to a specific monitor.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free().  The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name target monitor's name
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name,
+			                    const char **cmd, size_t cmdlen,
+			                    const char *inbuf, size_t inbuflen,
+			                    char **outbuf, size_t *outbuflen,
+			                    char **outs, size_t *outslen);
+
+/**
+ * free a rados-allocated buffer
+ *
+ * Release memory allocated by librados calls like rados_mon_command().
+ *
+ * @param buf buffer pointer
+ */
+CEPH_RADOS_API void rados_buffer_free(char *buf);
+
+CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid,
+                                     const char **cmd, size_t cmdlen,
+		                     const char *inbuf, size_t inbuflen,
+		                     char **outbuf, size_t *outbuflen,
+		                     char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr,
+                                    const char **cmd, size_t cmdlen,
+		                    const char *inbuf, size_t inbuflen,
+		                    char **outbuf, size_t *outbuflen,
+		                    char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster,
+                                     const char **cmd, size_t cmdlen,
+		                     const char *inbuf, size_t inbuflen,
+		                     char **outbuf, size_t *outbuflen,
+		                     char **outs, size_t *outslen); /* NOTE(review): repeats the rados_mgr_command() prototype already declared and documented earlier in this header */
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log.  The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback_t)(void *arg,
+				     const char *line,
+				     const char *who, 
+				     uint64_t sec, uint64_t nsec,
+				     uint64_t seq, const char *level,
+				     const char *msg);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log.  The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+				     const char *line,
+				     const char *channel,
+				     const char *who,
+				     const char *name,
+				     uint64_t sec, uint64_t nsec,
+				     uint64_t seq, const char *level,
+				     const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+                                     rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+				      rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service.  We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance.  The encoding is a series of NULL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example,  "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+  rados_t cluster,
+  const char *service,
+  const char *daemon,
+  const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register.  For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ *
+ * @param cluster rados cluster handle
+ * @param status_dict status dict
+ */
+CEPH_RADOS_API int rados_service_update_status(
+  rados_t cluster,
+  const char *status_dict);
+
+/** @} Mon/OSD/PG commands */
+
+/*
+ * These methods are no longer supported and return -ENOTSUP where possible.
+ */
+CEPH_RADOS_API int rados_objects_list_open(
+  rados_ioctx_t io,
+  rados_list_ctx_t *ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position(
+  rados_list_ctx_t ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_seek(
+  rados_list_ctx_t ctx,
+  uint32_t pos) __attribute__((deprecated));
+CEPH_RADOS_API int rados_objects_list_next(
+  rados_list_ctx_t ctx,
+  const char **entry,
+  const char **key) __attribute__((deprecated));
+CEPH_RADOS_API void rados_objects_list_close(
+  rados_list_ctx_t ctx) __attribute__((deprecated));
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
new file mode 100644
index 00000000..0c047c43
--- /dev/null
+++ b/src/include/rados/librados.hpp
@@ -0,0 +1,1468 @@
+#ifndef __LIBRADOS_HPP
+#define __LIBRADOS_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <utility>
+#include "buffer.h"
+
+#include "librados.h"
+#include "librados_fwd.hpp"
+#include "rados_types.hpp"
+
+namespace libradosstriper
+{
+  class RadosStriper;
+}
+
+namespace librados {
+
+using ceph::bufferlist;
+
+struct AioCompletionImpl;
+struct IoCtxImpl;
+struct ListObjectImpl;
+class NObjectIteratorImpl;
+struct ObjListCtx;
+class ObjectOperationImpl;
+struct PlacementGroupImpl;
+struct PoolAsyncCompletionImpl;
+
+typedef struct rados_cluster_stat_t cluster_stat_t;
+typedef struct rados_pool_stat_t pool_stat_t;
+
+typedef void *list_ctx_t;
+typedef uint64_t auid_t;
+typedef void *config_t;
+
+typedef struct {
+  std::string client;
+  std::string cookie;
+  std::string address;
+} locker_t;
+
+typedef std::map<std::string, pool_stat_t> stats_map;
+
+typedef void *completion_t;
+typedef void (*callback_t)(completion_t cb, void *arg);
+
+inline namespace v14_2_0 {
+
+  class IoCtx;
+  class RadosClient;
+
+  class CEPH_RADOS_API ListObject
+  {
+  public:
+    const std::string& get_nspace() const;
+    const std::string& get_oid() const;
+    const std::string& get_locator() const;
+
+    ListObject();
+    ~ListObject();
+    ListObject( const ListObject&);
+    ListObject& operator=(const ListObject& rhs);
+  private:
+    ListObject(ListObjectImpl *impl);
+
+    friend class librados::NObjectIteratorImpl;
+    friend std::ostream& operator<<(std::ostream& out, const ListObject& lop);
+
+    ListObjectImpl *impl;
+  };
+  CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop);
+
+  class CEPH_RADOS_API NObjectIterator;
+
+  class CEPH_RADOS_API ObjectCursor
+  {
+    public:
+    ObjectCursor();
+    ObjectCursor(const ObjectCursor &rhs);
+    explicit ObjectCursor(rados_object_list_cursor c);
+    ~ObjectCursor();
+    ObjectCursor& operator=(const ObjectCursor& rhs);
+    bool operator<(const ObjectCursor &rhs) const;
+    bool operator==(const ObjectCursor &rhs) const;
+    void set(rados_object_list_cursor c);
+
+    friend class IoCtx;
+    friend class librados::NObjectIteratorImpl;
+    friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+    std::string to_str() const;
+    bool from_str(const std::string& s);
+
+    protected:
+    rados_object_list_cursor c_cursor;
+  };
+  CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+  class CEPH_RADOS_API NObjectIterator : public std::iterator <std::forward_iterator_tag, ListObject> {
+  public:
+    static const NObjectIterator __EndObjectIterator;
+    NObjectIterator(): impl(NULL) {}
+    ~NObjectIterator();
+    NObjectIterator(const NObjectIterator &rhs);
+    NObjectIterator& operator=(const NObjectIterator& rhs);
+
+    bool operator==(const NObjectIterator& rhs) const;
+    bool operator!=(const NObjectIterator& rhs) const;
+    const ListObject& operator*() const;
+    const ListObject* operator->() const;
+    NObjectIterator &operator++(); ///< Preincrement; errors are thrown as exceptions
+    NObjectIterator operator++(int); ///< Postincrement; errors are thrown as exceptions
+    friend class IoCtx;
+    friend class librados::NObjectIteratorImpl;
+
+    /// get current hash position of the iterator, rounded to the current pg
+    uint32_t get_pg_hash_position() const;
+
+    /// move the iterator to a given hash position. this may (will!) be rounded
+    /// to the nearest pg. errors are thrown as exceptions
+    uint32_t seek(uint32_t pos);
+
+    /// move the iterator to a given cursor position. errors are thrown as exceptions
+    uint32_t seek(const ObjectCursor& cursor);
+
+    /// get current cursor position
+    ObjectCursor get_cursor();
+
+    /**
+     * Configure PGLS filter to be applied OSD-side (requires caller
+     * to know/understand the format expected by the OSD)
+     */
+    void set_filter(const bufferlist &bl);
+
+  private:
+    NObjectIterator(ObjListCtx *ctx_);
+    void get_next();
+    NObjectIteratorImpl *impl;
+  };
+
+  class CEPH_RADOS_API ObjectItem
+  {
+    public:
+    std::string oid;
+    std::string nspace;
+    std::string locator;
+  };
+
+  /// DEPRECATED; do not use
+  class CEPH_RADOS_API WatchCtx {
+  public:
+    virtual ~WatchCtx();
+    virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0;
+  };
+
+  class CEPH_RADOS_API WatchCtx2 {
+  public:
+    virtual ~WatchCtx2();
+    /**
+     * Callback activated when we receive a notify event.
+     *
+     * @param notify_id unique id for this notify event
+     * @param cookie the watcher we are notifying
+     * @param notifier_id the unique client id of the notifier
+     * @param bl opaque notify payload (from the notifier)
+     */
+    virtual void handle_notify(uint64_t notify_id,
+			       uint64_t cookie,
+			       uint64_t notifier_id,
+			       bufferlist& bl) = 0;
+
+    /**
+     * Callback activated when we encounter an error with the watch.
+     *
+     * Errors we may see:
+     *   -ENOTCONN  : our watch was disconnected
+     *   -ETIMEDOUT : our watch is still valid, but we may have missed
+     *                a notify event.
+     *
+     * @param cookie the watcher with the problem
+     * @param err error
+     */
+    virtual void handle_error(uint64_t cookie, int err) = 0;
+  };
+
+  struct CEPH_RADOS_API AioCompletion {
+    AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {}
+    int set_complete_callback(void *cb_arg, callback_t cb);
+    int set_safe_callback(void *cb_arg, callback_t cb);
+    int wait_for_complete();
+    int wait_for_safe();
+    int wait_for_complete_and_cb();
+    int wait_for_safe_and_cb();
+    bool is_complete();
+    bool is_safe();
+    bool is_complete_and_cb();
+    bool is_safe_and_cb();
+    int get_return_value();
+    int get_version() __attribute__ ((deprecated));
+    uint64_t get_version64();
+    void release();
+    AioCompletionImpl *pc;
+  };
+
+  struct CEPH_RADOS_API PoolAsyncCompletion {
+    PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {}
+    int set_callback(void *cb_arg, callback_t cb);
+    int wait();
+    bool is_complete();
+    int get_return_value();
+    void release();
+    PoolAsyncCompletionImpl *pc;
+  };
+
+  /**
+   * These are per-op flags which may be different among
+   * ops added to an ObjectOperation.
+   */
+  enum ObjectOperationFlags {
+    OP_EXCL =   LIBRADOS_OP_FLAG_EXCL,
+    OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK,
+    OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM,
+    OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL,
+    OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED,
+    OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+    OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE,
+  };
+
+  class CEPH_RADOS_API ObjectOperationCompletion {
+  public:
+    virtual ~ObjectOperationCompletion() {}
+    virtual void handle_completion(int r, bufferlist& outbl) = 0;
+  };
+
+  /**
+   * These flags apply to the ObjectOperation as a whole.
+   *
+   * BALANCE_READS and LOCALIZE_READS should only be used
+   * when reading from data you're certain won't change,
+   * like a snapshot, or where eventual consistency is ok.
+   *
+   * ORDER_READS_WRITES will order reads the same way writes are
+   * ordered (e.g., waiting for degraded objects).  In particular, it
+   * will make a write followed by a read sequence be preserved.
+   *
+   * IGNORE_CACHE will skip the caching logic on the OSD that normally
+   * handles promotion of objects between tiers.  This allows an operation
+   * to operate (or read) the cached (or uncached) object, even if it is
+   * not coherent.
+   *
+   * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and
+   * process the op directly on the destination pool.  This is useful
+   * for CACHE_FLUSH and CACHE_EVICT operations.
+   */
+  enum ObjectOperationGlobalFlags {
+    OPERATION_NOFLAG             = LIBRADOS_OPERATION_NOFLAG,
+    OPERATION_BALANCE_READS      = LIBRADOS_OPERATION_BALANCE_READS,
+    OPERATION_LOCALIZE_READS     = LIBRADOS_OPERATION_LOCALIZE_READS,
+    OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+    OPERATION_IGNORE_CACHE       = LIBRADOS_OPERATION_IGNORE_CACHE,
+    OPERATION_SKIPRWLOCKS        = LIBRADOS_OPERATION_SKIPRWLOCKS,
+    OPERATION_IGNORE_OVERLAY     = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+    // send requests to cluster despite the cluster or pool being
+    // marked full; ops will either succeed (e.g., delete) or return
+    // EDQUOT or ENOSPC
+    OPERATION_FULL_TRY           = LIBRADOS_OPERATION_FULL_TRY,
+    //mainly for delete
+    OPERATION_FULL_FORCE	 = LIBRADOS_OPERATION_FULL_FORCE,
+    OPERATION_IGNORE_REDIRECT	 = LIBRADOS_OPERATION_IGNORE_REDIRECT,
+    OPERATION_ORDERSNAP          = LIBRADOS_OPERATION_ORDERSNAP,
+  };
+
+  /*
+   * Alloc hint flags for the alloc_hint operation.
+   */
+  enum AllocHintFlags {
+    ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+    ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+    ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+    ALLOC_HINT_FLAG_RANDOM_READ = 8,
+    ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+    ALLOC_HINT_FLAG_IMMUTABLE = 32,
+    ALLOC_HINT_FLAG_SHORTLIVED = 64,
+    ALLOC_HINT_FLAG_LONGLIVED = 128,
+    ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+    ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+  };
+
+  /*
+   * ObjectOperation : compound object operation
+   * Batch multiple object operations into a single request, to be applied
+   * atomically.
+   */
+  class CEPH_RADOS_API ObjectOperation
+  {
+  public:
+    ObjectOperation();
+    virtual ~ObjectOperation();
+
+    size_t size();
+    void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated));
+    // flags takes ObjectOperationFlags values
+    void set_op_flags2(int flags);
+
+    void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval);
+    void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
+    void cmpxattr(const char *name, uint8_t op, uint64_t v);
+    void exec(const char *cls, const char *method, bufferlist& inbl);
+    void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval);
+    void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion);
+    /**
+     * Guard operation with a check that object version == ver
+     *
+     * @param ver [in] version to check
+     */
+    void assert_version(uint64_t ver);
+
+    /**
+     * Guard operation with a check that the object already exists
+     */
+    void assert_exists();
+
+    /**
+     * Compare omap values for specified keys against the given assertions
+     *
+     * @param assertions [in] comparison assertions
+     * @param prval [out] place error code in prval upon completion
+     *
+     * assertions has the form of mappings from keys to (comparison rval, assertion)
+     * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ].
+     *
+     * That is, to assert that the value at key "foo" is greater than "bar":
+     *
+     * ObjectReadOperation op;
+     * int r;
+     * map<string, pair<bufferlist, int> > assertions;
+     * bufferlist bar(string("bar"));
+     * assertions["foo"] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT);
+     * op.omap_cmp(assertions, &r);
+     */
+    void omap_cmp(
+      const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+      int *prval);
+
+  protected:
+    ObjectOperationImpl *impl;
+    ObjectOperation(const ObjectOperation& rhs);
+    ObjectOperation& operator=(const ObjectOperation& rhs);
+    friend class IoCtx;
+    friend class Rados;
+  };
+
+  /*
+   * ObjectWriteOperation : compound object write operation
+   * Batch multiple object operations into a single request, to be applied
+   * atomically.
+   */
+  class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+  {
+  protected:
+    time_t *unused;
+  public:
+    ObjectWriteOperation() : unused(NULL) {}
+    ~ObjectWriteOperation() override {}
+
+    void mtime(time_t *pt);
+    void mtime2(struct timespec *pts);
+
+    void create(bool exclusive);
+    void create(bool exclusive,
+		const std::string& category); ///< NOTE: category is unused
+
+    void write(uint64_t off, const bufferlist& bl);
+    void write_full(const bufferlist& bl);
+    void writesame(uint64_t off, uint64_t write_len,
+		   const bufferlist& bl);
+    void append(const bufferlist& bl);
+    void remove();
+    void truncate(uint64_t off);
+    void zero(uint64_t off, uint64_t len);
+    void rmxattr(const char *name);
+    void setxattr(const char *name, const bufferlist& bl);
+    void setxattr(const char *name, const bufferlist&& bl);
+    void tmap_update(const bufferlist& cmdbl);
+    void tmap_put(const bufferlist& bl);
+    void selfmanaged_snap_rollback(uint64_t snapid);
+
+    /**
+     * Rollback an object to the specified snapshot id
+     *
+     * Used with pool snapshots
+     *
+     * @param snapid [in] snapshot id to roll back to
+     */
+    void snap_rollback(uint64_t snapid);
+
+    /**
+     * set keys and values according to map
+     *
+     * @param map [in] keys and values to set
+     */
+    void omap_set(const std::map<std::string, bufferlist> &map);
+
+    /**
+     * set header
+     *
+     * @param bl [in] header to set
+     */
+    void omap_set_header(const bufferlist &bl);
+
+    /**
+     * Clears omap contents
+     */
+    void omap_clear();
+
+    /**
+     * Clears keys in to_rm
+     *
+     * @param to_rm [in] keys to remove
+     */
+    void omap_rm_keys(const std::set<std::string> &to_rm);
+
+    /**
+     * Copy an object
+     *
+     * Copies an object from another location.  The operation is atomic in that
+     * the copy either succeeds in its entirety or fails (e.g., because the
+     * source object was modified while the copy was in progress).
+     *
+     * @param src source object name
+     * @param src_ioctx ioctx for the source object
+     * @param src_version current version of the source object
+     * @param src_fadvise_flags the fadvise flags for source object
+     */
+    void copy_from(const std::string& src, const IoCtx& src_ioctx,
+		   uint64_t src_version, uint32_t src_fadvise_flags);
+
+    /**
+     * undirty an object
+     *
+     * Clear an object's dirty flag
+     */
+    void undirty();
+
+    /**
+     * Set allocation hint for an object
+     *
+     * @param expected_object_size expected size of the object, in bytes
+     * @param expected_write_size expected size of writes to the object, in bytes
+     * @param flags flags (AllocHintFlags values; set_alloc_hint2 only)
+     */
+    void set_alloc_hint(uint64_t expected_object_size,
+                        uint64_t expected_write_size);
+    void set_alloc_hint2(uint64_t expected_object_size,
+			 uint64_t expected_write_size,
+			 uint32_t flags);
+
+    /**
+     * Pin/unpin an object in cache tier
+     *
+     * NOTE: both methods return void; no status is returned directly
+     */
+    void cache_pin();
+    void cache_unpin();
+
+    /**
+     * Extensible tier
+     *
+     * Set redirect target
+     */
+    void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+		      uint64_t tgt_version, int flag = 0);
+    void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+                   std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+    void tier_promote();
+    void unset_manifest();
+
+
+    friend class IoCtx;
+  };
+
+  /*
+   * ObjectReadOperation : compound object operation that returns values
+   * Batch multiple object operations into a single request, to be applied
+   * atomically.
+   */
+  class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
+  {
+  public:
+    ObjectReadOperation() {}
+    ~ObjectReadOperation() override {}
+
+    void stat(uint64_t *psize, time_t *pmtime, int *prval);
+    void stat2(uint64_t *psize, struct timespec *pts, int *prval);
+    void getxattr(const char *name, bufferlist *pbl, int *prval);
+    void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
+    void read(size_t off, uint64_t len, bufferlist *pbl, int *prval);
+    void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
+		  uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
+		  int *prval);
+
+    /**
+     * see aio_sparse_read()
+     */
+    void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
+                    bufferlist *data_bl, int *prval);
+
+    /**
+     * omap_get_vals: keys and values from the object omap
+     *
+     * Get up to max_return keys and values beginning after start_after
+     *
+     * @param start_after [in] list no keys smaller than start_after
+     * @param max_return [in] list no more than max_return key/value pairs
+     * @param out_vals [out] place returned values in out_vals on completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_vals(
+      const std::string &start_after,
+      uint64_t max_return,
+      std::map<std::string, bufferlist> *out_vals,
+      int *prval) __attribute__ ((deprecated));  // use v2
+
+    /**
+     * omap_get_vals2: keys and values from the object omap
+     * Get up to max_return keys and values beginning after start_after
+     *
+     * @param start_after [in] list no keys smaller than start_after
+     * @param max_return [in] list no more than max_return key/value pairs
+     * @param out_vals [out] place returned values in out_vals on completion
+     * @param pmore [out] pointer to bool indicating whether there are more keys
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_vals2(
+      const std::string &start_after,
+      uint64_t max_return,
+      std::map<std::string, bufferlist> *out_vals,
+      bool *pmore,
+      int *prval);
+
+    /**
+     * omap_get_vals: keys and values from the object omap
+     *
+     * Get up to max_return keys and values beginning after start_after
+     *
+     * @param start_after [in] list keys starting after start_after
+     * @param filter_prefix [in] list only keys beginning with filter_prefix
+     * @param max_return [in] list no more than max_return key/value pairs
+     * @param out_vals [out] place returned values in out_vals on completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_vals(
+      const std::string &start_after,
+      const std::string &filter_prefix,
+      uint64_t max_return,
+      std::map<std::string, bufferlist> *out_vals,
+      int *prval) __attribute__ ((deprecated));  // use v2
+
+    /**
+     * omap_get_vals2: keys and values from the object omap
+     *
+     * Get up to max_return keys and values beginning after start_after
+     *
+     * @param start_after [in] list keys starting after start_after
+     * @param filter_prefix [in] list only keys beginning with filter_prefix
+     * @param max_return [in] list no more than max_return key/value pairs
+     * @param out_vals [out] place returned values in out_vals on completion
+     * @param pmore [out] pointer to bool indicating whether there are more keys
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_vals2(
+      const std::string &start_after,
+      const std::string &filter_prefix,
+      uint64_t max_return,
+      std::map<std::string, bufferlist> *out_vals,
+      bool *pmore,
+      int *prval);
+
+
+    /**
+     * omap_get_keys: keys from the object omap
+     *
+     * Get up to max_return keys beginning after start_after
+     *
+     * @param start_after [in] list keys starting after start_after
+     * @param max_return [in] list no more than max_return keys
+     * @param out_keys [out] place returned values in out_keys on completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_keys(const std::string &start_after,
+                       uint64_t max_return,
+                       std::set<std::string> *out_keys,
+                       int *prval) __attribute__ ((deprecated)); // use v2
+
+    /**
+     * omap_get_keys2: keys from the object omap
+     *
+     * Get up to max_return keys beginning after start_after
+     *
+     * @param start_after [in] list keys starting after start_after
+     * @param max_return [in] list no more than max_return keys
+     * @param out_keys [out] place returned values in out_keys on completion
+     * @param pmore [out] pointer to bool indicating whether there are more keys
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_keys2(const std::string &start_after,
+			uint64_t max_return,
+			std::set<std::string> *out_keys,
+			bool *pmore,
+			int *prval);
+
+    /**
+     * omap_get_header: get header from object omap
+     *
+     * @param header [out] place header here upon completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_header(bufferlist *header, int *prval);
+
+    /**
+     * get key/value pairs for specified keys
+     *
+     * @param keys [in] keys to get
+     * @param map [out] place key/value pairs found here on completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void omap_get_vals_by_keys(const std::set<std::string> &keys,
+			       std::map<std::string, bufferlist> *map,
+			       int *prval);
+
+    /**
+     * list_watchers: Get the list of watchers of an object
+     *
+     * @param out_watchers [out] place returned values in out_watchers on completion
+     * @param prval [out] place error code in prval upon completion
+     */
+    void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);
+
+    /**
+     * list snapshot clones associated with a logical object
+     *
+     * This will include a record for each version of the object,
+     * including the "HEAD" (which will have a cloneid of SNAP_HEAD).
+     * Each clone includes a vector of snap ids for which it is
+     * defined to exist.
+     *
+     * NOTE: this operation must be submitted from an IoCtx with a
+     * read snapid of SNAP_DIR for reliable results.
+     *
+     * @param out_snaps [out] pointer to resulting snap_set_t
+     * @param prval [out] place error code in prval upon completion
+     */
+    void list_snaps(snap_set_t *out_snaps, int *prval);
+
+    /**
+     * query dirty state of an object
+     *
+     * @param isdirty [out] pointer to resulting bool
+     * @param prval [out] place error code in prval upon completion
+     */
+    void is_dirty(bool *isdirty, int *prval);
+
+    /**
+     * flush a cache tier object to backing tier; will block racing
+     * updates.
+     *
+     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+     * triggering a promotion.
+     */
+    void cache_flush();
+
+    /**
+     * Flush a cache tier object to backing tier; will EAGAIN if we race
+     * with an update.  Must be used with the SKIPRWLOCKS flag.
+     *
+     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+     * triggering a promotion.
+     */
+    void cache_try_flush();
+
+    /**
+     * evict a clean cache tier object
+     *
+     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+     * triggering a promote on the OSD (that is then evicted).
+     */
+    void cache_evict();
+  };
+
+  /* IoCtx : This is a context in which we can perform I/O.
+   * It refers to a pool.
+   *
+   * Typical use (error checking omitted):
+   *
+   * IoCtx p;
+   * rados.ioctx_create("my_pool", p);
+   * p.stat("my_object", &size, &mtime);
+   * ... etc ...
+   *
+   * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+   * that is used for watch events to ensure that racing callbacks
+   * have completed.
+   */
+  class CEPH_RADOS_API IoCtx
+  {
+  public:
+    IoCtx();
+    static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+    IoCtx(const IoCtx& rhs);
+    IoCtx& operator=(const IoCtx& rhs);
+    IoCtx(IoCtx&& rhs) noexcept;
+    IoCtx& operator=(IoCtx&& rhs) noexcept;
+
+    ~IoCtx();
+
+    bool is_valid() const;
+
+    // Close our pool handle
+    void close();
+
+    // deep copy
+    void dup(const IoCtx& rhs);
+
+    // set pool auid
+    int set_auid(uint64_t auid_)
+      __attribute__ ((deprecated));
+
+    // set pool auid (asynchronous variant)
+    int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
+      __attribute__ ((deprecated));
+
+    // get pool auid
+    int get_auid(uint64_t *auid_)
+      __attribute__ ((deprecated));
+
+    uint64_t get_instance_id() const;
+
+    std::string get_pool_name();
+
+    bool pool_requires_alignment();
+    int pool_requires_alignment2(bool * req);
+    uint64_t pool_required_alignment();
+    int pool_required_alignment2(uint64_t * alignment);
+
+    // create an object
+    int create(const std::string& oid, bool exclusive);
+    int create(const std::string& oid, bool exclusive,
+	       const std::string& category); ///< category is unused
+
+    /**
+     * write bytes to an object at a specified offset
+     *
+     * NOTE: this call steals the contents of @param bl.
+     */
+    int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+    /**
+     * append bytes to an object
+     *
+     * NOTE: this call steals the contents of @param bl.
+     */
+    int append(const std::string& oid, bufferlist& bl, size_t len);
+    /**
+     * replace object contents with provided data
+     *
+     * NOTE: this call steals the contents of @param bl.
+     */
+    int write_full(const std::string& oid, bufferlist& bl);
+    int writesame(const std::string& oid, bufferlist& bl,
+		  size_t write_len, uint64_t off);
+    int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+    int checksum(const std::string& o, rados_checksum_type_t type,
+		 const bufferlist &init_value_bl, size_t len, uint64_t off,
+		 size_t chunk_size, bufferlist *pbl);
+    int remove(const std::string& oid);
+    int remove(const std::string& oid, int flags);
+    int trunc(const std::string& oid, uint64_t size);
+    int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+    int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
+    int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
+    int getxattr(const std::string& oid, const char *name, bufferlist& bl);
+    int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
+    int setxattr(const std::string& oid, const char *name, bufferlist& bl);
+    int rmxattr(const std::string& oid, const char *name);
+    int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
+    int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
+    int exec(const std::string& oid, const char *cls, const char *method,
+	     bufferlist& inbl, bufferlist& outbl);
+    /**
+     * modify object tmap based on encoded update sequence
+     *
+     * NOTE: this call steals the contents of @param cmdbl
+     */
+    int tmap_update(const std::string& oid, bufferlist& cmdbl);
+
+    int omap_get_vals(const std::string& oid,
+                      const std::string& start_after,
+                      uint64_t max_return,
+                      std::map<std::string, bufferlist> *out_vals);
+    int omap_get_vals2(const std::string& oid,
+		       const std::string& start_after,
+		       uint64_t max_return,
+		       std::map<std::string, bufferlist> *out_vals,
+		       bool *pmore);
+    int omap_get_vals(const std::string& oid,
+                      const std::string& start_after,
+                      const std::string& filter_prefix,
+                      uint64_t max_return,
+                      std::map<std::string, bufferlist> *out_vals);
+    int omap_get_vals2(const std::string& oid,
+		       const std::string& start_after,
+		       const std::string& filter_prefix,
+		       uint64_t max_return,
+		       std::map<std::string, bufferlist> *out_vals,
+		       bool *pmore);
+    int omap_get_keys(const std::string& oid,
+                      const std::string& start_after,
+                      uint64_t max_return,
+                      std::set<std::string> *out_keys);
+    int omap_get_keys2(const std::string& oid,
+		       const std::string& start_after,
+		       uint64_t max_return,
+		       std::set<std::string> *out_keys,
+		       bool *pmore);
+    int omap_get_header(const std::string& oid,
+                        bufferlist *bl);
+    int omap_get_vals_by_keys(const std::string& oid,
+                              const std::set<std::string>& keys,
+                              std::map<std::string, bufferlist> *vals);
+    int omap_set(const std::string& oid,
+                 const std::map<std::string, bufferlist>& map);
+    int omap_set_header(const std::string& oid,
+                        const bufferlist& bl);
+    int omap_clear(const std::string& oid);
+    int omap_rm_keys(const std::string& oid,
+                     const std::set<std::string>& keys);
+
+    void snap_set_read(snap_t seq);
+    int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);
+
+    // Create a snapshot with a given name
+    int snap_create(const char *snapname);
+
+    // Look up a snapshot by name.
+    // Returns 0 on success; error code otherwise
+    int snap_lookup(const char *snapname, snap_t *snap);
+
+    // Gets a timestamp for a snap
+    int snap_get_stamp(snap_t snapid, time_t *t);
+
+    // Gets the name of a snap
+    int snap_get_name(snap_t snapid, std::string *s);
+
+    // Remove a snapshot from this pool
+    int snap_remove(const char *snapname);
+
+    int snap_list(std::vector<snap_t> *snaps);
+
+    int snap_rollback(const std::string& oid, const char *snapname);
+
+    // Deprecated name kept for backward compatibility - same as snap_rollback()
+    int rollback(const std::string& oid, const char *snapname)
+      __attribute__ ((deprecated));
+
+    int selfmanaged_snap_create(uint64_t *snapid);
+    void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);
+
+    int selfmanaged_snap_remove(uint64_t snapid);
+    void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);
+
+    int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);
+
+    // Advisory locking on rados objects.
+    int lock_exclusive(const std::string &oid, const std::string &name,
+		       const std::string &cookie,
+		       const std::string &description,
+		       struct timeval * duration, uint8_t flags);
+
+    int lock_shared(const std::string &oid, const std::string &name,
+		    const std::string &cookie, const std::string &tag,
+		    const std::string &description,
+		    struct timeval * duration, uint8_t flags);
+
+    int unlock(const std::string &oid, const std::string &name,
+	       const std::string &cookie);
+
+    int break_lock(const std::string &oid, const std::string &name,
+		   const std::string &client, const std::string &cookie);
+
+    int list_lockers(const std::string &oid, const std::string &name,
+		     int *exclusive,
+		     std::string *tag,
+		     std::list<librados::locker_t> *lockers);
+
+
+    /// Start enumerating objects for a pool. Errors are thrown as exceptions.
+    NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
+    /// Start enumerating objects for a pool starting from a hash position.
+    /// Errors are thrown as exceptions.
+    NObjectIterator nobjects_begin(uint32_t start_hash_position,
+                                   const bufferlist &filter=bufferlist());
+    /// Start enumerating objects for a pool starting from cursor. Errors are
+    /// thrown as exceptions.
+    NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
+                                   const bufferlist &filter=bufferlist());
+    /// Iterator indicating the end of a pool
+    const NObjectIterator& nobjects_end() const;
+
+    /// Get cursor for pool beginning
+    ObjectCursor object_list_begin();
+
+    /// Get cursor for pool end
+    ObjectCursor object_list_end();
+
+    /// Check whether a cursor is at the end of a pool
+    bool object_list_is_end(const ObjectCursor &oc);
+
+    /// List some objects between two cursors
+    int object_list(const ObjectCursor &start, const ObjectCursor &finish,
+                    const size_t result_count,
+                    const bufferlist &filter,
+                    std::vector<ObjectItem> *result,
+                    ObjectCursor *next);
+
+    /// Generate cursors that include the N out of Mth slice of the pool
+    void object_list_slice(
+        const ObjectCursor start,
+        const ObjectCursor finish,
+        const size_t n,
+        const size_t m,
+        ObjectCursor *split_start,
+        ObjectCursor *split_finish);
+
+    /**
+     * List available hit set objects
+     *
+     * @param hash [in] hash position to query
+     * @param c [in] completion
+     * @param pls [out] list of available intervals
+     */
+    int hit_set_list(uint32_t hash, AioCompletion *c,
+		     std::list< std::pair<time_t, time_t> > *pls);
+
+    /**
+     * Retrieve hit set for a given hash, and time
+     *
+     * @param hash [in] hash position
+     * @param c [in] completion
+     * @param stamp [in] time interval that falls within the hit set's interval
+     * @param pbl [out] buffer to store the result in
+     */
+    int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
+		    bufferlist *pbl);
+
+    uint64_t get_last_version();
+
+    int aio_read(const std::string& oid, AioCompletion *c,
+		 bufferlist *pbl, size_t len, uint64_t off);
+    /**
+     * Asynchronously read from an object at a particular snapshot
+     *
+     * This is the same as normal aio_read, except that it chooses
+     * the snapshot to read from from its arguments instead of the
+     * internal IoCtx state.
+     *
+     * The return value of the completion will be number of bytes read on
+     * success, negative error code on failure.
+     *
+     * @param oid the name of the object to read from
+     * @param c what to do when the read is complete
+     * @param pbl where to store the results
+     * @param len the number of bytes to read
+     * @param off the offset to start reading from in the object
+     * @param snapid the id of the snapshot to read from
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_read(const std::string& oid, AioCompletion *c,
+		 bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+    int aio_sparse_read(const std::string& oid, AioCompletion *c,
+			std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+			size_t len, uint64_t off);
+    /**
+     * Asynchronously read existing extents from an object at a
+     * particular snapshot
+     *
+     * This is the same as normal aio_sparse_read, except that it chooses
+     * the snapshot to read from from its arguments instead of the
+     * internal IoCtx state.
+     *
+     * m will be filled in with a map of extents in the object,
+     * mapping offsets to lengths (in bytes) within the range
+     * requested. The data for all of the extents are stored
+     * back-to-back in offset order in data_bl.
+     *
+     * @param oid the name of the object to read from
+     * @param c what to do when the read is complete
+     * @param m where to store the map of extents
+     * @param data_bl where to store the data
+     * @param len the number of bytes to read
+     * @param off the offset to start reading from in the object
+     * @param snapid the id of the snapshot to read from
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_sparse_read(const std::string& oid, AioCompletion *c,
+			std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+			size_t len, uint64_t off, uint64_t snapid);
+    /**
+     * Asynchronously compare an on-disk object range with a buffer
+     *
+     * @param oid the name of the object to read from
+     * @param c what to do when the read is complete
+     * @param off object byte offset at which to start the comparison
+     * @param cmp_bl buffer containing bytes to be compared with object contents
+     * @returns 0 on success, negative error code on failure,
+     *  (-MAX_ERRNO - mismatch_off) on mismatch
+     */
+    int aio_cmpext(const std::string& oid,
+		   librados::AioCompletion *c,
+		   uint64_t off,
+		   bufferlist& cmp_bl);
+    int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+		  size_t len, uint64_t off);
+    int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+		  size_t len);
+    int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
+    int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+		      size_t write_len, uint64_t off);
+
+    /**
+     * Asynchronously remove an object
+     *
+     * Queues the remove and returns.
+     *
+     * The return value of the completion will be 0 on success, negative
+     * error code on failure.
+     *
+     * @param oid the name of the object
+     * @param c what to do when the remove is safe and complete
+     * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+     * other than SNAP_HEAD
+     */
+    int aio_remove(const std::string& oid, AioCompletion *c);
+    int aio_remove(const std::string& oid, AioCompletion *c, int flags);
+
+    /**
+     * Wait for all currently pending aio writes to be safe.
+     *
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_flush();
+
+    /**
+     * Schedule a callback for when all currently pending
+     * aio writes are safe. This is a non-blocking version of
+     * aio_flush().
+     *
+     * @param c what to do when the writes are safe
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_flush_async(AioCompletion *c);
+    int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+    int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
+    int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+    int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
+    int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
+    int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
+
+    /**
+     * Cancel aio operation
+     *
+     * @param c completion handle
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_cancel(AioCompletion *c);
+
+    int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
+	         bufferlist& inbl, bufferlist *outbl);
+
+    /*
+     * asynchronous version of unlock
+     */
+    int aio_unlock(const std::string &oid, const std::string &name,
+	           const std::string &cookie, AioCompletion *c);
+
+    // compound object operations
+    int operate(const std::string& oid, ObjectWriteOperation *op);
+    int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
+    int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
+    int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
+    /**
+     * Schedule an async write operation with explicit snapshot parameters
+     *
+     * This is the same as the first aio_operate(), except that it
+     * gets the snapshot context from its arguments instead of the
+     * IoCtx internal state.
+     *
+     * @param oid the object to operate on
+     * @param c what to do when the operation is complete and safe
+     * @param op which operations to perform
+     * @param seq latest selfmanaged snapshot sequence number for this object
+     * @param snaps currently existing selfmanaged snapshot ids for this object
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_operate(const std::string& oid, AioCompletion *c,
+		    ObjectWriteOperation *op, snap_t seq,
+		    std::vector<snap_t>& snaps);
+    int aio_operate(const std::string& oid, AioCompletion *c,
+        ObjectWriteOperation *op, snap_t seq,
+        std::vector<snap_t>& snaps,
+        const blkin_trace_info *trace_info);
+    int aio_operate(const std::string& oid, AioCompletion *c,
+        ObjectWriteOperation *op, snap_t seq,
+        std::vector<snap_t>& snaps, int flags,
+        const blkin_trace_info *trace_info);
+    int aio_operate(const std::string& oid, AioCompletion *c,
+		    ObjectReadOperation *op, bufferlist *pbl);
+
+    int aio_operate(const std::string& oid, AioCompletion *c,
+		    ObjectReadOperation *op, snap_t snapid, int flags,
+		    bufferlist *pbl)
+      __attribute__ ((deprecated));
+
+    int aio_operate(const std::string& oid, AioCompletion *c,
+		    ObjectReadOperation *op, int flags,
+		    bufferlist *pbl);
+    int aio_operate(const std::string& oid, AioCompletion *c,
+        ObjectReadOperation *op, int flags,
+        bufferlist *pbl, const blkin_trace_info *trace_info);
+
+    // watch/notify
+    int watch2(const std::string& o, uint64_t *handle,
+	       librados::WatchCtx2 *ctx);
+    int watch3(const std::string& o, uint64_t *handle,
+	       librados::WatchCtx2 *ctx, uint32_t timeout);
+    int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
+	       librados::WatchCtx2 *ctx);
+    int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
+	       librados::WatchCtx2 *ctx, uint32_t timeout);
+    int unwatch2(uint64_t handle);
+    int aio_unwatch(uint64_t handle, AioCompletion *c);
+    /**
+     * Send a notify event to watchers
+     *
+     * Upon completion the pbl bufferlist reply payload will be
+     * encoded like so:
+     *
+     *    le32 num_acks
+     *    {
+     *      le64 gid     global id for the client (for client.1234 that's 1234)
+     *      le64 cookie  cookie for the client
+     *      le32 buflen  length of reply message buffer
+     *      u8 * buflen  payload
+     *    } * num_acks
+     *    le32 num_timeouts
+     *    {
+     *      le64 gid     global id for the client
+     *      le64 cookie  cookie for the client
+     *    } * num_timeouts
+     *
+     *
+     */
+    int notify2(const std::string& o,   ///< object
+		bufferlist& bl,         ///< optional broadcast payload
+		uint64_t timeout_ms,    ///< timeout (in ms)
+		bufferlist *pbl);       ///< reply buffer
+    int aio_notify(const std::string& o,   ///< object
+                   AioCompletion *c,       ///< completion when notify completes
+                   bufferlist& bl,         ///< optional broadcast payload
+                   uint64_t timeout_ms,    ///< timeout (in ms)
+                   bufferlist *pbl);       ///< reply buffer
+
+    int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
+    int list_snaps(const std::string& o, snap_set_t *out_snaps);
+    void set_notify_timeout(uint32_t timeout);
+
+    /// acknowledge a notify we received.
+    void notify_ack(const std::string& o, ///< watched object
+		    uint64_t notify_id,   ///< notify id
+		    uint64_t cookie,      ///< our watch handle
+		    bufferlist& bl);      ///< optional reply payload
+
+    /**
+     * check on watch validity
+     *
+     * Check if a watch is valid.  If so, return the number of
+     * milliseconds since we last confirmed its liveness.  If there is
+     * a known error, return it.
+     *
+     * If there is an error, the watch is no longer valid, and should
+     * be destroyed with unwatch().  If the user is still interested in
+     * the object, a new watch should be created with watch().
+     *
+     * @param cookie watch handle
+     * @returns ms since last confirmed valid, or error
+     */
+    int watch_check(uint64_t cookie);
+
+    // old, deprecated versions
+    int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
+	      librados::WatchCtx *ctx) __attribute__ ((deprecated));
+    int notify(const std::string& o, uint64_t ver, bufferlist& bl)
+      __attribute__ ((deprecated));
+    int unwatch(const std::string& o, uint64_t cookie)
+      __attribute__ ((deprecated));
+
+    /**
+     * Set allocation hint for an object
+     *
+     * This is an advisory operation, it will always succeed (as if it
+     * was submitted with a OP_FAILOK flag set) and is not guaranteed
+     * to do anything on the backend.
+     *
+     * @param o the name of the object
+     * @param expected_object_size expected size of the object, in bytes
+     * @param expected_write_size expected size of writes to the object, in bytes
+     * @returns 0 on success, negative error code on failure
+     */
+    int set_alloc_hint(const std::string& o,
+                       uint64_t expected_object_size,
+                       uint64_t expected_write_size);
+    int set_alloc_hint2(const std::string& o,
+			uint64_t expected_object_size,
+			uint64_t expected_write_size,
+			uint32_t flags);
+
+    // assert version for next sync operations
+    void set_assert_version(uint64_t ver);
+
+    /**
+     * Pin/unpin an object in cache tier
+     *
+     * @param o the name of the object
+     * @returns 0 on success, negative error code on failure
+     */
+    int cache_pin(const std::string& o);
+    int cache_unpin(const std::string& o);
+
+    std::string get_pool_name() const;
+
+    void locator_set_key(const std::string& key);
+    void set_namespace(const std::string& nspace);
+    std::string get_namespace() const;
+
+    int64_t get_id();
+
+    // deprecated versions
+    uint32_t get_object_hash_position(const std::string& oid)
+      __attribute__ ((deprecated));
+    uint32_t get_object_pg_hash_position(const std::string& oid)
+      __attribute__ ((deprecated));
+
+    int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
+    int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);
+
+    config_t cct();
+
+    void set_osdmap_full_try();
+    void unset_osdmap_full_try();
+
+    int application_enable(const std::string& app_name, bool force);
+    int application_enable_async(const std::string& app_name,
+                                 bool force, PoolAsyncCompletion *c);
+    int application_list(std::set<std::string> *app_names);
+    int application_metadata_get(const std::string& app_name,
+                                 const std::string &key,
+                                 std::string *value);
+    int application_metadata_set(const std::string& app_name,
+                                 const std::string &key,
+                                 const std::string& value);
+    int application_metadata_remove(const std::string& app_name,
+                                    const std::string &key);
+    int application_metadata_list(const std::string& app_name,
+                                  std::map<std::string, std::string> *values);
+
+  private:
+    /* You can only get IoCtx instances from Rados */
+    IoCtx(IoCtxImpl *io_ctx_impl_);
+
+    friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+    friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
+    friend class ObjectWriteOperation;  // copy_from needs to see our IoCtxImpl
+
+    IoCtxImpl *io_ctx_impl;
+  };
+
+  struct CEPH_RADOS_API PlacementGroup { // handle type that owns a PlacementGroupImpl through `impl`
+    PlacementGroup();
+    PlacementGroup(const PlacementGroup&); // user-declared: the implicit copy ctor would be deleted by the unique_ptr member
+    ~PlacementGroup();
+    bool parse(const char*); // NOTE(review): presumably parses a textual PG id and reports success - confirm in the implementation
+    std::unique_ptr<PlacementGroupImpl> impl; // owning pointer to the implementation object
+  };
+
+  CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&);
+
+  class CEPH_RADOS_API Rados // cluster-level handle: configuration, pools, admin commands, IoCtx creation
+  {
+  public:
+    static void version(int *major, int *minor, int *extra);
+
+    Rados();
+    explicit Rados(IoCtx& ioctx);
+    ~Rados();
+    static void from_rados_t(rados_t cluster, Rados &rados);
+
+    int init(const char * const id);
+    int init2(const char * const name, const char * const clustername,
+	      uint64_t flags);
+    int init_with_context(config_t cct_);
+    config_t cct();
+    int connect();
+    void shutdown();
+    int watch_flush();
+    int aio_watch_flush(AioCompletion*);
+    int conf_read_file(const char * const path) const;
+    int conf_parse_argv(int argc, const char ** argv) const;
+    int conf_parse_argv_remainder(int argc, const char ** argv,
+				  const char ** remargv) const;
+    int conf_parse_env(const char *env) const;
+    int conf_set(const char *option, const char *value);
+    int conf_get(const char *option, std::string &val);
+
+    int service_daemon_register(
+      const std::string& service,  ///< service name (e.g., 'rgw')
+      const std::string& name,     ///< daemon name (e.g., 'gwfoo')
+      const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
+    int service_daemon_update_status(
+      std::map<std::string,std::string>&& status);
+
+    int pool_create(const char *name);
+    int pool_create(const char *name, uint64_t auid)
+      __attribute__ ((deprecated));
+    int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
+      __attribute__ ((deprecated));
+    int pool_create_with_rule(const char *name, uint8_t crush_rule);
+    int pool_create_async(const char *name, PoolAsyncCompletion *c);
+    int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
+      __attribute__ ((deprecated));
+    int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
+      __attribute__ ((deprecated));
+    int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
+    int pool_get_base_tier(int64_t pool, int64_t* base_tier);
+    int pool_delete(const char *name);
+    int pool_delete_async(const char *name, PoolAsyncCompletion *c);
+    int64_t pool_lookup(const char *name);
+    int pool_reverse_lookup(int64_t id, std::string *name);
+
+    uint64_t get_instance_id();
+
+    int get_min_compatible_osd(int8_t* require_osd_release);
+    int get_min_compatible_client(int8_t* min_compat_client,
+                                  int8_t* require_min_compat_client);
+
+    int mon_command(std::string cmd, const bufferlist& inbl,
+		    bufferlist *outbl, std::string *outs);
+    int mgr_command(std::string cmd, const bufferlist& inbl,
+		    bufferlist *outbl, std::string *outs);
+    int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
+                    bufferlist *outbl, std::string *outs);
+    int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
+                   bufferlist *outbl, std::string *outs);
+
+    int ioctx_create(const char *name, IoCtx &pioctx);
+    int ioctx_create2(int64_t pool_id, IoCtx &pioctx);
+
+    // Features useful for test cases
+    void test_blacklist_self(bool set);
+
+    /* pool info */
+    int pool_list(std::list<std::string>& v);
+    int pool_list2(std::list<std::pair<int64_t, std::string> >& v);
+    int get_pool_stats(std::list<std::string>& v,
+		       stats_map& result);
+    /// deprecated; use simpler form.  categories no longer supported.
+    int get_pool_stats(std::list<std::string>& v,
+		       std::map<std::string, stats_map>& stats);
+    /// deprecated; categories no longer supported
+    int get_pool_stats(std::list<std::string>& v,
+                       std::string& category,
+		       std::map<std::string, stats_map>& stats);
+    /// check if pool has selfmanaged snaps
+    bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname);
+
+    int cluster_stat(cluster_stat_t& result);
+    int cluster_fsid(std::string *fsid);
+
+    /**
+     * List inconsistent placement groups in the given pool
+     *
+     * @param pool_id the pool id
+     * @param pgs [out] the inconsistent PGs
+     */
+    int get_inconsistent_pgs(int64_t pool_id,
+                             std::vector<PlacementGroup>* pgs);
+    /**
+     * List the inconsistent objects found in a given PG by last scrub
+     *
+     * @param pg the placement group returned by @c pg_list()
+     * @param start_after the first returned @c objects
+     * @param max_return the max number of the returned @c objects
+     * @param c what to do when the operation is complete and safe
+     * @param objects [out] the objects where inconsistencies are found
+     * @param interval [in,out] an epoch indicating current interval
+     * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+     *          the current interval begin epoch is different.
+     */
+    int get_inconsistent_objects(const PlacementGroup& pg,
+                                 const object_id_t &start_after,
+                                 unsigned max_return,
+                                 AioCompletion *c,
+                                 std::vector<inconsistent_obj_t>* objects,
+                                 uint32_t* interval);
+    /**
+     * List the inconsistent snapsets found in a given PG by last scrub
+     *
+     * @param pg the placement group returned by @c pg_list()
+     * @param start_after the first returned @c objects
+     * @param max_return the max number of the returned @c objects
+     * @param c what to do when the operation is complete and safe
+     * @param snapset [out] the snapsets where inconsistencies are found
+     * @param interval [in,out] an epoch indicating current interval
+     * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+     *          the current interval begin epoch is different.
+     */
+    int get_inconsistent_snapsets(const PlacementGroup& pg,
+                                  const object_id_t &start_after,
+                                  unsigned max_return,
+                                  AioCompletion *c,
+                                  std::vector<inconsistent_snapset_t>* snapset,
+                                  uint32_t* interval);
+
+    /// get/wait for the most recent osdmap
+    int wait_for_latest_osdmap();
+
+    int blacklist_add(const std::string& client_address,
+                      uint32_t expire_seconds);
+
+    /*
+     * pool aio
+     *
+     * It is up to the caller to release the completion handler, even if the pool_create_async()
+     * and/or pool_delete_async() fails and does not send the async request
+     */
+    static PoolAsyncCompletion *pool_async_create_completion();
+
+    // -- aio --
+    static AioCompletion *aio_create_completion();
+    static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete,
+						callback_t cb_safe);
+    
+    friend std::ostream& operator<<(std::ostream &oss, const Rados& r);
+  private:
+    // Copying and assignment are not allowed: both are declared private here
+    Rados(const Rados& rhs);
+    const Rados& operator=(const Rados& rhs);
+    RadosClient *client; // pointer to the underlying RadosClient; ownership is not visible from this header
+  };
+
+} // namespace v14_2_0
+} // namespace librados
+
+#endif
+
diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp
new file mode 100644
index 00000000..8926d097
--- /dev/null
+++ b/src/include/rados/librados_fwd.hpp
@@ -0,0 +1,32 @@
+#ifndef __LIBRADOS_FWD_HPP
+#define __LIBRADOS_FWD_HPP
+
+namespace libradosstriper {
+
+class RadosStriper;
+
+} // namespace libradosstriper
+
+namespace librados { // forward declarations of the public librados C++ API types
+inline namespace v14_2_0 {
+
+class AioCompletion;
+class IoCtx;
+class ListObject;
+class NObjectIterator;
+class ObjectCursor;
+class ObjectItem;
+class ObjectOperation;
+class ObjectOperationCompletion;
+class ObjectReadOperation;
+class ObjectWriteOperation;
+struct PlacementGroup; // defined with the `struct` class-key in librados.hpp; keep the tags matching
+class PoolAsyncCompletion;
+class Rados;
+class WatchCtx;
+class WatchCtx2;
+
+} // inline namespace v14_2_0
+} // namespace librados
+
+#endif // __LIBRADOS_FWD_HPP
diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h
new file mode 100644
index 00000000..c20e96be
--- /dev/null
+++ b/src/include/rados/librgw.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_LIBRGW_H
+#define CEPH_LIBRGW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_VER_MAJOR 1
+#define LIBRGW_VER_MINOR 1
+#define LIBRGW_VER_EXTRA 0
+
+#define LIBRGW_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj<<16, min<<8 and extra into one integer; each argument is parenthesized so expression arguments keep their own precedence */
+#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA)
+
+typedef void* librgw_t;
+int librgw_create(librgw_t *rgw, int argc, char **argv);
+void librgw_shutdown(librgw_t rgw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRGW_H */
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 00000000..80ae69d2
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#define CEPH_CLS_API [[gnu::visibility("default")]]
+
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+
+#define CLS_INIT(name) \
+CEPH_CLS_API void __cls_init()
+
+#define CLS_METHOD_RD       0x1 /// method executes read operations
+#define CLS_METHOD_WR       0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE  0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...)                                        \
+  cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+  __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+    class ceph::buffer::list *inbl, class ceph::buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass     class handle (cls_handle_t)
+ * @param method     method name, as a C string
+ * @param flags      int flags; presumably a mask of the CLS_METHOD_* values - confirm
+ * @param class_call C++ entry point, of type cls_method_cxx_call_t
+ * @param handle     pointer to a cls_method_handle_t; presumably receives the new method handle - confirm
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+                                   cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx      method context handle (cls_method_context_t)
+ * @param exclusive bool flag; presumably makes creation fail if the object already exists - confirm
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx method context handle (cls_method_context_t)
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx  method context handle (cls_method_context_t)
+ * @param size  pointer to a uint64_t; presumably receives the object size - confirm
+ * @param mtime pointer to a time_t; presumably receives the modification time - confirm
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx method context handle (cls_method_context_t)
+ * @param ofs  read offset; declared as int, so values above INT_MAX cannot be passed
+ * @param len  read length; declared as int, same range limit as ofs
+ * @param bl   pointer to the ceph::bufferlist used for the data read
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx method context handle (cls_method_context_t)
+ * @param ofs  write offset; declared as int, so values above INT_MAX cannot be passed
+ * @param len  write length; declared as int, same range limit as ofs
+ * @param bl   pointer to the ceph::bufferlist holding the data to write
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx  method context handle (cls_method_context_t)
+ * @param name  xattr name, as a C string
+ * @param outbl pointer to the ceph::bufferlist for the xattr value (output, per its name)
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+                            ceph::bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx method context handle (cls_method_context_t)
+ * @param name xattr name, as a C string
+ * @param inbl pointer to the ceph::bufferlist holding the xattr value (input, per its name)
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+                            ceph::bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx  method context handle (cls_method_context_t)
+ * @param key   key to look up, as a std::string
+ * @param outbl pointer to the ceph::bufferlist for the value (output, per its name)
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+                               const std::string &key, ceph::bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx method context handle (cls_method_context_t)
+ * @param key  key to set, as a std::string
+ * @param inbl pointer to the ceph::bufferlist holding the value (input, per its name)
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+                               const std::string &key, ceph::bufferlist *inbl);
+
+#endif
+
+#endif
diff --git a/src/include/rados/page.h b/src/include/rados/page.h
new file mode 120000
index 00000000..cf983e83
--- /dev/null
+++ b/src/include/rados/page.h
@@ -0,0 +1 @@
+../page.h
+\ No newline at end of file
diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h
new file mode 100644
index 00000000..0712f489
--- /dev/null
+++ b/src/include/rados/rados_types.h
@@ -0,0 +1,29 @@
+#ifndef CEPH_RADOS_TYPES_H
+#define CEPH_RADOS_TYPES_H
+
+#include <stdint.h>
+
+/**
+ * @struct obj_watch_t
+ * One item from list_watchers: describes a single watcher
+ */
+struct obj_watch_t {
+  /// Address of the watcher, held in a fixed 256-byte char buffer
+  char addr[256];
+  /// Watcher ID (signed 64-bit)
+  int64_t watcher_id;
+  /// Cookie (unsigned 64-bit)
+  uint64_t cookie;
+  /// Timeout in seconds
+  uint32_t timeout_seconds;
+}; 
+
+/**
+ *
+ * Pass as nspace argument to rados_ioctx_set_namespace()
+ * before calling rados_nobjects_list_open() to return
+ * all objects in all namespaces.
+ */
+#define	LIBRADOS_ALL_NSPACES "\001"
+
+#endif
diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp
new file mode 100644
index 00000000..8c02dd83
--- /dev/null
+++ b/src/include/rados/rados_types.hpp
@@ -0,0 +1,331 @@
+#ifndef CEPH_RADOS_TYPES_HPP
+#define CEPH_RADOS_TYPES_HPP
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "buffer.h"
+#include "rados_types.h"
+
+namespace librados {
+
+typedef uint64_t snap_t;
+
+enum {
+  SNAP_HEAD = (uint64_t)(-2),
+  SNAP_DIR = (uint64_t)(-1)
+};
+
+struct clone_info_t {
+  snap_t cloneid;
+  std::vector<snap_t> snaps;          // ascending
+  std::vector< std::pair<uint64_t,uint64_t> > overlap;  // with next newest
+  uint64_t size;
+  clone_info_t() : cloneid(0), size(0) {}
+};
+
+struct snap_set_t {
+  std::vector<clone_info_t> clones;   // ascending
+  snap_t seq;   // newest snapid seen by the object
+  snap_set_t() : seq(0) {}
+};
+
+struct object_id_t {
+  std::string name;
+  std::string nspace;
+  std::string locator;
+  snap_t snap = 0;
+  object_id_t() = default;
+  object_id_t(const std::string& name,
+              const std::string& nspace,
+              const std::string& locator,
+              snap_t snap)
+    : name(name),
+      nspace(nspace),
+      locator(locator),
+      snap(snap)
+  {}
+};
+
+struct err_t {
+  enum : uint64_t {
+    SHARD_MISSING        = 1 << 1,
+    SHARD_STAT_ERR       = 1 << 2,
+    SHARD_READ_ERR       = 1 << 3,
+    DATA_DIGEST_MISMATCH_OI = 1 << 9,   // Old
+    DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+    OMAP_DIGEST_MISMATCH_OI = 1 << 10,  // Old
+    OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+    SIZE_MISMATCH_OI        = 1 << 11,  // Old
+    SIZE_MISMATCH_INFO        = 1 << 11,
+    SHARD_EC_HASH_MISMATCH  = 1 << 12,
+    SHARD_EC_SIZE_MISMATCH  = 1 << 13,
+    OI_ATTR_MISSING         = 1 << 14, // Old
+    INFO_MISSING         = 1 << 14,
+    OI_ATTR_CORRUPTED       = 1 << 15, // Old
+    INFO_CORRUPTED       = 1 << 15,
+    SS_ATTR_MISSING         = 1 << 16, // Old
+    SNAPSET_MISSING         = 1 << 16,
+    SS_ATTR_CORRUPTED       = 1 << 17, // Old
+    SNAPSET_CORRUPTED       = 1 << 17,
+    OBJ_SIZE_OI_MISMATCH      = 1 << 18, // Old
+    OBJ_SIZE_INFO_MISMATCH      = 1 << 18,
+    HINFO_MISSING         = 1 << 19,
+    HINFO_CORRUPTED       = 1 << 20
+    // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+  };
+  uint64_t errors = 0;
+  static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+  static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+  bool has_shard_missing() const {
+    return errors & SHARD_MISSING;
+  }
+  bool has_stat_error() const {
+    return errors & SHARD_STAT_ERR;
+  }
+  bool has_read_error() const {
+    return errors & SHARD_READ_ERR;
+  }
+  bool has_data_digest_mismatch_oi() const {   // Compatibility
+    return errors & DATA_DIGEST_MISMATCH_OI;
+  }
+  bool has_data_digest_mismatch_info() const {
+    return errors & DATA_DIGEST_MISMATCH_INFO;
+  }
+  bool has_omap_digest_mismatch_oi() const {   // Compatibility
+    return errors & OMAP_DIGEST_MISMATCH_OI;
+  }
+  bool has_omap_digest_mismatch_info() const {
+    return errors & OMAP_DIGEST_MISMATCH_INFO;
+  }
+  bool has_size_mismatch_oi() const {   // Compatibility
+    return errors & SIZE_MISMATCH_OI;
+  }
+  bool has_size_mismatch_info() const {
+    return errors & SIZE_MISMATCH_INFO;
+  }
+  bool has_ec_hash_error() const {
+    return errors & SHARD_EC_HASH_MISMATCH;
+  }
+  bool has_ec_size_error() const {
+    return errors & SHARD_EC_SIZE_MISMATCH;
+  }
+  bool has_oi_attr_missing() const {    // Compatibility
+    return errors & OI_ATTR_MISSING;
+  }
+  bool has_info_missing() const {
+    return errors & INFO_MISSING;
+  }
+  bool has_oi_attr_corrupted() const {	 // Compatibility
+    return errors & OI_ATTR_CORRUPTED;
+  }
+  bool has_info_corrupted() const {
+    return errors & INFO_CORRUPTED;
+  }
+  bool has_ss_attr_missing() const {	// Compatibility
+    return errors & SS_ATTR_MISSING;
+  }
+  bool has_snapset_missing() const {
+    return errors & SNAPSET_MISSING;
+  }
+  bool has_ss_attr_corrupted() const {	// Compatibility
+    return errors & SS_ATTR_CORRUPTED;
+  }
+  bool has_snapset_corrupted() const {
+    return errors & SNAPSET_CORRUPTED;
+  }
+  bool has_shallow_errors() const {
+    return errors & SHALLOW_ERRORS;
+  }
+  bool has_deep_errors() const {
+    return errors & DEEP_ERRORS;
+  }
+  bool has_obj_size_oi_mismatch() const {   // Compatibility
+    return errors & OBJ_SIZE_OI_MISMATCH;
+   }
+  bool has_obj_size_info_mismatch() const {
+    return errors & OBJ_SIZE_INFO_MISMATCH;
+  }
+  bool has_hinfo_missing() const {
+    return errors & HINFO_MISSING;
+  }
+  bool has_hinfo_corrupted() const {
+    return errors & HINFO_CORRUPTED;
+  }
+};
+
+struct shard_info_t : err_t {
+  std::map<std::string, ceph::bufferlist> attrs;
+  uint64_t size = -1;
+  bool omap_digest_present = false;
+  uint32_t omap_digest = 0;
+  bool data_digest_present = false;
+  uint32_t data_digest = 0;
+  bool selected_oi = false;
+  bool primary = false;
+};
+
+struct osd_shard_t {  // key type of inconsistent_obj_t::shards
+  int32_t osd;
+  int8_t shard;
+};
+
+inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) {
+  // Strict weak ordering: compare the osd field first ...
+  if (lhs.osd != rhs.osd) {
+    return lhs.osd < rhs.osd;
+  }
+  // ... and let the shard field break ties between equal osd values.
+  return lhs.shard < rhs.shard;
+}
+
+struct obj_err_t {
+  enum : uint64_t {
+    OBJECT_INFO_INCONSISTENCY   = 1 << 1,
+    // XXX: Can an older rados binary work if these bits stay the same?
+    DATA_DIGEST_MISMATCH = 1 << 4,
+    OMAP_DIGEST_MISMATCH = 1 << 5,
+    SIZE_MISMATCH        = 1 << 6,
+    ATTR_VALUE_MISMATCH  = 1 << 7,
+    ATTR_NAME_MISMATCH    = 1 << 8,
+    SNAPSET_INCONSISTENCY   = 1 << 9,
+    HINFO_INCONSISTENCY   = 1 << 10,
+    SIZE_TOO_LARGE        = 1 << 11,
+    // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+  };
+  uint64_t errors = 0;
+  static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH
+	  |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE;
+  static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
+  bool has_object_info_inconsistency() const {
+    return errors & OBJECT_INFO_INCONSISTENCY;
+  }
+  bool has_data_digest_mismatch() const {
+    return errors & DATA_DIGEST_MISMATCH;
+  }
+  bool has_omap_digest_mismatch() const {
+    return errors & OMAP_DIGEST_MISMATCH;
+  }
+  bool has_size_mismatch() const {
+    return errors & SIZE_MISMATCH;
+  }
+  bool has_attr_value_mismatch() const {
+    return errors & ATTR_VALUE_MISMATCH;
+  }
+  bool has_attr_name_mismatch() const {
+    return errors & ATTR_NAME_MISMATCH;
+  }
+  bool has_shallow_errors() const {
+    return errors & SHALLOW_ERRORS;
+  }
+  bool has_deep_errors() const {
+    return errors & DEEP_ERRORS;
+  }
+  bool has_snapset_inconsistency() const {
+    return errors & SNAPSET_INCONSISTENCY;
+  }
+  bool has_hinfo_inconsistency() const {
+    return errors & HINFO_INCONSISTENCY;
+  }
+  bool has_size_too_large() const {
+    return errors & SIZE_TOO_LARGE;
+  }
+};
+
+struct inconsistent_obj_t : obj_err_t {  // per-object error bits plus per-shard details
+  inconsistent_obj_t() = default;
+  inconsistent_obj_t(const object_id_t& object)
+    : object{object}, version(0)
+  {}
+  object_id_t object;
+  uint64_t version = 0;  // XXX: Redundant with object info attr; zero-initialized so the defaulted ctor never leaves it indeterminate
+  std::map<osd_shard_t, shard_info_t> shards;  // one entry per (osd, shard) pair
+  err_t union_shards;
+};
+
+struct inconsistent_snapset_t {
+  inconsistent_snapset_t() = default;
+  inconsistent_snapset_t(const object_id_t& head)
+    : object{head}
+  {}
+  enum {
+    SNAPSET_MISSING = 1 << 0,
+    SNAPSET_CORRUPTED = 1 << 1,
+    CLONE_MISSING  = 1 << 2,
+    SNAP_ERROR  = 1 << 3,
+    HEAD_MISMATCH  = 1 << 4,  // Unused
+    HEADLESS_CLONE = 1 << 5,
+    SIZE_MISMATCH  = 1 << 6,
+    OI_MISSING   = 1 << 7,    // Old
+    INFO_MISSING   = 1 << 7,
+    OI_CORRUPTED = 1 << 8,    // Old
+    INFO_CORRUPTED = 1 << 8,
+    EXTRA_CLONES = 1 << 9,
+  };
+  uint64_t errors = 0;
+  object_id_t object;
+  // Extra clones
+  std::vector<snap_t> clones;
+  std::vector<snap_t> missing;
+  ceph::bufferlist ss_bl;
+
+  bool ss_attr_missing() const {     // Compatibility
+    return errors & SNAPSET_MISSING;
+  }
+  bool snapset_missing() const {
+    return errors & SNAPSET_MISSING;
+  }
+  bool ss_attr_corrupted() const {   // Compatibility
+    return errors & SNAPSET_CORRUPTED;
+  }
+  bool snapset_corrupted() const {
+    return errors & SNAPSET_CORRUPTED;
+  }
+  bool clone_missing() const  {
+    return errors & CLONE_MISSING;
+  }
+  bool snapset_mismatch() const {    // Compatibility
+    return errors & SNAP_ERROR;
+  }
+  bool snapset_error() const {
+    return errors & SNAP_ERROR;
+  }
+  bool head_mismatch() const {      // Compatibility
+    return false;
+  }
+  bool headless() const {
+    return errors & HEADLESS_CLONE;
+  }
+  bool size_mismatch() const {
+    return errors & SIZE_MISMATCH;
+  }
+  bool oi_attr_missing() const {   // Compatibility
+    return errors & OI_MISSING;
+  }
+  bool info_missing() const {
+    return errors & INFO_MISSING;
+  }
+  bool oi_attr_corrupted() const {  // Compatibility
+    return errors & OI_CORRUPTED;
+  }
+  bool info_corrupted() const {
+    return errors & INFO_CORRUPTED;
+  }
+  bool extra_clones() const {
+    return errors & EXTRA_CLONES;
+  }
+};
+
+/**
+ * @var all_nspaces
+ * Pass as nspace argument to IoCtx::set_namespace()
+ * before calling nobjects_begin() to iterate
+ * through all objects in all namespaces.
+ */
+const std::string all_nspaces(LIBRADOS_ALL_NSPACES);
+
+}
+#endif
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
new file mode 100644
index 00000000..66cf627a
--- /dev/null
+++ b/src/include/rados/rgw_file.h
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * convert RGW commands to file commands
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef RADOS_RGW_FILE_H
+#define RADOS_RGW_FILE_H
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "librgw.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_FILE_VER_MAJOR 1
+#define LIBRGW_FILE_VER_MINOR 1
+#define LIBRGW_FILE_VER_EXTRA 7
+/* Packs a version triple as (maj << 16) + (min << 8) + extra; arguments are parenthesized so expression arguments expand correctly. */
+#define LIBRGW_FILE_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
+
+/*
+ * object types
+ */
+enum rgw_fh_type {
+  RGW_FS_TYPE_NIL = 0,
+  RGW_FS_TYPE_FILE,
+  RGW_FS_TYPE_DIRECTORY,
+  RGW_FS_TYPE_SYMBOLIC_LINK,
+};
+
+/*
+ * dynamic allocated handle to support nfs handle
+ */
+
+/* content-addressable hash */
+struct rgw_fh_hk {
+  uint64_t bucket;
+  uint64_t object;
+};
+
+struct rgw_file_handle
+{
+  /* content-addressable hash */
+  struct rgw_fh_hk fh_hk;
+  void *fh_private; /* librgw private data */
+  /* object type */
+  enum rgw_fh_type fh_type;
+};
+
+struct rgw_fs
+{
+  librgw_t rgw;
+  void *fs_private;
+  struct rgw_file_handle* root_fh;
+};
+
+
+/* XXX mount info hypothetical--emulate Unix, support at least
+ * UUID-length fsid */
+struct rgw_statvfs {
+    uint64_t  f_bsize;    /* file system block size */
+    uint64_t  f_frsize;   /* fragment size */
+    uint64_t     f_blocks;   /* size of fs in f_frsize units */
+    uint64_t     f_bfree;    /* # free blocks */
+    uint64_t     f_bavail;   /* # free blocks for unprivileged users */
+    uint64_t     f_files;    /* # inodes */
+    uint64_t     f_ffree;    /* # free inodes */
+    uint64_t     f_favail;   /* # free inodes for unprivileged users */
+    uint64_t     f_fsid[2];     /* file system ID */
+    uint64_t     f_flag;     /* mount flags */
+    uint64_t     f_namemax;  /* maximum filename length */
+};
+
+
+void rgwfile_version(int *major, int *minor, int *extra);
+
+/*
+  lookup object by name (POSIX style)
+*/
+#define RGW_LOOKUP_FLAG_NONE    0x0000
+#define RGW_LOOKUP_FLAG_CREATE  0x0001
+#define RGW_LOOKUP_FLAG_RCB     0x0002 /* readdir callback hint */
+#define RGW_LOOKUP_FLAG_DIR     0x0004
+#define RGW_LOOKUP_FLAG_FILE    0x0008
+
+#define RGW_LOOKUP_TYPE_FLAGS \
+  (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE)
+
+int rgw_lookup(struct rgw_fs *rgw_fs,
+	      struct rgw_file_handle *parent_fh, const char *path,
+	      struct rgw_file_handle **fh,
+	      struct stat *st, uint32_t mask, uint32_t flags);
+
+/*
+  lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+		      struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ * release file handle
+ */
+#define RGW_FH_RELE_FLAG_NONE   0x0000
+
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+		uint32_t flags);
+
+/*
+ attach rgw namespace
+*/
+#define RGW_MOUNT_FLAG_NONE     0x0000
+
+int rgw_mount(librgw_t rgw, const char *uid, const char *key,
+	      const char *secret, struct rgw_fs **rgw_fs,
+	      uint32_t flags);
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+               const char *secret, const char *root, struct rgw_fs **rgw_fs,
+               uint32_t flags);
+
+/*
+ register invalidate callbacks
+*/
+#define RGW_REG_INVALIDATE_FLAG_NONE    0x0000
+
+typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk);
+
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+			    void *arg, uint32_t flags);
+
+/*
+ detach rgw namespace
+*/
+#define RGW_UMOUNT_FLAG_NONE    0x0000
+
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags);
+
+
+/*
+  get filesystem attributes
+*/
+#define RGW_STATFS_FLAG_NONE     0x0000
+
+int rgw_statfs(struct rgw_fs *rgw_fs,
+	       struct rgw_file_handle *parent_fh,
+	       struct rgw_statvfs *vfs_st,
+	       uint32_t flags);
+
+
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE   1
+#define RGW_SETATTR_UID    2
+#define RGW_SETATTR_GID    4
+#define RGW_SETATTR_MTIME  8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE  32
+#define RGW_SETATTR_CTIME 64
+
+/*
+  create file
+*/
+#define RGW_CREATE_FLAG_NONE     0x0000
+
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+	       const char *name, struct stat *st, uint32_t mask,
+	       struct rgw_file_handle **fh, uint32_t posix_flags,
+	       uint32_t flags);
+
+/*
+  create a symbolic link
+ */
+#define RGW_CREATELINK_FLAG_NONE     0x0000
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+               const char *name, const char *link_path, struct stat *st, 
+               uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags,
+               uint32_t flags);
+
+/*
+  create a new directory
+*/
+#define RGW_MKDIR_FLAG_NONE      0x0000
+
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+	      struct rgw_file_handle *parent_fh,
+	      const char *name, struct stat *st, uint32_t mask,
+	      struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+  rename object
+*/
+#define RGW_RENAME_FLAG_NONE      0x0000
+
+int rgw_rename(struct rgw_fs *rgw_fs,
+	       struct rgw_file_handle *olddir, const char* old_name,
+	       struct rgw_file_handle *newdir, const char* new_name,
+	       uint32_t flags);
+
+/*
+  remove file or directory
+*/
+#define RGW_UNLINK_FLAG_NONE      0x0000
+
+int rgw_unlink(struct rgw_fs *rgw_fs,
+	       struct rgw_file_handle *parent_fh, const char* path,
+	       uint32_t flags);
+
+/*
+    read  directory content
+*/
+typedef bool (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset,
+			       struct stat *st, uint32_t mask,
+			       uint32_t flags);
+
+#define RGW_READDIR_FLAG_NONE      0x0000
+#define RGW_READDIR_FLAG_DOTDOT    0x0001 /* send dot names */
+
+int rgw_readdir(struct rgw_fs *rgw_fs,
+		struct rgw_file_handle *parent_fh, uint64_t *offset,
+		rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+		uint32_t flags);
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+		 struct rgw_file_handle *parent_fh, const char *name,
+		 rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+		 uint32_t flags);
+
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+		      struct rgw_file_handle *parent_fh,
+		      const char *name, int64_t *offset,
+		      uint32_t flags);
+
+/*
+   get unix attributes for object
+*/
+#define RGW_GETATTR_FLAG_NONE      0x0000
+
+int rgw_getattr(struct rgw_fs *rgw_fs,
+		struct rgw_file_handle *fh, struct stat *st,
+		uint32_t flags);
+
+/*
+   set unix attributes for object
+*/
+#define RGW_SETATTR_FLAG_NONE      0x0000
+
+int rgw_setattr(struct rgw_fs *rgw_fs,
+		struct rgw_file_handle *fh, struct stat *st,
+		uint32_t mask, uint32_t flags);
+
+/*
+   truncate file
+*/
+#define RGW_TRUNCATE_FLAG_NONE     0x0000
+
+int rgw_truncate(struct rgw_fs *rgw_fs,
+		 struct rgw_file_handle *fh, uint64_t size,
+		 uint32_t flags);
+
+/*
+   open file
+*/
+#define RGW_OPEN_FLAG_NONE         0x0000
+#define RGW_OPEN_FLAG_CREATE       0x0001
+#define RGW_OPEN_FLAG_V3           0x0002 /* ops have v3 semantics */
+#define RGW_OPEN_FLAG_STATELESS    0x0002 /* alias it */
+
+int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+	     uint32_t posix_flags, uint32_t flags);
+
+/*
+   close file
+*/
+
+#define RGW_CLOSE_FLAG_NONE        0x0000
+#define RGW_CLOSE_FLAG_RELE        0x0001
+  
+int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+	      uint32_t flags);
+
+/*
+   read data from file
+*/
+#define RGW_READ_FLAG_NONE 0x0000
+
+int rgw_read(struct rgw_fs *rgw_fs,
+	     struct rgw_file_handle *fh, uint64_t offset,
+	     size_t length, size_t *bytes_read, void *buffer,
+	     uint32_t flags);
+
+/*
+   read symbolic link
+*/
+#define RGW_READLINK_FLAG_NONE 0x0000
+
+int rgw_readlink(struct rgw_fs *rgw_fs,
+	     struct rgw_file_handle *fh, uint64_t offset,
+	     size_t length, size_t *bytes_read, void *buffer,
+	     uint32_t flags);
+
+/*
+   write data to file
+*/
+#define RGW_WRITE_FLAG_NONE      0x0000
+
+int rgw_write(struct rgw_fs *rgw_fs,
+	      struct rgw_file_handle *fh, uint64_t offset,
+	      size_t length, size_t *bytes_written, void *buffer,
+	      uint32_t flags);
+
+#define RGW_UIO_NONE    0x0000
+#define RGW_UIO_GIFT    0x0001
+#define RGW_UIO_FREE    0x0002
+#define RGW_UIO_BUFQ    0x0004
+
+struct rgw_uio;
+typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t);
+
+/* buffer vector descriptors */
+struct rgw_vio {
+  void *vio_p1;
+  void *vio_u1;
+  void *vio_base;
+  int32_t vio_len;
+};
+  
+struct rgw_uio {
+  rgw_uio_release uio_rele;
+  void *uio_p1;
+  void *uio_u1;
+  uint64_t uio_offset;
+  uint64_t uio_resid;
+  uint32_t uio_cnt;
+  uint32_t uio_flags;
+  struct rgw_vio *uio_vio; /* appended vectors */
+};
+
+typedef struct rgw_uio rgw_uio;
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+	      struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+int rgw_writev(struct rgw_fs *rgw_fs,
+	       struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+/*
+   sync written data
+*/
+#define RGW_FSYNC_FLAG_NONE        0x0000
+
+int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+	      uint32_t flags);
+
+/*
+   NFS commit operation
+*/
+
+#define RGW_COMMIT_FLAG_NONE        0x0000
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+	       uint64_t offset, uint64_t length, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RADOS_RGW_FILE_H */
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
new file mode 100644
index 00000000..7eb33596
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.h
@@ -0,0 +1,610 @@
+#ifndef CEPH_LIBRADOSSTRIPER_H
+#define CEPH_LIBRADOSSTRIPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "../rados/librados.h"
+
+#define LIBRADOSSTRIPER_VER_MAJOR 0
+#define LIBRADOSSTRIPER_VER_MINOR 0
+#define LIBRADOSSTRIPER_VER_EXTRA 0
+/* Packs a version triple as (maj << 16) + (min << 8) + extra; arguments are parenthesized so expression arguments expand correctly. */
+#define LIBRADOSSTRIPER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA)
+
+/**
+ * @typedef rados_striper_t
+ *
+ * A handle for interacting with striped objects in a RADOS cluster.
+ */
+typedef void *rados_striper_t;
+
+/**
+ * @defgroup libradosstriper_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ *
+ * @{
+ */
+
+/**
+ * Creates a rados striper using the given io context
+ * Striper has initially default object layout.
+ * See rados_striper_set_object_layout_*() to change this
+ *
+ * @param ioctx the rados context to use
+ * @param striper where to store the rados striper
+ * @returns 0 on success, negative error code on failure
+ */
+  int rados_striper_create(rados_ioctx_t ioctx,
+                           rados_striper_t *striper);
+
+/**
+ * Destroys a rados striper
+ *
+ * @param striper the striper to destroy
+ */
+void rados_striper_destroy(rados_striper_t striper);
+
+/**
+ * Sets the object layout's stripe unit of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_unit the stripe_unit value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+                                                unsigned int stripe_unit);
+
+/**
+ * Sets the object layout's stripe count of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_count the stripe_count value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+                                                 unsigned int stripe_count);
+
+/**
+ * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param object_size the object_size value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+                                                unsigned int object_size);
+
+/** @} init */
+
+/**
+ * @defgroup libradosstriper_h_synch_io Synchronous I/O
+ * Writes are striped to several rados objects which are then
+ * replicated to a number of OSDs based on the configuration
+ * of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_striper_ioctx_wait_for_complete().
+ *
+ * @{
+ */
+
+/**
+ * Synchronously write data to a striped object at the specified offset
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_write(rados_striper_t striper,
+                        const char *soid,
+                        const char *buf,
+                        size_t len,
+                        uint64_t off);
+
+/**
+ * Synchronously write an entire striped object
+ *
+ * The striped object is filled with the provided data. If the striped object exists,
+ * it is truncated and then written.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_write_full(rados_striper_t striper,
+                             const char *soid,
+                             const char *buf,
+                             size_t len);
+
+/**
+ * Append data to an object
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_append(rados_striper_t striper,
+                         const char *soid,
+                         const char *buf,
+                         size_t len);
+
+/**
+ * Synchronously read data from a striped object at the specified offset
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+int rados_striper_read(rados_striper_t striper,
+                       const char *soid,
+                       char *buf,
+                       size_t len,
+                       uint64_t off);
+
+/**
+ * Synchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_remove(rados_striper_t striper,
+                         const char* soid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @note the truncation is not fully atomic. The metadata part is,
+ * so the behavior will be atomic from user point of view when
+ * the object size is reduced. However, in case of failure, old data
+ * may stay around, hidden. They may reappear if the object size is
+ * later grown, instead of the expected 0s. When growing the
+ * object and in case of failure, the new 0 data may not be
+ * fully created. This can lead to ENOENT errors when
+ * writing/reading the missing parts.
+ * @note the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ * @param striper the striper in which the truncation will occur
+ * @param soid the name of the striped object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size);
+
+/** @} Synchronous I/O */
+
+/**
+ * @defgroup libradosstriper_h_xattrs Xattrs
+ * Extended attributes are stored as extended attributes on the
+ * first rados regular object of the striped object.
+ * Thus, they have the same limitations as the underlying
+ * rados extended attributes.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the getxattr will occur
+ * @param oid name of the striped object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+int rados_striper_getxattr(rados_striper_t striper,
+                           const char *oid,
+                           const char *name,
+                           char *buf,
+                           size_t len);
+
+/**
+ * Set an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the setxattr will occur
+ * @param oid name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_setxattr(rados_striper_t striper,
+                           const char *oid,
+                           const char *name,
+                           const char *buf,
+                           size_t len);
+
+/**
+ * Delete an extended attribute from a striped object.
+ *
+ * @param striper the striper in which the rmxattr will occur
+ * @param oid name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_rmxattr(rados_striper_t striper,
+                          const char *oid,
+                          const char *name);
+
+/**
+ * Start iterating over xattrs on a striped object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param striper the striper in which the getxattrs will occur
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs(rados_striper_t striper,
+                            const char *oid,
+                            rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the striped object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+                                 const char **name,
+                                 const char **val,
+                                 size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+void rados_striper_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Synchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_stat(rados_striper_t striper,
+                       const char* soid,
+                       uint64_t *psize,
+                       time_t *pmtime);
+
+/**
+ * @defgroup libradosstriper_h_asynch_io Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_striper_multi_completion_t
+ * Represents the state of a set of asynchronous operations.
+ * It contains the aggregated return value once the operations complete,
+ * and can be used to block until all operations are complete and/or safe.
+ */
+typedef void *rados_striper_multi_completion_t;
+
+/**
+ * Constructs a multi completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+int rados_striper_multi_aio_create_completion(void *cb_arg,
+                                              rados_callback_t cb_complete,
+                                              rados_callback_t cb_safe,
+                                              rados_striper_multi_completion_t *pc);
+
+/**
+ * Block until all operations complete
+ *
+ * This means data is in memory on all replicas.
+ *
+ * @param c operations to wait for
+ */
+void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ */
+void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete (as an int used as a boolean)
+ */
+int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe (as an int used as a boolean)
+ */
+int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations complete and the callback has completed
+ *
+ * This means data is in memory on all replicas and can be read.
+ *
+ * @param c operations to wait for
+ */
+void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe and the callback has completed
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ */
+void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Have a multi asynchronous operation and its callback completed?
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe, and has its callback completed?
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Get the return value of a multi asynchronous operation
+ *
+ * The return value is set when all operations are complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operations to inspect
+ * @returns aggregated return value of the operations
+ */
+int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c);
+
+/**
+ * Release a multi asynchronous IO completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c multi completion to release
+ */
+void rados_striper_multi_aio_release(rados_striper_multi_completion_t c);
+
+/**
+ * Asynchronously write data to a striped object at the specified offset
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_write(rados_striper_t striper,
+                            const char *soid,
+                            rados_completion_t completion,
+                            const char *buf,
+                            size_t len,
+                            uint64_t off);
+
+/**
+ * Asynchronously append data to a striped object
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_append(rados_striper_t striper,
+                             const char *soid,
+                             rados_completion_t completion,
+                             const char *buf,
+                             size_t len);
+
+/**
+ * Asynchronously fill an object with the provided data.
+ * If the object exists, it is truncated and then written.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_write_full(rados_striper_t striper,
+                                 const char *soid,
+                                 rados_completion_t completion,
+                                 const char *buf,
+                                 size_t len);
+
+/**
+ * Asynchronously read data from a striped object at the specified offset
+ *
+ * The return value of the completion will be the number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the read is safe and complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_read(rados_striper_t striper,
+                           const char *soid,
+                           rados_completion_t completion,
+                           char *buf,
+                           const size_t len,
+                           uint64_t off);
+
+/**
+ * Asynchronously removes a striped object
+ *
+ * @note The deletion is not atomic: the striped object may be left
+ * incomplete if an error is returned (metadata all present, but some
+ * stripes missing).
+ * However, the metadata deletion itself is atomic, and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Likewise, no I/O will be able to start
+ * during deletion (same EBUSY return code).
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_remove(rados_striper_t striper,
+                             const char* soid,
+                             rados_completion_t completion);
+
+/**
+ * Block until all pending writes in a striper are safe
+ *
+ * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * This function returns nothing, so a failure cannot be reported to the
+ * caller through it.
+ *
+ * @param striper the striper in which the flush will occur
+ */
+void rados_striper_aio_flush(rados_striper_t striper);
+
+/**
+ * Asynchronously get the stats (size/mtime) of a striped object
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_stat(rados_striper_t striper,
+                           const char* soid,
+                           rados_completion_t completion,
+                           uint64_t *psize,
+                           time_t *pmtime);
+
+/** @} Asynchronous I/O */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
new file mode 100644
index 00000000..674a56b7
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -0,0 +1,241 @@
+#ifndef __LIBRADOSSTRIPER_HPP
+#define __LIBRADOSSTRIPER_HPP
+
+#include <string.h>
+#include <string>
+#include <map>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+
+#include "libradosstriper.h"
+
+namespace libradosstriper
+{
+  struct RadosStriperImpl;
+  struct MultiAioCompletionImpl;
+
+  /*
+   * Completion object for multiple asynchronous IO
+   * It allows to internally handle several "requests"
+   *
+   * This struct is a thin handle around a MultiAioCompletionImpl, which is
+   * only forward-declared in this header; every member function is defined
+   * elsewhere.
+   */
+  struct MultiAioCompletion {
+    // Wraps an existing implementation object. The constructor is not
+    // explicit, so a MultiAioCompletionImpl* converts implicitly.
+    MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {}
+    ~MultiAioCompletion();
+    // Register the callback (and its application-defined argument) for the
+    // "complete" and "safe" events respectively.
+    int set_complete_callback(void *cb_arg, librados::callback_t cb);
+    int set_safe_callback(void *cb_arg, librados::callback_t cb);
+    // Blocking waits; the *_and_cb variants presumably also wait for the
+    // matching callback to finish, like the C API functions of the same
+    // name -- TODO confirm against the implementation.
+    void wait_for_complete();
+    void wait_for_safe();
+    void wait_for_complete_and_cb();
+    void wait_for_safe_and_cb();
+    // Non-blocking state queries.
+    bool is_complete();
+    bool is_safe();
+    bool is_complete_and_cb();
+    bool is_safe_and_cb();
+    // Return value of the tracked operations.
+    int get_return_value();
+    // Release this completion once it is no longer needed.
+    void release();
+    // Pointer to the implementation object; public, set by the constructor.
+    MultiAioCompletionImpl *pc;
+  };
+
+  /* RadosStriper : This class allows to perform read/writes on striped objects
+   *
+   * Typical use (error checking omitted):
+   *
+   * librados::IoCtx ioctx;
+   * ... initialize ioctx ...
+   * RadosStriper rs;
+   * RadosStriper::striper_create(ioctx, &rs);
+   * bufferlist bl;
+   * ... put data in bl ...
+   * rs.write(object_name, bl, len, offset);
+   * bufferlist bl2;
+   * rs.read(object_name, &bl2, len, offset);
+   * ...
+   */
+  class RadosStriper
+  {
+  public:
+
+    /*
+     * constructor
+     */
+    RadosStriper();
+
+    /*
+     * builds the C counter part of a RadosStriper
+     * The resulting handle is stored in *s.
+     */
+    static void to_rados_striper_t(RadosStriper &striper,
+                                   rados_striper_t *s);
+
+    /*
+     * copy constructor
+     */
+    RadosStriper(const RadosStriper& rs);
+
+    /*
+     * operator=
+     */
+    RadosStriper& operator=(const RadosStriper& rs);
+
+    /*
+     * destructor
+     * Internally calling close() if an object is currently opened
+     */
+    ~RadosStriper();
+
+    /*
+     * create method
+     * Sets up *striper on top of the given ioctx.
+     * NOTE(review): the int result is presumably 0 on success and a
+     * negative error code on failure, as in the C API -- confirm.
+     */
+    static int striper_create(librados::IoCtx& ioctx,
+                              RadosStriper *striper);
+
+    /*
+     * set object layout's stripe unit
+     * This layout will be used when new objects are created (by writing to them)
+     * Already existing objects will be opened with their own layout.
+     */
+    int set_object_layout_stripe_unit(unsigned int stripe_unit);
+
+    /*
+     * set object layout's stripe count
+     * This layout will be used when new objects are created (by writing to them)
+     * Already existing objects will be opened with their own layout.
+     */
+    int set_object_layout_stripe_count(unsigned int stripe_count);
+
+    /*
+     * set object layout's object size
+     * This layout will be used when new objects are created (by writing to them)
+     * Already existing objects will be opened with their own layout.
+     */
+    int set_object_layout_object_size(unsigned int object_size);
+
+    /**
+     * Get the value of an extended attribute on a striped object
+     * The value is returned through @p bl.
+     */
+    int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+    /**
+     * Set the value of an extended attribute on a striped object
+     */
+    int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+    /**
+     * Delete an extended attribute from a striped object
+     */
+    int rmxattr(const std::string& oid, const char *name);
+
+    /**
+     * Get the extended attributes of a striped object.
+     * Unlike the C API there is no iterator here: the attributes are
+     * returned through @p attrset, keyed by name.
+     */
+    int getxattrs(const std::string& oid,
+                  std::map<std::string, ceph::bufferlist>& attrset);
+
+    /**
+     * synchronously write to the striped object at the specified offset.
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm
+     * whether that still holds.
+     */
+    int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+    /**
+     * synchronously fill the striped object with the specified data
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm.
+     */
+    int write_full(const std::string& soid, const ceph::bufferlist& bl);
+
+    /**
+     * synchronously append data to the striped object
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm.
+     */
+    int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
+
+    /**
+     * asynchronously write to the striped object at the specified offset.
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm.
+     */
+    int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+    /**
+     * asynchronously fill the striped object with the specified data
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm.
+     */
+    int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
+
+    /**
+     * asynchronously append data to the striped object
+     * NOTE(review): earlier documentation said this call steals the
+     * contents of @p bl, but @p bl is taken by const reference -- confirm.
+     */
+    int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
+
+    /**
+     * synchronously read from the striped object at the specified offset.
+     * The data read is returned through @p pbl.
+     */
+    int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off);
+
+    /**
+     * asynchronously read from the striped object at the specified offset.
+     * The data read is returned through @p pbl.
+     */
+    int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off);
+
+    /**
+     * synchronously get striped object stats (size/mtime)
+     * stat() reports the modification time as a time_t, stat2() as a
+     * struct timespec.
+     */
+    int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+    int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts);
+
+    /**
+     * asynchronously get striped object stats (size/mtime)
+     * aio_stat() reports the modification time as a time_t, aio_stat2()
+     * as a struct timespec.
+     */
+    int aio_stat(const std::string& soid, librados::AioCompletion *c,
+                 uint64_t *psize, time_t *pmtime);
+    int aio_stat2(const std::string& soid, librados::AioCompletion *c,
+                  uint64_t *psize, struct timespec *pts);
+
+    /**
+     * deletes a striped object.
+     * The deletion is not atomic: the striped object may be left
+     * incomplete if an error is returned (metadata all present, but
+     * some stripes missing).
+     * However, the metadata deletion itself is atomic, and
+     * the deletion can not happen if any I/O is ongoing (it
+     * will return EBUSY). Likewise, no I/O will be able to start
+     * during deletion (same EBUSY return code)
+     * The second overload additionally takes a flags value; its meaning is
+     * not specified in this header.
+     */
+    int remove(const std::string& soid);
+    int remove(const std::string& soid, int flags);
+
+    /**
+     * asynchronous remove of striped objects
+     * See synchronous version for comments on (lack of) atomicity
+     */
+    int aio_remove(const std::string& soid, librados::AioCompletion *c);
+    int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags);
+
+    /**
+     * Resizes a striped object
+     * the truncation can not happen if any I/O is ongoing (it
+     * will return EBUSY). Likewise, no I/O will be able to start
+     * during truncation (same EBUSY return code)
+     */
+    int trunc(const std::string& oid, uint64_t size);
+
+    /**
+     * Wait for all currently pending aio writes to be safe.
+     *
+     * @returns 0 on success, negative error code on failure
+     */
+    int aio_flush();
+
+    /**
+     * creation of multi aio completion objects
+     * The second overload also registers the complete/safe callbacks and
+     * the application-defined argument passed to them.
+     */
+    static MultiAioCompletion *multi_aio_create_completion();
+    static MultiAioCompletion *multi_aio_create_completion(void *cb_arg,
+                                                           librados::callback_t cb_complete,
+                                                           librados::callback_t cb_safe);
+
+  private:
+    // Pointer to the implementation object (RadosStriperImpl is only
+    // forward-declared in this header).
+    RadosStriperImpl *rados_striper_impl;
+
+  };
+
+}
+
+#endif
diff --git a/src/include/random.h b/src/include/random.h
new file mode 100644
index 00000000..b3cb80c3
--- /dev/null
+++ b/src/include/random.h
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+*/
+
+#ifndef CEPH_RANDOM_H
+#define CEPH_RANDOM_H 1
+
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <boost/optional.hpp>
+
+// Basic random number facility, adapted from N3551:
+namespace ceph::util {
+
+inline namespace version_1_0_2 {
+
+namespace detail {
+
+// Whichever of T0/T1 has the larger sizeof; T0 is chosen on a tie.
+template <typename T0, typename T1>
+using larger_of = std::conditional_t<(sizeof(T0) >= sizeof(T1)), T0, T1>;
+
+// True when both types are floating point, or both are integral; a mix of
+// the two families is not considered compatible.
+template <typename NumberT0, typename NumberT1>
+using has_compatible_numeric_types =
+  std::disjunction<
+    std::conjunction<std::is_floating_point<NumberT0>,
+                     std::is_floating_point<NumberT1>>,
+    std::conjunction<std::is_integral<NumberT0>,
+                     std::is_integral<NumberT1>>>;
+
+// The larger of two compatible numeric types. When the types are not
+// compatible the alias has no type, so templates using it drop out of
+// overload resolution.
+template <typename NumberT0, typename NumberT1>
+using select_number_t =
+  std::enable_if_t<has_compatible_numeric_types<NumberT0, NumberT1>::value,
+                   larger_of<NumberT0, NumberT1>>;
+
+} // namespace detail
+
+namespace detail {
+
+// Maps a number type to its default distribution. The general case yields
+// the uniform real distribution ...
+template <typename NumberT, bool IsIntegral>
+struct select_distribution
+{
+  using type = std::uniform_real_distribution<NumberT>;
+};
+
+// ... and integral types get the uniform integer distribution instead.
+template <typename NumberT>
+struct select_distribution<NumberT, true>
+{
+  using type = std::uniform_int_distribution<NumberT>;
+};
+
+// Shorthand: the default distribution for NumberT.
+template <typename NumberT>
+using default_distribution =
+  typename select_distribution<NumberT, std::is_integral_v<NumberT>>::type;
+
+} // namespace detail
+
+namespace detail {
+
+// Declared up front because the argument-free randomize_rng() overloads
+// below use the per-thread engine, which is defined at the end.
+template <typename EngineT>
+EngineT& engine();
+
+// Seed a caller-owned engine with an explicit value while holding m.
+template <typename MutexT, typename EngineT,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT seed, MutexT& m, EngineT& e)
+{
+  std::lock_guard<MutexT> guard(m);
+  e.seed(seed);
+}
+
+// Seed a caller-owned engine from std::random_device while holding m.
+template <typename MutexT, typename EngineT>
+void randomize_rng(MutexT& m, EngineT& e)
+{
+  std::random_device entropy;
+
+  std::lock_guard<MutexT> guard(m);
+  e.seed(entropy());
+}
+
+// Seed the calling thread's engine of type EngineT with an explicit value.
+template <typename EngineT = std::default_random_engine,
+          typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT n)
+{
+  EngineT& per_thread = detail::engine<EngineT>();
+  per_thread.seed(n);
+}
+
+// Seed the calling thread's engine of type EngineT from std::random_device.
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+  std::random_device entropy;
+  EngineT& per_thread = detail::engine<EngineT>();
+  per_thread.seed(entropy());
+}
+
+// Returns the calling thread's engine of type EngineT, creating and seeding
+// it on first use.
+template <typename EngineT>
+EngineT& engine()
+{
+  thread_local boost::optional<EngineT> rng_engine;
+
+  if (rng_engine) {
+    return *rng_engine;
+  }
+
+  // The engine must exist before randomize_rng() runs, because that call
+  // re-enters engine() to reach the object it seeds.
+  rng_engine.emplace(EngineT());
+  randomize_rng<EngineT>();
+
+  return *rng_engine;
+}
+
+} // namespace detail
+
+namespace detail {
+
+// Draw one value between min and max from a caller-supplied engine, without
+// any locking. The exact range semantics are those of DistributionT.
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               EngineT& e)
+{
+  using bounds_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+  return dist(e, bounds_type { min, max });
+}
+
+// As above, but the engine is only used while m is held.
+template <typename NumberT,
+          typename MutexT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+                               MutexT& m, EngineT& e)
+{
+  using bounds_type = typename DistributionT::param_type;
+
+  DistributionT dist { min, max };
+
+  std::lock_guard<MutexT> guard(m);
+  return dist(e, bounds_type { min, max });
+}
+
+// Draw one value between min and max from the calling thread's engine.
+template <typename NumberT,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max)
+{
+  EngineT& per_thread = detail::engine<EngineT>();
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+          (min, max, per_thread);
+}
+
+// Draw one value between 0 and the largest NumberT under a lock.
+// NumberT defaults to int when the caller does not name it.
+template <typename MutexT,
+          typename EngineT,
+          typename NumberT = int,
+          typename DistributionT = detail::default_distribution<NumberT>>
+NumberT generate_random_number(MutexT& m, EngineT& e)
+{
+  const NumberT upper = std::numeric_limits<NumberT>::max();
+  return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT>
+          (0, upper, m, e);
+}
+
+// Draw one value between 0 and max under a lock.
+template <typename NumberT, typename MutexT, typename EngineT>
+NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e)
+{
+  return generate_random_number<NumberT>(0, max, m, e);
+}
+
+} // namespace detail
+
+// Public entry point: re-seed the calling thread's engine of type EngineT
+// from std::random_device (forwards to detail::randomize_rng).
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+  detail::randomize_rng<EngineT>();
+}
+
+// Returns a random NumberT (int by default) between 0 and
+// std::numeric_limits<NumberT>::max(), drawn from the calling thread's
+// engine of type EngineT using DistributionT.
+template <typename NumberT = int,
+          typename DistributionT = detail::default_distribution<NumberT>,
+          typename EngineT = std::default_random_engine>
+NumberT generate_random_number()
+{
+  return detail::generate_random_number<NumberT, DistributionT, EngineT>
+          (0, std::numeric_limits<NumberT>::max());
+}
+
+// Returns a random number between min and max from the calling thread's
+// std::default_random_engine. The two bounds may have different types as
+// long as both are integral or both are floating point; both are cast to
+// the larger of the two types (NumberT), which is also the result type.
+template <typename NumberT0, typename NumberT1,
+          typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+         >
+NumberT generate_random_number(const NumberT0 min, const NumberT1 max)
+{
+  return detail::generate_random_number<NumberT,
+                                        detail::default_distribution<NumberT>,
+                                        std::default_random_engine>
+                                       (static_cast<NumberT>(min), static_cast<NumberT>(max));
+}
+
+// Returns a random number between min and max drawn from the caller's
+// engine e using DistributionT. The bounds may have different types as long
+// as both are integral or both are floating point; both are cast to the
+// larger of the two types (NumberT), which is also the result type.
+//
+// The bounds are declared as NumberT0/NumberT1 so that those template
+// parameters take part in deduction; with both bounds typed as NumberT they
+// could never be deduced. DistributionT is still not deducible, so callers
+// must name the leading template arguments up to and including it.
+template <typename NumberT0, typename NumberT1,
+          typename DistributionT,
+          typename EngineT,
+          typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+         >
+NumberT generate_random_number(const NumberT0 min, const NumberT1 max,
+                               EngineT& e)
+{
+  return detail::generate_random_number<NumberT,
+                                        DistributionT,
+                                        EngineT>
+                                       (static_cast<NumberT>(min), static_cast<NumberT>(max), e);
+}
+
+// Returns a random NumberT between 0 and max; forwards to the two-bound
+// overload with a lower bound of 0.
+template <typename NumberT>
+NumberT generate_random_number(const NumberT max)
+{
+ return generate_random_number<NumberT>(0, max);
+}
+
+// Function object producing random values of type NumberT.
+//
+// Each instance owns its own std::default_random_engine together with a
+// mutex; every draw and every re-seed goes through a detail:: helper that
+// takes that mutex. Instances can be moved but not copied.
+template <typename NumberT>
+class random_number_generator final
+{
+  std::mutex l;                  // held while e is seeded or used
+  std::random_device rd;         // exposed through random_device()
+  std::default_random_engine e;  // the engine all draws come from
+
+  using seed_type = typename decltype(e)::result_type;
+
+  public:
+  using number_type         = NumberT;
+  using random_engine_type  = decltype(e);
+  using random_device_type  = decltype(rd);
+
+  public:
+  // Direct access to the members; note that these bypass the mutex.
+  random_device_type& random_device() noexcept { return rd; }
+  random_engine_type& random_engine() noexcept { return e; }
+
+  public:
+  // Seeds the engine from std::random_device.
+  random_number_generator() {
+    detail::randomize_rng(l, e);
+  }
+
+  // Seeds the engine with a caller-chosen value.
+  explicit random_number_generator(const seed_type seed) {
+    detail::randomize_rng(seed, l, e);
+  }
+
+  // Takes over the engine state of rhs; the mutex and random_device of the
+  // new object are freshly default-constructed.
+  random_number_generator(random_number_generator&& rhs)
+   : e(std::move(rhs.e))
+  {}
+
+  public:
+  random_number_generator(const random_number_generator&)            = delete;
+  random_number_generator& operator=(const random_number_generator&) = delete;
+
+  public:
+  // Random value between 0 and the largest NumberT.
+  // NumberT is passed explicitly: the helper defaults its number type to
+  // int, which would otherwise limit every instantiation to the int range.
+  NumberT operator()() {
+    return detail::generate_random_number<std::mutex, random_engine_type, NumberT>(l, e);
+  }
+
+  // Random value between 0 and max.
+  NumberT operator()(const NumberT max) {
+    return detail::generate_random_number<NumberT>(max, l, e);
+  }
+
+  // Random value between min and max.
+  NumberT operator()(const NumberT min, const NumberT max) {
+    return detail::generate_random_number<NumberT>(min, max, l, e);
+  }
+
+  public:
+  // Re-seed the engine with n.
+  void seed(const seed_type n) {
+    detail::randomize_rng(n, l, e);
+  }
+};
+
+} // inline namespace version_*
+
+} // namespace ceph::util
+
+#endif
diff --git a/src/include/rangeset.h b/src/include/rangeset.h
new file mode 100644
index 00000000..e7e3d047
--- /dev/null
+++ b/src/include/rangeset.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_RANGESET_H
+#define CEPH_RANGESET_H
+
+/*
+ *
+ * my first container with iterator!   it's pretty ugly.
+ *
+ */
+
+#include <map>
+
+//typedef int T;
+
+// Storage shared by the rangeset classes: a map from the first value of
+// each range to its last value, both inclusive.
+template <class T>
+struct _rangeset_base {
+  std::map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+
+  typedef typename std::map<T,T>::iterator mapit;
+
+  // Returns an iterator to the range that includes val, or ranges.end()
+  // when no range does (including when the map is empty).
+  mapit get_range_for(T val) {
+    // First range that starts strictly after val; the only candidate that
+    // can include val is the range just before it.
+    mapit it = ranges.upper_bound(val);
+    if (it == ranges.begin())
+      return ranges.end();   // no range starts at or before val
+    --it;                    // last range with first <= val
+    if (it->second >= val)
+      return it;
+    return ranges.end();     // val falls in the gap after this range
+  }
+
+};
+
+
+// Input iterator that walks every individual value covered by a map of
+// inclusive [first,last] ranges, in ascending order.
+//
+// The iterator refers to the map it was built from (it does not copy it),
+// so the map must outlive the iterator and must not be modified while the
+// iterator is in use.
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  std::map<T,T> *ranges;                  // map being walked; null if default-constructed
+  typename std::map<T,T>::iterator it;    // range currently being walked
+  T current;                              // value within *it this iterator stands on
+
+  // True when there is no value to dereference.
+  bool at_end() const {
+    return ranges == nullptr || it == ranges->end();
+  }
+
+public:
+  // cons
+  rangeset_iterator() : ranges(nullptr), it(), current() {}
+
+  // Start at the first value of the range `it` points to within `ranges`;
+  // passing ranges.end() yields the past-the-end iterator.
+  rangeset_iterator(typename std::map<T,T>::iterator& it, std::map<T,T>& ranges)
+    : ranges(&ranges), it(it), current() {
+    if (!at_end())
+      current = this->it->first;
+  }
+
+  // Two iterators are equal when they are in the same range and on the same
+  // value. Past-the-end iterators carry no meaningful value, so for them
+  // only the range position is compared.
+  bool operator==(const rangeset_iterator<T>& rit) const {
+    if (it != rit.it)
+      return false;
+    return at_end() || current == rit.current;
+  }
+  bool operator!=(const rangeset_iterator<T>& rit) const {
+    return !(*this == rit);
+  }
+
+  T& operator*() {
+    return current;
+  }
+
+  // Pre-increment: step to the next value, moving on to the next range when
+  // the current one is exhausted. Does nothing at the end.
+  rangeset_iterator<T>& operator++() {
+    if (at_end())
+      return *this;
+    if (current < it->second)
+      current++;
+    else {
+      ++it;
+      if (!at_end())
+        current = it->first;
+    }
+    return *this;
+  }
+
+  // Post-increment: advances and returns the iterator as it was before the
+  // step, as is conventional for iterators.
+  rangeset_iterator<T> operator++(int) {
+    rangeset_iterator<T> previous(*this);
+    ++(*this);
+    return previous;
+  }
+};
+
+
+// A set of T values stored as disjoint inclusive ranges.
+//
+// The ranges live in theset.ranges (first -> last); _size counts the
+// individual values held, not the number of ranges.
+template <class T>
+class rangeset
+{
+  typedef typename std::map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  inodeno_t _size;
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  // Iteration over the individual values.
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  // Access to the underlying first -> last map of ranges.
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  // Adds the range [v1,v2] as is: no overlap check and no merging with
+  // neighbouring ranges is done here.
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(std::pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // ...
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    ceph_assert(!empty());
+    return true;
+  }
+
+  // Adds a single value that must not be present yet, merging it into the
+  // adjacent range(s) when there are any.
+  void insert(T val) {
+    ceph_assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+
+    if (left != theset.ranges.end() &&
+        right != theset.ranges.end()) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+      _size++;
+      return;
+    }
+
+    if (left != theset.ranges.end()) {
+      // add to left range
+      left->second = val;
+      _size++;
+      return;
+    }
+
+    if (right != theset.ranges.end()) {
+      // add to right range: the key of a map entry cannot be changed, so
+      // re-insert the range under its new first value and drop the old one
+      theset.ranges.insert(std::pair<T,T>(val, right->second));
+      theset.ranges.erase(val+1);
+      _size++;
+      return;
+    }
+
+    // new range
+    theset.ranges.insert(std::pair<T,T>(val,val));
+    _size++;
+    return;
+  }
+
+  // Number of individual values in the set.
+  unsigned size() {
+    return _size;
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      ceph_assert(_size == 0);
+      return true;
+    }
+    ceph_assert(_size>0);
+    return false;
+  }
+
+
+  // Smallest value in the set; the set must not be empty.
+  T first() {
+    ceph_assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+
+  // Removes a single value that must be present, shrinking or splitting the
+  // range that holds it.
+  void erase(T val) {
+    ceph_assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    ceph_assert(it != theset.ranges.end());
+
+    // entire range
+    if (val == it->first && val == it->second) {
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // beginning
+    if (val == it->first) {
+      theset.ranges.insert(std::pair<T,T>(val+1, it->second));
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // end
+    if (val == it->second) {
+      it->second = val-1;
+      _size--;
+      return;
+    }
+
+    // middle split: the left half keeps the existing key, so shrink that
+    // entry in place (inserting under the same key would be a no-op) and
+    // add the right half as a new range.
+    T last = it->second;
+    it->second = val-1;
+    theset.ranges.insert(std::pair<T,T>(val+1, last));
+    _size--;
+    return;
+  }
+
+  // Prints every range as " first-last", one per line, to cout.
+  void dump() {
+    for (typename std::map<T,T>::iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         it++) {
+      std::cout << " " << it->first << "-" << it->second << std::endl;
+    }
+  }
+
+};
+
+
+#endif
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
new file mode 100644
index 00000000..89c54a36
--- /dev/null
+++ b/src/include/rbd/features.h
@@ -0,0 +1,102 @@
+#ifndef CEPH_RBD_FEATURES_H
+#define CEPH_RBD_FEATURES_H
+
+#define RBD_FEATURE_LAYERING		(1ULL<<0)
+#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
+#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF           (1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN        (1ULL<<5)
+#define RBD_FEATURE_JOURNALING          (1ULL<<6)
+#define RBD_FEATURE_DATA_POOL           (1ULL<<7)
+#define RBD_FEATURE_OPERATIONS          (1ULL<<8)
+#define RBD_FEATURE_MIGRATING           (1ULL<<9)
+
+#define RBD_FEATURES_DEFAULT             (RBD_FEATURE_LAYERING | \
+                                         RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                         RBD_FEATURE_OBJECT_MAP | \
+                                         RBD_FEATURE_FAST_DIFF | \
+                                         RBD_FEATURE_DEEP_FLATTEN)
+
+#define RBD_FEATURE_NAME_LAYERING        "layering"
+#define RBD_FEATURE_NAME_STRIPINGV2      "striping"
+#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK  "exclusive-lock"
+#define RBD_FEATURE_NAME_OBJECT_MAP      "object-map"
+#define RBD_FEATURE_NAME_FAST_DIFF       "fast-diff"
+#define RBD_FEATURE_NAME_DEEP_FLATTEN    "deep-flatten"
+#define RBD_FEATURE_NAME_JOURNALING      "journaling"
+#define RBD_FEATURE_NAME_DATA_POOL       "data-pool"
+#define RBD_FEATURE_NAME_OPERATIONS      "operations"
+#define RBD_FEATURE_NAME_MIGRATING       "migrating"
+
+/// features that make an image inaccessible for read or write by
+/// clients that don't understand them
+#define RBD_FEATURES_INCOMPATIBLE 	(RBD_FEATURE_LAYERING       | \
+					 RBD_FEATURE_STRIPINGV2     | \
+                                         RBD_FEATURE_DATA_POOL)
+
+/// features that make an image unwritable by clients that don't understand them
+#define RBD_FEATURES_RW_INCOMPATIBLE	(RBD_FEATURES_INCOMPATIBLE  | \
+					 RBD_FEATURE_EXCLUSIVE_LOCK | \
+					 RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_DEEP_FLATTEN   | \
+                                         RBD_FEATURE_JOURNALING     | \
+                                         RBD_FEATURE_OPERATIONS     | \
+                                         RBD_FEATURE_MIGRATING)
+
+#define RBD_FEATURES_ALL          	(RBD_FEATURE_LAYERING       | \
+					 RBD_FEATURE_STRIPINGV2     | \
+                                   	 RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                         RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_DEEP_FLATTEN   | \
+                                         RBD_FEATURE_JOURNALING     | \
+                                         RBD_FEATURE_DATA_POOL      | \
+                                         RBD_FEATURE_OPERATIONS     | \
+                                         RBD_FEATURE_MIGRATING)
+
+/// features that may be dynamically enabled or disabled
+#define RBD_FEATURES_MUTABLE            (RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                         RBD_FEATURE_OBJECT_MAP     | \
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_JOURNALING)
+
+/// features that may be dynamically disabled
+#define RBD_FEATURES_DISABLE_ONLY       (RBD_FEATURE_DEEP_FLATTEN)
+
+/// features that only work when used with a single client
+/// using the image for writes
+#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
+                                    RBD_FEATURE_OBJECT_MAP     | \
+                                    RBD_FEATURE_FAST_DIFF      | \
+                                    RBD_FEATURE_JOURNALING)
+
+/// features that will be implicitly enabled
+#define RBD_FEATURES_IMPLICIT_ENABLE  (RBD_FEATURE_STRIPINGV2 | \
+                                       RBD_FEATURE_DATA_POOL  | \
+                                       RBD_FEATURE_FAST_DIFF  | \
+                                       RBD_FEATURE_OPERATIONS | \
+                                       RBD_FEATURE_MIGRATING)
+
+/// features that cannot be controlled by the user
+#define RBD_FEATURES_INTERNAL         (RBD_FEATURE_OPERATIONS | \
+                                       RBD_FEATURE_MIGRATING)
+
+#define RBD_OPERATION_FEATURE_CLONE_PARENT      (1ULL<<0)
+#define RBD_OPERATION_FEATURE_CLONE_CHILD       (1ULL<<1)
+#define RBD_OPERATION_FEATURE_GROUP             (1ULL<<2)
+#define RBD_OPERATION_FEATURE_SNAP_TRASH        (1ULL<<3)
+
+#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent"
+#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD  "clone-child"
+#define RBD_OPERATION_FEATURE_NAME_GROUP        "group"
+#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH   "snap-trash"
+
+/// all valid operation features
+#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \
+                                    RBD_OPERATION_FEATURE_CLONE_CHILD  | \
+                                    RBD_OPERATION_FEATURE_GROUP        | \
+                                    RBD_OPERATION_FEATURE_SNAP_TRASH)
+
+#endif
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
new file mode 100644
index 00000000..522a6fb6
--- /dev/null
+++ b/src/include/rbd/librbd.h
@@ -0,0 +1,1243 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRBD_H
+#define CEPH_LIBRBD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <stdbool.h>
+#include <string.h>
+#include <sys/uio.h>
+#include "../rados/librados.h"
+#include "features.h"
+
+#define LIBRBD_VER_MAJOR 1
+#define LIBRBD_VER_MINOR 12
+#define LIBRBD_VER_EXTRA 0
+/* Pack a version triple into one integer; arguments are parenthesized so any expression expands safely. */
+#define LIBRBD_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+
+#define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_AIO_OPEN 1
+#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1
+#define LIBRBD_SUPPORTS_LOCKING 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
+#define LIBRBD_SUPPORTS_IOVEC 1
+#define LIBRBD_SUPPORTS_WATCH 0
+#define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
+
+#if __GNUC__ >= 4
+  #define CEPH_RBD_API    __attribute__ ((visibility ("default")))
+#else
+  #define CEPH_RBD_API
+#endif
+
+#define RBD_FLAG_OBJECT_MAP_INVALID   (1<<0)
+#define RBD_FLAG_FAST_DIFF_INVALID    (1<<1)
+
+typedef void *rbd_image_t;
+typedef void *rbd_image_options_t;
+typedef void *rbd_pool_stats_t;
+
+typedef void *rbd_completion_t;
+typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg);
+
+typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
+
+typedef void (*rbd_update_callback_t)(void *arg);
+
+typedef enum {
+  RBD_SNAP_NAMESPACE_TYPE_USER  = 0,
+  RBD_SNAP_NAMESPACE_TYPE_GROUP = 1,
+  RBD_SNAP_NAMESPACE_TYPE_TRASH = 2
+} rbd_snap_namespace_type_t;
+
+typedef struct { /* array element for rbd_list2(); rbd_image_spec_cleanup()/rbd_image_spec_list_cleanup() take the same type */
+  char *id;
+  char *name;
+} rbd_image_spec_t;
+
+typedef struct { /* type of the parent_image argument of rbd_get_parent(); see rbd_linked_image_spec_cleanup() */
+  int64_t pool_id;
+  char *pool_name;
+  char *pool_namespace;
+  char *image_id;
+  char *image_name;
+  bool trash;
+} rbd_linked_image_spec_t;
+
+typedef struct { /* type of the parent_snap argument of rbd_get_parent(); see rbd_snap_spec_cleanup() */
+  uint64_t id;
+  rbd_snap_namespace_type_t namespace_type;
+  char *name;
+} rbd_snap_spec_t;
+
+typedef struct { /* array element for rbd_snap_list(); rbd_snap_list_end() takes the same array type */
+  uint64_t id;
+  uint64_t size;
+  const char *name;
+} rbd_snap_info_t;
+
+typedef struct {
+  const char *pool_name;
+  const char *image_name;
+  const char *image_id;
+  bool trash;
+} rbd_child_info_t;
+
+#define RBD_MAX_IMAGE_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+#define RBD_SNAP_REMOVE_UNPROTECT	(1 << 0) /* flag bits are parenthesized so they embed safely in any expression */
+#define RBD_SNAP_REMOVE_FLATTEN		(1 << 1)
+#define RBD_SNAP_REMOVE_FORCE		(RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN) /* both bits set */
+
+/**
+ * These types are used in rbd_set_image_notification() to indicate the type
+ * of event socket passed in.
+ */
+enum {
+  EVENT_TYPE_PIPE = 1,
+  EVENT_TYPE_EVENTFD = 2
+};
+
+typedef struct {
+  uint64_t size;
+  uint64_t obj_size;
+  uint64_t num_objs;
+  int order;
+  char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */
+  int64_t parent_pool;			           /* deprecated */
+  char parent_name[RBD_MAX_IMAGE_NAME_SIZE];       /* deprecated */
+} rbd_image_info_t;
+
+typedef enum {
+  RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */
+  RBD_MIRROR_MODE_IMAGE,    /* mirroring enabled on a per-image basis */
+  RBD_MIRROR_MODE_POOL      /* mirroring enabled on all journaled images */
+} rbd_mirror_mode_t;
+
+typedef enum {
+  RBD_MIRROR_PEER_DIRECTION_RX    = 0,
+  RBD_MIRROR_PEER_DIRECTION_TX    = 1,
+  RBD_MIRROR_PEER_DIRECTION_RX_TX = 2
+} rbd_mirror_peer_direction_t;
+
+typedef struct { /* array element for rbd_mirror_peer_list() and rbd_mirror_peer_list_cleanup() */
+  char *uuid;
+  char *cluster_name;
+  char *client_name;
+} rbd_mirror_peer_t;
+
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host"
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY      "key"
+
+typedef enum {
+  RBD_MIRROR_IMAGE_DISABLING = 0,
+  RBD_MIRROR_IMAGE_ENABLED = 1,
+  RBD_MIRROR_IMAGE_DISABLED = 2
+} rbd_mirror_image_state_t;
+
+typedef struct {
+  char *global_id;
+  rbd_mirror_image_state_t state;
+  bool primary;
+} rbd_mirror_image_info_t;
+
+typedef enum {
+  MIRROR_IMAGE_STATUS_STATE_UNKNOWN         = 0,
+  MIRROR_IMAGE_STATUS_STATE_ERROR           = 1,
+  MIRROR_IMAGE_STATUS_STATE_SYNCING         = 2,
+  MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3,
+  MIRROR_IMAGE_STATUS_STATE_REPLAYING       = 4,
+  MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5,
+  MIRROR_IMAGE_STATUS_STATE_STOPPED         = 6,
+} rbd_mirror_image_status_state_t;
+
+typedef struct { /* array element for rbd_mirror_image_status_list() and rbd_mirror_image_status_list_cleanup() */
+  char *name;
+  rbd_mirror_image_info_t info;
+  rbd_mirror_image_status_state_t state;
+  char *description;
+  time_t last_update;
+  bool up;
+} rbd_mirror_image_status_t;
+
+typedef enum {
+  RBD_GROUP_IMAGE_STATE_ATTACHED,
+  RBD_GROUP_IMAGE_STATE_INCOMPLETE
+} rbd_group_image_state_t;
+
+typedef struct {
+  char *name;
+  int64_t pool;
+  rbd_group_image_state_t state;
+} rbd_group_image_info_t;
+
+typedef struct { /* passed to rbd_get_group() together with its size (group_info_size) */
+  char *name;
+  int64_t pool;
+} rbd_group_info_t;
+
+typedef enum {
+  RBD_GROUP_SNAP_STATE_INCOMPLETE,
+  RBD_GROUP_SNAP_STATE_COMPLETE
+} rbd_group_snap_state_t;
+
+typedef struct {
+  char *name;
+  rbd_group_snap_state_t state;
+} rbd_group_snap_info_t;
+
+typedef struct {
+  int64_t group_pool;
+  char *group_name;
+  char *group_snap_name;
+} rbd_snap_group_namespace_t;
+
+typedef enum {
+  RBD_LOCK_MODE_EXCLUSIVE = 0,
+  RBD_LOCK_MODE_SHARED = 1,
+} rbd_lock_mode_t;
+
+CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
+
+/* image options */
+enum {
+  RBD_IMAGE_OPTION_FORMAT = 0,
+  RBD_IMAGE_OPTION_FEATURES = 1,
+  RBD_IMAGE_OPTION_ORDER = 2,
+  RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
+  RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+  RBD_IMAGE_OPTION_JOURNAL_ORDER = 5,
+  RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6,
+  RBD_IMAGE_OPTION_JOURNAL_POOL = 7,
+  RBD_IMAGE_OPTION_FEATURES_SET = 8,
+  RBD_IMAGE_OPTION_FEATURES_CLEAR = 9,
+  RBD_IMAGE_OPTION_DATA_POOL = 10,
+  RBD_IMAGE_OPTION_FLATTEN = 11,
+  RBD_IMAGE_OPTION_CLONE_FORMAT = 12,
+};
+
+typedef enum {
+  RBD_TRASH_IMAGE_SOURCE_USER = 0,
+  RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1,
+  RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2,
+  RBD_TRASH_IMAGE_SOURCE_REMOVING = 3
+} rbd_trash_image_source_t;
+
+typedef struct { /* used by rbd_trash_get()/rbd_trash_list() and by rbd_trash_get_cleanup()/rbd_trash_list_cleanup() */
+  char *id;
+  char *name;
+  rbd_trash_image_source_t source;
+  time_t deletion_time;
+  time_t deferment_end_time;
+} rbd_trash_image_info_t;
+
+typedef struct {
+  char *addr;
+  int64_t id;
+  uint64_t cookie;
+} rbd_image_watcher_t;
+
+typedef enum {
+  RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1,
+  RBD_IMAGE_MIGRATION_STATE_ERROR = 0,
+  RBD_IMAGE_MIGRATION_STATE_PREPARING = 1,
+  RBD_IMAGE_MIGRATION_STATE_PREPARED = 2,
+  RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3,
+  RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4,
+  RBD_IMAGE_MIGRATION_STATE_ABORTING = 5,
+} rbd_image_migration_state_t;
+
+typedef struct { /* passed to rbd_migration_status() (with status_size) and rbd_migration_status_cleanup() */
+  int64_t source_pool_id;
+  char *source_pool_namespace;
+  char *source_image_name;
+  char *source_image_id;
+  int64_t dest_pool_id;
+  char *dest_pool_namespace;
+  char *dest_image_name;
+  char *dest_image_id;
+  rbd_image_migration_state_t state;
+  char *state_description;
+} rbd_image_migration_status_t;
+
+typedef enum {
+  RBD_CONFIG_SOURCE_CONFIG = 0,
+  RBD_CONFIG_SOURCE_POOL = 1,
+  RBD_CONFIG_SOURCE_IMAGE = 2,
+} rbd_config_source_t;
+
+typedef struct { /* array element for rbd_config_pool_list() and rbd_config_pool_list_cleanup() */
+  char *name;
+  char *value;
+  rbd_config_source_t source;
+} rbd_config_option_t;
+
+typedef enum {
+  RBD_POOL_STAT_OPTION_IMAGES,
+  RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+  RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+  RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS,
+  RBD_POOL_STAT_OPTION_TRASH_IMAGES,
+  RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+  RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+  RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS
+} rbd_pool_stat_option_t;
+
+CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
+CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
+					      int optname, const char* optval);
+CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts,
+					      int optname, uint64_t optval);
+CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts,
+					      int optname, char* optval,
+					      size_t maxlen);
+CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts,
+					      int optname, uint64_t* optval);
+CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts,
+                                          int optname, bool* is_set);
+CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname);
+CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts);
+
+/* helpers */
+CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image);
+CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+                                              size_t num_images);
+CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image);
+CEPH_RBD_API void rbd_linked_image_spec_list_cleanup(
+    rbd_linked_image_spec_t *images, size_t num_images);
+CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap);
+
+/* images */
+CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size)
+    __attribute__((deprecated));
+CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images,
+                           size_t *max_images);
+
+CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
+                            int *order);
+CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size,
+		             uint64_t features, int *order);
+/**
+ * create new rbd image
+ *
+ * The stripe_unit must be a factor of the object size (1 << order).
+ * The stripe_count can be one (no intra-object striping) or greater
+ * than one.  The RBD_FEATURE_STRIPINGV2 must be specified if the
+ * stripe_unit != the object size and the stripe_count is != 1.
+ *
+ * @param io ioctx
+ * @param name image name
+ * @param size image size in bytes
+ * @param features initial feature bits
+ * @param order object/block size, as a power of two (object size == 1 << order)
+ * @param stripe_unit stripe unit size, in bytes.
+ * @param stripe_count number of objects to stripe over before looping
+ * @return 0 on success, or negative error code
+ */
+CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size,
+		             uint64_t features, int *order,
+		             uint64_t stripe_unit, uint64_t stripe_count);
+CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+			     rbd_image_options_t opts);
+CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+	                   const char *p_snapname, rados_ioctx_t c_ioctx,
+	                   const char *c_name, uint64_t features, int *c_order);
+CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+	                    const char *p_snapname, rados_ioctx_t c_ioctx,
+	                    const char *c_name, uint64_t features, int *c_order,
+	                    uint64_t stripe_unit, int stripe_count);
+CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+	                    const char *p_snapname, rados_ioctx_t c_ioctx,
+	                    const char *c_name, rbd_image_options_t c_opts);
+CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
+CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
+			                  librbd_progress_fn_t cb,
+                                          void *cbdata);
+CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+                            const char *destname);
+
+CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name,
+                                uint64_t delay);
+CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id,
+                               rbd_trash_image_info_t *info);
+CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info);
+CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io,
+                                rbd_trash_image_info_t *trash_entries,
+                                size_t *num_entries);
+CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries,
+                                         size_t num_entries);
+CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold);
+CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+                                               float threshold, librbd_progress_fn_t cb,
+                                               void* cbdata);
+CEPH_RBD_API int rbd_trash_remove(rados_ioctx_t io, const char *id, bool force);
+CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io,
+                                                const char *id,
+                                                bool force,
+                                                librbd_progress_fn_t cb,
+                                                void *cbdata);
+CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id,
+                                   const char *name);
+
+/* migration */
+CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx,
+                                       const char *image_name,
+                                       rados_ioctx_t dest_ioctx,
+                                       const char *dest_image_name,
+                                       rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx,
+                                       const char *image_name);
+CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx,
+                                                     const char *image_name,
+                                                     librbd_progress_fn_t cb,
+                                                     void *cbdata);
+CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx,
+                                     const char *image_name);
+CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx,
+                                                   const char *image_name,
+                                                   librbd_progress_fn_t cb,
+                                                   void *cbdata);
+CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx,
+                                      const char *image_name);
+CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx,
+                                                    const char *image_name,
+                                                    librbd_progress_fn_t cb,
+                                                    void *cbdata);
+CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx,
+                                      const char *image_name,
+                                      rbd_image_migration_status_t *status,
+                                      size_t status_size);
+CEPH_RBD_API void rbd_migration_status_cleanup(
+    rbd_image_migration_status_t *status);
+
+/* pool mirroring */
+CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster,
+                                          char *name, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster,
+                                          const char *name);
+
+CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx,
+                                     rbd_mirror_mode_t *mirror_mode);
+CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx,
+                                     rbd_mirror_mode_t mirror_mode);
+
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_create(rados_ioctx_t io_ctx,
+                                                  char *token, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_import(
+    rados_ioctx_t io_ctx, rbd_mirror_peer_direction_t direction,
+    const char *token);
+
+CEPH_RBD_API int rbd_mirror_peer_add(rados_ioctx_t io_ctx,
+                                     char *uuid, size_t uuid_max_length,
+                                     const char *cluster_name,
+                                     const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_remove(rados_ioctx_t io_ctx,
+                                        const char *uuid);
+CEPH_RBD_API int rbd_mirror_peer_list(rados_ioctx_t io_ctx,
+                                      rbd_mirror_peer_t *peers, int *max_peers);
+CEPH_RBD_API void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+                                               int max_peers);
+CEPH_RBD_API int rbd_mirror_peer_set_client(rados_ioctx_t io_ctx,
+                                            const char *uuid,
+                                            const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_set_cluster(rados_ioctx_t io_ctx,
+                                             const char *uuid,
+                                             const char *cluster_name);
+CEPH_RBD_API int rbd_mirror_peer_get_attributes(
+    rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+    char *values, size_t *max_value_len, size_t *key_value_count);
+CEPH_RBD_API int rbd_mirror_peer_set_attributes(
+    rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+    size_t key_value_count);
+
+CEPH_RBD_API int rbd_mirror_image_status_list(rados_ioctx_t io_ctx,
+					      const char *start_id, size_t max,
+					      char **image_ids,
+					      rbd_mirror_image_status_t *images,
+					      size_t *len);
+CEPH_RBD_API void rbd_mirror_image_status_list_cleanup(char **image_ids,
+    rbd_mirror_image_status_t *images, size_t len);
+CEPH_RBD_API int rbd_mirror_image_status_summary(rados_ioctx_t io_ctx,
+    rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen);
+
+CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx,
+                                                   const char *start_id,
+                                                   size_t max, char **image_ids,
+                                                   char **instance_ids,
+                                                   size_t *len);
+CEPH_RBD_API void rbd_mirror_image_instance_id_list_cleanup(char **image_ids,
+                                                            char **instance_ids,
+                                                            size_t len);
+
+/* pool metadata */
+CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key,
+                                       char *value, size_t *val_len);
+CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key,
+                                       const char *value);
+CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx,
+                                          const char *key);
+CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start,
+                                        uint64_t max, char *keys,
+                                        size_t *key_len, char *values,
+                                        size_t *vals_len);
+
+CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx,
+                                      rbd_config_option_t *options,
+                                      int *max_options);
+CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+                                               int max_options);
+
+CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name,
+                          rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id,
+                                rbd_image_t *image, const char *snap_name);
+
+CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name,
+			      rbd_image_t *image, const char *snap_name,
+			      rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id,
+                                    rbd_image_t *image, const char *snap_name,
+                                    rbd_completion_t c);
+
+/**
+ * Open an image in read-only mode.
+ *
+ * This is intended for use by clients that cannot write to a block
+ * device due to cephx restrictions. There will be no watch
+ * established on the header object, since a watch is a write. This
+ * means the metadata reported about this image (parents, snapshots,
+ * size, etc.) may become stale. This should not be used for
+ * long-running operations, unless you can be sure that one of these
+ * properties changing is safe.
+ *
+ * Attempting to write to a read-only image will return -EROFS.
+ *
+ * @param io ioctx to determine the pool the image is in
+ * @param name image name
+ * @param image where to store newly opened image handle
+ * @param snap_name name of snapshot to open at, or NULL for no snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name,
+                                    rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id,
+                                          rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name,
+					rbd_image_t *image, const char *snap_name,
+					rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id,
+                                              rbd_image_t *image, const char *snap_name,
+                                              rbd_completion_t c);
+CEPH_RBD_API int rbd_close(rbd_image_t image);
+CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c);
+CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size);
+CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+			     librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+			     librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+                          size_t infosize);
+CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old);
+CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size);
+CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features);
+CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features,
+                                     uint8_t enabled);
+CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features);
+CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit);
+CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image,
+                                      uint64_t *stripe_count);
+
+CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image,
+                                          struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image,
+                                          struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image,
+                                          struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap);
+CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len);
+CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len);
+CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image,
+                                           char *prefix, size_t prefix_len);
+CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image);
+
+CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
+			             char *parent_poolname, size_t ppoolnamelen,
+			             char *parent_name, size_t pnamelen,
+			             char *parent_snapname,
+                                     size_t psnapnamelen)
+    __attribute__((deprecated));
+CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image,
+                                      char *parent_poolname,
+                                      size_t ppoolnamelen,
+                                      char *parent_name, size_t pnamelen,
+                                      char *parent_id, size_t pidlen,
+                                      char *parent_snapname,
+                                      size_t psnapnamelen)
+    __attribute__((deprecated));
+CEPH_RBD_API int rbd_get_parent(rbd_image_t image,
+                                rbd_linked_image_spec_t *parent_image,
+                                rbd_snap_spec_t *parent_snap);
+
+CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
+CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+                               size_t group_info_size);
+CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
+
+/* exclusive lock feature */
+CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
+CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode);
+CEPH_RBD_API int rbd_lock_release(rbd_image_t image);
+CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image,
+                                     rbd_lock_mode_t *lock_mode,
+                                     char **lock_owners,
+                                     size_t *max_lock_owners);
+CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners,
+                                              size_t lock_owner_count);
+CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+                                const char *lock_owner);
+
+/* object map feature */
+CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
+                                        librbd_progress_fn_t cb, void *cbdata);
+
+CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
+                          const char *destname);
+CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
+CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+			   const char *destname, rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+			   const char *destname, rbd_image_options_t dest_opts,
+			   size_t sparse_size);
+CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+                                        const char *destname,
+                                        librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest,
+			                 librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image,
+					 rados_ioctx_t dest_p,
+					 const char *destname,
+					 rbd_image_options_t dest_opts,
+					 librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image,
+					 rados_ioctx_t dest_p,
+					 const char *destname,
+					 rbd_image_options_t dest_opts,
+					 librbd_progress_fn_t cb, void *cbdata,
+					 size_t sparse_size);
+
+/* deep copy */
+CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+                               const char *destname,
+                               rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image,
+                                             rados_ioctx_t dest_io_ctx,
+                                             const char *destname,
+                                             rbd_image_options_t dest_opts,
+                                             librbd_progress_fn_t cb,
+                                             void *cbdata);
+
+/* snapshots */
+CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+                               int *max_snaps);
+CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps);
+CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name,
+                                  uint32_t flags, librbd_progress_fn_t cb,
+                                  void *cbdata);
+CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image,
+                                                 const char *snapname,
+				                 librbd_progress_fn_t cb,
+                                                 void *cbdata);
+CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname,
+				 const char* dstsnapsname);
+/**
+ * Prevent a snapshot from being deleted until it is unprotected.
+ *
+ * @param snap_name which snapshot to protect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if snap is already protected
+ */
+CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name);
+/**
+ * Allow a snapshot to be deleted.
+ *
+ * @param snap_name which snapshot to unprotect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snap is not protected
+ */
+CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name);
+/**
+ * Determine whether a snapshot is protected.
+ *
+ * @param snap_name which snapshot to query
+ * @param is_protected where to store the result (0 or 1)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+			               int *is_protected);
+/**
+ * Get the current snapshot limit for an image. If no limit is set,
+ * UINT64_MAX is returned.
+ *
+ * @param limit pointer where the limit will be stored on success
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit);
+
+/**
+ * Set a limit for the number of snapshots that may be taken of an image.
+ *
+ * @param limit the maximum number of snapshots allowed in the future.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit);
+
+/**
+ * Get the timestamp of a snapshot for an image.
+ *
+ * @param snap_id the snap id of a snapshot of the input image.
+ * @param timestamp the timestamp of the input snapshot.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id);
+
+CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image,
+                                             uint64_t snap_id,
+                                             rbd_snap_namespace_type_t *namespace_type);
+CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image,
+                                              uint64_t snap_id,
+                                              rbd_snap_group_namespace_t *group_snap,
+                                              size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+                                                  size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image,
+                                              uint64_t snap_id,
+                                              char* original_name,
+                                              size_t max_length);
+
+CEPH_RBD_API int rbd_flatten(rbd_image_t image);
+
+CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image,
+                                           librbd_progress_fn_t cb,
+                                           void *cbdata);
+
+CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size);
+
+CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image,
+                                            size_t sparse_size,
+                                            librbd_progress_fn_t cb,
+                                            void *cbdata);
+
+/**
+ * List all images that are cloned from the image at the
+ * snapshot that is set via rbd_snap_set().
+ *
+ * This iterates over all pools, so it should be run by a user with
+ * read access to all of them. pools_len and images_len are filled in
+ * with the number of bytes put into the pools and images buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the pool and image names
+ * of the children, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list clones of
+ * @param pools buffer in which to store pool names
+ * @param pools_len number of bytes in pools buffer
+ * @param images buffer in which to store image names
+ * @param images_len number of bytes in images buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools,
+                                       size_t *pools_len, char *images,
+                                       size_t *images_len)
+    __attribute__((deprecated));
+CEPH_RBD_API int rbd_list_children2(rbd_image_t image,
+                                    rbd_child_info_t *children,
+                                    int *max_children)
+    __attribute__((deprecated));
+CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child)
+    __attribute__((deprecated));
+CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children,
+                                            size_t num_children)
+    __attribute__((deprecated));
+
+CEPH_RBD_API int rbd_list_children3(rbd_image_t image,
+                                    rbd_linked_image_spec_t *images,
+                                    size_t *max_images);
+
+CEPH_RBD_API int rbd_list_descendants(rbd_image_t image,
+                                      rbd_linked_image_spec_t *images,
+                                      size_t *max_images);
+
+/**
+ * @defgroup librbd_h_locking Advisory Locking
+ *
+ * An rbd image may be locked exclusively, or shared, to facilitate
+ * e.g. live migration where the image may be open in two places at once.
+ * These locks are intended to guard against more than one client
+ * writing to an image without coordination. They don't need to
+ * be used for snapshots, since snapshots are read-only.
+ *
+ * Currently locks only guard against locks being acquired.
+ * They do not prevent anything else.
+ *
+ * A locker is identified by the internal rados client id of the
+ * holder and a user-defined cookie. This (client id, cookie) pair
+ * must be unique for each locker.
+ *
+ * A shared lock also has a user-defined tag associated with it. Each
+ * additional shared lock must specify the same tag or lock
+ * acquisition will fail. This can be used by e.g. groups of hosts
+ * using a clustered filesystem on top of an rbd image to make sure
+ * they're accessing the correct image.
+ *
+ * @{
+ */
+/**
+ * List clients that have locked the image and information about the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the image
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+			              char *tag, size_t *tag_len,
+			              char *clients, size_t *clients_len,
+			              char *cookies, size_t *cookies_len,
+			              char *addrs, size_t *addrs_len);
+
+/**
+ * Take an exclusive lock on the image.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie);
+
+/**
+ * Take a shared lock on the image.
+ *
+ * Other clients may also take a shared lock, as long as they use the
+ * same tag.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag user-defined identifier for this shared use of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie,
+                                 const char *tag);
+
+/**
+ * Release a shared or exclusive lock on the image.
+ *
+ * @param image the image to unlock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie);
+
+/**
+ * Release a shared or exclusive lock that was taken by the specified client.
+ *
+ * @param image the image to unlock
+ * @param client the entity holding the lock (as given by rbd_list_lockers())
+ * @param cookie user-defined identifier for the instance of the lock to break
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client,
+                                const char *cookie);
+
+/** @} locking */
+
+/* I/O */
+CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+                              char *buf);
+/**
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+                               char *buf, int op_flags);
+/* DEPRECATED; use rbd_read_iterate2 */
+CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+			              int (*cb)(uint64_t, size_t, const char *, void *),
+                                      void *arg);
+
+/**
+ * iterate read over an image
+ *
+ * Reads each region of the image and calls the callback.  If the
+ * buffer pointer passed to the callback is NULL, the given extent is
+ * defined to be zeros (a hole).  Normally the granularity for the
+ * callback is the image stripe size.
+ *
+ * @param image image to read
+ * @param ofs offset to start from
+ * @param len bytes of source image to cover
+ * @param cb callback for each region
+ * @returns 0 success, error otherwise
+ */
+CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+		                   int (*cb)(uint64_t, size_t, const char *, void *),
+                                   void *arg);
+/**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0).  If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image.  The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent 1 if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image,
+		                  const char *fromsnapname,
+		                  uint64_t ofs, uint64_t len,
+		                  int (*cb)(uint64_t, size_t, int, void *),
+                                  void *arg);
+CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image,
+		                   const char *fromsnapname,
+		                   uint64_t ofs, uint64_t len,
+                                   uint8_t include_parent, uint8_t whole_object,
+		                   int (*cb)(uint64_t, size_t, int, void *),
+                                   void *arg);
+CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+                               const char *buf);
+/**
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+                                const char *buf, int op_flags);
+CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
+CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+                                   const char *buf, size_t data_len,
+                                   int op_flags);
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+                                      size_t len, int zero_flags,
+                                      int op_flags);
+CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
+                                           size_t len, const char *cmp_buf,
+                                           const char *buf,
+                                           uint64_t *mismatch_off,
+                                           int op_flags);
+
+CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+                               const char *buf, rbd_completion_t c);
+
+/**
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+                                const char *buf, rbd_completion_t c,
+                                int op_flags);
+CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+                                int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+                              char *buf, rbd_completion_t c);
+/**
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+                               char *buf, rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+                               int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+                                 rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+                                   const char *buf, size_t data_len,
+                                   rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+                                      size_t len, rbd_completion_t c,
+                                      int zero_flags, int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
+                                               uint64_t off, size_t len,
+                                               const char *cmp_buf,
+                                               const char *buf,
+                                               rbd_completion_t c,
+                                               uint64_t *mismatch_off,
+                                               int op_flags);
+
+CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
+                                           rbd_callback_t complete_cb,
+                                           rbd_completion_t *c);
+CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c);
+CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c);
+CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c);
+CEPH_RBD_API void rbd_aio_release(rbd_completion_t c);
+CEPH_RBD_API int rbd_flush(rbd_image_t image);
+/**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
+
+CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp);
+
+CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
+CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
+CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
+/**
+ * List all metadata associated with this image.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the keys and values
+ * of the image, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list metadata of
+ * @param start which key to begin listing after
+ *        (use the empty string to start at the beginning)
+ * @param max the maximum number of entries to list (0 means no limit)
+ * @param keys buffer in which to store metadata keys
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store metadata values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success (presumably -- TODO confirm), negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+    char *keys, size_t *key_len, char *values, size_t *vals_len);
+
+// RBD image mirroring support functions
+CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image,
+                                           rbd_mirror_image_info_t *mirror_image_info,
+                                           size_t info_size);
+CEPH_RBD_API int rbd_mirror_image_get_status(rbd_image_t image,
+                                             rbd_mirror_image_status_t *mirror_image_status,
+                                             size_t status_size);
+CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image,
+                                                  char *instance_id,
+                                                  size_t *id_max_length);
+CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+                                              rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image,
+                                             rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image,
+                                               rbd_mirror_image_info_t *mirror_image_info,
+                                               size_t info_size,
+                                               rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_status(rbd_image_t image,
+                                                 rbd_mirror_image_status_t *mirror_image_status,
+                                                 size_t status_size,
+                                                 rbd_completion_t c);
+
+// RBD groups support functions
+CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size);
+CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+                                  const char *dest_name);
+CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+                                        size_t group_info_size);
+
+/**
+ * Register an image metadata change watcher.
+ *
+ * @param image the image to watch
+ * @param handle where to store the internal id assigned to this watch
+ * @param watch_cb what to do when a notify is received on this image
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+				  rbd_update_callback_t watch_cb, void *arg);
+
+/**
+ * Unregister an image watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle);
+
+/**
+ * List any watchers of an image.
+ *
+ * Watchers will be allocated and stored in the passed watchers array. If there
+ * are more watchers than max_watchers, -ERANGE will be returned and the number
+ * of watchers will be stored in max_watchers.
+ *
+ * The caller should call rbd_watchers_list_cleanup when finished with the list
+ * of watchers.
+ *
+ * @param image the image to list watchers for.
+ * @param watchers an array to store watchers in.
+ * @param max_watchers capacity of the watchers array.
+ * @returns 0 on success, negative error code on failure.
+ * @returns -ERANGE if there are too many watchers for the passed array.
+ * @returns the number of watchers in max_watchers.
+ */
+CEPH_RBD_API int rbd_watchers_list(rbd_image_t image,
+				   rbd_image_watcher_t *watchers,
+				   size_t *max_watchers);
+
+CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+					    size_t num_watchers);
+
+CEPH_RBD_API int rbd_config_image_list(rbd_image_t image,
+                                       rbd_config_option_t *options,
+                                       int *max_options);
+CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+                                                int max_options);
+
+CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p,
+                                     const char *group_name,
+                                     rados_ioctx_t image_p,
+                                     const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p,
+                                        const char *group_name,
+                                        rados_ioctx_t image_p,
+                                        const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+                                              const char *group_name,
+                                              rados_ioctx_t image_p,
+                                              const char *image_id);
+CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p,
+                                      const char *group_name,
+                                      rbd_group_image_info_t *images,
+                                      size_t group_image_info_size,
+                                      size_t *num_entries);
+CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+                                              size_t group_image_info_size,
+                                              size_t num_entries);
+
+CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p,
+                                       const char *group_name,
+                                       const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p,
+                                       const char *group_name,
+                                       const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p,
+                                       const char *group_name,
+                                       const char *old_snap_name,
+                                       const char *new_snap_name);
+CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p,
+                                     const char *group_name,
+                                     rbd_group_snap_info_t *snaps,
+                                     size_t group_snap_info_size,
+                                     size_t *num_entries);
+CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+                                             size_t group_snap_info_size,
+                                             size_t num_entries);
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p,
+                                         const char *group_name,
+                                         const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+                                                       const char *group_name,
+                                                       const char *snap_name,
+                                                       librbd_progress_fn_t cb,
+                                                       void *cbdata);
+
+CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io,
+                                      const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io,
+                                      const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names,
+                                    size_t *size);
+CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io,
+                                      const char *namespace_name,
+                                      bool *exists);
+
+CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force);
+
+CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats);
+CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats);
+CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+					          int stat_option,
+                                                  uint64_t* stat_val);
+CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
new file mode 100644
index 00000000..646c6bb3
--- /dev/null
+++ b/src/include/rbd/librbd.hpp
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef __LIBRBD_HPP
+#define __LIBRBD_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <vector>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+#include "librbd.h"
+
+namespace librbd {
+
+  using librados::IoCtx;
+
+  class Image;
+  class ImageOptions;
+  class PoolStats;
+  typedef void *image_ctx_t;
+  typedef void *completion_t;
+  typedef void (*callback_t)(completion_t cb, void *arg);
+
+  typedef struct {
+    std::string id;
+    std::string name;
+  } image_spec_t;
+
+  typedef struct {
+    int64_t pool_id;
+    std::string pool_name;
+    std::string pool_namespace;
+    std::string image_id;
+    std::string image_name;
+    bool trash;
+  } linked_image_spec_t;
+
+  typedef rbd_snap_namespace_type_t snap_namespace_type_t;
+
+  typedef struct {
+    uint64_t id;
+    snap_namespace_type_t namespace_type;
+    std::string name;
+  } snap_spec_t;
+
+  typedef struct {
+    uint64_t id;
+    uint64_t size;
+    std::string name;
+  } snap_info_t;
+
+  typedef struct {
+    int64_t group_pool;
+    std::string group_name;
+    std::string group_snap_name;
+  } snap_group_namespace_t;
+
+  typedef struct {
+    std::string client;
+    std::string cookie;
+    std::string address;
+  } locker_t;
+
+  typedef rbd_mirror_peer_direction_t mirror_peer_direction_t;
+
+  typedef struct {
+    std::string uuid;
+    std::string cluster_name;
+    std::string client_name;
+  } mirror_peer_t;
+
+  typedef rbd_mirror_image_state_t mirror_image_state_t;
+
+  typedef struct {
+    std::string global_id;
+    mirror_image_state_t state;
+    bool primary;
+  } mirror_image_info_t;
+
+  typedef rbd_mirror_image_status_state_t mirror_image_status_state_t;
+
+  typedef struct {
+    std::string name;
+    mirror_image_info_t info;
+    mirror_image_status_state_t state;
+    std::string description;
+    time_t last_update;
+    bool up;
+  } mirror_image_status_t;
+
+  typedef rbd_group_image_state_t group_image_state_t;
+
+  typedef struct {
+    std::string name;
+    int64_t pool;
+    group_image_state_t state;
+  } group_image_info_t;
+
+  typedef struct {
+    std::string name;
+    int64_t pool;
+  } group_info_t;
+
+  typedef rbd_group_snap_state_t group_snap_state_t;
+
+  typedef struct {
+    std::string name;
+    group_snap_state_t state;
+  } group_snap_info_t;
+
+  typedef rbd_image_info_t image_info_t;
+
+  class CEPH_RBD_API ProgressContext  // abstract interface taken by reference by the *_with_progress() calls
+  {
+  public:
+    virtual ~ProgressContext();  // declared here only; the definition is out of line
+    virtual int update_progress(uint64_t offset, uint64_t total) = 0;  // NOTE(review): meaning of the return value is not visible here -- confirm
+  };
+
+  typedef struct {
+    std::string id;
+    std::string name;
+    rbd_trash_image_source_t source;
+    time_t deletion_time;
+    time_t deferment_end_time;
+  } trash_image_info_t;
+
+  typedef struct {
+    std::string pool_name;
+    std::string image_name;
+    std::string image_id;
+    bool trash;
+  } child_info_t;
+
+  typedef struct {
+    std::string addr;
+    int64_t id;
+    uint64_t cookie;
+  } image_watcher_t;
+
+  typedef rbd_image_migration_state_t image_migration_state_t;
+
+  typedef struct {
+    int64_t source_pool_id;
+    std::string source_pool_namespace;
+    std::string source_image_name;
+    std::string source_image_id;
+    int64_t dest_pool_id;
+    std::string dest_pool_namespace;
+    std::string dest_image_name;
+    std::string dest_image_id;
+    image_migration_state_t state;
+    std::string state_description;
+  } image_migration_status_t;
+
+  typedef rbd_config_source_t config_source_t;
+
+  typedef struct {
+    std::string name;
+    std::string value;
+    config_source_t source;
+  } config_option_t;
+
+class CEPH_RBD_API RBD
+{
+public:
+  RBD();
+  ~RBD();
+
+  // Handle for one asynchronous operation.  It must be allocated with
+  // new and disposed of with release(); never use delete on it.
+  // complete_cb has the callback_t shape: void (*)(completion_t cb, void *arg).
+  struct AioCompletion {
+    void *pc;  // opaque pointer; NOTE(review): presumably the library-internal completion -- confirm
+    AioCompletion(void *cb_arg, callback_t complete_cb);
+    bool is_complete();
+    int wait_for_complete();
+    ssize_t get_return_value();
+    void *get_arg();  // NOTE(review): presumably the cb_arg given at construction -- confirm
+    void release();  // the only supported way to free the object (see the note above)
+  };
+
+  void version(int *major, int *minor, int *extra);
+
+  int open(IoCtx& io_ctx, Image& image, const char *name);
+  int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname);
+  int open_by_id(IoCtx& io_ctx, Image& image, const char *id);
+  int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname);
+  int aio_open(IoCtx& io_ctx, Image& image, const char *name,
+	       const char *snapname, RBD::AioCompletion *c);
+  int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+	             const char *snapname, RBD::AioCompletion *c);
+  // see librbd.h
+  int open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+		     const char *snapname);
+  int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+                           const char *snapname);
+  int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+			 const char *snapname, RBD::AioCompletion *c);
+  int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+                               const char *snapname, RBD::AioCompletion *c);
+
+  int list(IoCtx& io_ctx, std::vector<std::string>& names)
+    __attribute__((deprecated));
+  int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images);
+
+  int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order);
+  int create2(IoCtx& io_ctx, const char *name, uint64_t size,
+	      uint64_t features, int *order);
+  int create3(IoCtx& io_ctx, const char *name, uint64_t size,
+	      uint64_t features, int *order,
+	      uint64_t stripe_unit, uint64_t stripe_count);
+  int create4(IoCtx& io_ctx, const char *name, uint64_t size,
+	      ImageOptions& opts);
+  int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+	       IoCtx& c_ioctx, const char *c_name, uint64_t features,
+	       int *c_order);
+  int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+	     IoCtx& c_ioctx, const char *c_name, uint64_t features,
+	     int *c_order, uint64_t stripe_unit, int stripe_count);
+  int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+	     IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
+  int remove(IoCtx& io_ctx, const char *name);
+  int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
+  int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
+
+  int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay);
+  int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info);
+  int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries);
+  int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold);
+  int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold,
+                                ProgressContext &pctx);
+  int trash_remove(IoCtx &io_ctx, const char *image_id, bool force);
+  int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+                                 bool force, ProgressContext &pctx);
+  int trash_restore(IoCtx &io_ctx, const char *id, const char *name);
+
+  // Migration
+  int migration_prepare(IoCtx& io_ctx, const char *image_name,
+                        IoCtx& dest_io_ctx, const char *dest_image_name,
+                        ImageOptions& opts);
+  int migration_execute(IoCtx& io_ctx, const char *image_name);
+  int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name,
+                                      ProgressContext &prog_ctx);
+  int migration_abort(IoCtx& io_ctx, const char *image_name);
+  int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+                                    ProgressContext &prog_ctx);
+  int migration_commit(IoCtx& io_ctx, const char *image_name);
+  int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+                                     ProgressContext &prog_ctx);
+  int migration_status(IoCtx& io_ctx, const char *image_name,
+                       image_migration_status_t *status, size_t status_size);
+
+  // RBD pool mirroring support functions
+  int mirror_site_name_get(librados::Rados& rados, std::string* site_name);
+  int mirror_site_name_set(librados::Rados& rados,
+                           const std::string& site_name);
+
+  int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
+  int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+  int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token);
+  int mirror_peer_bootstrap_import(IoCtx& io_ctx,
+                                   mirror_peer_direction_t direction,
+                                   const std::string &token);
+
+  int mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+                      const std::string &cluster_name,
+                      const std::string &client_name);
+  int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid);
+  int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers);
+  int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+                             const std::string &client_name);
+  int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+                              const std::string &cluster_name);
+  int mirror_peer_get_attributes(
+      IoCtx& io_ctx, const std::string &uuid,
+      std::map<std::string, std::string> *key_vals);
+  int mirror_peer_set_attributes(
+      IoCtx& io_ctx, const std::string &uuid,
+      const std::map<std::string, std::string>& key_vals);
+
+  int mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id,
+      size_t max, std::map<std::string, mirror_image_status_t> *images);
+  int mirror_image_status_summary(IoCtx& io_ctx,
+      std::map<mirror_image_status_state_t, int> *states);
+  int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id,
+      size_t max, std::map<std::string, std::string> *service_ids);
+
+  // RBD groups support functions
+  int group_create(IoCtx& io_ctx, const char *group_name);
+  int group_remove(IoCtx& io_ctx, const char *group_name);
+  int group_list(IoCtx& io_ctx, std::vector<std::string> *names);
+  int group_rename(IoCtx& io_ctx, const char *src_group_name,
+                   const char *dest_group_name);
+
+  int group_image_add(IoCtx& io_ctx, const char *group_name,
+		      IoCtx& image_io_ctx, const char *image_name);
+  int group_image_remove(IoCtx& io_ctx, const char *group_name,
+			 IoCtx& image_io_ctx, const char *image_name);
+  int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name,
+                               IoCtx& image_io_ctx, const char *image_id);
+  int group_image_list(IoCtx& io_ctx, const char *group_name,
+                       std::vector<group_image_info_t> *images,
+                       size_t group_image_info_size);
+
+  int group_snap_create(IoCtx& io_ctx, const char *group_name,
+			const char *snap_name);
+  int group_snap_remove(IoCtx& io_ctx, const char *group_name,
+			const char *snap_name);
+  int group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+                        const char *old_snap_name, const char *new_snap_name);
+  int group_snap_list(IoCtx& group_ioctx, const char *group_name,
+                      std::vector<group_snap_info_t> *snaps,
+                      size_t group_snap_info_size);
+  int group_snap_rollback(IoCtx& io_ctx, const char *group_name,
+                          const char *snap_name);
+  int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name,
+                                        const char *snap_name,
+                                        ProgressContext& pctx);
+
+  int namespace_create(IoCtx& ioctx, const char *namespace_name);
+  int namespace_remove(IoCtx& ioctx, const char *namespace_name);
+  int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names);
+  int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists);
+
+  int pool_init(IoCtx& io_ctx, bool force);
+  int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats);
+
+  int pool_metadata_get(IoCtx &io_ctx, const std::string &key,
+                        std::string *value);
+  int pool_metadata_set(IoCtx &io_ctx, const std::string &key,
+                        const std::string &value);
+  int pool_metadata_remove(IoCtx &io_ctx, const std::string &key);
+  int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max,
+                         std::map<std::string, ceph::bufferlist> *pairs);
+
+  int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options);
+
+private:
+  /* We don't allow assignment or copying */
+  RBD(const RBD& rhs);
+  const RBD& operator=(const RBD& rhs);
+};
+
+class CEPH_RBD_API ImageOptions {  // C++ wrapper holding one rbd_image_options_t handle
+public:
+  ImageOptions();
+  ImageOptions(rbd_image_options_t opts);  // not explicit: converts implicitly; NOTE(review): handle ownership is not visible here
+  ImageOptions(const ImageOptions &imgopts);
+  ~ImageOptions();
+
+  int set(int optname, const std::string& optval);  // options come in string and uint64_t flavours
+  int set(int optname, uint64_t optval);
+  int get(int optname, std::string* optval) const;
+  int get(int optname, uint64_t* optval) const;
+  int is_set(int optname, bool* is_set);  // a query, yet declared non-const unlike get()/empty()
+  int unset(int optname);
+  void clear();
+  bool empty() const;
+
+private:
+  friend class RBD;    // RBD and Image are granted direct access to opts
+  friend class Image;
+
+  rbd_image_options_t opts;  // no operator= is declared, so the implicit one copies this handle -- NOTE(review): confirm that is safe
+};
+
+class CEPH_RBD_API PoolStats {  // C++ wrapper holding one rbd_pool_stats_t handle; consumed by RBD::pool_stats_get()
+public:
+  PoolStats();
+  ~PoolStats();
+
+  PoolStats(const PoolStats&) = delete;  // non-copyable
+  PoolStats& operator=(const PoolStats&) = delete;
+
+  int add(rbd_pool_stat_option_t option, uint64_t* opt_val);  // NOTE(review): presumably *opt_val is filled in by pool_stats_get() -- confirm
+
+private:
+  friend class RBD;  // RBD is granted direct access to pool_stats
+
+  rbd_pool_stats_t pool_stats;
+};
+
+class CEPH_RBD_API UpdateWatchCtx {
+public:
+  virtual ~UpdateWatchCtx() = default;
+  /**
+   * Invoked whenever a notify event is received.
+   */
+  virtual void handle_notify() = 0;
+};
+
+class CEPH_RBD_API Image
+{
+public:
+  Image();
+  ~Image();
+
+  int close();
+  int aio_close(RBD::AioCompletion *c);
+
+  int resize(uint64_t size);
+  int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx);
+  int resize_with_progress(uint64_t size, ProgressContext& pctx);
+  int stat(image_info_t &info, size_t infosize);
+  int get_name(std::string *name);
+  int get_id(std::string *id);
+  std::string get_block_name_prefix();
+  int64_t get_data_pool_id();
+  int parent_info(std::string *parent_poolname, std::string *parent_name,
+		  std::string *parent_snapname)
+      __attribute__((deprecated));
+  int parent_info2(std::string *parent_poolname, std::string *parent_name,
+                   std::string *parent_id, std::string *parent_snapname)
+      __attribute__((deprecated));
+  int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap);
+
+  int old_format(uint8_t *old);
+  int size(uint64_t *size);
+  int get_group(group_info_t *group_info, size_t group_info_size);
+  int features(uint64_t *features);
+  int update_features(uint64_t features, bool enabled);
+  int get_op_features(uint64_t *op_features);
+  int overlap(uint64_t *overlap);
+  int get_flags(uint64_t *flags);
+  int set_image_notification(int fd, int type);
+
+  /* exclusive lock feature */
+  int is_exclusive_lock_owner(bool *is_owner);
+  int lock_acquire(rbd_lock_mode_t lock_mode);
+  int lock_release();
+  int lock_get_owners(rbd_lock_mode_t *lock_mode,
+                      std::list<std::string> *lock_owners);
+  int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner);
+
+  /* object map feature */
+  int rebuild_object_map(ProgressContext &prog_ctx);
+
+  int check_object_map(ProgressContext &prog_ctx);
+
+  int copy(IoCtx& dest_io_ctx, const char *destname);
+  int copy2(Image& dest);
+  int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+  int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts,
+	    size_t sparse_size);
+  int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+			 ProgressContext &prog_ctx);
+  int copy_with_progress2(Image& dest, ProgressContext &prog_ctx);
+  int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+			  ImageOptions& opts, ProgressContext &prog_ctx);
+  int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+			  ImageOptions& opts, ProgressContext &prog_ctx,
+			  size_t sparse_size);
+
+  /* deep copy */
+  int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+  int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+                              ImageOptions& opts, ProgressContext &prog_ctx);
+
+  /* striping */
+  uint64_t get_stripe_unit() const;
+  uint64_t get_stripe_count() const;
+
+  int get_create_timestamp(struct timespec *timestamp);
+  int get_access_timestamp(struct timespec *timestamp);
+  int get_modify_timestamp(struct timespec *timestamp);
+
+  int flatten();
+  int flatten_with_progress(ProgressContext &prog_ctx);
+
+  int sparsify(size_t sparse_size);
+  int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx);
+  /**
+   * Returns a pair of poolname, imagename for each clone
+   * of this image at the currently set snapshot.
+   */
+  int list_children(std::set<std::pair<std::string, std::string> > *children)
+      __attribute__((deprecated));
+  /**
+  * Returns a structure of poolname, imagename, imageid and trash flag
+  * for each clone of this image at the currently set snapshot.
+  */
+  int list_children2(std::vector<librbd::child_info_t> *children)
+      __attribute__((deprecated));
+  int list_children3(std::vector<linked_image_spec_t> *images);
+  int list_descendants(std::vector<linked_image_spec_t> *images);
+
+  /* advisory locking (see librbd.h for details) */
+  int list_lockers(std::list<locker_t> *lockers,
+		   bool *exclusive, std::string *tag);
+  int lock_exclusive(const std::string& cookie);
+  int lock_shared(const std::string& cookie, const std::string& tag);
+  int unlock(const std::string& cookie);
+  int break_lock(const std::string& client, const std::string& cookie);
+
+  /* snapshots */
+  int snap_list(std::vector<snap_info_t>& snaps);
+  /* DEPRECATED; use snap_exists2 */
+  bool snap_exists(const char *snapname) __attribute__ ((deprecated));
+  int snap_exists2(const char *snapname, bool *exists);
+  int snap_create(const char *snapname);
+  int snap_remove(const char *snapname);
+  int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+  int snap_remove_by_id(uint64_t snap_id);
+  int snap_rollback(const char *snap_name);
+  int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx);
+  int snap_protect(const char *snap_name);
+  int snap_unprotect(const char *snap_name);
+  int snap_is_protected(const char *snap_name, bool *is_protected);
+  int snap_set(const char *snap_name);
+  int snap_set_by_id(uint64_t snap_id);
+  int snap_rename(const char *srcname, const char *dstname);
+  int snap_get_limit(uint64_t *limit);
+  int snap_set_limit(uint64_t limit);
+  int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp);
+  int snap_get_namespace_type(uint64_t snap_id,
+                              snap_namespace_type_t *namespace_type);
+  int snap_get_group_namespace(uint64_t snap_id,
+                               snap_group_namespace_t *group_namespace,
+                               size_t snap_group_namespace_size);
+  int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name);
+
+  /* I/O */
+  ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+  /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+  ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+  int64_t read_iterate(uint64_t ofs, size_t len,
+		       int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+  int read_iterate2(uint64_t ofs, uint64_t len,
+		    int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+  /**
+   * get difference between two versions of an image
+   *
+   * This will return the differences between two versions of an image
+   * via a callback, which gets the offset and length and a flag
+   * indicating whether the extent exists (1), or is known/defined to
+   * be zeros (a hole, 0).  If the source snapshot name is NULL, we
+   * interpret that as the beginning of time and return all allocated
+   * regions of the image.  The end version is whatever is currently
+   * selected for the image handle (either a snapshot or the writeable
+   * head).
+   *
+   * @param fromsnapname start snapshot name, or NULL
+   * @param ofs start offset
+   * @param len len in bytes of region to report on
+   * @param include_parent true if full history diff should include parent
+   * @param whole_object 1 if diff extents should cover whole object
+   * @param cb callback to call for each allocated region
+   * @param arg argument to pass to the callback
+   * @returns 0 on success, or negative error code on error
+   */
+  int diff_iterate(const char *fromsnapname,
+		   uint64_t ofs, uint64_t len,
+		   int (*cb)(uint64_t, size_t, int, void *), void *arg);
+  int diff_iterate2(const char *fromsnapname,
+		    uint64_t ofs, uint64_t len,
+                    bool include_parent, bool whole_object,
+		    int (*cb)(uint64_t, size_t, int, void *), void *arg);
+
+  ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+  /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+  ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
+  int discard(uint64_t ofs, uint64_t len);
+  ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+  ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
+  ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+                            ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
+
+  int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+  /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+  int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
+		  RBD::AioCompletion *c, int op_flags);
+
+  int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
+  int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
+                    RBD::AioCompletion *c, int op_flags);
+  int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+                       int zero_flags, int op_flags);
+
+  int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+                            ceph::bufferlist& bl, RBD::AioCompletion *c,
+                            uint64_t *mismatch_off, int op_flags);
+
+  /**
+   * read async from image
+   *
+   * The target bufferlist is populated with references to buffers
+   * that contain the data for the given extent of the image.
+   *
+   * NOTE: If caching is enabled, the bufferlist will directly
+   * reference buffers in the cache to avoid an unnecessary data copy.
+   * As a result, if the user intends to modify the buffer contents
+   * directly, they should make a copy first (unconditionally, or when
+   * the reference count on the underlying buffer is more than 1).
+   *
+   * @param off offset in image
+   * @param len length of read
+   * @param bl bufferlist to read into
+   * @param c aio completion to notify when read is complete
+   */
+  int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+  /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+  int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
+		  RBD::AioCompletion *c, int op_flags);
+
+  int flush();
+  /**
+   * Start a flush if caching is enabled. Get a callback when
+   * the currently pending writes are on disk.
+   *
+   * The image flushed is this Image object (there is no image parameter).
+   * @param c what to call when flushing is complete
+   * @returns 0 on success, negative error code on failure
+   */
+  int aio_flush(RBD::AioCompletion *c);
+
+  /**
+   * Drop any cached data for this image
+   *
+   * @returns 0 on success, negative error code on failure
+   */
+  int invalidate_cache();
+
+  int poll_io_events(RBD::AioCompletion **comps, int numcomp);
+
+  int metadata_get(const std::string &key, std::string *value);
+  int metadata_set(const std::string &key, const std::string &value);
+  int metadata_remove(const std::string &key);
+  /**
+   * Returns key/value pairs for this image through *pairs
+   */
+  int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+
+  // RBD image mirroring support functions
+  int mirror_image_enable();
+  int mirror_image_disable(bool force);
+  int mirror_image_promote(bool force);
+  int mirror_image_demote();
+  int mirror_image_resync();
+  int mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                            size_t info_size);
+  int mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+			      size_t status_size);
+  int mirror_image_get_instance_id(std::string *instance_id);
+  int aio_mirror_image_promote(bool force, RBD::AioCompletion *c);
+  int aio_mirror_image_demote(RBD::AioCompletion *c);
+  int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                                size_t info_size, RBD::AioCompletion *c);
+  int aio_mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+                                  size_t status_size, RBD::AioCompletion *c);
+
+  int update_watch(UpdateWatchCtx *ctx, uint64_t *handle);
+  int update_unwatch(uint64_t handle);
+
+  int list_watchers(std::list<image_watcher_t> &watchers);
+
+  int config_list(std::vector<config_option_t> *options);
+
+private:
+  friend class RBD;
+
+  Image(const Image& rhs);
+  const Image& operator=(const Image& rhs);
+
+  image_ctx_t ctx;
+};
+
+}
+
+#endif
diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h
new file mode 100644
index 00000000..54852caa
--- /dev/null
+++ b/src/include/rbd/object_map_types.h
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H
+#define CEPH_RBD_OBJECT_MAP_TYPES_H
+
+#include "include/int_types.h"
+
+static const uint8_t OBJECT_NONEXISTENT  = 0;  /* file-scope static: each including file gets its own copy */
+static const uint8_t OBJECT_EXISTS       = 1;
+static const uint8_t OBJECT_PENDING      = 2;
+static const uint8_t OBJECT_EXISTS_CLEAN = 3;  /* values span 0..3; NOTE(review): presumably stored as 2 bits per object -- confirm */
+
+#endif // CEPH_RBD_OBJECT_MAP_TYPES_H
diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h
new file mode 100644
index 00000000..35a1a8bc
--- /dev/null
+++ b/src/include/rbd_types.h
@@ -0,0 +1,159 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include "include/types.h"
+#include "rbd/features.h"
+
+/* New-style rbd image 'foo' consists of objects
+ *   rbd_id.foo              - id of image
+ *   rbd_header.<id>         - image metadata
+ *   rbd_object_map.<id>     - optional image object map
+ *   rbd_data.<id>.00000000
+ *   rbd_data.<id>.00000001
+ *   ...                     - data
+ */
+
+#define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX  "rbd_object_map."
+#define RBD_DATA_PREFIX        "rbd_data."
+#define RBD_ID_PREFIX          "rbd_id."
+
+/*
+ * old-style rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   rb.<idhi>.<idlo>.00000000
+ *   rb.<idhi>.<idlo>.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX	 	".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+#define RBD_NAMESPACE           "rbd_namespace"
+#define RBD_TASK                "rbd_task"
+
+/*
+ * rbd_children object in each pool contains omap entries
+ * that map parent (poolid, imageid, snapid) to a list of children
+ * (imageids; snapids aren't required because we get all the snapshot
+ * info from a read of the child's header object anyway).
+ *
+ * The clone operation writes a new item to this child list, and rm or
+ * flatten removes an item, and may remove the whole entry if no children
+ * exist after the rm/flatten.
+ *
+ * When attempting to remove a parent, all pools are searched for
+ * rbd_children objects with entries referring to that parent; if any
+ * exist (and those children exist), the parent removal is prevented.
+ */
+#define RBD_CHILDREN		"rbd_children"
+#define RBD_LOCK_NAME		"rbd_lock"
+
+/**
+ * rbd_mirroring object in each pool contains pool-specific settings
+ * for configuring mirroring.
+ */
+#define RBD_MIRRORING       "rbd_mirroring"
+
+/**
+ * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used
+ * for pool-level coordination between rbd-mirror daemons.
+ */
+#define RBD_MIRROR_LEADER               "rbd_mirror_leader"
+#define RBD_MIRROR_INSTANCE_PREFIX      "rbd_mirror_instance."
+
+#define RBD_MAX_OBJ_NAME_SIZE	96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+/**
+ * Maximum string length of the RBD v2 image id (not including
+ * null termination). This limit was derived from the existing
+ * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data."
+ * prefix and null termination: 24 - 9 ("rbd_data.") - 1 (NUL) = 14.
+ */
+#define RBD_MAX_IMAGE_ID_LENGTH 14
+
+/**
+ * Maximum string length of the RBD block object name prefix (not including
+ * null termination).
+ *
+ * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra>  (3+8+1+8+1+8 = 29)
+ * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id>  (9+19+1+14 = 43)
+ * The v2 form is the longer of the two and therefore sets the limit.
+ * Note: new features might require increasing this maximum prefix length.
+ */
+#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+#define RBD_HEADER_TEXT		"<<< Rados Block Device Image >>>\n"
+#define RBD_MIGRATE_HEADER_TEXT	"<<< Migrating RBD Image      >>>\n"
+#define RBD_HEADER_SIGNATURE	"RBD"
+#define RBD_HEADER_VERSION	"001.005"
+
+#define RBD_GROUP_INVALID_POOL (-1)
+
+#define RBD_GROUP_HEADER_PREFIX "rbd_group_header."
+
+#define RBD_GROUP_DIRECTORY "rbd_group_directory"
+
+#define RBD_TRASH "rbd_trash"
+
+/**
+ * MON config-key prefix for storing optional remote cluster connectivity
+ * parameters
+ */
+#define RBD_MIRROR_CONFIG_KEY_PREFIX          "rbd/mirror/"
+#define RBD_MIRROR_SITE_NAME_CONFIG_KEY       RBD_MIRROR_CONFIG_KEY_PREFIX "site_name"
+#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY  RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id"
+#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX     RBD_MIRROR_CONFIG_KEY_PREFIX "peer/"
+
+struct rbd_info {
+	ceph_le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_obj_snap_ondisk {
+	ceph_le64 id;
+	ceph_le64 image_size;
+} __attribute__((packed));
+
+struct rbd_obj_header_ondisk {	/* packed: no padding is inserted between members */
+	char text[40];	/* room for RBD_HEADER_TEXT / RBD_MIGRATE_HEADER_TEXT (33 chars + NUL); presumably holds one of them */
+	char block_name[RBD_MAX_BLOCK_NAME_SIZE];
+	char signature[4];	/* sized for RBD_HEADER_SIGNATURE ("RBD" + NUL); presumably holds it */
+	char version[8];	/* sized for RBD_HEADER_VERSION ("001.005" + NUL); presumably holds it */
+	struct {
+		__u8 order;
+		__u8 crypt_type;	/* cf. RBD_CRYPT_NONE */
+		__u8 comp_type;	/* cf. RBD_COMP_NONE */
+		__u8 unused;
+	} __attribute__((packed)) options;
+	ceph_le64 image_size;
+	ceph_le64 snap_seq;
+	ceph_le32 snap_count;
+	ceph_le32 reserved;
+	ceph_le64 snap_names_len;
+	struct rbd_obj_snap_ondisk snaps[0];	/* zero-length trailing array (compiler extension); NOTE(review): presumably snap_count entries follow -- confirm */
+} __attribute__((packed));
+
+enum {  /* NOTE(review): presumably the snapshot protection state (cf. snap_protect/snap_unprotect) -- confirm */
+  RBD_PROTECTION_STATUS_UNPROTECTED  = 0,
+  RBD_PROTECTION_STATUS_UNPROTECTING = 1,
+  RBD_PROTECTION_STATUS_PROTECTED    = 2,
+  RBD_PROTECTION_STATUS_LAST         = 3  /* one past the highest defined status */
+};
+
+#endif
diff --git a/src/include/rgw/librgw_admin_user.h b/src/include/rgw/librgw_admin_user.h
new file mode 100644
index 00000000..e1dd5a29
--- /dev/null
+++ b/src/include/rgw/librgw_admin_user.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * create rgw admin user
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef LIB_RGW_ADMIN_USER_H
+#define LIB_RGW_ADMIN_USER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_ADMIN_USER_VER_MAJOR 1
+#define LIBRGW_ADMIN_USER_VER_MINOR 0
+#define LIBRGW_ADMIN_USER_VER_EXTRA 0
+
+#define LIBRGW_ADMIN_USER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* args parenthesised so expression arguments pack correctly */
+#define LIBRGW_ADMIN_USER_VERSION_CODE LIBRGW_ADMIN_USER_VERSION(LIBRGW_ADMIN_USER_VER_MAJOR, LIBRGW_ADMIN_USER_VER_MINOR, LIBRGW_ADMIN_USER_VER_EXTRA)
+
+typedef void* librgw_admin_user_t;
+int librgw_admin_user_create(librgw_admin_user_t *rgw_admin_user, int argc, char **argv);
+void librgw_admin_user_shutdown(librgw_admin_user_t rgw_admin_user);
+
+struct rgw_user_info	/* one member per value parameter of rgw_admin_create_user(), with the same names */
+{
+  const char *uid;
+  const char *display_name;
+  const char *access_key;
+  const char* secret_key;
+  const char* email;
+  const char *caps;
+  const char *access;
+  bool admin;
+  bool system;
+};	/* NOTE(review): presumably filled in by rgw_admin_user_info(); string ownership is not visible here */
+
+/*
+ * create a new rgw user
+ */
+int rgw_admin_create_user(librgw_admin_user_t rgw_admin_user, const char *uid,
+			  const char *display_name,  const char *access_key, const char* secret_key,
+			  const char *email, const char *caps,
+			  const char *access, bool admin, bool system);
+
+/*
+ * get rgw user info (the struct tag is spelled out so this prototype is valid C as well as C++)
+ */
+int rgw_admin_user_info(librgw_admin_user_t rgw_admin_user,const char * uid, struct rgw_user_info* user_info);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIB_RGW_ADMIN_USER_H */
diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h
new file mode 100644
index 00000000..878d8c16
--- /dev/null
+++ b/src/include/scope_guard.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef SCOPE_GUARD
+#define SCOPE_GUARD
+
+#include <utility>
+
+template <typename F>	// F: callable invoked as std::move(f)() with no arguments
+struct scope_guard {	// calls f from the destructor, once per guard object
+  F f;
+  scope_guard() = delete;
+  scope_guard(const scope_guard &) = delete;
+  scope_guard(scope_guard &&) = default;	// NOTE: a moved-from guard still calls its own f when destroyed
+  scope_guard & operator=(const scope_guard &) = delete;
+  scope_guard & operator=(scope_guard &&) = default;
+  scope_guard(const F& f) : f(f) {}	// copies the callable into the guard
+  scope_guard(F &&f) : f(std::move(f)) {}	// moves the callable into the guard
+  template<typename... Args>
+  scope_guard(std::in_place_t, Args&& ...args) : f(std::forward<Args>(args)...) {}	// constructs F in place from args
+  ~scope_guard() {
+    std::move(f)(); // Support at-most-once functions
+  }
+};
+
+template <typename F>	// F is deduced from the argument (forwarding reference)
+scope_guard<F> make_scope_guard(F &&f) {	// the returned guard calls f when it is destroyed
+  return scope_guard<F>(std::forward<F>(f));	// returned as a prvalue: no intermediate guard object exists that could fire early
+}
+
+template<typename F, typename... Args>	// F is named explicitly through the std::in_place_type_t<F> tag
+scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&& ...args) {
+  return { std::in_place, std::forward<Args>(args)... };	// list-initialises the result, so F is built from args inside the guard
+}
+
+#endif
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 00000000..14b5efa1
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+#include <sys/socket.h>
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+#  define CEPH_USE_SO_NOSIGPIPE
+# else
+#  define CEPH_USE_SIGPIPE_BLOCKER
+#  warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
+
+int socket_cloexec(int domain, int type, int protocol);
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2]);
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen);
+
+#endif
diff --git a/src/include/spinlock.h b/src/include/spinlock.h
new file mode 100644
index 00000000..3f12bdc0
--- /dev/null
+++ b/src/include/spinlock.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ * @author Jesse Williamson <jwilliamson@suse.de>
+ *
+*/
+
+#ifndef CEPH_SPINLOCK_HPP
+#define CEPH_SPINLOCK_HPP
+
+#include <atomic>
+
+namespace ceph {
+inline namespace version_1_0 {
+
+class spinlock;
+
+inline void spin_lock(std::atomic_flag& lock);
+inline void spin_unlock(std::atomic_flag& lock);
+inline void spin_lock(ceph::spinlock& lock);
+inline void spin_unlock(ceph::spinlock& lock);
+
+/* A pre-packaged spinlock type modelling BasicLockable: */
+class spinlock final
+{
+public:
+  // Spin until the underlying flag has been acquired.
+  void lock() { ceph::spin_lock(af); }
+
+  // Release the underlying flag.
+  void unlock() noexcept { ceph::spin_unlock(af); }
+
+private:
+  // The default member initializer leaves the flag clear, i.e. unlocked.
+  std::atomic_flag af = ATOMIC_FLAG_INIT;
+};
+
+// Free functions:
+inline void spin_lock(std::atomic_flag& lock)
+{
+ for (;;)  // busy-wait until test_and_set reports that the flag was previously clear
+  if (!lock.test_and_set(std::memory_order_acquire)) return;
+}
+
+inline void spin_unlock(std::atomic_flag& lock)
+{
+ lock.clear(std::memory_order_release);  // release ordering pairs with the acquire in spin_lock()
+}
+
+inline void spin_lock(std::atomic_flag *lock)  // pointer overload; lock is dereferenced without a null check
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(std::atomic_flag *lock)  // pointer overload; lock is dereferenced without a null check
+{
+ spin_unlock(*lock);
+}
+
+inline void spin_lock(ceph::spinlock& lock)  // free-function spelling of lock.lock()
+{
+ lock.lock();
+}
+
+inline void spin_unlock(ceph::spinlock& lock)  // free-function spelling of lock.unlock()
+{
+ lock.unlock();
+}
+
+inline void spin_lock(ceph::spinlock *lock)  // pointer overload; lock is dereferenced without a null check
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(ceph::spinlock *lock)  // pointer overload; lock is dereferenced without a null check
+{
+ spin_unlock(*lock);
+}
+
+} // inline namespace (version)
+} // namespace ceph
+
+#endif
diff --git a/src/include/stat.h b/src/include/stat.h
new file mode 100644
index 00000000..19398758
--- /dev/null
+++ b/src/include/stat.h
@@ -0,0 +1,145 @@
+#ifndef CEPH_STAT_H
+#define CEPH_STAT_H
+
+#include <acconfig.h>
+
+#include <sys/stat.h>
+
+/*
+ * Access time-related `struct stat` members.
+ *
+ * Note that for each of the stat member get/set functions below, setting a
+ * high-res value (stat_set_*_nsec) on a platform without high-res support is
+ * a no-op.
+ */
+
+#ifdef HAVE_STAT_ST_MTIM_TV_NSEC
+
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_mtim.tv_nsec;	/* converted to uint32_t on return */
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_mtim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_atim.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_atim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_ctim.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_ctim.tv_nsec = nsec;
+}
+
+#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC)
+
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_mtimespec.tv_nsec;	/* converted to uint32_t on return */
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_mtimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_atimespec.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_atimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_ctimespec.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)	/* writes only the nanosecond field */
+{
+  st->st_ctimespec.tv_nsec = nsec;
+}
+
+#else
+
+static inline uint32_t stat_get_mtime_nsec(const struct stat *st)	/* no high-res field: always 0 */
+{
+  return 0;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)	/* deliberate no-op */
+{
+}
+
+static inline uint32_t stat_get_atime_nsec(const struct stat *st)	/* no high-res field: always 0 */
+{
+  return 0;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)	/* deliberate no-op */
+{
+}
+
+static inline uint32_t stat_get_ctime_nsec(const struct stat *st)	/* no high-res field: always 0 */
+{
+  return 0;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)	/* deliberate no-op */
+{
+}
+
+#endif
+
+/*
+ * Access second-resolution `struct stat` members.
+ */
+
+static inline uint32_t stat_get_mtime_sec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_mtime;	/* NOTE: converted to uint32_t, so anything wider is truncated */
+}
+
+static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec)	/* writes the whole-second field only */
+{
+  st->st_mtime = sec;
+}
+
+static inline uint32_t stat_get_atime_sec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_atime;	/* NOTE: converted to uint32_t, so anything wider is truncated */
+}
+
+static inline void stat_set_atime_sec(struct stat *st, uint32_t sec)	/* writes the whole-second field only */
+{
+  st->st_atime = sec;
+}
+
+static inline uint32_t stat_get_ctime_sec(const struct stat *st)	/* read-only, so a const stat is accepted */
+{
+  return st->st_ctime;	/* NOTE: converted to uint32_t, so anything wider is truncated */
+}
+
+static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec)	/* writes the whole-second field only */
+{
+  st->st_ctime = sec;
+}
+
+#endif
diff --git a/src/include/statlite.h b/src/include/statlite.h
new file mode 100644
index 00000000..2ab3a940
--- /dev/null
+++ b/src/include/statlite.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_STATLITE_H
+#define CEPH_STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+struct statlite {  /* stat-like record whose size/blocks/time fields are optional */
+  dev_t         st_dev;      /* device */
+  ino_t         st_ino;      /* inode */
+  mode_t        st_mode;     /* protection */
+  nlink_t       st_nlink;    /* number of hard links */
+  uid_t         st_uid;      /* user ID of owner */
+  gid_t         st_gid;      /* group ID of owner */
+  dev_t         st_rdev;     /* device type (if inode device)*/
+  unsigned long st_litemask; /* bit mask for optional fields; presumably S_STATLITE_* bits */
+  /***************************************************************/
+  /**** Remaining fields are optional according to st_litemask ***/
+  off_t         st_size;     /* total size, in bytes         */
+  blksize_t     st_blksize;  /* blocksize for filesystem I/O */
+  blkcnt_t      st_blocks;   /* number of blocks allocated   */
+  struct timespec st_atim;            /* Time of last access.  */
+  struct timespec st_mtim;            /* Time of last modification.  */
+  struct timespec st_ctim;            /* Time of last status change.  */
+  //time_t        st_atime;    /* disabled: time of last access (st_atim is used instead) */
+  //time_t        st_mtime;    /* disabled: time of last modification (st_mtim is used instead) */
+  //time_t        st_ctime;    /* disabled: time of last change (st_ctim is used instead) */
+}; 
+
+#define S_STATLITE_SIZE     1
+#define S_STATLITE_BLKSIZE  2
+#define S_STATLITE_BLOCKS   4
+#define S_STATLITE_ATIME    8
+#define S_STATLITE_MTIME    16
+#define S_STATLITE_CTIME    32
+/* S_REQUIRE*(m): mask m with one field bit added; m is parenthesised so any expression is safe. */
+#define S_REQUIRESIZE(m)      ((m) | S_STATLITE_SIZE)
+#define S_REQUIREBLKSIZE(m)   ((m) | S_STATLITE_BLKSIZE)
+#define S_REQUIREBLOCKS(m)    ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m)     ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m)     ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m)     ((m) | S_STATLITE_CTIME)
+/* S_ISVALID*(m): non-zero when the field bit is set in mask m. */
+#define S_ISVALIDSIZE(m)      ((m) & S_STATLITE_SIZE)
+#define S_ISVALIDBLKSIZE(m)   ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m)    ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m)     ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m)     ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m)     ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus {  /* a directory entry bundled with a full struct stat */
+ struct dirent     d_dirent;  /* dirent struct for this entry */
+ struct stat       d_stat;    /* attributes for this entry */
+ int               d_stat_err;/* errno for d_stat, or 0 */
+};
+struct dirent_lite {  /* same shape as dirent_plus, but carries a struct statlite */
+ struct dirent     d_dirent;  /* dirent struct for this entry */
+ struct statlite   d_stat;    /* attributes for this entry */
+ int               d_stat_err;/* errno for d_stat, or 0 */
+};
+
+}
+#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
new file mode 100644
index 00000000..518db1ca
--- /dev/null
+++ b/src/include/str_list.h
@@ -0,0 +1,129 @@
+#ifndef CEPH_STRLIST_H
+#define CEPH_STRLIST_H
+
+#include <list>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace ceph {
+
+/// Split a string using the given delimiters, passing each piece as a
+/// (non-null-terminated) std::string_view to the callback.
+template <typename Func> // where Func(std::string_view) is a valid call
+void for_each_substr(std::string_view s, const char *delims, Func&& f)
+{
+  // Each pass drops leading delimiters, hands one token to f, then finds the next token.
+  for (auto start = s.find_first_not_of(delims); start != s.npos; ) {
+    s.remove_prefix(start);
+    const auto stop = s.find_first_of(delims);
+    f(s.substr(0, stop));
+    start = s.find_first_not_of(delims, stop);
+  }
+}
+
+} // namespace ceph
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+			 std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+                         const char *delims,
+			 std::list<std::string>& str_list);
+
+std::list<std::string> get_str_list(const std::string& str,
+                                    const char *delims = ";,= \t");
+
+/**
+ * Split **str** into a vector of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ * 
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+			 std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a vector of strings, using the **delims** delimiters and output the result in **str_vec**.
+ * 
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+                         const char *delims,
+			 std::vector<std::string>& str_vec);
+
+std::vector<std::string> get_str_vec(const std::string& str,
+                                     const char *delims = ";,= \t");
+/**
+ * Split **str** into a set of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as Set
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+extern void get_str_set(const std::string& str,
+			std::set<std::string>& str_list);
+
+/**
+ * Split **str** into a set of strings, using the **delims** delimiters and output the result in **str_list**.
+ * 
+ * @param [in] str String to split and save as Set
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+template<class Compare = std::less<std::string> >
+void get_str_set(const std::string& str,
+                 const char *delims,
+                 std::set<std::string, Compare>& str_list)
+{
+  str_list.clear();  // the result replaces, rather than extends, the previous contents
+  // ceph:: is required: ADL on std::string, const char* and a local lambda never searches namespace ceph.
+  ceph::for_each_substr(str, delims, [&str_list] (auto token) {
+      str_list.emplace(token.begin(), token.end()); });
+}
+
+std::set<std::string> get_str_set(const std::string& str,
+                                  const char *delims = ";,= \t");
+
+
+
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ * 
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ * 
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
+inline std::string str_join(const std::vector<std::string>& v, const std::string& sep)
+{
+  std::string joined;
+  for (std::size_t idx = 0; idx < v.size(); ++idx) {
+    // the separator goes between elements only, never before the first one
+    if (idx != 0) {
+      joined += sep;
+    }
+    joined += v[idx];
+  }
+  return joined;
+}
+
+#endif
diff --git a/src/include/str_map.h b/src/include/str_map.h
new file mode 100644
index 00000000..6a0370d1
--- /dev/null
+++ b/src/include/str_map.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#ifndef CEPH_STRMAP_H
+#define CEPH_STRMAP_H
+
+#define CONST_DELIMS ",;\t\n "
+
+#include <map>
+#include <string>
+#include <sstream>
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read
+ * from it. The format of **str** is either a well formed JSON object
+ * or a custom key[=value] plain text format.
+ *
+ * JSON is tried first. If successfully parsed into a JSON object, it
+ * is copied into **str_map** verbatim. If it is not a JSON object ( a
+ * string, integer etc. ), -EINVAL is returned and **ss** is set to
+ * a human readable error message.
+ *
+ * If **str** is not valid JSON and if **fallback_to_plain** is set to true
+ * (default: true) it is assumed to be a string containing white space
+ * separated key=value pairs. A white space is either space, tab or newline.
+ * Function **get_str_map** will be leveraged to parse the plain-text
+ * key/value pairs.
+ * 
+ * @param [in] str JSON or plain text key/value pairs
+ * @param [out] ss human readable message on error
+ * @param [out] str_map key/value pairs read from str
+ * @param [in] fallback_to_plain attempt parsing as plain-text if json fails
+ * @return **0** on success or -EINVAL on error.
+ */
+extern int get_json_str_map(
+    const std::string &str,
+    std::ostream &ss,
+    std::map<std::string,std::string> *str_map,
+    bool fallback_to_plain = true);
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read from
+ * it.  The format of **str** is a number of custom key[=value] pairs in
+ * plain text format.
+ *
+ * The string will be parsed taking **delims** as field delimiters for
+ * key/values.  The value is optional resulting in an empty string when
+ * not provided.  For example, using white space as delimiters:
+ *
+ *     insert your own=political/ideological    statement=here 
+ *
+ * will be parsed into:
+ *
+ *     { "insert": "", 
+ *       "your": "", 
+ *       "own": "political/ideological",
+ *       "statement": "here" }
+ *
+ * Alternative delimiters may be provided.  For instance, specifying
+ * "white space and slash", for the above statement, would be parsed
+ * into:
+ *
+ *     { "insert": "",
+ *       "your": "",
+ *       "own": "political",
+ *       "ideological": "",
+ *       "statement": "here" }
+ *
+ * See how adding '/' to the delimiters field will spawn a new key without
+ * a set value.
+ *
+ * Always returns 0, as there is no condition for failure.
+ *
+ * @param [in] str plain text key/value pairs
+ * @param [in] delims field delimiters to be used for parsing str
+ * @param [out] str_map key/value pairs parsed from str
+ * @return **0**
+ */
+extern int get_str_map(
+    const std::string &str,
+    std::map<std::string,std::string> *str_map,
+    const char *delims = CONST_DELIMS);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is not available in **str_map**, and if **def_val** is
+ * not-NULL then returns **def_val**. Otherwise checks if the value of
+ * **key** is an empty string and if so will return **key**.
+ * If the map contains **key**, the function returns the value of **key**.
+ *
+ * @param[in] str_map Map to obtain **key** from
+ * @param[in] key The key to search for in the map
+ * @param[in] def_val The value to return in case **key** is not present
+ */
+extern std::string get_str_map_value(
+    const std::map<std::string,std::string> &str_map,
+    const std::string &key,
+    const std::string *def_val = NULL);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is available in **str_map** returns the value of **key**.
+ *
+ * If **key** is not available in **str_map**, and if **fallback_key**
+ * is not-NULL and available in **str_map**, then returns the value
+ * of **fallback_key**.
+ *
+ * Otherwise returns an empty string.
+ *
+ * @param[in] str_map Map to obtain **key** or **fallback_key** from
+ * @param[in] key Key to obtain the value of from **str_map**
+ * @param[in] fallback_key Key to fallback to if **key** is not present
+ *                         in **str_map**
+ */
+extern std::string get_str_map_key(
+    const std::map<std::string,std::string> &str_map,
+    const std::string &key,
+    const std::string *fallback_key = NULL);
+
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+int get_conf_str_map_helper(
+    const std::string &str,
+    std::ostringstream &oss,
+    std::map<std::string,std::string> *m,
+    const std::string &def_key);
+
+#endif
diff --git a/src/include/stringify.h b/src/include/stringify.h
new file mode 100644
index 00000000..1b2a130c
--- /dev/null
+++ b/src/include/stringify.h
@@ -0,0 +1,33 @@
+#ifndef __CEPH_STRINGIFY_H
+#define __CEPH_STRINGIFY_H
+
+#include <string>
+#include <sstream>
+
+#include "include/types.h"
+
+template<typename T> // T: any type that can be inserted into a std::ostream
+inline std::string stringify(const T& a) { // returns the text produced by `ss << a`
+#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
+  static __thread std::ostringstream ss; // reused between calls: one stream per thread and per T
+  ss.str(""); ss.clear(); // drop the old text AND the old error bits, else one failed << sticks for good
+#else
+  std::ostringstream ss;
+#endif
+  ss << a;
+  return ss.str();
+}
+
+template <class T, class A>
+T joinify(const A &begin, const A &end, const T &t)
+{
+  // The separator t is added only once the accumulated text is non-empty.
+  T joined;
+  for (A cur = begin; cur != end; cur++) {
+    if (!joined.empty()) joined.append(t);
+    joined.append(*cur);
+  }
+  return joined;
+}
+
+#endif
diff --git a/src/include/timegm.h b/src/include/timegm.h
new file mode 100644
index 00000000..fb970432
--- /dev/null
+++ b/src/include/timegm.h
@@ -0,0 +1,79 @@
+//  (C) Copyright Howard Hinnant
+//  (C) Copyright 2010-2011 Vicente J. Botet Escriba
+//  Use, modification and distribution are subject to the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt).
+
+//===-------------------------- locale ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+static int32_t is_leap(int32_t year) {
+  // Gregorian rule: every 4th year, except centuries, unless divisible by 400.
+  const bool fourth = (year % 4 == 0);
+  const bool century = (year % 100 == 0);
+  const bool fourth_century = (year % 400 == 0);
+  if (fourth_century)
+    return 1;
+  return (fourth && !century) ? 1 : 0;
+}
+
+static int32_t days_from_0(int32_t year) {
+  const int32_t full = year - 1;  // only the whole years before `year` contribute
+  return 365 * full + (full / 400) - (full/100) + (full / 4);
+}
+
+static int32_t days_from_1970(int32_t year) {
+  static const int epoch_offset = days_from_0(1970);  // initialised once, on the first call
+  return days_from_0(year) - epoch_offset;
+}
+
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {
+  // Days preceding the first of each month; row 0 is a common year, row 1 a leap year.
+  static const int32_t cumulative[2][12] =
+  {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+  return cumulative[is_leap(year)][month-1] + day - 1;
+}
+
+static  time_t internal_timegm(tm const *t) {
+  // tm_mon may lie outside 0..11; fold the excess (or the deficit) into the year.
+  int year = t->tm_year + 1900;
+  int month = t->tm_mon;
+  if (month < 0) {
+    const int borrowed = (-month + 11)/12;  // whole years needed to lift month to >= 0
+    year -= borrowed;
+    month += 12 * borrowed;
+  } else if (month > 11) {
+    year += month/12;
+    month %= 12;
+  }
+  month++;  // 1-based from here on, which is what days_from_1jan() indexes with
+
+  // tm_mday is passed through unchanged.
+  const int day_of_year = days_from_1jan(year, month, t->tm_mday);
+  const int days_since_epoch = days_from_1970(year) + day_of_year;
+
+  // Scale the day count in time_t, then add the time of day.
+  const time_t seconds_in_day = 3600 * 24;
+  const time_t whole_days = seconds_in_day * days_since_epoch;
+
+  return whole_days + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
new file mode 100644
index 00000000..1ae15277
--- /dev/null
+++ b/src/include/types.h
@@ -0,0 +1,604 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_TYPES_H
+#define CEPH_TYPES_H
+
+// this is needed for ceph_fs to compile in userland
+#include "int_types.h"
+#include "byteorder.h"
+
+#include "uuid.h"
+
+#include <netinet/in.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "rbd_types.h"
+
+#ifdef __cplusplus
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H   // make gcc 4.3 shut up about hash_*
+#endif
+#endif
+
+extern "C" {
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <list>
+#include <set>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+
+#include "include/unordered_map.h"
+
+#include "object.h"
+#include "intarith.h"
+
+#include "acconfig.h"
+
+#include "assert.h"
+
+// DARWIN compatibility
+#ifdef __APPLE__
+typedef long long loff_t;
+typedef long long off64_t;
+#define O_DIRECT 00040000
+#endif
+
+// FreeBSD compatibility
+#ifdef __FreeBSD__
+typedef off_t loff_t;
+typedef off_t off64_t;
+#endif
+
+#if defined(__sun) || defined(_AIX)
+typedef off_t loff_t;
+#endif
+
+
+// -- io helpers --
+
+// Forward declare all the I/O helpers so strict ADL can find them in
+// the case of containers of containers. I'm tempted to abstract this
+// stuff using template templates like I did for denc.
+
+namespace std {
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v);
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v);
+template<class A, class Alloc>  // must match the definition below: std::deque has no comparator parameter
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v);
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m);
+}
+
+namespace boost {
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuple<Ts...> &t);
+
+namespace container {
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& iset);
+}
+}
+
+namespace std {
+template<class A, class B>  // both A and B must themselves be streamable
+inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) {
+  return out << v.first << "," << v.second;  // "first,second", with no surrounding brackets
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) {
+  // Rendered as "[e0,e1,...]"; an empty vector prints "[]".
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v) {
+  // Same rendering as for std::vector: "[e0,e1,...]".
+  out << "[";
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    if (it != v.begin())
+      out << ",";
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v) {
+  // Rendered as "<e0,e1,...>"; the first element is written without a leading comma.
+  auto cur = v.begin();
+  out << "<";
+  if (cur != v.end()) out << *cur++;
+  for (; cur != v.end(); ++cur) out << "," << *cur;
+  out << ">";
+  return out;
+}
+
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t) {
+  auto f = [n = sizeof...(Ts), i = 0U, &out](const auto& e) mutable {  // i counts the elements written so far
+    out << e;
+    if (++i != n)  // a comma follows every element except the n-th
+      out << ",";
+  };
+  ceph::for_each(t, f);  // NOTE(review): declared outside this view; presumably calls f on each element in order
+  return out;
+}
+
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) {
+  bool first = true;  // comma-separated, with no surrounding brackets
+  for (const auto& item : ilist) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) {
+  bool first = true;  // comma-separated in iteration order, with no surrounding brackets
+  for (const auto& item : iset) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) {
+  bool first = true;  // comma-separated in iteration order; duplicates are all printed
+  for (const auto& item : iset) {
+    if (!first) out << ",";
+    out << item;
+    first = false;
+  }
+  return out;
+}
+
+// Print a map as "{k0=v0,k1=v1,...}" in iteration order.
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m)
+{
+  const char* sep = "";
+  out << "{";
+  for (const auto& [key, val] : m) {
+    out << sep << key << "=" << val;
+    sep = ",";
+  }
+  out << "}";
+  return out;
+}
+
+// Print a multimap as "{{k0=v0,k1=v1,...}}" in iteration order.
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m)
+{
+  out << "{{";
+  bool first = true;
+  for (const auto& entry : m) {
+    if (!first) out << ",";
+    out << entry.first << "=" << entry.second;
+    first = false;
+  }
+  return out << "}}";
+}
+
+} // namespace std
+
+namespace boost {
+namespace tuples {
+// Print a three-element boost tuple as "a,b,c".
+template<typename A, typename B, typename C>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t) {
+  out << boost::get<0>(t) << "," << boost::get<1>(t) << ",";
+  return out << boost::get<2>(t);
+}
+}
+namespace container {
+template<class A, class Comp, class Alloc>  // prints "e0,e1,..." with no enclosing brackets
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset) {
+  bool first = true;
+  for (const auto& elem : iset) {
+    if (!first) out << ",";
+    out << elem;
+    first = false;
+  }
+  return out;
+}
+
+template<class A, class B, class Comp, class Alloc>  // prints "k0=v0,k1=v1,..." without braces
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m) {
+  bool first = true;
+  for (const auto& kv : m) {
+    if (!first) out << ",";
+    out << kv.first << "=" << kv.second;
+    first = false;
+  }
+  return out;
+}
+}
+} // namespace boost
+
+
+
+/*
+ * comparators for stl containers
+ */
+// for ceph::unordered_map:
+//   ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals;
+// Equality functor for C strings: compares the characters, not the pointers.
+struct eqstr {
+  bool operator()(const char* s1, const char* s2) const {
+    const int order = strcmp(s1, s2);
+    return !order;
+  }
+};
+
+// for set, map
+// Less-than functor for C strings, ordering by strcmp() on the characters.
+struct ltstr {
+  bool operator()(const char* s1, const char* s2) const {
+    const int order = strcmp(s1, s2);
+    return order < 0;
+  }
+};
+
+
+namespace ceph {
+  class Formatter;
+}
+
+#include "encoding.h"
+
+WRITE_RAW_ENCODER(ceph_fsid)
+WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_dir_layout)
+WRITE_RAW_ENCODER(ceph_mds_session_head)
+WRITE_RAW_ENCODER(ceph_mds_request_head_legacy)
+WRITE_RAW_ENCODER(ceph_mds_request_head)
+WRITE_RAW_ENCODER(ceph_mds_request_release)
+WRITE_RAW_ENCODER(ceph_filelock)
+WRITE_RAW_ENCODER(ceph_mds_caps_head)
+WRITE_RAW_ENCODER(ceph_mds_caps_body_legacy)
+WRITE_RAW_ENCODER(ceph_mds_cap_peer)
+WRITE_RAW_ENCODER(ceph_mds_cap_release)
+WRITE_RAW_ENCODER(ceph_mds_cap_item)
+WRITE_RAW_ENCODER(ceph_mds_lease)
+WRITE_RAW_ENCODER(ceph_mds_snap_head)
+WRITE_RAW_ENCODER(ceph_mds_snap_realm)
+WRITE_RAW_ENCODER(ceph_mds_reply_head)
+WRITE_RAW_ENCODER(ceph_mds_reply_cap)
+WRITE_RAW_ENCODER(ceph_mds_cap_reconnect)
+WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect)
+WRITE_RAW_ENCODER(ceph_frag_tree_split)
+WRITE_RAW_ENCODER(ceph_osd_reply_head)
+WRITE_RAW_ENCODER(ceph_osd_op)
+WRITE_RAW_ENCODER(ceph_msg_header)
+WRITE_RAW_ENCODER(ceph_msg_footer)
+WRITE_RAW_ENCODER(ceph_msg_footer_old)
+WRITE_RAW_ENCODER(ceph_mon_subscribe_item)
+
+WRITE_RAW_ENCODER(ceph_mon_statfs)
+WRITE_RAW_ENCODER(ceph_mon_statfs_reply)
+
+// ----------------------
+// some basic types
+
+// NOTE: these must match ceph_fs.h typedefs
+typedef uint64_t ceph_tid_t; // transaction id
+typedef uint64_t version_t;
+typedef __u32 epoch_t;       // map epoch  (32bits -> 13 epochs/second for 10 years)
+
+// --------------------------------------
+// identify individual mount clients by 64bit value
+
+struct client_t {
+  int64_t v;  // the raw 64-bit client id
+  // Converting constructor (deliberately implicit); a default-constructed id is -2.
+  // cppcheck-suppress noExplicitConstructor
+  client_t(int64_t _v = -2) : v(_v) {}
+  // encode/decode carry only v.
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(v, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(v, bl);
+  }
+};
+WRITE_CLASS_ENCODER(client_t)
+
+static inline bool operator==(const client_t& l, const client_t& r) { return l.v == r.v; }  // all comparisons use the raw id v
+static inline bool operator!=(const client_t& l, const client_t& r) { return l.v != r.v; }
+static inline bool operator<(const client_t& l, const client_t& r) { return l.v < r.v; }
+static inline bool operator<=(const client_t& l, const client_t& r) { return l.v <= r.v; }
+static inline bool operator>(const client_t& l, const client_t& r) { return l.v > r.v; }
+static inline bool operator>=(const client_t& l, const client_t& r) { return l.v >= r.v; }
+// Comparisons against a bare int64_t id.
+static inline bool operator>=(const client_t& l, int64_t o) { return l.v >= o; }
+static inline bool operator<(const client_t& l, int64_t o) { return l.v < o; }
+// Stream a client id as its plain number.
+inline ostream& operator<<(ostream& out, const client_t& c) {
+  return out << c.v;
+}
+
+
+
+// --
+
+namespace {
+  // Write a scaled quantity plus its unit suffix to out.
+  //   v     - the unscaled value
+  //   n     - v already divided (truncating) by mult
+  //   index - number of scaling steps applied; 0 means no prefix
+  //   mult  - scale factor matching index
+  //   u     - suffix text appended after the number
+  inline ostream& format_u(ostream& out, const uint64_t v, const uint64_t n,
+      const int index, const uint64_t mult, const char* u)
+  {
+    char buffer[32];
+
+    if (index == 0 || (v % mult) == 0) {
+      // Unscaled values and even multiples of the base are always displayed
+      // without any decimal fraction.  n is unsigned, hence PRIu64.
+      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
+    } else {
+      // Try 2, 1 and 0 decimal places in turn and keep the first rendering
+      // that is at most 7 characters long, unit included.  Numbers very close
+      // to an order of magnitude (10239 is really 9.999K) need the lower
+      // precision; after the 0-place attempt the result is kept regardless.
+      for (int i = 2; i >= 0; i--) {
+        if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
+          static_cast<double>(v) / mult, u) <= 7)
+          break;
+      }
+    }
+
+    return out << buffer;
+  }
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * decimal unit prefix (the classic SI units). No actual unit will be added.
+ */
+struct si_u_t {
+  uint64_t v;  // value to print with a decimal prefix
+  explicit si_u_t(uint64_t _v) : v(_v) {}
+};
+
+// Scale b.v by powers of 1000 and print it with the matching prefix.
+inline ostream& operator<<(ostream& out, const si_u_t& b)
+{
+  const char* const u[] = {"", "k", "M", "G", "T", "P", "E"};
+  const int last = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+  uint64_t n = b.v;
+  uint64_t mult = 1;
+  int index = 0;
+  // Stop at the last table entry so u[index] can never be read out of bounds.
+  for (; n >= 1000 && index < last; ++index) {
+    n /= 1000;
+    mult *= 1000;
+  }
+  return format_u(out, b.v, n, index, mult, u[index]);
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * binary unit prefix (IEC units). Since binary unit prefixes are to be used for
+ * "multiples of units in data processing, data transmission, and digital
+ * information" (so bits and bytes) and so far bits are not printed, the unit
+ * "B" for "byte" is added besides the multiplier.
+ */
+struct byte_u_t {
+  uint64_t v;  // byte count to print with a binary prefix
+  explicit byte_u_t(uint64_t _v) : v(_v) {}
+};
+
+// Scale b.v by powers of 1024 and print it with the matching " B"/" KiB"/... unit.
+inline ostream& operator<<(ostream& out, const byte_u_t& b)
+{
+  const char* const u[] = {" B", " KiB", " MiB", " GiB", " TiB", " PiB", " EiB"};
+  const int last = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;
+  uint64_t n = b.v;
+  int index = 0;
+  // Stop at the last table entry so u[index] can never be read out of bounds.
+  for (; n >= 1024 && index < last; ++index)
+    n /= 1024;
+
+  return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]);
+}
+
+inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
+{
+  return out << i.start  // start value, then "+" unless the ONETIME flag is set
+	     << ((i.flags & CEPH_SUBSCRIBE_ONETIME) ? "" : "+");
+}
+
+struct weightf_t {  // float wrapper selecting the weight formatting below
+  float v;
+  // cppcheck-suppress noExplicitConstructor
+  weightf_t(float _v) : v(_v) {}
+};
+
+// Print a weight: "-" below -0.01, "0" below 1e-6, else five fixed decimals.
+inline ostream& operator<<(ostream& out, const weightf_t& w) {
+  if (w.v < -0.01F) return out << "-";
+  if (w.v < 0.000001F) return out << "0";
+  // std::fixed is sticky, so put the caller's flags and precision back after use.
+  const std::ios_base::fmtflags saved = out.flags();
+  const std::streamsize p = out.precision();
+  out << std::fixed << std::setprecision(5) << w.v << std::setprecision(p);
+  out.flags(saved);
+  return out;
+}
+
+struct shard_id_t {
+  int8_t id;
+  // Default is shard 0; construction from a raw int8_t must be explicit.
+  shard_id_t() : id(0) {}
+  explicit shard_id_t(int8_t _id) : id(_id) {}
+  // Implicit conversion back to the raw value.
+  operator int8_t() const { return id; }
+  // Declared here, defined elsewhere; its value is not visible in this header.
+  const static shard_id_t NO_SHARD;
+  // encode/decode carry only id.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    encode(id, bl);
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    using ceph::decode;
+    decode(id, bl);
+  }
+};
+WRITE_CLASS_ENCODER(shard_id_t)
+WRITE_EQ_OPERATORS_1(shard_id_t, id)
+WRITE_CMP_OPERATORS_1(shard_id_t, id)
+ostream &operator<<(ostream &lhs, const shard_id_t &rhs);
+
+#if defined(__sun) || defined(_AIX) || defined(__APPLE__) || defined(__FreeBSD__)
+__s32  ceph_to_hostos_errno(__s32 e);
+__s32  hostos_to_ceph_errno(__s32 e);
+#else
+#define  ceph_to_hostos_errno(e) (e)
+#define  hostos_to_ceph_errno(e) (e)
+#endif
+
+struct errorcode32_t {
+  int32_t code;  // error code in the host OS numbering (see encode/decode)
+
+  errorcode32_t() : code(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  errorcode32_t(int32_t i) : code(i) {}
+  // Comparisons against a plain int do not modify the object, so they are const.
+  operator int() const  { return code; }
+  int* operator&()      { return &code; }
+  int operator==(int i) const { return code == i; }
+  int operator>(int i) const  { return code > i; }
+  int operator>=(int i) const { return code >= i; }
+  int operator<(int i) const  { return code < i; }
+  int operator<=(int i) const { return code <= i; }
+  // The encoded form uses Ceph's numbering; translate on the way out and in.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    __s32 newcode = hostos_to_ceph_errno(code);
+    encode(newcode, bl);
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    using ceph::decode;
+    decode(code, bl);
+    code = ceph_to_hostos_errno(code);
+  }
+};
+WRITE_CLASS_ENCODER(errorcode32_t)
+WRITE_EQ_OPERATORS_1(errorcode32_t, code)
+WRITE_CMP_OPERATORS_1(errorcode32_t, code)
+
+template <uint8_t S>
+struct sha_digest_t {
+  constexpr static uint32_t SIZE = S;
+  // TODO: we might consider std::array in the future. Avoiding it for now
+  // as sha_digest_t is a part of our public API.
+  unsigned char v[S] = {0};
+  // Hex-encode the digest: two lowercase characters per byte, 2*S in total.
+  string to_str() const {
+    static const char digits[] = "0123456789abcdef";
+    string hex(S * 2, '0');
+    for (size_t i = 0; i < S; i++) {
+      hex[i * 2] = digits[v[i] >> 4];
+      hex[i * 2 + 1] = digits[v[i] & 0x0f];
+    }
+    return hex;
+  }
+  sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); }
+  sha_digest_t() {}
+  // Byte-wise comparison over all SIZE bytes.
+  bool operator==(const sha_digest_t& r) const {
+    return ::memcmp(v, r.v, SIZE) == 0;
+  }
+  bool operator!=(const sha_digest_t& r) const {
+    return !(*this == r);
+  }
+  // Go through a std::array copy to avoid reinterpret_cast, is_pod and other nasty things.
+  void encode(bufferlist &bl) const {
+    using ceph::encode;
+    std::array<unsigned char, SIZE> tmparr;
+    memcpy(tmparr.data(), v, SIZE);
+    encode(tmparr, bl);
+  }
+  void decode(bufferlist::const_iterator &bl) {
+    using ceph::decode;
+    std::array<unsigned char, SIZE> tmparr;
+    decode(tmparr, bl);
+    memcpy(v, tmparr.data(), SIZE);
+  }
+};
+
+// Stream a digest as the hexadecimal text produced by to_str().
+template <uint8_t S>
+inline ostream &operator<<(ostream &out, const sha_digest_t<S> &b) {
+  return out << b.to_str();
+}
+
+using sha1_digest_t = sha_digest_t<20>;
+WRITE_CLASS_ENCODER(sha1_digest_t)
+
+using sha256_digest_t = sha_digest_t<32>;
+WRITE_CLASS_ENCODER(sha256_digest_t)
+
+
+#endif
diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h
new file mode 100644
index 00000000..aee5f5a7
--- /dev/null
+++ b/src/include/unordered_map.h
@@ -0,0 +1,11 @@
+#ifndef CEPH_UNORDERED_MAP_H
+#define CEPH_UNORDERED_MAP_H
+
+#include <unordered_map>
+
+namespace ceph {  // expose the standard unordered map types under ceph::
+  using std::unordered_map;
+  using std::unordered_multimap;
+}
+
+#endif
diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h
new file mode 100644
index 00000000..e30e1799
--- /dev/null
+++ b/src/include/unordered_set.h
@@ -0,0 +1,10 @@
+#ifndef CEPH_UNORDERED_SET_H
+#define CEPH_UNORDERED_SET_H
+
+#include <unordered_set>
+
+namespace ceph {  // expose std::unordered_set under ceph::
+  using std::unordered_set;
+}
+
+#endif
diff --git a/src/include/util.h b/src/include/util.h
new file mode 100644
index 00000000..18aa51ad
--- /dev/null
+++ b/src/include/util.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ */
+#ifndef CEPH_UTIL_H
+#define CEPH_UTIL_H
+
+#include "common/Formatter.h"
+#include "include/types.h"
+
+std::string bytes2str(uint64_t count);
+
+struct ceph_data_stats
+{
+  uint64_t byte_total;
+  uint64_t byte_used;
+  uint64_t byte_avail;
+  int avail_percent;
+  // All counters start at zero.
+  ceph_data_stats() :
+    byte_total(0),
+    byte_used(0),
+    byte_avail(0),
+    avail_percent(0)
+  { }
+  // Emit the four fields to f as "total", "used", "avail" and "avail_percent"; f must not be null.
+  void dump(Formatter *f) const {
+    ceph_assert(f != NULL);
+    f->dump_int("total", byte_total);
+    f->dump_int("used", byte_used);
+    f->dump_int("avail", byte_avail);
+    f->dump_int("avail_percent", avail_percent);
+  }
+  // Encoding version 1: the four fields in declaration order.
+  void encode(bufferlist &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(byte_total, bl);
+    encode(byte_used, bl);
+    encode(byte_avail, bl);
+    encode(avail_percent, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Reads the fields back in the same order encode() wrote them.
+  void decode(bufferlist::const_iterator &p) {
+    DECODE_START(1, p);
+    decode(byte_total, p);
+    decode(byte_used, p);
+    decode(byte_avail, p);
+    decode(avail_percent, p);
+    DECODE_FINISH(p);
+  }
+  // Appends two heap-allocated samples to ls: an all-zero one and a half-full 1 MiB one.
+  static void generate_test_instances(list<ceph_data_stats*>& ls) {
+    ls.push_back(new ceph_data_stats);
+    ls.push_back(new ceph_data_stats);
+    ls.back()->byte_total = 1024*1024;
+    ls.back()->byte_used = 512*1024;
+    ls.back()->byte_avail = 512*1024;
+    ls.back()->avail_percent = 50;
+  }
+};
+typedef struct ceph_data_stats ceph_data_stats_t;
+WRITE_CLASS_ENCODER(ceph_data_stats)
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+
+/// get memory limit for the current cgroup
+int get_cgroup_memory_limit(uint64_t *limit);
+
+/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo
+void collect_sys_info(map<string, string> *m, CephContext *cct);
+
+/// dump service ids grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service id hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type);
+/// dump service names grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service name hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(Formatter* f, const map<string, list<string> >& services, const char* type);
+
+string cleanbin(bufferlist &bl, bool &b64, bool show = false);
+string cleanbin(string &str);
+
+namespace ceph::util {
+
+/// Return true when @p s compares equal to at least one of @p xs (false if
+/// no candidates are given); candidates are tested left to right.
+template <typename ...XS>
+bool match_str(const std::string& s, const XS& ...xs)
+{
+  return (... || (s == xs));
+}
+} // namespace ceph::util
+#endif /* CEPH_UTIL_H */
diff --git a/src/include/utime.h b/src/include/utime.h
new file mode 100644
index 00000000..42f9b087
--- /dev/null
+++ b/src/include/utime.h
@@ -0,0 +1,579 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_UTIME_H
+#define CEPH_UTIME_H
+
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/timegm.h"
+#include "common/strtol.h"
+#include "common/ceph_time.h"
+#include "common/safe_io.h"
+#include "common/SubProcess.h"
+#include "include/denc.h"
+
+
+// --------
+// utime_t
+
+inline __u32 cap_to_u32_max(__u64 t) {  // clamp t to the largest value a uint32_t can hold
+  return (t < std::numeric_limits<uint32_t>::max()) ? t : std::numeric_limits<uint32_t>::max();
+}
+/* WARNING: If you add a member to utime_t, make sure the encode/decode functions
+ * still work. On little-endian machines there must be no padding in the struct,
+ * on both 32-bit and 64-bit builds.
+ * You should also update the padding_check function.
+ */
+class utime_t {
+public:
+  struct {
+    __u32 tv_sec, tv_nsec;
+  } tv;
+
+ public:
+  bool is_zero() const {  // true when both the seconds and nanoseconds fields are 0
+    return !tv.tv_sec && !tv.tv_nsec;
+  }
+
+  // Carry whole seconds from tv_nsec into tv_sec (saturating); 1e9 ns counts as a full second.
+  void normalize() {
+    if (tv.tv_nsec < 1000000000ul) return;
+    tv.tv_sec = cap_to_u32_max((__u64)tv.tv_sec + tv.tv_nsec / 1000000000ul);
+    tv.tv_nsec %= 1000000000ul;
+  }
+
+  // cons
+  utime_t() { tv.tv_sec = 0; tv.tv_nsec = 0; }  // zero time
+  utime_t(time_t s, int n) { tv.tv_sec = s; tv.tv_nsec = n; normalize(); }  // values are stored as __u32, then normalized
+  utime_t(const struct ceph_timespec &v) {
+    decode_timeval(&v);
+  }
+  utime_t(const struct timespec v)
+  {
+    // NOTE: this is used by ceph_clock_now() so should be kept
+    // as thin as possible.
+    tv.tv_sec = v.tv_sec;
+    tv.tv_nsec = v.tv_nsec;
+  }
+  // conversion from ceph::real_time/coarse_real_time
+  template <typename Clock, typename std::enable_if_t<
+            ceph::converts_to_timespec_v<Clock>>* = nullptr>
+  explicit utime_t(const std::chrono::time_point<Clock>& t)
+    : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor
+  // From a struct timeval (by reference or by pointer); microseconds become nanoseconds.
+  utime_t(const struct timeval &v) {
+    set_from_timeval(&v);
+  }
+  utime_t(const struct timeval *v) {
+    set_from_timeval(v);
+  }
+  void to_timespec(struct timespec *ts) const {  // copy both fields into *ts
+    ts->tv_sec = tv.tv_sec;
+    ts->tv_nsec = tv.tv_nsec;
+  }
+  void set_from_double(double d) {  // d is in seconds; NOTE(review): negative d looks unsupported, confirm
+    tv.tv_sec = (__u32)trunc(d);
+    tv.tv_nsec = (__u32)((d - (double)tv.tv_sec) * 1000000000.0);  // fractional part, in nanoseconds
+  }
+  // Convert via a ceph_timespec to the ceph::real_clock representation.
+  real_time to_real_time() const {
+    ceph_timespec ts;
+    encode_timeval(&ts);
+    return ceph::real_clock::from_ceph_timespec(ts);
+  }
+
+  // accessors
+  time_t        sec()  const { return tv.tv_sec; }  // whole seconds
+  long          usec() const { return tv.tv_nsec/1000; }  // sub-second part in microseconds (truncated)
+  int           nsec() const { return tv.tv_nsec; }  // sub-second part in nanoseconds
+
+  // ref accessors/modifiers
+  __u32&         sec_ref()  { return tv.tv_sec; }
+  __u32&         nsec_ref() { return tv.tv_nsec; }
+  // Whole value as a single 64-bit count of nanoseconds / milliseconds.
+  uint64_t to_nsec() const {
+    return (uint64_t)tv.tv_nsec + (uint64_t)tv.tv_sec * 1000000000ull;
+  }
+  uint64_t to_msec() const {
+    return (uint64_t)tv.tv_nsec / 1000000ull + (uint64_t)tv.tv_sec * 1000ull;
+  }
+
+  void copy_to_timeval(struct timeval *v) const {
+    v->tv_sec = tv.tv_sec;
+    v->tv_usec = tv.tv_nsec/1000;  // nanoseconds -> microseconds (truncating)
+  }
+  void set_from_timeval(const struct timeval *v) {
+    tv.tv_sec = v->tv_sec;
+    tv.tv_nsec = v->tv_usec*1000;  // microseconds -> nanoseconds
+  }
+  void padding_check() {  // compile-time check only; the body does nothing at run time
+    static_assert(
+      sizeof(utime_t) ==
+        sizeof(tv.tv_sec) +
+        sizeof(tv.tv_nsec)
+      ,
+      "utime_t have padding");
+  }
+  void encode(bufferlist &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+    bl.append((char *)(this), sizeof(__u32) + sizeof(__u32));  // raw copy of the two __u32 fields
+#else
+    using ceph::encode;
+    encode(tv.tv_sec, bl);
+    encode(tv.tv_nsec, bl);
+#endif
+  }
+  void decode(bufferlist::const_iterator &p) {
+#if defined(CEPH_LITTLE_ENDIAN)
+    p.copy(sizeof(__u32) + sizeof(__u32), (char *)(this));  // raw copy straight into this object
+#else
+    using ceph::decode;
+    decode(tv.tv_sec, p);
+    decode(tv.tv_nsec, p);
+#endif
+  }
+  // denc form: seconds first, then nanoseconds.
+  DENC(utime_t, v, p) {
+    denc(v.tv.tv_sec, p);
+    denc(v.tv.tv_nsec, p);
+  }
+
+
+  void encode_timeval(struct ceph_timespec *t) const {  // despite the name, fills a ceph_timespec
+    t->tv_sec = tv.tv_sec;
+    t->tv_nsec = tv.tv_nsec;
+  }
+  void decode_timeval(const struct ceph_timespec *t) {  // reads a ceph_timespec into this object
+    tv.tv_sec = t->tv_sec;
+    tv.tv_nsec = t->tv_nsec;
+  }
+
+  // Truncate (not round) to the start of the current local-time minute.
+  utime_t round_to_minute() {
+    const time_t now = sec();
+    struct tm local;
+    localtime_r(&now, &local);
+    local.tm_sec = 0;
+    return utime_t(mktime(&local), 0);
+  }
+
+  // Truncate (not round) to the start of the current local-time hour.
+  utime_t round_to_hour() {
+    const time_t now = sec();
+    struct tm local;
+    localtime_r(&now, &local);
+    local.tm_min = 0;
+    local.tm_sec = 0;
+    return utime_t(mktime(&local), 0);
+  }
+
+  // Truncate (not round) to local midnight at the start of the current day.
+  utime_t round_to_day() {
+    const time_t now = sec();
+    struct tm local;
+    localtime_r(&now, &local);
+    local.tm_hour = 0;
+    local.tm_min = 0;
+    local.tm_sec = 0;
+    return utime_t(mktime(&local), 0);
+  }
+
+  // cast to double
+  operator double() const {  // seconds, with the nanoseconds as the fractional part
+    return (double)sec() + ((double)nsec() / 1000000000.0L);
+  }
+  operator ceph_timespec() const {  // field-by-field copy
+    ceph_timespec ts;
+    ts.tv_sec = sec();
+    ts.tv_nsec = nsec();
+    return ts;
+  }
+  // Treat this value as a duration and nanosleep() for it; the call's result is ignored.
+  void sleep() const {
+    struct timespec ts;
+    to_timespec(&ts);
+    nanosleep(&ts, NULL);
+  }
+
+  // Write this time to out in UTC with microseconds; small values are printed as raw seconds.
+  ostream& gmtime(ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');  // zero-pad the setw() fields below
+    if (sec() < ((time_t)(60*60*24*365*10))) {  // less than ten 365-day years
+      // raw seconds.  this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      //  aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // tm_year is years since 1900; prints 4 digits
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday
+	  << ' '
+	  << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(6) << usec();
+      out << "Z";
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);  // clears the flag set above (any earlier alignment is not saved)
+    return out;
+  }
+
+  // Like gmtime(), but absolute times carry nine fractional digits (nanoseconds).
+  ostream& gmtime_nsec(ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');  // zero-pad the setw() fields below
+    if (sec() < ((time_t)(60*60*24*365*10))) {  // less than ten 365-day years
+      // raw seconds.  this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();  // NOTE: this branch still prints microseconds
+    } else {
+      // this looks like an absolute time.
+      //  aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // tm_year is years since 1900; prints 4 digits
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday
+	  << ' '
+	  << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(9) << nsec();
+      out << "Z";
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);  // clears the flag set above (any earlier alignment is not saved)
+    return out;
+  }
+
+  // Write this time to out in asctime_r() layout (UTC), or as raw seconds for small values.
+  ostream& asctime(ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');  // zero-pad the setw() field below
+    if (sec() < ((time_t)(60*60*24*365*10))) {  // less than ten 365-day years
+      // raw seconds.  this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      //  printed in the asctime_r() text layout (not ISO 8601), without sub-seconds
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+
+      char buf[128];
+      asctime_r(&bdt, buf);  // NOTE(review): return value unchecked; buf is assumed to be filled
+      int len = strlen(buf);
+      if (buf[len - 1] == '\n')  // drop the trailing newline
+        buf[len - 1] = '\0';
+      out << buf;
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);  // clears the flag set above (any earlier alignment is not saved)
+    return out;
+  }
+  // Write this time to out in the local time zone with microseconds; small values print as raw seconds.
+  ostream& localtime(ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');  // zero-pad the setw() fields below
+    if (sec() < ((time_t)(60*60*24*365*10))) {  // less than ten 365-day years
+      // raw seconds.  this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      //  aim for http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      localtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // tm_year is years since 1900; prints 4 digits
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday
+	  << ' '
+	  << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(6) << usec();
+      //out << '_' << bdt.tm_zone;
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);  // clears the flag set above (any earlier alignment is not saved)
+    return out;
+  }
+
+  // Format as local "YYYY-MM-DD HH:MM:SS.uuuuuu" into out (at most outlen
+  // bytes); returns whatever ::snprintf returns.
+  int sprintf(char *out, int outlen) const {
+    const time_t secs = sec();
+    struct tm local;
+    localtime_r(&secs, &local);
+    return ::snprintf(out, outlen, "%04d-%02d-%02d %02d:%02d:%02d.%06ld",
+		    local.tm_year + 1900, local.tm_mon + 1, local.tm_mday,
+		    local.tm_hour, local.tm_min, local.tm_sec, usec());
+  }
+
+  // Format tt as local "YYYY-MM-DD HH:MM:SS" into out (at most outlen bytes);
+  // returns whatever ::snprintf returns.
+  static int snprintf(char *out, int outlen, time_t tt) {
+    struct tm local;
+    localtime_r(&tt, &local);
+    return ::snprintf(out, outlen, "%04d-%02d-%02d %02d:%02d:%02d",
+        local.tm_year + 1900, local.tm_mon + 1, local.tm_mday,
+        local.tm_hour, local.tm_min, local.tm_sec);
+  }
+
+  // Run "/bin/date -d <date_str> '+%s %N'" and store the time it prints in *result.
+  // Returns 0 on success, spawn()'s negative result, or -EINVAL otherwise.
+  static int invoke_date(const std::string& date_str, utime_t *result) {
+     char buf[256];
+     SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE,
+			 SubProcess::KEEP);
+     bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL);
+
+     int r = bin_date.spawn();
+     if (r < 0) return r;
+
+     // Leave room for a terminator: the bytes read are not NUL-terminated.
+     ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf) - 1);
+
+     r = bin_date.join();
+     if (r || n <= 0) return -EINVAL;
+     buf[n] = '\0';
+
+     uint64_t epoch, nsec;
+     std::istringstream iss(buf);
+     if (!(iss >> epoch >> nsec)) return -EINVAL;  // reject unparsable output
+
+     *result = utime_t(epoch, nsec);
+     return 0;
+  }
+
+
+  /**
+   * Parse "YYYY-MM-DD[( |T)HH:MM:SS[.fraction][(+|-)zone]]" or "<sec>.<usec>".
+   *
+   * @param date     text to parse
+   * @param epoch    if non-null, receives the seconds since the epoch, with any parsed zone offset subtracted
+   * @param nsec     if non-null, receives the sub-second part in nanoseconds
+   * @param out_date if non-null, receives the date formatted with "%F"
+   * @param out_time if non-null, receives the time formatted with "%T"
+   * @return 0 on success, -EINVAL if the text cannot be parsed
+   */
+  static int parse_date(const string& date, uint64_t *epoch, uint64_t *nsec,
+                        string *out_date=NULL, string *out_time=NULL) {
+    struct tm tm;
+    memset(&tm, 0, sizeof(tm));
+
+    if (nsec)
+      *nsec = 0;
+
+    const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm);
+    if (p) {
+      if (*p == ' ' || *p == 'T') {
+	p++;
+	// strptime doesn't understand fractional/decimal seconds, and
+	// it also only takes format chars or literals, so we have to
+	// get creative: copy the input and overwrite the digit fields with
+	// conversions.  fmt[5] keeps whatever separator the input used.
+	// Two spare bytes let the "%z" rewrite below stay inside fmt even
+	// when the sign is the last character copied.
+	char fmt[34] = {0};
+	strncpy(fmt, p, sizeof(fmt) - 3);
+	memcpy(fmt, "%H:%M", 5);
+	memcpy(fmt + 6, "%S", 2);
+	const char *subsec = 0;
+	char *q = fmt + 8;
+	if (*q == '.') {
+	  subsec = p + 9;
+	  q = fmt + 9;
+	  while (isdigit(static_cast<unsigned char>(*q)))
+	    ++q;
+	}
+	// look for tz...
+	if (*q == '-' || *q == '+') {
+	  *q = '%';
+	  *(q+1) = 'z';
+	  *(q+2) = 0;
+	}
+	p = strptime(p, fmt, &tm);
+	if (!p)
+	  return -EINVAL;
+        if (nsec && subsec) {
+          // right-pad the fraction with zeros to exactly nine digits
+          unsigned i;
+          char buf[10]; /* 9 digit + null termination */
+          for (i = 0; (i < sizeof(buf) - 1) &&
+                 isdigit(static_cast<unsigned char>(*subsec)); ++i, ++subsec)
+            buf[i] = *subsec;
+          for (; i < sizeof(buf) - 1; ++i)
+            buf[i] = '0';
+          buf[i] = '\0';
+          string err;
+          *nsec = (uint64_t)strict_strtol(buf, 10, &err);
+          if (!err.empty())
+            return -EINVAL;
+        }
+      }
+    } else {
+      // not a calendar date: expect "<seconds>.<microseconds>"
+      int sec, usec;
+      int r = sscanf(date.c_str(), "%d.%d", &sec, &usec);
+      if (r != 2)
+        return -EINVAL;
+
+      time_t tt = sec;
+      gmtime_r(&tt, &tm);
+      if (nsec)
+        *nsec = (uint64_t)usec * 1000;
+    }
+
+    // apply the tm_gmtoff manually below, since none of mktime,
+    // gmtime, and localtime seem to do it.  zero it out here just in
+    // case some other libc *does* apply it.  :(
+    auto gmtoff = tm.tm_gmtoff;
+    tm.tm_gmtoff = 0;
+
+    time_t t = internal_timegm(&tm);
+    if (epoch)  // epoch is optional; never dereference it unchecked
+      *epoch = (uint64_t)t - gmtoff;
+
+    if (out_date) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%F", &tm);
+      *out_date = buf;
+    }
+    if (out_time) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%T", &tm);
+      *out_time = buf;
+    }
+
+    return 0;
+  }
+
+  // Set *this from the text in s via parse_date(); returns false, leaving
+  // *this untouched, when parse_date() reports an error.
+  bool parse(const string& s) {
+    uint64_t secs, nanos;
+    const bool ok = parse_date(s, &secs, &nanos) >= 0;
+    if (ok)
+      *this = utime_t(secs, nanos);
+    return ok;
+  }
+};
+WRITE_CLASS_ENCODER(utime_t)
+WRITE_CLASS_DENC(utime_t)
+
+// arithmetic operators
+inline utime_t operator+(const utime_t& l, const utime_t& r) {  // seconds saturate at the uint32_t maximum
+  __u64 sec = (__u64)l.sec() + r.sec();
+  return utime_t(cap_to_u32_max(sec), l.nsec() + r.nsec());  // the constructor normalizes the nanoseconds
+}
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  l.sec_ref() = cap_to_u32_max((__u64)l.sec() + r.sec());
+  l.nsec_ref() += r.nsec();
+  l.normalize();  // carry any full second out of the nanosecond field
+  return l;
+}
+inline utime_t& operator+=(utime_t& l, double f) {  // f is in seconds; NOTE(review): negative f looks unsupported, confirm
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;  // fractional part of f, in nanoseconds
+  l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)fs);
+  l.nsec_ref() += (long)ns;
+  l.normalize();
+  return l;
+}
+
+inline utime_t operator-(const utime_t& l, const utime_t& r) {  // borrows one second when l.nsec() < r.nsec()
+  return utime_t( l.sec() - r.sec() - (l.nsec()<r.nsec() ? 1:0),
+                  l.nsec() - r.nsec() + (l.nsec()<r.nsec() ? 1000000000:0) );
+}
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {  // NOTE(review): no underflow guard when r > l
+  l.sec_ref() -= r.sec();
+  if (l.nsec() >= r.nsec())
+    l.nsec_ref() -= r.nsec();
+  else {
+    l.nsec_ref() += 1000000000L - r.nsec();  // borrow one second
+    l.sec_ref()--;
+  }
+  return l;
+}
+inline utime_t& operator-=(utime_t& l, double f) {  // f is in seconds
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;  // fractional part of f, in nanoseconds
+  l.sec_ref() -= (long)fs;
+  long nsl = (long)ns;
+  if (nsl) {
+    l.sec_ref()--;  // borrow a second up front ...
+    l.nsec_ref() = 1000000000L + l.nsec_ref() - nsl;
+  }
+  l.normalize();  // ... and let normalize() hand it back when it was not needed
+  return l;
+}
+
+
+// comparators
+inline bool operator>(const utime_t& a, const utime_t& b)  // seconds first, nanoseconds break ties
+{
+  return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.nsec() > b.nsec());
+}
+inline bool operator<=(const utime_t& a, const utime_t& b)  // negation of operator>
+{
+  return !(operator>(a, b));
+}
+inline bool operator<(const utime_t& a, const utime_t& b)  // seconds first, nanoseconds break ties
+{
+  return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.nsec() < b.nsec());
+}
+inline bool operator>=(const utime_t& a, const utime_t& b)  // negation of operator<
+{
+  return !(operator<(a, b));
+}
+
+inline bool operator==(const utime_t& a, const utime_t& b)  // both fields must match exactly
+{
+  return a.sec() == b.sec() && a.nsec() == b.nsec();
+}
+inline bool operator!=(const utime_t& a, const utime_t& b)
+{
+  return a.sec() != b.sec() || a.nsec() != b.nsec();
+}
+
+
+// output
+
+// ostream
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)  // default text form is utime_t::localtime()
+{
+  return t.localtime(out);
+}
+
+inline std::string utimespan_str(const utime_t& age) {  // treats age as a duration and formats it with timespan_str()
+  auto age_ts = ceph::timespan(age.nsec()) + std::chrono::seconds(age.sec());
+  return timespan_str(age_ts);
+}
+
+#endif
diff --git a/src/include/uuid.h b/src/include/uuid.h
new file mode 100644
index 00000000..f957f87a
--- /dev/null
+++ b/src/include/uuid.h
@@ -0,0 +1,83 @@
+#ifndef _CEPH_UUID_H
+#define _CEPH_UUID_H
+
+/*
+ * Thin C++ wrapper around boost::uuids::uuid (Boost.Uuid).
+ */
+
+#include "encoding.h"
+
+#include <ostream>
+#include <random>
+
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+// Value type wrapping a boost::uuids::uuid with parse/print/encode helpers.
+struct uuid_d {
+  boost::uuids::uuid uuid;
+
+  // Default-constructs to the nil UUID.
+  uuid_d() {
+    boost::uuids::nil_generator gen;
+    uuid = gen();
+  }
+
+  // True when the stored value is the nil UUID.
+  bool is_zero() const { return uuid.is_nil(); }
+
+  // Overwrite the stored value with a random UUID, using std::random_device
+  // as the generator's random source.
+  void generate_random() {
+    std::random_device rng;
+    boost::uuids::basic_random_generator gen(rng);
+    uuid = gen();
+  }
+
+  // Parse the textual UUID in s.  Returns true on success; if the generator
+  // throws std::runtime_error, returns false and leaves uuid unchanged.
+  bool parse(const char *s) {
+    try {
+      boost::uuids::string_generator gen;
+      uuid = gen(s);
+      return true;
+    } catch (const std::runtime_error&) {  // by const ref, unnamed: the object is unused
+      return false;
+    }
+  }
+
+  // Copy 37 bytes of the text form into s; s must have room for all 37.
+  void print(char *s) const {
+    memcpy(s, boost::uuids::to_string(uuid).c_str(), 37);
+  }
+  // Text form as returned by boost::uuids::to_string().
+  std::string to_string() const { return boost::uuids::to_string(uuid); }
+  // Pointer to the raw bytes; note that this casts away the object's constness.
+  char *bytes() const { return (char*)uuid.data; }
+
+  // Encode uuid into bl via encode_raw().
+  void encode(bufferlist& bl) const { ::encode_raw(uuid, bl); }
+  // NOTE(review): const-qualified although decoding presumably overwrites uuid -- confirm against decode_raw().
+  void decode(bufferlist::const_iterator& p) const { ::decode_raw(uuid, p); }
+};
+WRITE_CLASS_ENCODER(uuid_d)
+
+inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
+  char text[37];  // sized for the 37 bytes that uuid_d::print() copies
+  u.print(text);
+  return out << text;
+}
+
+inline bool operator==(const uuid_d& l, const uuid_d& r) {
+  return l.uuid == r.uuid;  // equality is that of the wrapped boost::uuids::uuid values
+}
+inline bool operator!=(const uuid_d& l, const uuid_d& r) {
+  return l.uuid != r.uuid;  // inequality is that of the wrapped boost::uuids::uuid values
+}
+inline bool operator<(const uuid_d& l, const uuid_d& r) {
+  return l.uuid < r.uuid;  // bytewise order; matches the old text-form ordering without building two strings
+}
+
+
+#endif
diff --git a/src/include/xlist.h b/src/include/xlist.h
new file mode 100644
index 00000000..733a318a
--- /dev/null
+++ b/src/include/xlist.h
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_XLIST_H
+#define CEPH_XLIST_H
+
+#include <iterator>
+#include <cstdlib>
+#include <ostream>
+
+#include "include/ceph_assert.h"
+
+// Doubly-linked list of caller-owned xlist<T>::item nodes; the list only
+// links and unlinks items, it never allocates or frees them.
+template<typename T>
+class xlist {
+public:
+  // One list node holding a T; an item is on at most one xlist at a time.
+  class item {
+  public:
+    item(T i) : _item(i) {}
+    // An item must be unlinked before it is destroyed.
+    ~item() { ceph_assert(!is_on_list()); }
+
+    item(const item& other) = delete;
+    item(item&& other) = delete;
+    const item& operator= (const item& right) = delete;
+    item& operator= (item&& right) = delete;
+
+    xlist* get_list() { return _list; }
+    bool is_on_list() const { return _list ? true:false; }
+    // Unlink from the current list; returns false when not on any list.
+    bool remove_myself() {
+      if (!_list)
+	return false;
+      _list->remove(this);
+      ceph_assert(_list == 0);
+      return true;
+    }
+    void move_to_front() {  // re-link at the head of the list it is already on
+      ceph_assert(_list);
+      _list->push_front(this);
+    }
+    void move_to_back() {  // re-link at the tail of the list it is already on
+      ceph_assert(_list);
+      _list->push_back(this);
+    }
+
+  private:
+    friend xlist;
+    T _item;
+    item *_prev = nullptr, *_next = nullptr;
+    xlist *_list = nullptr;  // list this item is linked on, or null while unlinked
+  };
+
+  typedef item* value_type;
+  typedef item* const_reference;
+
+private:
+  item *_front, *_back;
+  size_t _size;
+
+public:
+  // NOTE(review): shallow copy -- the new list aliases other's nodes, whose
+  // _list back-pointers are left pointing at other.
+  xlist(const xlist& other) {
+    _front = other._front;
+    _back = other._back;
+    _size = other._size;
+  }
+
+  xlist() : _front(0), _back(0), _size(0) {}
+  ~xlist() {  // the list must be empty by the time it is destroyed
+    ceph_assert(_size == 0);
+    ceph_assert(_front == 0);
+    ceph_assert(_back == 0);
+  }
+
+  size_t size() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return _size;
+  }
+  bool empty() const {
+    ceph_assert((bool)_front == (bool)_size);
+    return _front == 0;
+  }
+
+  void clear() {  // unlinks every item; the items themselves are not destroyed
+    while (_front)
+      remove(_front);
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // Link i at the head, first unlinking it from whatever list it is on.
+  void push_front(item *i) {
+    if (i->_list)
+      i->_list->remove(i);
+
+    i->_list = this;
+    i->_next = _front;
+    i->_prev = 0;
+    if (_front)
+      _front->_prev = i;
+    else
+      _back = i;
+    _front = i;
+    _size++;
+  }
+  // Link i at the tail, first unlinking it from whatever list it is on.
+  void push_back(item *i) {
+    if (i->_list)
+      i->_list->remove(i);
+
+    i->_list = this;
+    i->_next = 0;
+    i->_prev = _back;
+    if (_back)
+      _back->_next = i;
+    else
+      _front = i;
+    _back = i;
+    _size++;
+  }
+  // Unlink i, which must currently be on this list.
+  void remove(item *i) {
+    ceph_assert(i->_list == this);
+
+    if (i->_prev)
+      i->_prev->_next = i->_next;
+    else
+      _front = i->_next;
+    if (i->_next)
+      i->_next->_prev = i->_prev;
+    else
+      _back = i->_prev;
+    _size--;
+
+    i->_list = 0;
+    i->_next = i->_prev = 0;
+    ceph_assert((bool)_front == (bool)_size);
+  }
+
+  // front()/back() dereference the end pointers unchecked: the list must not be empty.
+  T front() { return static_cast<T>(_front->_item); }
+  const T front() const { return static_cast<const T>(_front->_item); }
+  T back() { return static_cast<T>(_back->_item); }
+  const T back() const { return static_cast<const T>(_back->_item); }
+
+  void pop_front() {
+    ceph_assert(!empty());
+    remove(_front);
+  }
+  void pop_back() {
+    ceph_assert(!empty());
+    remove(_back);
+  }
+
+  class iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    iterator(item *i = 0) : cur(i) {}
+    T operator*() { return static_cast<T>(cur->_item); }
+    iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const { return cur == 0; }
+    bool operator==(const iterator& rhs) const { return cur == rhs.cur; }
+    bool operator!=(const iterator& rhs) const { return cur != rhs.cur; }
+  };
+
+  iterator begin() { return iterator(_front); }
+  iterator end() { return iterator(NULL); }
+
+  class const_iterator: std::iterator<std::forward_iterator_tag, T> {
+  private:
+    item *cur;
+  public:
+    const_iterator(item *i = 0) : cur(i) {}
+    const T operator*() { return static_cast<const T>(cur->_item); }
+    const_iterator& operator++() {
+      ceph_assert(cur);
+      ceph_assert(cur->_list);
+      cur = cur->_next;
+      return *this;
+    }
+    bool end() const { return cur == 0; }
+    // rhs is taken by const reference so comparing against a temporary such as list.end() compiles.
+    bool operator==(const const_iterator& rhs) const { return cur == rhs.cur; }
+    bool operator!=(const const_iterator& rhs) const { return cur != rhs.cur; }
+  };
+
+  const_iterator begin() const { return const_iterator(_front); }
+  const_iterator end() const { return const_iterator(NULL); }
+
+  // Streams the elements separated by ", ", dereferencing each one.
+  friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) {
+    bool first = true;
+    for (const auto &item : list) {
+      if (!first)
+        oss << ", ";
+      oss << *item; /* item should be a pointer */
+      first = false;
+    }
+    return oss;
+  }
+};
+
+
+#endif