summaryrefslogtreecommitdiffstats
path: root/src/include
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/include
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/include/CMakeLists.txt46
-rw-r--r--src/include/CompatSet.h285
-rw-r--r--src/include/Context.h535
-rw-r--r--src/include/Distribution.h73
-rw-r--r--src/include/addr_parsing.h28
-rw-r--r--src/include/alloc_ptr.h91
-rw-r--r--src/include/any.h704
-rw-r--r--src/include/bitmapper.h48
-rw-r--r--src/include/blobhash.h53
-rw-r--r--src/include/btree_map.h68
-rw-r--r--src/include/buffer.h1294
-rw-r--r--src/include/buffer_fwd.h19
-rw-r--r--src/include/buffer_raw.h120
-rw-r--r--src/include/byteorder.h55
-rw-r--r--src/include/ceph_assert.h147
-rw-r--r--src/include/ceph_features.h280
-rw-r--r--src/include/ceph_frag.h109
-rw-r--r--src/include/ceph_fs.h1137
-rw-r--r--src/include/ceph_fuse.h51
-rw-r--r--src/include/ceph_hash.h14
-rw-r--r--src/include/cephfs/ceph_ll_client.h215
-rw-r--r--src/include/cephfs/libcephfs.h2201
-rw-r--r--src/include/cephfs/metrics/Types.h699
-rw-r--r--src/include/cephfs/types.h970
-rw-r--r--src/include/color.h13
-rw-r--r--src/include/common_fwd.h32
-rw-r--r--src/include/compact_map.h383
-rw-r--r--src/include/compact_set.h305
-rw-r--r--src/include/compat.h420
-rw-r--r--src/include/config-h.in.cmake393
-rw-r--r--src/include/coredumpctl.h105
-rw-r--r--src/include/counter.h56
-rw-r--r--src/include/cpp-btree/btree.h2571
-rw-r--r--src/include/cpp-btree/btree_container.h526
-rw-r--r--src/include/cpp-btree/btree_map.h159
-rw-r--r--src/include/cpp-btree/btree_set.h632
-rw-r--r--src/include/cpp_lib_backport.h30
-rw-r--r--src/include/crc32c.h57
-rw-r--r--src/include/demangle.h48
-rw-r--r--src/include/denc.h1895
-rw-r--r--src/include/dlfcn_compat.h48
-rw-r--r--src/include/elist.h193
-rw-r--r--src/include/encoding.h1548
-rw-r--r--src/include/err.h31
-rw-r--r--src/include/error.h41
-rw-r--r--src/include/event_type.h24
-rw-r--r--src/include/expected.hpp2282
-rw-r--r--src/include/filepath.h250
-rw-r--r--src/include/frag.h615
-rw-r--r--src/include/fs_types.h175
-rw-r--r--src/include/function2.hpp1581
-rw-r--r--src/include/hash.h64
-rw-r--r--src/include/health.h83
-rw-r--r--src/include/inline_memory.h150
-rw-r--r--src/include/int_types.h56
-rw-r--r--src/include/intarith.h93
-rw-r--r--src/include/interval_set.h824
-rw-r--r--src/include/ipaddr.h47
-rw-r--r--src/include/krbd.h97
-rw-r--r--src/include/libcephsqlite.h73
-rw-r--r--src/include/linux_fiemap.h73
-rw-r--r--src/include/lru.h241
-rw-r--r--src/include/mempool.h557
-rw-r--r--src/include/msgr.h255
-rw-r--r--src/include/neorados/RADOS.hpp1150
-rw-r--r--src/include/neorados/RADOS_Decodable.hpp116
l---------src/include/neorados/buffer_fwd.h1
l---------src/include/neorados/completion.h1
-rw-r--r--src/include/object.h189
-rw-r--r--src/include/object_fmt.h29
-rw-r--r--src/include/on_exit.h49
-rw-r--r--src/include/page.h18
-rw-r--r--src/include/rados.h700
l---------src/include/rados/buffer.h1
l---------src/include/rados/buffer_fwd.h1
l---------src/include/rados/crc32c.h1
l---------src/include/rados/inline_memory.h1
-rw-r--r--src/include/rados/librados.h4156
-rw-r--r--src/include/rados/librados.hpp1568
-rw-r--r--src/include/rados/librados_fwd.hpp34
-rw-r--r--src/include/rados/librgw.h36
-rw-r--r--src/include/rados/objclass.h177
l---------src/include/rados/page.h1
-rw-r--r--src/include/rados/rados_types.h41
-rw-r--r--src/include/rados/rados_types.hpp341
-rw-r--r--src/include/rados/rgw_file.h431
-rw-r--r--src/include/radosstriper/libradosstriper.h620
-rw-r--r--src/include/radosstriper/libradosstriper.hpp241
-rw-r--r--src/include/random.h301
-rw-r--r--src/include/rangeset.h250
-rw-r--r--src/include/rbd/features.h121
-rw-r--r--src/include/rbd/librbd.h1549
-rw-r--r--src/include/rbd/librbd.hpp869
-rw-r--r--src/include/rbd/object_map_types.h13
-rw-r--r--src/include/rbd_types.h159
-rw-r--r--src/include/scope_guard.h49
-rw-r--r--src/include/sock_compat.h43
-rw-r--r--src/include/spinlock.h92
-rw-r--r--src/include/stat.h145
-rw-r--r--src/include/statlite.h74
-rw-r--r--src/include/str_list.h97
-rw-r--r--src/include/str_map.h180
-rw-r--r--src/include/stringify.h33
-rw-r--r--src/include/timegm.h79
-rw-r--r--src/include/types.h629
-rw-r--r--src/include/unordered_map.h11
-rw-r--r--src/include/unordered_set.h10
-rw-r--r--src/include/uses_allocator.h266
-rw-r--r--src/include/util.h114
-rw-r--r--src/include/utime.cc31
-rw-r--r--src/include/utime.h602
-rw-r--r--src/include/utime_fmt.h47
-rw-r--r--src/include/uuid.cc36
-rw-r--r--src/include/uuid.h107
-rw-r--r--src/include/win32/arpa/inet.h1
-rw-r--r--src/include/win32/dlfcn.h1
-rw-r--r--src/include/win32/fs_compat.h47
-rw-r--r--src/include/win32/ifaddrs.h39
-rw-r--r--src/include/win32/netdb.h1
-rw-r--r--src/include/win32/netinet/in.h1
-rw-r--r--src/include/win32/netinet/ip.h0
-rw-r--r--src/include/win32/netinet/tcp.h0
-rw-r--r--src/include/win32/poll.h1
-rw-r--r--src/include/win32/sys/errno.h1
-rw-r--r--src/include/win32/sys/select.h0
-rw-r--r--src/include/win32/sys/socket.h1
-rw-r--r--src/include/win32/sys/statvfs.h36
-rw-r--r--src/include/win32/sys/uio.h1
-rw-r--r--src/include/win32/sys/un.h1
-rw-r--r--src/include/win32/syslog.h64
-rw-r--r--src/include/win32/win32_errno.h146
-rw-r--r--src/include/win32/winsock_compat.h39
-rw-r--r--src/include/win32/winsock_wrapper.h27
-rw-r--r--src/include/xlist.h237
134 files changed, 42722 insertions, 0 deletions
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
new file mode 100644
index 000000000..cb9c2fea8
--- /dev/null
+++ b/src/include/CMakeLists.txt
@@ -0,0 +1,46 @@
+install(FILES
+ libcephsqlite.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+install(FILES
+ rados/librados.h
+ rados/rados_types.h
+ rados/rados_types.hpp
+ rados/librados_fwd.hpp
+ rados/librados.hpp
+ buffer.h
+ buffer_fwd.h
+ inline_memory.h
+ page.h
+ crc32c.h
+ rados/objclass.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+if(WITH_LIBRADOSSTRIPER)
+ install(FILES
+ radosstriper/libradosstriper.h
+ radosstriper/libradosstriper.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
+endif()
+
+if(WITH_RBD)
+ install(FILES
+ rbd/features.h
+ rbd/librbd.h
+ rbd/librbd.hpp
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
+endif()
+
+if(WITH_RADOSGW)
+ install(FILES
+ rados/librgw.h
+ rados/rgw_file.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
+endif()
+
+if(WITH_LIBCEPHFS)
+ install(FILES
+ cephfs/libcephfs.h
+ cephfs/ceph_ll_client.h
+ cephfs/types.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cephfs)
+endif()
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
new file mode 100644
index 000000000..35c7a7738
--- /dev/null
+++ b/src/include/CompatSet.h
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPATSET_H
+#define CEPH_COMPATSET_H
+
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+
+struct CompatSet {
+
+ struct Feature {
+ uint64_t id;
+ std::string name;
+
+ Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {}
+ };
+
+ class FeatureSet {
+ uint64_t mask;
+ std::map<uint64_t, std::string> names;
+
+ public:
+ friend struct CompatSet;
+ friend class CephCompatSet_AllSet_Test;
+ friend class CephCompatSet_other_Test;
+ friend class CephCompatSet_merge_Test;
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs);
+ friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat);
+ FeatureSet() : mask(1), names() {}
+ void insert(const Feature& f) {
+ ceph_assert(f.id > 0);
+ ceph_assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
+ names[f.id] = f.name;
+ }
+
+ bool contains(const Feature& f) const {
+ return names.count(f.id);
+ }
+ bool contains(uint64_t f) const {
+ return names.count(f);
+ }
+ /**
+ * Getter instead of using name[] to be const safe
+ */
+ std::string get_name(uint64_t const f) const {
+ std::map<uint64_t, std::string>::const_iterator i = names.find(f);
+ ceph_assert(i != names.end());
+ return i->second;
+ }
+
+ void remove(uint64_t f) {
+ if (names.count(f)) {
+ names.erase(f);
+ mask &= ~((uint64_t)1<<f);
+ }
+ }
+ void remove(const Feature& f) {
+ remove(f.id);
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ /* See below, mask always has the lowest bit set in memory, but
+ * unset in the encoding */
+ encode(mask & (~(uint64_t)1), bl);
+ encode(names, bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ decode(mask, bl);
+ decode(names, bl);
+ /**
+ * Previously, there was a bug where insert did
+ * mask |= f.id rather than mask |= (1 << f.id).
+ * In FeatureSets from those version, mask always
+ * has the lowest bit set. Since then, masks always
+ * have the lowest bit unset.
+ *
+ * When we encounter such a FeatureSet, we have to
+ * reconstruct the mask from the names map.
+ */
+ if (mask & 1) {
+ mask = 1;
+ std::map<uint64_t, std::string> temp_names;
+ temp_names.swap(names);
+ for (auto i = temp_names.begin(); i != temp_names.end(); ++i) {
+ insert(Feature(i->first, i->second));
+ }
+ } else {
+ mask |= 1;
+ }
+ }
+
+ void dump(ceph::Formatter *f) const {
+ for (auto p = names.cbegin(); p != names.cend(); ++p) {
+ char s[18];
+ snprintf(s, sizeof(s), "feature_%llu", (unsigned long long)p->first);
+ f->dump_string(s, p->second);
+ }
+ }
+ };
+
+ // These features have no impact on the read / write status
+ FeatureSet compat;
+ // If any of these features are missing, read is possible ( as long
+ // as no incompat feature is missing ) but it is not possible to write
+ FeatureSet ro_compat;
+ // If any of these features are missing, read or write is not possible
+ FeatureSet incompat;
+
+ CompatSet(FeatureSet& _compat, FeatureSet& _ro_compat, FeatureSet& _incompat) :
+ compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {}
+
+ CompatSet() : compat(), ro_compat(), incompat() { }
+
+
+ /* does this filesystem implementation have the
+ features required to read the other? */
+ bool readable(CompatSet const& other) const {
+ return !((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ }
+
+ /* does this filesystem implementation have the
+ features required to write the other? */
+ bool writeable(CompatSet const& other) const {
+ return readable(other) &&
+ !((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ }
+
+ /* Compare this CompatSet to another.
+ * CAREFULLY NOTE: This operation is NOT commutative.
+ * a > b DOES NOT imply that b < a.
+ * If returns:
+ * 0: The CompatSets have the same feature set.
+ * 1: This CompatSet's features are a strict superset of the other's.
+ * -1: This CompatSet is missing at least one feature
+ * described in the other. It may still have more features, though.
+ */
+ int compare(const CompatSet& other) const {
+ if ((other.compat.mask == compat.mask) &&
+ (other.ro_compat.mask == ro_compat.mask) &&
+ (other.incompat.mask == incompat.mask)) return 0;
+ //okay, they're not the same
+
+ //if we're writeable we have a superset of theirs on incompat and ro_compat
+ if (writeable(other) && !((other.compat.mask ^ compat.mask)
+ & other.compat.mask)) return 1;
+ //if we make it here, we weren't writeable or had a difference compat set
+ return -1;
+ }
+
+ /* Get the features supported by other CompatSet but not this one,
+ * as a CompatSet.
+ */
+ CompatSet unsupported(const CompatSet& other) const {
+ CompatSet diff;
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ diff.compat.insert( Feature(id, other.compat.names.at(id)));
+ }
+ if (mask & other_ro_compat) {
+ diff.ro_compat.insert(Feature(id, other.ro_compat.names.at(id)));
+ }
+ if (mask & other_incompat) {
+ diff.incompat.insert( Feature(id, other.incompat.names.at(id)));
+ }
+ }
+ return diff;
+ }
+
+ /* Merge features supported by other CompatSet into this one.
+ * Return: true if some features were merged
+ */
+ bool merge(CompatSet const & other) {
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.get_name(id)));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.get_name(id)));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.get_name(id)));
+ }
+ }
+ return true;
+ }
+
+ std::ostream& printlite(std::ostream& o) const {
+ o << "{c=[" << std::hex << compat.mask << "]";
+ o << ",r=[" << std::hex << ro_compat.mask << "]";
+ o << ",i=[" << std::hex << incompat.mask << "]}";
+ o << std::dec;
+ return o;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ compat.encode(bl);
+ ro_compat.encode(bl);
+ incompat.encode(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ compat.decode(bl);
+ ro_compat.decode(bl);
+ incompat.decode(bl);
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->open_object_section("compat");
+ compat.dump(f);
+ f->close_section();
+ f->open_object_section("ro_compat");
+ ro_compat.dump(f);
+ f->close_section();
+ f->open_object_section("incompat");
+ incompat.dump(f);
+ f->close_section();
+ }
+
+ static void generate_test_instances(std::list<CompatSet*>& o) {
+ o.push_back(new CompatSet);
+ o.push_back(new CompatSet);
+ o.back()->compat.insert(Feature(1, "one"));
+ o.back()->compat.insert(Feature(2, "two"));
+ o.back()->ro_compat.insert(Feature(4, "four"));
+ o.back()->incompat.insert(Feature(3, "three"));
+ }
+};
+WRITE_CLASS_ENCODER(CompatSet)
+
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::Feature& f)
+{
+ return out << "F(" << f.id << ", \"" << f.name << "\")";
+}
+
+inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs)
+{
+ return out << fs.names;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat)
+{
+ return out << "compat=" << compat.compat
+ << ",rocompat=" << compat.ro_compat
+ << ",incompat=" << compat.incompat;
+}
+
+#endif
diff --git a/src/include/Context.h b/src/include/Context.h
new file mode 100644
index 000000000..bef85ca5b
--- /dev/null
+++ b/src/include/Context.h
@@ -0,0 +1,535 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CONTEXT_H
+#define CEPH_CONTEXT_H
+
+#include "common/dout.h"
+
+#include <functional>
+#include <list>
+#include <memory>
+#include <set>
+
+#include <boost/function.hpp>
+#include <boost/system/error_code.hpp>
+
+#include "common/error_code.h"
+
+#include "include/ceph_assert.h"
+#include "common/ceph_mutex.h"
+
+#define mydout(cct, v) lgeneric_subdout(cct, context, v)
+
+/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+ GenContext(const GenContext& other);
+ const GenContext& operator=(const GenContext& other);
+
+ protected:
+ virtual void finish(T t) = 0;
+
+ public:
+ GenContext() {}
+ virtual ~GenContext() {} // we want a virtual destructor!!!
+
+ template <typename C>
+ void complete(C &&t) {
+ finish(std::forward<C>(t));
+ delete this;
+ }
+
+ template <typename C>
+ void operator()(C &&t) noexcept {
+ complete(std::forward<C>(t));
+ }
+
+ template<typename U = T>
+ auto operator()() noexcept
+ -> typename std::enable_if<std::is_default_constructible<U>::value,
+ void>::type {
+ complete(T{});
+ }
+
+
+ std::reference_wrapper<GenContext> func() {
+ return std::ref(*this);
+ }
+};
+
+template <typename T>
+using GenContextURef = std::unique_ptr<GenContext<T> >;
+
+/*
+ * Context - abstract callback class
+ */
+class Finisher;
+class Context {
+ Context(const Context& other);
+ const Context& operator=(const Context& other);
+
+ protected:
+ virtual void finish(int r) = 0;
+
+ // variant of finish that is safe to call "synchronously." override should
+ // return true.
+ virtual bool sync_finish(int r) {
+ return false;
+ }
+
+ public:
+ Context() {}
+ virtual ~Context() {} // we want a virtual destructor!!!
+ virtual void complete(int r) {
+ finish(r);
+ delete this;
+ }
+ virtual bool sync_complete(int r) {
+ if (sync_finish(r)) {
+ delete this;
+ return true;
+ }
+ return false;
+ }
+ void complete(boost::system::error_code ec) {
+ complete(ceph::from_error_code(ec));
+ }
+ void operator()(boost::system::error_code ec) noexcept {
+ complete(ec);
+ }
+
+ void operator()() noexcept {
+ complete({});
+ }
+
+ std::reference_wrapper<Context> func() {
+ return std::ref(*this);
+ }
+};
+
+/**
+ * Simple context holding a single object
+ */
+template<class T>
+class ContainerContext : public Context {
+ T obj;
+public:
+ ContainerContext(T &obj) : obj(obj) {}
+ void finish(int r) override {}
+};
+template <typename T>
+ContainerContext<T> *make_container_context(T &&t) {
+ return new ContainerContext<T>(std::forward<T>(t));
+}
+
+template <class T>
+struct Wrapper : public Context {
+ Context *to_run;
+ T val;
+ Wrapper(Context *to_run, T val) : to_run(to_run), val(val) {}
+ void finish(int r) override {
+ if (to_run)
+ to_run->complete(r);
+ }
+};
+struct RunOnDelete {
+ Context *to_run;
+ RunOnDelete(Context *to_run) : to_run(to_run) {}
+ ~RunOnDelete() {
+ if (to_run)
+ to_run->complete(0);
+ }
+};
+typedef std::shared_ptr<RunOnDelete> RunOnDeleteRef;
+
+template <typename T>
+class LambdaContext : public Context {
+public:
+ LambdaContext(T &&t) : t(std::forward<T>(t)) {}
+ void finish(int r) override {
+ if constexpr (std::is_invocable_v<T, int>)
+ t(r);
+ else
+ t();
+ }
+private:
+ T t;
+};
+
+template <typename T>
+LambdaContext<T> *make_lambda_context(T &&t) {
+ return new LambdaContext<T>(std::move(t));
+}
+
+template <typename F, typename T>
+struct LambdaGenContext : GenContext<T> {
+ F f;
+ LambdaGenContext(F &&f) : f(std::forward<F>(f)) {}
+ void finish(T t) override {
+ f(std::forward<T>(t));
+ }
+};
+template <typename T, typename F>
+GenContextURef<T> make_gen_lambda_context(F &&f) {
+ return GenContextURef<T>(new LambdaGenContext<F, T>(std::move(f)));
+}
+
+/*
+ * finish and destroy a list of Contexts
+ */
+template<class C>
+inline void finish_contexts(CephContext *cct, C& finished, int result = 0)
+{
+ if (finished.empty())
+ return;
+
+ C ls;
+ ls.swap(finished); // swap out of place to avoid weird loops
+
+ if (cct)
+ mydout(cct,10) << ls.size() << " contexts to finish with " << result << dendl;
+ for (Context* c : ls) {
+ if (cct)
+ mydout(cct,10) << "---- " << c << dendl;
+ c->complete(result);
+ }
+}
+
+class C_NoopContext : public Context {
+public:
+ void finish(int r) override { }
+};
+
+
+struct C_Lock : public Context {
+ ceph::mutex *lock;
+ Context *fin;
+ C_Lock(ceph::mutex *l, Context *c) : lock(l), fin(c) {}
+ ~C_Lock() override {
+ delete fin;
+ }
+ void finish(int r) override {
+ if (fin) {
+ std::lock_guard l{*lock};
+ fin->complete(r);
+ fin = NULL;
+ }
+ }
+};
+
+/*
+ * C_Contexts - set of Contexts
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructable.
+ */
+template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>>
+class C_ContextsBase : public ContextInstanceType {
+public:
+ CephContext *cct;
+ Container contexts;
+
+ C_ContextsBase(CephContext *cct_)
+ : cct(cct_)
+ {
+ }
+ ~C_ContextsBase() override {
+ for (auto c : contexts) {
+ delete c;
+ }
+ }
+ void add(ContextType* c) {
+ contexts.push_back(c);
+ }
+ void take(Container& ls) {
+ Container c;
+ c.swap(ls);
+ if constexpr (std::is_same_v<Container, std::list<ContextType *>>) {
+ contexts.splice(contexts.end(), c);
+ } else {
+ contexts.insert(contexts.end(), c.begin(), c.end());
+ }
+ }
+ void complete(int r) override {
+ // Neuter any ContextInstanceType custom complete(), because although
+ // I want to look like it, I don't actually want to run its code.
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ finish_contexts(cct, contexts, r);
+ }
+ bool empty() { return contexts.empty(); }
+
+ template<class C>
+ static ContextType *list_to_context(C& cs) {
+ if (cs.size() == 0) {
+ return 0;
+ } else if (cs.size() == 1) {
+ ContextType *c = cs.front();
+ cs.clear();
+ return c;
+ } else {
+ C_ContextsBase<ContextType, ContextInstanceType> *c(new C_ContextsBase<ContextType, ContextInstanceType>(0));
+ c->take(cs);
+ return c;
+ }
+ }
+};
+
+typedef C_ContextsBase<Context, Context> C_Contexts;
+
+/*
+ * C_Gather
+ *
+ * ContextType must be an ancestor class of ContextInstanceType, or the same class.
+ * ContextInstanceType must be default-constructable.
+ *
+ * BUG:? only reports error from last sub to have an error return
+ */
+template <class ContextType, class ContextInstanceType>
+class C_GatherBase {
+private:
+ CephContext *cct;
+ int result = 0;
+ ContextType *onfinish;
+#ifdef DEBUG_GATHER
+ std::set<ContextType*> waitfor;
+#endif
+ int sub_created_count = 0;
+ int sub_existing_count = 0;
+ mutable ceph::recursive_mutex lock =
+ ceph::make_recursive_mutex("C_GatherBase::lock"); // disable lockdep
+ bool activated = false;
+
+ void sub_finish(ContextType* sub, int r) {
+ lock.lock();
+#ifdef DEBUG_GATHER
+ ceph_assert(waitfor.count(sub));
+ waitfor.erase(sub);
+#endif
+ --sub_existing_count;
+ mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub
+#ifdef DEBUG_GATHER
+ << " (remaining " << waitfor << ")"
+#endif
+ << dendl;
+ if (r < 0 && result == 0)
+ result = r;
+ if ((activated == false) || (sub_existing_count != 0)) {
+ lock.unlock();
+ return;
+ }
+ lock.unlock();
+ delete_me();
+ }
+
+ void delete_me() {
+ if (onfinish) {
+ onfinish->complete(result);
+ onfinish = 0;
+ }
+ delete this;
+ }
+
+ class C_GatherSub : public ContextInstanceType {
+ C_GatherBase *gather;
+ public:
+ C_GatherSub(C_GatherBase *g) : gather(g) {}
+ void complete(int r) override {
+ // Cancel any customized complete() functionality
+ // from the Context subclass we're templated for,
+ // we only want to hit that in onfinish, not at each
+ // sub finish. e.g. MDSInternalContext.
+ Context::complete(r);
+ }
+ void finish(int r) override {
+ gather->sub_finish(this, r);
+ gather = 0;
+ }
+ ~C_GatherSub() override {
+ if (gather)
+ gather->sub_finish(this, 0);
+ }
+ };
+
+public:
+ C_GatherBase(CephContext *cct_, ContextType *onfinish_)
+ : cct(cct_), onfinish(onfinish_)
+ {
+ mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl;
+ }
+ ~C_GatherBase() {
+ mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl;
+ }
+ void set_finisher(ContextType *onfinish_) {
+ std::lock_guard l{lock};
+ ceph_assert(!onfinish);
+ onfinish = onfinish_;
+ }
+ void activate() {
+ lock.lock();
+ ceph_assert(activated == false);
+ activated = true;
+ if (sub_existing_count != 0) {
+ lock.unlock();
+ return;
+ }
+ lock.unlock();
+ delete_me();
+ }
+ ContextType *new_sub() {
+ std::lock_guard l{lock};
+ ceph_assert(activated == false);
+ sub_created_count++;
+ sub_existing_count++;
+ ContextType *s = new C_GatherSub(this);
+#ifdef DEBUG_GATHER
+ waitfor.insert(s);
+#endif
+ mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl;
+ return s;
+ }
+
+ inline int get_sub_existing_count() const {
+ std::lock_guard l{lock};
+ return sub_existing_count;
+ }
+
+ inline int get_sub_created_count() const {
+ std::lock_guard l{lock};
+ return sub_created_count;
+ }
+};
+
+/*
+ * The C_GatherBuilder remembers each C_Context created by
+ * C_GatherBuilder.new_sub() in a C_Gather. When a C_Context created
+ * by new_sub() is complete(), C_Gather forgets about it. When
+ * C_GatherBuilder notices that there are no C_Context left in
+ * C_Gather, it calls complete() on the C_Context provided as the
+ * second argument of the constructor (finisher).
+ *
+ * How to use C_GatherBuilder:
+ *
+ * 1. Create a C_GatherBuilder on the stack
+ * 2. Call gather_bld.new_sub() as many times as you want to create new subs
+ * It is safe to call this 0 times, or 100, or anything in between.
+ * 3. If you didn't supply a finisher in the C_GatherBuilder constructor,
+ * set one with gather_bld.set_finisher(my_finisher)
+ * 4. Call gather_bld.activate()
+ *
+ * Example:
+ *
+ * C_SaferCond all_done;
+ * C_GatherBuilder gb(g_ceph_context, all_done);
+ * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather
+ * gb.activate(); // consume C_Context as soon as they complete()
+ * all_done.wait(); // all_done is complete() after all new_sub() are complete()
+ *
+ * The finisher may be called at any point after step 4, including immediately
+ * from the activate() function.
+ * The finisher will never be called before activate().
+ *
+ * Note: Currently, subs must be manually freed by the caller (for some reason.)
+ */
+template <class ContextType, class GatherType>
+class C_GatherBuilderBase
+{
+public:
+ C_GatherBuilderBase(CephContext *cct_)
+ : cct(cct_), c_gather(NULL), finisher(NULL), activated(false)
+ {
+ }
+ C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_)
+ : cct(cct_), c_gather(NULL), finisher(finisher_), activated(false)
+ {
+ }
+ ~C_GatherBuilderBase() {
+ if (c_gather) {
+ ceph_assert(activated); // Don't forget to activate your C_Gather!
+ }
+ else {
+ delete finisher;
+ }
+ }
+ ContextType *new_sub() {
+ if (!c_gather) {
+ c_gather = new GatherType(cct, finisher);
+ }
+ return c_gather->new_sub();
+ }
+ void activate() {
+ if (!c_gather)
+ return;
+ ceph_assert(finisher != NULL);
+ activated = true;
+ c_gather->activate();
+ }
+ void set_finisher(ContextType *finisher_) {
+ finisher = finisher_;
+ if (c_gather)
+ c_gather->set_finisher(finisher);
+ }
+ GatherType *get() const {
+ return c_gather;
+ }
+ bool has_subs() const {
+ return (c_gather != NULL);
+ }
+ int num_subs_created() {
+ ceph_assert(!activated);
+ if (c_gather == NULL)
+ return 0;
+ return c_gather->get_sub_created_count();
+ }
+ int num_subs_remaining() {
+ ceph_assert(!activated);
+ if (c_gather == NULL)
+ return 0;
+ return c_gather->get_sub_existing_count();
+ }
+
+private:
+ CephContext *cct;
+ GatherType *c_gather;
+ ContextType *finisher;
+ bool activated;
+};
+
+typedef C_GatherBase<Context, Context> C_Gather;
+typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder;
+
+template <class ContextType>
+class ContextFactory {
+public:
+ virtual ~ContextFactory() {}
+ virtual ContextType *build() = 0;
+};
+
+inline auto lambdafy(Context *c) {
+ return [fin = std::unique_ptr<Context>(c)]
+ (boost::system::error_code ec) mutable {
+ fin.release()->complete(ceph::from_error_code(ec));
+ };
+}
+
+
+#undef mydout
+
+#endif
diff --git a/src/include/Distribution.h b/src/include/Distribution.h
new file mode 100644
index 000000000..56e998757
--- /dev/null
+++ b/src/include/Distribution.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_DISTRIBUTION_H
+#define CEPH_DISTRIBUTION_H
+
+#include <vector>
+
+class Distribution {
+ std::vector<float> p;
+ std::vector<int> v;
+
+ public:
+ //Distribution() {
+ //}
+
+ unsigned get_width() {
+ return p.size();
+ }
+
+ void clear() {
+ p.clear();
+ v.clear();
+ }
+ void add(int val, float pr) {
+ p.push_back(pr);
+ v.push_back(val);
+ }
+
+ void random() {
+ float sum = 0.0;
+ for (unsigned i=0; i<p.size(); i++) {
+ p[i] = (float)(rand() % 10000);
+ sum += p[i];
+ }
+ for (unsigned i=0; i<p.size(); i++)
+ p[i] /= sum;
+ }
+
+ int sample() {
+ float s = (float)(rand() % 10000) / 10000.0;
+ for (unsigned i=0; i<p.size(); i++) {
+ if (s < p[i]) return v[i];
+ s -= p[i];
+ }
+ ceph_abort();
+ return v[p.size() - 1]; // hmm. :/
+ }
+
+ float normalize() {
+ float s = 0.0;
+ for (unsigned i=0; i<p.size(); i++)
+ s += p[i];
+ for (unsigned i=0; i<p.size(); i++)
+ p[i] /= s;
+ return s;
+ }
+
+};
+
+#endif
diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h
new file mode 100644
index 000000000..c205ac75f
--- /dev/null
+++ b/src/include/addr_parsing.h
@@ -0,0 +1,28 @@
+/*
+ * addr_parsing.h
+ *
+ * Created on: Sep 14, 2010
+ * Author: gregf
+ * contains functions used by Ceph to convert named addresses
+ * (eg ceph.com) into IP addresses (ie 127.0.0.1).
+ */
+
+#ifndef ADDR_PARSING_H_
+#define ADDR_PARSING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int safe_cat(char **pstr, int *plen, int pos, const char *str2);
+
+/*
+ * returns a string allocated by malloc; caller must free
+ */
+char *resolve_addrs(const char *orig_str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ADDR_PARSING_H_ */
diff --git a/src/include/alloc_ptr.h b/src/include/alloc_ptr.h
new file mode 100644
index 000000000..258c58338
--- /dev/null
+++ b/src/include/alloc_ptr.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ALLOC_PTR_H
+#define CEPH_ALLOC_PTR_H
+
+#include <memory>
+
// Lazily-allocating smart pointer: get()/operator*/operator-> default-
// construct the pointee on first access (element_type must be default-
// constructible).  Move-only, like the std::unique_ptr it wraps.
template <class T>
class alloc_ptr
{
public:
  typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
  typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;

  alloc_ptr() : ptr() {}

  // Forwarding constructor, e.g. from a raw pointer.
  template<class U>
  alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}

  // Fixes: the original declared the special members for
  // `alloc_ptr<pointer>` — a *different* instantiation — and the
  // assignment operators tried to copy the owned unique_ptr (ill-formed)
  // and fell off the end without returning *this.
  alloc_ptr(alloc_ptr&& rhs) : ptr(std::move(rhs.ptr)) {}
  alloc_ptr(const alloc_ptr& rhs) = delete;
  alloc_ptr& operator=(alloc_ptr&& rhs) {
    ptr = std::move(rhs.ptr);
    return *this;
  }
  // Deleted for consistency with the deleted copy constructor: the
  // owned unique_ptr cannot be copied.
  alloc_ptr& operator=(const alloc_ptr& rhs) = delete;

  void swap (alloc_ptr& rhs) {
    ptr.swap(rhs.ptr);
  }
  // Relinquish ownership; caller must delete the returned pointer.
  element_type* release() {
    return ptr.release();
  }
  void reset(element_type *p = nullptr) {
    ptr.reset(p);
  }
  // Lazy accessors: allocate a default-constructed pointee on demand.
  element_type* get() const {
    if (!ptr)
      ptr.reset(new element_type);
    return ptr.get();
  }
  element_type& operator*() const {
    if (!ptr)
      ptr.reset(new element_type);
    return *ptr;
  }
  element_type* operator->() const {
    if (!ptr)
      ptr.reset(new element_type);
    return ptr.get();
  }
  operator bool() const {
    return !!ptr;
  }

  // Fix: the comparison function objects must be *invoked*; the original
  // constructed e.g. std::less<element_type> with two arguments, which
  // does not compile when instantiated.  Note: comparing dereferences,
  // so an empty operand lazily allocates a default-constructed value.
  friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return std::less<element_type>()(*lhs, *rhs);
  }
  friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return std::less_equal<element_type>()(*lhs, *rhs);
  }
  friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return std::greater<element_type>()(*lhs, *rhs);
  }
  friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return std::greater_equal<element_type>()(*lhs, *rhs);
  }
  friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs == *rhs;
  }
  friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs != *rhs;
  }
private:
  // mutable so the const lazy accessors above can allocate.
  mutable std::unique_ptr<element_type> ptr;
};
+
+#endif
diff --git a/src/include/any.h b/src/include/any.h
new file mode 100644
index 000000000..da59c88f4
--- /dev/null
+++ b/src/include/any.h
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef INCLUDE_STATIC_ANY
+#define INCLUDE_STATIC_ANY
+
+#include <any>
+#include <cstddef>
+#include <initializer_list>
+#include <memory>
+#include <typeinfo>
+#include <type_traits>
+
+#include <boost/smart_ptr/shared_ptr.hpp>
+#include <boost/smart_ptr/make_shared.hpp>
+
+namespace ceph {
+
+namespace _any {
+
+// Shared Functionality
+// --------------------
+//
+// Common implementation details. Most functionality is here. We
+// assume that destructors do not throw. Some of them might and
+// they'll invoke terminate and that's fine.
+//
+// We are using the Curiously Recurring Template Pattern! We require
+// that all classes inheriting from us provide:
+//
+// - `static constexpr size_t capacity`: Maximum capacity. No object
+// larger than this may be
+// stored. `dynamic` for dynamic.
+// - `void* ptr() const noexcept`: returns a pointer to storage.
+// (`alloc_storage` must have been called.
+// `free_storage` must not have been called
+// since.)
+// - `void* alloc_storage(const std::size_t)`: allocate storage
+// - `void free_storage() noexcept`: free storage. Must be idempotent.
+//
+// We provide most of the public interface, as well as the operator function,
+// cast_helper, and the type() call.
+
+// Set `capacity` to this value to indicate that there is no fixed
+// capacity.
+//
+inline constexpr std::size_t dynamic = ~0;
+
+// Driver Function
+// ---------------
+//
+// The usual type-erasure control function trick. This one is simpler
+// than usual since we punt on moving and copying. We could dispense
+// with this and just store a deleter and a pointer to a typeinfo, but
+// that would be twice the space.
+//
+// Moved out here so the type of `func_t` isn't dependent on the
+// enclosing class.
+//
// The two operations a driver function can perform on the stored value.
enum class op { type, destroy };

// Type-erasure driver: one instantiation per stored type T.  `op::type`
// writes a pointer to T's typeinfo through `p`; `op::destroy` runs T's
// destructor on the object at `p`.
template<typename T>
inline void op_func(const op o, void* p) noexcept {
  static const std::type_info& type = typeid(T);
  if (o == op::type) {
    *(reinterpret_cast<const std::type_info**>(p)) = &type;
  } else if (o == op::destroy) {
    reinterpret_cast<T*>(p)->~T();
  }
}

// Signature of the driver-function pointer stored by `base`.
using func_t = void (*)(const op, void* p) noexcept;
+
+// The base class
+// --------------
+//
+// The `storage_t` parameter gives the type of the value that manages
+// storage and allocation. We use it to create a protected data member
+// (named `storage`). This allows us to sidestep the problem in
+// initialization order where, where exposed constructors were using
+// trying to allocate or free storage *before* the data members of the
+// derived class were initialized.
+//
+// Making storage_t a member type of the derived class won't work, due
+// to C++'s rules for nested types being *horrible*. Just downright
+// *horrible*.
+//
+template<typename D, typename storage_t>
+class base {
+ // Make definitions from our superclass visible
+ // --------------------------------------------
+ //
+ // And check that they fit the requirements. At least those that are
+ // statically checkable.
+ //
+ static constexpr std::size_t capacity = D::capacity;
+
+ void* ptr() const noexcept {
+ static_assert(
+ noexcept(static_cast<const D*>(this)->ptr()) &&
+ std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>,
+ "‘void* ptr() const noexcept’ missing from superclass");
+ return static_cast<const D*>(this)->ptr();
+ }
+
+ void* alloc_storage(const std::size_t z) {
+ static_assert(
+ std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>,
+ "‘void* alloc_storage(const size_t)’ missing from superclass.");
+ return static_cast<D*>(this)->alloc_storage(z);
+ }
+
+ void free_storage() noexcept {
+ static_assert(
+ noexcept(static_cast<D*>(this)->free_storage()) &&
+ std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>,
+ "‘void free_storage() noexcept’ missing from superclass.");
+ static_cast<D*>(this)->free_storage();
+ }
+
+
+ // Pile O' Templates
+ // -----------------
+ //
+ // These are just verbose and better typed once than twice. They're
+ // used for SFINAE and declaring noexcept.
+ //
+ template<class T>
+ struct is_in_place_type_helper : std::false_type {};
+ template<class T>
+ struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {};
+
+ template<class T>
+ static constexpr bool is_in_place_type_v =
+ is_in_place_type_helper<std::decay_t<T>>::value;
+
+ // SFINAE condition for value initialized
+ // constructors/assigners. This is analogous to the standard's
+ // requirement that this overload only participate in overload
+ // resolution if std::decay_t<T> is not the same type as the
+ // any-type, nor a specialization of std::in_place_type_t
+ //
+ template<typename T>
+ using value_condition_t = std::enable_if_t<
+ !std::is_same_v<std::decay_t<T>, D> &&
+ !is_in_place_type_v<std::decay_t<T>>>;
+
+ // This `noexcept` condition for value construction lets
+ // `immobile_any`'s value constructor/assigner be noexcept, so long
+ // as the type's copy or move constructor cooperates.
+ //
+ template<typename T>
+ static constexpr bool value_noexcept_v =
+ std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic;
+
+ // SFINAE condition for in-place constructors/assigners
+ //
+ template<typename T, typename... Args>
+ using in_place_condition_t = std::enable_if_t<std::is_constructible_v<
+ std::decay_t<T>, Args...>>;
+
+ // Analogous to the above. Give noexcept to immobile_any::emplace
+ // when possible.
+ //
+ template<typename T, typename... Args>
+ static constexpr bool in_place_noexcept_v =
+ std::is_nothrow_constructible_v<std::decay_t<T>, Args...> &&
+ capacity != dynamic;
+
+private:
+
+ // Functionality!
+ // --------------
+
+ // The driver function for the currently stored object. Whether this
+ // is null is the canonical way to know whether an instance has a
+ // value.
+ //
+ func_t func = nullptr;
+
+ // Construct an object within ourselves. As you can see we give the
+ // weak exception safety guarantee.
+ //
+ template<typename T, typename ...Args>
+ std::decay_t<T>& construct(Args&& ...args) {
+ using Td = std::decay_t<T>;
+ static_assert(capacity == dynamic || sizeof(Td) <= capacity,
+ "Supplied type is too large for this specialization.");
+ try {
+ func = &op_func<Td>;
+ return *new (reinterpret_cast<Td*>(alloc_storage(sizeof(Td))))
+ Td(std::forward<Args>(args)...);
+ } catch (...) {
+ reset();
+ throw;
+ }
+ }
+
+protected:
+
+ // We hold the storage, even if the superclass class manipulates it,
+ // so that its default initialization comes soon enough for us to
+ // use it in our constructors.
+ //
+ storage_t storage;
+
+public:
+
+ base() noexcept = default;
+ ~base() noexcept {
+ reset();
+ }
+
+protected:
+ // Since some of our derived classes /can/ be copied or moved.
+ //
+ base(const base& rhs) noexcept : func(rhs.func) {
+ if constexpr (std::is_copy_assignable_v<storage_t>) {
+ storage = rhs.storage;
+ }
+ }
+ base& operator =(const base& rhs) noexcept {
+ reset();
+ func = rhs.func;
+ if constexpr (std::is_copy_assignable_v<storage_t>) {
+ storage = rhs.storage;
+ }
+ return *this;
+ }
+
+ base(base&& rhs) noexcept : func(std::move(rhs.func)) {
+ if constexpr (std::is_move_assignable_v<storage_t>) {
+ storage = std::move(rhs.storage);
+ }
+ rhs.func = nullptr;
+ }
+ base& operator =(base&& rhs) noexcept {
+ reset();
+ func = rhs.func;
+ if constexpr (std::is_move_assignable_v<storage_t>) {
+ storage = std::move(rhs.storage);
+ }
+ rhs.func = nullptr;
+ return *this;
+ }
+
+public:
+
+ // Value construct/assign
+ // ----------------------
+ //
+ template<typename T,
+ typename = value_condition_t<T>>
+ base(T&& t) noexcept(value_noexcept_v<T>) {
+ construct<T>(std::forward<T>(t));
+ }
+
+ // On exception, *this is set to empty.
+ //
+ template<typename T,
+ typename = value_condition_t<T>>
+ base& operator =(T&& t) noexcept(value_noexcept_v<T>) {
+ reset();
+ construct<T>(std::forward<T>(t));
+ return *this;
+ }
+
+ // In-place construct/assign
+ // -------------------------
+ //
+ // I really hate the way the C++ standard library treats references
+ // as if they were stepchildren in a Charles Dickens novel. I am
+ // quite upset that std::optional lacks a specialization for
+ // references. There's no legitimate reason for it. The whole
+ // 're-seat or refuse' debate is simply a canard. The optional is
+ // effectively a container, so of course it can be emptied or
+ // reassigned. No, pointers are not an acceptable substitute. A
+ // pointer gives an address in memory which may be null and which
+ // may represent an object or may a location in which an object is
+ // to be created. An optional reference, on the other hand, is a
+ // reference to an initialized, live object or /empty/. This is an
+ // obvious difference that should be communicable to any programmer
+ // reading the code through the type system.
+ //
+ // `std::any`, even in the case of in-place construction,
+ // only stores the decayed type. I suspect this was to get around
+ // the question of whether, for a std::any holding a T&,
+ // std::any_cast<T> should return a copy or throw
+ // std::bad_any_cast.
+ //
+ // I think the appropriate response in that case would be to make a
+ // copy if the type supports it and fail otherwise. Once a concrete
+ // type is known the problem solves itself.
+ //
+ // If one were inclined, one could easily load the driver function
+ // with a heavy subset of the type traits (those that depend only on
+ // the type in question) and simply /ask/ whether it's a reference.
+ //
+ // At the moment, I'm maintaining compatibility with the standard
+ // library except for copy/move semantics.
+ //
+ template<typename T,
+ typename... Args,
+ typename = in_place_condition_t<T, Args...>>
+ base(std::in_place_type_t<T>,
+ Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) {
+ construct<T>(std::forward<Args>(args)...);
+ }
+
+ // On exception, *this is set to empty.
+ //
+ template<typename T,
+ typename... Args,
+ typename = in_place_condition_t<T>>
+ std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v<
+ T, Args...>) {
+ reset();
+ return construct<T>(std::forward<Args>(args)...);
+ }
+
+ template<typename T,
+ typename U,
+ typename... Args,
+ typename = in_place_condition_t<T, std::initializer_list<U>,
+ Args...>>
+ base(std::in_place_type_t<T>,
+ std::initializer_list<U> i,
+ Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>,
+ Args...>) {
+ construct<T>(i, std::forward<Args>(args)...);
+ }
+
+ // On exception, *this is set to empty.
+ //
+ template<typename T,
+ typename U,
+ typename... Args,
+ typename = in_place_condition_t<T, std::initializer_list<U>,
+ Args...>>
+ std::decay_t<T>& emplace(std::initializer_list<U> i,
+ Args&& ...args) noexcept(in_place_noexcept_v<T,
+ std::initializer_list<U>,
+ Args...>) {
+ reset();
+ return construct<T>(i,std::forward<Args>(args)...);
+ }
+
+ // Empty ourselves, using the subclass to free any storage.
+ //
+ void reset() noexcept {
+ if (has_value()) {
+ func(op::destroy, ptr());
+ func = nullptr;
+ }
+ free_storage();
+ }
+
+ template<typename U = storage_t,
+ typename = std::enable_if<std::is_swappable_v<storage_t>>>
+ void swap(base& rhs) {
+ using std::swap;
+ swap(func, rhs.func);
+ swap(storage, rhs.storage);
+ }
+
+ // All other functions should use this function to test emptiness
+ // rather than examining `func` directly.
+ //
+ bool has_value() const noexcept {
+ return !!func;
+ }
+
+ // Returns the type of the value stored, if any.
+ //
+ const std::type_info& type() const noexcept {
+ if (has_value()) {
+ const std::type_info* t;
+ func(op::type, reinterpret_cast<void*>(&t));
+ return *t;
+ } else {
+ return typeid(void);
+ }
+ }
+
+ template<typename T, typename U, typename V>
+ friend inline void* cast_helper(const base<U, V>& b) noexcept;
+};
+
+// Function used by all `any_cast` functions
+//
+// Returns a void* to the contents if they exist and match the
+// requested type, otherwise `nullptr`.
+//
+template<typename T, typename U, typename V>
+inline void* cast_helper(const base<U, V>& b) noexcept {
+ if (b.func && ((&op_func<T> == b.func) ||
+ (b.type() == typeid(T)))) {
+ return b.ptr();
+ } else {
+ return nullptr;
+ }
+}
+}
+
+// `any_cast`
+// ==========
+//
+// Just the usual gamut of `any_cast` overloads. These get a bit
+// repetitive and it would be nice to think of a way to collapse them
+// down a bit.
+//
+
+// The pointer pair!
+//
+template<typename T, typename U, typename V>
+inline T* any_cast(_any::base<U, V>* a) noexcept {
+ if (a) {
+ return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a));
+ }
+ return nullptr;
+}
+
+template<typename T, typename U, typename V>
+inline const T* any_cast(const _any::base<U, V>* a) noexcept {
+ if (a) {
+ return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a));
+ }
+ return nullptr;
+}
+
+// While we disallow copying the immobile any itself, we can allow
+// anything with an extracted value that the type supports.
+//
+template<typename T, typename U, typename V>
+inline T any_cast(_any::base<U, V>& a) {
+ static_assert(std::is_reference_v<T> ||
+ std::is_copy_constructible_v<T>,
+ "The supplied type must be either a reference or "
+ "copy constructible.");
+ auto p = any_cast<std::decay_t<T>>(&a);
+ if (p) {
+ return static_cast<T>(*p);
+ }
+ throw std::bad_any_cast();
+}
+
+template<typename T, typename U, typename V>
+inline T any_cast(const _any::base<U, V>& a) {
+ static_assert(std::is_reference_v<T> ||
+ std::is_copy_constructible_v<T>,
+ "The supplied type must be either a reference or "
+ "copy constructible.");
+ auto p = any_cast<std::decay_t<T>>(&a);
+ if (p) {
+ return static_cast<T>(*p);
+ }
+ throw std::bad_any_cast();
+}
+
+template<typename T, typename U, typename V>
+inline std::enable_if_t<(std::is_move_constructible_v<T> ||
+ std::is_copy_constructible_v<T>) &&
+ !std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+ auto p = any_cast<std::decay_t<T>>(&a);
+ if (p) {
+ return std::move((*p));
+ }
+ throw std::bad_any_cast();
+}
+
+template<typename T, typename U, typename V>
+inline std::enable_if_t<std::is_rvalue_reference_v<T>, T>
+any_cast(_any::base<U, V>&& a) {
+ auto p = any_cast<std::decay_t<T>>(&a);
+ if (p) {
+ return static_cast<T>(*p);
+ }
+ throw std::bad_any_cast();
+}
+
+// `immobile_any`
+// ==============
+//
+// Sometimes, uncopyable objects exist and I want to do things with
+// them. The C++ standard library is really quite keen on insisting
+// things be copyable before it deigns to work. I find this annoying.
+//
+// Also, the allocator, while useful, is really not considerate of
+// other people's time. Every time we go to visit it, it takes us
+// quite an awfully long time to get away again. As such, I've been
+// trying to avoid its company whenever it is convenient and seemly.
+//
+// We accept any type that will fit in the declared capacity. You may
+// store types with throwing destructors, but terminate will be
+// invoked when they throw.
+//
template<std::size_t S>
class immobile_any : public _any::base<immobile_any<S>,
                                       std::aligned_storage_t<S>> {
  using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>;
  friend base;

  using _any::base<immobile_any<S>, std::aligned_storage_t<S>>::storage;

  // Superclass requirements!
  // ------------------------
  //
  // Simple as anything. We have a buffer of fixed size and return the
  // pointer to it when asked.
  //
  // NOTE(review): aligned_storage_t<S> uses the default alignment for
  // a size-S object; a stored type with stricter (over-)alignment than
  // that would be misaligned here — confirm no such types are stored.
  //
  static constexpr std::size_t capacity = S;
  // Address of the in-object buffer (never null).
  void* ptr() const noexcept {
    return const_cast<void*>(static_cast<const void*>(&storage));
  }
  // "Allocation" just hands back the in-object buffer; size was already
  // checked against `capacity` by base::construct's static_assert.
  void* alloc_storage(std::size_t) noexcept {
    return ptr();
  }
  // Nothing to free; base::reset() calls this unconditionally, so it
  // must be (and is) an idempotent no-op.
  void free_storage() noexcept {}

  static_assert(capacity != _any::dynamic,
                "That is not a valid size for an immobile_any.");

public:

  immobile_any() noexcept = default;

  // Immobile: neither copyable nor movable, so values stored in-place
  // are never required to be copyable or movable themselves.
  immobile_any(const immobile_any&) = delete;
  immobile_any& operator =(const immobile_any&) = delete;
  immobile_any(immobile_any&&) = delete;
  immobile_any& operator =(immobile_any&&) = delete;

  using base::base;
  using base::operator =;

  void swap(immobile_any&) = delete;
};
+
+template<typename T, std::size_t S, typename... Args>
+inline immobile_any<S> make_immobile_any(Args&& ...args) {
+ return immobile_any<S>(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+template<typename T, std::size_t S, typename U, typename... Args>
+inline immobile_any<S> make_immobile_any(std::initializer_list<U> i, Args&& ...args) {
+ return immobile_any<S>(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `unique_any`
+// ============
+//
+// Oh dear. Now we're getting back into allocation. You don't think
+// the allocator noticed all those mean things we said about it, do
+// you?
+//
+// Well. Okay, allocator. Sometimes when it's the middle of the night
+// and you're writing template code you say things you don't exactly
+// mean. If it weren't for you, we wouldn't have any memory to run all
+// our programs in at all. Really, I'm just being considerate of
+// *your* needs, trying to avoid having to run to you every time we
+// instantiate a type, making a few that can be self-sufficient…uh…
+//
+// **Anyway**, this is movable but not copyable, as you should expect
+// from anything with ‘unique’ in the name.
+//
+class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> {
+ using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>;
+ friend base;
+
+ using base::storage;
+
+ // Superclass requirements
+ // -----------------------
+ //
+ // Our storage is a single chunk of RAM owned by a
+ // `std::unique_ptr`.
+ //
+ static constexpr std::size_t capacity = _any::dynamic;
+ void* ptr() const noexcept {
+ return static_cast<void*>(storage.get());
+ return nullptr;
+ }
+
+ void* alloc_storage(const std::size_t z) {
+ storage.reset(new std::byte[z]);
+ return ptr();
+ }
+
+ void free_storage() noexcept {
+ storage.reset();
+ }
+
+public:
+
+ unique_any() noexcept = default;
+ ~unique_any() noexcept = default;
+
+ unique_any(const unique_any&) = delete;
+ unique_any& operator =(const unique_any&) = delete;
+
+ // We can rely on the behavior of `unique_ptr` and the base class to
+ // give us a default move constructor that does the right thing.
+ //
+ unique_any(unique_any&& rhs) noexcept = default;
+ unique_any& operator =(unique_any&& rhs) = default;
+
+ using base::base;
+ using base::operator =;
+};
+
+inline void swap(unique_any& lhs, unique_any& rhs) noexcept {
+ lhs.swap(rhs);
+}
+
+template<typename T, typename... Args>
+inline unique_any make_unique_any(Args&& ...args) {
+ return unique_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+template<typename T, typename U, typename... Args>
+inline unique_any make_unique_any(std::initializer_list<U> i, Args&& ...args) {
+ return unique_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `shared_any`
+// ============
+//
+// Once more with feeling!
+//
+// This is both copyable *and* movable. In case you need that sort of
+// thing. It seemed a reasonable completion.
+//
class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> {
  using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>;
  friend base;

  using base::storage;

  // Superclass requirements
  // -----------------------
  //
  // Our storage is a single chunk of RAM allocated from the
  // heap. This time it's owned by a `boost::shared_ptr` so we can use
  // `boost::make_shared_noinit`. (This lets us get the optimization
  // that allocates array and control block in one without wasting
  // time on `memset`.)
  //
  static constexpr std::size_t capacity = _any::dynamic;
  void* ptr() const noexcept {
    return static_cast<void*>(storage.get());
  }

  // Allocate a fresh, uninitialized n-byte buffer.
  void* alloc_storage(std::size_t n) {
    storage = boost::make_shared_noinit<std::byte[]>(n);
    return ptr();
  }

  // Drops this instance's reference; idempotent, as base::reset() requires.
  void free_storage() noexcept {
    storage.reset();
  }

public:

  shared_any() noexcept = default;
  ~shared_any() noexcept = default;

  // NOTE(review): copying shares the underlying buffer — both copies
  // refer to the *same* stored object, not a deep copy. Callers must
  // not assume value semantics.
  shared_any(const shared_any& rhs) noexcept = default;
  shared_any& operator =(const shared_any&) noexcept = default;

  shared_any(shared_any&& rhs) noexcept = default;
  shared_any& operator =(shared_any&& rhs) noexcept = default;

  using base::base;
  using base::operator =;
};
+
+inline void swap(shared_any& lhs, shared_any& rhs) noexcept {
+ lhs.swap(rhs);
+}
+
+template<typename T, typename... Args>
+inline shared_any make_shared_any(Args&& ...args) {
+ return shared_any(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+template<typename T, typename U, typename... Args>
+inline shared_any make_shared_any(std::initializer_list<U> i, Args&& ...args) {
+ return shared_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+}
+
+#endif // INCLUDE_STATIC_ANY
diff --git a/src/include/bitmapper.h b/src/include/bitmapper.h
new file mode 100644
index 000000000..5a65cc20f
--- /dev/null
+++ b/src/include/bitmapper.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BITMAPPER_H
+#define CEPH_BITMAPPER_H
+
// Non-owning view of a byte buffer as an array of bits.  Bit b lives in
// byte b>>3 at in-byte position b&7 (LSB first).  No bounds checking.
class bitmapper {
  char *_buf;
  int _size;

  // Byte offset and in-byte mask for bit `b`.
  static int byte_of(int b) { return b >> 3; }
  static char mask_of(int b) { return char(1 << (b & 7)); }

 public:
  // Unbound; attach a buffer later with set_data().
  bitmapper() : _buf(0), _size(0) { }
  // View `len` bytes at `data`.
  bitmapper(char *data, int len) : _buf(data), _size(len) { }

  void set_data(char *data, int len) { _buf = data; _size = len; }

  int bytes() const { return _size; }
  int bits() const { return _size * 8; }

  bool operator[](int b) const {
    return get(b);
  }
  bool get(int b) const {
    return (_buf[byte_of(b)] & mask_of(b)) != 0;
  }
  void set(int b) {
    _buf[byte_of(b)] |= mask_of(b);
  }
  void clear(int b) {
    _buf[byte_of(b)] &= ~mask_of(b);
  }
  void toggle(int b) {
    _buf[byte_of(b)] ^= mask_of(b);
  }
};
+
+#endif
diff --git a/src/include/blobhash.h b/src/include/blobhash.h
new file mode 100644
index 000000000..303892b13
--- /dev/null
+++ b/src/include/blobhash.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BLOBHASH_H
+#define CEPH_BLOBHASH_H
+
+#include <cstdint>
+#include "hash.h"
+
+class blobhash {
+public:
+ uint32_t operator()(const void* p, size_t len) {
+ static rjhash<std::uint32_t> H;
+ std::uint32_t acc = 0;
+ auto buf = static_cast<const unsigned char*>(p);
+ while (len >= sizeof(acc)) {
+ acc ^= unaligned_load(buf);
+ buf += sizeof(std::uint32_t);
+ len -= sizeof(std::uint32_t);
+ }
+ // handle the last few bytes of p[-(len % 4):]
+ switch (len) {
+ case 3:
+ acc ^= buf[2] << 16;
+ [[fallthrough]];
+ case 2:
+ acc ^= buf[1] << 8;
+ [[fallthrough]];
+ case 1:
+ acc ^= buf[0];
+ }
+ return H(acc);
+ }
+private:
+ static inline std::uint32_t unaligned_load(const unsigned char* p) {
+ std::uint32_t result;
+ __builtin_memcpy(&result, p, sizeof(result));
+ return result;
+ }
+};
+
+
+#endif
diff --git a/src/include/btree_map.h b/src/include/btree_map.h
new file mode 100644
index 000000000..218835a0f
--- /dev/null
+++ b/src/include/btree_map.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/ceph_assert.h" // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+ using ceph::encode;
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl, uint64_t features)
+{
+ using ceph::encode;
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+ using ceph::encode;
+ for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+
+#endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
new file mode 100644
index 000000000..10dceaec2
--- /dev/null
+++ b/src/include/buffer.h
@@ -0,0 +1,1294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_BUFFER_H
+#define CEPH_BUFFER_H
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <stdlib.h>
+#endif
+#include <limits.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600
+#endif
+
+#include <stdio.h>
+#include <sys/uio.h>
+
+#if defined(__linux__) // For malloc(2).
+#include <malloc.h>
+#endif
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(__CYGWIN__) && !defined(_WIN32)
+# include <sys/mman.h>
+#endif
+
+#include <iosfwd>
+#include <iomanip>
+#include <list>
+#include <memory>
+#include <vector>
+#include <string>
+#if __cplusplus >= 201703L
+#include <string_view>
+#endif // __cplusplus >= 201703L
+
+#include <exception>
+#include <type_traits>
+
+#include "page.h"
+#include "crc32c.h"
+#include "buffer_fwd.h"
+
+
+#ifdef __CEPH__
+# include "include/ceph_assert.h"
+#else
+# include <assert.h>
+#endif
+
+#include "inline_memory.h"
+
+#define CEPH_BUFFER_API
+
+#ifdef HAVE_SEASTAR
+namespace seastar {
+template <typename T> class temporary_buffer;
+namespace net {
+class packet;
+}
+}
+#endif // HAVE_SEASTAR
+class deleter;
+
+template<typename T> class DencDumper;
+
+namespace ceph {
+
+template <class T>
+struct nop_delete {
+ void operator()(T*) {}
+};
+
+// This is not a unique_ptr-like smart pointer! It just signals ownership
+// but DOES NOT manage the resource. It WILL LEAK if not manually deleted.
+// It's rather a replacement for a raw pointer than for any other smart one.
+//
+// Considered options:
+// * unique_ptr with custom deleter implemented in .cc (would provide
+// the non-zero-cost resource management),
+// * GSL's owner<T*> (pretty neat but would impose an extra dependency),
+// * unique_ptr with nop deleter,
+// * raw pointer (doesn't embed ownership enforcement - std::move).
+template <class T>
+struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> {
+ using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr;
+};
+
+namespace buffer CEPH_BUFFER_API {
+inline namespace v15_2_0 {
+
+/// Actual definitions in common/error_code.h
+struct error;
+struct bad_alloc;
+struct end_of_buffer;
+struct malformed_input;
+struct error_code;
+
+ /// count of cached crc hits (matching input)
+ int get_cached_crc();
+ /// count of cached crc hits (mismatching input, required adjustment)
+ int get_cached_crc_adjusted();
+ /// count of crc cache misses
+ int get_missed_crc();
+ /// enable/disable tracking of cached crcs
+ void track_cached_crc(bool b);
+
+ /*
+ * an abstract raw buffer. with a reference count.
+ */
+ class raw;
+ class raw_malloc;
+ class raw_static;
+ class raw_posix_aligned;
+ class raw_hack_aligned;
+ class raw_claimed_char;
+ class raw_unshareable; // diagnostic, unshareable char buffer
+ class raw_combined;
+ class raw_claim_buffer;
+
+
+ /*
+ * named constructors
+ */
+ ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len);
+ ceph::unique_leakable_ptr<raw> create(unsigned len);
+ ceph::unique_leakable_ptr<raw> create(unsigned len, char c);
+ ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool);
+ ceph::unique_leakable_ptr<raw> claim_char(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_malloc(unsigned len);
+ ceph::unique_leakable_ptr<raw> claim_malloc(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_static(unsigned len, char *buf);
+ ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align);
+ ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool);
+ ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len);
+ ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len);
+ ceph::unique_leakable_ptr<raw> claim_buffer(unsigned len, char *buf, deleter del);
+
+#ifdef HAVE_SEASTAR
+ /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to
+ /// make it safe to share between cpus
+ ceph::unique_leakable_ptr<buffer::raw> create(seastar::temporary_buffer<char>&& buf);
+ /// create a raw buffer to wrap seastar cpu-local memory, without the safety
+ /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is
+ /// destructed on this cpu
+ ceph::unique_leakable_ptr<buffer::raw> create_local(seastar::temporary_buffer<char>&& buf);
+#endif
+
+ /*
+ * a buffer pointer. references (a subsequence of) a raw buffer.
+ */
+ class CEPH_BUFFER_API ptr {
+ friend class list;
+ protected:
+ raw *_raw;
+ unsigned _off, _len;
+ private:
+
+ void release();
+
+ template<bool is_const>
+ class iterator_impl {
+ const ptr *bp; ///< parent ptr
+ const char *start; ///< starting pointer into bp->c_str()
+ const char *pos; ///< pointer into bp->c_str()
+ const char *end_ptr; ///< pointer to bp->end_c_str()
+ const bool deep; ///< if true, do not allow shallow ptr copies
+
+ iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p,
+ size_t offset, bool d)
+ : bp(p),
+ start(p->c_str() + offset),
+ pos(start),
+ end_ptr(p->end_c_str()),
+ deep(d)
+ {}
+
+ friend class ptr;
+
+ public:
+ using pointer = typename std::conditional<is_const, const char*, char *>::type;
+ pointer get_pos_add(size_t n) {
+ auto r = pos;
+ *this += n;
+ return r;
+ }
+ ptr get_ptr(size_t len) {
+ if (deep) {
+ return buffer::copy(get_pos_add(len), len);
+ } else {
+ size_t off = pos - bp->c_str();
+ *this += len;
+ return ptr(*bp, off, len);
+ }
+ }
+
+ iterator_impl& operator+=(size_t len);
+
+ const char *get_pos() {
+ return pos;
+ }
+ const char *get_end() {
+ return end_ptr;
+ }
+
+ size_t get_offset() {
+ return pos - start;
+ }
+
+ bool end() const {
+ return pos == end_ptr;
+ }
+ };
+
+ public:
+ using const_iterator = iterator_impl<true>;
+ using iterator = iterator_impl<false>;
+
+ ptr() : _raw(nullptr), _off(0), _len(0) {}
+ ptr(ceph::unique_leakable_ptr<raw> r);
+ // cppcheck-suppress noExplicitConstructor
+ ptr(unsigned l);
+ ptr(const char *d, unsigned l);
+ ptr(const ptr& p);
+ ptr(ptr&& p) noexcept;
+ ptr(const ptr& p, unsigned o, unsigned l);
+ ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r);
+ ptr& operator= (const ptr& p);
+ ptr& operator= (ptr&& p) noexcept;
+ ~ptr() {
+ // BE CAREFUL: this destructor is called also for hypercombined ptr_node.
+ // After freeing underlying raw, `*this` can become inaccessible as well!
+ release();
+ }
+
+ bool have_raw() const { return _raw ? true:false; }
+
+ void swap(ptr& other) noexcept;
+
+ iterator begin(size_t offset=0) {
+ return iterator(this, offset, false);
+ }
+ const_iterator begin(size_t offset=0) const {
+ return const_iterator(this, offset, false);
+ }
+ const_iterator cbegin() const {
+ return begin();
+ }
+ const_iterator begin_deep(size_t offset=0) const {
+ return const_iterator(this, offset, true);
+ }
+
+ // misc
+ bool is_aligned(unsigned align) const {
+ return ((uintptr_t)c_str() & (align-1)) == 0;
+ }
+ bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); }
+ bool is_n_align_sized(unsigned align) const
+ {
+ return (length() % align) == 0;
+ }
+ bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); }
+ bool is_partial() const {
+ return have_raw() && (start() > 0 || end() < raw_length());
+ }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ // accessors
+ const char *c_str() const;
+ char *c_str();
+ const char *end_c_str() const;
+ char *end_c_str();
+ unsigned length() const { return _len; }
+ unsigned offset() const { return _off; }
+ unsigned start() const { return _off; }
+ unsigned end() const { return _off + _len; }
+ unsigned unused_tail_length() const;
+ const char& operator[](unsigned n) const;
+ char& operator[](unsigned n);
+
+ const char *raw_c_str() const;
+ unsigned raw_length() const;
+ int raw_nref() const;
+
+ void copy_out(unsigned o, unsigned l, char *dest) const;
+
+ unsigned wasted() const;
+
+ int cmp(const ptr& o) const;
+ bool is_zero() const;
+
+ // modifiers
+ void set_offset(unsigned o) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= o);
+#else
+ assert(raw_length() >= o);
+#endif
+ _off = o;
+ }
+ void set_length(unsigned l) {
+#ifdef __CEPH__
+ ceph_assert(raw_length() >= l);
+#else
+ assert(raw_length() >= l);
+#endif
+ _len = l;
+ }
+
+ unsigned append(char c);
+ unsigned append(const char *p, unsigned l);
+#if __cplusplus >= 201703L
+ inline unsigned append(std::string_view s) {
+ return append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true);
+ void zero(bool crc_reset = true);
+ void zero(unsigned o, unsigned l, bool crc_reset = true);
+ unsigned append_zeros(unsigned l);
+
+#ifdef HAVE_SEASTAR
+ /// create a temporary_buffer, copying the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &;
+ /// convert to temporary_buffer, stealing the ptr as its deleter
+ operator seastar::temporary_buffer<char>() &&;
+#endif // HAVE_SEASTAR
+
+ };
+
+
+ struct ptr_hook {
+ mutable ptr_hook* next;
+
+ ptr_hook() = default;
+ ptr_hook(ptr_hook* const next)
+ : next(next) {
+ }
+ };
+
+ class ptr_node : public ptr_hook, public ptr {
+ public:
+ struct cloner {
+ ptr_node* operator()(const ptr_node& clone_this);
+ };
+ struct disposer {
+ void operator()(ptr_node* const delete_this) {
+ if (!__builtin_expect(dispose_if_hypercombined(delete_this), 0)) {
+ delete delete_this;
+ }
+ }
+ };
+
+ ~ptr_node() = default;
+
+ static std::unique_ptr<ptr_node, disposer>
+ create(ceph::unique_leakable_ptr<raw> r) {
+ return create_hypercombined(std::move(r));
+ }
+ static std::unique_ptr<ptr_node, disposer>
+ create(const unsigned l) {
+ return create_hypercombined(buffer::create(l));
+ }
+ template <class... Args>
+ static std::unique_ptr<ptr_node, disposer>
+ create(Args&&... args) {
+ return std::unique_ptr<ptr_node, disposer>(
+ new ptr_node(std::forward<Args>(args)...));
+ }
+
+ static ptr_node* copy_hypercombined(const ptr_node& copy_this);
+
+ private:
+ friend list;
+
+ template <class... Args>
+ ptr_node(Args&&... args) : ptr(std::forward<Args>(args)...) {
+ }
+ ptr_node(const ptr_node&) = default;
+
+ ptr& operator= (const ptr& p) = delete;
+ ptr& operator= (ptr&& p) noexcept = delete;
+ ptr_node& operator= (const ptr_node& p) = delete;
+ ptr_node& operator= (ptr_node&& p) noexcept = delete;
+ void swap(ptr& other) noexcept = delete;
+ void swap(ptr_node& other) noexcept = delete;
+
+ static bool dispose_if_hypercombined(ptr_node* delete_this);
+ static std::unique_ptr<ptr_node, disposer> create_hypercombined(
+ ceph::unique_leakable_ptr<raw> r);
+ };
+ /*
+ * list - the useful bit!
+ */
+
+ class CEPH_BUFFER_API list {
+ public:
+ // this is the very low-level implementation of the singly linked list
+ // ceph::buffer::list is built on. We don't use intrusive slist
+ // of Boost (or any other 3rd party) to save extra dependencies
+ // in our public headers.
+ class buffers_t {
+ // _root.next can be thought as _head
+ ptr_hook _root;
+ ptr_hook* _tail;
+
+ public:
+ template <class T>
+ class buffers_iterator {
+ typename std::conditional<
+ std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur;
+ template <class U> friend class buffers_iterator;
+ public:
+ using value_type = T;
+ using reference = typename std::add_lvalue_reference<T>::type;
+ using pointer = typename std::add_pointer<T>::type;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::forward_iterator_tag;
+
+ template <class U>
+ buffers_iterator(U* const p)
+ : cur(p) {
+ }
+ // copy constructor
+ buffers_iterator(const buffers_iterator<T>& other)
+ : cur(other.cur) {
+ }
+ // converting constructor, from iterator -> const_iterator only
+ template <class U, typename std::enable_if<
+ std::is_const<T>::value && !std::is_const<U>::value, int>::type = 0>
+ buffers_iterator(const buffers_iterator<U>& other)
+ : cur(other.cur) {
+ }
+ buffers_iterator() = default;
+
+ T& operator*() const {
+ return *reinterpret_cast<T*>(cur);
+ }
+ T* operator->() const {
+ return reinterpret_cast<T*>(cur);
+ }
+
+ buffers_iterator& operator++() {
+ cur = cur->next;
+ return *this;
+ }
+ buffers_iterator operator++(int) {
+ const auto temp(*this);
+ ++*this;
+ return temp;
+ }
+
+ template <class U>
+ buffers_iterator& operator=(buffers_iterator<U>& other) {
+ cur = other.cur;
+ return *this;
+ }
+
+ bool operator==(const buffers_iterator& rhs) const {
+ return cur == rhs.cur;
+ }
+ bool operator!=(const buffers_iterator& rhs) const {
+ return !(*this==rhs);
+ }
+ };
+
+ typedef buffers_iterator<const ptr_node> const_iterator;
+ typedef buffers_iterator<ptr_node> iterator;
+
+ typedef const ptr_node& const_reference;
+ typedef ptr_node& reference;
+
+ buffers_t()
+ : _root(&_root),
+ _tail(&_root) {
+ }
+ buffers_t(const buffers_t&) = delete;
+ buffers_t(buffers_t&& other)
+ : _root(other._root.next == &other._root ? &_root : other._root.next),
+ _tail(other._tail == &other._root ? &_root : other._tail) {
+ other._root.next = &other._root;
+ other._tail = &other._root;
+
+ _tail->next = &_root;
+ }
+ buffers_t& operator=(buffers_t&& other) {
+ if (&other != this) {
+ clear_and_dispose();
+ swap(other);
+ }
+ return *this;
+ }
+
+ void push_back(reference item) {
+ item.next = &_root;
+ // this updates _root.next when called on empty
+ _tail->next = &item;
+ _tail = &item;
+ }
+
+ void push_front(reference item) {
+ item.next = _root.next;
+ _root.next = &item;
+ _tail = _tail == &_root ? &item : _tail;
+ }
+
+ // *_after
+ iterator erase_after(const_iterator it) {
+ const auto* to_erase = it->next;
+
+ it->next = to_erase->next;
+ _root.next = _root.next == to_erase ? to_erase->next : _root.next;
+ _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail;
+ return it->next;
+ }
+
+ void insert_after(const_iterator it, reference item) {
+ item.next = it->next;
+ it->next = &item;
+ _root.next = it == end() ? &item : _root.next;
+ _tail = const_iterator(_tail) == it ? &item : _tail;
+ }
+
+ void splice_back(buffers_t& other) {
+ if (other.empty()) {
+ return;
+ }
+
+ other._tail->next = &_root;
+ // will update root.next if empty() == true
+ _tail->next = other._root.next;
+ _tail = other._tail;
+
+ other._root.next = &other._root;
+ other._tail = &other._root;
+ }
+
+ bool empty() const { return _tail == &_root; }
+
+ const_iterator begin() const {
+ return _root.next;
+ }
+ const_iterator before_begin() const {
+ return &_root;
+ }
+ const_iterator end() const {
+ return &_root;
+ }
+ iterator begin() {
+ return _root.next;
+ }
+ iterator before_begin() {
+ return &_root;
+ }
+ iterator end() {
+ return &_root;
+ }
+
+ reference front() {
+ return reinterpret_cast<reference>(*_root.next);
+ }
+ reference back() {
+ return reinterpret_cast<reference>(*_tail);
+ }
+ const_reference front() const {
+ return reinterpret_cast<const_reference>(*_root.next);
+ }
+ const_reference back() const {
+ return reinterpret_cast<const_reference>(*_tail);
+ }
+
+ void clone_from(const buffers_t& other) {
+ clear_and_dispose();
+ for (auto& node : other) {
+ ptr_node* clone = ptr_node::cloner()(node);
+ push_back(*clone);
+ }
+ }
+ void clear_and_dispose() {
+ ptr_node::disposer dispose;
+ for (auto it = begin(), e = end(); it != e; /* nop */) {
+ auto& node = *it++;
+ dispose(&node);
+ }
+ _tail = &_root;
+ _root.next = _tail;
+ }
+ iterator erase_after_and_dispose(iterator it) {
+ auto* to_dispose = &*std::next(it);
+ auto ret = erase_after(it);
+ ptr_node::disposer()(to_dispose);
+ return ret;
+ }
+
+ void swap(buffers_t& other) {
+ const auto copy_root = _root;
+ _root.next = \
+ other._root.next == &other._root ? &this->_root : other._root.next;
+ other._root.next = \
+ copy_root.next == &_root ? &other._root : copy_root.next;
+
+ const auto copy_tail = _tail;
+ _tail = other._tail == &other._root ? &this->_root : other._tail;
+ other._tail = copy_tail == &_root ? &other._root : copy_tail;
+
+ _tail->next = &_root;
+ other._tail->next = &other._root;
+ }
+ };
+
+ class iterator;
+
+ private:
+ // my private bits
+ buffers_t _buffers;
+
+ // track the bufferptr we can modify (especially ::append() to). Not all
+ // bptrs a bufferlist holds have this trait -- if somebody calls
+ // ::push_back(const ptr&), they expect it won't change.
+ ptr_node* _carriage;
+ unsigned _len, _num;
+
+ template <bool is_const>
+ class CEPH_BUFFER_API iterator_impl {
+ protected:
+ typedef typename std::conditional<is_const,
+ const list,
+ list>::type bl_t;
+ typedef typename std::conditional<is_const,
+ const buffers_t,
+ buffers_t >::type list_t;
+ typedef typename std::conditional<is_const,
+ typename buffers_t::const_iterator,
+ typename buffers_t::iterator>::type list_iter_t;
+ bl_t* bl;
+ list_t* ls; // meh.. just here to avoid an extra pointer dereference..
+ list_iter_t p;
+ unsigned off; // in bl
+ unsigned p_off; // in *p
+ friend class iterator_impl<true>;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = typename std::conditional<is_const, const char, char>::type;
+ using difference_type = std::ptrdiff_t;
+ using pointer = typename std::add_pointer<value_type>::type;
+ using reference = typename std::add_lvalue_reference<value_type>::type;
+
+ // constructor. position.
+ iterator_impl()
+ : bl(0), ls(0), off(0), p_off(0) {}
+ iterator_impl(bl_t *l, unsigned o=0);
+ iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+ : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {}
+ iterator_impl(const list::iterator& i);
+
+ /// get current iterator offset in buffer::list
+ unsigned get_off() const { return off; }
+
+ /// get number of bytes remaining from iterator position to the end of the buffer::list
+ unsigned get_remaining() const { return bl->length() - off; }
+
+ /// true if iterator is at the end of the buffer::list
+ bool end() const {
+ return p == ls->end();
+ //return off == bl->length();
+ }
+ void seek(unsigned o);
+ char operator*() const;
+ iterator_impl& operator+=(unsigned o);
+ iterator_impl& operator++();
+ ptr get_current_ptr() const;
+ bool is_pointing_same_raw(const ptr& other) const;
+
+ bl_t& get_bl() const { return *bl; }
+
+ // copy data out.
+ // note that these all _append_ to dest!
+ void copy(unsigned len, char *dest);
+ // deprecated, use copy_deep()
+ void copy(unsigned len, ptr &dest) __attribute__((deprecated));
+ void copy_deep(unsigned len, ptr &dest);
+ void copy_shallow(unsigned len, ptr &dest);
+ void copy(unsigned len, list &dest);
+ void copy(unsigned len, std::string &dest);
+ void copy_all(list &dest);
+
+ // get a pointer to the current iterator position, return the
+ // number of bytes we can read from that position (up to want),
+ // and advance the iterator by that amount.
+ size_t get_ptr_and_advance(size_t want, const char **p);
+
+ /// calculate crc from iterator position
+ uint32_t crc32c(size_t length, uint32_t crc);
+
+ friend bool operator==(const iterator_impl& lhs,
+ const iterator_impl& rhs) {
+ return &lhs.get_bl() == &rhs.get_bl() && lhs.get_off() == rhs.get_off();
+ }
+ friend bool operator!=(const iterator_impl& lhs,
+ const iterator_impl& rhs) {
+ return &lhs.get_bl() != &rhs.get_bl() || lhs.get_off() != rhs.get_off();
+ }
+ };
+
+ public:
+ typedef iterator_impl<true> const_iterator;
+
+ class CEPH_BUFFER_API iterator : public iterator_impl<false> {
+ public:
+ iterator() = default;
+ iterator(bl_t *l, unsigned o=0);
+ iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po);
+ // copy data in
+ void copy_in(unsigned len, const char *src, bool crc_reset = true);
+ void copy_in(unsigned len, const list& otherl);
+ };
+
+ struct reserve_t {
+ char* bp_data;
+ unsigned* bp_len;
+ unsigned* bl_len;
+ };
+
+ class contiguous_appender {
+ ceph::bufferlist& bl;
+ ceph::bufferlist::reserve_t space;
+ char* pos;
+ bool deep;
+
+ /// running count of bytes appended that are not reflected by @pos
+ size_t out_of_band_offset = 0;
+
+ contiguous_appender(bufferlist& bl, size_t len, bool d)
+ : bl(bl),
+ space(bl.obtain_contiguous_space(len)),
+ pos(space.bp_data),
+ deep(d) {
+ }
+
+ void flush_and_continue() {
+ const size_t l = pos - space.bp_data;
+ *space.bp_len += l;
+ *space.bl_len += l;
+ space.bp_data = pos;
+ }
+
+ friend class list;
+ template<typename Type> friend class ::DencDumper;
+
+ public:
+ ~contiguous_appender() {
+ flush_and_continue();
+ }
+
+ size_t get_out_of_band_offset() const {
+ return out_of_band_offset;
+ }
+ void append(const char* __restrict__ p, size_t l) {
+ maybe_inline_memcpy(pos, p, l, 16);
+ pos += l;
+ }
+ char *get_pos_add(size_t len) {
+ char *r = pos;
+ pos += len;
+ return r;
+ }
+ char *get_pos() const {
+ return pos;
+ }
+
+ void append(const bufferptr& p) {
+ const auto plen = p.length();
+ if (!plen) {
+ return;
+ }
+ if (deep) {
+ append(p.c_str(), plen);
+ } else {
+ flush_and_continue();
+ bl.append(p);
+ space = bl.obtain_contiguous_space(0);
+ out_of_band_offset += plen;
+ }
+ }
+ void append(const bufferlist& l) {
+ if (deep) {
+ for (const auto &p : l._buffers) {
+ append(p.c_str(), p.length());
+ }
+ } else {
+ flush_and_continue();
+ bl.append(l);
+ space = bl.obtain_contiguous_space(0);
+ out_of_band_offset += l.length();
+ }
+ }
+
+ size_t get_logical_offset() const {
+ return out_of_band_offset + (pos - space.bp_data);
+ }
+ };
+
+ contiguous_appender get_contiguous_appender(size_t len, bool deep=false) {
+ return contiguous_appender(*this, len, deep);
+ }
+
+ class contiguous_filler {
+ friend buffer::list;
+ char* pos;
+
+ contiguous_filler(char* const pos) : pos(pos) {}
+
+ public:
+ void advance(const unsigned len) {
+ pos += len;
+ }
+ void copy_in(const unsigned len, const char* const src) {
+ memcpy(pos, src, len);
+ advance(len);
+ }
+ char* c_str() {
+ return pos;
+ }
+ };
+ // The contiguous_filler is supposed to be not costlier than a single
+ // pointer. Keep it dumb, please.
+ static_assert(sizeof(contiguous_filler) == sizeof(char*),
+ "contiguous_filler should be no costlier than pointer");
+
+ class page_aligned_appender {
+ bufferlist& bl;
+ unsigned min_alloc;
+
+ page_aligned_appender(list *l, unsigned min_pages)
+ : bl(*l),
+ min_alloc(min_pages * CEPH_PAGE_SIZE) {
+ }
+
+ void _refill(size_t len);
+
+ template <class Func>
+ void _append_common(size_t len, Func&& impl_f) {
+ const auto free_in_last = bl.get_append_buffer_unused_tail_length();
+ const auto first_round = std::min(len, free_in_last);
+ if (first_round) {
+ impl_f(first_round);
+ }
+ // no C++17 for the sake of the C++11 guarantees of librados, sorry.
+ const auto second_round = len - first_round;
+ if (second_round) {
+ _refill(second_round);
+ impl_f(second_round);
+ }
+ }
+
+ friend class list;
+
+ public:
+ void append(const bufferlist& l) {
+ bl.append(l);
+ bl.obtain_contiguous_space(0);
+ }
+
+ void append(const char* buf, size_t entire_len) {
+ _append_common(entire_len,
+ [buf, this] (const size_t chunk_len) mutable {
+ bl.append(buf, chunk_len);
+ buf += chunk_len;
+ });
+ }
+
+ void append_zero(size_t entire_len) {
+ _append_common(entire_len, [this] (const size_t chunk_len) {
+ bl.append_zero(chunk_len);
+ });
+ }
+
+ void substr_of(const list& bl, unsigned off, unsigned len) {
+ for (const auto& bptr : bl.buffers()) {
+ if (off >= bptr.length()) {
+ off -= bptr.length();
+ continue;
+ }
+ const auto round_size = std::min(bptr.length() - off, len);
+ append(bptr.c_str() + off, round_size);
+ len -= round_size;
+ off = 0;
+ }
+ }
+ };
+
+ page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) {
+ return page_aligned_appender(this, min_pages);
+ }
+
+ private:
+ // always_empty_bptr has no underlying raw but its _len is always 0.
+ // This is useful for e.g. get_append_buffer_unused_tail_length() as
+ // it allows to avoid conditionals on hot paths.
+ static ptr_node always_empty_bptr;
+ ptr_node& refill_append_space(const unsigned len);
+
+ // for page_aligned_appender; never ever expose this publicly!
+ // carriage / append_buffer is just an implementation's detail.
+ ptr& get_append_buffer() {
+ return *_carriage;
+ }
+
+ public:
+ // cons/des
+ list()
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _num(0) {
+ }
+ // cppcheck-suppress noExplicitConstructor
+ // cppcheck-suppress noExplicitConstructor
+ list(unsigned prealloc)
+ : _carriage(&always_empty_bptr),
+ _len(0),
+ _num(0) {
+ reserve(prealloc);
+ }
+
+ list(const list& other)
+ : _carriage(&always_empty_bptr),
+ _len(other._len),
+ _num(other._num) {
+ _buffers.clone_from(other._buffers);
+ }
+
+ list(list&& other) noexcept
+ : _buffers(std::move(other._buffers)),
+ _carriage(other._carriage),
+ _len(other._len),
+ _num(other._num) {
+ other.clear();
+ }
+
+ ~list() {
+ _buffers.clear_and_dispose();
+ }
+
+ list& operator= (const list& other) {
+ if (this != &other) {
+ _carriage = &always_empty_bptr;
+ _buffers.clone_from(other._buffers);
+ _len = other._len;
+ _num = other._num;
+ }
+ return *this;
+ }
+ list& operator= (list&& other) noexcept {
+ _buffers = std::move(other._buffers);
+ _carriage = other._carriage;
+ _len = other._len;
+ _num = other._num;
+ other.clear();
+ return *this;
+ }
+
+ uint64_t get_wasted_space() const;
+ unsigned get_num_buffers() const { return _num; }
+ const ptr_node& front() const { return _buffers.front(); }
+ const ptr_node& back() const { return _buffers.back(); }
+
+ int get_mempool() const;
+ void reassign_to_mempool(int pool);
+ void try_assign_to_mempool(int pool);
+
+ size_t get_append_buffer_unused_tail_length() const {
+ return _carriage->unused_tail_length();
+ }
+
+ const buffers_t& buffers() const { return _buffers; }
+ buffers_t& mut_buffers() { return _buffers; }
+ void swap(list& other) noexcept;
+ unsigned length() const {
+#if 0
+ // DEBUG: verify _len
+ unsigned len = 0;
+ for (std::list<ptr>::const_iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ len += (*it).length();
+ }
+#ifdef __CEPH__
+ ceph_assert(len == _len);
+#else
+ assert(len == _len);
+#endif // __CEPH__
+#endif
+ return _len;
+ }
+
+ bool contents_equal(const buffer::list& other) const;
+ bool contents_equal(const void* other, size_t length) const;
+
+ bool is_provided_buffer(const char *dst) const;
+ bool is_aligned(unsigned align) const;
+ bool is_page_aligned() const;
+ bool is_n_align_sized(unsigned align) const;
+ bool is_n_page_sized() const;
+ bool is_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory) const;
+
+ bool is_zero() const;
+
+ // modifiers
+ void clear() noexcept {
+ _carriage = &always_empty_bptr;
+ _buffers.clear_and_dispose();
+ _len = 0;
+ _num = 0;
+ }
+ void push_back(const ptr& bp) {
+ if (bp.length() == 0)
+ return;
+ _buffers.push_back(*ptr_node::create(bp).release());
+ _len += bp.length();
+ _num += 1;
+ }
+ void push_back(ptr&& bp) {
+ if (bp.length() == 0)
+ return;
+ _len += bp.length();
+ _num += 1;
+ _buffers.push_back(*ptr_node::create(std::move(bp)).release());
+ _carriage = &always_empty_bptr;
+ }
+ void push_back(const ptr_node&) = delete;
+ void push_back(ptr_node&) = delete;
+ void push_back(ptr_node&&) = delete;
+ void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) {
+ _carriage = bp.get();
+ _len += bp->length();
+ _num += 1;
+ _buffers.push_back(*bp.release());
+ }
+ void push_back(raw* const r) = delete;
+ void push_back(ceph::unique_leakable_ptr<raw> r) {
+ _buffers.push_back(*ptr_node::create(std::move(r)).release());
+ _carriage = &_buffers.back();
+ _len += _buffers.back().length();
+ _num += 1;
+ }
+
+ void zero();
+ void zero(unsigned o, unsigned l);
+
+ bool is_contiguous() const;
+ void rebuild();
+ void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb);
+ bool rebuild_aligned(unsigned align);
+ // max_buffers = 0 means we don't care about _buffers.size(); otherwise
+ // rebuilding must ensure _buffers.size() <= max_buffers.
+ bool rebuild_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory,
+ unsigned max_buffers = 0);
+ bool rebuild_page_aligned();
+
+ void reserve(size_t prealloc);
+
+ [[deprecated("in favor of operator=(list&&)")]] void claim(list& bl) {
+ *this = std::move(bl);
+ }
+ void claim_append(list& bl);
+ void claim_append(list&& bl) {
+ claim_append(bl);
+ }
+
+ // copy with explicit volatile-sharing semantics
+ void share(const list& bl)
+ {
+ if (this != &bl) {
+ clear();
+ for (const auto& bp : bl._buffers) {
+ _buffers.push_back(*ptr_node::create(bp).release());
+ }
+ _len = bl._len;
+ _num = bl._num;
+ }
+ }
+
+#ifdef HAVE_SEASTAR
+ /// convert the bufferlist into a network packet
+ operator seastar::net::packet() &&;
+#endif
+
+ iterator begin(size_t offset=0) {
+ return iterator(this, offset);
+ }
+ iterator end() {
+ return iterator(this, _len, _buffers.end(), 0);
+ }
+
+ const_iterator begin(size_t offset=0) const {
+ return const_iterator(this, offset);
+ }
+ const_iterator cbegin(size_t offset=0) const {
+ return begin(offset);
+ }
+ const_iterator end() const {
+ return const_iterator(this, _len, _buffers.end(), 0);
+ }
+
+ void append(char c);
+ void append(const char *data, unsigned len);
+ void append(std::string s) {
+ append(s.data(), s.length());
+ }
+#if __cplusplus >= 201703L
+ // To forcibly disambiguate between string and string_view in the
+ // case of arrays
+ template<std::size_t N>
+ void append(const char (&s)[N]) {
+ append(s, N);
+ }
+ void append(const char* s) {
+ append(s, strlen(s));
+ }
+ void append(std::string_view s) {
+ append(s.data(), s.length());
+ }
+#endif // __cplusplus >= 201703L
+ void append(const ptr& bp);
+ void append(ptr&& bp);
+ void append(const ptr& bp, unsigned off, unsigned len);
+ void append(const list& bl);
+ /// append each non-empty line from the stream and add '\n',
+ /// so a '\n' will be added even the stream does not end with EOL.
+ ///
+ /// For example, if the stream contains "ABC\n\nDEF", "ABC\nDEF\n" is
+ /// actually appended.
+ void append(std::istream& in);
+ contiguous_filler append_hole(unsigned len);
+ void append_zero(unsigned len);
+ void prepend_zero(unsigned len);
+
+ reserve_t obtain_contiguous_space(const unsigned len);
+
+ /*
+ * get a char
+ */
+ const char& operator[](unsigned n) const;
+ char *c_str();
+ std::string to_str() const;
+
+ void substr_of(const list& other, unsigned off, unsigned len);
+
+ // funky modifier
+ void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */);
+ void write(int off, int len, std::ostream& out) const;
+
+ void encode_base64(list& o);
+ void decode_base64(list& o);
+
+ void write_stream(std::ostream &out) const;
+ void hexdump(std::ostream &out, bool trailing_newline = true) const;
+ ssize_t pread_file(const char *fn, uint64_t off, uint64_t len, std::string *error);
+ int read_file(const char *fn, std::string *error);
+ ssize_t read_fd(int fd, size_t len);
+ ssize_t recv_fd(int fd, size_t len);
+ int write_file(const char *fn, int mode=0644);
+ int write_fd(int fd) const;
+ int write_fd(int fd, uint64_t offset) const;
+ int send_fd(int fd) const;
+ template<typename VectorT>
+ void prepare_iov(VectorT *piov) const {
+#ifdef __CEPH__
+ ceph_assert(_num <= IOV_MAX);
+#else
+ assert(_num <= IOV_MAX);
+#endif
+ piov->resize(_num);
+ unsigned n = 0;
+ for (auto& p : _buffers) {
+ (*piov)[n].iov_base = (void *)p.c_str();
+ (*piov)[n].iov_len = p.length();
+ ++n;
+ }
+ }
+
+ struct iovec_t {
+ uint64_t offset;
+ uint64_t length;
+ std::vector<iovec> iov;
+ };
+ using iov_vec_t = std::vector<iovec_t>;
+ iov_vec_t prepare_iovs() const;
+
+ uint32_t crc32c(uint32_t crc) const;
+ void invalidate_crc();
+
+ // These functions return a bufferlist with a pointer to a single
+ // static buffer. They /must/ not outlive the memory they
+ // reference.
+ static list static_from_mem(char* c, size_t l);
+ static list static_from_cstring(char* c);
+ static list static_from_string(std::string& s);
+ };
+
+} // inline namespace v15_2_0
+
+ /*
+ * efficient hash of one or more bufferlists
+ */
+
+ class hash {
+ uint32_t crc;
+
+ public:
+ hash() : crc(0) { }
+ // cppcheck-suppress noExplicitConstructor
+ hash(uint32_t init) : crc(init) { }
+
+ void update(const buffer::list& bl) {
+ crc = bl.crc32c(crc);
+ }
+
+ uint32_t digest() {
+ return crc;
+ }
+ };
+
+inline bool operator==(const bufferlist &lhs, const bufferlist &rhs) {
+ if (lhs.length() != rhs.length())
+ return false;
+ return std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+inline bool operator<(const bufferlist& lhs, const bufferlist& rhs) {
+ auto l = lhs.begin(), r = rhs.begin();
+ for (; l != lhs.end() && r != rhs.end(); ++l, ++r) {
+ if (*l < *r) return true;
+ if (*l > *r) return false;
+ }
+ return (l == lhs.end()) && (r != rhs.end()); // lhs.length() < rhs.length()
+}
+
+inline bool operator<=(const bufferlist& lhs, const bufferlist& rhs) {
+ auto l = lhs.begin(), r = rhs.begin();
+ for (; l != lhs.end() && r != rhs.end(); ++l, ++r) {
+ if (*l < *r) return true;
+ if (*l > *r) return false;
+ }
+ return l == lhs.end(); // lhs.length() <= rhs.length()
+}
+
+inline bool operator!=(const bufferlist &l, const bufferlist &r) {
+ return !(l == r);
+}
+inline bool operator>(const bufferlist& lhs, const bufferlist& rhs) {
+ return rhs < lhs;
+}
+inline bool operator>=(const bufferlist& lhs, const bufferlist& rhs) {
+ return rhs <= lhs;
+}
+
+std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+std::ostream& operator<<(std::ostream& out, const buffer::raw &r);
+
+std::ostream& operator<<(std::ostream& out, const buffer::list& bl);
+
+inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) {
+ l.update(r);
+ return l;
+}
+
+} // namespace buffer
+
+} // namespace ceph
+
+
+#endif
diff --git a/src/include/buffer_fwd.h b/src/include/buffer_fwd.h
new file mode 100644
index 000000000..6de7b1a1f
--- /dev/null
+++ b/src/include/buffer_fwd.h
@@ -0,0 +1,19 @@
#ifndef BUFFER_FWD_H
#define BUFFER_FWD_H

// Forward declarations of the ceph buffer types, so that headers which
// only mention the names (pointers/references in signatures) can avoid
// pulling in the full include/buffer.h.
namespace ceph {
  namespace buffer {
    inline namespace v15_2_0 {
      class ptr;
      class list;
    }
    class hash;
  }

  // Convenience aliases used throughout the tree.
  using bufferptr = buffer::ptr;
  using bufferlist = buffer::list;
  using bufferhash = buffer::hash;
}

#endif
+
diff --git a/src/include/buffer_raw.h b/src/include/buffer_raw.h
new file mode 100644
index 000000000..2298525c9
--- /dev/null
+++ b/src/include/buffer_raw.h
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BUFFER_RAW_H
+#define CEPH_BUFFER_RAW_H
+
+#include <map>
+#include <utility>
+#include <type_traits>
+#include "common/ceph_atomic.h"
+#include "include/buffer.h"
+#include "include/mempool.h"
+#include "include/spinlock.h"
+
+namespace ceph::buffer {
+inline namespace v15_2_0 {
+
  // Reference-counted backing store behind buffer::ptr.  A raw describes
  // one contiguous char array; ptr instances reference sub-ranges of it
  // and share it via the embedded atomic refcount.  Every raw's bytes are
  // accounted against a mempool, and the accounting is kept in sync by
  // the constructors/destructor and the mutators below.
  class raw {
  public:
    // In the future we might want to have a slab allocator here with few
    // embedded slots. This would allow to avoid the "if" in dtor of ptr_node.
    std::aligned_storage<sizeof(ptr_node),
			 alignof(ptr_node)>::type bptr_storage;
  protected:
    char *data;     // backing memory; nullptr until a subclass supplies it
    unsigned len;   // size of the backing memory, in bytes
  public:
    ceph::atomic<unsigned> nref { 0 };  // refcount shared by all ptrs to this raw
    int mempool;                        // pool index this raw's bytes are charged to

    // One-entry crc cache, guarded by crc_spinlock.  last_crc_offset is
    // the cached range key (presumably a (from, to) offset pair -- TODO
    // confirm against buffer::list::crc32c); max()/max() means "empty".
    std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()};
    std::pair<uint32_t, uint32_t> last_crc_val;

    mutable ceph::spinlock crc_spinlock;

    // Construct with no data pointer yet (derived class provides the
    // storage); charges (1 item, l bytes) to the chosen mempool.
    explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
      : data(nullptr), len(l), nref(0), mempool(mempool) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }
    // Construct around caller-provided storage c of length l.
    raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
      : data(c), len(l), nref(0), mempool(mempool) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }
    virtual ~raw() {
      // Undo the accounting performed in the constructors.
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
    }

    // Change the recorded length, keeping the mempool byte accounting in
    // sync (remove the old size, add the new one).
    void _set_len(unsigned l) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
      len = l;
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }

    // Move this raw's accounting from its current mempool to `pool`.
    void reassign_to_mempool(int pool) {
      if (pool == mempool) {
	return;
      }
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
      mempool = pool;
      mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len);
    }

    // Like reassign_to_mempool(), but only if the buffer is still in the
    // anonymous pool (i.e. no subsystem has claimed it yet).
    void try_assign_to_mempool(int pool) {
      if (mempool == mempool::mempool_buffer_anon) {
	reassign_to_mempool(pool);
      }
    }

private:
    // no copying.
    // cppcheck-suppress noExplicitConstructor
    raw(const raw &other) = delete;
    const raw& operator=(const raw &other) = delete;
public:
    char *get_data() const {
      return data;
    }
    unsigned get_len() const {
      return len;
    }
    // Fetch the cached crc pair for the range `fromto` if it is the one
    // cached last; returns false (leaving *crc untouched) otherwise.
    bool get_crc(const std::pair<size_t, size_t> &fromto,
		 std::pair<uint32_t, uint32_t> *crc) const {
      std::lock_guard lg(crc_spinlock);
      if (last_crc_offset == fromto) {
	*crc = last_crc_val;
	return true;
      }
      return false;
    }
    // Record a freshly computed crc pair for range `fromto`, replacing
    // whatever was cached before.
    void set_crc(const std::pair<size_t, size_t> &fromto,
		 const std::pair<uint32_t, uint32_t> &crc) {
      std::lock_guard lg(crc_spinlock);
      last_crc_offset = fromto;
      last_crc_val = crc;
    }
    // Drop the cached crc; callers invoke this whenever the underlying
    // bytes change.
    void invalidate_crc() {
      std::lock_guard lg(crc_spinlock);
      last_crc_offset.first = std::numeric_limits<size_t>::max();
      last_crc_offset.second = std::numeric_limits<size_t>::max();
    }
  };
+
+} // inline namespace v15_2_0
+} // namespace ceph::buffer
+
+#endif // CEPH_BUFFER_RAW_H
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
new file mode 100644
index 000000000..eb6d5e102
--- /dev/null
+++ b/src/include/byteorder.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <boost/endian/conversion.hpp>
+
+#include "int_types.h"
+
// Return val with its byte order reversed (swap between big- and
// little-endian representation); thin wrapper over Boost.Endian.
template<typename T>
inline T swab(T val) {
  return boost::endian::endian_reverse(val);
}
+
// Fixed little-endian integer wrapper for on-wire/on-disk structures.
// The value is always stored in little-endian byte order regardless of
// host endianness; conversion happens on assignment and on read via
// operator T().  Packed so it can be embedded in wire-format structs.
template<typename T>
struct ceph_le {
private:
  T v;  // raw little-endian representation
public:
  ceph_le() = default;
  explicit ceph_le(T nv)
    : v{boost::endian::native_to_little(nv)}
  {}
  ceph_le<T>& operator=(T nv) {
    v = boost::endian::native_to_little(nv);
    return *this;
  }
  operator T() const { return boost::endian::little_to_native(v); }
  // Comparing the raw LE representations directly is correct, since both
  // operands use the same storage order.
  friend inline bool operator==(ceph_le a, ceph_le b) {
    return a.v == b.v;
  }
} __attribute__ ((packed));
+
+using ceph_le64 = ceph_le<__u64>;
+using ceph_le32 = ceph_le<__u32>;
+using ceph_le16 = ceph_le<__u16>;
+
+using ceph_les64 = ceph_le<__s64>;
+using ceph_les32 = ceph_le<__s32>;
+using ceph_les16 = ceph_le<__s16>;
+
+inline ceph_les64 init_les64(__s64 x) {
+ ceph_les64 v;
+ v = x;
+ return v;
+}
+inline ceph_les32 init_les32(__s32 x) {
+ ceph_les32 v;
+ v = x;
+ return v;
+}
+inline ceph_les16 init_les16(__s16 x) {
+ ceph_les16 v;
+ v = x;
+ return v;
+}
diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h
new file mode 100644
index 000000000..0627894ea
--- /dev/null
+++ b/src/include/ceph_assert.h
@@ -0,0 +1,147 @@
+#ifndef CEPH_ASSERT_H
+#define CEPH_ASSERT_H
+
+#include <cstdlib>
+#include <string>
+
+#ifndef __STRING
+# define __STRING(x) #x
+#endif
+
+#if defined(__linux__)
+#include <features.h>
+
+#elif defined(__FreeBSD__)
+#include <sys/cdefs.h>
+#define __GNUC_PREREQ(minor, major) __GNUC_PREREQ__(minor, major)
+#elif defined(__sun) || defined(_AIX)
+#include "include/compat.h"
+#include <assert.h>
+#endif
+
+#ifdef __CEPH__
+# include "acconfig.h"
+#endif
+
+#include "include/common_fwd.h"
+
+namespace ceph {
+
+struct BackTrace;
+
+/*
+ * Select a function-name variable based on compiler tests, and any compiler
+ * specific overrides.
+ */
+#if defined(HAVE_PRETTY_FUNC)
+# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(HAVE_FUNC)
+# define __CEPH_ASSERT_FUNCTION __func__
+#else
+# define __CEPH_ASSERT_FUNCTION ((__const char *) 0)
+#endif
+
+extern void register_assert_context(CephContext *cct);
+
// Static description of one assertion site (expression text plus source
// location); built once per ceph_assert() expansion and handed to
// __ceph_assert_fail() when the assertion fires.
struct assert_data {
  const char *assertion;  // stringified asserted expression
  const char *file;
  const int line;
  const char *function;
};
+
+extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function)
+ __attribute__ ((__noreturn__));
+extern void __ceph_assert_fail(const assert_data &ctx)
+ __attribute__ ((__noreturn__));
+
+extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...)
+ __attribute__ ((__noreturn__));
+extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function);
+
+[[noreturn]] void __ceph_abort(const char *file, int line, const char *func,
+ const std::string& msg);
+
+[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func,
+ const char* msg, ...);
+
+#define _CEPH_ASSERT_VOID_CAST static_cast<void>
+
+#define assert_warn(expr) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION))
+
+}
+
+using namespace ceph;
+
+
+/*
+ * ceph_abort aborts the program with a nice backtrace.
+ *
+ * Currently, it's the same as assert(0), but we may one day make assert a
+ * debug-only thing, like it is in many projects.
+ */
+#define ceph_abort(msg, ...) \
+ ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called")
+
+#define ceph_abort_msg(msg) \
+ ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg)
+
+#define ceph_abort_msgf(...) \
+ ::ceph::__ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)
+
#ifdef __SANITIZE_ADDRESS__
// ASan build: avoid the function-local static assert_data and pass the
// site information as separate arguments instead.  NOTE(review):
// presumably this sidesteps sanitizer instrumentation of the static --
// confirm against the commit that introduced the split.
#define ceph_assert(expr)                           \
  do {                                              \
    ((expr))                                        \
    ? _CEPH_ASSERT_VOID_CAST (0)                    \
    : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
  } while (false)
#else
// Regular build: one static assert_data per call site, so the failure
// path passes a single pointer-sized argument.
#define ceph_assert(expr)					\
  do { static const ceph::assert_data assert_data_ctx =	\
   {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
   ((expr) \
   ? _CEPH_ASSERT_VOID_CAST (0) \
   : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false)
#endif
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assert currently doesn't either, but in the future it might.)
+#ifdef __SANITIZE_ADDRESS__
+#define ceph_assert_always(expr) \
+ do { \
+ ((expr)) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
+ } while(false)
+#else
+#define ceph_assert_always(expr) \
+ do { static const ceph::assert_data assert_data_ctx = \
+ {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false)
+#endif
+
+// Named by analogy with printf. Along with an expression, takes a format
+// string and parameters which are printed if the assertion fails.
+#define assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+#define ceph_assertf(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+// this variant will *never* get compiled out to NDEBUG in the future.
+// (ceph_assertf currently doesn't either, but in the future it might.)
+#define ceph_assertf_always(expr, ...) \
+ ((expr) \
+ ? _CEPH_ASSERT_VOID_CAST (0) \
+ : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__))
+
+#endif
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
new file mode 100644
index 000000000..794e10efd
--- /dev/null
+++ b/src/include/ceph_features.h
@@ -0,0 +1,280 @@
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+#include "sys/types.h"
+
+/*
+ * Each time we reclaim bits for reuse we need to specify another
+ * bitmask that, if all bits are set, indicates we have the new
+ * incarnation of that feature. Base case is 1 (first use)
+ */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored but still advertised by release *when*
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+// this bit is ignored by release *unused* and not advertised by
+// release *unadvertised*
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+// test for a feature. this test is safer than a typical mask against
+// the bit because it ensures that we have the bit AND the marker for the
+// bit's incarnation. this must be used in any case where the features
+// bits may include an old meaning of the bit.
+#define HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * mimic) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MON_METADATA, MIMIC)
+ *
+ * because 13.2.z (mimic) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the bit and call it
+ * RETIRED. This can normally be done 2 major releases
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 15.0.z (octopus) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MON_METADATA, MIMIC, OCTOPUS)
+ *
+ * - The bit can be reused in the next release that will never talk to
+ * a pre-octopus daemon (13 mimic or 14 nautilus) that advertises the
+ * bit: in this case, the 16.y.z (P-release).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+/*
+ * Notes on the kernel client:
+ *
+ * - "X" means that the feature bit has been advertised and supported
+ * since kernel X
+ *
+ * - "X req" means that the feature bit has been advertised and required
+ * since kernel X
+ *
+ * The remaining feature bits are not and have never been used by the
+ * kernel client.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) // 2.6.35 req
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK) // 2.6.36
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) // 4.6 req
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) // 3.10 req
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) // 2.6.38
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64) // 3.9 req
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) // 3.9 req
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC) // 3.9 req
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(16, 3, SERVER_OCTOPUS)
+DEFINE_CEPH_FEATURE(16, 3, OSD_REPOP_MLCOD)
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS)
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) // 3.6
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(20, 3, SERVER_PACIFIC)
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) // 4.13
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST)
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) // 3.19 req (unless nocephx_require_signatures)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2)
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) // 3.9
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC) // 4.7
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) // 3.9
+DEFINE_CEPH_FEATURE_RETIRED(31, 1, MON_SINGLE_PAXOS, NAUTILUS, PACIFIC)
+DEFINE_CEPH_FEATURE(31, 3, SERVER_REEF)
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(32, 3, STRETCH_MODE)
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(33, 3, SERVER_QUINCY)
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(34, 3, RANGE_BLOCKLIST)
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14
+DEFINE_CEPH_FEATURE_RETIRED(38, 1, OSD_ERASURE_CODES, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) // 3.15
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) // 4.3 (for consistency)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) // 4.13
+DEFINE_CEPH_FEATURE_RETIRED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+// available
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) // 4.17
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) // 4.1
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+// available
+DEFINE_CEPH_FEATURE_RETIRED(50, 1, MON_METADATA, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(54, 1, OSD_HITSET_GMT, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE_RETIRED(55, 1, HAMMER_0_94_4, MIMIC, OCTOPUS)
+// available
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13
+DEFINE_CEPH_FEATURE_RETIRED(57, 1, MON_ROUTE_OSDMAP, MIMIC, OCTOPUS) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) // 4.5
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // 4.19, *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel
+DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client-facing
+// available
+
+
+/*
+ * Features supported. Should be everything above.
+ */
+#define CEPH_FEATURES_ALL \
+ (CEPH_FEATURE_UID | \
+ CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_FLOCK | \
+ CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_DIRLAYOUTHASH | \
+ CEPH_FEATURE_OBJECTLOCATOR | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_INCSUBOSDMAP | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDREPLYMUX | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_MSG_AUTH | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CREATEPOOLID | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_MDS_INLINE_DATA | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
+ CEPH_FEATURE_OSD_FADVISE_FLAGS | \
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_FS_FILE_LAYOUT_V2 | \
+ CEPH_FEATURE_SERVER_KRAKEN | \
+ CEPH_FEATURE_FS_BTIME | \
+ CEPH_FEATURE_FS_CHANGE_ATTR | \
+ CEPH_FEATURE_MSG_ADDR2 | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSD_RECOVERY_DELETES | \
+ CEPH_FEATURE_SERVER_MIMIC | \
+ CEPH_FEATURE_RECOVERY_RESERVATION_2 | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
+ CEPH_FEATURE_CEPHX_V2 | \
+ CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
+ CEPH_FEATUREMASK_SERVER_OCTOPUS | \
+ CEPH_FEATUREMASK_STRETCH_MODE | \
+ CEPH_FEATUREMASK_OSD_REPOP_MLCOD | \
+ CEPH_FEATUREMASK_SERVER_PACIFIC | \
+ CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \
+ CEPH_FEATUREMASK_SERVER_QUINCY | \
+ CEPH_FEATURE_RANGE_BLOCKLIST | \
+ CEPH_FEATUREMASK_SERVER_REEF | \
+ 0ULL)
+
+#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
+
+/*
+ * crush related features
+ */
+#define CEPH_FEATURES_CRUSH \
+ (CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS)
+
+/*
+ * make sure we don't try to use the reserved features
+ */
// Compile-time assertion: a negative array size makes the build fail
// when x evaluates to zero (false).  NOTE(review): presumably kept in
// this form (rather than static_assert) so the header stays usable from
// C translation units -- confirm.
#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 0]))

// Never called; exists only so the CEPH_STATIC_ASSERT above is
// evaluated at compile time in every translation unit.
static inline void ____build_time_check_for_reserved_bits(void) {
  CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL & CEPH_FEATURE_RESERVED) == 0);
}
+
+#endif
diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h
new file mode 100644
index 000000000..5babb8e95
--- /dev/null
+++ b/src/include/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
/* build a frag from a bit count and a value; masks v down to its top
 * "b" meaningful bits */
static inline __u32 ceph_frag_make(__u32 b, __u32 v)
{
	return (b << 24) |
		(v & (0xffffffu << (24-b)) & 0xffffffu);
}
static inline __u32 ceph_frag_bits(__u32 f)
{
	return f >> 24;
}
static inline __u32 ceph_frag_value(__u32 f)
{
	return f & 0xffffffu;
}
/* mask covering the significant (top "bits") bits of the value */
static inline __u32 ceph_frag_mask(__u32 f)
{
	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
}
static inline __u32 ceph_frag_mask_shift(__u32 f)
{
	return 24 - ceph_frag_bits(f);
}

static inline int ceph_frag_contains_value(__u32 f, __u32 v)
{
	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
}
static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
{
	/* is sub as specific as us, and contained by us? */
	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
}

/* parent frag; only meaningful when ceph_frag_bits(f) > 0 */
static inline __u32 ceph_frag_parent(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f) - 1,
			      ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
}
static inline int ceph_frag_is_left_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
}
static inline int ceph_frag_is_right_child(__u32 f)
{
	/*
	 * Test the frag's own (least significant meaningful) bit.  The
	 * isolated bit sits at position 24 - bits, so it must be compared
	 * against zero: the previous "== 1" comparison only matched when
	 * bits == 24 and wrongly returned false for every other right
	 * child.
	 */
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
}
static inline __u32 ceph_frag_sibling(__u32 f)
{
	/* flip the frag's own bit */
	return ceph_frag_make(ceph_frag_bits(f),
		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
}
static inline __u32 ceph_frag_left_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
}
static inline __u32 ceph_frag_right_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1,
	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
}
/* descendant of f obtained by descending "by" levels, taking branch
 * pattern i at each step */
static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
{
	int newbits = ceph_frag_bits(f) + by;
	return ceph_frag_make(newbits,
			 ceph_frag_value(f) | (i << (24 - newbits)));
}
static inline int ceph_frag_is_leftmost(__u32 f)
{
	return ceph_frag_value(f) == 0;
}
static inline int ceph_frag_is_rightmost(__u32 f)
{
	return ceph_frag_value(f) == ceph_frag_mask(f);
}
/* next sibling at the same depth; note the result overflows into the
 * bits field when called on the rightmost frag, so callers check
 * ceph_frag_is_rightmost() first */
static inline __u32 ceph_frag_next(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f),
			ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
new file mode 100644
index 000000000..28440c820
--- /dev/null
+++ b/src/include/ceph_fs.h
@@ -0,0 +1,1137 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL-2.1 or LGPL-3.0
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+#include "include/encoding.h"
+#include "include/denc.h"
+
+/*
+ * The data structures defined here are shared between Linux kernel and
+ * user space. Also, those data structures are maintained always in
+ * little-endian byte order, even on big-endian systems. This is handled
+ * differently in kernel vs. user space. For use as kernel headers, the
+ * little-endian fields need to use the __le16/__le32/__le64 types. These
+ * are markers that indicate endian conversion routines must be used
+ * whenever such fields are accessed, which can be verified by checker
+ * tools like "sparse". For use as user-space headers, the little-endian
+ * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++
+ * classes that implement automatic endian conversion on every access.
+ * To still allow for header sharing, this file uses the __le types, but
+ * redefines those to the ceph_ types when compiled in user space.
+ */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+/*
+ * hidden .ceph dir, which is no longer created but
+ * recognised in existing filesystems so that we
+ * don't try to fragment it.
+ */
+#define CEPH_INO_CEPH 2
+#define CEPH_INO_GLOBAL_SNAPREALM 3
+#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush rule, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+struct ceph_dir_layout {
+ __u8 dl_dir_hash; /* see ceph_hash.h for ids */
+ __u8 dl_unused1;
+ __u16 dl_unused2;
+ __u32 dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
+extern const char *ceph_con_mode_name(int con_mode);
+
+/* For options with "_", like: GSS_GSS
+ which means: Mode/Protocol to validate "authentication_authorization",
+ where:
+ - Authentication: Verifying the identity of an entity.
+ - Authorization: Verifying that an authenticated entity has
+ the right to access a particular resource.
+*/
+#define CEPH_AUTH_GSS 0x4
+#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_MON_GET_OSDMAP 6
+#define CEPH_MSG_MON_METADATA 7
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_RECLAIM 27
+#define CEPH_MSG_CLIENT_RECLAIM_REPLY 28
+#define CEPH_MSG_CLIENT_METRICS 29
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+#define CEPH_MSG_CLIENT_QUOTA 0x314
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
+
+/* FSMap subscribers (see all MDS clusters at once) */
+#define CEPH_MSG_FS_MAP 45
+/* FSMapUser subscribers (get MDS clusters name->ID mapping) */
+#define CEPH_MSG_FS_MAP_USER 103
+
+/* watch-notify operations */
+enum {
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
+};
+
+const char *ceph_watch_event_name(int o);
+
+/* pool operations */
+enum {
+ POOL_OP_CREATE = 0x01,
+ POOL_OP_DELETE = 0x02,
+ POOL_OP_AUID_CHANGE = 0x03,
+ POOL_OP_CREATE_SNAP = 0x11,
+ POOL_OP_DELETE_SNAP = 0x12,
+ POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
+ POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
+};
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 pool;
+ __le32 op;
+ __le64 __old_auid; // obsolete
+ __le64 snapid;
+ __le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 reply_code;
+ __le32 epoch;
+ char has_data;
+ char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+ __le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+ __le64 start;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_NOT_JOINABLE (1<<0) /* standbys cannot join */
+#define CEPH_MDSMAP_DOWN (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
+/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */
+/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */
+#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS (1<<4) /* cluster allowed to enable MULTIMDS
+ and SNAPS at the same time */
+#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY (1<<5) /* cluster allowed to enable standby-replay */
+#define CEPH_MDSMAP_REFUSE_CLIENT_SESSION (1<<6) /* cluster allowed to refuse client session
+ request */
+#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \
+ CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS)
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* Legacy, unused */
+#define CEPH_MDS_STATE_NULL -10
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+#define CEPH_MDS_STATE_DAMAGED 15 /* rank not replayable, need repair */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN (1 << 0)
+#define CEPH_LOCK_DVERSION (1 << 1)
+#define CEPH_LOCK_ISNAP (1 << 4) /* snapshot lock. MDS internal */
+#define CEPH_LOCK_IPOLICY (1 << 5) /* policy lock on dirs. MDS internal */
+#define CEPH_LOCK_IFILE (1 << 6)
+#define CEPH_LOCK_INEST (1 << 7) /* mds internal */
+#define CEPH_LOCK_IDFT (1 << 8) /* dir frag tree */
+#define CEPH_LOCK_IAUTH (1 << 9)
+#define CEPH_LOCK_ILINK (1 << 10)
+#define CEPH_LOCK_IXATTR (1 << 11)
+#define CEPH_LOCK_IFLOCK (1 << 12) /* advisory file locks */
+#define CEPH_LOCK_IVERSION (1 << 13) /* mds internal */
+
+#define CEPH_LOCK_IFIRST CEPH_LOCK_ISNAP
+
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+ CEPH_SESSION_FLUSHMSG,
+ CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
+ // A response to REQUEST_OPEN indicating that the client should
+ // permanently desist from contacting the MDS
+ CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG
+};
+
+// flags for state reclaim
+#define CEPH_RECLAIM_RESET 1
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ * & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+ CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+ CEPH_MDS_OP_GETVXATTR = 0x00106,
+ CEPH_MDS_OP_DUMMY = 0x00107,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+ CEPH_MDS_OP_SETFILELOCK= 0x01109,
+ CEPH_MDS_OP_GETFILELOCK= 0x00110,
+ CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
+ CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404,
+
+ // internal op
+ CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
+ CEPH_MDS_OP_EXPORTDIR = 0x01501,
+ CEPH_MDS_OP_FLUSH = 0x01502,
+ CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503,
+ CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
+ CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
+ CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507
+};
+
+#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \
+ op == CEPH_MDS_OP_MKNOD || \
+ op == CEPH_MDS_OP_MKDIR || \
+ op == CEPH_MDS_OP_SYMLINK)
+
+extern const char *ceph_mds_op_name(int op);
+
+// setattr mask is an int
+#ifndef CEPH_SETATTR_MODE
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12)
+#define CEPH_SETATTR_KILL_SUID (1 << 13)
+#define CEPH_SETATTR_KILL_SGID (1 << 14)
+#endif
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_LAZY 00020000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
+
+int ceph_flags_sys2wire(int flags);
+
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31)
+
+/*
+ * readdir/readdir_snapdiff request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir/readdir_snapdiff reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+#define CEPH_READDIR_OFFSET_HASH (1<<10)
+
+/* Note that this is embedded within ceph_mds_request_head_legacy. */
+union ceph_mds_request_args_legacy {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is async */
+
+struct ceph_mds_request_head_legacy {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_legacy args;
+} __attribute__ ((packed));
+
+/*
+ * Note that this is embedded within ceph_mds_request_head. Also, compatibility
+ * with the ceph_mds_request_args_legacy must be maintained!
+ */
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool; /* if >= 0 and CREATEPOOLID feature */
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size; /* if O_TRUNC */
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* use for set file/dir layout */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 snapid;
+ __le64 parent;
+ __le32 hash;
+ } __attribute__ ((packed)) lookupino;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ __le64 snap_other;
+ } __attribute__ ((packed)) snapdiff;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_REQUEST_HEAD_VERSION 3
+
+/*
+ * Note that any change to this structure must ensure that it is compatible
+ * with ceph_mds_request_head_legacy.
+ */
+struct ceph_mds_request_head {
+ __le16 version;
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+
+ __le32 ext_num_retry; /* new count retry attempts */
+ __le32 ext_num_fwd; /* new count fwd attempts */
+
+ __le32 struct_len; /* to store size of struct ceph_mds_request_head */
+ __le32 owner_uid, owner_gid; /* used for OPs which create inodes */
+} __attribute__ ((packed));
+
+/*
+ * Encode a ceph_mds_request_head onto the wire.
+ *
+ * The field order must stay bit-compatible with
+ * ceph_mds_request_head_legacy for old MDS daemons: the legacy 8-bit
+ * retry/fwd counters are emitted first, then (version >= 2) the full
+ * 32-bit ext_* counters, then (version >= 3) struct_len and the
+ * owner uid/gid.
+ */
+void inline encode(const struct ceph_mds_request_head& h, ceph::buffer::list& bl) {
+  using ceph::encode;
+  encode(h.version, bl);
+  encode(h.oldest_client_tid, bl);
+  encode(h.mdsmap_epoch, bl);
+  encode(h.flags, bl);
+
+  // For old MDS daemons: the legacy counters are only 8 bits wide, so
+  // the 32-bit values are deliberately truncated here; daemons that
+  // understand version >= 2 read the full ext_* values encoded below.
+  __u8 num_retry = __u32(h.ext_num_retry);
+  __u8 num_fwd = __u32(h.ext_num_fwd);
+  encode(num_retry, bl);
+  encode(num_fwd, bl);
+
+  encode(h.num_releases, bl);
+  encode(h.op, bl);
+  encode(h.caller_uid, bl);
+  encode(h.caller_gid, bl);
+  encode(h.ino, bl);
+  // args is a packed union of wire structs; append it raw.
+  bl.append((char*)&h.args, sizeof(h.args));
+
+  if (h.version >= 2) {
+    encode(h.ext_num_retry, bl);
+    encode(h.ext_num_fwd, bl);
+  }
+
+  if (h.version >= 3) {
+    // struct_len lets a decoder skip trailing fields it does not know.
+    __u32 struct_len = sizeof(struct ceph_mds_request_head);
+    encode(struct_len, bl);
+    encode(h.owner_uid, bl);
+    encode(h.owner_gid, bl);
+
+    /*
+     * Please, add new fields handling here.
+     * You don't need to check h.version as we do it
+     * in decode(), because decode can properly skip
+     * all unsupported fields if h.version >= 3.
+     */
+  }
+}
+
+/*
+ * Decode a ceph_mds_request_head from the wire.
+ *
+ * Handles three header generations:
+ *  - version < 2: only the legacy 8-bit retry/fwd counters exist; they
+ *    are promoted into ext_num_retry/ext_num_fwd.
+ *  - version >= 2: the full 32-bit ext_* counters follow the legacy
+ *    fields.
+ *  - version >= 3: struct_len is present, allowing trailing fields sent
+ *    by a newer client to be skipped; owner uid/gid are present.
+ *
+ * Throws ceph::buffer::malformed_input if the encoded struct claims to
+ * end before the fields already consumed.
+ */
+void inline decode(struct ceph_mds_request_head& h, ceph::buffer::list::const_iterator& bl) {
+  using ceph::decode;
+  // Offset where this struct starts; finalized as an end offset once
+  // struct_len is known (version >= 3).
+  unsigned struct_end = bl.get_off();
+
+  decode(h.version, bl);
+  decode(h.oldest_client_tid, bl);
+  decode(h.mdsmap_epoch, bl);
+  decode(h.flags, bl);
+  decode(h.num_retry, bl);
+  decode(h.num_fwd, bl);
+  decode(h.num_releases, bl);
+  decode(h.op, bl);
+  decode(h.caller_uid, bl);
+  decode(h.caller_gid, bl);
+  decode(h.ino, bl);
+  // args is a packed union of wire structs; copy it raw.
+  bl.copy(sizeof(h.args), (char*)&(h.args));
+
+  if (h.version >= 2) {
+    decode(h.ext_num_retry, bl);
+    decode(h.ext_num_fwd, bl);
+  } else {
+    // Old client: promote the legacy 8-bit counters.
+    h.ext_num_retry = h.num_retry;
+    h.ext_num_fwd = h.num_fwd;
+  }
+
+  if (h.version >= 3) {
+    decode(h.struct_len, bl);
+    struct_end += h.struct_len;
+
+    decode(h.owner_uid, bl);
+    decode(h.owner_gid, bl);
+  } else {
+    /*
+     * client is old: let's take caller_{u,g}id as owner_{u,g}id
+     * this is how it worked before adding of owner_{u,g}id fields.
+     */
+    h.owner_uid = h.caller_uid;
+    h.owner_gid = h.caller_gid;
+  }
+
+  /* add new fields handling here */
+
+  /*
+   * From version 3 we have struct_len field.
+   * It allows us to properly handle a case
+   * when client send struct ceph_mds_request_head
+   * bigger in size than MDS supports. In this
+   * case we just want to skip all remaining bytes
+   * at the end.
+   *
+   * See also DECODE_FINISH macro. Unfortunately,
+   * we can't start using it right now as it will be
+   * an incompatible protocol change.
+   */
+  if (h.version >= 3) {
+    if (bl.get_off() > struct_end)
+      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__));
+    if (bl.get_off() < struct_end)
+      bl += struct_end - bl.get_off();
+  }
+}
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/*
+ * Copy a legacy request head into a new-style ceph_mds_request_head.
+ *
+ * ceph_mds_request_head is laid out so that everything from
+ * oldest_client_tid through args is byte-identical to
+ * ceph_mds_request_head_legacy, so we can overlay the legacy struct at
+ * &head->oldest_client_tid and copy it in a single assignment.
+ */
+static inline void
+copy_from_legacy_head(struct ceph_mds_request_head *head,
+		      struct ceph_mds_request_head_legacy *legacy)
+{
+	struct ceph_mds_request_head_legacy *embedded_legacy =
+		(struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
+	*embedded_legacy = *legacy;
+}
+
+/*
+ * Extract the legacy-compatible portion of a new-style
+ * ceph_mds_request_head into a ceph_mds_request_head_legacy.
+ *
+ * Inverse of copy_from_legacy_head(): relies on the same overlay layout
+ * starting at &head->oldest_client_tid.
+ */
+static inline void
+copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy,
+		    struct ceph_mds_request_head *head)
+{
+	struct ceph_mds_request_head_legacy *embedded_legacy =
+		(struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
+	*legacy = *embedded_legacy;
+}
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* ask client to release the cap */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
+#define CEPH_LOCK_SHARED 1
+#define CEPH_LOCK_EXCL 2
+#define CEPH_LOCK_UNLOCK 4
+
+struct ceph_filelock {
+ __le64 start;/* file offset to start lock at */
+ __le64 length; /* num bytes to lock; 0 for all following start */
+ __le64 client; /* which client holds the lock */
+ __le64 owner; /* who requests/holds the lock */
+ __le64 pid; /* process id holding the lock on the client */
+ __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+/* inline data state */
+#define CEPH_INLINE_NONE ((__u64)-1)
+#define CEPH_INLINE_MAX_SIZE CEPH_MIN_STRIPE_UNIT
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+/* note: these definitions are duplicated in mds/locks.c */
+#define CEPH_CAP_GSHARED 1 /* client can reads */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+ CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/* extra info for cap import/export */
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps_head {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_non_export_body {
+ /* all except export */
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_caps_export_body {
+ /* export message */
+ struct ceph_mds_cap_peer peer;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+ __le32 flock_len; /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+	__le32 num_split_realms; /* # child realms under new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h
new file mode 100644
index 000000000..cfa8097bb
--- /dev/null
+++ b/src/include/ceph_fuse.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_FUSE_H
+#define CEPH_FUSE_H
+
+/*
+ * The API version that we want to use, regardless of what the
+ * library version is. Note that this must be defined before
+ * fuse.h is included.
+ */
+#ifndef FUSE_USE_VERSION
+#define FUSE_USE_VERSION 312
+#endif
+
+#include <fuse.h>
+#include "acconfig.h"
+
+/*
+ * Redefine the FUSE_VERSION macro defined in "fuse_common.h"
+ * header file, because the MINOR number was not updated from
+ * libfuse 3.2 through 3.8. We need to fetch the MINOR
+ * number from the pkgconfig file.
+ */
+#ifdef FUSE_VERSION
+#undef FUSE_VERSION
+#define FUSE_VERSION FUSE_MAKE_VERSION(CEPH_FUSE_MAJOR_VERSION, CEPH_FUSE_MINOR_VERSION)
+#endif
+
+/*
+ * Invoke a readdir filler callback in a way that compiles against both
+ * libfuse 2 and libfuse 3: since FUSE 3.0 fuse_fill_dir_t takes an
+ * extra fuse_fill_dir_flags argument, which we pass as 0 (no flags).
+ */
+static inline int filler_compat(fuse_fill_dir_t filler,
+                                void *buf, const char *name,
+                                const struct stat *stbuf,
+                                off_t off)
+{
+  return filler(buf, name, stbuf, off
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+                , static_cast<enum fuse_fill_dir_flags>(0)
+#endif
+    );
+}
+#endif /* CEPH_FUSE_H */
diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h
new file mode 100644
index 000000000..f9d80ac36
--- /dev/null
+++ b/src/include/ceph_hash.h
@@ -0,0 +1,14 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+extern bool ceph_str_hash_valid(int type);
+
+#endif
diff --git a/src/include/cephfs/ceph_ll_client.h b/src/include/cephfs/ceph_ll_client.h
new file mode 100644
index 000000000..ac5b7c224
--- /dev/null
+++ b/src/include/cephfs/ceph_ll_client.h
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * scalable distributed file system
+ *
+ * Copyright (C) Jeff Layton <jlayton@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_CEPH_LL_CLIENT_H
+#define CEPH_CEPH_LL_CLIENT_H
+#include <stdint.h>
+
+#ifdef _WIN32
+#include "include/win32/fs_compat.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+
+class Fh;
+
+struct inodeno_t;
+struct vinodeno_t;
+typedef struct vinodeno_t vinodeno;
+
+#else /* __cplusplus */
+
+typedef struct Fh Fh;
+
+typedef struct inodeno_t {
+ uint64_t val;
+} inodeno_t;
+
+typedef struct _snapid_t {
+ uint64_t val;
+} snapid_t;
+
+typedef struct vinodeno_t {
+ inodeno_t ino;
+ snapid_t snapid;
+} vinodeno_t;
+
+#endif /* __cplusplus */
+
+/*
+ * Heavily borrowed from David Howells' draft statx patchset.
+ *
+ * Since the xstat patches are still a work in progress, we borrow its data
+ * structures and #defines to implement ceph_getattrx. Once the xstat stuff
+ * has been merged we should drop this and switch over to using that instead.
+ */
+struct ceph_statx {
+ uint32_t stx_mask;
+ uint32_t stx_blksize;
+ uint32_t stx_nlink;
+ uint32_t stx_uid;
+ uint32_t stx_gid;
+ uint16_t stx_mode;
+ uint64_t stx_ino;
+ uint64_t stx_size;
+ uint64_t stx_blocks;
+ dev_t stx_dev;
+ dev_t stx_rdev;
+ struct timespec stx_atime;
+ struct timespec stx_ctime;
+ struct timespec stx_mtime;
+ struct timespec stx_btime;
+ uint64_t stx_version;
+};
+
+#define CEPH_STATX_MODE 0x00000001U /* Want/got stx_mode */
+#define CEPH_STATX_NLINK 0x00000002U /* Want/got stx_nlink */
+#define CEPH_STATX_UID 0x00000004U /* Want/got stx_uid */
+#define CEPH_STATX_GID 0x00000008U /* Want/got stx_gid */
+#define CEPH_STATX_RDEV 0x00000010U /* Want/got stx_rdev */
+#define CEPH_STATX_ATIME 0x00000020U /* Want/got stx_atime */
+#define CEPH_STATX_MTIME 0x00000040U /* Want/got stx_mtime */
+#define CEPH_STATX_CTIME 0x00000080U /* Want/got stx_ctime */
+#define CEPH_STATX_INO 0x00000100U /* Want/got stx_ino */
+#define CEPH_STATX_SIZE 0x00000200U /* Want/got stx_size */
+#define CEPH_STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */
+#define CEPH_STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
+#define CEPH_STATX_BTIME 0x00000800U /* Want/got stx_btime */
+#define CEPH_STATX_VERSION 0x00001000U /* Want/got stx_version */
+#define CEPH_STATX_ALL_STATS 0x00001fffU /* All supported stats */
+
+/*
+ * Compatibility macros until these defines make their way into glibc
+ */
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_SYNC_TYPE 0x6000
+#define AT_STATX_SYNC_AS_STAT 0x0000
+#define AT_STATX_FORCE_SYNC 0x2000
+#define AT_STATX_DONT_SYNC 0x4000 /* Don't sync attributes with the server */
+#endif
+
+/*
+ * This is deprecated and just for backwards compatibility.
+ * Please use AT_STATX_DONT_SYNC instead.
+ */
+#define AT_NO_ATTR_SYNC AT_STATX_DONT_SYNC /* Deprecated */
+
+/*
+ * The statx interfaces only allow these flags. In order to allow us to add
+ * others in the future, we disallow setting any that aren't recognized.
+ */
+#define CEPH_REQ_FLAG_MASK (AT_SYMLINK_NOFOLLOW|AT_STATX_DONT_SYNC)
+
+/* fallocate mode flags */
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE 0x01
+#endif
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE 0x02
+#endif
+
+/** ceph_deleg_cb_t: Delegation recalls
+ *
+ * Called when there is an outstanding Delegation and there is conflicting
+ * access, either locally or via cap activity.
+ * @fh: open filehandle
+ * @priv: private info registered when delegation was acquired
+ */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
+/**
+ * client_ino_callback_t: Inode data/metadata invalidation
+ *
+ * Called when the client wants to invalidate the cached data for a range
+ * in the file.
+ * @handle: client callback handle
+ * @ino: vino of inode to be invalidated
+ * @off: starting offset of content to be invalidated
+ * @len: length of region to invalidate
+ */
+typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino,
+ int64_t off, int64_t len);
+
+/**
+ * client_dentry_callback_t: Dentry invalidation
+ *
+ * Called when the client wants to purge a dentry from its cache.
+ * @handle: client callback handle
+ * @dirino: vino of directory that contains dentry to be invalidated
+ * @ino: vino of inode attached to dentry to be invalidated
+ * @name: name of dentry to be invalidated
+ * @len: length of @name
+ */
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, const char *name,
+ size_t len);
+
+/**
+ * client_remount_callback_t: Remount entire fs
+ *
+ * Called when the client needs to purge the dentry cache and the application
+ * doesn't have a way to purge an individual dentry. Mostly used for ceph-fuse
+ * on older kernels.
+ * @handle: client callback handle
+ */
+
+typedef int (*client_remount_callback_t)(void *handle);
+
+/**
+ * client_switch_interrupt_callback_t: Lock request interrupted
+ *
+ * Called before file lock request to set the interrupt handler while waiting
+ * After the wait, called with "data" set to NULL pointer.
+ * @handle: client callback handle
+ * @data: opaque data passed to interrupt before call, NULL pointer after.
+ */
+typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data);
+
+/**
+ * client_umask_callback_t: Fetch umask of actor
+ *
+ * Called when the client needs the umask of the requestor.
+ * @handle: client callback handle
+ */
+typedef mode_t (*client_umask_callback_t)(void *handle);
+
+/**
+ * client_ino_release_t: Request that application release Inode references
+ *
+ * Called when the MDS wants to trim caps and Inode records.
+ * @handle: client callback handle
+ * @ino: vino of Inode being released
+ */
+typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino);
+
+/*
+ * The handle is an opaque value that gets passed to some callbacks. Any fields
+ * set to NULL will be left alone. There is no way to unregister callbacks.
+ */
+struct ceph_client_callback_args {
+ void *handle;
+ client_ino_callback_t ino_cb;
+ client_dentry_callback_t dentry_cb;
+ client_switch_interrupt_callback_t switch_intr_cb;
+ client_remount_callback_t remount_cb;
+ client_umask_callback_t umask_cb;
+ client_ino_release_t ino_release_cb;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_CEPH_LL_CLIENT_H */
+
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
new file mode 100644
index 000000000..dc62698fa
--- /dev/null
+++ b/src/include/cephfs/libcephfs.h
@@ -0,0 +1,2201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIB_H
+#define CEPH_LIB_H
+
+#if defined(__linux__)
+#include <features.h>
+#endif
+#include <utime.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/socket.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+#include "ceph_ll_client.h"
+
+#ifdef __cplusplus
+namespace ceph::common {
+ class CephContext;
+}
+using CephContext = ceph::common::CephContext;
+extern "C" {
+#endif
+
+#define LIBCEPHFS_VER_MAJOR 10
+#define LIBCEPHFS_VER_MINOR 0
+#define LIBCEPHFS_VER_EXTRA 3
+
+#define LIBCEPHFS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
+
+#if __GNUC__ >= 4
+ #define LIBCEPHFS_DEPRECATED __attribute__((deprecated))
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#else
+ #define LIBCEPHFS_DEPRECATED
+#endif
+
+/*
+ * If using glibc check that file offset is 64-bit.
+ */
+#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64)
+# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted
+#endif
+
+/*
+ * XXXX redeclarations from ceph_fs.h, rados.h, etc. We need more of this
+ * in the interface, but shouldn't be re-typing it (and using different
+ * C data types).
+ */
+#ifndef __cplusplus
+
+#define CEPH_INO_ROOT 1
+#define CEPH_NOSNAP ((uint64_t)(-2))
+
+struct ceph_file_layout {
+ /* file -> object mapping */
+ uint32_t fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ uint32_t fl_stripe_count; /* over this many objects */
+ uint32_t fl_object_size; /* until objects are this big, then move to
+ new objects */
+ uint32_t fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ uint32_t fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ uint32_t fl_pg_pool; /* namespace, crush rule, rep level */
+} __attribute__ ((packed));
+
+struct CephContext;
+#endif /* ! __cplusplus */
+
+struct UserPerm;
+typedef struct UserPerm UserPerm;
+
+struct Inode;
+typedef struct Inode Inode;
+
+struct ceph_mount_info;
+struct ceph_dir_result;
+
+// user supplied key,value pair to be associated with a snapshot.
+// callers can supply an array of this struct via ceph_mksnap().
+struct snap_metadata {
+ const char *key;
+ const char *value;
+};
+
+struct snap_info {
+ uint64_t id;
+ size_t nr_snap_metadata;
+ struct snap_metadata *snap_metadata;
+};
+
+struct ceph_snapdiff_entry_t {
+ struct dirent dir_entry;
+ uint64_t snapid; //should be snapid_t but prefer not to expose it
+};
+
+/* setattr mask bits (up to an int in size) */
+#ifndef CEPH_SETATTR_MODE
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12)
+#define CEPH_SETATTR_KILL_SUID (1 << 13)
+#define CEPH_SETATTR_KILL_SGID (1 << 14)
+#endif
+
+/* define error codes for the mount function*/
+# define CEPHFS_ERROR_MON_MAP_BUILD 1000
+# define CEPHFS_ERROR_NEW_CLIENT 1002
+# define CEPHFS_ERROR_MESSENGER_START 1003
+
+/**
+ * Create a UserPerm credential object.
+ *
+ * Some calls (most notably, the ceph_ll_* ones), take a credential object
+ * that represents the credentials that the calling program is using. This
+ * function creates a new credential object for this purpose. Returns a
+ * pointer to the object, or NULL if it can't be allocated.
+ *
+ * Note that the gidlist array is used directly and is not copied. It must
+ * remain valid over the lifetime of the created UserPerm object.
+ *
+ * @param uid uid to be used
+ * @param gid gid to be used
+ * @param ngids number of gids in supplemental grouplist
+ * @param gidlist array of gid_t's in the list of groups
+ */
+UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist);
+
+/**
+ * Destroy a UserPerm credential object.
+ *
+ * @param perm pointer to object to be destroyed
+ *
+ * Currently this just frees the object. Note that the gidlist array is not
+ * freed. The caller must do so if it's necessary.
+ */
+void ceph_userperm_destroy(UserPerm *perm);
+
+/**
+ * Get a pointer to the default UserPerm object for the mount.
+ *
+ * @param cmount the mount info handle
+ *
+ * Every cmount has a default set of credentials. This returns a pointer to
+ * that object.
+ *
+ * Unlike with ceph_userperm_new, this object should not be freed.
+ */
+struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount);
+
+/**
+ * Set cmount's default permissions
+ *
+ * @param cmount the mount info handle
+ * @param perm permissions to set to default for mount
+ *
+ * Every cmount has a default set of credentials. This does a deep copy of
+ * the given permissions to the ones in the cmount. Must be done after
+ * ceph_init but before ceph_mount.
+ *
+ * Returns 0 on success, and -EISCONN if the cmount is already mounted.
+ */
+int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm);
+
+/**
+ * @defgroup libcephfs_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libcephfs.
+ *
+ * @{
+ */
+
+/**
+ * Get the version of libcephfs.
+ *
+ * The version number is major.minor.patch.
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param patch where to store the extra version number
+ */
+const char *ceph_version(int *major, int *minor, int *patch);
+
+/**
+ * Create a mount handle for interacting with Ceph. All libcephfs
+ * functions operate on a mount info handle.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param id the id of the client. This can be a unique id that identifies
+ * this client, and will get appended onto "client.". Callers can
+ * pass in NULL, and the id will be the process id of the client.
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create(struct ceph_mount_info **cmount, const char * const id);
+
+/**
+ * Create a mount handle from a CephContext, which holds the configuration
+ * for the ceph cluster. A CephContext can be acquired from an existing ceph_mount_info
+ * handle, using the @ref ceph_get_mount_context call. Note that using the same CephContext
+ * for two different mount handles results in the same client entity id being used.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param conf reuse this pre-existing CephContext config
+ * @returns 0 on success, negative error code on failure
+ */
+#ifdef __cplusplus
+int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *conf);
+#else
+int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf);
+#endif
+
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif // VOIDPTR_RADOS_T
+
+/**
+ * Create a mount handle from a rados_t, for using libcephfs in the
+ * same process as librados.
+ *
+ * @param cmount the mount info handle to initialize
+ * @param cluster reference to already-initialized librados handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster);
+
+/**
+ * Initialize the filesystem client (but do not mount the filesystem yet)
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_init(struct ceph_mount_info *cmount);
+
+/**
+ * Optionally set which filesystem to mount, before calling mount.
+ *
+ * An error will be returned if this libcephfs instance is already
+ * mounted. This function is an alternative to setting the global
+ * client_fs setting. Using this function enables multiple libcephfs
+ * instances in the same process to mount different filesystems.
+ *
+ * The filesystem name is *not* validated in this function. That happens
+ * during mount(), where an ENOENT error will result if a non-existent
+ * filesystem was specified here.
+ *
+ * @param cmount the mount info handle
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name);
+
+
+/**
+ * Perform a mount using the path for the root of the mount.
+ *
+ * It is optional to call ceph_init before this. If ceph_init has
+ * not already been called, it will be called in the course of this operation.
+ *
+ * @param cmount the mount info handle
+ * @param root the path for the root of the mount. This can be an existing
+ * directory within the ceph cluster, but most likely it will
+ * be "/". Passing in NULL is equivalent to "/".
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_mount(struct ceph_mount_info *cmount, const char *root);
+
+/**
+ * Return cluster ID for a mounted ceph filesystem
+ *
+ * Every ceph filesystem has a filesystem ID associated with it. This
+ * function returns that value. If the ceph_mount_info does not refer to a
+ * mounted filesystem, this returns a negative error code.
+ */
+int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount);
+
+/**
+ * Execute a management command remotely on an MDS.
+ *
+ * Must have called ceph_init or ceph_mount before calling this.
+ *
+ * @param mds_spec string representing rank, MDS name, GID or '*'
+ * @param cmd array of null-terminated strings
+ * @param cmdlen length of cmd array
+ * @param inbuf non-null-terminated input data to command
+ * @param inbuflen length in octets of inbuf
+ * @param outbuf populated with pointer to buffer (command output data)
+ * @param outbuflen length of allocated outbuf
+ * @param outs populated with pointer to buffer (command error strings)
+ * @param outslen length of allocated outs
+ *
+ * @return 0 on success, negative error code on failure
+ *
+ */
+int ceph_mds_command(struct ceph_mount_info *cmount,
+ const char *mds_spec,
+ const char **cmd,
+ size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * Free a buffer, such as those used for output arrays from ceph_mds_command
+ */
+void ceph_buffer_free(char *buf);
+
+/**
+ * Unmount a mount handle.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_unmount(struct ceph_mount_info *cmount);
+
+/**
+ * Abort mds connections
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure
+ */
+int ceph_abort_conn(struct ceph_mount_info *cmount);
+
+/**
+ * Destroy the mount handle.
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @return 0 on success, negative error code on failure.
+ */
+int ceph_release(struct ceph_mount_info *cmount);
+
+/**
+ * Deprecated. Unmount and destroy the ceph mount handle. This should be
+ * called on completion of all libcephfs functions.
+ *
+ * Equivalent to ceph_unmount() + ceph_release() without error handling.
+ *
+ * @param cmount the mount handle to shutdown
+ */
+void ceph_shutdown(struct ceph_mount_info *cmount);
+
+/**
+ * Return associated client addresses
+ *
+ * @param cmount the mount handle
+ * @param addrs the output addresses
+ * @returns 0 on success, a negative error code on failure
+ * @note the returned addrs should be free by the caller
+ */
+int ceph_getaddrs(struct ceph_mount_info *cmount, char** addrs);
+
+/**
+ * Get a global id for current instance
+ *
+ * The handle should not be mounted. This should be called on completion of
+ * all libcephfs functions.
+ *
+ * @param cmount the mount handle
+ * @returns instance global id
+ */
+uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount);
+
+/**
+ * Extract the CephContext from the mount point handle.
+ *
+ * @param cmount the ceph mount handle to get the context from.
+ * @returns the CephContext associated with the mount handle.
+ */
+#ifdef __cplusplus
+CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+#else
+struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount);
+#endif
+/*
+ * Check mount status.
+ *
+ * Return non-zero value if mounted. Otherwise, zero.
+ */
+int ceph_is_mounted(struct ceph_mount_info *cmount);
+
+/** @} init */
+
+/**
+ * @defgroup libcephfs_h_config Config
+ * Functions for manipulating the Ceph configuration at runtime.
+ *
+ * @{
+ */
+
+/**
+ * Load the ceph configuration from the specified config file.
+ *
+ * @param cmount the mount handle to load the configuration into.
+ * @param path_list the configuration file path
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list);
+
+/**
+ * Parse the command line arguments and load the configuration parameters.
+ *
+ * @param cmount the mount handle to load the configuration parameters into.
+ * @param argc count of the arguments in argv
+ * @param argv the argument list
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv);
+
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre ceph_mount() has not been called on the handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cmount handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var);
+
+/** Sets a configuration value from a string.
+ *
+ * @param cmount the mount handle to set the configuration value on
+ * @param option the configuration option to set
+ * @param value the value of the configuration option to set
+ *
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value);
+
+/** Set mount timeout.
+ *
+ * @param cmount mount handle to set the configuration value on
+ * @param timeout mount timeout interval
+ *
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_set_mount_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Gets the configuration value as a string.
+ *
+ * @param cmount the mount handle to set the configuration value on
+ * @param option the config option to get
+ * @param buf the buffer to fill with the value
+ * @param len the length of the buffer.
+ * @returns the size of the buffer filled in with the value, or negative error code on failure
+ */
+int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * @defgroup libcephfs_h_fsops File System Operations.
+ * Functions for getting/setting file system wide information specific to a particular
+ * mount handle.
+ *
+ * @{
+ */
+
+/**
+ * Perform a statfs on the ceph file system. This call fills in file system wide statistics
+ * into the passed in buffer.
+ *
+ * @param cmount the ceph mount handle to use for performing the statfs.
+ * @param path can be any path within the mounted filesystem
+ * @param stbuf the file system statistics filled in by this function.
+ * @return 0 on success, negative error code otherwise.
+ */
+int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf);
+
+/**
+ * Synchronize all filesystem data to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the sync_fs.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_sync_fs(struct ceph_mount_info *cmount);
+
+/**
+ * Get the current working directory.
+ *
+ * @param cmount the ceph mount to get the current working directory for.
+ * @returns the path to the current working directory
+ */
+const char* ceph_getcwd(struct ceph_mount_info *cmount);
+
+/**
+ * Change the current working directory.
+ *
+ * @param cmount the ceph mount to change the current working directory for.
+ * @param path the path to the working directory to change into.
+ * @returns 0 on success, negative error code otherwise.
+ */
+int ceph_chdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} fsops */
+
+/**
+ * @defgroup libcephfs_h_dir Directory Operations.
+ * Functions for manipulating and listing directories.
+ *
+ * @{
+ */
+
+/**
+ * Open the given directory.
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param name the path name of the directory to open. Must be either an absolute path
+ * or a path relative to the current working directory.
+ * @param dirpp the directory result pointer structure to fill in.
+ * @returns 0 on success or negative error code otherwise.
+ */
+int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp);
+
+/**
+ * Open a directory referred to by a file descriptor
+ *
+ * @param cmount the ceph mount handle to use to open the directory
+ * @param dirfd open file descriptor for the directory
+ * @param dirpp the directory result pointer structure to fill in
+ * @returns 0 on success or negative error code otherwise
+ */
+int ceph_fdopendir(struct ceph_mount_info *cmount, int dirfd, struct ceph_dir_result **dirpp);
+
+/**
+ * Close the open directory.
+ *
+ * @param cmount the ceph mount handle to use for closing the directory
+ * @param dirp the directory result pointer (set by ceph_opendir) to close
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the next entry in an open directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @returns the next directory entry or NULL if at the end of the directory (or the directory
+ * is empty). This pointer should not be freed by the caller, and is only safe to
+ * access between return and the next call to ceph_readdir or ceph_closedir.
+ */
+struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller.
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de);
+
+/**
+ * A safe version of ceph_readdir that also returns the file statistics (readdir+stat).
+ *
+ * @param cmount the ceph mount handle to use for performing the readdir_plus_r.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry to return.
+ * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @param stx the stats of the file/directory of the entry returned
+ * @param want mask showing desired inode attrs for returned entry
+ * @param flags bitmask of flags to use when filling out attributes
+ * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on
+ * the inode and the pointer set on success.
+ * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached,
+ * and a negative error code on failure.
+ */
+int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de,
+ struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out);
+
+struct ceph_snapdiff_info
+{
+ struct ceph_mount_info* cmount;
+ struct ceph_dir_result* dir1; // primary dir entry to build snapdiff for.
+ struct ceph_dir_result* dir_aux; // aux dir entry to identify the second snapshot.
+ // Can point to the parent dir entry if entry-in-question
+ // doesn't exist in the second snapshot
+};
+
+/**
+ * Opens snapdiff stream to get snapshots delta (aka snapdiff).
+ *
+ * @param cmount the ceph mount handle to use for snapdiff retrieval.
+ * @param root_path root path for snapshots-in-question
+ * @param rel_path subpath under the root to build delta for
+ * @param snap1 the first snapshot name
+ * @param snap2 the second snapshot name
+ * @param out resulting snapdiff stream handle to be used for snapdiff results
+ retrieval via ceph_readdir_snapdiff
+ * @returns 0 on success and negative error code otherwise
+ */
+int ceph_open_snapdiff(struct ceph_mount_info* cmount,
+ const char* root_path,
+ const char* rel_path,
+ const char* snap1,
+ const char* snap2,
+ struct ceph_snapdiff_info* out);
+/**
+ * Get the next snapshot delta entry.
+ *
+ * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff()
+ * @param out the next snapdiff entry which includes directory entry and the
+ * entry's snapshot id - later one for emerged/existing entry or
+ * former snapshot id for the removed entry.
+ * @returns >0 on success, 0 if no more entries in the stream and negative
+ * error code otherwise
+ */
+int ceph_readdir_snapdiff(struct ceph_snapdiff_info* snapdiff,
+ struct ceph_snapdiff_entry_t* out);
+/**
+ * Close snapdiff stream.
+ *
+ * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff()
+ * @returns 0 on success and negative error code otherwise
+ */
+int ceph_close_snapdiff(struct ceph_snapdiff_info* snapdiff);
+
+/**
+ * Gets multiple directory entries.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdents.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name an array of struct dirent that gets filled in with the returned directory entries.
+ * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent).
+ * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a
+ * negative error code. If the buffer is not large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Gets multiple directory names.
+ *
+ * @param cmount the ceph mount handle to use for performing the getdents.
+ * @param dirp the directory stream pointer from an opendir holding the state of the
+ * next entry/entries to return.
+ * @param name a buffer to fill in with directory entry names.
+ * @param buflen the length of the buffer that can be filled in.
+ * @returns the length of the buffer filled in with entry names, or a negative error code on failure.
+ * If the buffer isn't large enough for a single entry, -ERANGE is returned.
+ */
+int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen);
+
+/**
+ * Rewind the directory stream to the beginning of the directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rewinddir.
+ * @param dirp the directory stream pointer to rewind.
+ */
+void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Get the current position of a directory stream.
+ *
+ * @param cmount the ceph mount handle to use for performing the telldir.
+ * @param dirp the directory stream pointer to get the current position of.
+ * @returns the position of the directory stream. Note that the offsets returned
+ * by ceph_telldir do not have a particular order (cannot be compared with
+ * inequality).
+ */
+int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp);
+
+/**
+ * Move the directory stream to a position specified by the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the seekdir.
+ * @param dirp the directory stream pointer to move.
+ * @param offset the position to move the directory stream to. This offset should be
+ * a value returned by telldir. Note that this value does not refer to the nth
+ * entry in a directory, and can not be manipulated with plus or minus.
+ */
+void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset);
+
+/**
+ * Create a directory.
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param path the path of the directory to create. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Create a directory relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for making the directory.
+ * @param dirfd open file descriptor for a directory (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the directory to create.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, mode_t mode);
+
+/**
+ * Create a snapshot
+ *
+ * @param cmount the ceph mount handle to use for creating the snapshot.
+ * @param path the path of the directory to create the snapshot in. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param name snapshot name
+ * @param mode the permissions the directory should have once created.
+ * @param snap_metadata array of snap metadata structs
+ * @param nr_snap_metadata number of snap metadata struct entries
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mksnap(struct ceph_mount_info *cmount, const char *path, const char *name,
+ mode_t mode, struct snap_metadata *snap_metadata, size_t nr_snap_metadata);
+
+/**
+ * Remove a snapshot
+ *
+ * @param cmount the ceph mount handle to use for removing the snapshot.
+ * @param path the path of the directory to remove the snapshot from. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param name snapshot name
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmsnap(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Create multiple directories at once.
+ *
+ * @param cmount the ceph mount handle to use for making the directories.
+ * @param path the full path of directories and sub-directories that should
+ * be created.
+ * @param mode the permissions the directory should have once created.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Remove a directory.
+ *
+ * @param cmount the ceph mount handle to use for removing directories.
+ * @param path the path of the directory to remove.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_rmdir(struct ceph_mount_info *cmount, const char *path);
+
+/** @} dir */
+
+/**
+ * @defgroup libcephfs_h_links Links and Link Handling.
+ * Functions for creating and manipulating hard links and symbolic links.
+ *
+ * @{
+ */
+
+/**
+ * Create a link.
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Read a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param path the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size);
+
+/**
+ * Read a symbolic link relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for creating the link.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path to the symlink to read
+ * @param buf the buffer to hold the path of the file that the symlink points to.
+ * @param size the length of the buffer
+ * @returns number of bytes copied on success or negative error code on failure
+ */
+int ceph_readlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, char *buf,
+ int64_t size);
+
+/**
+ * Creates a symbolic link.
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname);
+
+/**
+ * Creates a symbolic link relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param existing the path to the existing file/directory to link to.
+ * @param newname the path to the new file/directory to link from.
+ * @returns 0 on success or a negative return code on failure.
+ */
+int ceph_symlinkat(struct ceph_mount_info *cmount, const char *existing, int dirfd,
+ const char *newname);
+
+/** @} links */
+
+/**
+ * @defgroup libcephfs_h_files File manipulation and handling.
+ * Functions for creating and manipulating files.
+ *
+ * @{
+ */
+
+
+/**
+ * Checks if deleting a file, link or directory is allowed.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file, link or directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_may_delete(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Removes a file, link, or symbolic link. If the file/link has multiple links to it, the
+ * file will not disappear from the namespace until all references to it are removed.
+ *
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param path the path of the file or link to unlink.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlink(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Removes a file, link, or symbolic link relative to a file descriptor.
+ * If the file/link has multiple links to it, the file will not
+ * disappear from the namespace until all references to it are removed.
+ *
+ * @param cmount the ceph mount handle to use for performing the unlink.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the file or link to unlink.
+ * @param flags bitfield that can be used to set AT_* modifier flags (only AT_REMOVEDIR)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_unlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags);
+
+/**
+ * Rename a file or directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the rename.
+ * @param from the path to the existing file or directory.
+ * @param to the new name of the file or directory
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to);
+
+/**
+ * Get an open file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get attributes of a file relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath to the file/directory to get statistics of
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statxat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx,
+ unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's statistics and attributes.
+ *
+ * ceph_stat() is deprecated, use ceph_statx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get a file's statistics and attributes, without following symlinks.
+ *
+ * ceph_lstat() is deprecated, use ceph_statx(.., AT_SYMLINK_NOFOLLOW) instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get the open file's statistics.
+ *
+ * ceph_fstat() is deprecated, use ceph_fstatx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the fstat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stbuf the stat struct of the file's statistics, filled in by the
+ * function.
+ * @returns 0 on success or a negative error code on failure
+ */
+int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf)
+ LIBCEPHFS_DEPRECATED;
+
+/**
+ * Set a file's attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param relpath the path to the file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct.
+ * @param flags mask of AT_* flags (only AT_ATTR_NOFOLLOW is respected for now)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags);
+
+/**
+ * Set a file's attributes (extended version).
+ *
+ * @param cmount the ceph mount handle to use for performing the setattr.
+ * @param fd the fd of the open file/directory to set the attributes of.
+ * @param stx the statx struct that must include attribute values to set on the file.
+ * @param mask a mask of all the stat values that have been set on the stat struct.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask);
+
+/**
+ * Change the mode bits (permissions) of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path to the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of a file/directory. If the path is a
+ * symbolic link, it's not de-referenced.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param path the path of file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lchmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of an open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param fd the open file descriptor to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of a file relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chmodat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ mode_t mode, int flags);
+
+/**
+ * Change the ownership of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param fd the fd of the open file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chownat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ uid_t uid, gid_t gid, int flags);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param fd the fd of the open file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]);
+
+/**
+ * Change file/directory last access and modification times relative
+ * to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimensat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+ struct timespec times[2], int flags);
+
+/**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+ uint64_t owner);
+
+/**
+ * Truncate the file to the given size. If this operation causes the
+ * file to expand, the empty bytes will be filled in with zeros.
+ *
+ * @param cmount the ceph mount handle to use for performing the truncate.
+ * @param path the path to the file to truncate.
+ * @param size the new size of the file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size);
+
+/**
+ * Make a block or character special file.
+ *
+ * @param cmount the ceph mount handle to use for performing the mknod.
+ * @param path the path to the special file.
+ * @param mode the permissions to use and the type of special file. The type can be
+ * one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO.
+ * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the
+ * major and minor numbers of the newly created device special file. Otherwise,
+ * it is ignored.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev);
+/**
+ * Create and/or open a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file relative to a directory
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_openat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file with a specific file layout.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ * the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ * is specified in the flags.
+ * @param stripe_unit the stripe unit size (optional, 0 for default)
+ * @param stripe_count the stripe count (optional, 0 for default)
+ * @param object_size the object size (optional, 0 for default)
+ * @param data_pool name of target data pool name (optional, NULL or empty string for default)
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags,
+ mode_t mode, int stripe_unit, int stripe_count, int object_size,
+ const char *data_pool);
+
+/**
+ * Close the open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the close.
+ * @param fd the file descriptor referring to the open file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_close(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Reposition the open file stream based on the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the lseek.
+ * @param fd the open file descriptor referring to the open file and holding the
+ * current position of the stream.
+ * @param offset the offset to set the stream to
+ * @param whence the flag to indicate what type of seeking to perform:
+ * SEEK_SET: the offset is set to the given offset in the file.
+ * SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ * SEEK_END: the offset is set to the end of the file plus @e offset bytes.
+ * @returns the new offset of the stream on success or a negative error code on failure.
+ */
+int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
+/**
+ * Read data from the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param buf the buffer to read data into
+ * @param size the initial size of the buffer
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
+
+/**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from. If this value is negative, the
+ * function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param buf the bytes to write to the file
+ * @param size the size of the buf array
+ * @param offset the offset of the file write into. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size,
+ int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure holding the data to write
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset of the file write into. If this value is negative, the
+ * function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+ int64_t offset);
+
+/**
+ * Truncate a file to the given size.
+ *
+ * @param cmount the ceph mount handle to use for performing the ftruncate.
+ * @param fd the file descriptor of the file to truncate
+ * @param size the new size of the file
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size);
+
+/**
+ * Synchronize an open file to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param syncdataonly a boolean whether to synchronize metadata and data (0)
+ * or just data (1).
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly);
+
+/**
+ * Preallocate or release disk space for the file for the byte range.
+ *
+ * @param cmount the ceph mount handle to use for performing the fallocate.
+ * @param fd the file descriptor of the file to fallocate.
+ * @param mode the flags determines the operation to be performed on the given range.
+ * default operation (0) allocate and initialize to zero the file in the byte range,
+ * and the file size will be changed if offset + length is greater than
+ * the file size. if the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ * the file size will not be changed. if the FALLOC_FL_PUNCH_HOLE flag is
+ * specified in the mode, the operation is deallocate space and zero the byte range.
+ * @param offset the byte range starting.
+ * @param length the length of the range.
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode,
+ int64_t offset, int64_t length);
+
+/**
+ * Enable/disable lazyio for the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param enable a boolean to enable lazyio or disable lazyio.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable);
+
+
+/**
+ * Flushes the write buffer for the file thereby propagating the buffered write to the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param offset the file offset from which to start propagating the buffered writes.
+ * @param count the number of bytes, starting at offset, to propagate.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+
+/**
+ * Flushes the write buffer for the file and invalidate the read cache. This allows a subsequent read operation to read and cache data directly from the file and hence everyone's propagated writes would be visible.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param offset the file offset from which to start synchronizing.
+ * @param count the number of bytes, starting at offset, to synchronize.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/** @} file */
+
+/**
+ * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling.
+ * Functions for creating and manipulating extended attributes on files.
+ *
+ * @{
+ */
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ void *value, size_t size);
+
+/**
+ * Get an extended attribute without following symbolic links. This function is
+ * identical to ceph_getxattr, but if the path refers to a symbolic link,
+ * we get the extended attributes of the symlink rather than the attributes
+ * of the link itself.
+ *
+ * @param cmount the ceph mount handle to use for performing the lgetxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ void *value, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * List the extended attribute keys on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the listxattr.
+ * @param fd the open file descriptor referring to the file to list extended attributes on.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size);
+
+/**
+ * Get the list of extended attribute keys on a file, but do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the llistxattr.
+ * @param path the path to the file.
+ * @param list a buffer to be filled in with the list of extended attributes keys.
+ * @param size the size of the list buffer.
+ * @returns the size of the resulting list filled in.
+ */
+int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Remove an extended attribute from a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the removexattr.
+ * @param fd the open file descriptor referring to the file to remove extended attribute from.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name);
+
+/**
+ * Remove the extended attribute from a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lremovexattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to remove.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the setxattr.
+ * @param fd the open file descriptor referring to the file to set extended attribute on.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+ const void *value, size_t size, int flags);
+
+/**
+ * Set an extended attribute on a file, do not follow symbolic links.
+ *
+ * @param cmount the ceph mount handle to use for performing the lsetxattr.
+ * @param path the path to the file.
+ * @param name the name of the extended attribute to set.
+ * @param value the bytes of the extended attribute value
+ * @param size the size of the extended attribute value
+ * @param flags the flags can be:
+ * CEPH_XATTR_CREATE: create the extended attribute. Must not exist.
+ * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+ const void *value, size_t size, int flags);
+
+/** @} xattr */
+
+/**
+ * @defgroup libcephfs_h_filelayout Control File Layout.
+ * Functions for setting and getting the file layout of existing files.
+ *
+ * @{
+ */
+
+/**
+ * Get the file striping unit from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping unit.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file striping count from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping count.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file object size from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file object size.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file pool information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an opened file is stored in.
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
+
+/**
+ * get the name of a pool by id
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen);
+
+/**
+ * Get the default pool name of cephfs
+ * Write the name of the default pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ * @param cmount the ceph mount handle to use.
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen);
+
+/**
+ * Get the file layout from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file layout.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file replication information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file replication information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the id of the named pool.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_name the name of the pool.
+ * @returns the pool id, or a negative error code on failure.
+ */
+int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name);
+
+/**
+ * Get the pool replication factor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param pool_id the pool id to look up
+ * @returns the replication factor, or a negative error code on failure.
+ */
+int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id);
+
+/**
+ * Get the OSD address where the primary copy of a file stripe is located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file to get the striping unit of.
+ * @param offset the offset into the file to specify the stripe. The offset can be
+ * anywhere within the stripe unit.
+ * @param addr the address of the OSD holding that stripe
+ * @param naddr the capacity of the address passed in.
+ * @returns the size of the address filled into the @e addr parameter, or a negative
+ * error code on failure.
+ */
+int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
+ struct sockaddr_storage *addr, int naddr);
+
+/**
+ * Get the list of OSDs where the objects containing a file offset are located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file.
+ * @param offset the offset within the file.
+ * @param length return the number of bytes between the offset and the end of
+ * the stripe unit (optional).
+ * @param osds an integer array to hold the OSD ids.
+ * @param nosds the size of the integer array.
+ * @returns the number of items stored in the output array, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
+ int64_t offset, int64_t *length, int *osds, int nosds);
+
+/**
+ * Get the fully qualified CRUSH location of an OSD.
+ *
+ * Returns (type, name) string pairs for each device in the CRUSH bucket
+ * hierarchy starting from the given osd to the root. Each pair element is
+ * separated by a NULL character.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param osd the OSD id.
+ * @param path buffer to store location.
+ * @param len size of buffer.
+ * @returns the amount of bytes written into the buffer, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
+ int osd, char *path, size_t len);
+
+/**
+ * Get the network address of an OSD.
+ *
+ * @param cmount the ceph mount handle.
+ * @param osd the OSD id.
+ * @param addr the OSD network address.
+ * @returns zero on success, otherwise a negative error code.
+ */
+int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd,
+ struct sockaddr_storage *addr);
+
+/**
+ * Get the file layout stripe unit granularity.
+ * @param cmount the ceph mount handle.
+ * @returns the stripe unit granularity or a negative error code on failure.
+ */
+int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount);
+
+/** @} filelayout */
+
+/**
+ * No longer available. Do not use.
+ * These functions will return -EOPNOTSUPP.
+ */
+int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe);
+int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count);
+int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size);
+int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd);
+int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication);
+
+/**
+ * Read from local replicas when possible.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param val a boolean to set (1) or clear (0) the option to favor local objects
+ * for reads.
+ * @returns 0
+ */
+int ceph_localize_reads(struct ceph_mount_info *cmount, int val);
+
+/**
+ * Get the osd id of the local osd (if any)
+ *
+ * @param cmount the ceph mount handle to use.
+ * @returns the osd (if any) local to the node where this call is made, otherwise
+ * -1 is returned.
+ */
+int ceph_get_local_osd(struct ceph_mount_info *cmount);
+
+/** @} default_filelayout */
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the file descriptor to get issued capabilities for
+ * @returns the current capabilities issued to this client
+ * for the open file
+ */
+int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Get the capabilities currently issued to the client.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path to the file
+ * @returns the current capabilities issued to this client
+ * for the file
+ */
+int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path);
+
+/* Low Level */
+struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount,
+ vinodeno_t vino);
+
+int ceph_ll_lookup_vino(struct ceph_mount_info *cmount, vinodeno_t vino,
+ Inode **inode);
+
+int ceph_ll_lookup_inode(
+ struct ceph_mount_info *cmount,
+ struct inodeno_t ino,
+ Inode **inode);
+
+/**
+ * Get the root inode of FS. Increase counter of references for root Inode. You must call ceph_ll_forget for it!
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param parent pointer to pointer to Inode struct. Pointer to root inode will be returned
+ * @returns 0 if all good
+ */
+int ceph_ll_lookup_root(struct ceph_mount_info *cmount,
+ Inode **parent);
+int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags, const UserPerm *perms);
+int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in);
+int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in,
+ int count);
+int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms);
+int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, unsigned int want, unsigned int flags,
+ const UserPerm *perms);
+int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_statx *stx, int mask, const UserPerm *perms);
+int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags,
+ struct Fh **fh, const UserPerm *perms);
+off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ off_t offset, int whence);
+int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, char* buf);
+int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
+ int syncdataonly);
+int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in,
+ int syncdataonly);
+int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh,
+ int mode, int64_t offset, int64_t length);
+int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle,
+ int64_t off, uint64_t len, const char *data);
+int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off);
+int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh,
+ const struct iovec *iov, int iovcnt, int64_t off);
+int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle);
+int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode);
+/**
+ * Get xattr value by xattr name.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param in file handle
+ * @param name name of attribute
+ * @param value pointer to begin buffer
+ * @param size buffer size
+ * @param perms pointer to UserPerms object
+ * @returns size of returned buffer. Negative number in error case
+ */
+int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, void *value, size_t size,
+ const UserPerm *perms);
+int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const void *value, size_t size,
+ int flags, const UserPerm *perms);
+int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in,
+ char *list, size_t buf_size, size_t *list_size,
+ const UserPerm *perms);
+int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, int oflags, Inode **outp,
+ Fh **fhp, struct ceph_statx *stx, unsigned want,
+ unsigned lflags, const UserPerm *perms);
+int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, dev_t rdev, Inode **out,
+ struct ceph_statx *stx, unsigned want, unsigned flags,
+ const UserPerm *perms);
+int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent,
+ const char *name, mode_t mode, Inode **out,
+ struct ceph_statx *stx, unsigned want,
+ unsigned flags, const UserPerm *perms);
+int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in,
+ struct Inode *newparent, const char *name,
+ const UserPerm *perms);
+int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in,
+ struct ceph_dir_result **dirpp, const UserPerm *perms);
+int ceph_ll_releasedir(struct ceph_mount_info *cmount,
+ struct ceph_dir_result* dir);
+int ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent,
+ const char *name, struct Inode *newparent,
+ const char *newname, const UserPerm *perms);
+int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in,
+ struct statvfs *stbuf);
+int ceph_ll_readlink(struct ceph_mount_info *cmount, struct Inode *in,
+ char *buf, size_t bufsize, const UserPerm *perms);
+int ceph_ll_symlink(struct ceph_mount_info *cmount,
+ Inode *in, const char *name, const char *value,
+ Inode **out, struct ceph_statx *stx,
+ unsigned want, unsigned flags,
+ const UserPerm *perms);
+int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in,
+ const char *name, const UserPerm *perms);
+uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount,
+ struct Inode *in);
+uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ struct ceph_file_layout *layout);
+uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount,
+ struct Inode *in);
+int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount,
+ struct Inode *in,
+ uint64_t blockno,
+ struct ceph_file_layout* layout);
+int ceph_ll_num_osds(struct ceph_mount_info *cmount);
+int ceph_ll_osdaddr(struct ceph_mount_info *cmount,
+ int osd, uint32_t *addr);
+uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockno);
+int ceph_ll_read_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* bl, uint64_t offset, uint64_t length,
+ struct ceph_file_layout* layout);
+int ceph_ll_write_block(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t blockid,
+ char* buf, uint64_t offset,
+ uint64_t length, struct ceph_file_layout* layout,
+ uint64_t snapseq, uint32_t sync);
+int ceph_ll_commit_blocks(struct ceph_mount_info *cmount,
+ struct Inode *in, uint64_t offset, uint64_t range);
+
+
+int ceph_ll_getlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner);
+int ceph_ll_setlk(struct ceph_mount_info *cmount,
+ Fh *fh, struct flock *fl, uint64_t owner, int sleep);
+
+int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable);
+
+/*
+ * Delegation support
+ *
+ * Delegations are a way for an application to request exclusive or
+ * semi-exclusive access to an Inode. The client requests the delegation and
+ * if it's successful it can reliably cache file data and metadata until the
+ * delegation is recalled.
+ *
+ * Recalls are issued via a callback function, provided by the application.
+ * Callback functions should act something like signal handlers. You want to
+ * do as little as possible in the callback. Any major work should be deferred
+ * in some fashion as it's difficult to predict the context in which this
+ * function will be called.
+ *
+ * Once the delegation has been recalled, the application should return it as
+ * soon as possible. The application has client_deleg_timeout seconds to
+ * return it, after which the cmount structure is forcibly unmounted and
+ * further calls into it fail.
+ *
+ * The application can set the client_deleg_timeout config option to suit its
+ * needs, but it should take care to choose a value that allows it to avoid
+ * forcible eviction from the cluster in the event of an application bug.
+ */
+
+/* Commands for manipulating delegation state */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE 0
+# define CEPH_DELEGATION_RD 1
+# define CEPH_DELEGATION_WR 2
+#endif
+
+/**
+ * Get the amount of time that the client has to return caps
+ * @param cmount the ceph mount handle to use.
+ *
+ * In the event that a client does not return its caps, the MDS may blocklist
+ * it after this timeout. Applications should check this value and ensure
+ * that they set the delegation timeout to a value lower than this.
+ *
+ * This call returns the cap return timeout (in seconds) for this cmount, or
+ * zero if it's not mounted.
+ */
+uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount);
+
+/**
+ * Set the delegation timeout for the mount (thereby enabling delegations)
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the delegation timeout (in seconds)
+ *
+ * Since the client could end up blocklisted if it doesn't return delegations
+ * in time, we mandate that any application wanting to use delegations
+ * explicitly set the timeout beforehand. Until this call is done on the
+ * mount, attempts to set a delegation will return -ETIME.
+ *
+ * Once a delegation is recalled, if it is not returned in this amount of
+ * time, the cmount will be forcibly unmounted and further access attempts
+ * will fail (usually with -ENOTCONN errors).
+ *
+ * This value is further vetted against the cap return timeout, and this call
+ * can fail with -EINVAL if the timeout value is too long. Delegations can be
+ * disabled again by setting the timeout to 0.
+ */
+int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Request a delegation on an open Fh
+ * @param cmount the ceph mount handle to use.
+ * @param fh file handle
+ * @param cmd CEPH_DELEGATION_* command
+ * @param cb callback function for recalling delegation
+ * @param priv opaque token passed back during recalls
+ *
+ * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict
+ * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM,
+ * -ETIME)
+ */
+int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+ unsigned int cmd, ceph_deleg_cb_t cb, void *priv);
+
+mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode);
+
+/* state reclaim */
+#define CEPH_RECLAIM_RESET 1
+
+/**
+ * Set ceph client uuid
+ * @param cmount the ceph mount handle to use.
+ * @param uuid the uuid to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid);
+
+/**
+ * Set ceph client session timeout
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the timeout to set
+ *
+ * Must be called before mount.
+ */
+void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout);
+
+/**
+ * Start to reclaim states of other client
+ * @param cmount the ceph mount handle to use.
+ * @param uuid uuid of client whose states need to be reclaimed
+ * @param flags flags that control how states get reclaimed
+ *
+ * Returns 0 success, -EOPNOTSUPP if mds does not support the operation,
+ * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client
+ * with the given uuid, -ENOTRECOVERABLE in all other error cases.
+ */
+int ceph_start_reclaim(struct ceph_mount_info *cmount,
+ const char *uuid, unsigned flags);
+
+/**
+ * Finish reclaiming states of another client.
+ * @param cmount the ceph mount handle to use.
+ */
+void ceph_finish_reclaim(struct ceph_mount_info *cmount);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ *
+ * This is deprecated, use ceph_ll_register_callbacks2() instead.
+ *
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ */
+void ceph_ll_register_callbacks(struct ceph_mount_info *cmount,
+ struct ceph_client_callback_args *args);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ *
+ * Returns 0 on success or -EBUSY if the cmount is mounting or already mounted.
+ */
+int ceph_ll_register_callbacks2(struct ceph_mount_info *cmount,
+ struct ceph_client_callback_args *args);
+
+/**
+ * Get snapshot info
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the snapshot. This must be either an
+ * absolute path or a relative path off of the current working directory.
+ * @param snap_info the snapshot info struct to be filled in.
+ * @returns 0 on success or a negative return code on error.
+ */
+int ceph_get_snap_info(struct ceph_mount_info *cmount,
+ const char *path, struct snap_info *snap_info);
+
+/**
+ * Free snapshot info buffers
+ *
+ * @param snap_info snapshot info struct (fetched via call to ceph_get_snap_info()).
+ */
+void ceph_free_snap_info_buffer(struct snap_info *snap_info);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/cephfs/metrics/Types.h b/src/include/cephfs/metrics/Types.h
new file mode 100644
index 000000000..d7cf56138
--- /dev/null
+++ b/src/include/cephfs/metrics/Types.h
@@ -0,0 +1,699 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
+#define CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
+
+#include <string>
+#include <boost/variant.hpp>
+
+#include "common/Formatter.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/int_types.h"
+#include "include/stringify.h"
+#include "include/utime.h"
+
+namespace ceph { class Formatter; }
+
+enum ClientMetricType {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_OPENED_FILES,
+ CLIENT_METRIC_TYPE_PINNED_ICAPS,
+ CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_READ_IO_SIZES,
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+};
+inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) {
+ switch(type) {
+ case ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO:
+ os << "CAP_INFO";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY:
+ os << "READ_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY:
+ os << "WRITE_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY:
+ os << "METADATA_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE:
+ os << "DENTRY_LEASE";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES:
+ os << "OPENED_FILES";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS:
+ os << "PINNED_ICAPS";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES:
+ os << "OPENED_INODES";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES:
+ os << "READ_IO_SIZES";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES:
+ os << "WRITE_IO_SIZES";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_AVG_READ_LATENCY:
+ os << "AVG_READ_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_READ_LATENCY:
+ os << "STDEV_READ_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY:
+ os << "AVG_WRITE_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY:
+ os << "STDEV_WRITE_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY:
+ os << "AVG_METADATA_LATENCY";
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY:
+ os << "STDEV_METADATA_LATENCY";
+ break;
+ default:
+ os << "(UNKNOWN:" << static_cast<std::underlying_type<ClientMetricType>::type>(type) << ")";
+ break;
+ }
+
+ return os;
+}
+
+struct ClientMetricPayloadBase {
+ ClientMetricPayloadBase(ClientMetricType type) : metric_type(type) {}
+
+ ClientMetricType get_type() const {
+ return metric_type;
+ }
+
+ void print_type(std::ostream *out) const {
+ *out << metric_type;
+ }
+
+ private:
+ ClientMetricType metric_type;
+};
+
+struct CapInfoPayload : public ClientMetricPayloadBase {
+ uint64_t cap_hits = 0;
+ uint64_t cap_misses = 0;
+ uint64_t nr_caps = 0;
+
+ CapInfoPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO) { }
+ CapInfoPayload(uint64_t cap_hits, uint64_t cap_misses, uint64_t nr_caps)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO),
+ cap_hits(cap_hits), cap_misses(cap_misses), nr_caps(nr_caps) {
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(cap_hits, bl);
+ encode(cap_misses, bl);
+ encode(nr_caps, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(cap_hits, iter);
+ decode(cap_misses, iter);
+ decode(nr_caps, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("cap_hits", cap_hits);
+ f->dump_int("cap_misses", cap_misses);
+ f->dump_int("num_caps", nr_caps);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "cap_hits: " << cap_hits << " "
+ << "cap_misses: " << cap_misses << " "
+ << "num_caps: " << nr_caps;
+ }
+};
+
+struct ReadLatencyPayload : public ClientMetricPayloadBase {
+ utime_t lat;
+ utime_t mean;
+ uint64_t sq_sum; // sum of squares
+ uint64_t count; // IO count
+
+ ReadLatencyPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY) { }
+ ReadLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY),
+ lat(lat),
+ mean(mean),
+ sq_sum(sq_sum),
+ count(count) {
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(2, 1, bl);
+ encode(lat, bl);
+ encode(mean, bl);
+ encode(sq_sum, bl);
+ encode(count, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(2, iter);
+ decode(lat, iter);
+ if (struct_v >= 2) {
+ decode(mean, iter);
+ decode(sq_sum, iter);
+ decode(count, iter);
+ }
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("latency", lat);
+ f->dump_int("avg_latency", mean);
+ f->dump_unsigned("sq_sum", sq_sum);
+ f->dump_unsigned("count", count);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "latency: " << lat << ", avg_latency: " << mean
+ << ", sq_sum: " << sq_sum << ", count=" << count;
+ }
+};
+
+struct WriteLatencyPayload : public ClientMetricPayloadBase {
+ utime_t lat;
+ utime_t mean;
+ uint64_t sq_sum; // sum of squares
+ uint64_t count; // IO count
+
+ WriteLatencyPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY) { }
+ WriteLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY),
+ lat(lat),
+ mean(mean),
+ sq_sum(sq_sum),
+ count(count){
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(2, 1, bl);
+ encode(lat, bl);
+ encode(mean, bl);
+ encode(sq_sum, bl);
+ encode(count, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(2, iter);
+ decode(lat, iter);
+ if (struct_v >= 2) {
+ decode(mean, iter);
+ decode(sq_sum, iter);
+ decode(count, iter);
+ }
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("latency", lat);
+ f->dump_int("avg_latency", mean);
+ f->dump_unsigned("sq_sum", sq_sum);
+ f->dump_unsigned("count", count);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "latency: " << lat << ", avg_latency: " << mean
+ << ", sq_sum: " << sq_sum << ", count=" << count;
+ }
+};
+
+struct MetadataLatencyPayload : public ClientMetricPayloadBase {
+ utime_t lat;
+ utime_t mean;
+ uint64_t sq_sum; // sum of squares
+ uint64_t count; // IO count
+
+ MetadataLatencyPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { }
+ MetadataLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY),
+ lat(lat),
+ mean(mean),
+ sq_sum(sq_sum),
+ count(count) {
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(2, 1, bl);
+ encode(lat, bl);
+ encode(mean, bl);
+ encode(sq_sum, bl);
+ encode(count, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(2, iter);
+ decode(lat, iter);
+ if (struct_v >= 2) {
+ decode(mean, iter);
+ decode(sq_sum, iter);
+ decode(count, iter);
+ }
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("latency", lat);
+ f->dump_int("avg_latency", mean);
+ f->dump_unsigned("sq_sum", sq_sum);
+ f->dump_unsigned("count", count);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "latency: " << lat << ", avg_latency: " << mean
+ << ", sq_sum: " << sq_sum << ", count=" << count;
+ }
+};
+
+struct DentryLeasePayload : public ClientMetricPayloadBase {
+ uint64_t dlease_hits = 0;
+ uint64_t dlease_misses = 0;
+ uint64_t nr_dentries = 0;
+
+ DentryLeasePayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE) { }
+ DentryLeasePayload(uint64_t dlease_hits, uint64_t dlease_misses, uint64_t nr_dentries)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE),
+ dlease_hits(dlease_hits), dlease_misses(dlease_misses), nr_dentries(nr_dentries) { }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(dlease_hits, bl);
+ encode(dlease_misses, bl);
+ encode(nr_dentries, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(dlease_hits, iter);
+ decode(dlease_misses, iter);
+ decode(nr_dentries, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("dlease_hits", dlease_hits);
+ f->dump_int("dlease_misses", dlease_misses);
+ f->dump_int("num_dentries", nr_dentries);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "dlease_hits: " << dlease_hits << " "
+ << "dlease_misses: " << dlease_misses << " "
+ << "num_dentries: " << nr_dentries;
+ }
+};
+
+struct OpenedFilesPayload : public ClientMetricPayloadBase {
+ uint64_t opened_files = 0;
+ uint64_t total_inodes = 0;
+
+ OpenedFilesPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES) { }
+ OpenedFilesPayload(uint64_t opened_files, uint64_t total_inodes)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES),
+ opened_files(opened_files), total_inodes(total_inodes) { }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(opened_files, bl);
+ encode(total_inodes, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(opened_files, iter);
+ decode(total_inodes, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("opened_files", opened_files);
+ f->dump_int("total_inodes", total_inodes);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "opened_files: " << opened_files << " "
+ << "total_inodes: " << total_inodes;
+ }
+};
+
+struct PinnedIcapsPayload : public ClientMetricPayloadBase {
+ uint64_t pinned_icaps = 0;
+ uint64_t total_inodes = 0;
+
+ PinnedIcapsPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS) { }
+ PinnedIcapsPayload(uint64_t pinned_icaps, uint64_t total_inodes)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS),
+ pinned_icaps(pinned_icaps), total_inodes(total_inodes) { }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(pinned_icaps, bl);
+ encode(total_inodes, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(pinned_icaps, iter);
+ decode(total_inodes, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("pinned_icaps", pinned_icaps);
+ f->dump_int("total_inodes", total_inodes);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "pinned_icaps: " << pinned_icaps << " "
+ << "total_inodes: " << total_inodes;
+ }
+};
+
+struct OpenedInodesPayload : public ClientMetricPayloadBase {
+ uint64_t opened_inodes = 0;
+ uint64_t total_inodes = 0;
+
+ OpenedInodesPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES) { }
+ OpenedInodesPayload(uint64_t opened_inodes, uint64_t total_inodes)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES),
+ opened_inodes(opened_inodes), total_inodes(total_inodes) { }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(opened_inodes, bl);
+ encode(total_inodes, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(opened_inodes, iter);
+ decode(total_inodes, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("opened_inodes", opened_inodes);
+ f->dump_int("total_inodes", total_inodes);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "opened_inodes: " << opened_inodes << " "
+ << "total_inodes: " << total_inodes;
+ }
+};
+
+struct ReadIoSizesPayload : public ClientMetricPayloadBase {
+ uint64_t total_ops = 0;
+ uint64_t total_size = 0;
+
+ ReadIoSizesPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES) { }
+ ReadIoSizesPayload(uint64_t total_ops, uint64_t total_size)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES),
+ total_ops(total_ops), total_size(total_size) { }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(total_ops, bl);
+ encode(total_size, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(total_ops, iter);
+ decode(total_size, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("total_ops", total_ops);
+ f->dump_int("total_size", total_size);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "total_ops: " << total_ops << " total_size: " << total_size;
+ }
+};
+
+struct WriteIoSizesPayload : public ClientMetricPayloadBase {
+ uint64_t total_ops = 0;
+ uint64_t total_size = 0;
+
+ WriteIoSizesPayload()
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES) { }
+ WriteIoSizesPayload(uint64_t total_ops, uint64_t total_size)
+ : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES),
+ total_ops(total_ops), total_size(total_size) {
+ }
+
+ void encode(bufferlist &bl) const {
+ using ceph::encode;
+ ENCODE_START(1, 1, bl);
+ encode(total_ops, bl);
+ encode(total_size, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(1, iter);
+ decode(total_ops, iter);
+ decode(total_size, iter);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_int("total_ops", total_ops);
+ f->dump_int("total_size", total_size);
+ }
+
+ void print(std::ostream *out) const {
+ *out << "total_ops: " << total_ops << " total_size: " << total_size;
+ }
+};
+
+struct UnknownPayload : public ClientMetricPayloadBase {
+ UnknownPayload()
+ : ClientMetricPayloadBase(static_cast<ClientMetricType>(-1)) { }
+ UnknownPayload(ClientMetricType metric_type)
+ : ClientMetricPayloadBase(metric_type) { }
+
+ void encode(bufferlist &bl) const {
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ DECODE_START(254, iter);
+ iter.seek(struct_len);
+ DECODE_FINISH(iter);
+ }
+
+ void dump(Formatter *f) const {
+ }
+
+ void print(std::ostream *out) const {
+ }
+};
+
+typedef boost::variant<CapInfoPayload,
+ ReadLatencyPayload,
+ WriteLatencyPayload,
+ MetadataLatencyPayload,
+ DentryLeasePayload,
+ OpenedFilesPayload,
+ PinnedIcapsPayload,
+ OpenedInodesPayload,
+ ReadIoSizesPayload,
+ WriteIoSizesPayload,
+ UnknownPayload> ClientMetricPayload;
+
+// metric update message sent by clients
+struct ClientMetricMessage {
+public:
+ ClientMetricMessage(const ClientMetricPayload &payload = UnknownPayload())
+ : payload(payload) {
+ }
+
+ class EncodePayloadVisitor : public boost::static_visitor<void> {
+ public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {
+ }
+
+ template <typename ClientMetricPayload>
+ inline void operator()(const ClientMetricPayload &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(payload.get_type()), m_bl);
+ payload.encode(m_bl);
+ }
+
+ private:
+ bufferlist &m_bl;
+ };
+
+ class DecodePayloadVisitor : public boost::static_visitor<void> {
+ public:
+ DecodePayloadVisitor(bufferlist::const_iterator &iter) : m_iter(iter) {
+ }
+
+ template <typename ClientMetricPayload>
+ inline void operator()(ClientMetricPayload &payload) const {
+ using ceph::decode;
+ payload.decode(m_iter);
+ }
+
+ private:
+ bufferlist::const_iterator &m_iter;
+ };
+
+ class DumpPayloadVisitor : public boost::static_visitor<void> {
+ public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {
+ }
+
+ template <typename ClientMetricPayload>
+ inline void operator()(const ClientMetricPayload &payload) const {
+ m_formatter->dump_string("client_metric_type", stringify(payload.get_type()));
+ payload.dump(m_formatter);
+ }
+
+ private:
+ Formatter *m_formatter;
+ };
+
+ class PrintPayloadVisitor : public boost::static_visitor<void> {
+ public:
+ explicit PrintPayloadVisitor(std::ostream *out) : _out(out) {
+ }
+
+ template <typename ClientMetricPayload>
+ inline void operator()(const ClientMetricPayload &payload) const {
+ *_out << "[client_metric_type: ";
+ payload.print_type(_out);
+ *_out << " ";
+ payload.print(_out);
+ *_out << "]";
+ }
+
+ private:
+ std::ostream *_out;
+ };
+
+ void encode(bufferlist &bl) const {
+ boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+ }
+
+ void decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+
+ uint32_t metric_type;
+ decode(metric_type, iter);
+
+ switch (metric_type) {
+ case ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO:
+ payload = CapInfoPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY:
+ payload = ReadLatencyPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY:
+ payload = WriteLatencyPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY:
+ payload = MetadataLatencyPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE:
+ payload = DentryLeasePayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES:
+ payload = OpenedFilesPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS:
+ payload = PinnedIcapsPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES:
+ payload = OpenedInodesPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES:
+ payload = ReadIoSizesPayload();
+ break;
+ case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES:
+ payload = WriteIoSizesPayload();
+ break;
+ default:
+ payload = UnknownPayload(static_cast<ClientMetricType>(metric_type));
+ break;
+ }
+
+ boost::apply_visitor(DecodePayloadVisitor(iter), payload);
+ }
+
+ void dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+ }
+
+ void print(std::ostream *out) const {
+ apply_visitor(PrintPayloadVisitor(out), payload);
+ }
+
+ ClientMetricPayload payload;
+};
+WRITE_CLASS_ENCODER(ClientMetricMessage);
+
+#endif // CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H
diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h
new file mode 100644
index 000000000..cca0a6193
--- /dev/null
+++ b/src/include/cephfs/types.h
@@ -0,0 +1,970 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_CEPHFS_TYPES_H
+#define CEPH_CEPHFS_TYPES_H
+#include "include/int_types.h"
+
+#include <ostream>
+#include <set>
+#include <map>
+#include <string_view>
+
+#include "common/config.h"
+#include "common/Clock.h"
+#include "common/DecayCounter.h"
+#include "common/StackStringStream.h"
+#include "common/entity_name.h"
+
+#include "include/compat.h"
+#include "include/Context.h"
+#include "include/frag.h"
+#include "include/xlist.h"
+#include "include/interval_set.h"
+#include "include/compact_set.h"
+#include "include/fs_types.h"
+#include "include/ceph_fs.h"
+
+#include "mds/inode_backtrace.h"
+
+#include <boost/spirit/include/qi.hpp>
+#include <boost/pool/pool.hpp>
+#include "include/ceph_assert.h"
+#include <boost/serialization/strong_typedef.hpp>
+#include "common/ceph_json.h"
+
+#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
+#define MAX_MDS 0x100
+
+BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
+extern const mds_gid_t MDS_GID_NONE;
+
+typedef int32_t fs_cluster_id_t;
+constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
+
+// The namespace ID of the anonymous default filesystem from legacy systems
+constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
+
+typedef int32_t mds_rank_t;
+constexpr mds_rank_t MDS_RANK_NONE = -1;
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST = -2;
+constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND = -3;
+
+struct scatter_info_t {
+ version_t version = 0;
+};
+
+struct frag_info_t : public scatter_info_t {
+ int64_t size() const { return nfiles + nsubdirs; }
+
+ void zero() {
+ *this = frag_info_t();
+ }
+
+ // *this += cur - acc;
+ void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
+ if (cur.mtime > mtime) {
+ mtime = cur.mtime;
+ if (touched_mtime)
+ *touched_mtime = true;
+ }
+ if (cur.change_attr > change_attr) {
+ change_attr = cur.change_attr;
+ if (touched_chattr)
+ *touched_chattr = true;
+ }
+ nfiles += cur.nfiles - acc.nfiles;
+ nsubdirs += cur.nsubdirs - acc.nsubdirs;
+ }
+
+ void add(const frag_info_t& other) {
+ if (other.mtime > mtime)
+ mtime = other.mtime;
+ if (other.change_attr > change_attr)
+ change_attr = other.change_attr;
+ nfiles += other.nfiles;
+ nsubdirs += other.nsubdirs;
+ }
+
+ bool same_sums(const frag_info_t &o) const {
+ return mtime <= o.mtime &&
+ nfiles == o.nfiles &&
+ nsubdirs == o.nsubdirs;
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<frag_info_t*>& ls);
+
+ // this frag
+ utime_t mtime;
+ uint64_t change_attr = 0;
+ int64_t nfiles = 0; // files
+ int64_t nsubdirs = 0; // subdirs
+};
+WRITE_CLASS_ENCODER(frag_info_t)
+
+inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
+ return memcmp(&l, &r, sizeof(l)) == 0;
+}
+inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
+ return !(l == r);
+}
+
+std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
+
+struct nest_info_t : public scatter_info_t {
+ int64_t rsize() const { return rfiles + rsubdirs; }
+
+ void zero() {
+ *this = nest_info_t();
+ }
+
+ void sub(const nest_info_t &other) {
+ add(other, -1);
+ }
+ void add(const nest_info_t &other, int fac=1) {
+ if (other.rctime > rctime)
+ rctime = other.rctime;
+ rbytes += fac*other.rbytes;
+ rfiles += fac*other.rfiles;
+ rsubdirs += fac*other.rsubdirs;
+ rsnaps += fac*other.rsnaps;
+ }
+
+ // *this += cur - acc;
+ void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
+ if (cur.rctime > rctime)
+ rctime = cur.rctime;
+ rbytes += cur.rbytes - acc.rbytes;
+ rfiles += cur.rfiles - acc.rfiles;
+ rsubdirs += cur.rsubdirs - acc.rsubdirs;
+ rsnaps += cur.rsnaps - acc.rsnaps;
+ }
+
+ bool same_sums(const nest_info_t &o) const {
+ return rctime <= o.rctime &&
+ rbytes == o.rbytes &&
+ rfiles == o.rfiles &&
+ rsubdirs == o.rsubdirs &&
+ rsnaps == o.rsnaps;
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<nest_info_t*>& ls);
+
+ // this frag + children
+ utime_t rctime;
+ int64_t rbytes = 0;
+ int64_t rfiles = 0;
+ int64_t rsubdirs = 0;
+ int64_t rsnaps = 0;
+};
+WRITE_CLASS_ENCODER(nest_info_t)
+
+inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
+ return memcmp(&l, &r, sizeof(l)) == 0;
+}
+inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
+ return !(l == r);
+}
+
+std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
+
+struct vinodeno_t {
+ vinodeno_t() {}
+ vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ encode(ino, bl);
+ encode(snapid, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(ino, p);
+ decode(snapid, p);
+ }
+
+ inodeno_t ino;
+ snapid_t snapid;
+};
+WRITE_CLASS_ENCODER(vinodeno_t)
+
+inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
+ return l.ino == r.ino && l.snapid == r.snapid;
+}
+inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
+ return !(l == r);
+}
+inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
+ return
+ l.ino < r.ino ||
+ (l.ino == r.ino && l.snapid < r.snapid);
+}
+
+typedef enum {
+ QUOTA_MAX_FILES,
+ QUOTA_MAX_BYTES,
+ QUOTA_ANY
+} quota_max_t;
+
+struct quota_info_t
+{
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(max_bytes, bl);
+ encode(max_files, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
+ decode(max_bytes, p);
+ decode(max_files, p);
+ DECODE_FINISH(p);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<quota_info_t *>& ls);
+
+ bool is_valid() const {
+ return max_bytes >=0 && max_files >=0;
+ }
+ bool is_enabled(quota_max_t type=QUOTA_ANY) const {
+ switch (type) {
+ case QUOTA_MAX_FILES:
+ return !!max_files;
+ case QUOTA_MAX_BYTES:
+ return !!max_bytes;
+ case QUOTA_ANY:
+ default:
+ return !!max_bytes || !!max_files;
+ }
+ }
+ void decode_json(JSONObj *obj);
+
+ int64_t max_bytes = 0;
+ int64_t max_files = 0;
+};
+WRITE_CLASS_ENCODER(quota_info_t)
+
+inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
+ return memcmp(&l, &r, sizeof(l)) == 0;
+}
+
+std::ostream& operator<<(std::ostream &out, const quota_info_t &n);
+
+struct client_writeable_range_t {
+ struct byte_range_t {
+ uint64_t first = 0, last = 0; // interval client can write to
+ byte_range_t() {}
+ void decode_json(JSONObj *obj);
+ };
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
+
+ byte_range_t range;
+ snapid_t follows = 0; // aka "data+metadata flushed thru"
+};
+
+inline void decode(client_writeable_range_t::byte_range_t& range, ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ decode(range.first, bl);
+ decode(range.last, bl);
+}
+
+WRITE_CLASS_ENCODER(client_writeable_range_t)
+
+std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
+
+inline bool operator==(const client_writeable_range_t& l,
+ const client_writeable_range_t& r) {
+ return l.range.first == r.range.first && l.range.last == r.range.last &&
+ l.follows == r.follows;
+}
+
+struct inline_data_t {
+public:
+ inline_data_t() {}
+ inline_data_t(const inline_data_t& o) : version(o.version) {
+ if (o.blp)
+ set_data(*o.blp);
+ }
+ inline_data_t& operator=(const inline_data_t& o) {
+ version = o.version;
+ if (o.blp)
+ set_data(*o.blp);
+ else
+ free_data();
+ return *this;
+ }
+
+ void free_data() {
+ blp.reset();
+ }
+ void get_data(ceph::buffer::list& ret) const {
+ if (blp)
+ ret = *blp;
+ else
+ ret.clear();
+ }
+ void set_data(const ceph::buffer::list& bl) {
+ if (!blp)
+ blp.reset(new ceph::buffer::list);
+ *blp = bl;
+ }
+ size_t length() const { return blp ? blp->length() : 0; }
+
+ bool operator==(const inline_data_t& o) const {
+ return length() == o.length() &&
+ (length() == 0 ||
+ (*const_cast<ceph::buffer::list*>(blp.get()) == *const_cast<ceph::buffer::list*>(o.blp.get())));
+ }
+ bool operator!=(const inline_data_t& o) const {
+ return !(*this == o);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+
+ version_t version = 1;
+
+private:
+ std::unique_ptr<ceph::buffer::list> blp;
+};
+WRITE_CLASS_ENCODER(inline_data_t)
+
+enum {
+ DAMAGE_STATS, // statistics (dirstat, size, etc)
+ DAMAGE_RSTATS, // recursive statistics (rstat, accounted_rstat)
+ DAMAGE_FRAGTREE // fragtree -- repair by searching
+};
+
+template<template<typename> class Allocator = std::allocator>
+struct inode_t {
+ /**
+ * ***************
+ * Do not forget to add any new fields to the compare() function.
+ * ***************
+ */
+ using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
+
+ inode_t()
+ {
+ clear_layout();
+ }
+
+ // file type
+ bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
+ bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
+ bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
+
+ bool is_truncating() const { return (truncate_pending > 0); }
+ void truncate(uint64_t old_size, uint64_t new_size, const bufferlist &fbl) {
+ truncate(old_size, new_size);
+ fscrypt_last_block = fbl;
+ }
+ void truncate(uint64_t old_size, uint64_t new_size) {
+ ceph_assert(new_size <= old_size);
+ if (old_size > max_size_ever)
+ max_size_ever = old_size;
+ truncate_from = old_size;
+ size = new_size;
+ rstat.rbytes = new_size;
+ truncate_size = size;
+ truncate_seq++;
+ truncate_pending++;
+ }
+
+ bool has_layout() const {
+ return layout != file_layout_t();
+ }
+
+ void clear_layout() {
+ layout = file_layout_t();
+ }
+
+ uint64_t get_layout_size_increment() const {
+ return layout.get_period();
+ }
+
+ bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
+
+ uint64_t get_client_range(client_t client) const {
+ auto it = client_ranges.find(client);
+ return it != client_ranges.end() ? it->second.range.last : 0;
+ }
+
+ uint64_t get_max_size() const {
+ uint64_t max = 0;
+ for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
+ p != client_ranges.end();
+ ++p)
+ if (p->second.range.last > max)
+ max = p->second.range.last;
+ return max;
+ }
+ void set_max_size(uint64_t new_max) {
+ if (new_max == 0) {
+ client_ranges.clear();
+ } else {
+ for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
+ p != client_ranges.end();
+ ++p)
+ p->second.range.last = new_max;
+ }
+ }
+
+ void trim_client_ranges(snapid_t last) {
+ std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
+ while (p != client_ranges.end()) {
+ if (p->second.follows >= last)
+ client_ranges.erase(p++);
+ else
+ ++p;
+ }
+ }
+
+ bool is_backtrace_updated() const {
+ return backtrace_version == version;
+ }
+ void update_backtrace(version_t pv=0) {
+ backtrace_version = pv ? pv : version;
+ }
+
+ void add_old_pool(int64_t l) {
+ backtrace_version = version;
+ old_pools.insert(l);
+ }
+
+  void encode(ceph::buffer::list &bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  // JSONDecoder callbacks for the container-valued fields below.
+  static void client_ranges_cb(client_range_map& c, JSONObj *obj);
+  static void old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj);
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<inode_t*>& ls);
+  /**
+   * Compare this inode_t with another that represent *the same inode*
+   * at different points in time.
+   * @pre The inodes are the same ino
+   *
+   * @param other The inode_t to compare ourselves with
+   * @param divergent A bool pointer which will be set to true
+   * if the values are different in a way that can't be explained
+   * by one being a newer version than the other.
+   *
+   * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
+   */
+  int compare(const inode_t &other, bool *divergent) const;
+
+  // base (immutable)
+  inodeno_t ino = 0;
+  uint32_t rdev = 0;              // device id, if special file
+
+  // affected by any inode change...
+  utime_t ctime;                  // inode change time
+  utime_t btime;                  // birth time
+
+  // perm (namespace permissions)
+  uint32_t mode = 0;
+  uid_t uid = 0;
+  gid_t gid = 0;
+
+  // nlink
+  int32_t nlink = 0;
+
+  // file (data access)
+  ceph_dir_layout dir_layout = {};  // [dir only]
+  file_layout_t layout;
+  // data pools the backtrace previously lived in; see add_old_pool()
+  compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
+  uint64_t size = 0;              // on directory, # dentries
+  uint64_t max_size_ever = 0;     // max size the file has ever been
+  uint32_t truncate_seq = 0;
+  uint64_t truncate_size = 0, truncate_from = 0;
+  uint32_t truncate_pending = 0;
+  utime_t mtime;                  // file data modify time.
+  utime_t atime;                  // file data access time.
+  uint32_t time_warp_seq = 0;     // count of (potential) mtime/atime timewarps (i.e., utimes())
+  inline_data_t inline_data;      // FIXME check
+
+  // change attribute
+  uint64_t change_attr = 0;
+
+  client_range_map client_ranges;  // client(s) can write to these ranges
+
+  // dirfrag, recursive accounting
+  frag_info_t dirstat;            // protected by my filelock
+  nest_info_t rstat;              // protected by my nestlock
+  nest_info_t accounted_rstat;    // protected by parent's nestlock
+
+  quota_info_t quota;
+
+  mds_rank_t export_pin = MDS_RANK_NONE;
+
+  double export_ephemeral_random_pin = 0;
+  bool export_ephemeral_distributed_pin = false;
+
+  // special stuff
+  version_t version = 0;          // auth only
+  version_t file_data_version = 0; // auth only
+  version_t xattr_version = 0;
+
+  utime_t last_scrub_stamp;       // start time of last complete scrub
+  version_t last_scrub_version = 0;// (parent) start version of last complete scrub
+
+  // inode version at which the backtrace was last written; see
+  // is_backtrace_updated()/update_backtrace()
+  version_t backtrace_version = 0;
+
+  snapid_t oldest_snap;
+
+  std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
+
+  // fscrypt metadata blobs -- presumably opaque encryption context / key
+  // material; NOTE(review): verify semantics against the fscrypt code
+  std::vector<uint8_t> fscrypt_auth;
+  std::vector<uint8_t> fscrypt_file;
+
+  bufferlist fscrypt_last_block;
+
+private:
+  bool older_is_consistent(const inode_t &other) const;
+};
+
+// These methods may be moved back to mdstypes.cc when we have pmr
+template<template<typename> class Allocator>
+void inode_t<Allocator>::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+  // Wire version 19, decodable by anything understanding >= 6.  The field
+  // order below *is* the on-wire layout: never reorder; new fields are
+  // appended and the version bumped (mirrored in decode() below).
+  ENCODE_START(19, 6, bl);
+
+  encode(ino, bl);
+  encode(rdev, bl);
+  encode(ctime, bl);
+
+  encode(mode, bl);
+  encode(uid, bl);
+  encode(gid, bl);
+
+  encode(nlink, bl);
+  {
+    // removed field ("anchored"); still encode a placeholder so the
+    // layout remains compatible with old decoders
+    bool anchored = 0;
+    encode(anchored, bl);
+  }
+
+  encode(dir_layout, bl);
+  encode(layout, bl, features);  // feature-dependent encoding
+  encode(size, bl);
+  encode(truncate_seq, bl);
+  encode(truncate_size, bl);
+  encode(truncate_from, bl);
+  encode(truncate_pending, bl);
+  encode(mtime, bl);
+  encode(atime, bl);
+  encode(time_warp_seq, bl);
+  encode(client_ranges, bl);
+
+  encode(dirstat, bl);
+  encode(rstat, bl);
+  encode(accounted_rstat, bl);
+
+  encode(version, bl);
+  encode(file_data_version, bl);
+  encode(xattr_version, bl);
+  encode(backtrace_version, bl);
+  encode(old_pools, bl);
+  encode(max_size_ever, bl);
+  encode(inline_data, bl);
+  encode(quota, bl);
+
+  encode(stray_prior_path, bl);
+
+  encode(last_scrub_version, bl);
+  encode(last_scrub_stamp, bl);
+
+  encode(btime, bl);
+  encode(change_attr, bl);
+
+  encode(export_pin, bl);
+
+  encode(export_ephemeral_random_pin, bl);
+  encode(export_ephemeral_distributed_pin, bl);
+
+  // v17 bool flag; the decoder reads and discards it
+  encode(!fscrypt_auth.empty(), bl);
+  encode(fscrypt_auth, bl);
+  encode(fscrypt_file, bl);
+  encode(fscrypt_last_block, bl);
+  ENCODE_FINISH(bl);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode(ceph::buffer::list::const_iterator &p)
+{
+  // Accepts encodings from struct_v 6 (legacy length handling) up to 19.
+  // Every versioned field supplies an explicit default when absent so an
+  // inode decoded from an old encoding is fully initialized.
+  DECODE_START_LEGACY_COMPAT_LEN(19, 6, 6, p);
+
+  decode(ino, p);
+  decode(rdev, p);
+  decode(ctime, p);
+
+  decode(mode, p);
+  decode(uid, p);
+  decode(gid, p);
+
+  decode(nlink, p);
+  {
+    // removed "anchored" field; read and discard the placeholder
+    bool anchored;
+    decode(anchored, p);
+  }
+
+  // v4 added dir_layout
+  if (struct_v >= 4)
+    decode(dir_layout, p);
+  else {
+    // FIPS zeroization audit 20191117: this memset is not security related.
+    memset(&dir_layout, 0, sizeof(dir_layout));
+  }
+  decode(layout, p);
+  decode(size, p);
+  decode(truncate_seq, p);
+  decode(truncate_size, p);
+  decode(truncate_from, p);
+  if (struct_v >= 5)
+    decode(truncate_pending, p);
+  else
+    truncate_pending = 0;
+  decode(mtime, p);
+  decode(atime, p);
+  decode(time_warp_seq, p);
+  if (struct_v >= 3) {
+    decode(client_ranges, p);
+  } else {
+    // pre-v3 encoded bare byte ranges; wrap them in
+    // client_writeable_range_t (follows defaults to 0)
+    std::map<client_t, client_writeable_range_t::byte_range_t> m;
+    decode(m, p);
+    for (auto q = m.begin(); q != m.end(); ++q)
+      client_ranges[q->first].range = q->second;
+  }
+
+  decode(dirstat, p);
+  decode(rstat, p);
+  decode(accounted_rstat, p);
+
+  decode(version, p);
+  decode(file_data_version, p);
+  decode(xattr_version, p);
+  if (struct_v >= 2)
+    decode(backtrace_version, p);
+  if (struct_v >= 7)
+    decode(old_pools, p);
+  if (struct_v >= 8)
+    decode(max_size_ever, p);
+  if (struct_v >= 9) {
+    decode(inline_data, p);
+  } else {
+    inline_data.version = CEPH_INLINE_NONE;
+  }
+  if (struct_v < 10)
+    backtrace_version = 0; // force update backtrace
+  if (struct_v >= 11)
+    decode(quota, p);
+
+  if (struct_v >= 12) {
+    // decode into a std::string, then copy into the allocator-aware
+    // string member via string_view
+    std::string tmp;
+    decode(tmp, p);
+    stray_prior_path = std::string_view(tmp);
+  }
+
+  if (struct_v >= 13) {
+    decode(last_scrub_version, p);
+    decode(last_scrub_stamp, p);
+  }
+  if (struct_v >= 14) {
+    decode(btime, p);
+    decode(change_attr, p);
+  } else {
+    btime = utime_t();
+    change_attr = 0;
+  }
+
+  if (struct_v >= 15) {
+    decode(export_pin, p);
+  } else {
+    export_pin = MDS_RANK_NONE;
+  }
+
+  if (struct_v >= 16) {
+    decode(export_ephemeral_random_pin, p);
+    decode(export_ephemeral_distributed_pin, p);
+  } else {
+    export_ephemeral_random_pin = 0;
+    export_ephemeral_distributed_pin = false;
+  }
+
+  if (struct_v >= 17) {
+    // historical "has fscrypt" bool; must still be consumed to keep the
+    // stream aligned
+    bool fscrypt_flag;
+    decode(fscrypt_flag, p); // ignored
+  }
+
+  if (struct_v >= 18) {
+    decode(fscrypt_auth, p);
+    decode(fscrypt_file, p);
+  }
+
+  if (struct_v >= 19) {
+    decode(fscrypt_last_block, p);
+  }
+  DECODE_FINISH(p);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::dump(ceph::Formatter *f) const
+{
+  // Emit the inode as structured output (JSON etc.) for admin/debug
+  // tooling; field names here match what decode_json() expects.
+  f->dump_unsigned("ino", ino);
+  f->dump_unsigned("rdev", rdev);
+  f->dump_stream("ctime") << ctime;
+  f->dump_stream("btime") << btime;
+  f->dump_unsigned("mode", mode);
+  f->dump_unsigned("uid", uid);
+  f->dump_unsigned("gid", gid);
+  f->dump_unsigned("nlink", nlink);
+
+  f->open_object_section("dir_layout");
+  ::dump(dir_layout, f);  // free-function dump for the plain C struct
+  f->close_section();
+
+  f->dump_object("layout", layout);
+
+  f->open_array_section("old_pools");
+  for (const auto &p : old_pools) {
+    f->dump_int("pool", p);
+  }
+  f->close_section();
+
+  f->dump_unsigned("size", size);
+  f->dump_unsigned("truncate_seq", truncate_seq);
+  f->dump_unsigned("truncate_size", truncate_size);
+  f->dump_unsigned("truncate_from", truncate_from);
+  f->dump_unsigned("truncate_pending", truncate_pending);
+  f->dump_stream("mtime") << mtime;
+  f->dump_stream("atime") << atime;
+  f->dump_unsigned("time_warp_seq", time_warp_seq);
+  f->dump_unsigned("change_attr", change_attr);
+  f->dump_int("export_pin", export_pin);
+  f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
+  f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
+
+  // one object per client holding a writeable range
+  f->open_array_section("client_ranges");
+  for (const auto &p : client_ranges) {
+    f->open_object_section("client");
+    f->dump_unsigned("client", p.first.v);
+    p.second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_object_section("dirstat");
+  dirstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("rstat");
+  rstat.dump(f);
+  f->close_section();
+
+  f->open_object_section("accounted_rstat");
+  accounted_rstat.dump(f);
+  f->close_section();
+
+  f->dump_unsigned("version", version);
+  f->dump_unsigned("file_data_version", file_data_version);
+  f->dump_unsigned("xattr_version", xattr_version);
+  f->dump_unsigned("backtrace_version", backtrace_version);
+
+  f->dump_string("stray_prior_path", stray_prior_path);
+  f->dump_unsigned("max_size_ever", max_size_ever);
+
+  f->open_object_section("quota");
+  quota.dump(f);
+  f->close_section();
+
+  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+  f->dump_unsigned("last_scrub_version", last_scrub_version);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::client_ranges_cb(typename inode_t<Allocator>::client_range_map& c, JSONObj *obj)
+{
+  // Decode one {client, byte range, follows} JSON entry and insert it
+  // into the client-range map.
+  int64_t client_id;
+  JSONDecoder::decode_json("client", client_id, obj, true);
+
+  client_writeable_range_t range_entry;
+  JSONDecoder::decode_json("byte range", range_entry.range, obj, true);
+  JSONDecoder::decode_json("follows", range_entry.follows.val, obj, true);
+
+  c[client_id] = range_entry;
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj)
+{
+  // Decode a single pool id from JSON and add it to the old-pools set.
+  int64_t pool_id;
+  decode_json_obj(pool_id, obj);
+  c.insert(pool_id);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::decode_json(JSONObj *obj)
+{
+  // Populate this inode from the JSON shape produced by dump().  The
+  // timestamp fields (ctime/btime/mtime/atime) are currently not decoded
+  // (see the commented-out lines); they keep their default values.
+  JSONDecoder::decode_json("ino", ino.val, obj, true);
+  JSONDecoder::decode_json("rdev", rdev, obj, true);
+  //JSONDecoder::decode_json("ctime", ctime, obj, true);
+  //JSONDecoder::decode_json("btime", btime, obj, true);
+  JSONDecoder::decode_json("mode", mode, obj, true);
+  JSONDecoder::decode_json("uid", uid, obj, true);
+  JSONDecoder::decode_json("gid", gid, obj, true);
+  JSONDecoder::decode_json("nlink", nlink, obj, true);
+  JSONDecoder::decode_json("dir_layout", dir_layout, obj, true);
+  JSONDecoder::decode_json("layout", layout, obj, true);
+  // container-valued fields use the static _cb helpers above
+  JSONDecoder::decode_json("old_pools", old_pools, inode_t<Allocator>::old_pools_cb, obj, true);
+  JSONDecoder::decode_json("size", size, obj, true);
+  JSONDecoder::decode_json("truncate_seq", truncate_seq, obj, true);
+  JSONDecoder::decode_json("truncate_size", truncate_size, obj, true);
+  JSONDecoder::decode_json("truncate_from", truncate_from, obj, true);
+  JSONDecoder::decode_json("truncate_pending", truncate_pending, obj, true);
+  //JSONDecoder::decode_json("mtime", mtime, obj, true);
+  //JSONDecoder::decode_json("atime", atime, obj, true);
+  JSONDecoder::decode_json("time_warp_seq", time_warp_seq, obj, true);
+  JSONDecoder::decode_json("change_attr", change_attr, obj, true);
+  JSONDecoder::decode_json("export_pin", export_pin, obj, true);
+  JSONDecoder::decode_json("client_ranges", client_ranges, inode_t<Allocator>::client_ranges_cb, obj, true);
+  JSONDecoder::decode_json("dirstat", dirstat, obj, true);
+  JSONDecoder::decode_json("rstat", rstat, obj, true);
+  JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true);
+  JSONDecoder::decode_json("version", version, obj, true);
+  JSONDecoder::decode_json("file_data_version", file_data_version, obj, true);
+  JSONDecoder::decode_json("xattr_version", xattr_version, obj, true);
+  JSONDecoder::decode_json("backtrace_version", backtrace_version, obj, true);
+  JSONDecoder::decode_json("stray_prior_path", stray_prior_path, obj, true);
+  JSONDecoder::decode_json("max_size_ever", max_size_ever, obj, true);
+  JSONDecoder::decode_json("quota", quota, obj, true);
+  JSONDecoder::decode_json("last_scrub_stamp", last_scrub_stamp, obj, true);
+  JSONDecoder::decode_json("last_scrub_version", last_scrub_version, obj, true);
+}
+
+template<template<typename> class Allocator>
+void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
+{
+  // A default-constructed inode plus one with a non-zero ino; other
+  // fields are left at their defaults.
+  ls.push_back(new inode_t<Allocator>);
+  auto *populated = new inode_t<Allocator>;
+  populated->ino = 1;
+  ls.push_back(populated);
+}
+
+template<template<typename> class Allocator>
+int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
+{
+  // Only meaningful for two point-in-time copies of the same inode.
+  ceph_assert(ino == other.ino);
+  *divergent = false;
+  if (version == other.version) {
+    // Same version: all replicated fields must match, otherwise the two
+    // copies have truly diverged.
+    if (rdev != other.rdev ||
+        ctime != other.ctime ||
+        btime != other.btime ||
+        mode != other.mode ||
+        uid != other.uid ||
+        gid != other.gid ||
+        nlink != other.nlink ||
+        memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
+        layout != other.layout ||
+        old_pools != other.old_pools ||
+        size != other.size ||
+        max_size_ever != other.max_size_ever ||
+        truncate_seq != other.truncate_seq ||
+        truncate_size != other.truncate_size ||
+        truncate_from != other.truncate_from ||
+        truncate_pending != other.truncate_pending ||
+        change_attr != other.change_attr ||
+        mtime != other.mtime ||
+        atime != other.atime ||
+        time_warp_seq != other.time_warp_seq ||
+        inline_data != other.inline_data ||
+        client_ranges != other.client_ranges ||
+        !(dirstat == other.dirstat) ||
+        !(rstat == other.rstat) ||
+        !(accounted_rstat == other.accounted_rstat) ||
+        file_data_version != other.file_data_version ||
+        xattr_version != other.xattr_version ||
+        backtrace_version != other.backtrace_version) {
+      *divergent = true;
+    }
+    return 0;
+  } else if (version > other.version) {
+    // We are newer; divergent unless the older copy is a plausible
+    // ancestor of ours (monotonic fields never moved backwards).
+    *divergent = !older_is_consistent(other);
+    return 1;
+  } else {
+    ceph_assert(version < other.version);
+    *divergent = !other.older_is_consistent(*this);
+    return -1;
+  }
+}
+
+template<template<typename> class Allocator>
+bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
+{
+  // 'other' is an older copy of this inode.  It is consistent with us
+  // only if none of these monotonically-increasing fields are larger in
+  // the older copy than in the newer one.
+  const bool regressed =
+    max_size_ever < other.max_size_ever ||
+    truncate_seq < other.truncate_seq ||
+    time_warp_seq < other.time_warp_seq ||
+    inline_data.version < other.inline_data.version ||
+    dirstat.version < other.dirstat.version ||
+    rstat.version < other.rstat.version ||
+    accounted_rstat.version < other.accounted_rstat.version ||
+    file_data_version < other.file_data_version ||
+    xattr_version < other.xattr_version ||
+    backtrace_version < other.backtrace_version;
+  return !regressed;
+}
+
+// Free encode/decode shims so inode_t participates in the generic
+// ceph::encode/ceph::decode machinery (the WRITE_CLASS_ENCODER macros do
+// not apply to a template class).
+template<template<typename> class Allocator>
+inline void encode(const inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features)
+{
+  ENCODE_DUMP_PRE();
+  c.encode(bl, features);
+  // 'cl' is only stringified as a dump-file tag inside ENCODE_DUMP_POST
+  // when ENCODE_DUMP is defined; it is not a variable reference here.
+  ENCODE_DUMP_POST(cl);
+}
+template<template<typename> class Allocator>
+inline void decode(inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p)
+{
+  c.decode(p);
+}
+
+// parse a map of keys/values.
+namespace qi = boost::spirit::qi;
+
+// Boost.Spirit grammar parsing "key1=value1 key2=value2 ..." (pairs
+// separated by single spaces) into a std::map<std::string, std::string>.
+template <typename Iterator>
+struct keys_and_values
+  : qi::grammar<Iterator, std::map<std::string, std::string>()>
+{
+  keys_and_values()
+    : keys_and_values::base_type(query)
+  {
+    query =  pair >> *(qi::lit(' ') >> pair);  // one or more pairs
+    pair  =  key >> '=' >> value;
+    key   =  qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");  // identifier-like
+    value = +qi::char_("a-zA-Z0-9-_.");  // letters, digits, '-', '_', '.'
+  }
+  qi::rule<Iterator, std::map<std::string, std::string>()> query;
+  qi::rule<Iterator, std::pair<std::string, std::string>()> pair;
+  qi::rule<Iterator, std::string()> key, value;
+};
+
+#endif
diff --git a/src/include/color.h b/src/include/color.h
new file mode 100644
index 000000000..6c8df40e0
--- /dev/null
+++ b/src/include/color.h
@@ -0,0 +1,13 @@
+#ifndef CEPH_COLOR_H
+#define CEPH_COLOR_H
+
+// ANSI SGR escape sequences used to colorize terminal output.
+#define TEXT_NORMAL "\033[0m"  /* reset all attributes */
+/*#define TEXT_HAZARD "\033[5;31m"*/  /* blinking red; deliberately disabled */
+#define TEXT_RED "\033[0;31m"
+#define TEXT_GREEN "\033[0;32m"
+#define TEXT_YELLOW "\033[0;33m"
+#define TEXT_BLUE "\033[0;34m"
+#define TEXT_MAGENTA "\033[0;35m"
+#define TEXT_CYAN "\033[0;36m"
+
+#endif
diff --git a/src/include/common_fwd.h b/src/include/common_fwd.h
new file mode 100644
index 000000000..d906aadfa
--- /dev/null
+++ b/src/include/common_fwd.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// Forward declarations of common Ceph types.  Under a seastar (crimson)
+// build without the alien bridge these live in namespace crimson;
+// otherwise in namespace ceph.
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#define TOPNSPC crimson
+#else
+#define TOPNSPC ceph
+#endif
+
+namespace TOPNSPC::common {
+  class CephContext;
+  class PerfCounters;
+  class PerfCountersBuilder;
+  class PerfCountersCollection;
+  class PerfCountersCollectionImpl;
+  class PerfGuard;
+  class RefCountedObject;
+  class RefCountedObjectSafe;
+  class RefCountedCond;
+  class RefCountedWaitObject;
+  class ConfigProxy;
+}
+// Hoist these into the global namespace so existing code can keep
+// referring to them unqualified regardless of the active TOPNSPC.
+using TOPNSPC::common::CephContext;
+using TOPNSPC::common::PerfCounters;
+using TOPNSPC::common::PerfCountersBuilder;
+using TOPNSPC::common::PerfCountersCollection;
+using TOPNSPC::common::PerfCountersCollectionImpl;
+using TOPNSPC::common::PerfGuard;
+using TOPNSPC::common::RefCountedObject;
+using TOPNSPC::common::RefCountedObjectSafe;
+using TOPNSPC::common::RefCountedCond;
+using TOPNSPC::common::RefCountedWaitObject;
+using TOPNSPC::common::ConfigProxy;
diff --git a/src/include/compact_map.h b/src/include/compact_map.h
new file mode 100644
index 000000000..21645e3d1
--- /dev/null
+++ b/src/include/compact_map.h
@@ -0,0 +1,383 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_MAP_H
+#define CEPH_COMPACT_MAP_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <map>
+#include <memory>
+#include <type_traits>
+
+#include "include/encoding.h"
+
+template <class Key, class T, class Map>
+class compact_map_base {
+protected:
+ std::unique_ptr<Map> map;
+ void alloc_internal() {
+ if (!map)
+ map.reset(new Map);
+ }
+ void free_internal() {
+ map.reset();
+ }
+ template <class It>
+ class const_iterator_base {
+ const compact_map_base *map;
+ It it;
+ const_iterator_base() : map(0) { }
+ const_iterator_base(const compact_map_base* m) : map(m) { }
+ const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { }
+ friend class compact_map_base;
+ friend class iterator_base;
+ public:
+ const_iterator_base(const const_iterator_base& o) {
+ map = o.map;
+ it = o.it;
+ }
+ bool operator==(const const_iterator_base& o) const {
+ return (map == o.map) && (!map->map || it == o.it);
+ }
+ bool operator!=(const const_iterator_base& o) const {
+ return !(*this == o);;
+ }
+ const_iterator_base& operator=(const const_iterator_base& o) {
+ map = o.map;
+ it = o.it;
+ return *this;
+ }
+ const_iterator_base& operator++() {
+ ++it;
+ return *this;
+ }
+ const_iterator_base& operator--() {
+ --it;
+ return *this;
+ }
+ const std::pair<const Key,T>& operator*() {
+ return *it;
+ }
+ const std::pair<const Key,T>* operator->() {
+ return it.operator->();
+ }
+ };
+ template <class It>
+ class iterator_base {
+ private:
+ const compact_map_base* map;
+ It it;
+ iterator_base() : map(0) { }
+ iterator_base(compact_map_base* m) : map(m) { }
+ iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { }
+ friend class compact_map_base;
+ public:
+ iterator_base(const iterator_base& o) {
+ map = o.map;
+ it = o.it;
+ }
+ bool operator==(const iterator_base& o) const {
+ return (map == o.map) && (!map->map || it == o.it);
+ }
+ bool operator!=(const iterator_base& o) const {
+ return !(*this == o);;
+ }
+ iterator_base& operator=(const iterator_base& o) {
+ map = o.map;
+ it = o.it;
+ return *this;
+ }
+ iterator_base& operator++() {
+ ++it;
+ return *this;
+ }
+ iterator_base operator++(int) {
+ iterator_base tmp = *this;
+ ++it;
+ return tmp;
+ }
+ iterator_base& operator--() {
+ --it;
+ return *this;
+ }
+ std::pair<const Key,T>& operator*() {
+ return *it;
+ }
+ std::pair<const Key,T>* operator->() {
+ return it.operator->();
+ }
+ operator const_iterator_base<It>() const {
+ return const_iterator_base<It>(map, it);
+ }
+ };
+
+public:
+ class iterator : public iterator_base<typename Map::iterator> {
+ public:
+ iterator() { }
+ iterator(const iterator_base<typename Map::iterator>& o)
+ : iterator_base<typename Map::iterator>(o) { }
+ iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { }
+ iterator(compact_map_base* m, const typename Map::iterator& i)
+ : iterator_base<typename Map::iterator>(m, i) { }
+ };
+ class const_iterator : public const_iterator_base<typename Map::const_iterator> {
+ public:
+ const_iterator() { }
+ const_iterator(const iterator_base<typename Map::const_iterator>& o)
+ : const_iterator_base<typename Map::const_iterator>(o) { }
+ const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { }
+ const_iterator(const compact_map_base* m, const typename Map::const_iterator& i)
+ : const_iterator_base<typename Map::const_iterator>(m, i) { }
+ };
+ class reverse_iterator : public iterator_base<typename Map::reverse_iterator> {
+ public:
+ reverse_iterator() { }
+ reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o)
+ : iterator_base<typename Map::reverse_iterator>(o) { }
+ reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { }
+ reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i)
+ : iterator_base<typename Map::reverse_iterator>(m, i) { }
+ };
+ class const_reverse_iterator : public const_iterator_base<typename Map::const_reverse_iterator> {
+ public:
+ const_reverse_iterator() { }
+ const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o)
+ : iterator_base<typename Map::const_reverse_iterator>(o) { }
+ const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { }
+ const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i)
+ : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { }
+ };
+ compact_map_base(const compact_map_base& o) {
+ if (o.map) {
+ alloc_internal();
+ *map = *o.map;
+ }
+ }
+ compact_map_base() {}
+ ~compact_map_base() {}
+
+ bool empty() const {
+ return !map || map->empty();
+ }
+ size_t size() const {
+ return map ? map->size() : 0;
+ }
+ bool operator==(const compact_map_base& o) const {
+ return (empty() && o.empty()) || (map && o.map && *map == *o.map);
+ }
+ bool operator!=(const compact_map_base& o) const {
+ return !(*this == o);
+ }
+ size_t count (const Key& k) const {
+ return map ? map->count(k) : 0;
+ }
+ iterator erase (iterator p) {
+ if (map) {
+ ceph_assert(this == p.map);
+ auto it = map->erase(p.it);
+ if (map->empty()) {
+ free_internal();
+ return iterator(this);
+ } else {
+ return iterator(this, it);
+ }
+ } else {
+ return iterator(this);
+ }
+ }
+ size_t erase (const Key& k) {
+ if (!map)
+ return 0;
+ size_t r = map->erase(k);
+ if (map->empty())
+ free_internal();
+ return r;
+ }
+ void clear() {
+ free_internal();
+ }
+ void swap(compact_map_base& o) {
+ map.swap(o.map);
+ }
+ compact_map_base& operator=(const compact_map_base& o) {
+ if (o.map) {
+ alloc_internal();
+ *map = *o.map;
+ } else
+ free_internal();
+ return *this;
+ }
+  // Insert a single key/value pair into the backing map, allocating it
+  // on first use.
+  //
+  // NOTE(review): std::map::insert returns pair<iterator,bool> while
+  // std::multimap::insert returns a bare iterator; the original code
+  // only compiled for the multimap case (and emplace only for the map
+  // case).  Handle both return shapes so either backing container
+  // instantiates cleanly.
+  iterator insert(const std::pair<const Key, T>& val) {
+    alloc_internal();
+    auto r = map->insert(val);
+    if constexpr (std::is_same_v<decltype(r), typename Map::iterator>) {
+      return iterator(this, r);        // multimap: bare iterator
+    } else {
+      return iterator(this, r.first);  // map: (iterator, inserted) pair
+    }
+  }
+  // Construct a value in place; returns the position and (for unique
+  // maps) whether a new element was inserted.  A multimap emplace always
+  // succeeds, so 'true' is reported in that case.
+  template <class... Args>
+  std::pair<iterator,bool> emplace ( Args&&... args ) {
+    alloc_internal();
+    auto em = map->emplace(std::forward<Args>(args)...);
+    if constexpr (std::is_same_v<decltype(em), typename Map::iterator>) {
+      return std::pair<iterator,bool>(iterator(this, em), true);
+    } else {
+      return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+    }
+  }
+ iterator begin() {
+ if (!map)
+ return iterator(this);
+ return iterator(this, map->begin());
+ }
+ iterator end() {
+ if (!map)
+ return iterator(this);
+ return iterator(this, map->end());
+ }
+ reverse_iterator rbegin() {
+ if (!map)
+ return reverse_iterator(this);
+ return reverse_iterator(this, map->rbegin());
+ }
+ reverse_iterator rend() {
+ if (!map)
+ return reverse_iterator(this);
+ return reverse_iterator(this, map->rend());
+ }
+ iterator find(const Key& k) {
+ if (!map)
+ return iterator(this);
+ return iterator(this, map->find(k));
+ }
+ iterator lower_bound(const Key& k) {
+ if (!map)
+ return iterator(this);
+ return iterator(this, map->lower_bound(k));
+ }
+ iterator upper_bound(const Key& k) {
+ if (!map)
+ return iterator(this);
+ return iterator(this, map->upper_bound(k));
+ }
+ const_iterator begin() const {
+ if (!map)
+ return const_iterator(this);
+ return const_iterator(this, map->begin());
+ }
+ const_iterator end() const {
+ if (!map)
+ return const_iterator(this);
+ return const_iterator(this, map->end());
+ }
+ const_reverse_iterator rbegin() const {
+ if (!map)
+ return const_reverse_iterator(this);
+ return const_reverse_iterator(this, map->rbegin());
+ }
+ const_reverse_iterator rend() const {
+ if (!map)
+ return const_reverse_iterator(this);
+ return const_reverse_iterator(this, map->rend());
+ }
+ const_iterator find(const Key& k) const {
+ if (!map)
+ return const_iterator(this);
+ return const_iterator(this, map->find(k));
+ }
+ const_iterator lower_bound(const Key& k) const {
+ if (!map)
+ return const_iterator(this);
+ return const_iterator(this, map->lower_bound(k));
+ }
+ const_iterator upper_bound(const Key& k) const {
+ if (!map)
+ return const_iterator(this);
+ return const_iterator(this, map->upper_bound(k));
+ }
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ if (map)
+ encode(*map, bl);
+ else
+ encode((uint32_t)0, bl);
+ }
+ void encode(ceph::buffer::list &bl, uint64_t features) const {
+ using ceph::encode;
+ if (map)
+ encode(*map, bl, features);
+ else
+ encode((uint32_t)0, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ using ceph::decode_nohead;
+ uint32_t n;
+ decode(n, p);
+ if (n > 0) {
+ alloc_internal();
+ decode_nohead(n, *map, p);
+ } else
+ free_internal();
+ }
+};
+
+template<class Key, class T, class Map>
+inline void encode(const compact_map_base<Key, T, Map>& m, ceph::buffer::list& bl) {
+ m.encode(bl);
+}
+template<class Key, class T, class Map>
+inline void encode(const compact_map_base<Key, T, Map>& m, ceph::buffer::list& bl,
+ uint64_t features) {
+ m.encode(bl, features);
+}
+template<class Key, class T, class Map>
+inline void decode(compact_map_base<Key, T, Map>& m, ceph::buffer::list::const_iterator& p) {
+ m.decode(p);
+}
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > {
+public:
+ T& operator[](const Key& k) {
+ this->alloc_internal();
+ return (*(this->map))[k];
+ }
+};
+
+// Render a compact_map as {k1=v1,k2=v2,...}
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m)
+{
+  out << "{";
+  const char *sep = "";
+  for (const auto &entry : m) {
+    out << sep << entry.first << "=" << entry.second;
+    sep = ",";
+  }
+  out << "}";
+  return out;
+}
+
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > {
+};
+
+// Render a compact_multimap as {{k1=v1,k2=v2,...}}
+template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > >
+inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m)
+{
+  out << "{{";
+  const char *sep = "";
+  for (const auto &entry : m) {
+    out << sep << entry.first << "=" << entry.second;
+    sep = ",";
+  }
+  out << "}}";
+  return out;
+}
+#endif
diff --git a/src/include/compact_set.h b/src/include/compact_set.h
new file mode 100644
index 000000000..a364fd8c4
--- /dev/null
+++ b/src/include/compact_set.h
@@ -0,0 +1,305 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_COMPACT_SET_H
+#define CEPH_COMPACT_SET_H
+
+#include "buffer.h"
+#include "encoding.h"
+
+#include <memory>
+#include <set>
+
+template <class T, class Set>
+class compact_set_base {
+protected:
+ std::unique_ptr<Set> set;
+ void alloc_internal() {
+ if (!set)
+ set.reset(new Set);
+ }
+ void free_internal() {
+ set.reset();
+ }
+  // Shared implementation behind (const_)(reverse_)iterator.  Wraps an
+  // iterator of the backing Set together with a pointer to the owning
+  // compact_set_base so iterators of an unallocated (empty) set can
+  // still be compared safely.
+  template <class It>
+  class iterator_base {
+  private:
+    const compact_set_base* set;
+    It it;
+    iterator_base() : set(0) { }
+    iterator_base(const compact_set_base* s) : set(s) { }
+    iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { }
+    friend class compact_set_base;
+  public:
+    iterator_base(const iterator_base& o) {
+      set = o.set;
+      it = o.it;
+    }
+    bool operator==(const iterator_base& o) const {
+      // When the backing set is unallocated, all its iterators are
+      // "end": only compare the wrapped iterators when storage exists.
+      return (set == o.set) && (!set->set || it == o.it);
+    }
+    bool operator!=(const iterator_base& o) const {
+      return !(*this == o);
+    }
+    iterator_base& operator=(const iterator_base& o) {
+      // Fix: the original did 'set->set = o.set;', assigning the owning
+      // container's unique_ptr (through a const pointer!) instead of
+      // rebinding this iterator's container pointer; it could not
+      // compile once instantiated.  Mirrors compact_map's iterator.
+      set = o.set;
+      it = o.it;
+      return *this;
+    }
+    iterator_base& operator++() {
+      ++it;
+      return *this;
+    }
+    iterator_base operator++(int) {
+      iterator_base tmp = *this;
+      ++it;
+      return tmp;
+    }
+    iterator_base& operator--() {
+      --it;
+      return *this;
+    }
+    const T& operator*() {
+      return *it;
+    }
+  };
+public:
+ class const_iterator : public iterator_base<typename Set::const_iterator> {
+ public:
+ const_iterator() { }
+ const_iterator(const iterator_base<typename Set::const_iterator>& o)
+ : iterator_base<typename Set::const_iterator>(o) { }
+ const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { }
+ const_iterator(const compact_set_base* s, const typename Set::const_iterator& i)
+ : iterator_base<typename Set::const_iterator>(s, i) { }
+ };
+ class iterator : public iterator_base<typename Set::iterator> {
+ public:
+ iterator() { }
+ iterator(const iterator_base<typename Set::iterator>& o)
+ : iterator_base<typename Set::iterator>(o) { }
+ iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { }
+ iterator(compact_set_base* s, const typename Set::iterator& i)
+ : iterator_base<typename Set::iterator>(s, i) { }
+ operator const_iterator() const {
+ return const_iterator(this->set, this->it);
+ }
+ };
+ class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> {
+ public:
+ const_reverse_iterator() { }
+ const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o)
+ : iterator_base<typename Set::const_reverse_iterator>(o) { }
+ const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { }
+ const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i)
+ : iterator_base<typename Set::const_reverse_iterator>(s, i) { }
+ };
+ class reverse_iterator : public iterator_base<typename Set::reverse_iterator> {
+ public:
+ reverse_iterator() { }
+ reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o)
+ : iterator_base<typename Set::reverse_iterator>(o) { }
+ reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { }
+ reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i)
+ : iterator_base<typename Set::reverse_iterator>(s, i) { }
+ operator const_iterator() const {
+ return const_iterator(this->set, this->it);
+ }
+ };
+
+ compact_set_base() {}
+ compact_set_base(const compact_set_base& o) {
+ if (o.set) {
+ alloc_internal();
+ *set = *o.set;
+ }
+ }
+ ~compact_set_base() {}
+
+
+ bool empty() const {
+ return !set || set->empty();
+ }
+ size_t size() const {
+ return set ? set->size() : 0;
+ }
+ bool operator==(const compact_set_base& o) const {
+ return (empty() && o.empty()) || (set && o.set && *set == *o.set);
+ }
+ bool operator!=(const compact_set_base& o) const {
+ return !(*this == o);
+ }
+ size_t count(const T& t) const {
+ return set ? set->count(t) : 0;
+ }
+ iterator erase (iterator p) {
+ if (set) {
+ ceph_assert(this == p.set);
+ auto it = set->erase(p.it);
+ if (set->empty()) {
+ free_internal();
+ return iterator(this);
+ } else {
+ return iterator(this, it);
+ }
+ } else {
+ return iterator(this);
+ }
+ }
+ size_t erase (const T& t) {
+ if (!set)
+ return 0;
+ size_t r = set->erase(t);
+ if (set->empty())
+ free_internal();
+ return r;
+ }
+ void clear() {
+ free_internal();
+ }
+ void swap(compact_set_base& o) {
+ set.swap(o.set);
+ }
+ compact_set_base& operator=(const compact_set_base& o) {
+ if (o.set) {
+ alloc_internal();
+ *set = *o.set;
+ } else
+ free_internal();
+ return *this;
+ }
+ std::pair<iterator,bool> insert(const T& t) {
+ alloc_internal();
+ std::pair<typename Set::iterator,bool> r = set->insert(t);
+ return std::make_pair(iterator(this, r.first), r.second);
+ }
+ template <class... Args>
+ std::pair<iterator,bool> emplace ( Args&&... args ) {
+ alloc_internal();
+ auto em = set->emplace(std::forward<Args>(args)...);
+ return std::pair<iterator,bool>(iterator(this, em.first), em.second);
+ }
+
+ iterator begin() {
+ if (!set)
+ return iterator(this);
+ return iterator(this, set->begin());
+ }
+ iterator end() {
+ if (!set)
+ return iterator(this);
+ return iterator(this, set->end());
+ }
+ reverse_iterator rbegin() {
+ if (!set)
+ return reverse_iterator(this);
+ return reverse_iterator(this, set->rbegin());
+ }
+ reverse_iterator rend() {
+ if (!set)
+ return reverse_iterator(this);
+ return reverse_iterator(this, set->rend());
+ }
+ iterator find(const T& t) {
+ if (!set)
+ return iterator(this);
+ return iterator(this, set->find(t));
+ }
+ iterator lower_bound(const T& t) {
+ if (!set)
+ return iterator(this);
+ return iterator(this, set->lower_bound(t));
+ }
+ iterator upper_bound(const T& t) {
+ if (!set)
+ return iterator(this);
+ return iterator(this, set->upper_bound(t));
+ }
+ const_iterator begin() const {
+ if (!set)
+ return const_iterator(this);
+ return const_iterator(this, set->begin());
+ }
+ const_iterator end() const {
+ if (!set)
+ return const_iterator(this);
+ return const_iterator(this, set->end());
+ }
+ const_reverse_iterator rbegin() const {
+ if (!set)
+ return const_reverse_iterator(this);
+ return const_reverse_iterator(this, set->rbegin());
+ }
+ const_reverse_iterator rend() const {
+ if (!set)
+ return const_reverse_iterator(this);
+ return const_reverse_iterator(this, set->rend());
+ }
+ const_iterator find(const T& t) const {
+ if (!set)
+ return const_iterator(this);
+ return const_iterator(this, set->find(t));
+ }
+ const_iterator lower_bound(const T& t) const {
+ if (!set)
+ return const_iterator(this);
+ return const_iterator(this, set->lower_bound(t));
+ }
+ const_iterator upper_bound(const T& t) const {
+ if (!set)
+ return const_iterator(this);
+ return const_iterator(this, set->upper_bound(t));
+ }
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ if (set)
+ encode(*set, bl);
+ else
+ encode((uint32_t)0, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ uint32_t n;
+ decode(n, p);
+ if (n > 0) {
+ alloc_internal();
+ ceph::decode_nohead(n, *set, p);
+ } else
+ free_internal();
+ }
+};
+
+template<class T, class Set>
+inline void encode(const compact_set_base<T, Set>& m, ceph::buffer::list& bl) {
+ m.encode(bl);
+}
+template<class T, class Set>
+inline void decode(compact_set_base<T, Set>& m, ceph::buffer::list::const_iterator& p) {
+ m.decode(p);
+}
+
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > {
+};
+
+// Render a compact_set as a comma-separated element list (no braces).
+template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> >
+inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s)
+{
+  const char *sep = "";
+  for (const auto &elem : s) {
+    out << sep << elem;
+    sep = ",";
+  }
+  return out;
+}
+#endif
diff --git a/src/include/compat.h b/src/include/compat.h
new file mode 100644
index 000000000..1100d69eb
--- /dev/null
+++ b/src/include/compat.h
@@ -0,0 +1,420 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_COMPAT_H
+#define CEPH_COMPAT_H
+
+#include "acconfig.h"
+#include <sys/types.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#if defined(__linux__)
+#define PROCPREFIX
+#endif
+
+#include <fcntl.h>
+#ifndef F_OFD_SETLK
+#define F_OFD_SETLK F_SETLK
+#endif
+
+#include <sys/stat.h>
+
+#ifdef _WIN32
+#include "include/win32/fs_compat.h"
+#endif
+
+#ifndef ACCESSPERMS
+#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+#ifndef ALLPERMS
+#define ALLPERMS (S_ISUID|S_ISGID|S_ISVTX|S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+#if defined(__FreeBSD__)
+
+// FreeBSD supports Linux procfs with its compatibility module
+// And all compatibility stuff is standard mounted on this
+#define PROCPREFIX "/compat/linux"
+
+#ifndef MSG_MORE
+#define MSG_MORE 0
+#endif
+
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* And include the extra required include file */
+#include <pthread_np.h>
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#define cpu_set_t cpuset_t
+int sched_setaffinity(pid_t pid, size_t cpusetsize,
+ cpu_set_t *mask);
+
+#endif /* __FreeBSD__ */
+
+#if defined(__APPLE__)
+struct cpu_set_t;
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+/* Make sure that ENODATA is defined in the correct way */
+#ifdef ENODATA
+#if (ENODATA == 9919)
+// #warning ENODATA already defined to be 9919, redefining to fix
+// Silencing this warning because it fires at all files where compat.h
+// is included after boost files.
+//
+// This value stems from the definition in the boost library
+// And when this case occurs it is due to the fact that boost files
+// are included before this file. Redefinition might not help in this
+// case since already parsed code has evaluated to the wrong value.
+// This would warrant a definition that would actually be evaluated
+// at the location of usage and report a possible conflict.
+// This is left up to a future improvement
+#elif (ENODATA != 87)
+// #warning ENODATA already defined to a value different from 87 (ENOATTR), redefining to fix
+#endif
+#undef ENODATA
+#endif
+#define ENODATA ENOATTR
+
+// Fix clock accuracy
+#if !defined(CLOCK_MONOTONIC_COARSE)
+#if defined(CLOCK_MONOTONIC_FAST)
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
+#else
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+#endif
+#if !defined(CLOCK_REALTIME_COARSE)
+#if defined(CLOCK_REALTIME_FAST)
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST
+#else
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+#endif
+
+/* get PATH_MAX */
+#include <limits.h>
+
+#ifndef EUCLEAN
+#define EUCLEAN 117
+#endif
+#ifndef EREMOTEIO
+#define EREMOTEIO 121
+#endif
+#ifndef EKEYREJECTED
+#define EKEYREJECTED 129
+#endif
+#ifndef XATTR_CREATE
+#define XATTR_CREATE 1
+#endif
+
+#endif /* __APPLE__ */
+
+#ifndef HOST_NAME_MAX
+#ifdef MAXHOSTNAMELEN
+#define HOST_NAME_MAX MAXHOSTNAMELEN
+#else
+#define HOST_NAME_MAX 255
+#endif
+#endif /* HOST_NAME_MAX */
+
+/* O_LARGEFILE is not defined/required on OSX/FreeBSD */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/* Could be relevant for other platforms */
+#ifndef ERESTART
+#define ERESTART EINTR
+#endif
+
+#ifndef TEMP_FAILURE_RETRY
+#define TEMP_FAILURE_RETRY(expression) ({ \
+ __typeof(expression) __result; \
+ do { \
+ __result = (expression); \
+ } while (__result == -1 && errno == EINTR); \
+ __result; })
+#endif
+
+#ifdef __cplusplus
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ static_cast<void>(TEMP_FAILURE_RETRY(expression))
+#else
+# define VOID_TEMP_FAILURE_RETRY(expression) \
+ do { (void)TEMP_FAILURE_RETRY(expression); } while (0)
+#endif
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#define lseek64(fd, offset, whence) lseek(fd, offset, whence)
+#endif
+
+#if defined(__sun) || defined(_AIX)
+#define LOG_AUTHPRIV (10<<3)
+#define LOG_FTP (11<<3)
+#define __STRING(x) "x"
+#endif
+
+#if defined(__sun) || defined(_AIX) || defined(_WIN32)
+#define IFTODT(mode) (((mode) & 0170000) >> 12)
+#endif
+
+#if defined(_AIX)
+#define MSG_DONTWAIT MSG_NONBLOCK
+#endif
+
+#if defined(HAVE_PTHREAD_SETNAME_NP)
+ #if defined(__APPLE__)
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __result = 0; \
+ if (thread == pthread_self()) \
+ __result = pthread_setname_np(name); \
+ __result; })
+ #else
+ #define ceph_pthread_setname pthread_setname_np
+ #endif
+#elif defined(HAVE_PTHREAD_SET_NAME_NP)
+ /* Fix a small name diff and return 0 */
+ #define ceph_pthread_setname(thread, name) ({ \
+ pthread_set_name_np(thread, name); \
+ 0; })
+#else
+ /* compiler warning free success noop */
+ #define ceph_pthread_setname(thread, name) ({ \
+ int __i = 0; \
+ __i; })
+#endif
+
+#if defined(HAVE_PTHREAD_GETNAME_NP)
+ #define ceph_pthread_getname pthread_getname_np
+#elif defined(HAVE_PTHREAD_GET_NAME_NP)
+ #define ceph_pthread_getname(thread, name, len) ({ \
+ pthread_get_name_np(thread, name, len); \
+ 0; })
+#else
+ /* compiler warning free success noop */
+ #define ceph_pthread_getname(thread, name, len) ({ \
+ if (name != NULL) \
+ *name = '\0'; \
+ 0; })
+#endif
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int pipe_cloexec(int pipefd[2], int flags);
+char *ceph_strerror_r(int errnum, char *buf, size_t buflen);
+unsigned get_page_size();
+// On success, returns the number of bytes written to the buffer. On
+// failure, returns -1.
+ssize_t get_self_exe_path(char* path, int buff_length);
+
+int ceph_memzero_s(void *dest, size_t destsz, size_t count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(_WIN32)
+
+#include "include/win32/winsock_compat.h"
+
+#include <windows.h>
+#include <time.h>
+
+#include "include/win32/win32_errno.h"
+
+// There are a few name collisions between Windows headers and Ceph.
+// Updating Ceph definitions would be the preferable fix in order to avoid
+// confusion, unless it requires too many changes, in which case we're going
+// to redefine Windows values by adding the "WIN32_" prefix.
+#define WIN32_DELETE 0x00010000L
+#undef DELETE
+
+#define WIN32_ERROR 0
+#undef ERROR
+
+#ifndef uint
+typedef unsigned int uint;
+#endif
+
+typedef _sigset_t sigset_t;
+
+typedef unsigned int blksize_t;
+typedef unsigned __int64 blkcnt_t;
+typedef unsigned short nlink_t;
+
+typedef long long loff_t;
+
+#define CPU_SETSIZE (sizeof(size_t)*8)
+
+typedef union
+{
+ char cpuset[CPU_SETSIZE/8];
+ size_t _align;
+} cpu_set_t;
+
+struct iovec {
+ void *iov_base;
+ size_t iov_len;
+};
+
+#define SHUT_RD SD_RECEIVE
+#define SHUT_WR SD_SEND
+#define SHUT_RDWR SD_BOTH
+
+#ifndef SIGINT
+#define SIGINT 2
+#endif
+
+#ifndef SIGKILL
+#define SIGKILL 9
+#endif
+
+#define IOV_MAX 1024
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ssize_t readv(int fd, const struct iovec *iov, int iov_cnt);
+ssize_t writev(int fd, const struct iovec *iov, int iov_cnt);
+
+int fsync(int fd);
+ssize_t pread(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+
+long int lrand48(void);
+int random();
+
+int pipe(int pipefd[2]);
+
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+char *strptime(const char *s, const char *format, struct tm *tm);
+
+int chown(const char *path, uid_t owner, gid_t group);
+int fchown(int fd, uid_t owner, gid_t group);
+int lchown(const char *path, uid_t owner, gid_t group);
+int setenv(const char *name, const char *value, int overwrite);
+
+int geteuid();
+int getegid();
+int getuid();
+int getgid();
+
+#define unsetenv(name) _putenv_s(name, "")
+
+int win_socketpair(int socks[2]);
+
+#ifdef __MINGW32__
+extern _CRTIMP errno_t __cdecl _putenv_s(const char *_Name,const char *_Value);
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define htobe16(x) __builtin_bswap16(x)
+#define htole16(x) (x)
+#define be16toh(x) __builtin_bswap16(x)
+#define le16toh(x) (x)
+
+#define htobe32(x) __builtin_bswap32(x)
+#define htole32(x) (x)
+#define be32toh(x) __builtin_bswap32(x)
+#define le32toh(x) (x)
+
+#define htobe64(x) __builtin_bswap64(x)
+#define htole64(x) (x)
+#define be64toh(x) __builtin_bswap64(x)
+#define le64toh(x) (x)
+#endif // defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+
+#endif // __MINGW32__
+
+#ifdef __cplusplus
+}
+#endif
+
+#define compat_closesocket closesocket
+// Use "aligned_free" when freeing memory allocated using posix_memalign or
+// _aligned_malloc. Using "free" will crash.
+static inline void aligned_free(void* ptr) {
+ _aligned_free(ptr);
+}
+
+// O_CLOEXEC is not defined on Windows. Since handles aren't inherited
+// with subprocesses unless explicitly requested, we'll define this
+// flag as a no-op.
+#define O_CLOEXEC 0
+#define SOCKOPT_VAL_TYPE char*
+
+#define DEV_NULL "nul"
+
+#else /* WIN32 */
+
+#define SOCKOPT_VAL_TYPE void*
+
+static inline void aligned_free(void* ptr) {
+ free(ptr);
+}
+static inline int compat_closesocket(int fildes) {
+ return close(fildes);
+}
+
+#define DEV_NULL "/dev/null"
+
+#endif /* WIN32 */
+
+/* Supplies code to be run at startup time before invoking main().
+ * Use as:
+ *
+ * CEPH_CONSTRUCTOR(my_constructor) {
+ * ...some code...
+ * }
+ */
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU",read)
+#define CEPH_CONSTRUCTOR(f) \
+ static void __cdecl f(void); \
+ __declspec(allocate(".CRT$XCU")) static void (__cdecl*f##_)(void) = f; \
+ static void __cdecl f(void)
+#else
+#define CEPH_CONSTRUCTOR(f) \
+ static void f(void) __attribute__((constructor)); \
+ static void f(void)
+#endif
+
+/* This should only be used with the socket API. */
+static inline int ceph_sock_errno() {
+#ifdef _WIN32
+ return wsae_to_errno(WSAGetLastError());
+#else
+ return errno;
+#endif
+}
+
+// Needed on Windows when handling binary files. Without it, line
+// endings will be replaced and certain characters can be treated as
+// EOF.
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+#endif /* !CEPH_COMPAT_H */
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
new file mode 100644
index 000000000..cc9ad0ec7
--- /dev/null
+++ b/src/include/config-h.in.cmake
@@ -0,0 +1,393 @@
+/* config.h file expanded by Cmake for build */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* Define to 1 if you have the `memset_s()` function. */
+#cmakedefine HAVE_MEMSET_S
+
+/* fallocate(2) is supported */
+#cmakedefine CEPH_HAVE_FALLOCATE
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#cmakedefine HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#cmakedefine HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `syncfs' function. */
+#cmakedefine HAVE_SYS_SYNCFS 1
+
+/* sync_file_range(2) is supported */
+#cmakedefine HAVE_SYNC_FILE_RANGE
+
+/* Define if you have mallinfo */
+#cmakedefine HAVE_MALLINFO
+
+/* Define to 1 if you have the `pwritev' function. */
+#cmakedefine HAVE_PWRITEV 1
+
+/* Define to 1 if you have the <sys/mount.h> header file. */
+#cmakedefine HAVE_SYS_MOUNT_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#cmakedefine HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/vfs.h> header file. */
+#cmakedefine HAVE_SYS_VFS_H 1
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#cmakedefine HAVE_EXECINFO_H 1
+
+/* Define to 1 if the system has the type `__s16'. */
+#cmakedefine HAVE___S16 1
+
+/* Define to 1 if the system has the type `__s32'. */
+#cmakedefine HAVE___S32 1
+
+/* Define to 1 if the system has the type `__s64'. */
+#cmakedefine HAVE___S64 1
+
+/* Define to 1 if the system has the type `__s8'. */
+#cmakedefine HAVE___S8 1
+
+/* Define to 1 if the system has the type `__u16'. */
+#cmakedefine HAVE___U16 1
+
+/* Define to 1 if the system has the type `__u32'. */
+#cmakedefine HAVE___U32 1
+
+/* Define to 1 if the system has the type `__u64'. */
+#cmakedefine HAVE___U64 1
+
+/* Define to 1 if the system has the type `__u8'. */
+#cmakedefine HAVE___U8 1
+
+/* Define if the system has the type `in_addr_t' */
+#cmakedefine HAVE_IN_ADDR_T
+
+/* Define if you have suseconds_t */
+#cmakedefine HAVE_SUSECONDS_T
+
+/* Define if you have res_nquery */
+#cmakedefine HAVE_RES_NQUERY
+
+/* Defined if you have LZ4 */
+#cmakedefine HAVE_LZ4
+
+/* Defined if you have BROTLI */
+#cmakedefine HAVE_BROTLI
+
+/* Defined if you have libaio */
+#cmakedefine HAVE_LIBAIO
+
+/* Defined if you have libdml */
+#cmakedefine HAVE_LIBDML
+
+/* Defined if you have libzbd */
+#cmakedefine HAVE_LIBZBD
+
+/* Defined if you have liburing */
+#cmakedefine HAVE_LIBURING
+
+/* Define if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
+/* Defined if OpenLDAP enabled */
+#cmakedefine HAVE_OPENLDAP
+
+/* Define if you have fuse */
+#cmakedefine HAVE_LIBFUSE
+
+/* Define version major */
+#define CEPH_FUSE_MAJOR_VERSION @FUSE_MAJOR_VERSION@
+
+/* Define version minor */
+#define CEPH_FUSE_MINOR_VERSION @FUSE_MINOR_VERSION@
+
+/* Define to 1 if you have libxfs */
+#cmakedefine HAVE_LIBXFS 1
+
+/* SPDK conditional compilation */
+#cmakedefine HAVE_SPDK
+
+/* DPDK conditional compilation */
+#cmakedefine HAVE_DPDK
+
+/* PMEM_DEVICE (OSD) conditional compilation */
+#cmakedefine HAVE_BLUESTORE_PMEM
+
+/* Define if you have tcmalloc */
+#cmakedefine HAVE_LIBTCMALLOC
+#cmakedefine LIBTCMALLOC_MISSING_ALIGNED_ALLOC
+
+/* AsyncMessenger RDMA conditional compilation */
+#cmakedefine HAVE_RDMA
+
+/* ibverbs experimental conditional compilation */
+#cmakedefine HAVE_IBV_EXP
+
+/* define if bluestore enabled */
+#cmakedefine WITH_BLUESTORE
+
+/* define if cephfs enabled */
+#cmakedefine WITH_CEPHFS
+
+/* define if systemd is enabled */
+#cmakedefine WITH_SYSTEMD
+
+/*define if GSSAPI/KRB5 enabled */
+#cmakedefine HAVE_GSSAPI
+
+/* define if rbd enabled */
+#cmakedefine WITH_RBD
+
+/* define if kernel rbd enabled */
+#cmakedefine WITH_KRBD
+
+/* define if key-value-store is enabled */
+#cmakedefine WITH_KVS
+
+/* define if radosgw enabled */
+#cmakedefine WITH_RADOSGW
+
+/* define if radosgw has openssl support */
+#cmakedefine WITH_CURL_OPENSSL
+
+/* define if HAVE_THREAD_SAFE_RES_QUERY */
+#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
+
+/* define if HAVE_REENTRANT_STRSIGNAL */
+#cmakedefine HAVE_REENTRANT_STRSIGNAL
+
+/* Define if you want to use LTTng */
+#cmakedefine WITH_LTTNG
+
+/* Define if you want to use Jaeger */
+#cmakedefine HAVE_JAEGER
+
+/* Define if you want to use EVENTTRACE */
+#cmakedefine WITH_EVENTTRACE
+
+/* Define if you want to OSD function instrumentation */
+#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS
+
+/* Define if you want to use Babeltrace */
+#cmakedefine WITH_BABELTRACE
+
+/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */
+#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1
+
+/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */
+#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1
+
+/* Define to 1 if you have the <arpa/nameser_compat.h> header file. */
+#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1
+
+/* FastCGI headers are in /usr/include/fastcgi */
+#cmakedefine FASTCGI_INCLUDE_DIR
+
+/* splice(2) is supported */
+#cmakedefine CEPH_HAVE_SPLICE
+
+/* Define if you want C_Gather debugging */
+#cmakedefine DEBUG_GATHER
+
+/* Define to 1 if you have the `getgrouplist' function. */
+#cmakedefine HAVE_GETGROUPLIST 1
+
+/* LTTng is disabled, so define this macro to be nothing. */
+#cmakedefine tracepoint
+
+/* Define to 1 if you have fdatasync. */
+#cmakedefine HAVE_FDATASYNC 1
+
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#cmakedefine HAVE_VALGRIND_HELGRIND_H 1
+
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+#cmakedefine HAVE_SYS_PRCTL_H 1
+
+/* Define to 1 if you have the <linux/types.h> header file. */
+#cmakedefine HAVE_LINUX_TYPES_H 1
+
+/* Define to 1 if you have the <linux/version.h> header file. */
+#cmakedefine HAVE_LINUX_VERSION_H 1
+
+/* Define to 1 if you have sched.h. */
+#cmakedefine HAVE_SCHED 1
+
+/* Define to 1 if you have sigdescr_np. */
+#cmakedefine HAVE_SIGDESCR_NP 1
+
+/* Support SSE (Streaming SIMD Extensions) instructions */
+#cmakedefine HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#cmakedefine HAVE_SSE2
+
+/* Define to 1 if you have the `pipe2' function. */
+#cmakedefine HAVE_PIPE2 1
+
+/* Support NEON instructions */
+#cmakedefine HAVE_NEON
+
+/* Define if you have pthread_spin_init */
+#cmakedefine HAVE_PTHREAD_SPINLOCK
+
+/* name_to_handle_at exists */
+#cmakedefine HAVE_NAME_TO_HANDLE_AT
+
+/* we have a recent nasm and are x86_64 */
+#cmakedefine HAVE_NASM_X64
+
+/* nasm can also build the isa-l:avx512 */
+#cmakedefine HAVE_NASM_X64_AVX512
+
+/* Define if the erasure code isa-l plugin is compiled */
+#cmakedefine WITH_EC_ISA_PLUGIN
+
+/* Define to 1 if strerror_r returns char *. */
+#cmakedefine STRERROR_R_CHAR_P 1
+
+/* Defined if you have libzfs enabled */
+#cmakedefine HAVE_LIBZFS
+
+/* Define if the C compiler supports __func__ */
+#cmakedefine HAVE_FUNC
+
+/* Define if the C compiler supports __PRETTY_FUNCTION__ */
+#cmakedefine HAVE_PRETTY_FUNC
+
+/* Define if the C compiler supports __attribute__((__symver__ (".."))) */
+#cmakedefine HAVE_ATTR_SYMVER
+
+/* Define if the C compiler supports __asm__(".symver ..") */
+#cmakedefine HAVE_ASM_SYMVER
+
+/* Have eventfd extension. */
+#cmakedefine HAVE_EVENTFD
+
+/* Define if enabling coverage. */
+#cmakedefine ENABLE_COVERAGE
+
+/* Defined if you want pg ref debugging */
+#cmakedefine PG_DEBUG_REFS
+
+/* Support ARMv8 CRC instructions */
+#cmakedefine HAVE_ARMV8_CRC
+
+/* Support ARMv8 CRYPTO instructions */
+#cmakedefine HAVE_ARMV8_CRYPTO
+
+/* Support ARMv8 CRC and CRYPTO intrinsics */
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Define if you have struct stat.st_mtimespec.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC
+
+/* Define if you have struct stat.st_mtim.tv_nsec */
+#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC
+
+/* Define if compiler supports static_cast<> */
+#cmakedefine HAVE_STATIC_CAST
+
+/* Version number of package */
+#cmakedefine PROJECT_VERSION "@PROJECT_VERSION@"
+
+/* Defined if pthread_setname_np() is available */
+#cmakedefine HAVE_PTHREAD_SETNAME_NP 1
+
+/* Defined if pthread_rwlockattr_setkind_np() is available */
+#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+
+/* Defined if blkin enabled */
+#cmakedefine WITH_BLKIN
+
+/* Defined if pthread_set_name_np() is available */
+#cmakedefine HAVE_PTHREAD_SET_NAME_NP
+
+/* Defined if pthread_getname_np() is available */
+#cmakedefine HAVE_PTHREAD_GETNAME_NP 1
+
+/* Support POWER8 instructions */
+#cmakedefine HAVE_POWER8
+
+/* Define if endian type is big endian */
+#cmakedefine CEPH_BIG_ENDIAN
+
+/* Define if endian type is little endian */
+#cmakedefine CEPH_LITTLE_ENDIAN
+
+#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@"
+
+/* Define to 1 if you have the `getprogname' function. */
+#cmakedefine HAVE_GETPROGNAME 1
+
+/* Defined if getentropy() is available */
+#cmakedefine HAVE_GETENTROPY
+
+/* Defined if libradosstriper is enabled: */
+#cmakedefine WITH_LIBRADOSSTRIPER
+
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
+/* Defined if rabbitmq-c is available for rgw amqp push endpoint */
+#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT
+
+/* Defined if librdkafka is available for rgw kafka push endpoint */
+#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT
+
+/* Defined if lua packages can be installed by radosgw */
+#cmakedefine WITH_RADOSGW_LUA_PACKAGES
+
+/* Backend dbstore for Rados Gateway */
+#cmakedefine WITH_RADOSGW_DBSTORE
+
+/* Backend CORTX-Motr for Rados Gateway */
+#cmakedefine WITH_RADOSGW_MOTR
+
+/* Backend CORTX-DAOS for Rados Gateway */
+#cmakedefine WITH_RADOSGW_DAOS
+
+/* Defined if std::map::merge() is supported */
+#cmakedefine HAVE_STDLIB_MAP_SPLICING
+
+/* Defined if Intel QAT compress/decompress is supported */
+#cmakedefine HAVE_QATZIP
+
+/* Define if seastar is available. */
+#cmakedefine HAVE_SEASTAR
+
+/* Define if unit tests are built. */
+#cmakedefine UNIT_TESTS_BUILT
+
+/* Define if RBD QCOW migration format is enabled */
+#cmakedefine WITH_RBD_MIGRATION_FORMAT_QCOW_V1
+
+/* Define if libcephsqlite is enabled */
+#cmakedefine WITH_LIBCEPHSQLITE
+
+/* Define if RWL is enabled */
+#cmakedefine WITH_RBD_RWL
+
+/* Define if PWL-SSD is enabled */
+#cmakedefine WITH_RBD_SSD_CACHE
+
+/* Define if libcryptsetup can be used (linux only) */
+#cmakedefine HAVE_LIBCRYPTSETUP
+
+/* Shared library extension, such as .so, .dll or .dylib */
+#cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@"
+
+/* libexec directory path */
+#cmakedefine CMAKE_INSTALL_LIBEXECDIR "@CMAKE_INSTALL_LIBEXECDIR@"
+
+#endif /* CONFIG_H */
diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h
new file mode 100644
index 000000000..60b91e999
--- /dev/null
+++ b/src/include/coredumpctl.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <iostream>
+#include <sys/prctl.h>
+#include "common/errno.h"
+
+class PrCtl {
+ int saved_state = -1;
+ static int get_dumpable() {
+ int r = prctl(PR_GET_DUMPABLE);
+ if (r == -1) {
+ r = errno;
+ std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ static int set_dumpable(bool new_state) {
+ int r = prctl(PR_SET_DUMPABLE, new_state);
+ if (r) {
+ r = -errno;
+ std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
+ << " dumpable flag: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+public:
+ PrCtl(int new_state = 0) {
+ int r = get_dumpable();
+ if (r == -1) {
+ return;
+ }
+ if (r != new_state) {
+ if (!set_dumpable(new_state)) {
+ saved_state = r;
+ }
+ }
+ }
+ ~PrCtl() {
+ if (saved_state < 0) {
+ return;
+ }
+ set_dumpable(saved_state);
+ }
+};
+
+#else
+#ifdef RLIMIT_CORE
+#include <sys/resource.h>
+#include <iostream>
+#include <sys/resource.h>
+#include "common/errno.h"
+
+class PrCtl {
+ rlimit saved_lim;
+ static int get_dumpable(rlimit* saved) {
+ int r = getrlimit(RLIMIT_CORE, saved);
+ if (r) {
+ r = errno;
+ std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ static void set_dumpable(const rlimit& rlim) {
+ int r = setrlimit(RLIMIT_CORE, &rlim);
+ if (r) {
+ r = -errno;
+ std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r)
+ << std::endl;
+ }
+ }
+public:
+ PrCtl(int new_state = 0) {
+ int r = get_dumpable(&saved_lim);
+ if (r == -1) {
+ return;
+ }
+ rlimit new_lim;
+ if (new_state) {
+ new_lim.rlim_cur = saved_lim.rlim_max;
+ } else {
+ new_lim.rlim_cur = new_lim.rlim_max = 0;
+ }
+ if (new_lim.rlim_cur == saved_lim.rlim_cur) {
+ return;
+ }
+ set_dumpable(new_lim);
+ }
+ ~PrCtl() {
+ set_dumpable(saved_lim);
+ }
+};
+#else
+struct PrCtl {
+ // to silence the Wunused-variable warning
+ PrCtl() {}
+};
+
+#endif // RLIMIT_CORE
+#endif
diff --git a/src/include/counter.h b/src/include/counter.h
new file mode 100644
index 000000000..61ed7409c
--- /dev/null
+++ b/src/include/counter.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COUNTER_H
+#define CEPH_COUNTER_H
+
+#include <atomic>
+
+template <typename T>
+class Counter {
+public:
+ Counter() {
+ _count()++;
+ _increments()++;
+ }
+ Counter(const Counter &rhs) {
+ _count()++;
+ _increments()++;
+ }
+ Counter(Counter &&rhs) {}
+ ~Counter() {
+ _count()--;
+ }
+ static uint64_t count() {
+ return _count();
+ }
+ static uint64_t increments() {
+ return _increments();
+ }
+ static uint64_t decrements() {
+ return increments()-count();
+ }
+
+private:
+ static std::atomic<uint64_t> &_count() {
+ static std::atomic<uint64_t> c;
+ return c;
+ }
+ static std::atomic<uint64_t> &_increments() {
+ static std::atomic<uint64_t> i;
+ return i;
+ }
+};
+
+#endif
diff --git a/src/include/cpp-btree/btree.h b/src/include/cpp-btree/btree.h
new file mode 100644
index 000000000..2eddc2abe
--- /dev/null
+++ b/src/include/cpp-btree/btree.h
@@ -0,0 +1,2571 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A btree implementation of the STL set and map interfaces. A btree is smaller
+// and generally also faster than STL set/map (refer to the benchmarks below).
+// The red-black tree implementation of STL set/map has an overhead of 3
+// pointers (left, right and parent) plus the node color information for each
+// stored value. So a set<int32_t> consumes 40 bytes for each value stored in
+// 64-bit mode. This btree implementation stores multiple values on fixed
+// size nodes (usually 256 bytes) and doesn't store child pointers for leaf
+// nodes. The result is that a btree_set<int32_t> may use much less memory per
+// stored value. For the random insertion benchmark in btree_bench.cc, a
+// btree_set<int32_t> with node-size of 256 uses 5.1 bytes per stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. Therefore, this
+// container does not provide pointer stability. This is notably different from
+// STL set/map which takes care to not invalidate iterators on insert/erase
+// except, of course, for iterators pointing to the value being erased. A
+// partial workaround when erasing is available: erase() returns an iterator
+// pointing to the item just after the one that was erased (or end() if none
+// exists).
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <experimental/type_traits>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+namespace btree::internal {
+
// True iff invoking Compare on (T, T) yields a signed integral result, i.e.
// Compare is a three-way "compare-to" functor rather than a boolean one.
template <typename Compare, typename T>
using btree_is_key_compare_to =
    std::is_signed<std::invoke_result_t<Compare, T, T>>;

// Detects a member three-way `compare()` method (as on std::string).
template<typename T>
using compare_to_t = decltype(std::declval<T&>().compare(std::declval<const T&>()));
template<typename T>
inline constexpr bool has_compare_to = std::experimental::is_detected_v<compare_to_t, T>;
// A helper class to convert a boolean comparison into a three-way "compare-to"
// comparison that returns a negative value to indicate less-than, zero to
// indicate equality and a positive value to indicate greater-than. This helper
// class is specialized for less<std::string>, greater<std::string>,
// less<string_view>, and greater<string_view>.
//
// key_compare_to_adapter is provided so that btree users
// automatically get the more efficient compare-to code when using common
// google string types with common comparison functors.
// These string-like specializations also turn on heterogeneous lookup by
// default.
template <typename Compare, typename=void>
struct key_compare_to_adapter {
  using type = Compare;
};

// std::less over a type with a member compare() (e.g. std::string): call the
// member directly to get the three-way result in one pass.
template <typename K>
struct key_compare_to_adapter<std::less<K>, std::enable_if_t<has_compare_to<K>>>
{
  struct type {
    inline int operator()(const K& lhs, const K& rhs) const noexcept {
      return lhs.compare(rhs);
    }
  };
};

// std::less over a signed arithmetic type.
//
// Fixed: the previous implementation returned `lhs - rhs` as K, which is
// undefined behavior on signed overflow (e.g. INT_MAX - INT_MIN) and, for
// narrow types such as int8_t/short, truncates the int-promoted difference
// back to K and can even report the wrong sign (e.g. int8_t(100) -
// int8_t(-100) == 200, which truncates to a negative int8_t). An explicit
// three-way comparison returning int is safe for all values, is still a
// signed result (so btree_is_key_compare_to still detects it), and also
// behaves sensibly for the floating-point types that satisfy
// std::is_signed_v.
template <typename K>
struct key_compare_to_adapter<std::less<K>, std::enable_if_t<std::is_signed_v<K>>>
{
  struct type {
    inline int operator()(const K& lhs, const K& rhs) const noexcept {
      if (lhs < rhs) {
        return -1;
      } else if (rhs < lhs) {
        return 1;
      } else {
        return 0;
      }
    }
  };
};

// std::less over an unsigned arithmetic type: subtraction would wrap, so an
// explicit three-way comparison is required.
template <typename K>
struct key_compare_to_adapter<std::less<K>, std::enable_if_t<std::is_unsigned_v<K>>>
{
  struct type {
    inline int operator()(const K& lhs, const K& rhs) const noexcept {
      if (lhs < rhs) {
        return -1;
      } else if (lhs > rhs) {
        return 1;
      } else {
        return 0;
      }
    }
  };
};
+
// Type parameters shared by the map (map_params) and set (set_params) btree
// variants. TargetNodeSize is the desired node size in bytes, ValueSize the
// size of one stored value, and Multi selects multiset/multimap semantics.
template <typename Key, typename Compare, typename Alloc,
          int TargetNodeSize, int ValueSize,
          bool Multi>
struct common_params {
  // If Compare is a common comparator for a std::string-like type, then we adapt it
  // to use heterogeneous lookup and to be a key-compare-to comparator.
  using key_compare = typename key_compare_to_adapter<Compare>::type;
  // A type which indicates if we have a key-compare-to functor or a plain old
  // key-compare functor.
  using is_key_compare_to = btree_is_key_compare_to<key_compare, Key>;

  using allocator_type = Alloc;
  using key_type = Key;
  // Note: signed size_type (the tree code manipulates positions/deltas).
  using size_type = std::make_signed<size_t>::type;
  using difference_type = ptrdiff_t;

  // True if this is a multiset or multimap.
  using is_multi_container = std::integral_constant<bool, Multi>;

  constexpr static int kTargetNodeSize = TargetNodeSize;
  constexpr static int kValueSize = ValueSize;
  // Upper bound for the available space for values. This is largest for leaf
  // nodes, which have overhead of at least a pointer + 3 bytes (for storing
  // 3 field_types) + paddings. if alignof(key_type) is 1, the size of padding
  // would be 0.
  constexpr static int kNodeValueSpace =
      TargetNodeSize - /*minimum overhead=*/(sizeof(void *) + 4);

  // This is an integral type large enough to hold as many
  // ValueSize-values as will fit a node of TargetNodeSize bytes.
  // uint8_t suffices unless more than 255 values fit in one node.
  using node_count_type =
      std::conditional_t<(kNodeValueSpace / ValueSize >
                          (std::numeric_limits<uint8_t>::max)()),
                         uint16_t,
                         uint8_t>;
};
+
+// The internal storage type
+//
+// It is convenient for the value_type of a btree_map<K, V> to be
+// pair<const K, V>; the "const K" prevents accidental modification of the key
+// when dealing with the reference returned from find() and similar methods.
+// However, this creates other problems; we want to be able to emplace(K, V)
+// efficiently with move operations, and similarly be able to move a
+// pair<K, V> in insert().
+//
+// The solution is this union, which aliases the const and non-const versions
+// of the pair. This also allows flat_hash_map<const K, V> to work, even though
+// that has the same efficiency issues with move in emplace() and insert() -
+// but people do it anyway.
template <class K, class V>
union map_slot_type {
  // Trivial default ctor: begins the lifetime of the union without
  // constructing any member. Members are constructed/destroyed explicitly
  // through the allocator (see map_params::construct/destroy).
  map_slot_type() {}
  // Deleted so a slot is never destroyed implicitly; destruction must go
  // through the explicit member-wise path.
  ~map_slot_type() = delete;
  // Assignment goes through the mutable view so the key can be overwritten.
  map_slot_type& operator=(const map_slot_type& slot) {
    mutable_value = slot.mutable_value;
    return *this;
  }
  map_slot_type& operator=(map_slot_type&& slot) {
    mutable_value = std::move(slot.mutable_value);
    return *this;
  }
  using value_type = std::pair<const K, V>;
  using mutable_value_type = std::pair<K, V>;

  // The members alias the same storage: `value` and `mutable_value` differ
  // only in the const-ness of the key, and `key` overlays the first pair
  // member for key-only access.
  value_type value;
  mutable_value_type mutable_value;
  K key;
};

// Swaps two slots' contents through their mutable views.
template <class K, class V>
void swap(map_slot_type<K, V>& lhs, map_slot_type<K, V>& rhs) {
  std::swap(lhs.mutable_value, rhs.mutable_value);
}
+
// A parameters structure for holding the type parameters for a btree_map.
// Compare and Alloc should be nothrow copy-constructible.
template <typename Key, typename Data, typename Compare, typename Alloc,
          int TargetNodeSize, bool Multi>
struct map_params : common_params<Key, Compare, Alloc, TargetNodeSize,
                                  sizeof(Key) + sizeof(Data), Multi> {
  // map_params::common_params names the base class (injected-class-name).
  using super_type = typename map_params::common_params;
  using mapped_type = Data;
  using value_type = std::pair<const Key, mapped_type>;
  using mutable_value_type = std::pair<Key, mapped_type>;
  using slot_type = map_slot_type<Key, mapped_type>;
  using pointer = value_type*;
  using const_pointer = const value_type *;
  using reference = value_type &;
  using const_reference = const value_type &;
  using key_compare = typename super_type::key_compare;
  using init_type = mutable_value_type;

  static constexpr size_t kValueSize = sizeof(Key) + sizeof(mapped_type);

  // Inherit from key_compare for empty base class optimization.
  struct value_compare : private key_compare {
    value_compare() = default;
    explicit value_compare(const key_compare &cmp) : key_compare(cmp) {}

    // Compares two pair-like values by their keys (.first).
    template <typename T, typename U>
    auto operator()(const T &left, const U &right) const
        -> decltype(std::declval<key_compare>()(left.first, right.first)) {
      return key_compare::operator()(left.first, right.first);
    }
  };
  using is_map_container = std::true_type;

  // Accessors used by btree_node to reach the key/value inside a slot.
  static const Key &key(const value_type &value) { return value.first; }
  static mapped_type &value(value_type *value) { return value->second; }
  static const Key &key(const slot_type *slot) { return slot->key; }
  static value_type& element(slot_type* slot) { return slot->value; }
  static const value_type& element(const slot_type* slot) { return slot->value; }
  // Constructs a value in place inside the union slot via the allocator,
  // targeting the mutable view so the key remains assignable/movable later.
  template <class... Args>
  static void construct(Alloc *alloc, slot_type *slot, Args &&... args) {
    std::allocator_traits<Alloc>::construct(*alloc,
                                            &slot->mutable_value,
                                            std::forward<Args>(args)...);
  }
  // Construct this slot by moving from another slot.
  // NOTE(review): this targets the `value` member (pair<const K, V>), so
  // moving the pair copies the const key while moving the mapped value —
  // presumably intentional; confirm against upstream if key moves matter.
  static void construct(Alloc* alloc, slot_type* slot, slot_type* other) {
    emplace(slot);
    std::allocator_traits<Alloc>::construct(*alloc, &slot->value,
                                            std::move(other->value));
  }
  // Move-assign between two already-constructed slots.
  static void move(Alloc *alloc, slot_type *src, slot_type *dest) {
    dest->mutable_value = std::move(src->mutable_value);
  }
  static void destroy(Alloc *alloc, slot_type *slot) {
    std::allocator_traits<Alloc>::destroy(*alloc, &slot->mutable_value);
  }

private:
  static void emplace(slot_type* slot) {
    // The construction of union doesn't do anything at runtime but it allows us
    // to access its members without violating aliasing rules.
    new (slot) slot_type;
  }
};
+
// A parameters structure for holding the type parameters for a btree_set.
// For sets the slot IS the key: there is no const member to work around, so
// no union indirection (map_slot_type) is needed.
template <typename Key, typename Compare, typename Alloc, int TargetNodeSize, bool Multi>
struct set_params
    : public common_params<Key, Compare, Alloc, TargetNodeSize,
                           sizeof(Key), Multi> {
  using value_type = Key;
  using mutable_value_type = value_type;
  using slot_type = Key;
  using pointer = value_type *;
  using const_pointer = const value_type *;
  // For sets, comparing whole values is the same as comparing keys.
  using value_compare = typename set_params::common_params::key_compare;
  using reference = value_type &;
  using const_reference = const value_type &;
  using is_map_container = std::false_type;
  using init_type = mutable_value_type;

  // Constructs a value in place in the slot via the allocator.
  template <class... Args>
  static void construct(Alloc *alloc, slot_type *slot, Args &&... args) {
    std::allocator_traits<Alloc>::construct(*alloc,
                                            slot,
                                            std::forward<Args>(args)...);
  }
  // Constructs a slot by moving out of another slot.
  static void construct(Alloc *alloc, slot_type *slot, slot_type *other) {
    std::allocator_traits<Alloc>::construct(*alloc, slot, std::move(*other));
  }
  // Move-assigns between two already-constructed slots.
  static void move(Alloc *alloc, slot_type *src, slot_type *dest) {
    *dest = std::move(*src);
  }
  static void destroy(Alloc *alloc, slot_type *slot) {
    std::allocator_traits<Alloc>::destroy(*alloc, slot);
  }
  // Key/element accessors mirroring map_params' interface.
  static const Key &key(const value_type &x) { return x; }
  static const Key &key(const slot_type *slot) { return *slot; }
  static value_type &element(slot_type *slot) { return *slot; }
  static const value_type &element(const slot_type *slot) { return *slot; }
};
+
// Interprets the result of either comparator flavor as "less than": a plain
// boolean comparator yields the bool itself, while a three-way compare-to
// comparator (signed result) means less-than when the result is negative.
template <typename Result>
constexpr bool compare_result_as_less_than(const Result r) {
  if constexpr (std::is_signed_v<Result>) {
    return r < 0;
  } else {
    return r;
  }
}
// An adapter class that converts a lower-bound compare into an upper-bound
// compare. Note: there is no need to make a version of this adapter specialized
// for key-compare-to functors because the upper-bound (the first value greater
// than the input) is never an exact match.
template <typename Compare>
struct upper_bound_adapter {
  explicit upper_bound_adapter(const Compare &c) : comp(c) {}
  template <typename K, typename LK>
  bool operator()(const K &a, const LK &b) const {
    // Returns true when a is not greater than b, i.e. !(b < a).
    return !compare_result_as_less_than(comp(b, a));
  }
private:
  // Store the comparator by value (as upstream Abseil does). The previous
  // `const Compare&` member dangled whenever the adapter was constructed
  // from a temporary comparator; comparators are expected to be cheap to
  // copy (often empty), so holding a copy is safe and essentially free.
  Compare comp;
};
+
// Outcome kind of a key search within a node: exact match or not.
enum class MatchKind : uint8_t { kEq, kNe };

// Search result for key-compare-to comparators: carries the position and
// whether the key at that position compared equal.
// NOTE: member order matters — callers aggregate-initialize {value, match}.
template <typename V, bool IsCompareTo>
struct SearchResult {
  V value;
  MatchKind match;

  static constexpr bool has_match = true;
  bool IsEq() const { return match == MatchKind::kEq; }
};

// When we don't use CompareTo, `match` is not present.
// This ensures that callers can't use it accidentally when it provides no
// useful information.
template <typename V>
struct SearchResult<V, false> {
  V value;

  static constexpr bool has_match = false;
  static constexpr bool IsEq() { return false; }
};
+
+// A node in the btree holding. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
template <typename Params>
class btree_node {
  using is_key_compare_to = typename Params::is_key_compare_to;
  using is_multi_container = typename Params::is_multi_container;
  using field_type = typename Params::node_count_type;
  using allocator_type = typename Params::allocator_type;
  using slot_type = typename Params::slot_type;

 public:
  using params_type = Params;
  using key_type = typename Params::key_type;
  using value_type = typename Params::value_type;
  using mutable_value_type = typename Params::mutable_value_type;
  using pointer = typename Params::pointer;
  using const_pointer = typename Params::const_pointer;
  using reference = typename Params::reference;
  using const_reference = typename Params::const_reference;
  using key_compare = typename Params::key_compare;
  using size_type = typename Params::size_type;
  using difference_type = typename Params::difference_type;

  // Btree decides whether to use linear node search as follows:
  //  - If the key is arithmetic and the comparator is std::less or
  //    std::greater, choose linear.
  //  - Otherwise, choose binary.
  // TODO(ezb): Might make sense to add condition(s) based on node-size.
  using use_linear_search = std::integral_constant<
      bool,
      std::is_arithmetic_v<key_type> &&
          (std::is_same_v<std::less<key_type>, key_compare> ||
           std::is_same_v<std::greater<key_type>, key_compare>)>;

  // Nodes are non-copyable; storage and lifetime are managed externally.
  ~btree_node() = default;
  btree_node(const btree_node&) = delete;
  btree_node& operator=(const btree_node&) = delete;

 protected:
  btree_node() = default;

 private:
  // Byte size of a leaf node holding n values (header + inline value array).
  constexpr static size_type SizeWithNValues(size_type n) {
    return sizeof(base_fields) + n * sizeof(value_type);;
  }
  // A lower bound for the overhead of fields other than values in a leaf node.
  constexpr static size_type MinimumOverhead() {
    return SizeWithNValues(1) - sizeof(value_type);
  }

  // Compute how many values we can fit onto a leaf node taking into account
  // padding. A compile-time recursive binary search for the largest value
  // count whose node size does not exceed kTargetNodeSize.
  constexpr static size_type NodeTargetValues(const int begin, const int end) {
    return begin == end ? begin
                        : SizeWithNValues((begin + end) / 2 + 1) >
                                  params_type::kTargetNodeSize
                              ? NodeTargetValues(begin, (begin + end) / 2)
                              : NodeTargetValues((begin + end) / 2 + 1, end);
  }

  constexpr static int kValueSize = params_type::kValueSize;
  constexpr static int kTargetNodeSize = params_type::kTargetNodeSize;
  constexpr static int kNodeTargetValues = NodeTargetValues(0, kTargetNodeSize);

  // We need a minimum of 3 values per internal node in order to perform
  // splitting (1 value for the two nodes involved in the split and 1 value
  // propagated to the parent as the delimiter for the split).
  constexpr static size_type kNodeValues = std::max(kNodeTargetValues, 3);

  // The node is internal (i.e. is not a leaf node) if and only if `max_count`
  // has this value.
  constexpr static size_type kInternalNodeMaxCount = 0;

  // Header fields common to leaf and internal nodes.
  struct base_fields {
    // A pointer to the node's parent.
    btree_node *parent;
    // The position of the node in the node's parent.
    field_type position;
    // The count of the number of values in the node.
    field_type count;
    // The maximum number of values the node can hold.
    field_type max_count;
  };

  struct leaf_fields : public base_fields {
    // The array of values. Only the first count of these values have been
    // constructed and are valid.
    slot_type values[kNodeValues];
  };

  struct internal_fields : public leaf_fields {
    // The array of child pointers. The keys in children_[i] are all less than
    // key(i). The keys in children_[i + 1] are all greater than key(i). There
    // are always count + 1 children.
    btree_node *children[kNodeValues + 1];
  };

  constexpr static size_type LeafSize(const int max_values = kNodeValues) {
    return SizeWithNValues(max_values);
  }
  constexpr static size_type InternalSize() {
    return sizeof(internal_fields);
  }

  // All field access casts `this` to internal_fields*. Leaf nodes are
  // allocated with only LeafSize() bytes, so for them only the
  // base_fields/leaf_fields members reached this way are valid storage;
  // the children array must never be touched through a leaf node.
  template<auto MemPtr>
  auto& GetField() {
    return reinterpret_cast<internal_fields*>(this)->*MemPtr;
  }

  template<auto MemPtr>
  auto& GetField() const {
    return reinterpret_cast<const internal_fields*>(this)->*MemPtr;
  }

  void set_parent(btree_node *p) { GetField<&base_fields::parent>() = p; }
  field_type &mutable_count() { return GetField<&base_fields::count>(); }
  slot_type *slot(int i) { return &GetField<&leaf_fields::values>()[i]; }
  const slot_type *slot(int i) const { return &GetField<&leaf_fields::values>()[i]; }
  void set_position(field_type v) { GetField<&base_fields::position>() = v; }
  void set_count(field_type v) { GetField<&base_fields::count>() = v; }
  // This method is only called by the node init methods.
  void set_max_count(field_type v) { GetField<&base_fields::max_count>() = v; }

public:
  // Alignment required for node storage; identical for leaf and internal
  // nodes so a single allocation scheme serves both.
  constexpr static size_type Alignment() {
    static_assert(alignof(leaf_fields) == alignof(internal_fields),
                  "Alignment of all nodes must be equal.");
    return alignof(internal_fields);
  }

  // Getter/setter for whether this is a leaf node or not. This value doesn't
  // change after the node is created.
  bool leaf() const { return GetField<&base_fields::max_count>() != kInternalNodeMaxCount; }

  // Getter for the position of this node in its parent.
  field_type position() const { return GetField<&base_fields::position>(); }

  // Getter for the number of values stored in this node.
  field_type count() const { return GetField<&base_fields::count>(); }
  field_type max_count() const {
    // Internal nodes have max_count==kInternalNodeMaxCount.
    // Leaf nodes have max_count in [1, kNodeValues].
    const field_type max_count = GetField<&base_fields::max_count>();
    return max_count == field_type{kInternalNodeMaxCount}
               ? field_type{kNodeValues}
               : max_count;
  }

  // Getter for the parent of this node.
  btree_node* parent() const { return GetField<&base_fields::parent>(); }
  // Getter for whether the node is the root of the tree. The parent of the
  // root of the tree is the leftmost node in the tree which is guaranteed to
  // be a leaf.
  bool is_root() const { return parent()->leaf(); }
  // Re-parents this node onto the root's parent (the leftmost-leaf sentinel
  // per the convention above), making this node the root.
  void make_root() {
    assert(parent()->is_root());
    set_parent(parent()->parent());
  }

  // Getters for the key/value at position i in the node.
  const key_type& key(int i) const { return params_type::key(slot(i)); }
  reference value(int i) { return params_type::element(slot(i)); }
  const_reference value(int i) const { return params_type::element(slot(i)); }

  // Getters/setter for the child at position i in the node.
  btree_node* child(int i) const { return GetField<&internal_fields::children>()[i]; }
  btree_node*& mutable_child(int i) { return GetField<&internal_fields::children>()[i]; }
  // Debug aid only: zeroes a stale child pointer in non-release builds so
  // use-after-clear is easier to catch.
  void clear_child(int i) {
#ifndef NDEBUG
    memset(&mutable_child(i), 0, sizeof(btree_node*));
#endif
  }
  void set_child(int i, btree_node *c) {
    mutable_child(i) = c;
    c->set_position(i);
  }
  // Like set_child, but also fixes up the child's parent pointer.
  void init_child(int i, btree_node *c) {
    set_child(i, c);
    c->set_parent(this);
  }
  // Returns the position of the first value whose key is not less than k.
  template <typename K>
  SearchResult<int, is_key_compare_to::value> lower_bound(
      const K &k, const key_compare &comp) const {
    return use_linear_search::value ? linear_search(k, comp)
                                    : binary_search(k, comp);
  }
  // Returns the position of the first value whose key is greater than k.
  template <typename K>
  int upper_bound(const K &k, const key_compare &comp) const {
    auto upper_compare = upper_bound_adapter<key_compare>(comp);
    return use_linear_search::value ? linear_search(k, upper_compare).value
                                    : binary_search(k, upper_compare).value;
  }

  // Dispatch to the plain-compare or compare-to implementation depending on
  // the comparator kind.
  template <typename K, typename Compare>
  SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
  linear_search(const K &k, const Compare &comp) const {
    return linear_search_impl(k, 0, count(), comp,
                              btree_is_key_compare_to<Compare, key_type>());
  }

  template <typename K, typename Compare>
  SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
  binary_search(const K &k, const Compare &comp) const {
    return binary_search_impl(k, 0, count(), comp,
                              btree_is_key_compare_to<Compare, key_type>());
  }
  // Returns the position of the first value whose key is not less than k using
  // linear search performed using plain compare.
  template <typename K, typename Compare>
  SearchResult<int, false> linear_search_impl(
      const K &k, int s, const int e, const Compare &comp,
      std::false_type /* IsCompareTo */) const {
    while (s < e) {
      if (!comp(key(s), k)) {
        break;
      }
      ++s;
    }
    return {s};
  }

  // Returns the position of the first value whose key is not less than k using
  // linear search performed using compare-to.
  template <typename K, typename Compare>
  SearchResult<int, true> linear_search_impl(
      const K &k, int s, const int e, const Compare &comp,
      std::true_type /* IsCompareTo */) const {
    while (s < e) {
      const auto c = comp(key(s), k);
      if (c == 0) {
        return {s, MatchKind::kEq};
      } else if (c > 0) {
        break;
      }
      ++s;
    }
    return {s, MatchKind::kNe};
  }

  // Returns the position of the first value whose key is not less than k using
  // binary search performed using plain compare.
  template <typename K, typename Compare>
  SearchResult<int, false> binary_search_impl(
      const K &k, int s, int e, const Compare &comp,
      std::false_type /* IsCompareTo */) const {
    while (s != e) {
      const int mid = (s + e) >> 1;
      if (comp(key(mid), k)) {
        s = mid + 1;
      } else {
        e = mid;
      }
    }
    return {s};
  }

  // Returns the position of the first value whose key is not less than k using
  // binary search performed using compare-to.
  template <typename K, typename CompareTo>
  SearchResult<int, true> binary_search_impl(
      const K &k, int s, int e, const CompareTo &comp,
      std::true_type /* IsCompareTo */) const {
    if constexpr (is_multi_container::value) {
      MatchKind exact_match = MatchKind::kNe;
      while (s != e) {
        const int mid = (s + e) >> 1;
        const auto c = comp(key(mid), k);
        if (c < 0) {
          s = mid + 1;
        } else {
          e = mid;
          if (c == 0) {
            // Need to return the first value whose key is not less than k,
            // which requires continuing the binary search if this is a
            // multi-container.
            exact_match = MatchKind::kEq;
          }
        }
      }
      return {s, exact_match};
    } else {  // Not a multi-container.
      // Unique keys: the search can stop at the first exact match.
      while (s != e) {
        const int mid = (s + e) >> 1;
        const auto c = comp(key(mid), k);
        if (c < 0) {
          s = mid + 1;
        } else if (c > 0) {
          e = mid;
        } else {
          return {mid, MatchKind::kEq};
        }
      }
      return {s, MatchKind::kNe};
    }
  }

  // Emplaces a value at position i, shifting all existing values and
  // children at positions >= i to the right by 1.
  template <typename... Args>
  void emplace_value(size_type i, allocator_type *alloc, Args &&... args);

  // Removes the value at position i, shifting all existing values and children
  // at positions > i to the left by 1.
  void remove_value(const int i, allocator_type *alloc);

  // Removes the values at positions [i, i + to_erase), shifting all values
  // after that range to the left by to_erase. Does not change children at all.
  void remove_values_ignore_children(int i, int to_erase,
                                     allocator_type *alloc);

  // Rebalances a node with its right sibling.
  void rebalance_right_to_left(const int to_move, btree_node *right,
                               allocator_type *alloc);
  void rebalance_left_to_right(const int to_move, btree_node *right,
                               allocator_type *alloc);

  // Splits a node, moving a portion of the node's values to its right sibling.
  void split(const int insert_position, btree_node *dest, allocator_type *alloc);

  // Merges a node with its right sibling, moving all of the values and the
  // delimiting key in the parent node onto itself.
  void merge(btree_node *sibling, allocator_type *alloc);

  // Swap the contents of "this" and "src".
  void swap(btree_node *src, allocator_type *alloc);

  // Node allocation/deletion routines.
  static btree_node *init_leaf(btree_node *n, btree_node *parent,
                               int max_count) {
    n->set_parent(parent);
    n->set_position(0);
    n->set_count(0);
    n->set_max_count(max_count);
    return n;
  }
  static btree_node *init_internal(btree_node *n, btree_node *parent) {
    init_leaf(n, parent, kNodeValues);
    // Set `max_count` to a sentinel value to indicate that this node is
    // internal.
    n->set_max_count(kInternalNodeMaxCount);
    return n;
  }
  // Destroys the count() constructed values in this node. Child subtrees and
  // the node's own storage are released elsewhere — NOTE(review): presumably
  // by the owning btree (not fully visible here).
  void destroy(allocator_type *alloc) {
    for (int i = 0; i < count(); ++i) {
      value_destroy(i, alloc);
    }
  }

 private:
  // Constructs the value at slot i via the allocator.
  template <typename... Args>
  void value_init(const size_type i, allocator_type *alloc, Args &&... args) {
    params_type::construct(alloc, slot(i), std::forward<Args>(args)...);
  }
  void value_destroy(const size_type i, allocator_type *alloc) {
    params_type::destroy(alloc, slot(i));
  }

  // Move n values starting at value i in this node into the values starting at
  // value j in node x.
  void uninitialized_move_n(const size_type n, const size_type i,
                            const size_type j, btree_node *x,
                            allocator_type *alloc) {
    for (slot_type *src = slot(i), *end = src + n, *dest = x->slot(j);
         src != end; ++src, ++dest) {
      params_type::construct(alloc, dest, src);
    }
  }

  // Destroys a range of n values, starting at index i.
  void value_destroy_n(const size_type i, const size_type n,
                       allocator_type *alloc) {
    for (int j = 0; j < n; ++j) {
      value_destroy(i + j, alloc);
    }
  }

private:
  template <typename P>
  friend class btree;
  template <typename N, typename R, typename P>
  friend struct btree_iterator;
};
+
// Bidirectional iterator over a btree. Deliberately trivially copyable (no
// user-defined copy operations) for performance and binary-size reasons; the
// conversion constructors below are SFINAE-constrained templates so they do
// not count as copy constructors.
template <typename Node, typename Reference, typename Pointer>
struct btree_iterator {
 private:
  using key_type = typename Node::key_type;
  using size_type = typename Node::size_type;
  using params_type = typename Node::params_type;

  using node_type = Node;
  using normal_node = typename std::remove_const<Node>::type;
  using const_node = const Node;
  using normal_pointer = typename params_type::pointer;
  using normal_reference = typename params_type::reference;
  using const_pointer = typename params_type::const_pointer;
  using const_reference = typename params_type::const_reference;
  using slot_type = typename params_type::slot_type;

  using iterator =
      btree_iterator<normal_node, normal_reference, normal_pointer>;
  using const_iterator =
      btree_iterator<const_node, const_reference, const_pointer>;

 public:
  // These aliases are public for std::iterator_traits.
  using difference_type = typename Node::difference_type;
  using value_type = typename params_type::value_type;
  using pointer = Pointer;
  using reference = Reference;
  using iterator_category = std::bidirectional_iterator_tag;

  btree_iterator() = default;
  btree_iterator(Node *n, int p) : node(n), position(p) {}

  // NOTE: this SFINAE allows for implicit conversions from iterator to
  // const_iterator, but it specifically avoids defining copy constructors so
  // that btree_iterator can be trivially copyable. This is for performance and
  // binary size reasons.
  template<typename N, typename R, typename P,
           std::enable_if_t<
               std::is_same_v<btree_iterator<N, R, P>, iterator> &&
               std::is_same_v<btree_iterator, const_iterator>,
               int> = 0>
  btree_iterator(const btree_iterator<N, R, P> &x)
      : node(x.node), position(x.position) {}

 private:
  // This SFINAE allows explicit conversions from const_iterator to
  // iterator, but also avoids defining a copy constructor.
  // NOTE: the const_cast is safe because this constructor is only called by
  // non-const methods and the container owns the nodes.
  template <typename N, typename R, typename P,
            std::enable_if_t<
                std::is_same_v<btree_iterator<N, R, P>, const_iterator> &&
                std::is_same_v<btree_iterator, iterator>,
                int> = 0>
  explicit btree_iterator(const btree_iterator<N, R, P> &x)
      : node(const_cast<node_type *>(x.node)), position(x.position) {}

  // Increment/decrement the iterator.
  // Fast path: step within the current leaf. Otherwise fall back to the
  // out-of-line slow path that walks the tree.
  void increment() {
    if (node->leaf() && ++position < node->count()) {
      return;
    }
    increment_slow();
  }
  void increment_slow();

  void decrement() {
    if (node->leaf() && --position >= 0) {
      return;
    }
    decrement_slow();
  }
  void decrement_slow();

 public:
  // Equality is defined against both iterator flavors so that mixed
  // iterator/const_iterator comparisons compile without conversions.
  bool operator==(const const_iterator &x) const {
    return node == x.node && position == x.position;
  }
  bool operator!=(const const_iterator &x) const {
    return node != x.node || position != x.position;
  }
  bool operator==(const iterator& x) const {
    return node == x.node && position == x.position;
  }
  bool operator!=(const iterator& x) const {
    return node != x.node || position != x.position;
  }

  // Accessors for the key/value the iterator is pointing at.
  reference operator*() const {
    return node->value(position);
  }
  pointer operator->() const {
    return &node->value(position);
  }

  btree_iterator& operator++() {
    increment();
    return *this;
  }
  btree_iterator& operator--() {
    decrement();
    return *this;
  }
  btree_iterator operator++(int) {
    btree_iterator tmp = *this;
    ++*this;
    return tmp;
  }
  btree_iterator operator--(int) {
    btree_iterator tmp = *this;
    --*this;
    return tmp;
  }

 private:
  template <typename Params>
  friend class btree;
  template <typename Tree>
  friend class btree_container;
  template <typename Tree>
  friend class btree_set_container;
  template <typename Tree>
  friend class btree_map_container;
  template <typename Tree>
  friend class btree_multiset_container;
  template <typename N, typename R, typename P>
  friend struct btree_iterator;

  const key_type &key() const { return node->key(position); }
  slot_type *slot() { return node->slot(position); }

  // The node in the tree the iterator is pointing at.
  Node *node = nullptr;
  // The position within the node of the tree the iterator is pointing at.
  int position = -1;
};
+
// Adapts an arbitrary allocator so the raw byte buffers it hands out are
// aligned to `Alignment`. Requests are rounded up to whole alignment-sized
// units and serviced through a rebound copy of `Alloc`.
template <size_t Alignment, class Alloc>
class AlignedAlloc {
  // A dummy empty type whose alignment (and therefore size) is `Alignment`;
  // allocating arrays of it yields suitably aligned storage.
  struct alignas(Alignment) unit_t {};
  using unit_alloc_t =
      typename std::allocator_traits<Alloc>::template rebind_alloc<unit_t>;
  using unit_traits_t =
      typename std::allocator_traits<Alloc>::template rebind_traits<unit_t>;
  // Number of alignment-sized units needed to cover `size` bytes.
  static constexpr size_t unit_count(size_t size) {
    return (size + sizeof(unit_t) - 1) / sizeof(unit_t);
  }
public:
  // Returns `size` usable bytes aligned to `Alignment`.
  static void* allocate(Alloc* alloc, size_t size) {
    unit_alloc_t rebound(*alloc);
    void* const p = unit_traits_t::allocate(rebound, unit_count(size));
    assert(reinterpret_cast<uintptr_t>(p) % Alignment == 0 &&
           "allocator does not respect alignment");
    return p;
  }
  // Releases a buffer previously obtained from allocate() with the same size.
  static void deallocate(Alloc* alloc, void* p, size_t size) {
    unit_alloc_t rebound(*alloc);
    unit_traits_t::deallocate(rebound, static_cast<unit_t*>(p),
                              unit_count(size));
  }
};
+
+template <typename Params>
+class btree {
+ using node_type = btree_node<Params>;
+ using is_key_compare_to = typename Params::is_key_compare_to;
+
+ // We use a static empty node for the root/leftmost/rightmost of empty btrees
+ // in order to avoid branching in begin()/end().
+ struct alignas(node_type::Alignment()) EmptyNodeType : node_type {
+ using field_type = typename node_type::field_type;
+ node_type *parent;
+ field_type position = 0;
+ field_type count = 0;
+ // max_count must be != kInternalNodeMaxCount (so that this node is regarded
+ // as a leaf node). max_count() is never called when the tree is empty.
+ field_type max_count = node_type::kInternalNodeMaxCount + 1;
+
+ constexpr EmptyNodeType(node_type *p) : parent(p) {}
+ };
+
+ static node_type *EmptyNode() {
+ static constexpr EmptyNodeType empty_node(
+ const_cast<EmptyNodeType *>(&empty_node));
+ return const_cast<EmptyNodeType *>(&empty_node);
+ }
+
+ constexpr static int kNodeValues = node_type::kNodeValues;
+ constexpr static int kMinNodeValues = kNodeValues / 2;
+ constexpr static int kValueSize = node_type::kValueSize;
+
+ // A helper class to get the empty base class optimization for 0-size
+ // allocators. Base is allocator_type.
+ // (e.g. empty_base_handle<key_compare, allocator_type, node_type*>). If Base is
+ // 0-size, the compiler doesn't have to reserve any space for it and
+ // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
+ // class optimization] for more details.
+ template <typename Base1, typename Base2, typename Data>
+ struct empty_base_handle : public Base1, Base2 {
+ empty_base_handle(const Base1 &b1, const Base2 &b2, const Data &d)
+ : Base1(b1),
+ Base2(b2),
+ data(d) {}
+ Data data;
+ };
+
+ struct node_stats {
+ using size_type = typename Params::size_type;
+
+ node_stats(size_type l, size_type i)
+ : leaf_nodes(l),
+ internal_nodes(i) {
+ }
+
+ node_stats& operator+=(const node_stats &x) {
+ leaf_nodes += x.leaf_nodes;
+ internal_nodes += x.internal_nodes;
+ return *this;
+ }
+
+ size_type leaf_nodes;
+ size_type internal_nodes;
+ };
+
+ public:
+ using key_type = typename Params::key_type;
+ using value_type = typename Params::value_type;
+ using size_type = typename Params::size_type;
+ using difference_type = typename Params::difference_type;
+ using key_compare = typename Params::key_compare;
+ using value_compare = typename Params::value_compare;
+ using allocator_type = typename Params::allocator_type;
+ using reference = typename Params::reference;
+ using const_reference = typename Params::const_reference;
+ using pointer = typename Params::pointer;
+ using const_pointer = typename Params::const_pointer;
+ using iterator = btree_iterator<node_type, reference, pointer>;
+ using const_iterator = typename iterator::const_iterator;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ // Internal types made public for use by btree_container types.
+ using params_type = Params;
+
+ private:
+ // For use in copy_or_move_values_in_order.
+ const value_type &maybe_move_from_iterator(const_iterator x) { return *x; }
+ value_type &&maybe_move_from_iterator(iterator x) { return std::move(*x); }
+
+ // Copies or moves (depending on the template parameter) the values in
+ // x into this btree in their order in x. This btree must be empty before this
+ // method is called. This method is used in copy construction, copy
+ // assignment, and move assignment.
+ template <typename Btree>
+ void copy_or_move_values_in_order(Btree *x);
+
+ // Validates that various assumptions/requirements are true at compile time.
+ constexpr static bool static_assert_validation();
+
+ public:
+ btree(const key_compare &comp, const allocator_type &alloc);
+
+ btree(const btree &x);
+ btree(btree &&x) noexcept
+ : root_(std::move(x.root_)),
+ rightmost_(std::exchange(x.rightmost_, EmptyNode())),
+ size_(std::exchange(x.size_, 0)) {
+ x.mutable_root() = EmptyNode();
+ }
+
+ ~btree() {
+ // Put static_asserts in destructor to avoid triggering them before the type
+ // is complete.
+ static_assert(static_assert_validation(), "This call must be elided.");
+ clear();
+ }
+
+ // Assign the contents of x to *this.
+ btree &operator=(const btree &x);
+ btree &operator=(btree &&x) noexcept;
+
+ iterator begin() {
+ return iterator(leftmost(), 0);
+ }
+ const_iterator begin() const {
+ return const_iterator(leftmost(), 0);
+ }
+ iterator end() {
+ return iterator(rightmost_, rightmost_->count());
+ }
+ const_iterator end() const {
+ return const_iterator(rightmost_, rightmost_->count());
+ }
+ reverse_iterator rbegin() {
+ return reverse_iterator(end());
+ }
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(end());
+ }
+ reverse_iterator rend() {
+ return reverse_iterator(begin());
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(begin());
+ }
+
+ // Finds the first element whose key is not less than key.
+ template <typename K>
+ iterator lower_bound(const K &key) {
+ return internal_end(internal_lower_bound(key));
+ }
+ template <typename K>
+ const_iterator lower_bound(const K &key) const {
+ return internal_end(internal_lower_bound(key));
+ }
+
+ // Finds the first element whose key is greater than key.
+ template <typename K>
+ iterator upper_bound(const K &key) {
+ return internal_end(internal_upper_bound(key));
+ }
+ template <typename K>
+ const_iterator upper_bound(const K &key) const {
+ return internal_end(internal_upper_bound(key));
+ }
+
+ // Finds the range of values which compare equal to key. The first member of
+ // the returned pair is equal to lower_bound(key). The second member pair of
+ // the pair is equal to upper_bound(key).
+ template <typename K>
+ std::pair<iterator, iterator> equal_range(const K &key) {
+ return {lower_bound(key), upper_bound(key)};
+ }
+ template <typename K>
+ std::pair<const_iterator, const_iterator> equal_range(const K &key) const {
+ return {lower_bound(key), upper_bound(key)};
+ }
+
+ // Inserts a value into the btree only if it does not already exist. The
+ // boolean return value indicates whether insertion succeeded or failed.
+ // Requirement: if `key` already exists in the btree, does not consume `args`.
+ // Requirement: `key` is never referenced after consuming `args`.
+ template <typename... Args>
+ std::pair<iterator, bool> insert_unique(const key_type &key, Args &&... args);
+
+ // Inserts with hint. Checks to see if the value should be placed immediately
+ // before `position` in the tree. If so, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_unique() were made.
+ // Requirement: if `key` already exists in the btree, does not consume `args`.
+ // Requirement: `key` is never referenced after consuming `args`.
+ template <typename... Args>
+ std::pair<iterator, bool> insert_hint_unique(iterator position,
+ const key_type &key,
+ Args &&... args);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_iterator_unique(InputIterator b, InputIterator e);
+
+ // Inserts a value into the btree.
+ template <typename ValueType>
+ iterator insert_multi(const key_type &key, ValueType &&v);
+
+ // Inserts a value into the btree.
+ template <typename ValueType>
+ iterator insert_multi(ValueType &&v) {
+ return insert_multi(params_type::key(v), std::forward<ValueType>(v));
+ }
+
+ // Insert with hint. Check to see if the value should be placed immediately
+ // before position in the tree. If it does, then the insertion will take
+ // amortized constant time. If not, the insertion will take amortized
+ // logarithmic time as if a call to insert_multi(v) were made.
+ template <typename ValueType>
+ iterator insert_hint_multi(iterator position, ValueType &&v);
+
+ // Insert a range of values into the btree.
+ template <typename InputIterator>
+ void insert_iterator_multi(InputIterator b, InputIterator e);
+
+ // Erase the specified iterator from the btree. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the node after
+ // the one that was erased (or end() if none exists).
+ // Requirement: does not read the value at `*iter`.
+ iterator erase(iterator iter);
+
+ // Erases range. Returns the number of keys erased and an iterator pointing
+ // to the element after the last erased element.
+ std::pair<size_type, iterator> erase(iterator begin, iterator end);
+
+ // Erases the specified key from the btree. Returns 1 if an element was
+ // erased and 0 otherwise.
+ template <typename K>
+ size_type erase_unique(const K &key);
+
+ // Erases all of the entries matching the specified key from the
+ // btree. Returns the number of elements erased.
+ template <typename K>
+ size_type erase_multi(const K &key);
+
+ // Finds the iterator corresponding to a key or returns end() if the key is
+ // not present.
+ template <typename K>
+ iterator find(const K &key) {
+ return internal_end(internal_find(key));
+ }
+ template <typename K>
+ const_iterator find(const K &key) const {
+ return internal_end(internal_find(key));
+ }
+
+ // Returns a count of the number of times the key appears in the btree.
+ template <typename K>
+ size_type count_unique(const K &key) const {
+ const iterator begin = internal_find(key);
+ if (begin.node == nullptr) {
+ // The key doesn't exist in the tree.
+ return 0;
+ }
+ return 1;
+ }
+ // Returns a count of the number of times the key appears in the btree.
+ template <typename K>
+ size_type count_multi(const K &key) const {
+ const auto range = equal_range(key);
+ return std::distance(range.first, range.second);
+ }
+
+ // Clear the btree, deleting all of the values it contains.
+ void clear();
+
+ // Swap the contents of *this and x.
+ void swap(btree &x);
+
+ const key_compare &key_comp() const noexcept {
+ return *static_cast<const key_compare*>(&root_);
+ }
+ template <typename K, typename LK>
+ bool compare_keys(const K &x, const LK &y) const {
+ return compare_result_as_less_than(key_comp()(x, y));
+ }
+
+ // Verifies the structure of the btree.
+ void verify() const;
+
+ // Size routines.
+ size_type size() const { return size_; }
+ size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+ bool empty() const { return size_ == 0; }
+
+ // The height of the btree. An empty tree will have height 0.
+ size_type height() const {
+ size_type h = 0;
+ if (!empty()) {
+ // Count the length of the chain from the leftmost node up to the
+ // root. We actually count from the root back around to the level below
+ // the root, but the calculation is the same because of the circularity
+ // of that traversal.
+ const node_type *n = root();
+ do {
+ ++h;
+ n = n->parent();
+ } while (n != root());
+ }
+ return h;
+ }
+
+ // The number of internal, leaf and total nodes used by the btree.
+ size_type leaf_nodes() const {
+ return internal_stats(root()).leaf_nodes;
+ }
+ size_type internal_nodes() const {
+ return internal_stats(root()).internal_nodes;
+ }
+ size_type nodes() const {
+ node_stats stats = internal_stats(root());
+ return stats.leaf_nodes + stats.internal_nodes;
+ }
+
+ // The total number of bytes used by the btree.
+ size_type bytes_used() const {
+ node_stats stats = internal_stats(root());
+ if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
+ return sizeof(*this) +
+ node_type::LeafSize(root()->max_count());
+ } else {
+ return sizeof(*this) +
+ stats.leaf_nodes * node_type::LeafSize() +
+ stats.internal_nodes * node_type::InternalSize();
+ }
+ }
+
+ // The average number of bytes used per value stored in the btree.
+ static double average_bytes_per_value() {
+ // Returns the number of bytes per value on a leaf node that is 75%
+ // full. Experimentally, this matches up nicely with the computed number of
+ // bytes per value in trees that had their values inserted in random order.
+ return node_type::LeafSize() / (kNodeValues * 0.75);
+ }
+
+ // The fullness of the btree. Computed as the number of elements in the btree
+ // divided by the maximum number of elements a tree with the current number
+ // of nodes could hold. A value of 1 indicates perfect space
+ // utilization. Smaller values indicate space wastage.
+ // Returns 0 for empty trees.
+ double fullness() const {
+ if (empty()) return 0.0;
+ return static_cast<double>(size()) / (nodes() * kNodeValues);
+ }
+ // The overhead of the btree structure in bytes per node. Computed as the
+ // total number of bytes used by the btree minus the number of bytes used for
+ // storing elements divided by the number of elements.
+ // Returns 0 for empty trees.
+ double overhead() const {
+ if (empty()) return 0.0;
+ return (bytes_used() - size() * sizeof(value_type)) /
+ static_cast<double>(size());
+ }
+
+ // The allocator used by the btree.
+ allocator_type get_allocator() const {
+ return allocator();
+ }
+
+ private:
+ // Internal accessor routines.
+ node_type *root() { return root_.data; }
+ const node_type *root() const { return root_.data; }
+ node_type *&mutable_root() { return root_.data; }
+ key_compare *mutable_key_comp() noexcept {
+ return static_cast<key_compare*>(&root_);
+ }
+
+ node_type* rightmost() {
+ return rightmost_;
+ }
+ const node_type* rightmost() const {
+ return rightmost_;
+ }
+ // The leftmost node is stored as the parent of the root node.
+ node_type* leftmost() { return root() ? root()->parent() : NULL; }
+ const node_type* leftmost() const { return root() ? root()->parent() : NULL; }
+
+ // The size of the tree is stored in the root node.
+ size_type* mutable_size() { return root()->mutable_size(); }
+
+ // Allocator routines.
+ allocator_type* mutable_allocator() noexcept {
+ return static_cast<allocator_type*>(&root_);
+ }
+ const allocator_type& allocator() const noexcept {
+ return *static_cast<const allocator_type*>(&root_);
+ }
+
+ node_type *allocate(const size_type size) {
+ using aligned_alloc_t =
+ AlignedAlloc<node_type::Alignment(), allocator_type>;
+ return static_cast<node_type*>(
+ aligned_alloc_t::allocate(mutable_allocator(), size));
+ }
+
+ // Node creation/deletion routines.
+ node_type* new_internal_node(node_type *parent) {
+ node_type *p = allocate(node_type::InternalSize());
+ return node_type::init_internal(p, parent);
+ }
+ node_type* new_leaf_node(node_type *parent) {
+ node_type *p = allocate(node_type::LeafSize());
+ return node_type::init_leaf(p, parent, kNodeValues);
+ }
+ node_type *new_leaf_root_node(const int max_count) {
+ node_type *p = allocate(node_type::LeafSize(max_count));
+ return node_type::init_leaf(p, p, max_count);
+ }
+
+ // Deletion helper routines.
+ void erase_same_node(iterator begin, iterator end);
+ iterator erase_from_leaf_node(iterator begin, size_type to_erase);
+ iterator rebalance_after_delete(iterator iter);
+
+ // Deallocates a node of a certain size in bytes using the allocator.
+ void deallocate(const size_type size, node_type *node) {
+ using aligned_alloc_t =
+ AlignedAlloc<node_type::Alignment(), allocator_type>;
+ aligned_alloc_t::deallocate(mutable_allocator(), node, size);
+ }
+
+ void delete_internal_node(node_type *node) {
+ node->destroy(mutable_allocator());
+ deallocate(node_type::InternalSize(), node);
+ }
+ void delete_leaf_node(node_type *node) {
+ node->destroy(mutable_allocator());
+ deallocate(node_type::LeafSize(node->max_count()), node);
+ }
+
+ // Rebalances or splits the node iter points to.
+ void rebalance_or_split(iterator *iter);
+
+ // Merges the values of left, right and the delimiting key on their parent
+ // onto left, removing the delimiting key and deleting right.
+ void merge_nodes(node_type *left, node_type *right);
+
+ // Tries to merge node with its left or right sibling, and failing that,
+ // rebalance with its left or right sibling. Returns true if a merge
+ // occurred, at which point it is no longer valid to access node. Returns
+ // false if no merging took place.
+ bool try_merge_or_rebalance(iterator *iter);
+
+ // Tries to shrink the height of the tree by 1.
+ void try_shrink();
+
+ iterator internal_end(iterator iter) {
+ return iter.node != nullptr ? iter : end();
+ }
+ const_iterator internal_end(const_iterator iter) const {
+ return iter.node != nullptr ? iter : end();
+ }
+
+ // Emplaces a value into the btree immediately before iter. Requires that
+ // key(v) <= iter.key() and (--iter).key() <= key(v).
+ template <typename... Args>
+ iterator internal_emplace(iterator iter, Args &&... args);
+
+ // Returns an iterator pointing to the first value >= the value "iter" is
+ // pointing at. Note that "iter" might be pointing to an invalid location as
+ // iter.position == iter.node->count(). This routine simply moves iter up in
+ // the tree to a valid location.
+ // Requires: iter.node is non-null.
+ template <typename IterType>
+ static IterType internal_last(IterType iter);
+
+ // Returns an iterator pointing to the leaf position at which key would
+ // reside in the tree. We provide 2 versions of internal_locate. The first
+ // version uses a less-than comparator and is incapable of distinguishing when
+ // there is an exact match. The second version is for the key-compare-to
+ // specialization and distinguishes exact matches. The key-compare-to
+ // specialization allows the caller to avoid a subsequent comparison to
+ // determine if an exact match was made, which is important for keys with
+ // expensive comparison, such as strings.
+ template <typename K>
+ SearchResult<iterator, is_key_compare_to::value> internal_locate(
+ const K &key) const;
+
+ template <typename K>
+ SearchResult<iterator, false> internal_locate_impl(
+ const K &key, std::false_type /* IsCompareTo */) const;
+
+ template <typename K>
+ SearchResult<iterator, true> internal_locate_impl(
+ const K &key, std::true_type /* IsCompareTo */) const;
+
+ // Internal routine which implements lower_bound().
+ template <typename K>
+ iterator internal_lower_bound(const K &key) const;
+
+ // Internal routine which implements upper_bound().
+ template <typename K>
+ iterator internal_upper_bound(const K &key) const;
+
+ // Internal routine which implements find().
+ template <typename K>
+ iterator internal_find(const K &key) const;
+
+ // Deletes a node and all of its children.
+ void internal_clear(node_type *node);
+
+ // Verifies the tree structure of node.
+ int internal_verify(const node_type *node,
+ const key_type *lo, const key_type *hi) const;
+
+ node_stats internal_stats(const node_type *node) const {
+ // The root can be a static empty node.
+ if (node == nullptr || (node == root() && empty())) {
+ return node_stats(0, 0);
+ }
+ if (node->leaf()) {
+ return node_stats(1, 0);
+ }
+ node_stats res(0, 1);
+ for (int i = 0; i <= node->count(); ++i) {
+ res += internal_stats(node->child(i));
+ }
+ return res;
+ }
+
+ private:
+ empty_base_handle<key_compare, allocator_type, node_type*> root_;
+
+ // A pointer to the rightmost node. Note that the leftmost node is stored as
+ // the root's parent.
+ node_type *rightmost_;
+
+ // Number of values.
+ size_type size_;
+};
+
+////
+// btree_node methods
// Inserts a value constructed from `args` at slot `i`, shifting existing
// values (and, for internal nodes, the child pointers after i+1) one slot to
// the right. The vacated child slot i+1 is cleared for the caller to fill.
template <typename P>
template <typename... Args>
inline void btree_node<P>::emplace_value(const size_type i,
                                         allocator_type *alloc,
                                         Args &&... args) {
  assert(i <= count());
  // Shift old values to create space for new value and then construct it in
  // place.
  if (i < count()) {
    // Construct a new last slot from the old last value, move the middle
    // values up by one, then destroy the vacated slot i before re-initializing
    // it below.
    value_init(count(), alloc, slot(count() - 1));
    std::copy_backward(std::make_move_iterator(slot(i)),
                       std::make_move_iterator(slot(count() - 1)),
                       slot(count()));
    value_destroy(i, alloc);
  }
  value_init(i, alloc, std::forward<Args>(args)...);
  set_count(count() + 1);

  if (!leaf() && count() > i + 1) {
    // Shift the child pointers right to keep them paired with their values.
    for (int j = count(); j > i + 1; --j) {
      set_child(j, child(j - 1));
    }
    clear_child(i + 1);
  }
}
+
// Removes the value at slot `i` and, for internal nodes, the (already
// emptied) child that follows it, shifting later children left by one.
template <typename P>
inline void btree_node<P>::remove_value(const int i, allocator_type *alloc) {
  if (!leaf() && count() > i + 1) {
    // The child being dropped must have been emptied by the caller first.
    assert(child(i + 1)->count() == 0);
    for (size_type j = i + 1; j < count(); ++j) {
      set_child(j, child(j + 1));
    }
    clear_child(count());
  }

  remove_values_ignore_children(i, /*to_erase=*/1, alloc);
}
+
// Removes `to_erase` values starting at slot `i` by moving the tail values
// left and destroying the now-unused trailing slots. Child pointers are left
// untouched (hence "ignore_children").
template <typename P>
inline void btree_node<P>::remove_values_ignore_children(
    const int i, const int to_erase, allocator_type *alloc) {
  assert(to_erase >= 0);
  std::copy(std::make_move_iterator(slot(i + to_erase)),
            std::make_move_iterator(slot(count())),
            slot(i));
  // The moved-from tail slots are destroyed before the count is reduced.
  value_destroy_n(count() - to_erase, to_erase, alloc);
  set_count(count() - to_erase);
}
+
// Rebalancing: moves `to_move` values from `right` (this node's immediate
// right sibling) into this node, rotating through the delimiting value stored
// in the parent. Preconditions (asserted): same parent, adjacent positions,
// and `right` holds at least `to_move` values.
template <typename P>
void btree_node<P>::rebalance_right_to_left(const int to_move,
                                            btree_node *right,
                                            allocator_type *alloc) {
  assert(parent() == right->parent());
  assert(position() + 1 == right->position());
  assert(right->count() >= count());
  assert(to_move >= 1);
  assert(to_move <= right->count());

  // 1) Move the delimiting value in the parent to the left node.
  value_init(count(), alloc, parent()->slot(position()));

  // 2) Move the (to_move - 1) values from the right node to the left node.
  right->uninitialized_move_n(to_move - 1, 0, count() + 1, this, alloc);

  // 3) Move the new delimiting value to the parent from the right node.
  params_type::move(alloc, right->slot(to_move - 1),
                    parent()->slot(position()));

  // 4) Shift the values in the right node to their correct position.
  std::copy(std::make_move_iterator(right->slot(to_move)),
            std::make_move_iterator(right->slot(right->count())),
            right->slot(0));

  // 5) Destroy the now-empty to_move entries in the right node.
  right->value_destroy_n(right->count() - to_move, to_move, alloc);

  if (!leaf()) {
    // Move the child pointers from the right to the left node.
    for (int i = 0; i < to_move; ++i) {
      init_child(count() + i + 1, right->child(i));
    }
    // Compact the right node's remaining children down to the front.
    for (int i = 0; i <= right->count() - to_move; ++i) {
      assert(i + to_move <= right->max_count());
      right->init_child(i, right->child(i + to_move));
      right->clear_child(i + to_move);
    }
  }

  // Fixup the counts on the left and right nodes.
  set_count(count() + to_move);
  right->set_count(right->count() - to_move);
}
+
// Rebalancing: moves `to_move` values from this node into its right sibling
// `right`, rotating through the delimiting value stored in the parent.
// Preconditions (asserted): same parent, adjacent positions, and this node
// holds at least `to_move` values.
template <typename P>
void btree_node<P>::rebalance_left_to_right(const int to_move,
                                            btree_node *right,
                                            allocator_type *alloc) {
  assert(parent() == right->parent());
  assert(position() + 1 == right->position());
  assert(count() >= right->count());
  assert(to_move >= 1);
  assert(to_move <= count());

  // Values in the right node are shifted to the right to make room for the
  // new to_move values. Then, the delimiting value in the parent and the
  // other (to_move - 1) values in the left node are moved into the right node.
  // Lastly, a new delimiting value is moved from the left node into the
  // parent, and the remaining empty left node entries are destroyed.

  if (right->count() >= to_move) {
    // The original location of the right->count() values are sufficient to hold
    // the new to_move entries from the parent and left node.

    // 1) Shift existing values in the right node to their correct positions.
    right->uninitialized_move_n(to_move, right->count() - to_move,
                                right->count(), right, alloc);
    std::copy_backward(std::make_move_iterator(right->slot(0)),
                       std::make_move_iterator(right->slot(right->count() - to_move)),
                       right->slot(right->count()));

    // 2) Move the delimiting value in the parent to the right node.
    params_type::move(alloc, parent()->slot(position()),
                      right->slot(to_move - 1));

    // 3) Move the (to_move - 1) values from the left node to the right node.
    std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
              std::make_move_iterator(slot(count())),
              right->slot(0));
  } else {
    // The right node does not have enough initialized space to hold the new
    // to_move entries, so part of them will move to uninitialized space.

    // 1) Shift existing values in the right node to their correct positions.
    right->uninitialized_move_n(right->count(), 0, to_move, right, alloc);

    // 2) Move the delimiting value in the parent to the right node.
    right->value_init(to_move - 1, alloc, parent()->slot(position()));

    // 3) Move the (to_move - 1) values from the left node to the right node.
    const size_type uninitialized_remaining = to_move - right->count() - 1;
    uninitialized_move_n(uninitialized_remaining,
                         count() - uninitialized_remaining, right->count(),
                         right, alloc);
    std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
              std::make_move_iterator(slot(count() - uninitialized_remaining)),
              right->slot(0));
  }

  // 4) Move the new delimiting value to the parent from the left node.
  params_type::move(alloc, slot(count() - to_move), parent()->slot(position()));

  // 5) Destroy the now-empty to_move entries in the left node.
  value_destroy_n(count() - to_move, to_move, alloc);

  if (!leaf()) {
    // Move the child pointers from the left to the right node.
    // Shift right's own children up first (backwards, to avoid clobbering).
    for (int i = right->count(); i >= 0; --i) {
      right->init_child(i + to_move, right->child(i));
      right->clear_child(i);
    }
    for (int i = 1; i <= to_move; ++i) {
      right->init_child(i - 1, child(count() - to_move + i));
      clear_child(count() - to_move + i);
    }
  }

  // Fixup the counts on the left and right nodes.
  set_count(count() - to_move);
  right->set_count(right->count() + to_move);
}
+
// Splits a full node: a suffix of the values moves into the empty node
// `dest`, the new largest value of this node is promoted into the parent as
// the delimiting key, and `dest` is hooked in as the parent's next child.
// `insert_position` biases the split point so the pending insert lands in
// the less-full node.
template <typename P>
void btree_node<P>::split(const int insert_position, btree_node *dest,
                          allocator_type *alloc) {
  assert(dest->count() == 0);
  assert(max_count() == kNodeValues);

  // We bias the split based on the position being inserted. If we're
  // inserting at the beginning of the left node then bias the split to put
  // more values on the right node. If we're inserting at the end of the
  // right node then bias the split to put more values on the left node.
  if (insert_position == 0) {
    dest->set_count(count() - 1);
  } else if (insert_position == kNodeValues) {
    dest->set_count(0);
  } else {
    dest->set_count(count() / 2);
  }
  set_count(count() - dest->count());
  assert(count() >= 1);

  // Move values from the left sibling to the right sibling.
  uninitialized_move_n(dest->count(), count(), 0, dest, alloc);

  // Destroy the now-empty entries in the left node.
  value_destroy_n(count(), dest->count(), alloc);

  // The split key is the largest value in the left sibling.
  set_count(count() - 1);
  parent()->emplace_value(position(), alloc, slot(count()));
  value_destroy(count(), alloc);
  parent()->init_child(position() + 1, dest);

  if (!leaf()) {
    // Hand the children that belong with the moved values over to dest.
    for (int i = 0; i <= dest->count(); ++i) {
      assert(child(count() + i + 1) != nullptr);
      dest->init_child(i, child(count() + i + 1));
      clear_child(count() + i + 1);
    }
  }
}
+
// Merges the right sibling `src` into this node: the delimiting value comes
// down from the parent, src's values (and children, for internal nodes) move
// over, and the parent's entry for src is removed. src is left empty.
template <typename P>
void btree_node<P>::merge(btree_node *src, allocator_type *alloc) {
  assert(parent() == src->parent());
  assert(position() + 1 == src->position());

  // Move the delimiting value to the left node.
  value_init(count(), alloc, parent()->slot(position()));

  // Move the values from the right to the left node.
  src->uninitialized_move_n(src->count(), 0, count() + 1, this, alloc);

  // Destroy the now-empty entries in the right node.
  src->value_destroy_n(0, src->count(), alloc);

  if (!leaf()) {
    // Move the child pointers from the right to the left node.
    for (int i = 0; i <= src->count(); ++i) {
      init_child(count() + i + 1, src->child(i));
      src->clear_child(i);
    }
  }

  // Fixup the counts on the src and dest nodes.
  // +1 accounts for the delimiting value pulled down from the parent.
  set_count(1 + count() + src->count());
  src->set_count(0);

  // Remove the value on the parent node.
  parent()->remove_value(position(), alloc);
}
+
// Exchanges the full contents (values, children, counts) of this node and x.
// The common prefix of values is swapped in place; the surplus of the larger
// node is moved into the smaller node's uninitialized slots.
template <typename P>
void btree_node<P>::swap(btree_node *x, allocator_type *alloc) {
  using std::swap;
  assert(leaf() == x->leaf());

  // Determine which is the smaller/larger node.
  btree_node *smaller = this, *larger = x;
  if (smaller->count() > larger->count()) {
    swap(smaller, larger);
  }

  // Swap the values.
  std::swap_ranges(smaller->slot(0), smaller->slot(smaller->count()),
                   larger->slot(0));

  // Move values that can't be swapped.
  const size_type to_move = larger->count() - smaller->count();
  larger->uninitialized_move_n(to_move, smaller->count(), smaller->count(),
                               smaller, alloc);
  larger->value_destroy_n(smaller->count(), to_move, alloc);

  if (!leaf()) {
    // Swap the child pointers.
    std::swap_ranges(&smaller->mutable_child(0),
                     &smaller->mutable_child(smaller->count() + 1),
                     &larger->mutable_child(0));
    // Update swapped children's parent pointers.
    int i = 0;
    for (; i <= smaller->count(); ++i) {
      smaller->child(i)->set_parent(smaller);
      larger->child(i)->set_parent(larger);
    }
    // Move the child pointers that couldn't be swapped.
    for (; i <= larger->count(); ++i) {
      smaller->init_child(i, larger->child(i));
      larger->clear_child(i);
    }
  }

  // Swap the counts.
  swap(mutable_count(), x->mutable_count());
}
+
+////
+// btree_iterator methods
// Slow path of increment(), taken when the next value is not in the current
// leaf. Leaf case: climb toward the root until an ancestor has a value after
// this child; if none does (we were at the last value), the iterator is
// restored to its saved state, which serves as the end() sentinel. Internal
// case: descend to the leftmost leaf of the next child.
template <typename N, typename R, typename P>
void btree_iterator<N, R, P>::increment_slow() {
  if (node->leaf()) {
    assert(position >= node->count());
    btree_iterator save(*this);
    while (position == node->count() && !node->is_root()) {
      assert(node->parent()->child(node->position()) == node);
      position = node->position();
      node = node->parent();
    }
    if (position == node->count()) {
      // Reached the root without finding a successor: stay at end().
      *this = save;
    }
  } else {
    assert(position < node->count());
    node = node->child(position + 1);
    while (!node->leaf()) {
      node = node->child(0);
    }
    position = 0;
  }
}
+
// Slow path of decrement(), the mirror of increment_slow(). Leaf case: climb
// toward the root until an ancestor has a value before this child; if none
// does (we were at the first value), the iterator is restored to its saved
// state. Internal case: descend to the rightmost leaf of the child at
// `position` and point at its last value.
template <typename N, typename R, typename P>
void btree_iterator<N, R, P>::decrement_slow() {
  if (node->leaf()) {
    assert(position <= -1);
    btree_iterator save(*this);
    while (position < 0 && !node->is_root()) {
      assert(node->parent()->child(node->position()) == node);
      position = node->position() - 1;
      node = node->parent();
    }
    if (position < 0) {
      // Reached the root without finding a predecessor: restore state.
      *this = save;
    }
  } else {
    assert(position >= 0);
    node = node->child(position);
    while (!node->leaf()) {
      node = node->child(node->count());
    }
    position = node->count() - 1;
  }
}
+
+////
+// btree methods
+template <typename P>
+template <typename Btree>
+void btree<P>::copy_or_move_values_in_order(Btree *x) {
+ static_assert(std::is_same_v<btree, Btree>||
+ std::is_same_v<const btree, Btree>,
+ "Btree type must be same or const.");
+ assert(empty());
+
+ // We can avoid key comparisons because we know the order of the
+ // values is the same order we'll store them in.
+ auto iter = x->begin();
+ if (iter == x->end()) return;
+ insert_multi(maybe_move_from_iterator(iter));
+ ++iter;
+ for (; iter != x->end(); ++iter) {
+ // If the btree is not empty, we can just insert the new value at the end
+ // of the tree.
+ internal_emplace(end(), maybe_move_from_iterator(iter));
+ }
+}
+
// Compile-time validation of the Params this tree was instantiated with.
// Always returns true; it is invoked from a static_assert in ~btree() so the
// checks only fire once the type is complete.
template <typename P>
constexpr bool btree<P>::static_assert_validation() {
  static_assert(std::is_nothrow_copy_constructible_v<key_compare>,
                "Key comparison must be nothrow copy constructible");
  static_assert(std::is_nothrow_copy_constructible_v<allocator_type>,
                "Allocator must be nothrow copy constructible");
  static_assert(std::is_trivially_copyable_v<iterator>,
                "iterator not trivially copyable.");

  // Note: We assert that kTargetValues, which is computed from
  // Params::kTargetNodeSize, must fit the base_fields::field_type.
  static_assert(
      kNodeValues < (1 << (8 * sizeof(typename node_type::field_type))),
      "target node size too large");

  // Verify that key_compare returns an absl::{weak,strong}_ordering or bool.
  using compare_result_type =
      std::invoke_result_t<key_compare, key_type, key_type>;
  static_assert(
      std::is_same_v<compare_result_type, bool> ||
      std::is_signed_v<compare_result_type>,
      "key comparison function must return a signed value or "
      "bool.");

  // Test the assumption made in setting kNodeValueSpace.
  static_assert(node_type::MinimumOverhead() >= sizeof(void *) + 4,
                "node space assumption incorrect");

  return true;
}
+
+// Constructs an empty tree. root_ and rightmost_ point at the shared
+// EmptyNode() sentinel, so no allocation occurs until the first insert.
+template <typename P>
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
+ : root_(comp, alloc, EmptyNode()), rightmost_(EmptyNode()), size_(0) {}
+
+// Copy constructor: builds an empty tree with x's comparator/allocator,
+// then copies x's values in order (no key comparisons needed).
+template <typename P>
+btree<P>::btree(const btree &x) : btree(x.key_comp(), x.allocator()) {
+ copy_or_move_values_in_order(&x);
+}
+
+// Inserts a value constructed from `args` if `key` is not already present.
+// Returns {iterator, inserted?}; on a duplicate the iterator points at the
+// existing element and nothing is constructed in the tree.
+template <typename P>
+template <typename... Args>
+auto btree<P>::insert_unique(const key_type &key, Args &&... args)
+ -> std::pair<iterator, bool> {
+ if (empty()) {
+ mutable_root() = rightmost_ = new_leaf_root_node(1);
+ }
+
+ auto res = internal_locate(key);
+ iterator &iter = res.value;
+
+ // res.has_match is a compile-time property of the SearchResult: with a
+ // three-way comparator equality was already detected during the descent.
+ if constexpr (res.has_match) {
+ if (res.IsEq()) {
+ // The key already exists in the tree, do nothing.
+ return {iter, false};
+ }
+ } else {
+ // Boolean comparator: check the predecessor explicitly for equality.
+ iterator last = internal_last(iter);
+ if (last.node && !compare_keys(key, last.key())) {
+ // The key already exists in the tree, do nothing.
+ return {last, false};
+ }
+ }
+ return {internal_emplace(iter, std::forward<Args>(args)...), true};
+}
+
+// Hinted unique insert: if `position` (or its neighbor) brackets `key`,
+// emplace there in O(1) comparisons; otherwise fall back to a full
+// insert_unique search.
+template <typename P>
+template <typename... Args>
+inline auto btree<P>::insert_hint_unique(iterator position, const key_type &key,
+ Args &&... args)
+ -> std::pair<iterator, bool> {
+ if (!empty()) {
+ if (position == end() || compare_keys(key, position.key())) {
+ iterator prev = position;
+ if (position == begin() || compare_keys((--prev).key(), key)) {
+ // prev.key() < key < position.key()
+ return {internal_emplace(position, std::forward<Args>(args)...), true};
+ }
+ } else if (compare_keys(position.key(), key)) {
+ ++position;
+ if (position == end() || compare_keys(key, position.key())) {
+ // {original `position`}.key() < key < {current `position`}.key()
+ return {internal_emplace(position, std::forward<Args>(args)...), true};
+ }
+ } else {
+ // position.key() == key
+ return {position, false};
+ }
+ }
+ return insert_unique(key, std::forward<Args>(args)...);
+}
+
+// Inserts [b, e) with unique-key semantics. Using end() as the hint makes
+// already-sorted input insert in O(1) comparisons per element.
+template <typename P>
+template <typename InputIterator>
+void btree<P>::insert_iterator_unique(InputIterator b, InputIterator e) {
+ for (; b != e; ++b) {
+ insert_hint_unique(end(), params_type::key(*b), *b);
+ }
+}
+
+// Inserts `v` allowing duplicate keys; new duplicates land after existing
+// equal keys (insertion point is upper_bound(key)).
+template <typename P>
+template <typename ValueType>
+auto btree<P>::insert_multi(const key_type &key, ValueType&& v) -> iterator {
+ if (empty()) {
+ mutable_root() = rightmost_ = new_leaf_root_node(1);
+ }
+
+ iterator iter = internal_upper_bound(key);
+ if (iter.node == nullptr) {
+ // key is >= everything in the tree: append at the end.
+ iter = end();
+ }
+ return internal_emplace(iter, std::forward<ValueType>(v));
+}
+
+// Hinted multi insert: accepts the hint when key fits (non-strictly)
+// between the hint's neighbors, else falls back to insert_multi.
+template <typename P>
+template <typename ValueType>
+auto btree<P>::insert_hint_multi(iterator position, ValueType &&v) -> iterator {
+ if (!empty()) {
+ const key_type &key = params_type::key(v);
+ if (position == end() || !compare_keys(position.key(), key)) {
+ iterator prev = position;
+ if (position == begin() || !compare_keys(key, (--prev).key())) {
+ // prev.key() <= key <= position.key()
+ return internal_emplace(position, std::forward<ValueType>(v));
+ }
+ } else {
+ iterator next = position;
+ ++next;
+ if (next == end() || !compare_keys(next.key(), key)) {
+ // position.key() < key <= next.key()
+ return internal_emplace(next, std::forward<ValueType>(v));
+ }
+ }
+ }
+ return insert_multi(std::forward<ValueType>(v));
+}
+
+// Inserts [b, e) with duplicate keys allowed; the end() hint makes sorted
+// input cheap, arbitrary input falls back to a regular search.
+template <typename P>
+template <typename InputIterator>
+void btree<P>::insert_iterator_multi(InputIterator b, InputIterator e) {
+ for (; b != e; ++b) {
+ insert_hint_multi(end(), *b);
+ }
+}
+
+// Copy assignment: clears this tree, copies the comparator, propagates the
+// allocator if the allocator's copy-assignment trait says to, then copies
+// x's values in order.
+template <typename P>
+auto btree<P>::operator=(const btree &x) -> btree & {
+ if (this != &x) {
+ clear();
+
+ *mutable_key_comp() = x.key_comp();
+ if constexpr (std::allocator_traits<
+ allocator_type>::propagate_on_container_copy_assignment::value) {
+ *mutable_allocator() = x.allocator();
+ }
+
+ copy_or_move_values_in_order(&x);
+ }
+ return *this;
+}
+
+// Move assignment. Per the allocator-aware container requirements,
+// allocator propagation on *move* assignment is governed by
+// propagate_on_container_move_assignment (the previous code consulted the
+// copy-assignment trait, which is the wrong trait here). When the
+// allocator propagates, or the allocators compare equal, we can steal x's
+// nodes wholesale; otherwise each element must be moved individually.
+template <typename P>
+auto btree<P>::operator=(btree &&x) noexcept -> btree & {
+ if (this != &x) {
+ clear();
+
+ using std::swap;
+ if constexpr (std::allocator_traits<
+ allocator_type>::propagate_on_container_move_assignment::value) {
+ // Note: `root_` also contains the allocator and the key comparator.
+ swap(root_, x.root_);
+ swap(rightmost_, x.rightmost_);
+ swap(size_, x.size_);
+ } else {
+ if (allocator() == x.allocator()) {
+ // Equal allocators: taking over x's memory is safe even without
+ // propagation.
+ swap(mutable_root(), x.mutable_root());
+ swap(*mutable_key_comp(), *x.mutable_key_comp());
+ swap(rightmost_, x.rightmost_);
+ swap(size_, x.size_);
+ } else {
+ // We aren't allowed to propagate the allocator and the allocator is
+ // different so we can't take over its memory. We must move each element
+ // individually. We need both `x` and `this` to have `x`s key comparator
+ // while moving the values so we can't swap the key comparators.
+ *mutable_key_comp() = x.key_comp();
+ copy_or_move_values_in_order(&x);
+ }
+ }
+ }
+ return *this;
+}
+
+// Erases the element at `iter` and returns an iterator to the next
+// element (or end()). Values only ever physically leave from leaf nodes:
+// an internal-node victim is first overwritten by its in-order
+// predecessor, and that leaf slot is what gets removed.
+template <typename P>
+auto btree<P>::erase(iterator iter) -> iterator {
+ bool internal_delete = false;
+ if (!iter.node->leaf()) {
+ // Deletion of a value on an internal node. First, move the largest value
+ // from our left child here, then delete that position (in remove_value()
+ // below). We can get to the largest value from our left child by
+ // decrementing iter.
+ iterator internal_iter(iter);
+ --iter;
+ assert(iter.node->leaf());
+ params_type::move(mutable_allocator(), iter.node->slot(iter.position),
+ internal_iter.node->slot(internal_iter.position));
+ internal_delete = true;
+ }
+
+ // Delete the key from the leaf.
+ iter.node->remove_value(iter.position, mutable_allocator());
+ --size_;
+
+ // We want to return the next value after the one we just erased. If we
+ // erased from an internal node (internal_delete == true), then the next
+ // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+ // false) then the next value is ++iter. Note that ++iter may point to an
+ // internal node and the value in the internal node may move to a leaf node
+ // (iter.node) when rebalancing is performed at the leaf level.
+
+ iterator res = rebalance_after_delete(iter);
+
+ // If we erased from an internal node, advance the iterator.
+ if (internal_delete) {
+ ++res;
+ }
+ return res;
+}
+
+// Restores b-tree invariants after a removal by merging/rebalancing nodes
+// along the path from `iter` up to the root. Returns a (possibly adjusted)
+// iterator to the element that followed the erased one.
+template <typename P>
+auto btree<P>::rebalance_after_delete(iterator iter) -> iterator {
+ // Merge/rebalance as we walk back up the tree.
+ iterator res(iter);
+ bool first_iteration = true;
+ for (;;) {
+ if (iter.node == root()) {
+ try_shrink();
+ if (empty()) {
+ return end();
+ }
+ break;
+ }
+ // Nodes with at least kMinNodeValues values satisfy the invariant.
+ if (iter.node->count() >= kMinNodeValues) {
+ break;
+ }
+ bool merged = try_merge_or_rebalance(&iter);
+ // On the first iteration, we should update `res` with `iter` because `res`
+ // may have been invalidated.
+ if (first_iteration) {
+ res = iter;
+ first_iteration = false;
+ }
+ if (!merged) {
+ break;
+ }
+ // A merge removed a value from the parent; continue checking upward.
+ iter.position = iter.node->position();
+ iter.node = iter.node->parent();
+ }
+
+ // Adjust our return value. If we're pointing at the end of a node, advance
+ // the iterator.
+ if (res.position == res.node->count()) {
+ res.position = res.node->count() - 1;
+ ++res;
+ }
+
+ return res;
+}
+
+// Erases the half-open range [begin, end). Returns the number of elements
+// erased and an iterator following the last erased element. Fast paths:
+// empty range, whole tree (clear()), and a range confined to one node.
+// The signed `count` is explicitly cast before comparing against the
+// unsigned size_ (the original compared signed vs. unsigned directly,
+// which trips -Wsign-compare).
+template <typename P>
+auto btree<P>::erase(iterator begin, iterator end)
+ -> std::pair<size_type, iterator> {
+ difference_type count = std::distance(begin, end);
+ assert(count >= 0);
+
+ if (count == 0) {
+ return {0, begin};
+ }
+
+ if (static_cast<size_type>(count) == size_) {
+ clear();
+ return {count, this->end()};
+ }
+
+ if (begin.node == end.node) {
+ erase_same_node(begin, end);
+ size_ -= count;
+ return {count, rebalance_after_delete(begin)};
+ }
+
+ // General case: repeatedly erase from `begin` until the target size is
+ // reached. Leaf runs are removed in bulk; internal-node elements one at
+ // a time via erase().
+ const size_type target_size = size_ - count;
+ while (size_ > target_size) {
+ if (begin.node->leaf()) {
+ const size_type remaining_to_erase = size_ - target_size;
+ const size_type remaining_in_node = begin.node->count() - begin.position;
+ begin = erase_from_leaf_node(
+ begin, std::min(remaining_to_erase, remaining_in_node));
+ } else {
+ begin = erase(begin);
+ }
+ }
+ return {count, begin};
+}
+
+// Erases [begin, end) when both iterators live on the same node. Clears
+// the subtrees between them (internal nodes only), compacts the remaining
+// children, then removes the values. Does not adjust size_ or rebalance;
+// the caller does both.
+template <typename P>
+void btree<P>::erase_same_node(iterator begin, iterator end) {
+ assert(begin.node == end.node);
+ assert(end.position > begin.position);
+
+ node_type *node = begin.node;
+ size_type to_erase = end.position - begin.position;
+ if (!node->leaf()) {
+ // Delete all children between begin and end.
+ for (size_type i = 0; i < to_erase; ++i) {
+ internal_clear(node->child(begin.position + i + 1));
+ }
+ // Rotate children after end into new positions.
+ for (size_type i = begin.position + to_erase + 1; i <= node->count(); ++i) {
+ node->set_child(i - to_erase, node->child(i));
+ node->clear_child(i);
+ }
+ }
+ node->remove_values_ignore_children(begin.position, to_erase,
+ mutable_allocator());
+
+ // Do not need to update rightmost_, because
+ // * either end == this->end(), and therefore node == rightmost_, and still
+ // exists
+ // * or end != this->end(), and therefore rightmost_ hasn't been erased, since
+ // it wasn't covered in [begin, end)
+}
+
+// Removes `to_erase` consecutive values from the leaf node at `begin`,
+// updates size_, and rebalances. Returns an iterator after the erased run.
+template <typename P>
+auto btree<P>::erase_from_leaf_node(iterator begin, size_type to_erase)
+ -> iterator {
+ node_type *node = begin.node;
+ assert(node->leaf());
+ assert(node->count() > begin.position);
+ assert(begin.position + to_erase <= node->count());
+
+ node->remove_values_ignore_children(begin.position, to_erase,
+ mutable_allocator());
+
+ size_ -= to_erase;
+
+ return rebalance_after_delete(begin);
+}
+
+// Erases the single element matching `key`, if present. Returns the
+// number of elements erased (0 or 1).
+template <typename P>
+template <typename K>
+auto btree<P>::erase_unique(const K &key) -> size_type {
+ const iterator iter = internal_find(key);
+ if (iter.node == nullptr) {
+ // The key doesn't exist in the tree, return nothing done.
+ return 0;
+ }
+ erase(iter);
+ return 1;
+}
+
+// Erases every element equal to `key` (the [lower_bound, upper_bound)
+// range) and returns the number removed.
+template <typename P>
+template <typename K>
+auto btree<P>::erase_multi(const K &key) -> size_type {
+ const iterator begin = internal_lower_bound(key);
+ if (begin.node == nullptr) {
+ // The key doesn't exist in the tree, return nothing done.
+ return 0;
+ }
+ // Delete all of the keys between begin and upper_bound(key).
+ const iterator end = internal_end(internal_upper_bound(key));
+ return erase(begin, end).first;
+}
+
+// Destroys every element and deallocates every node, returning the tree
+// to its freshly-constructed state (root_/rightmost_ back on the
+// EmptyNode() sentinel).
+template <typename P>
+void btree<P>::clear() {
+ if (!empty()) {
+ internal_clear(root());
+ }
+ mutable_root() = EmptyNode();
+ rightmost_ = EmptyNode();
+ size_ = 0;
+}
+
+// Swaps the contents of this tree with `x` in O(1). Allocator handling
+// follows propagate_on_container_swap; swapping with unequal,
+// non-propagating allocators is undefined behavior (asserted below).
+// Uses `if constexpr`, consistent with the assignment operators — the
+// trait is a compile-time constant, so the original runtime `if` only
+// obscured that and relied on dead-branch elimination.
+template <typename P>
+void btree<P>::swap(btree &x) {
+ using std::swap;
+ if constexpr (std::allocator_traits<
+ allocator_type>::propagate_on_container_swap::value) {
+ // Note: `root_` also contains the allocator and the key comparator.
+ swap(root_, x.root_);
+ } else {
+ // It's undefined behavior if the allocators are unequal here.
+ assert(allocator() == x.allocator());
+ swap(mutable_root(), x.mutable_root());
+ swap(*mutable_key_comp(), *x.mutable_key_comp());
+ }
+ swap(rightmost_, x.rightmost_);
+ swap(size_, x.size_);
+}
+
+// Debug-only consistency check: walks the whole tree (internal_verify)
+// and validates size_, leftmost/rightmost tracking, and that the extreme
+// nodes are leaves. All checks are asserts; a no-op in NDEBUG builds.
+template <typename P>
+void btree<P>::verify() const {
+ assert(root() != nullptr);
+ assert(leftmost() != nullptr);
+ assert(rightmost_ != nullptr);
+ assert(empty() || size() == internal_verify(root(), nullptr, nullptr));
+ assert(leftmost() == (++const_iterator(root(), -1)).node);
+ assert(rightmost_ == (--const_iterator(root(), root()->count())).node);
+ assert(leftmost()->leaf());
+ assert(rightmost_->leaf());
+}
+
+// Makes room for an insertion at *iter when its node is full: first tries
+// to shift values to a non-full left or right sibling, and only if that
+// fails splits the node (recursing into the parent if the parent is also
+// full, growing the tree at the root if necessary). On return, iter is
+// updated to the node/position where the new value should be emplaced.
+template <typename P>
+void btree<P>::rebalance_or_split(iterator *iter) {
+ node_type *&node = iter->node;
+ int &insert_position = iter->position;
+ assert(node->count() == node->max_count());
+ assert(kNodeValues == node->max_count());
+
+ // First try to make room on the node by rebalancing.
+ node_type *parent = node->parent();
+ if (node != root()) {
+ if (node->position() > 0) {
+ // Try rebalancing with our left sibling.
+ node_type *left = parent->child(node->position() - 1);
+ assert(left->max_count() == kNodeValues);
+ if (left->count() < kNodeValues) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the end of the right node then we bias rebalancing to
+ // fill up the left node.
+ int to_move = (kNodeValues - left->count()) /
+ (1 + (insert_position < kNodeValues));
+ to_move = std::max(1, to_move);
+
+ // Only rebalance if the insertion point stays reachable on one of
+ // the two nodes afterwards.
+ if (((insert_position - to_move) >= 0) ||
+ ((left->count() + to_move) < kNodeValues)) {
+ left->rebalance_right_to_left(to_move, node, mutable_allocator());
+
+ assert(node->max_count() - node->count() == to_move);
+ insert_position = insert_position - to_move;
+ if (insert_position < 0) {
+ // The insertion point migrated into the left sibling.
+ insert_position = insert_position + left->count() + 1;
+ node = left;
+ }
+
+ assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ if (node->position() < parent->count()) {
+ // Try rebalancing with our right sibling.
+ node_type *right = parent->child(node->position() + 1);
+ assert(right->max_count() == kNodeValues);
+ if (right->count() < kNodeValues) {
+ // We bias rebalancing based on the position being inserted. If we're
+ // inserting at the beginning of the left node then we bias rebalancing
+ // to fill up the right node.
+ int to_move =
+ (kNodeValues - right->count()) / (1 + (insert_position > 0));
+ to_move = (std::max)(1, to_move);
+
+ if ((insert_position <= (node->count() - to_move)) ||
+ ((right->count() + to_move) < kNodeValues)) {
+ node->rebalance_left_to_right(to_move, right, mutable_allocator());
+
+ if (insert_position > node->count()) {
+ // The insertion point migrated into the right sibling.
+ insert_position = insert_position - node->count() - 1;
+ node = right;
+ }
+
+ assert(node->count() < node->max_count());
+ return;
+ }
+ }
+ }
+
+ // Rebalancing failed, make sure there is room on the parent node for a new
+ // value.
+ assert(parent->max_count() == kNodeValues);
+ if (parent->count() == kNodeValues) {
+ iterator parent_iter(node->parent(), node->position());
+ rebalance_or_split(&parent_iter);
+ }
+ } else {
+ // Rebalancing not possible because this is the root node.
+ // Create a new root node and set the current root node as the child of the
+ // new root.
+ parent = new_internal_node(parent);
+ parent->init_child(0, root());
+ mutable_root() = parent;
+ // If the former root was a leaf node, then it's now the rightmost node.
+ assert(!parent->child(0)->leaf() || parent->child(0) == rightmost_);
+ }
+
+ // Split the node.
+ node_type *split_node;
+ if (node->leaf()) {
+ split_node = new_leaf_node(parent);
+ node->split(insert_position, split_node, mutable_allocator());
+ if (rightmost_ == node) rightmost_ = split_node;
+ } else {
+ split_node = new_internal_node(parent);
+ node->split(insert_position, split_node, mutable_allocator());
+ }
+
+ if (insert_position > node->count()) {
+ // The insertion point falls in the newly created split node.
+ insert_position = insert_position - node->count() - 1;
+ node = split_node;
+ }
+}
+
+// Merges `right` (and the separating parent value) into `left`, then
+// frees `right`, updating rightmost_ if it pointed at the freed leaf.
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {
+ left->merge(right, mutable_allocator());
+ if (right->leaf()) {
+ if (rightmost_ == right) rightmost_ = left;
+ delete_leaf_node(right);
+ } else {
+ delete_internal_node(right);
+ }
+}
+
+// After a deletion left iter->node under-full, tries to merge it with a
+// sibling (returns true; caller continues rebalancing upward because the
+// parent lost a value) or to borrow values from a sibling (returns false;
+// the tree is valid again). *iter is kept pointing at the same logical
+// element throughout.
+template <typename P>
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {
+ node_type *parent = iter->node->parent();
+ if (iter->node->position() > 0) {
+ // Try merging with our left sibling.
+ node_type *left = parent->child(iter->node->position() - 1);
+ assert(left->max_count() == kNodeValues);
+ if ((1 + left->count() + iter->node->count()) <= kNodeValues) {
+ iter->position += 1 + left->count();
+ merge_nodes(left, iter->node);
+ iter->node = left;
+ return true;
+ }
+ }
+ if (iter->node->position() < parent->count()) {
+ // Try merging with our right sibling.
+ node_type *right = parent->child(iter->node->position() + 1);
+ assert(right->max_count() == kNodeValues);
+ if ((1 + iter->node->count() + right->count()) <= kNodeValues) {
+ merge_nodes(iter->node, right);
+ return true;
+ }
+ // Try rebalancing with our right sibling. We don't perform rebalancing if
+ // we deleted the first element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the front of the tree.
+ if ((right->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position > 0))) {
+ int to_move = (right->count() - iter->node->count()) / 2;
+ to_move = std::min(to_move, right->count() - 1);
+ iter->node->rebalance_right_to_left(to_move, right, mutable_allocator());
+ return false;
+ }
+ }
+ if (iter->node->position() > 0) {
+ // Try rebalancing with our left sibling. We don't perform rebalancing if
+ // we deleted the last element from iter->node and the node is not
+ // empty. This is a small optimization for the common pattern of deleting
+ // from the back of the tree.
+ node_type *left = parent->child(iter->node->position() - 1);
+ if ((left->count() > kMinNodeValues) &&
+ ((iter->node->count() == 0) ||
+ (iter->position < iter->node->count()))) {
+ int to_move = (left->count() - iter->node->count()) / 2;
+ to_move = std::min(to_move, left->count() - 1);
+ left->rebalance_left_to_right(to_move, iter->node, mutable_allocator());
+ iter->position += to_move;
+ return false;
+ }
+ }
+ return false;
+}
+
+// Reduces the height of the tree when the root has been emptied: either
+// the tree becomes empty (root back to the EmptyNode() sentinel) or the
+// root's sole child becomes the new root.
+template <typename P>
+void btree<P>::try_shrink() {
+ if (root()->count() > 0) {
+ return;
+ }
+ // Deleted the last item on the root node, shrink the height of the tree.
+ if (root()->leaf()) {
+ assert(size() == 0);
+ delete_leaf_node(root());
+ mutable_root() = EmptyNode();
+ rightmost_ = EmptyNode();
+ } else {
+ node_type *child = root()->child(0);
+ child->make_root();
+ delete_internal_node(root());
+ mutable_root() = child;
+ }
+}
+
+// Normalizes an iterator left sitting one-past-the-last slot of a node by
+// walking up to the nearest ancestor position that holds a value. If no
+// such ancestor exists (the key was past the maximum), iter.node becomes
+// nullptr, which callers use as a "not found / end" signal.
+template <typename P>
+template <typename IterType>
+inline IterType btree<P>::internal_last(IterType iter) {
+ assert(iter.node != nullptr);
+ while (iter.position == iter.node->count()) {
+ iter.position = iter.node->position();
+ iter.node = iter.node->parent();
+ // Walking up from the root lands on the leaf-flagged root-parent
+ // sentinel, which terminates the search.
+ if (iter.node->leaf()) {
+ iter.node = nullptr;
+ break;
+ }
+ }
+ return iter;
+}
+
+// Constructs a value from `args` at position `iter`, splitting/growing
+// nodes as needed, and returns an iterator to the new element. `iter` may
+// point at an internal node; the insertion is redirected to the
+// equivalent leaf position.
+template <typename P>
+template <typename... Args>
+inline auto btree<P>::internal_emplace(iterator iter, Args &&... args)
+ -> iterator {
+ if (!iter.node->leaf()) {
+ // We can't insert on an internal node. Instead, we'll insert after the
+ // previous value which is guaranteed to be on a leaf node.
+ --iter;
+ ++iter.position;
+ }
+ const int max_count = iter.node->max_count();
+ if (iter.node->count() == max_count) {
+ // Make room in the leaf for the new item.
+ if (max_count < kNodeValues) {
+ // Insertion into the root where the root is smaller than the full node
+ // size. Simply grow the size of the root node.
+ assert(iter.node == root());
+ iter.node =
+ new_leaf_root_node(std::min(kNodeValues, 2 * max_count));
+ // Move the existing values into the bigger root, then retire the
+ // old one.
+ iter.node->swap(root(), mutable_allocator());
+ delete_leaf_node(root());
+ mutable_root() = iter.node;
+ rightmost_ = iter.node;
+ } else {
+ rebalance_or_split(&iter);
+ }
+ }
+ iter.node->emplace_value(iter.position, mutable_allocator(),
+ std::forward<Args>(args)...);
+ ++size_;
+ return iter;
+}
+
+// Descends from the root to the position for `key`. Tag-dispatches on
+// whether the comparator is three-way (can report equality during the
+// descent) or boolean (caller must check equality afterwards).
+template <typename P>
+template <typename K>
+inline auto btree<P>::internal_locate(const K &key) const
+ -> SearchResult<iterator, is_key_compare_to::value> {
+ return internal_locate_impl(key, is_key_compare_to());
+}
+
+// Locate with a boolean (less-than) comparator: binary-search each node's
+// lower bound and descend to a leaf; equality cannot be detected here.
+template <typename P>
+template <typename K>
+inline auto btree<P>::internal_locate_impl(
+ const K &key, std::false_type /* IsCompareTo */) const
+ -> SearchResult<iterator, false> {
+ iterator iter(const_cast<node_type *>(root()), 0);
+ for (;;) {
+ iter.position = iter.node->lower_bound(key, key_comp()).value;
+ // NOTE: we don't need to walk all the way down the tree if the keys are
+ // equal, but determining equality would require doing an extra comparison
+ // on each node on the way down, and we will need to go all the way to the
+ // leaf node in the expected case.
+ if (iter.node->leaf()) {
+ break;
+ }
+ iter.node = iter.node->child(iter.position);
+ }
+ return {iter};
+}
+
+// Locate with a three-way comparator: the per-node lower_bound already
+// reports equality, so the descent can stop early with MatchKind::kEq.
+template <typename P>
+template <typename K>
+inline auto btree<P>::internal_locate_impl(
+ const K &key, std::true_type /* IsCompareTo */) const
+ -> SearchResult<iterator, true> {
+ iterator iter(const_cast<node_type *>(root()), 0);
+ for (;;) {
+ SearchResult<int, true> res = iter.node->lower_bound(key, key_comp());
+ iter.position = res.value;
+ if (res.match == MatchKind::kEq) {
+ return {iter, MatchKind::kEq};
+ }
+ if (iter.node->leaf()) {
+ break;
+ }
+ iter.node = iter.node->child(iter.position);
+ }
+ return {iter, MatchKind::kNe};
+}
+
+// Returns an iterator to the first element not less than `key`
+// (iter.node == nullptr when every element is less — see internal_last).
+template <typename P>
+template <typename K>
+auto btree<P>::internal_lower_bound(const K &key) const -> iterator {
+ iterator iter(const_cast<node_type *>(root()), 0);
+ for (;;) {
+ iter.position = iter.node->lower_bound(key, key_comp()).value;
+ if (iter.node->leaf()) {
+ break;
+ }
+ iter.node = iter.node->child(iter.position);
+ }
+ // Normalize a past-the-node position up to a valued ancestor slot.
+ return internal_last(iter);
+}
+
+// Returns an iterator to the first element greater than `key`
+// (iter.node == nullptr when no such element exists).
+template <typename P>
+template <typename K>
+auto btree<P>::internal_upper_bound(const K &key) const -> iterator {
+ iterator iter(const_cast<node_type *>(root()), 0);
+ for (;;) {
+ iter.position = iter.node->upper_bound(key, key_comp());
+ if (iter.node->leaf()) {
+ break;
+ }
+ iter.node = iter.node->child(iter.position);
+ }
+ // Normalize a past-the-node position up to a valued ancestor slot.
+ return internal_last(iter);
+}
+
+// Exact-match lookup: returns an iterator to an element equal to `key`,
+// or {nullptr, 0} when absent. With a three-way comparator equality comes
+// straight from internal_locate; with a boolean comparator it is derived
+// by testing the predecessor.
+template <typename P>
+template <typename K>
+auto btree<P>::internal_find(const K &key) const -> iterator {
+ auto res = internal_locate(key);
+ if constexpr (res.has_match) {
+ if (res.IsEq()) {
+ return res.value;
+ }
+ } else {
+ const iterator iter = internal_last(res.value);
+ if (iter.node != nullptr && !compare_keys(key, iter.key())) {
+ return iter;
+ }
+ }
+ return {nullptr, 0};
+}
+
+// Recursively destroys and deallocates the subtree rooted at `node`
+// (post-order: children first, then the node itself).
+template <typename P>
+void btree<P>::internal_clear(node_type *node) {
+ if (!node->leaf()) {
+ // An internal node with count() values has count()+1 children.
+ for (int i = 0; i <= node->count(); ++i) {
+ internal_clear(node->child(i));
+ }
+ delete_internal_node(node);
+ } else {
+ delete_leaf_node(node);
+ }
+}
+
+// Recursively asserts the invariants of the subtree rooted at `node`:
+// occupancy bounds, keys within (lo, hi], intra-node ordering, and
+// correct parent/position back-links on children. Returns the number of
+// values in the subtree so the caller can check it against size().
+template <typename P>
+int btree<P>::internal_verify(
+ const node_type *node, const key_type *lo, const key_type *hi) const {
+ assert(node->count() > 0);
+ assert(node->count() <= node->max_count());
+ if (lo) {
+ assert(!compare_keys(node->key(0), *lo));
+ }
+ if (hi) {
+ assert(!compare_keys(*hi, node->key(node->count() - 1)));
+ }
+ // Keys within the node must be in non-decreasing order.
+ for (int i = 1; i < node->count(); ++i) {
+ assert(!compare_keys(node->key(i), node->key(i - 1)));
+ }
+ int count = node->count();
+ if (!node->leaf()) {
+ for (int i = 0; i <= node->count(); ++i) {
+ assert(node->child(i) != nullptr);
+ assert(node->child(i)->parent() == node);
+ assert(node->child(i)->position() == i);
+ // Child i's keys are bounded by the separating keys i-1 and i.
+ count += internal_verify(
+ node->child(i),
+ (i == 0) ? lo : &node->key(i - 1),
+ (i == node->count()) ? hi : &node->key(i));
+ }
+ }
+ return count;
+}
+
+} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 000000000..e8d9efd38
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,526 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree::internal {
+
+// A common base class for btree_set, btree_map, btree_multiset, and
+// btree_multimap.
+// Thin wrapper around a btree `Tree` that exposes the standard
+// associative-container interface; btree_{set,map,multiset,multimap}
+// derive from it. All operations forward to the underlying tree_.
+template <typename Tree>
+class btree_container {
+ using params_type = typename Tree::params_type;
+
+ protected:
+ // Alias used for heterogeneous lookup functions.
+ // `key_arg<K>` evaluates to `K` when the functors are transparent and to
+ // `key_type` otherwise. It permits template argument deduction on `K` for the
+ // transparent case.
+ template <class Compare>
+ using is_transparent_t = typename Compare::is_transparent;
+ template <class K>
+ using key_arg =
+ std::conditional_t<
+ std::experimental::is_detected_v<is_transparent_t, typename Tree::key_compare>,
+ K,
+ typename Tree::key_type>;
+
+ public:
+ // Standard container typedefs, all forwarded from the tree.
+ using key_type = typename Tree::key_type;
+ using value_type = typename Tree::value_type;
+ using size_type = typename Tree::size_type;
+ using difference_type = typename Tree::difference_type;
+ using key_compare = typename Tree::key_compare;
+ using value_compare = typename Tree::value_compare;
+ using allocator_type = typename Tree::allocator_type;
+ using reference = typename Tree::reference;
+ using const_reference = typename Tree::const_reference;
+ using pointer = typename Tree::pointer;
+ using const_pointer = typename Tree::const_pointer;
+ using iterator = typename Tree::iterator;
+ using const_iterator = typename Tree::const_iterator;
+ using reverse_iterator = typename Tree::reverse_iterator;
+ using const_reverse_iterator = typename Tree::const_reverse_iterator;
+
+ // Constructors/assignments.
+ btree_container() : tree_(key_compare(), allocator_type()) {}
+ explicit btree_container(const key_compare &comp,
+ const allocator_type &alloc = allocator_type())
+ : tree_(comp, alloc) {}
+ btree_container(const btree_container &x) = default;
+ btree_container(btree_container &&x) noexcept = default;
+ btree_container &operator=(const btree_container &x) = default;
+ btree_container &operator=(btree_container &&x) noexcept(
+ std::is_nothrow_move_assignable<Tree>::value) = default;
+
+ // Iterator routines.
+ iterator begin() { return tree_.begin(); }
+ const_iterator begin() const { return tree_.begin(); }
+ const_iterator cbegin() const { return tree_.begin(); }
+ iterator end() { return tree_.end(); }
+ const_iterator end() const { return tree_.end(); }
+ const_iterator cend() const { return tree_.end(); }
+ reverse_iterator rbegin() { return tree_.rbegin(); }
+ const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+ const_reverse_iterator crbegin() const { return tree_.rbegin(); }
+ reverse_iterator rend() { return tree_.rend(); }
+ const_reverse_iterator rend() const { return tree_.rend(); }
+ const_reverse_iterator crend() const { return tree_.rend(); }
+
+ // Lookup routines. The `key_arg<K>` parameter enables heterogeneous
+ // lookup (no temporary key_type) when the comparator is transparent.
+ template <typename K = key_type>
+ iterator find(const key_arg<K> &key) {
+ return tree_.find(key);
+ }
+ template <typename K = key_type>
+ const_iterator find(const key_arg<K> &key) const {
+ return tree_.find(key);
+ }
+ template <typename K = key_type>
+ bool contains(const key_arg<K> &key) const {
+ return find(key) != end();
+ }
+ template <typename K = key_type>
+ iterator lower_bound(const key_arg<K> &key) {
+ return tree_.lower_bound(key);
+ }
+ template <typename K = key_type>
+ const_iterator lower_bound(const key_arg<K> &key) const {
+ return tree_.lower_bound(key);
+ }
+ template <typename K = key_type>
+ iterator upper_bound(const key_arg<K> &key) {
+ return tree_.upper_bound(key);
+ }
+ template <typename K = key_type>
+ const_iterator upper_bound(const key_arg<K> &key) const {
+ return tree_.upper_bound(key);
+ }
+ template <typename K = key_type>
+ std::pair<iterator, iterator> equal_range(const key_arg<K> &key) {
+ return tree_.equal_range(key);
+ }
+ template <typename K = key_type>
+ std::pair<const_iterator, const_iterator> equal_range(
+ const key_arg<K> &key) const {
+ return tree_.equal_range(key);
+ }
+
+ // Deletion routines. Note that there is also a deletion routine that is
+ // specific to btree_set_container/btree_multiset_container.
+
+ // Erase the specified iterator from the btree. The iterator must be valid
+ // (i.e. not equal to end()). Return an iterator pointing to the node after
+ // the one that was erased (or end() if none exists).
+ iterator erase(const_iterator iter) { return tree_.erase(iterator(iter)); }
+ iterator erase(iterator iter) { return tree_.erase(iter); }
+ iterator erase(const_iterator first, const_iterator last) {
+ return tree_.erase(iterator(first), iterator(last)).second;
+ }
+
+ public:
+ // Utility routines.
+ void clear() { tree_.clear(); }
+ void swap(btree_container &x) { tree_.swap(x.tree_); }
+ void verify() const { tree_.verify(); }
+
+ // Size routines.
+ size_type size() const { return tree_.size(); }
+ size_type max_size() const { return tree_.max_size(); }
+ bool empty() const { return tree_.empty(); }
+
+ // Relational operators compare element sequences (size + elementwise
+ // equality, or lexicographic order), mirroring std::map/std::set.
+ friend bool operator==(const btree_container &x, const btree_container &y) {
+ if (x.size() != y.size()) return false;
+ return std::equal(x.begin(), x.end(), y.begin());
+ }
+
+ friend bool operator!=(const btree_container &x, const btree_container &y) {
+ return !(x == y);
+ }
+
+ friend bool operator<(const btree_container &x, const btree_container &y) {
+ return std::lexicographical_compare(x.begin(), x.end(), y.begin(), y.end());
+ }
+
+ friend bool operator>(const btree_container &x, const btree_container &y) {
+ return y < x;
+ }
+
+ friend bool operator<=(const btree_container &x, const btree_container &y) {
+ return !(y < x);
+ }
+
+ friend bool operator>=(const btree_container &x, const btree_container &y) {
+ return !(x < y);
+ }
+
+ // The allocator used by the btree.
+ allocator_type get_allocator() const { return tree_.get_allocator(); }
+
+ // The key comparator used by the btree.
+ key_compare key_comp() const { return tree_.key_comp(); }
+ value_compare value_comp() const { return tree_.value_comp(); }
+
+ protected:
+ Tree tree_;
+};
+
+// A common base class for btree_set and btree_map.
+template <typename Tree>
+class btree_set_container : public btree_container<Tree> {
+ using super_type = btree_container<Tree>;
+ using params_type = typename Tree::params_type;
+ using init_type = typename params_type::init_type;
+ using is_key_compare_to = typename params_type::is_key_compare_to;
+ friend class BtreeNodePeer;
+
+ protected:
+ template <class K>
+ using key_arg = typename super_type::template key_arg<K>;
+
+ public:
+ using key_type = typename Tree::key_type;
+ using value_type = typename Tree::value_type;
+ using size_type = typename Tree::size_type;
+ using key_compare = typename Tree::key_compare;
+ using allocator_type = typename Tree::allocator_type;
+ using iterator = typename Tree::iterator;
+ using const_iterator = typename Tree::const_iterator;
+
+ // Inherit constructors.
+ using super_type::super_type;
+ btree_set_container() {}
+
+ // Range constructor.
+ template <class InputIterator>
+ btree_set_container(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ insert(b, e);
+ }
+
+ // Initializer list constructor.
+ btree_set_container(std::initializer_list<init_type> init,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : btree_set_container(init.begin(), init.end(), comp, alloc) {}
+
+ // Lookup routines.
+ template <typename K = key_type>
+ size_type count(const key_arg<K> &key) const {
+ return this->tree_.count_unique(key);
+ }
+
+ // Insertion routines.
+ std::pair<iterator, bool> insert(const value_type &x) {
+ return this->tree_.insert_unique(params_type::key(x), x);
+ }
+ std::pair<iterator, bool> insert(value_type &&x) {
+ return this->tree_.insert_unique(params_type::key(x), std::move(x));
+ }
+ template <typename... Args>
+ std::pair<iterator, bool> emplace(Args &&... args) {
+ init_type v(std::forward<Args>(args)...);
+ return this->tree_.insert_unique(params_type::key(v), std::move(v));
+ }
+  // Hinted copy-insert: `position` is a non-binding suggestion for where to
+  // begin the search. The success flag from the tree is discarded (.first).
+  iterator insert(const_iterator position, const value_type &x) {
+    return this->tree_
+        .insert_hint_unique(iterator(position), params_type::key(x), x)
+        .first;
+  }
+  // Hinted move-insert; same hint semantics as the copy overload, with the
+  // key read before `x` is moved into the tree.
+  iterator insert(const_iterator position, value_type &&x) {
+    return this->tree_
+        .insert_hint_unique(iterator(position), params_type::key(x),
+                            std::move(x))
+        .first;
+  }
+  // Hinted emplace: builds the `init_type` first, then performs a hinted
+  // unique insertion; the temporary is discarded if the key already exists.
+  template <typename... Args>
+  iterator emplace_hint(const_iterator position, Args &&... args) {
+    init_type v(std::forward<Args>(args)...);
+    return this->tree_
+        .insert_hint_unique(iterator(position), params_type::key(v),
+                            std::move(v))
+        .first;
+  }
+  // Inserts the range [b, e), one unique insertion per element.
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_iterator_unique(b, e);
+  }
+  // Inserts every element of the initializer list via unique insertion.
+  void insert(std::initializer_list<init_type> init) {
+    this->tree_.insert_iterator_unique(init.begin(), init.end());
+  }
+ // Deletion routines.
+  // Erases the element matching `key`, if any; returns the number removed
+  // (0 or 1 for this unique-key container).
+  template <typename K = key_type>
+  size_type erase(const key_arg<K> &key) {
+    return this->tree_.erase_unique(key);
+  }
+ using super_type::erase;
+
+ // Merge routines.
+ // Moves elements from `src` into `this`. If the element already exists in
+ // `this`, it is left unmodified in `src`.
+  // Participates in overload resolution only when `src` has the same value
+  // type, allocator, and map/set category as `this` (so a set may merge with
+  // a multiset of the same value type, and likewise for maps).
+  template <
+      typename T,
+      typename std::enable_if_t<
+          std::conjunction_v<
+              std::is_same<value_type, typename T::value_type>,
+              std::is_same<allocator_type, typename T::allocator_type>,
+              std::is_same<typename params_type::is_map_container,
+                           typename T::params_type::is_map_container>>,
+          int> = 0>
+  void merge(btree_container<T> &src) { // NOLINT
+    for (auto src_it = src.begin(); src_it != src.end();) {
+      // Move the element in; on success remove it from `src`, otherwise the
+      // key already existed here and the element stays in `src`.
+      if (insert(std::move(*src_it)).second) {
+        src_it = src.erase(src_it);
+      } else {
+        ++src_it;
+      }
+    }
+  }
+
+  // Rvalue overload: forwards to the lvalue merge above; elements whose keys
+  // collide are left behind in the expiring `src`.
+  template <
+      typename T,
+      typename std::enable_if_t<
+          std::conjunction_v<
+              std::is_same<value_type, typename T::value_type>,
+              std::is_same<allocator_type, typename T::allocator_type>,
+              std::is_same<typename params_type::is_map_container,
+                           typename T::params_type::is_map_container>>,
+          int> = 0>
+  void merge(btree_container<T> &&src) {
+    merge(src);
+  }
+};
+
+// Base class for btree_map: layers map-specific operations (try_emplace,
+// operator[], at) on top of the unique-key set container.
+template <typename Tree>
+class btree_map_container : public btree_set_container<Tree> {
+  using super_type = btree_set_container<Tree>;
+  using params_type = typename Tree::params_type;
+
+ protected:
+  template <class K>
+  using key_arg = typename super_type::template key_arg<K>;
+
+ public:
+  using key_type = typename Tree::key_type;
+  using mapped_type = typename params_type::mapped_type;
+  using value_type = typename Tree::value_type;
+  using key_compare = typename Tree::key_compare;
+  using allocator_type = typename Tree::allocator_type;
+  using iterator = typename Tree::iterator;
+  using const_iterator = typename Tree::const_iterator;
+
+  // Inherit constructors.
+  using super_type::super_type;
+  btree_map_container() {}
+
+  // Insertion routines.
+  // try_emplace: forwards `k` and `args` as piecewise-construction tuples so
+  // the tree can build the pair in place when `k` is not already present; if
+  // the key exists, the map is left unchanged.
+  template <typename... Args>
+  std::pair<iterator, bool> try_emplace(const key_type &k, Args &&... args) {
+    return this->tree_.insert_unique(
+        k, std::piecewise_construct, std::forward_as_tuple(k),
+        std::forward_as_tuple(std::forward<Args>(args)...));
+  }
+  template <typename... Args>
+  std::pair<iterator, bool> try_emplace(key_type &&k, Args &&... args) {
+    // Note: `key_ref` exists to avoid a ClangTidy warning about moving from `k`
+    // and then using `k` unsequenced. This is safe because the move is into a
+    // forwarding reference and insert_unique guarantees that `key` is never
+    // referenced after consuming `args`.
+    const key_type& key_ref = k;
+    return this->tree_.insert_unique(
+        key_ref, std::piecewise_construct, std::forward_as_tuple(std::move(k)),
+        std::forward_as_tuple(std::forward<Args>(args)...));
+  }
+  // Hinted variants: `hint` is a non-binding suggestion for where to begin
+  // the insertion search; the success flag is dropped (.first).
+  template <typename... Args>
+  iterator try_emplace(const_iterator hint, const key_type &k,
+                       Args &&... args) {
+    return this->tree_
+        .insert_hint_unique(iterator(hint), k, std::piecewise_construct,
+                            std::forward_as_tuple(k),
+                            std::forward_as_tuple(std::forward<Args>(args)...))
+        .first;
+  }
+  template <typename... Args>
+  iterator try_emplace(const_iterator hint, key_type &&k, Args &&... args) {
+    // Note: `key_ref` exists to avoid a ClangTidy warning about moving from `k`
+    // and then using `k` unsequenced. This is safe because the move is into a
+    // forwarding reference and insert_hint_unique guarantees that `key` is
+    // never referenced after consuming `args`.
+    const key_type& key_ref = k;
+    return this->tree_
+        .insert_hint_unique(iterator(hint), key_ref, std::piecewise_construct,
+                            std::forward_as_tuple(std::move(k)),
+                            std::forward_as_tuple(std::forward<Args>(args)...))
+        .first;
+  }
+  // Returns a reference to the value mapped to `k`, inserting a
+  // value-initialized entry via try_emplace if the key is absent.
+  mapped_type &operator[](const key_type &k) {
+    return try_emplace(k).first->second;
+  }
+  mapped_type &operator[](key_type &&k) {
+    return try_emplace(std::move(k)).first->second;
+  }
+
+  // Bounds-checked access: throws std::out_of_range if `key` is not found.
+  template <typename K = key_type>
+  mapped_type &at(const key_arg<K> &key) {
+    auto it = this->find(key);
+    if (it == this->end())
+      throw std::out_of_range("btree_map::at");
+    return it->second;
+  }
+  template <typename K = key_type>
+  const mapped_type &at(const key_arg<K> &key) const {
+    auto it = this->find(key);
+    if (it == this->end())
+      throw std::out_of_range("btree_map::at");
+    return it->second;
+  }
+};
+
+// A common base class for btree_multiset and btree_multimap.
+template <typename Tree>
+class btree_multiset_container : public btree_container<Tree> {
+ using super_type = btree_container<Tree>;
+ using params_type = typename Tree::params_type;
+ using init_type = typename params_type::init_type;
+ using is_key_compare_to = typename params_type::is_key_compare_to;
+
+ template <class K>
+ using key_arg = typename super_type::template key_arg<K>;
+
+ public:
+ using key_type = typename Tree::key_type;
+ using value_type = typename Tree::value_type;
+ using size_type = typename Tree::size_type;
+ using key_compare = typename Tree::key_compare;
+ using allocator_type = typename Tree::allocator_type;
+ using iterator = typename Tree::iterator;
+ using const_iterator = typename Tree::const_iterator;
+
+ // Inherit constructors.
+ using super_type::super_type;
+ btree_multiset_container() {}
+
+ // Range constructor.
+ template <class InputIterator>
+ btree_multiset_container(InputIterator b, InputIterator e,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : super_type(comp, alloc) {
+ insert(b, e);
+ }
+
+ // Initializer list constructor.
+ btree_multiset_container(std::initializer_list<init_type> init,
+ const key_compare &comp = key_compare(),
+ const allocator_type &alloc = allocator_type())
+ : btree_multiset_container(init.begin(), init.end(), comp, alloc) {}
+
+ // Lookup routines.
+ template <typename K = key_type>
+ size_type count(const key_arg<K> &key) const {
+ return this->tree_.count_multi(key);
+ }
+
+ // Insertion routines.
+ iterator insert(const value_type &x) { return this->tree_.insert_multi(x); }
+ iterator insert(value_type &&x) {
+ return this->tree_.insert_multi(std::move(x));
+ }
+ iterator insert(const_iterator position, const value_type &x) {
+ return this->tree_.insert_hint_multi(iterator(position), x);
+ }
+ iterator insert(const_iterator position, value_type &&x) {
+ return this->tree_.insert_hint_multi(iterator(position), std::move(x));
+ }
+ template <typename InputIterator>
+ void insert(InputIterator b, InputIterator e) {
+ this->tree_.insert_iterator_multi(b, e);
+ }
+ void insert(std::initializer_list<init_type> init) {
+ this->tree_.insert_iterator_multi(init.begin(), init.end());
+ }
+ template <typename... Args>
+ iterator emplace(Args &&... args) {
+ return this->tree_.insert_multi(init_type(std::forward<Args>(args)...));
+ }
+ template <typename... Args>
+ iterator emplace_hint(const_iterator position, Args &&... args) {
+ return this->tree_.insert_hint_multi(
+ iterator(position), init_type(std::forward<Args>(args)...));
+ }
+
+ // Deletion routines.
+ template <typename K = key_type>
+ size_type erase(const key_arg<K> &key) {
+ return this->tree_.erase_multi(key);
+ }
+ using super_type::erase;
+
+ // Merge routines.
+ // Moves all elements from `src` into `this`.
+ template <
+ typename T,
+ typename std::enable_if_t<
+ std::conjunction_v<
+ std::is_same<value_type, typename T::value_type>,
+ std::is_same<allocator_type, typename T::allocator_type>,
+ std::is_same<typename params_type::is_map_container,
+ typename T::params_type::is_map_container>>,
+ int> = 0>
+ void merge(btree_container<T> &src) { // NOLINT
+ insert(std::make_move_iterator(src.begin()),
+ std::make_move_iterator(src.end()));
+ src.clear();
+ }
+
+ template <
+ typename T,
+ typename std::enable_if_t<
+ std::conjunction_v<
+ std::is_same<value_type, typename T::value_type>,
+ std::is_same<allocator_type, typename T::allocator_type>,
+ std::is_same<typename params_type::is_map_container,
+ typename T::params_type::is_map_container>>,
+ int> = 0>
+ void merge(btree_container<T> &&src) {
+ merge(src);
+ }
+};
+
+// A base class for btree_multimap.
+template <typename Tree>
+class btree_multimap_container : public btree_multiset_container<Tree> {
+  using super_type = btree_multiset_container<Tree>;
+  using params_type = typename Tree::params_type;
+
+ public:
+  // Exposes the standard `mapped_type` member; all behavior is inherited
+  // from the multiset base.
+  using mapped_type = typename params_type::mapped_type;
+
+  // Inherit constructors.
+  using super_type::super_type;
+  btree_multimap_container() {}
+};
+} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h
new file mode 100644
index 000000000..749c2bbcd
--- /dev/null
+++ b/src/include/cpp-btree/btree_map.h
@@ -0,0 +1,159 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: btree_map.h
+// -----------------------------------------------------------------------------
+//
+// This header file defines B-tree maps: sorted associative containers mapping
+// keys to values.
+//
+// * `btree::btree_map<>`
+// * `btree::btree_multimap<>`
+//
+// These B-tree types are similar to the corresponding types in the STL
+// (`std::map` and `std::multimap`) and generally conform to the STL interfaces
+// of those types. However, because they are implemented using B-trees, they
+// are more efficient in most situations.
+//
+// Unlike `std::map` and `std::multimap`, which are commonly implemented using
+// red-black tree nodes, B-tree maps use more generic B-tree nodes able to hold
+// multiple values per node. Holding multiple values per node often makes
+// B-tree maps perform better than their `std::map` counterparts, because
+// multiple entries can be checked within the same cache hit.
+//
+// However, these types should not be considered drop-in replacements for
+// `std::map` and `std::multimap` as there are some API differences, which are
+// noted in this header file.
+//
+// Importantly, insertions and deletions may invalidate outstanding iterators,
+// pointers, and references to elements. Such invalidations are typically only
+// an issue if insertion and deletion operations are interleaved with the use of
+// more than one iterator, pointer, or reference simultaneously. For this
+// reason, `insert()` and `erase()` return a valid iterator at the current
+// position.
+
+#pragma once
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree::btree_map<>
+//
+// A `btree::btree_map<K, V>` is an ordered associative container of
+// unique keys and associated values designed to be a more efficient replacement
+// for `std::map` (in most cases).
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_map<K, V>` uses a default allocator of
+// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate)
+// nodes, and construct and destruct values within those nodes. You may
+// instead specify a custom allocator `A` (which in turn requires specifying a
+// custom comparator `C`) as in `btree::btree_map<K, V, C, A>`.
+//
+template <typename Key, typename Value, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value>>>
+class btree_map
+    : public internal::btree_map_container<
+          internal::btree<internal::map_params<
+              Key, Value, Compare, Alloc, /*TargetNodeSize=*/256,
+              /*Multi=*/false>>> {
+  // `btree_map::btree_map_container` names the dependent base class.
+  using Base = typename btree_map::btree_map_container;
+
+ public:
+  // Default constructor; all remaining constructors are inherited from Base.
+  btree_map() = default;
+  using Base::Base;
+};
+
+// btree::swap(btree::btree_map<>, btree::btree_map<>)
+//
+// ADL-enabled swap for `btree::btree_map`; forwards to the member `x.swap(y)`.
+template <typename K, typename V, typename C, typename A>
+void swap(btree_map<K, V, C, A> &x, btree_map<K, V, C, A> &y) {
+  return x.swap(y);
+}
+
+// btree::erase_if(btree::btree_map<>, Pred)
+//
+// Erases all elements satisfying `pred`; erase() returns the successor iterator, keeping the loop valid.
+template <typename K, typename V, typename C, typename A, typename Pred>
+void erase_if(btree_map<K, V, C, A> &map, Pred pred) {
+  for (auto it = map.begin(); it != map.end();) {
+    if (pred(*it)) {
+      it = map.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// btree::btree_multimap
+//
+// A `btree::btree_multimap<K, V>` is an ordered associative container of
+// keys and associated values designed to be a more efficient replacement for
+// `std::multimap` (in most cases). Unlike `btree::btree_map`, a B-tree multimap
+// allows multiple elements with equivalent keys.
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// A `btree::btree_multimap<K, V>` uses a default allocator of
+// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate)
+// nodes, and construct and destruct values within those nodes. You may
+// instead specify a custom allocator `A` (which in turn requires specifying a
+// custom comparator `C`) as in `btree::btree_multimap<K, V, C, A>`.
+//
+template <typename Key, typename Value, typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value>>>
+class btree_multimap
+    : public internal::btree_multimap_container<
+          internal::btree<internal::map_params<
+              Key, Value, Compare, Alloc, /*TargetNodeSize=*/256,
+              /*Multi=*/true>>> {
+  using Base = typename btree_multimap::btree_multimap_container;
+  // All non-default constructors are inherited from Base below.
+ public:
+  btree_multimap() = default;
+  using Base::Base;
+};
+
+// btree::swap(btree::btree_multimap<>, btree::btree_multimap<>)
+//
+// ADL-enabled swap for `btree::btree_multimap`; forwards to member `x.swap(y)`.
+template <typename K, typename V, typename C, typename A>
+void swap(btree_multimap<K, V, C, A> &x, btree_multimap<K, V, C, A> &y) {
+  return x.swap(y);
+}
+
+// btree::erase_if(btree::btree_multimap<>, Pred)
+//
+// Erases all elements satisfying `pred`; erase() returns the successor iterator, keeping the loop valid.
+template <typename K, typename V, typename C, typename A, typename Pred>
+void erase_if(btree_multimap<K, V, C, A> &map, Pred pred) {
+  for (auto it = map.begin(); it != map.end();) {
+    if (pred(*it)) {
+      it = map.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+} // namespace btree
diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h
new file mode 100644
index 000000000..57536ce2f
--- /dev/null
+++ b/src/include/cpp-btree/btree_set.h
@@ -0,0 +1,632 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// -----------------------------------------------------------------------------
+// File: btree_set.h
+// -----------------------------------------------------------------------------
+//
+// This header file defines B-tree sets: sorted associative containers of
+// values.
+//
+// * `btree::btree_set<>`
+// * `btree::btree_multiset<>`
+//
+// These B-tree types are similar to the corresponding types in the STL
+// (`std::set` and `std::multiset`) and generally conform to the STL interfaces
+// of those types. However, because they are implemented using B-trees, they
+// are more efficient in most situations.
+//
+// Unlike `std::set` and `std::multiset`, which are commonly implemented using
+// red-black tree nodes, B-tree sets use more generic B-tree nodes able to hold
+// multiple values per node. Holding multiple values per node often makes
+// B-tree sets perform better than their `std::set` counterparts, because
+// multiple entries can be checked within the same cache hit.
+//
+// However, these types should not be considered drop-in replacements for
+// `std::set` and `std::multiset` as there are some API differences, which are
+// noted in this header file.
+//
+// Importantly, insertions and deletions may invalidate outstanding iterators,
+// pointers, and references to elements. Such invalidations are typically only
+// an issue if insertion and deletion operations are interleaved with the use of
+// more than one iterator, pointer, or reference simultaneously. For this
+// reason, `insert()` and `erase()` return a valid iterator at the current
+// position.
+
+#pragma once
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// btree::btree_set<>
+//
+// An `btree::btree_set<K>` is an ordered associative container of unique key
+// values designed to be a more efficient replacement for `std::set` (in most
+// cases).
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// An `btree::btree_set<K>` uses a default allocator of `std::allocator<K>` to
+// allocate (and deallocate) nodes, and construct and destruct values within
+// those nodes. You may instead specify a custom allocator `A` (which in turn
+// requires specifying a custom comparator `C`) as in
+// `btree::btree_set<K, C, A>`.
+//
+template <typename Key, typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<Key>>
+class btree_set
+ : public internal::btree_set_container<
+ internal::btree<internal::set_params<
+ Key, Compare, Alloc, /*TargetNodeSize=*/256,
+ /*Multi=*/false>>> {
+ using Base = typename btree_set::btree_set_container;
+
+ public:
+ // Constructors and Assignment Operators
+ //
+ // A `btree_set` supports the same overload set as `std::set`
+ // for construction and assignment:
+ //
+ // * Default constructor
+ //
+ // btree::btree_set<std::string> set1;
+ //
+ // * Initializer List constructor
+ //
+ // btree::btree_set<std::string> set2 =
+ // {{"huey"}, {"dewey"}, {"louie"},};
+ //
+ // * Copy constructor
+ //
+ // btree::btree_set<std::string> set3(set2);
+ //
+ // * Copy assignment operator
+ //
+ // btree::btree_set<std::string> set4;
+ // set4 = set3;
+ //
+ // * Move constructor
+ //
+ // // Move is guaranteed efficient
+ // btree::btree_set<std::string> set5(std::move(set4));
+ //
+ // * Move assignment operator
+ //
+ // // May be efficient if allocators are compatible
+ // btree::btree_set<std::string> set6;
+ // set6 = std::move(set5);
+ //
+ // * Range constructor
+ //
+ // std::vector<std::string> v = {"a", "b"};
+ // btree::btree_set<std::string> set7(v.begin(), v.end());
+ btree_set() {}
+ using Base::Base;
+
+ // btree_set::begin()
+ //
+ // Returns an iterator to the beginning of the `btree_set`.
+ using Base::begin;
+
+ // btree_set::cbegin()
+ //
+ // Returns a const iterator to the beginning of the `btree_set`.
+ using Base::cbegin;
+
+ // btree_set::end()
+ //
+ // Returns an iterator to the end of the `btree_set`.
+ using Base::end;
+
+ // btree_set::cend()
+ //
+ // Returns a const iterator to the end of the `btree_set`.
+ using Base::cend;
+
+ // btree_set::empty()
+ //
+ // Returns whether or not the `btree_set` is empty.
+ using Base::empty;
+
+ // btree_set::max_size()
+ //
+ // Returns the largest theoretical possible number of elements within a
+ // `btree_set` under current memory constraints. This value can be thought
+ // of as the largest value of `std::distance(begin(), end())` for a
+ // `btree_set<Key>`.
+ using Base::max_size;
+
+ // btree_set::size()
+ //
+ // Returns the number of elements currently within the `btree_set`.
+ using Base::size;
+
+ // btree_set::clear()
+ //
+ // Removes all elements from the `btree_set`. Invalidates any references,
+ // pointers, or iterators referring to contained elements.
+ using Base::clear;
+
+ // btree_set::erase()
+ //
+ // Erases elements within the `btree_set`. Overloads are listed below.
+ //
+ // iterator erase(iterator position):
+ // iterator erase(const_iterator position):
+ //
+ // Erases the element at `position` of the `btree_set`, returning
+ // the iterator pointing to the element after the one that was erased
+ // (or end() if none exists).
+ //
+ // iterator erase(const_iterator first, const_iterator last):
+ //
+  // Erases the elements in the half-open interval [`first`, `last`), returning
+ // the iterator pointing to the element after the interval that was erased
+ // (or end() if none exists).
+ //
+ // template <typename K> size_type erase(const K& key):
+ //
+ // Erases the element with the matching key, if it exists, returning the
+ // number of elements erased.
+ using Base::erase;
+
+ // btree_set::insert()
+ //
+ // Inserts an element of the specified value into the `btree_set`,
+ // returning an iterator pointing to the newly inserted element, provided that
+ // an element with the given key does not already exist. If an insertion
+ // occurs, any references, pointers, or iterators are invalidated.
+ // Overloads are listed below.
+ //
+ // std::pair<iterator,bool> insert(const value_type& value):
+ //
+ // Inserts a value into the `btree_set`. Returns a pair consisting of an
+ // iterator to the inserted element (or to the element that prevented the
+ // insertion) and a bool denoting whether the insertion took place.
+ //
+ // std::pair<iterator,bool> insert(value_type&& value):
+ //
+ // Inserts a moveable value into the `btree_set`. Returns a pair
+ // consisting of an iterator to the inserted element (or to the element that
+ // prevented the insertion) and a bool denoting whether the insertion took
+ // place.
+ //
+ // iterator insert(const_iterator hint, const value_type& value):
+ // iterator insert(const_iterator hint, value_type&& value):
+ //
+ // Inserts a value, using the position of `hint` as a non-binding suggestion
+ // for where to begin the insertion search. Returns an iterator to the
+ // inserted element, or to the existing element that prevented the
+ // insertion.
+ //
+ // void insert(InputIterator first, InputIterator last):
+ //
+ // Inserts a range of values [`first`, `last`).
+ //
+ // void insert(std::initializer_list<init_type> ilist):
+ //
+ // Inserts the elements within the initializer list `ilist`.
+ using Base::insert;
+
+ // btree_set::emplace()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_set`, provided that no element with the given key
+ // already exists.
+ //
+ // The element may be constructed even if there already is an element with the
+ // key in the container, in which case the newly constructed element will be
+ // destroyed immediately.
+ //
+ // If an insertion occurs, any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace;
+
+ // btree_set::emplace_hint()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_set`, using the position of `hint` as a non-binding
+ // suggestion for where to begin the insertion search, and only inserts
+ // provided that no element with the given key already exists.
+ //
+ // The element may be constructed even if there already is an element with the
+ // key in the container, in which case the newly constructed element will be
+ // destroyed immediately.
+ //
+ // If an insertion occurs, any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace_hint;
+
+ // btree_set::merge()
+ //
+ // Extracts elements from a given `source` btree_set into this
+ // `btree_set`. If the destination `btree_set` already contains an
+ // element with an equivalent key, that element is not extracted.
+ using Base::merge;
+
+ // btree_set::swap(btree_set& other)
+ //
+ // Exchanges the contents of this `btree_set` with those of the `other`
+ // btree_set, avoiding invocation of any move, copy, or swap operations on
+ // individual elements.
+ //
+ // All iterators and references on the `btree_set` remain valid, excepting
+ // for the past-the-end iterator, which is invalidated.
+ using Base::swap;
+
+ // btree_set::contains()
+ //
+ // template <typename K> bool contains(const K& key) const:
+ //
+ // Determines whether an element comparing equal to the given `key` exists
+ // within the `btree_set`, returning `true` if so or `false` otherwise.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::contains;
+
+ // btree_set::count()
+ //
+ // template <typename K> size_type count(const K& key) const:
+ //
+ // Returns the number of elements comparing equal to the given `key` within
+ // the `btree_set`. Note that this function will return either `1` or `0`
+ // since duplicate elements are not allowed within a `btree_set`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::count;
+
+ // btree_set::equal_range()
+ //
+  // Returns the half-open range [first, last), as a `std::pair` of two
+  // iterators, containing all elements with the passed key in the
+  // `btree_set`.
+ using Base::equal_range;
+
+ // btree_set::find()
+ //
+ // template <typename K> iterator find(const K& key):
+ // template <typename K> const_iterator find(const K& key) const:
+ //
+ // Finds an element with the passed `key` within the `btree_set`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::find;
+
+ // btree_set::get_allocator()
+ //
+ // Returns the allocator function associated with this `btree_set`.
+ using Base::get_allocator;
+
+ // btree_set::key_comp();
+ //
+ // Returns the key comparator associated with this `btree_set`.
+ using Base::key_comp;
+
+ // btree_set::value_comp();
+ //
+ // Returns the value comparator associated with this `btree_set`. The keys to
+ // sort the elements are the values themselves, therefore `value_comp` and its
+ // sibling member function `key_comp` are equivalent.
+ using Base::value_comp;
+};
+
+// btree::swap(btree::btree_set<>, btree::btree_set<>)
+//
+// ADL-enabled swap for `btree::btree_set`; forwards to the member `x.swap(y)`.
+template <typename K, typename C, typename A>
+void swap(btree_set<K, C, A> &x, btree_set<K, C, A> &y) {
+  return x.swap(y);
+}
+
+// btree::erase_if(btree::btree_set<>, Pred)
+//
+// Erases all elements satisfying `pred`; erase() returns the successor iterator, keeping the loop valid.
+template <typename K, typename C, typename A, typename Pred>
+void erase_if(btree_set<K, C, A> &set, Pred pred) {
+  for (auto it = set.begin(); it != set.end();) {
+    if (pred(*it)) {
+      it = set.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// btree::btree_multiset<>
+//
+// An `btree::btree_multiset<K>` is an ordered associative container of
+// keys and associated values designed to be a more efficient replacement
+// for `std::multiset` (in most cases). Unlike `btree::btree_set`, a B-tree
+// multiset allows equivalent elements.
+//
+// Keys are sorted using an (optional) comparison function, which defaults to
+// `std::less<K>`.
+//
+// An `btree::btree_multiset<K>` uses a default allocator of `std::allocator<K>`
+// to allocate (and deallocate) nodes, and construct and destruct values within
+// those nodes. You may instead specify a custom allocator `A` (which in turn
+// requires specifying a custom comparator `C`) as in
+// `btree::btree_multiset<K, C, A>`.
+//
+template <typename Key, typename Compare = std::less<Key>,
+ typename Alloc = std::allocator<Key>>
+class btree_multiset
+ : public internal::btree_multiset_container<
+ internal::btree<internal::set_params<
+ Key, Compare, Alloc, /*TargetNodeSize=*/256,
+ /*Multi=*/true>>> {
+ using Base = typename btree_multiset::btree_multiset_container;
+
+ public:
+ // Constructors and Assignment Operators
+ //
+ // A `btree_multiset` supports the same overload set as `std::set`
+ // for construction and assignment:
+ //
+ // * Default constructor
+ //
+ // btree::btree_multiset<std::string> set1;
+ //
+ // * Initializer List constructor
+ //
+ // btree::btree_multiset<std::string> set2 =
+ // {{"huey"}, {"dewey"}, {"louie"},};
+ //
+ // * Copy constructor
+ //
+ // btree::btree_multiset<std::string> set3(set2);
+ //
+ // * Copy assignment operator
+ //
+ // btree::btree_multiset<std::string> set4;
+ // set4 = set3;
+ //
+ // * Move constructor
+ //
+ // // Move is guaranteed efficient
+ // btree::btree_multiset<std::string> set5(std::move(set4));
+ //
+ // * Move assignment operator
+ //
+ // // May be efficient if allocators are compatible
+ // btree::btree_multiset<std::string> set6;
+ // set6 = std::move(set5);
+ //
+ // * Range constructor
+ //
+ // std::vector<std::string> v = {"a", "b"};
+ // btree::btree_multiset<std::string> set7(v.begin(), v.end());
+ btree_multiset() {}
+ using Base::Base;
+
+ // btree_multiset::begin()
+ //
+ // Returns an iterator to the beginning of the `btree_multiset`.
+ using Base::begin;
+
+ // btree_multiset::cbegin()
+ //
+ // Returns a const iterator to the beginning of the `btree_multiset`.
+ using Base::cbegin;
+
+ // btree_multiset::end()
+ //
+ // Returns an iterator to the end of the `btree_multiset`.
+ using Base::end;
+
+ // btree_multiset::cend()
+ //
+ // Returns a const iterator to the end of the `btree_multiset`.
+ using Base::cend;
+
+ // btree_multiset::empty()
+ //
+ // Returns whether or not the `btree_multiset` is empty.
+ using Base::empty;
+
+ // btree_multiset::max_size()
+ //
+ // Returns the largest theoretical possible number of elements within a
+ // `btree_multiset` under current memory constraints. This value can be
+ // thought of as the largest value of `std::distance(begin(), end())` for a
+ // `btree_multiset<Key>`.
+ using Base::max_size;
+
+ // btree_multiset::size()
+ //
+ // Returns the number of elements currently within the `btree_multiset`.
+ using Base::size;
+
+ // btree_multiset::clear()
+ //
+ // Removes all elements from the `btree_multiset`. Invalidates any references,
+ // pointers, or iterators referring to contained elements.
+ using Base::clear;
+
+ // btree_multiset::erase()
+ //
+ // Erases elements within the `btree_multiset`. Overloads are listed below.
+ //
+ // iterator erase(iterator position):
+ // iterator erase(const_iterator position):
+ //
+ // Erases the element at `position` of the `btree_multiset`, returning
+ // the iterator pointing to the element after the one that was erased
+ // (or end() if none exists).
+ //
+ // iterator erase(const_iterator first, const_iterator last):
+ //
+  // Erases the elements in the half-open interval [`first`, `last`), returning
+ // the iterator pointing to the element after the interval that was erased
+ // (or end() if none exists).
+ //
+ // template <typename K> size_type erase(const K& key):
+ //
+ // Erases the elements matching the key, if any exist, returning the
+ // number of elements erased.
+ using Base::erase;
+
+ // btree_multiset::insert()
+ //
+ // Inserts an element of the specified value into the `btree_multiset`,
+ // returning an iterator pointing to the newly inserted element.
+ // Any references, pointers, or iterators are invalidated. Overloads are
+ // listed below.
+ //
+ // iterator insert(const value_type& value):
+ //
+ // Inserts a value into the `btree_multiset`, returning an iterator to the
+ // inserted element.
+ //
+ // iterator insert(value_type&& value):
+ //
+ // Inserts a moveable value into the `btree_multiset`, returning an iterator
+ // to the inserted element.
+ //
+ // iterator insert(const_iterator hint, const value_type& value):
+ // iterator insert(const_iterator hint, value_type&& value):
+ //
+ // Inserts a value, using the position of `hint` as a non-binding suggestion
+ // for where to begin the insertion search. Returns an iterator to the
+ // inserted element.
+ //
+ // void insert(InputIterator first, InputIterator last):
+ //
+ // Inserts a range of values [`first`, `last`).
+ //
+ // void insert(std::initializer_list<init_type> ilist):
+ //
+ // Inserts the elements within the initializer list `ilist`.
+ using Base::insert;
+
+ // btree_multiset::emplace()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_multiset`. Any references, pointers, or iterators are
+ // invalidated.
+ using Base::emplace;
+
+ // btree_multiset::emplace_hint()
+ //
+ // Inserts an element of the specified value by constructing it in-place
+ // within the `btree_multiset`, using the position of `hint` as a non-binding
+ // suggestion for where to begin the insertion search.
+ //
+ // Any references, pointers, or iterators are invalidated.
+ using Base::emplace_hint;
+
+ // btree_multiset::merge()
+ //
+ // Extracts elements from a given `source` btree_multiset into this
+ // `btree_multiset`. If the destination `btree_multiset` already contains an
+ // element with an equivalent key, that element is not extracted.
+ using Base::merge;
+
+ // btree_multiset::swap(btree_multiset& other)
+ //
+ // Exchanges the contents of this `btree_multiset` with those of the `other`
+ // btree_multiset, avoiding invocation of any move, copy, or swap operations
+ // on individual elements.
+ //
+ // All iterators and references on the `btree_multiset` remain valid,
+ // excepting for the past-the-end iterator, which is invalidated.
+ using Base::swap;
+
+ // btree_multiset::contains()
+ //
+ // template <typename K> bool contains(const K& key) const:
+ //
+ // Determines whether an element comparing equal to the given `key` exists
+ // within the `btree_multiset`, returning `true` if so or `false` otherwise.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::contains;
+
+ // btree_multiset::count()
+ //
+ // template <typename K> size_type count(const K& key) const:
+ //
+ // Returns the number of elements comparing equal to the given `key` within
+ // the `btree_multiset`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::count;
+
+ // btree_multiset::equal_range()
+ //
+ // Returns a closed range [first, last], defined by a `std::pair` of two
+ // iterators, containing all elements with the passed key in the
+ // `btree_multiset`.
+ using Base::equal_range;
+
+ // btree_multiset::find()
+ //
+ // template <typename K> iterator find(const K& key):
+ // template <typename K> const_iterator find(const K& key) const:
+ //
+ // Finds an element with the passed `key` within the `btree_multiset`.
+ //
+ // Supports heterogeneous lookup, provided that the set is provided a
+ // compatible heterogeneous comparator.
+ using Base::find;
+
+ // btree_multiset::get_allocator()
+ //
+ // Returns the allocator function associated with this `btree_multiset`.
+ using Base::get_allocator;
+
+ // btree_multiset::key_comp();
+ //
+ // Returns the key comparator associated with this `btree_multiset`.
+ using Base::key_comp;
+
+ // btree_multiset::value_comp();
+ //
+ // Returns the value comparator associated with this `btree_multiset`. The
+ // keys to sort the elements are the values themselves, therefore `value_comp`
+ // and its sibling member function `key_comp` are equivalent.
+ using Base::value_comp;
+};
+
+// btree::swap(btree::btree_multiset<>, btree::btree_multiset<>)
+//
+// Swaps the contents of two `btree::btree_multiset` containers.
+template <typename K, typename C, typename A>
+void swap(btree_multiset<K, C, A> &x, btree_multiset<K, C, A> &y) {
+ return x.swap(y);
+}
+
+// btree::erase_if(btree::btree_multiset<>, Pred)
+//
+// Erases all elements that satisfy the predicate pred from the container.
+template <typename K, typename C, typename A, typename Pred>
+void erase_if(btree_multiset<K, C, A> &set, Pred pred) {
+ for (auto it = set.begin(); it != set.end();) {
+ if (pred(*it)) {
+ it = set.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
+} // namespace btree
diff --git a/src/include/cpp_lib_backport.h b/src/include/cpp_lib_backport.h
new file mode 100644
index 000000000..ea956c446
--- /dev/null
+++ b/src/include/cpp_lib_backport.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+
+namespace std {
+
+#ifndef __cpp_lib_bit_cast
+#define __cpp_lib_bit_cast 201806L
+
+/// Create a value of type `To` from the bits of `from`.
+template<typename To, typename From>
+requires (sizeof(To) == sizeof(From)) &&
+ std::is_trivially_copyable_v<From> &&
+ std::is_trivially_copyable_v<To>
+[[nodiscard]] constexpr To
+bit_cast(const From& from) noexcept {
+#if __has_builtin(__builtin_bit_cast)
+ return __builtin_bit_cast(To, from);
+#else
+ static_assert(std::is_trivially_constructible_v<To>);
+ To to;
+ std::memcpy(&to, &from, sizeof(To));
+ return to;
+#endif
+}
+
+#endif // __cpp_lib_bit_cast
+
+} // namespace std
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
new file mode 100644
index 000000000..dd4ede666
--- /dev/null
+++ b/src/include/crc32c.h
@@ -0,0 +1,57 @@
+#ifndef CEPH_CRC32C_H
+#define CEPH_CRC32C_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
+
+/*
+ * this is a static global with the chosen crc32c implementation for
+ * the given architecture.
+ */
+extern ceph_crc32c_func_t ceph_crc32c_func;
+
+extern ceph_crc32c_func_t ceph_choose_crc32(void);
+
+/**
+ * calculate crc32c for data that is entirely 0 (ZERO)
+ *
+ * Note: works the same as ceph_crc32c_func for data == nullptr,
+ * but faster than the optimized assembly on certain architectures.
+ * This is faster than intel optimized assembly, but not as fast as
+ * ppc64le optimized assembly.
+ *
+ * @param crc initial value
+ * @param length length of buffer
+ */
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
+
+/**
+ * calculate crc32c
+ *
+ * Note: if the data pointer is NULL, we calculate a crc value as if
+ * it were zero-filled.
+ *
+ * @param crc initial value
+ * @param data pointer to data buffer
+ * @param length length of buffer
+ */
static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
{
#ifndef HAVE_POWER8
  /* A NULL data pointer means "crc of a zero-filled buffer".  For runs
   * longer than 16 bytes the dedicated zeros helper is faster than
   * feeding actual zeros through the generic function pointer -- except
   * on POWER8, whose optimized assembly is faster still (see the
   * ceph_crc32c_zeros comment above), so the shortcut is compiled out
   * there. */
  if (!data && length > 16)
    return ceph_crc32c_zeros(crc, length);
#endif /* HAVE_POWER8 */

  /* dispatch to the implementation chosen for this architecture */
  return ceph_crc32c_func(crc, data, length);
}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/demangle.h b/src/include/demangle.h
new file mode 100644
index 000000000..9e46d952f
--- /dev/null
+++ b/src/include/demangle.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INCLUDE_DEMANGLE
+#define CEPH_INCLUDE_DEMANGLE
+
+//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+#ifdef __GNUG__
#include <cstdlib>
#include <memory>
#include <string>   // std::string is part of this header's interface
#include <cxxabi.h>
+
static std::string ceph_demangle(const char* name)
{
  // __cxa_demangle sets status to 0 on success; any other value means
  // `name` was not a valid mangled symbol (or allocation failed).
  int status = -4;  // arbitrary nonzero initializer to quiet warnings

  // __cxa_demangle returns a malloc()ed buffer; hand ownership to a
  // unique_ptr with std::free as the deleter so it cannot leak.
  char* const demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
  std::unique_ptr<char, void(*)(void*)> owner{demangled, std::free};

  if (status == 0 && demangled != nullptr) {
    return demangled;
  }
  // demangling failed: return the mangled name unchanged
  return name;
}
+
+#else
+
// Fallback for compilers without <cxxabi.h>: demangling is unavailable,
// so the mangled name is returned unchanged.
//
// NOTE(review): the GNU branch above defines ceph_demangle(), but this
// fallback only defined demangle(), so a non-GNU build calling
// ceph_demangle() would fail to compile.  Define ceph_demangle() here
// too, and keep demangle() for backward compatibility.
static std::string ceph_demangle(const char* name)
{
  return name;
}

static std::string demangle(const char* name)
{
  return ceph_demangle(name);
}
+
+#endif
+
+
+#endif
diff --git a/src/include/denc.h b/src/include/denc.h
new file mode 100644
index 000000000..d075dd518
--- /dev/null
+++ b/src/include/denc.h
@@ -0,0 +1,1895 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// If you #include "include/encoding.h" you get the old-style *and*
+// the new-style definitions. (The old-style needs denc_traits<> in
+// order to disable the container helpers when new-style traits are
+// present.)
+
+// You can also just #include "include/denc.h" and get only the
+// new-style helpers. The eventual goal is to drop the legacy
+// definitions.
+
+#ifndef _ENC_DEC_H
+#define _ENC_DEC_H
+
+#include <array>
+#include <bit>
+#include <cstring>
+#include <concepts>
+#include <map>
+#include <optional>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/small_vector.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/optional.hpp>
+
+#include "include/cpp_lib_backport.h"
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "include/scope_guard.h"
+
+#include "buffer.h"
+#include "byteorder.h"
+
+#include "common/convenience.h"
+#include "common/error_code.h"
+
// Primary trait template: a type is not denc-encodable unless a
// specialization says otherwise.  Specializations flip `supported` and
// provide bound_encode/encode/decode (the full protocol is described in
// the large comment further down in this file).
template<typename T, typename=void>
struct denc_traits {
  static constexpr bool supported = false;       // no denc support for T
  static constexpr bool featured = false;        // encode/decode take a feature bitmask
  static constexpr bool bounded = false;         // bound_encode gives a cheap upper bound
  static constexpr bool need_contiguous = true;  // decode requires a contiguous buffer
};

// Shorthand for denc_traits<T>::supported.
template<typename T>
inline constexpr bool denc_supported = denc_traits<T>::supported;
+
+
+// hack for debug only; FIXME
+//#include <iostream>
+//using std::cout;
+
+// Define this to compile in a dump of all encoded objects to disk to
+// populate ceph-object-corpus. Note that there is an almost
+// identical implementation in encoding.h, but you only need to define
+// ENCODE_DUMP_PATH here.
+//
+// See src/test/encoding/generate-corpus-objects.sh.
+//
+//#define ENCODE_DUMP_PATH /tmp/something
+
+#ifdef ENCODE_DUMP_PATH
+# include <cstdio>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+
+# define ENCODE_STR(x) #x
+# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
+
// Debug-only helper (compiled in when ENCODE_DUMP_PATH is defined, see
// above): samples encoded objects and writes their raw encodings to
// files, to populate ceph-object-corpus.  Construct one before encoding
// begins; the destructor dumps whatever was appended between
// construction and destruction.
template<typename T>
class DencDumper {
public:
  DencDumper(const char* name,
	     const ceph::bufferlist::contiguous_appender& appender)
    : name{name},
      appender{appender},
      bl_offset{appender.bl.length()},
      space_offset{space_size()},
      start{appender.get_pos()}
  {}
  ~DencDumper() {
    if (do_sample()) {
      dump();
    }
  }
private:
  // Decide whether this object gets dumped: sample when the running
  // counter has at most two set bits, thinning dumps out over time.
  static bool do_sample() {
    // this hackery with bits below is just to get a semi-reasonable
    // distribution across time. it is somewhat exponential but not
    // quite.
    i++;
    int bits = 0;
    for (unsigned t = i; t; bits++)
      t &= t - 1;  // clear lowest set bit; iteration count == popcount
    return bits <= 2;
  }
  // Bytes sitting in the appender that are not yet accounted to the
  // underlying bufferlist.
  size_t space_size() const {
    return appender.get_logical_offset() - appender.get_out_of_band_offset();
  }
  // Write the bytes encoded during this dumper's lifetime to
  // ENCODE_DUMP_PATH/<Type>__<pid>.<sample#> (best effort).
  void dump() const {
    char fn[PATH_MAX];
    ::snprintf(fn, sizeof(fn),
	       ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", name,
	       getpid(), i++);
    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644);
    if (fd < 0) {
      return;  // silently skip if the dump directory is unavailable
    }
    auto close_fd = make_scope_guard([fd] { ::close(fd); });
    if (auto bl_delta = appender.bl.length() - bl_offset; bl_delta > 0) {
      // encoding spilled into the bufferlist: stitch together the part
      // already flushed to the list and the bytes still in the appender
      ceph::bufferlist dump_bl;
      appender.bl.begin(bl_offset + space_offset).copy(bl_delta - space_offset, dump_bl);
      const size_t space_len = space_size();
      dump_bl.append(appender.get_pos() - space_len, space_len);
      dump_bl.write_fd(fd);
    } else {
      // everything is still in the appender's contiguous buffer
      size_t len = appender.get_pos() - start;
      [[maybe_unused]] int r = ::write(fd, start, len);
    }
  }
  const char* name;
  const ceph::bufferlist::contiguous_appender& appender;
  const size_t bl_offset;     // bufferlist length at construction
  const size_t space_offset;  // unflushed appender bytes at construction
  const char* start;          // appender write position at construction
  static int i;               // process-wide sample counter
};

template<typename T> int DencDumper<T>::i = 0;
+
+# define DENC_DUMP_PRE(Type) \
+ DencDumper<Type> _denc_dumper{#Type, p};
+#else
+# define DENC_DUMP_PRE(Type)
+#endif
+
+
+/*
+
+  top-level functions look like so
+ ======================================
+
+ inline void denc(const T& o, size_t& p, uint64_t features=0);
+ inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t features=0);
+ inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features=0);
+
+ or (for featured objects)
+
+ inline void denc(const T& o, size_t& p, uint64_t features);
+ inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t features);
+ inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features);
+
+ - These are symmetrical, so that they can be used from the magic DENC
+ method of writing the bound_encode/encode/decode methods all in one go;
+ they differ only in the type of p.
+
+ - These are automatically fabricated via a template that calls into
+ the denc_traits<> methods (see below), provided denc_traits<T>::supported
+ is defined and true. They never need to be written explicitly.
+
+
+ static denc_traits<> definitions look like so
+ =============================================
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f=0);
+ static void encode(const T &o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f=0);
+ static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ or (for featured objects)
+
+ template<>
+ struct denc_traits<T> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = true;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const T &o, size_t& p, uint64_t f);
+ static void encode(const T &o, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f);
+ static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0);
+ };
+
+ - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro,
+ which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro.
+ There are _FEATURED and _BOUNDED variants. The class traits simply call
+ into class methods of the same name (see below).
+
+ - denc_traits<T> can also be written explicitly for some type to indicate
+ how it should be encoded. This is the "source of truth" for how a type
+ is encoded.
+
+ - denc_traits<T> are declared for the base integer types, string, ceph::buffer::ptr,
+ and ceph::buffer::list base types.
+
+ - denc_traits<std::foo<T>>-like traits are declared for standard container
+ types.
+
+
+ class methods look like so
+ ==========================
+
+ void bound_encode(size_t& p) const;
+ void encode(ceph::buffer::list::contiguous_appender& p) const;
+ void decode(ceph::buffer::ptr::const_iterator &p);
+
+ or (for featured objects)
+
+ void bound_encode(size_t& p, uint64_t f) const;
+ void encode(ceph::buffer::list::contiguous_appender& p, uint64_t f) const;
+ void decode(ceph::buffer::ptr::const_iterator &p);
+
+ - These are normally invoked by the denc_traits<> methods that are
+ declared via WRITE_CLASS_DENC, although you can also invoke them explicitly
+ in your code.
+
+  - These methods are optimised for contiguous buffers, but denc() will try
+    to rebuild a contiguous one if the decoded ceph::buffer::list is segmented. If you are
+ concerned about the cost, you might want to define yet another method:
+
+ void decode(ceph::buffer::list::iterator &p);
+
+ - These can be defined either explicitly (as above), or can be "magically"
+ defined all in one go using the DENC macro and DENC_{START,FINISH} helpers
+ (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros):
+
+ class foo_t {
+ ...
+ DENC(foo_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.foo, p);
+ denc(v.bar, p);
+ denc(v.baz, p);
+ DENC_FINISH(p);
+ }
+ ...
+ };
+ WRITE_CLASS_DENC(foo_t)
+
+ */
+
+// ---------------------------------------------------------------------
+// raw types
namespace _denc {
// True iff T is the same type as one of U...
template<typename T, typename... U>
concept is_any_of = (std::same_as<T, U> || ...);

// Map an enum to its underlying integer type; every other type maps to
// itself.
template<typename T, typename=void> struct underlying_type {
  using type = T;
};
template<typename T>
struct underlying_type<T, std::enable_if_t<std::is_enum_v<T>>> {
  using type = std::underlying_type_t<T>;
};
template<typename T>
using underlying_type_t = typename underlying_type<T>::type;
}
+
// An iterator is "const" (read-only, i.e. a decode source) when its
// get_pos_add() hands back a const char*.
template<class It>
concept is_const_iterator = requires(It& it, size_t n) {
  { it.get_pos_add(n) } -> std::same_as<const char*>;
};

// Read a T in place from a const (decode) iterator, advancing it by
// sizeof(T).
template<typename T, is_const_iterator It>
const T& get_pos_add(It& i) {
  return *reinterpret_cast<const T*>(i.get_pos_add(sizeof(T)));
}

// Obtain a writable T slot from a mutable (encode) iterator, advancing
// it by sizeof(T).
template<typename T, class It>
requires (!is_const_iterator<It>)
T& get_pos_add(It& i) {
  return *reinterpret_cast<T*>(i.get_pos_add(sizeof(T)));
}
+
// denc_traits for byte-wide types and the ceph_le* wrapper types, which
// are already in their on-wire (little-endian) representation: encoding
// is a plain byte copy and the bound is exact (bounded = true).
template<typename T>
requires _denc::is_any_of<_denc::underlying_type_t<T>,
                          ceph_le64, ceph_le32, ceph_le16, uint8_t
#ifndef _CHAR_IS_SIGNED
                          , int8_t
#endif
                          >
struct denc_traits<T> {
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = true;
  static constexpr bool need_contiguous = false;
  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
    p += sizeof(T);  // fixed-size encoding
  }
  template<class It>
  requires (!is_const_iterator<It>)
  static void encode(const T &o, It& p, uint64_t f=0) {
    get_pos_add<T>(p) = o;
  }
  template<is_const_iterator It>
  static void decode(T& o, It& p, uint64_t f=0) {
    o = get_pos_add<T>(p);
  }
  // segmented-buffer decode: copy byte-wise rather than aliasing memory
  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
    p.copy(sizeof(T), reinterpret_cast<char*>(&o));
  }
};
+
+
+// -----------------------------------------------------------------------
+// integer types
+
+// itype == internal type
+// otype == external type, i.e., the type on the wire
+
+// NOTE: the overload resolution ensures that the legacy encode/decode methods
+// defined for int types is preferred to the ones defined using the specialized
+// template, and hence get selected. This machinery prevents these from
+// getting glued into the legacy encode/decode methods; the overhead of setting
+// up a contiguous_appender etc is likely to be slower.
namespace _denc {

// ExtType<T> maps a native integer type to its fixed-width
// little-endian wire representation; `void` means "no mapping" (and the
// integer denc_traits below is disabled for such types).
template<typename T> struct ExtType {
  using type = void;
};

template<typename T>
requires _denc::is_any_of<T,
                          int16_t, uint16_t>
struct ExtType<T> {
  using type = ceph_le16;
};

template<typename T>
requires _denc::is_any_of<T,
                          int32_t, uint32_t>
struct ExtType<T> {
  using type = ceph_le32;
};

template<typename T>
requires _denc::is_any_of<T,
                          int64_t, uint64_t>
struct ExtType<T> {
  using type = ceph_le64;
};

// bool goes over the wire as a single byte
template<>
struct ExtType<bool> {
  using type = uint8_t;
};
template<typename T>
using ExtType_t = typename ExtType<T>::type;
} // namespace _denc
+
// denc_traits for native integer types (and bool): values are converted
// to/from their fixed-width little-endian external type (etype) on the
// wire, so encoding is endian-safe and of known size.
template<typename T>
requires (!std::is_void_v<_denc::ExtType_t<T>>)
struct denc_traits<T>
{
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = true;
  static constexpr bool need_contiguous = false;
  using etype = _denc::ExtType_t<T>;  // on-wire representation
  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
    p += sizeof(etype);
  }
  template<class It>
  requires (!is_const_iterator<It>)
  static void encode(const T &o, It& p, uint64_t f=0) {
    get_pos_add<etype>(p) = o;  // implicit native -> le conversion
  }
  template<is_const_iterator It>
  static void decode(T& o, It &p, uint64_t f=0) {
    o = get_pos_add<etype>(p);  // implicit le -> native conversion
  }
  // segmented-buffer decode: stage through an etype, then convert
  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
    etype e;
    p.copy(sizeof(etype), reinterpret_cast<char*>(&e));
    o = e;
  }
};
+
+// varint
+//
+// high bit of each byte indicates another byte follows.
// bound: one continuation bit per 7 payload bits, so sizeof(T)+1 bytes
// always suffices.
template<typename T>
inline void denc_varint(T v, size_t& p) {
  p += sizeof(T) + 1;
}

// encode: emit 7 data bits per byte, least-significant group first; the
// high bit is set on every byte except the last.
template<typename T>
inline void denc_varint(T v, ceph::buffer::list::contiguous_appender& p) {
  uint8_t byte = v & 0x7f;
  v >>= 7;
  while (v) {
    byte |= 0x80;  // more bytes follow
    get_pos_add<__u8>(p) = byte;
    byte = (v & 0x7f);
    v >>= 7;
  }
  get_pos_add<__u8>(p) = byte;
}

// decode: accumulate 7 bits per byte until a byte without the
// continuation bit is seen.
// NOTE(review): assumes a well-formed stream; input longer than T can
// hold shifts past the width of T -- do not feed untrusted data without
// validating elsewhere.
template<typename T>
inline void denc_varint(T& v, ceph::buffer::ptr::const_iterator& p) {
  uint8_t byte = *(__u8*)p.get_pos_add(1);
  v = byte & 0x7f;
  int shift = 7;
  while (byte & 0x80) {
    byte = get_pos_add<__u8>(p);
    v |= (T)(byte & 0x7f) << shift;
    shift += 7;
  }
}
+
+
+// signed varint encoding
+//
+// low bit = 1 = negative, 0 = positive
+// high bit of every byte indicates whether another byte follows.
// bound: sign bit plus continuation overhead; sizeof+2 always suffices.
inline void denc_signed_varint(int64_t v, size_t& p) {
  p += sizeof(v) + 2;
}
// encode: fold the sign into the low bit (negative -> (-v << 1) | 1,
// non-negative -> v << 1), then emit as an unsigned varint.
// NOTE(review): -v is UB for INT64_MIN -- presumably never encoded in
// practice; confirm before relying on full-range input.
template<class It>
requires (!is_const_iterator<It>)
void denc_signed_varint(int64_t v, It& p) {
  if (v < 0) {
    v = (-v << 1) | 1;
  } else {
    v <<= 1;
  }
  denc_varint(v, p);
}

// decode: the low bit carries the sign, the remaining bits the
// magnitude.
template<typename T, is_const_iterator It>
inline void denc_signed_varint(T& v, It& p)
{
  int64_t i = 0;
  denc_varint(i, p);
  if (i & 1) {
    v = -(i >> 1);
  } else {
    v = i >> 1;
  }
}
+
+// varint + lowz encoding
+//
+// first(low) 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 5 bits data in first byte, 7 bits data thereafter)
// bound: nibble-count bits plus continuation overhead; sizeof+2 suffices.
inline void denc_varint_lowz(uint64_t v, size_t& p) {
  p += sizeof(v) + 2;
}
// encode: strip up to 3 trailing zero nibbles from v, record the count
// in the low 2 bits, then emit as an unsigned varint (see the format
// comment above).
inline void denc_varint_lowz(uint64_t v,
			     ceph::buffer::list::contiguous_appender& p) {
  int lowznib = v ? (std::countr_zero(v) / 4) : 0;
  if (lowznib > 3)
    lowznib = 3;  // only 2 bits available to record the count
  v >>= lowznib * 4;
  v <<= 2;
  v |= lowznib;
  denc_varint(v, p);
}

// decode: inverse of the above -- the low 2 bits give the number of
// zero nibbles to shift back in.
template<typename T>
inline void denc_varint_lowz(T& v, ceph::buffer::ptr::const_iterator& p)
{
  uint64_t i = 0;
  denc_varint(i, p);
  int lowznib = (i & 3);
  i >>= 2;
  i <<= lowznib * 4;
  v = i;
}
+
+// signed varint + lowz encoding
+//
+// first low bit = 1 for negative, 0 for positive
+// next 2 bits = how many low zero bits (nibbles)
+// high bit of each byte = another byte follows
+// (so, 4 bits data in first byte, 7 bits data thereafter)
// bound: sign + nibble-count bits plus continuation; sizeof+2 suffices.
inline void denc_signed_varint_lowz(int64_t v, size_t& p) {
  p += sizeof(v) + 2;
}
// encode: take the magnitude, strip up to 3 trailing zero nibbles,
// pack [count:2][negative:1] into the low 3 bits, emit as a varint
// (see the format comment above).
template<class It>
requires (!is_const_iterator<It>)
inline void denc_signed_varint_lowz(int64_t v, It& p) {
  bool negative = false;
  if (v < 0) {
    v = -v;
    negative = true;
  }
  unsigned lowznib = v ? (std::countr_zero(std::bit_cast<uint64_t>(v)) / 4) : 0u;
  if (lowznib > 3)
    lowznib = 3;  // only 2 bits available to record the count
  v >>= lowznib * 4;
  v <<= 3;
  v |= lowznib << 1;
  v |= (int)negative;
  denc_varint(v, p);
}

// decode: undo the packing above -- bit 0 is the sign, bits 1..2 the
// number of zero nibbles to restore.
template<typename T, is_const_iterator It>
inline void denc_signed_varint_lowz(T& v, It& p)
{
  int64_t i = 0;
  denc_varint(i, p);
  int lowznib = (i & 6) >> 1;
  if (i & 1) {
    i >>= 3;
    i <<= lowznib * 4;
    v = -i;
  } else {
    i >>= 3;
    i <<= lowznib * 4;
    v = i;
  }
}
+
+
+// LBA
+//
+// first 1-3 bits = how many low zero bits
+// *0 = 12 (common 4 K alignment case)
+// *01 = 16
+// *011 = 20
+// *111 = byte
+// then 28-30 bits of data
+// then last bit = another byte follows
+// high bit of each subsequent byte = another byte follows
// bound: 4-byte first word plus varint continuation; sizeof+2 suffices.
inline void denc_lba(uint64_t v, size_t& p) {
  p += sizeof(v) + 2;
}

// encode: pick a prefix per the format comment above -- *0 for 12 low
// zero bits (4K alignment), *01 for 16, *011 for 20, *111 for
// byte-granular values -- then pack up to 28-30 data bits plus a
// continuation bit into the first 32-bit word, spilling the rest into
// varint-style bytes.
template<class It>
requires (!is_const_iterator<It>)
inline void denc_lba(uint64_t v, It& p) {
  int low_zero_nibbles = v ? std::countr_zero(v) / 4 : 0;
  int pos;
  uint32_t word;
  int t = low_zero_nibbles - 3;
  if (t < 0) {
    // fewer than 12 low zero bits: byte-granular (*111) encoding
    pos = 3;
    word = 0x7;
  } else if (t < 3) {
    // 12/16/20 low zero bits: strip them and record which case
    v >>= (low_zero_nibbles * 4);
    pos = t + 1;
    word = (1 << t) - 1;
  } else {
    // more than 20 low zero bits: cap at the 20-bit (*011) case
    v >>= 20;
    pos = 3;
    word = 0x3;
  }
  word |= (v << pos) & 0x7fffffff;
  v >>= 31 - pos;
  if (!v) {
    // fits entirely in the first word (high/continuation bit clear)
    *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
    return;
  }
  word |= 0x80000000;  // continuation: more bytes follow
  *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word;
  uint8_t byte = v & 0x7f;
  v >>= 7;
  while (v) {
    byte |= 0x80;
    *(__u8*)p.get_pos_add(1) = byte;
    byte = (v & 0x7f);
    v >>= 7;
  }
  *(__u8*)p.get_pos_add(1) = byte;
}

// decode: inspect the low 3 bits of the first word to recover the
// alignment case, then pull in varint continuation bytes while the
// continuation bit stays set.
template<is_const_iterator It>
inline void denc_lba(uint64_t& v, It& p) {
  uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t));
  int shift = 0;
  switch (word & 7) {
  case 0:
  case 2:
  case 4:
  case 6:
    // *0: value had 12 low zero bits
    v = (uint64_t)(word & 0x7ffffffe) << (12 - 1);
    shift = 12 + 30;
    break;
  case 1:
  case 5:
    // *01: value had 16 low zero bits
    v = (uint64_t)(word & 0x7ffffffc) << (16 - 2);
    shift = 16 + 29;
    break;
  case 3:
    // *011: value had 20 low zero bits
    v = (uint64_t)(word & 0x7ffffff8) << (20 - 3);
    shift = 20 + 28;
    break;
  case 7:
    // *111: byte-granular value, no implied zero bits
    v = (uint64_t)(word & 0x7ffffff8) >> 3;
    shift = 28;
  }
  uint8_t byte = word >> 24;  // top byte holds the continuation bit
  while (byte & 0x80) {
    byte = *(__u8*)p.get_pos_add(1);
    v |= (uint64_t)(byte & 0x7f) << shift;
    shift += 7;
  }
}
+
+
+// ---------------------------------------------------------------------
+// denc top-level methods that call into denc_traits<T> methods
+
// Top-level bound_encode entry point: accumulate an upper bound for the
// encoded size of `o` into `p`, forwarding the feature bits only when
// the traits are featured.
template<typename T, typename traits=denc_traits<T>>
inline std::enable_if_t<traits::supported> denc(
  const T& o,
  size_t& p,
  uint64_t f=0)
{
  if constexpr (traits::featured) {
    traits::bound_encode(o, p, f);
  } else {
    traits::bound_encode(o, p);
  }
}

// Top-level encode entry point: `It` is a writable appender-style
// iterator (anything that is not a const iterator).
template<typename T, class It, typename traits=denc_traits<T>>
requires traits::supported && (!is_const_iterator<It>)
inline void
denc(const T& o,
     It& p,
     uint64_t features=0)
{
  if constexpr (traits::featured) {
    traits::encode(o, p, features);
  } else {
    traits::encode(o, p);
  }
}

// Top-level decode entry point for contiguous (const-iterator) input.
template<typename T, is_const_iterator It, typename traits=denc_traits<T>>
requires traits::supported
inline void
denc(T& o,
     It& p,
     uint64_t features=0)
{
  if constexpr (traits::featured) {
    traits::decode(o, p, features);
  } else {
    traits::decode(o, p);
  }
}
+
namespace _denc {
// has_legacy_denc<T>: detects whether T can decode from a (possibly
// segmented) bufferlist iterator -- either via a member decode() taking
// that iterator, or, when the traits do not need contiguous memory, via
// the traits' own decode.  The matching specialization also supplies
// the dispatch.
template<typename T, typename = void>
struct has_legacy_denc : std::false_type {};
template<typename T>
struct has_legacy_denc<T, decltype(std::declval<T&>()
				   .decode(std::declval<
					   ceph::buffer::list::const_iterator&>()))>
  : std::true_type {
  // member decode() exists: call it directly
  static void decode(T& v, ceph::buffer::list::const_iterator& p) {
    v.decode(p);
  }
};
template<typename T>
struct has_legacy_denc<T,
		       std::enable_if_t<
			 !denc_traits<T>::need_contiguous>> : std::true_type {
  // traits can decode from a segmented buffer: route through the traits
  static void decode(T& v, ceph::buffer::list::const_iterator& p) {
    denc_traits<T>::decode(v, p);
  }
};
}
+
// Top-level decode entry point for a (possibly segmented) bufferlist
// iterator; only available when has_legacy_denc found a way to decode
// without a contiguous buffer.
template<typename T,
	 typename traits=denc_traits<T>,
	 typename has_legacy_denc=_denc::has_legacy_denc<T>>
inline std::enable_if_t<traits::supported &&
			has_legacy_denc::value> denc(
  T& o,
  ceph::buffer::list::const_iterator& p)
{
  has_legacy_denc::decode(o, p);
}
+
+// ---------------------------------------------------------------------
+// base types and containers
+
+//
+// std::string
+//
// denc_traits for std::basic_string<char,...>: encoded as a 32-bit
// length prefix followed by the raw bytes.  Not bounded (size depends
// on the string), does not need contiguous input.
template<typename A>
struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> {
private:
  using value_type = std::basic_string<char,std::char_traits<char>,A>;

public:
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = false;
  static constexpr bool need_contiguous = false;

  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) {
    p += sizeof(uint32_t) + s.size();  // length prefix + payload
  }
  template<class It>
  static void encode(const value_type& s,
		     It& p,
		     uint64_t f=0) {
    denc((uint32_t)s.size(), p);
    memcpy(p.get_pos_add(s.size()), s.data(), s.size());
  }
  template<class It>
  static void decode(value_type& s,
		     It& p,
		     uint64_t f=0) {
    uint32_t len;
    denc(len, p);
    decode_nohead(len, s, p);
  }
  // segmented-buffer decode
  static void decode(value_type& s, ceph::buffer::list::const_iterator& p)
  {
    uint32_t len;
    denc(len, p);
    decode_nohead(len, s, p);
  }
  // decode `len` raw bytes (no length prefix) from a contiguous iterator
  template<class It>
  static void decode_nohead(size_t len, value_type& s, It& p) {
    s.clear();
    if (len) {
      s.append(p.get_pos_add(len), len);
    }
  }
  // decode `len` raw bytes (no length prefix) from a segmented buffer
  static void decode_nohead(size_t len, value_type& s,
                            ceph::buffer::list::const_iterator& p) {
    if (len) {
      if constexpr (std::is_same_v<value_type, std::string>) {
	// bufferlist knows how to copy straight into a std::string
        s.clear();
        p.copy(len, s);
      } else {
	// custom allocator: size the string, then copy into its buffer
        s.resize(len);
        p.copy(len, s.data());
      }
    } else {
      s.clear();
    }
  }
  // encode the raw bytes only (caller has already emitted the length)
  template<class It>
  requires (!is_const_iterator<It>)
  static void
  encode_nohead(const value_type& s, It& p) {
    auto len = s.length();
    maybe_inline_memcpy(p.get_pos_add(len), s.data(), len, 16);
  }
};
+
+//
+// ceph::buffer::ptr
+//
// denc_traits for ceph::buffer::ptr: a 32-bit length prefix followed by
// the raw bytes.  Contiguous decode aliases the source buffer instead
// of copying.
template<>
struct denc_traits<ceph::buffer::ptr> {
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = false;
  static constexpr bool need_contiguous = false;
  static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) {
    p += sizeof(uint32_t) + v.length();
  }
  template <class It>
  requires (!is_const_iterator<It>)
  static void
  encode(const ceph::buffer::ptr& v, It& p, uint64_t f=0) {
    denc((uint32_t)v.length(), p);
    p.append(v);
  }
  template <is_const_iterator It>
  static void
  decode(ceph::buffer::ptr& v, It& p, uint64_t f=0) {
    uint32_t len;
    denc(len, p);
    v = p.get_ptr(len);  // zero-copy: share the underlying buffer
  }
  // segmented-buffer decode: copy out, collapsing to one buffer if needed.
  // NOTE(review): when len == 0 this leaves `v` untouched, unlike the
  // contiguous overload above which always assigns -- presumably callers
  // decode into a fresh ptr; verify before reusing a ptr across decodes.
  static void decode(ceph::buffer::ptr& v, ceph::buffer::list::const_iterator& p) {
    uint32_t len;
    denc(len, p);
    ceph::buffer::list s;
    p.copy(len, s);
    if (len) {
      if (s.get_num_buffers() == 1)
	v = s.front();
      else
	v = ceph::buffer::copy(s.c_str(), s.length());
    }
  }
};
+
+//
+// ceph::buffer::list
+//
// denc_traits for ceph::buffer::list: a 32-bit length prefix followed
// by the raw bytes.  The _nohead variants omit the length prefix for
// callers that encode it themselves.
template<>
struct denc_traits<ceph::buffer::list> {
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = false;
  static constexpr bool need_contiguous = false;
  static void bound_encode(const ceph::buffer::list& v, size_t& p, uint64_t f=0) {
    p += sizeof(uint32_t) + v.length();
  }
  static void encode(const ceph::buffer::list& v, ceph::buffer::list::contiguous_appender& p,
	      uint64_t f=0) {
    denc((uint32_t)v.length(), p);
    p.append(v);
  }
  // contiguous decode: alias the source buffer (zero-copy)
  static void decode(ceph::buffer::list& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) {
    uint32_t len = 0;
    denc(len, p);
    v.clear();
    v.push_back(p.get_ptr(len));
  }
  // segmented-buffer decode: copy the bytes across
  static void decode(ceph::buffer::list& v, ceph::buffer::list::const_iterator& p) {
    uint32_t len;
    denc(len, p);
    v.clear();
    p.copy(len, v);
  }
  static void encode_nohead(const ceph::buffer::list& v,
			    ceph::buffer::list::contiguous_appender& p) {
    p.append(v);
  }
  static void decode_nohead(size_t len, ceph::buffer::list& v,
			    ceph::buffer::ptr::const_iterator& p) {
    v.clear();
    if (len) {
      v.append(p.get_ptr(len));
    }
  }
  static void decode_nohead(size_t len, ceph::buffer::list& v,
			    ceph::buffer::list::const_iterator& p) {
    v.clear();
    p.copy(len, v);
  }
};
+
+//
+// std::pair<A, B>
+//
+template<typename A, typename B>
+struct denc_traits<
+ std::pair<A, B>,
+ std::enable_if_t<denc_supported<std::remove_const_t<A>> && denc_supported<B>>> {
+ typedef denc_traits<A> a_traits;
+ typedef denc_traits<B> b_traits;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = a_traits::featured || b_traits::featured ;
+ static constexpr bool bounded = a_traits::bounded && b_traits::bounded;
+ static constexpr bool need_contiguous = (a_traits::need_contiguous ||
+ b_traits::need_contiguous);
+
+ static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) {
+ if constexpr (featured) {
+ denc(v.first, p, f);
+ denc(v.second, p, f);
+ } else {
+ denc(v.first, p);
+ denc(v.second, p);
+ }
+ }
+
+ static void encode(const std::pair<A,B>& v, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ if constexpr (featured) {
+ denc(v.first, p, f);
+ denc(v.second, p, f);
+ } else {
+ denc(v.first, p);
+ denc(v.second, p);
+ }
+ }
+
+ static void decode(std::pair<A,B>& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) {
+ denc(const_cast<std::remove_const_t<A>&>(v.first), p, f);
+ denc(v.second, p, f);
+ }
+ template<typename AA=A>
+ static std::enable_if_t<!!sizeof(AA) && !need_contiguous>
+ decode(std::pair<A,B>& v, ceph::buffer::list::const_iterator& p,
+ uint64_t f = 0) {
+ denc(const_cast<std::remove_const_t<AA>&>(v.first), p);
+ denc(v.second, p);
+ }
+};
+
+namespace _denc {
+ template<template<class...> class C, typename Details, typename ...Ts>
+ struct container_base {
+ private:
+ using container = C<Ts...>;
+ using T = typename Details::T;
+
+ public:
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ template<typename U=T>
+ static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+ p += sizeof(uint32_t);
+ if constexpr (traits::bounded) {
+#if _GLIBCXX_USE_CXX11_ABI
+      // intentionally not calling container's empty() method to not prohibit
+ // compiler from optimizing the check if it and the ::size() operate on
+ // different memory (observed when std::list::empty() works on pointers,
+ // not the size field).
+ if (const auto elem_num = s.size(); elem_num > 0) {
+#else
+ if (!s.empty()) {
+ const auto elem_num = s.size();
+#endif
+ // STL containers use weird element types like std::pair<const K, V>;
+ // cast to something we have denc_traits for.
+ size_t elem_size = 0;
+ if constexpr (traits::featured) {
+ denc(static_cast<const T&>(*s.begin()), elem_size, f);
+ } else {
+ denc(static_cast<const T&>(*s.begin()), elem_size);
+ }
+ p += elem_size * elem_num;
+ }
+ } else {
+ for (const T& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ }
+
+ template<typename U=T>
+ static void encode(const container& s,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((uint32_t)s.size(), p);
+ if constexpr (traits::featured) {
+ encode_nohead(s, p, f);
+ } else {
+ encode_nohead(s, p);
+ }
+ }
+ static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ uint32_t num;
+ denc(num, p);
+ decode_nohead(num, s, p, f);
+ }
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(container& s, ceph::buffer::list::const_iterator& p) {
+ uint32_t num;
+ denc(num, p);
+ decode_nohead(num, s, p);
+ }
+
+ // nohead
+ static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ for (const T& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ static void decode_nohead(size_t num, container& s,
+ ceph::buffer::ptr::const_iterator& p,
+ uint64_t f=0) {
+ s.clear();
+ Details::reserve(s, num);
+ while (num--) {
+ T t;
+ denc(t, p, f);
+ Details::insert(s, std::move(t));
+ }
+ }
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode_nohead(size_t num, container& s,
+ ceph::buffer::list::const_iterator& p) {
+ s.clear();
+ Details::reserve(s, num);
+ while (num--) {
+ T t;
+ denc(t, p);
+ Details::insert(s, std::move(t));
+ }
+ }
+ };
+
+ template<typename T>
+ class container_has_reserve {
+ template<typename U, U> struct SFINAE_match;
+ template<typename U>
+ static std::true_type test(SFINAE_match<T(*)(typename T::size_type),
+ &U::reserve>*);
+
+ template<typename U>
+ static std::false_type test(...);
+
+ public:
+ static constexpr bool value = decltype(
+ test<denc_traits<T>>(0))::value;
+ };
+ template<typename T>
+ inline constexpr bool container_has_reserve_v =
+ container_has_reserve<T>::value;
+
+
+ template<typename Container>
+ struct container_details_base {
+ using T = typename Container::value_type;
+ static void reserve(Container& c, size_t s) {
+ if constexpr (container_has_reserve_v<Container>) {
+ c.reserve(s);
+ }
+ }
+ };
+
+ template<typename Container>
+ struct pushback_details : public container_details_base<Container> {
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_back(std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::list<T, Ts...>,
+ typename std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::list,
+ _denc::pushback_details<std::list<T, Ts...>>,
+ T, Ts...> {};
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::vector<T, Ts...>,
+ typename std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::vector,
+ _denc::pushback_details<std::vector<T, Ts...>>,
+ T, Ts...> {};
+
+template<typename T, std::size_t N, typename ...Ts>
+struct denc_traits<
+ boost::container::small_vector<T, N, Ts...>,
+ typename std::enable_if_t<denc_traits<T>::supported>> {
+private:
+ using container = boost::container::small_vector<T, N, Ts...>;
+public:
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ template<typename U=T>
+ static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+ p += sizeof(uint32_t);
+ if constexpr (traits::bounded) {
+ if (!s.empty()) {
+ const auto elem_num = s.size();
+ size_t elem_size = 0;
+ if constexpr (traits::featured) {
+ denc(*s.begin(), elem_size, f);
+ } else {
+ denc(*s.begin(), elem_size);
+ }
+ p += elem_size * elem_num;
+ }
+ } else {
+ for (const T& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ }
+
+ template<typename U=T>
+ static void encode(const container& s,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((uint32_t)s.size(), p);
+ if constexpr (traits::featured) {
+ encode_nohead(s, p, f);
+ } else {
+ encode_nohead(s, p);
+ }
+ }
+ static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ uint32_t num;
+ denc(num, p);
+ decode_nohead(num, s, p, f);
+ }
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(container& s, ceph::buffer::list::const_iterator& p) {
+ uint32_t num;
+ denc(num, p);
+ decode_nohead(num, s, p);
+ }
+
+ // nohead
+ static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ for (const T& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ static void decode_nohead(size_t num, container& s,
+ ceph::buffer::ptr::const_iterator& p,
+ uint64_t f=0) {
+ s.clear();
+ s.reserve(num);
+ while (num--) {
+ T t;
+ denc(t, p, f);
+ s.push_back(std::move(t));
+ }
+ }
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode_nohead(size_t num, container& s,
+ ceph::buffer::list::const_iterator& p) {
+ s.clear();
+ s.reserve(num);
+ while (num--) {
+ T t;
+ denc(t, p);
+ s.push_back(std::move(t));
+ }
+ }
+};
+
+namespace _denc {
+ template<typename Container>
+ struct setlike_details : public container_details_base<Container> {
+ using T = typename Container::value_type;
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ std::set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<std::set,
+ _denc::setlike_details<std::set<T, Ts...>>,
+ T, Ts...> {};
+
+template<typename T, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_set<T, Ts...>,
+ std::enable_if_t<denc_traits<T>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_set,
+ _denc::setlike_details<boost::container::flat_set<T, Ts...>>,
+ T, Ts...> {};
+
+namespace _denc {
+ template<typename Container>
+ struct maplike_details : public container_details_base<Container> {
+ using T = typename Container::value_type;
+ template<typename ...Args>
+ static void insert(Container& c, Args&& ...args) {
+ c.emplace_hint(c.cend(), std::forward<Args>(args)...);
+ }
+ };
+}
+
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ std::map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<std::map,
+ _denc::maplike_details<std::map<A, B, Ts...>>,
+ A, B, Ts...> {};
+
+template<typename A, typename B, typename ...Ts>
+struct denc_traits<
+ boost::container::flat_map<A, B, Ts...>,
+ std::enable_if_t<denc_traits<A>::supported &&
+ denc_traits<B>::supported>>
+ : public _denc::container_base<
+ boost::container::flat_map,
+ _denc::maplike_details<boost::container::flat_map<
+ A, B, Ts...>>,
+ A, B, Ts...> {};
+
+template<typename T, size_t N>
+struct denc_traits<
+ std::array<T, N>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+private:
+ using container = std::array<T, N>;
+public:
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = traits::bounded;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ static void bound_encode(const container& s, size_t& p, uint64_t f = 0) {
+ if constexpr (traits::bounded) {
+ if constexpr (traits::featured) {
+ if (!s.empty()) {
+ size_t elem_size = 0;
+ denc(*s.begin(), elem_size, f);
+ p += elem_size * s.size();
+ }
+ } else {
+ size_t elem_size = 0;
+ denc(*s.begin(), elem_size);
+ p += elem_size * N;
+ }
+ } else {
+ for (const auto& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ }
+
+ static void encode(const container& s, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ for (const auto& e : s) {
+ if constexpr (traits::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ }
+ }
+ static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ for (auto& e : s)
+ denc(e, p, f);
+ }
+ template<typename U=T>
+ static std::enable_if_t<!!sizeof(U) &&
+ !need_contiguous>
+ decode(container& s, ceph::buffer::list::const_iterator& p) {
+ for (auto& e : s) {
+ denc(e, p);
+ }
+ }
+};
+
+template<typename... Ts>
+struct denc_traits<
+ std::tuple<Ts...>,
+ std::enable_if_t<(denc_traits<Ts>::supported && ...)>> {
+
+private:
+ static_assert(sizeof...(Ts) > 0,
+ "Zero-length tuples are not supported.");
+ using container = std::tuple<Ts...>;
+
+public:
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = (denc_traits<Ts>::featured || ...);
+ static constexpr bool bounded = (denc_traits<Ts>::bounded && ...);
+ static constexpr bool need_contiguous =
+ (denc_traits<Ts>::need_contiguous || ...);
+
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p, uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ bound_encode(const container& s, size_t& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ template<typename U = container>
+ static std::enable_if_t<denc_traits<U>::featured>
+ encode(const container& s, ceph::buffer::list::contiguous_appender& p,
+ uint64_t f) {
+ ceph::for_each(s, [&p, f] (const auto& e) {
+ if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) {
+ denc(e, p, f);
+ } else {
+ denc(e, p);
+ }
+ });
+ }
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::featured>
+ encode(const container& s, ceph::buffer::list::contiguous_appender& p) {
+ ceph::for_each(s, [&p] (const auto& e) {
+ denc(e, p);
+ });
+ }
+
+ static void decode(container& s, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+
+ template<typename U = container>
+ static std::enable_if_t<!denc_traits<U>::need_contiguous>
+ decode(container& s, ceph::buffer::list::const_iterator& p, uint64_t f = 0) {
+ ceph::for_each(s, [&p] (auto& e) {
+ denc(e, p);
+ });
+ }
+};
+
+//
+// boost::optional<T>
+//
+template<typename T>
+struct denc_traits<
+ boost::optional<T>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ static void bound_encode(const boost::optional<T>& v, size_t& p,
+ uint64_t f = 0) {
+ p += sizeof(bool);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void encode(const boost::optional<T>& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((bool)v, p);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode(boost::optional<T>& v, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ bool x;
+ denc(x, p, f);
+ if (x) {
+ v = T{};
+ denc(*v, p, f);
+ } else {
+ v = boost::none;
+ }
+ }
+
+ template<typename U = T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(boost::optional<T>& v, ceph::buffer::list::const_iterator& p) {
+ bool x;
+ denc(x, p);
+ if (x) {
+ v = T{};
+ denc(*v, p);
+ } else {
+ v = boost::none;
+ }
+ }
+
+ template<typename U = T>
+ static void encode_nohead(const boost::optional<T>& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode_nohead(bool num, boost::optional<T>& v,
+ ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ if (num) {
+ v = T();
+ denc(*v, p, f);
+ } else {
+ v = boost::none;
+ }
+ }
+};
+
+template<>
+struct denc_traits<boost::none_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ static void bound_encode(const boost::none_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ static void encode(const boost::none_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+//
+// std::optional<T>
+//
+template<typename T>
+struct denc_traits<
+ std::optional<T>,
+ std::enable_if_t<denc_traits<T>::supported>> {
+ using traits = denc_traits<T>;
+
+ static constexpr bool supported = true;
+ static constexpr bool featured = traits::featured;
+ static constexpr bool bounded = false;
+ static constexpr bool need_contiguous = traits::need_contiguous;
+
+ static void bound_encode(const std::optional<T>& v, size_t& p,
+ uint64_t f = 0) {
+ p += sizeof(bool);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void encode(const std::optional<T>& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ denc((bool)v, p);
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode(std::optional<T>& v, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f = 0) {
+ bool x;
+ denc(x, p, f);
+ if (x) {
+ v = T{};
+ denc(*v, p, f);
+ } else {
+ v = std::nullopt;
+ }
+ }
+
+ template<typename U = T>
+ static std::enable_if_t<!!sizeof(U) && !need_contiguous>
+ decode(std::optional<T>& v, ceph::buffer::list::const_iterator& p) {
+ bool x;
+ denc(x, p);
+ if (x) {
+ v = T{};
+ denc(*v, p);
+ } else {
+ v = std::nullopt;
+ }
+ }
+
+ static void encode_nohead(const std::optional<T>& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f = 0) {
+ if (v) {
+ if constexpr (featured) {
+ denc(*v, p, f);
+ } else {
+ denc(*v, p);
+ }
+ }
+ }
+
+ static void decode_nohead(bool num, std::optional<T>& v,
+ ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) {
+ if (num) {
+ v = T();
+ denc(*v, p, f);
+ } else {
+ v = std::nullopt;
+ }
+ }
+};
+
+template<>
+struct denc_traits<std::nullopt_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+
+ static void bound_encode(const std::nullopt_t& v, size_t& p) {
+ p += sizeof(bool);
+ }
+
+ static void encode(const std::nullopt_t& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc(false, p);
+ }
+};
+
+// ----------------------------------------------------------------------
+// class helpers
+
+// Write denc_traits<> for a class that defines bound_encode/encode/decode
+// methods.
+
+#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false)
+#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true)
+#define _DECLARE_CLASS_DENC(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = false; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f=0) { \
+ v.bound_encode(p); \
+ } \
+ static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \
+ uint64_t f=0) { \
+ v.encode(p); \
+ } \
+ static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p); \
+ } \
+ };
+
+#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false)
+#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true)
+#define _DECLARE_CLASS_DENC_FEATURED(T, b) \
+ template<> struct denc_traits<T> { \
+ static constexpr bool supported = true; \
+ static constexpr bool featured = true; \
+ static constexpr bool bounded = b; \
+ static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\
+ static void bound_encode(const T& v, size_t& p, uint64_t f) { \
+ v.bound_encode(p, f); \
+ } \
+ static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \
+ uint64_t f) { \
+ v.encode(p, f); \
+ } \
+ static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ v.decode(p, f); \
+ } \
+ };
+
+// ----------------------------------------------------------------------
+// encoded_sizeof_wrapper
+
+namespace ceph {
+
+template <typename T, typename traits=denc_traits<T>>
+constexpr std::enable_if_t<traits::supported && traits::bounded, size_t>
+encoded_sizeof_bounded() {
+ size_t p = 0;
+ traits::bound_encode(T(), p);
+ return p;
+}
+
+template <typename T, typename traits=denc_traits<T>>
+std::enable_if_t<traits::supported, size_t>
+encoded_sizeof(const T &t) {
+ size_t p = 0;
+ traits::bound_encode(t, p);
+ return p;
+}
+
+} // namespace ceph
+
+
+// ----------------------------------------------------------------------
+// encode/decode wrappers
+
+// These glue the new-style denc world into old-style calls to encode
+// and decode by calling into denc_traits<> methods (when present).
+
+namespace ceph {
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> encode(
+ const T& o,
+ ceph::buffer::list& bl,
+ uint64_t features_unused=0)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode(o, a);
+}
+
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::featured> encode(
+ const T& o, ::ceph::buffer::list& bl,
+ uint64_t features)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len, features);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode(o, a, features);
+}
+
+template<typename T,
+ typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode(
+ T& o,
+ ::ceph::buffer::list::const_iterator& p)
+{
+ if (p.end())
+ throw ::ceph::buffer::end_of_buffer();
+ const auto& bl = p.get_bl();
+ const auto remaining = bl.length() - p.get_off();
+  // it is expensive to rebuild a contiguous buffer and drop it, so avoid this.
+ if (!p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE) {
+ traits::decode(o, p);
+ } else {
+    // ensure we get a contiguous buffer... until the end of the
+ // ceph::buffer::list. we don't really know how much we'll need here,
+ // unfortunately. hopefully it is already contiguous and we're just
+ // bumping the raw ref and initializing the ptr tmp fields.
+ ceph::buffer::ptr tmp;
+ auto t = p;
+ t.copy_shallow(remaining, tmp);
+ auto cp = std::cbegin(tmp);
+ traits::decode(o, cp);
+ p += cp.get_offset();
+ }
+}
+
+template<typename T,
+ typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && traits::need_contiguous> decode(
+ T& o,
+ ceph::buffer::list::const_iterator& p)
+{
+ if (p.end())
+ throw ceph::buffer::end_of_buffer();
+  // ensure we get a contiguous buffer... until the end of the
+ // ceph::buffer::list. we don't really know how much we'll need here,
+ // unfortunately. hopefully it is already contiguous and we're just
+ // bumping the raw ref and initializing the ptr tmp fields.
+ ceph::buffer::ptr tmp;
+ auto t = p;
+ t.copy_shallow(p.get_bl().length() - p.get_off(), tmp);
+ auto cp = std::cbegin(tmp);
+ traits::decode(o, cp);
+ p += cp.get_offset();
+}
+
+// nohead variants
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported &&
+ !traits::featured> encode_nohead(
+ const T& o,
+ ceph::buffer::list& bl)
+{
+ size_t len = 0;
+ traits::bound_encode(o, len);
+ auto a = bl.get_contiguous_appender(len);
+ traits::encode_nohead(o, a);
+}
+
+template<typename T, typename traits=denc_traits<T>>
+inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead(
+ size_t num,
+ T& o,
+ ceph::buffer::list::const_iterator& p)
+{
+ if (!num)
+ return;
+ if (p.end())
+ throw ceph::buffer::end_of_buffer();
+ if constexpr (traits::need_contiguous) {
+ ceph::buffer::ptr tmp;
+ auto t = p;
+ if constexpr (denc_traits<typename T::value_type>::bounded) {
+ size_t element_size = 0;
+ typename T::value_type v;
+ denc_traits<typename T::value_type>::bound_encode(v, element_size);
+ t.copy_shallow(num * element_size, tmp);
+ } else {
+ t.copy_shallow(p.get_bl().length() - p.get_off(), tmp);
+ }
+ auto cp = std::cbegin(tmp);
+ traits::decode_nohead(num, o, cp);
+ p += cp.get_offset();
+ } else {
+ traits::decode_nohead(num, o, p);
+ }
+}
+}
+
+
+// ----------------------------------------------------------------
+// DENC
+
+// These are some class methods we need to do the version and length
+// wrappers for DENC_{START,FINISH} for inter-version
+// interoperability.
+
+#define DENC_HELPERS \
+ /* bound_encode */ \
+ static void _denc_start(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { \
+ p += 2 + 4; \
+ } \
+ static void _denc_finish(size_t& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **, uint32_t *) { } \
+ /* encode */ \
+ static void _denc_start(::ceph::buffer::list::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ *len_pos = p.get_pos_add(4); \
+ *start_oob_off = p.get_out_of_band_offset(); \
+ } \
+ static void _denc_finish(::ceph::buffer::list::contiguous_appender& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **len_pos, \
+ uint32_t *start_oob_off) { \
+ *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) + \
+ p.get_out_of_band_offset() - *start_oob_off; \
+ } \
+ /* decode */ \
+ static void _denc_start(::ceph::buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, \
+ __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ denc(*struct_v, p); \
+ denc(*struct_compat, p); \
+ denc(*struct_len, p); \
+ *start_pos = const_cast<char*>(p.get_pos()); \
+ } \
+ static void _denc_finish(::ceph::buffer::ptr::const_iterator& p, \
+ __u8 *struct_v, __u8 *struct_compat, \
+ char **start_pos, \
+ uint32_t *struct_len) { \
+ const char *pos = p.get_pos(); \
+ char *end = *start_pos + *struct_len; \
+ if (pos > end) { \
+ throw ::ceph::buffer::malformed_input(__PRETTY_FUNCTION__); \
+ } \
+ if (pos < end) { \
+ p += end - pos; \
+ } \
+ }
+
+// Helpers for versioning the encoding. These correspond to the
+// {ENCODE,DECODE}_{START,FINISH} macros.
+
+#define DENC_START(v, compat, p) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ char *_denc_pchar; \
+ uint32_t _denc_u32; \
+ _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \
+ do {
+
+#define DENC_FINISH(p) \
+ } while (false); \
+ _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32);
+
+
+// ----------------------------------------------------------------------
+
+// Helpers for writing a unified bound_encode/encode/decode
+// implementation that won't screw up buffer size estimations.
+
+#define DENC(Type, v, p) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p) const { \
+ _denc_friend(*this, p); \
+ } \
+ void encode(::ceph::buffer::list::contiguous_appender& p) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p); \
+ } \
+ void decode(::ceph::buffer::ptr::const_iterator& p) { \
+ _denc_friend(*this, p); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p)
+
+#define DENC_FEATURED(Type, v, p, f) \
+ DENC_HELPERS \
+ void bound_encode(size_t& p, uint64_t f) const { \
+ _denc_friend(*this, p, f); \
+ } \
+ void encode(::ceph::buffer::list::contiguous_appender& p, uint64_t f) const { \
+ DENC_DUMP_PRE(Type); \
+ _denc_friend(*this, p, f); \
+ } \
+ void decode(::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \
+ _denc_friend(*this, p, f); \
+ } \
+ template<typename T, typename P> \
+ friend std::enable_if_t<std::is_same_v<T, Type> || \
+ std::is_same_v<T, const Type>> \
+ _denc_friend(T& v, P& p, uint64_t f)
+
+#endif
diff --git a/src/include/dlfcn_compat.h b/src/include/dlfcn_compat.h
new file mode 100644
index 000000000..95fd64e51
--- /dev/null
+++ b/src/include/dlfcn_compat.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef DLFCN_COMPAT_H
+#define DLFCN_COMPAT_H
+
+#include "acconfig.h"
+
+#define SHARED_LIB_SUFFIX CMAKE_SHARED_LIBRARY_SUFFIX
+
+#ifdef _WIN32
+ #include <string>
+
+ using dl_errmsg_t = std::string;
+
+ // The load mode flags will be ignored on Windows. We keep the same
+ // values for debugging purposes though.
+ #define RTLD_LAZY 0x00001
+ #define RTLD_NOW 0x00002
+ #define RTLD_BINDING_MASK 0x3
+ #define RTLD_NOLOAD 0x00004
+ #define RTLD_DEEPBIND 0x00008
+ #define RTLD_GLOBAL 0x00100
+ #define RTLD_LOCAL 0
+ #define RTLD_NODELETE 0x01000
+
+ void* dlopen(const char *filename, int flags);
+ int dlclose(void* handle);
+ dl_errmsg_t dlerror();
+ void* dlsym(void* handle, const char* symbol);
+#else
+ #include <dlfcn.h>
+
+ using dl_errmsg_t = char*;
+#endif /* _WIN32 */
+
+#endif /* DLFCN_COMPAT_H */
diff --git a/src/include/elist.h b/src/include/elist.h
new file mode 100644
index 000000000..38be35dbf
--- /dev/null
+++ b/src/include/elist.h
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ELIST_H
+#define CEPH_ELIST_H
+
+/*
+ * elist: embedded list.
+ *
+ * requirements:
+ * - elist<T>::item be embedded in the parent class
+ * - items are _always_ added to the list via the same elist<T>::item at the same
+ * fixed offset in the class.
+ * - begin(), front(), back() methods take the member offset as an argument for traversal.
+ *
+ */
+
+#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1)
+
+template<typename T>
+class elist {
+public:
+ struct item {
+ item *_prev, *_next;
+
+ item(T i=0) : _prev(this), _next(this) {}
+ ~item() {
+ ceph_assert(!is_on_list());
+ }
+
+ item(const item& other) = delete;
+ const item& operator= (const item& right) = delete;
+
+
+ bool empty() const { return _prev == this; }
+ bool is_on_list() const { return !empty(); }
+
+ bool remove_myself() {
+ if (_next == this) {
+ ceph_assert(_prev == this);
+ return false;
+ }
+ _next->_prev = _prev;
+ _prev->_next = _next;
+ _prev = _next = this;
+ return true;
+ }
+
+ void insert_after(item *other) {
+ ceph_assert(other->empty());
+ other->_prev = this;
+ other->_next = _next;
+ _next->_prev = other;
+ _next = other;
+ }
+ void insert_before(item *other) {
+ ceph_assert(other->empty());
+ other->_next = this;
+ other->_prev = _prev;
+ _prev->_next = other;
+ _prev = other;
+ }
+
+ T get_item(size_t offset) {
+ ceph_assert(offset);
+ return (T)(((char *)this) - offset);
+ }
+ };
+
+private:
+ item _head;
+ size_t item_offset;
+
+public:
+ elist(const elist& other);
+ const elist& operator=(const elist& other);
+
+ elist(size_t o) : _head(NULL), item_offset(o) {}
+ ~elist() {
+ ceph_assert(_head.empty());
+ }
+
+ bool empty() const {
+ return _head.empty();
+ }
+
+ void clear() {
+ while (!_head.empty())
+ pop_front();
+ }
+
+ void push_front(item *i) {
+ if (!i->empty())
+ i->remove_myself();
+ _head.insert_after(i);
+ }
+ void push_back(item *i) {
+ if (!i->empty())
+ i->remove_myself();
+ _head.insert_before(i);
+ }
+
+ T front(size_t o=0) {
+ ceph_assert(!_head.empty());
+ return _head._next->get_item(o ? o : item_offset);
+ }
+ T back(size_t o=0) {
+ ceph_assert(!_head.empty());
+ return _head._prev->get_item(o ? o : item_offset);
+ }
+
+ void pop_front() {
+ ceph_assert(!empty());
+ _head._next->remove_myself();
+ }
+ void pop_back() {
+ ceph_assert(!empty());
+ _head._prev->remove_myself();
+ }
+
+ void clear_list() {
+ while (!empty())
+ pop_front();
+ }
+
+ enum mode_t {
+ MAGIC, CURRENT, CACHE_NEXT
+ };
+
+ class iterator {
+ private:
+ item *head;
+ item *cur, *next;
+ size_t item_offset;
+ mode_t mode;
+ public:
+ iterator(item *h, size_t o, mode_t m) :
+ head(h), cur(h->_next), next(cur->_next), item_offset(o),
+ mode(m) {
+ ceph_assert(item_offset > 0);
+ }
+ T operator*() {
+ return cur->get_item(item_offset);
+ }
+ iterator& operator++() {
+ ceph_assert(cur);
+ ceph_assert(cur != head);
+ if (mode == MAGIC) {
+ // if 'cur' appears to be valid, use that. otherwise,
+ // use cached 'next'.
+ // this is a bit magic, and probably a bad idea... :/
+ if (cur->empty())
+ cur = next;
+ else
+ cur = cur->_next;
+ } else if (mode == CURRENT)
+ cur = cur->_next;
+ else if (mode == CACHE_NEXT)
+ cur = next;
+ else
+ ceph_abort();
+ next = cur->_next;
+ return *this;
+ }
+ bool end() const {
+ return cur == head;
+ }
+ };
+
+ iterator begin(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, MAGIC);
+ }
+ iterator begin_use_current(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, CURRENT);
+ }
+ iterator begin_cache_next(size_t o=0) {
+ return iterator(&_head, o ? o : item_offset, CACHE_NEXT);
+ }
+};
+
+
+#endif
diff --git a/src/include/encoding.h b/src/include/encoding.h
new file mode 100644
index 000000000..40ba9d39c
--- /dev/null
+++ b/src/include/encoding.h
@@ -0,0 +1,1548 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_ENCODING_H
+#define CEPH_ENCODING_H
+
+#include <set>
+#include <map>
+#include <deque>
+#include <vector>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <optional>
+#include <boost/container/small_vector.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+#include "common/ceph_time.h"
+
+#include "include/int_types.h"
+
+#include "common/convenience.h"
+
+#include "byteorder.h"
+#include "buffer.h"
+
+// pull in the new-style encoding so that we get the denc_traits<> definition.
+#include "denc.h"
+
+#include "assert.h"
+
+using namespace ceph;
+
+namespace ceph {
+
+/*
+ * Notes on feature encoding:
+ *
+ * - The default encode() methods have a features argument with a default parameter
+ * (which goes to zero).
+ * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default.
+ * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which
+ * does not define the default. Any caller must explicitly pass it in.
+ * - STL container macros have two encode variants: one with a features arg, and one
+ * without.
+ *
+ * The result:
+ * - A feature encode() method will fail to compile if a value is not
+ * passed in.
+ * - The feature variant of the STL templates will be used when the feature arg is
+ * provided. It will be passed through to any template arg types, but it will be
+ * ignored when not needed.
+ */
+
+// --------------------------------------
+// base types
+
+/// Append the raw in-memory bytes of t to bl (no endian conversion).
+template<class T>
+inline void encode_raw(const T& t, bufferlist& bl)
+{
+  bl.append((char*)&t, sizeof(t));
+}
+/// Copy sizeof(t) raw bytes from p into t (no endian conversion).
+template<class T>
+inline void decode_raw(T& t, bufferlist::const_iterator &p)
+{
+  p.copy(sizeof(t), (char*)&t);
+}
+
+// WRITE_RAW_ENCODER defines encode()/decode() overloads that copy the
+// type's raw bytes (no endian conversion).  Only used for single-byte
+// types and the already-little-endian ceph_le* wrappers below.
+#define WRITE_RAW_ENCODER(type) \
+  inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); }
+
+WRITE_RAW_ENCODER(__u8)
+#ifndef _CHAR_IS_SIGNED
+WRITE_RAW_ENCODER(__s8)
+#endif
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(ceph_le64)
+WRITE_RAW_ENCODER(ceph_le32)
+WRITE_RAW_ENCODER(ceph_le16)
+
+// bool travels as a single byte (0 or 1) for a stable wire size.
+inline void encode(const bool &v, bufferlist& bl) {
+  __u8 vv = v;
+  encode_raw(vv, bl);
+}
+inline void decode(bool &v, bufferlist::const_iterator& p) {
+  __u8 vv;
+  decode_raw(vv, p);
+  v = vv;
+}
+
+
+// -----------------------------------
+// int types
+
+// WRITE_INTTYPE_ENCODER: encode/decode an integer by converting through
+// the matching little-endian ceph_le* wrapper, giving a fixed-width,
+// endian-stable wire format regardless of host byte order.
+#define WRITE_INTTYPE_ENCODER(type, etype) \
+  inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+    ceph_##etype e; \
+    e = v; \
+    ::ceph::encode_raw(e, bl); \
+  } \
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+    ceph_##etype e; \
+    ::ceph::decode_raw(e, p); \
+    v = e; \
+  }
+
+WRITE_INTTYPE_ENCODER(uint64_t, le64)
+WRITE_INTTYPE_ENCODER(int64_t, le64)
+WRITE_INTTYPE_ENCODER(uint32_t, le32)
+WRITE_INTTYPE_ENCODER(int32_t, le32)
+WRITE_INTTYPE_ENCODER(uint16_t, le16)
+WRITE_INTTYPE_ENCODER(int16_t, le16)
+
+// -----------------------------------
+// float types
+//
+// NOTE: The following code assumes all supported platforms use IEEE binary32
+// as float and IEEE binary64 as double floating-point format. The assumption
+// is verified by the assertions below.
+//
+// Under this assumption, we can use raw encoding of floating-point types
+// on little-endian machines, but we still need to perform a byte swap
+// on big-endian machines to ensure cross-architecture compatibility.
+// To achieve that, we reinterpret the values as integers first, which are
+// byte-swapped via the ceph_le types as above. The extra conversions
+// are optimized away on little-endian machines by the compiler.
+// WRITE_FLTTYPE_ENCODER: encode a float/double by reinterpreting its bits
+// as the same-width integer and writing that little-endian (IEEE754
+// layout verified by the static_asserts below).
+// NOTE(review): the reinterpret_cast type punning here relies on compiler
+// tolerance of strict-aliasing violations; memcpy/std::bit_cast would be
+// strictly conforming -- confirm before changing.
+#define WRITE_FLTTYPE_ENCODER(type, itype, etype) \
+  static_assert(sizeof(type) == sizeof(itype)); \
+  static_assert(std::numeric_limits<type>::is_iec559, \
+		"floating-point type not using IEEE754 format"); \
+  inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \
+    ceph_##etype e; \
+    e = *reinterpret_cast<itype *>(&v); \
+    ::ceph::encode_raw(e, bl); \
+  } \
+  inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \
+    ceph_##etype e; \
+    ::ceph::decode_raw(e, p); \
+    *reinterpret_cast<itype *>(&v) = e; \
+  }
+
+WRITE_FLTTYPE_ENCODER(float, uint32_t, le32)
+WRITE_FLTTYPE_ENCODER(double, uint64_t, le64)
+
+// see denc.h for ENCODE_DUMP_PATH discussion and definition.
+#ifdef ENCODE_DUMP_PATH
+// ENCODE_DUMP_PRE/POST bracket a class encode() call and dump the bytes it
+// appended to a file under ENCODE_DUMP_PATH.  Dumps are sampled: only when
+// the per-call-site counter has at most 2 set bits, i.e. roughly
+// logarithmically often, to bound the number of files written.
+# define ENCODE_DUMP_PRE()			\
+  unsigned pre_off = bl.length()
+# define ENCODE_DUMP_POST(cl)						\
+  do {									\
+    static int i = 0;							\
+    i++;								\
+    int bits = 0;							\
+    for (unsigned t = i; t; bits++)					\
+      t &= t - 1;							\
+    if (bits > 2)							\
+      break;								\
+    char fn[PATH_MAX];							\
+    snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \
+    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644); \
+    if (fd >= 0) {							\
+      ::ceph::bufferlist sub;						\
+      sub.substr_of(bl, pre_off, bl.length() - pre_off);		\
+      sub.write_fd(fd);							\
+      ::close(fd);							\
+    }									\
+  } while (0)
+#else
+// no-ops when dumping is disabled
+# define ENCODE_DUMP_PRE()
+# define ENCODE_DUMP_POST(cl)
+#endif
+
+
+// Generate free encode()/decode() functions that forward to a class's
+// member encode/decode.  Feature-less classes accept (and ignore) an
+// optional features argument so generic container code can pass one.
+#define WRITE_CLASS_ENCODER(cl)						\
+  inline void encode(const cl& c, ::ceph::buffer::list &bl, uint64_t features=0) { \
+    ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); }		\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+#define WRITE_CLASS_MEMBER_ENCODER(cl)					\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl) const {	\
+    ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); }		\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+// For classes whose wire format depends on peer features: no default, so
+// callers are forced to supply the feature bits explicitly.
+#define WRITE_CLASS_ENCODER_FEATURES(cl)				\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \
+    ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); }	\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+// Features are forwarded but default to 0 when the caller has none.
+#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl)				\
+  inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \
+    ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); }	\
+  inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); }
+
+
+// string
+// Wire format: u32 byte length followed by the raw bytes (no terminator).
+inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0)
+{
+  __u32 len = s.length();
+  encode(len, bl);
+  if (len)
+    bl.append(s.data(), len);
+}
+inline void encode(const std::string& s, bufferlist& bl, uint64_t features=0)
+{
+  return encode(std::string_view(s), bl, features);
+}
+/// Decode a length-prefixed string; replaces the previous contents of s.
+inline void decode(std::string& s, bufferlist::const_iterator& p)
+{
+  __u32 len;
+  decode(len, p);
+  s.clear();
+  p.copy(len, s);
+}
+
+// "nohead" variants omit the length prefix; the caller must convey the
+// length out of band and pass it to decode_nohead.
+inline void encode_nohead(std::string_view s, bufferlist& bl)
+{
+  bl.append(s.data(), s.length());
+}
+inline void encode_nohead(const std::string& s, bufferlist& bl)
+{
+  encode_nohead(std::string_view(s), bl);
+}
+inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// const char* (encode only, string compatible)
+// Encodes with the same u32-length-prefixed format as std::string, so the
+// result can be decoded into a std::string.  's' must be a non-null,
+// NUL-terminated string (strlen is applied unconditionally).
+inline void encode(const char *s, bufferlist& bl)
+{
+  encode(std::string_view(s, strlen(s)), bl);
+}
+
+// opaque byte vectors
+// Wire format: u32 byte count followed by the raw bytes.
+// Fix: take the vector by const reference -- encode() must not mutate its
+// argument, and every other encode overload in this header is const;
+// the non-const parameter also rejected const vectors at call sites.
+inline void encode(const std::vector<uint8_t>& v, bufferlist& bl)
+{
+  uint32_t len = v.size();
+  encode(len, bl);
+  if (len)
+    bl.append((const char *)v.data(), len);
+}
+
+/// Decode a length-prefixed byte vector; replaces the contents of v.
+inline void decode(std::vector<uint8_t>& v, bufferlist::const_iterator& p)
+{
+  uint32_t len;
+
+  decode(len, p);
+  v.resize(len);
+  p.copy(len, (char *)v.data());
+}
+
+// -----------------------------
+// buffers
+
+// bufferptr (encapsulated)
+// Wire format: u32 byte count followed by the buffer contents.
+inline void encode(const buffer::ptr& bp, bufferlist& bl)
+{
+  __u32 len = bp.length();
+  encode(len, bl);
+  if (len)
+    bl.append(bp);
+}
+// Decode into a single contiguous bufferptr.  If the copied range already
+// lives in one buffer it is shared; otherwise the bytes are flattened
+// into a fresh allocation.
+inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p)
+{
+  __u32 len;
+  decode(len, p);
+
+  bufferlist s;
+  p.copy(len, s);
+
+  if (len) {
+    if (s.get_num_buffers() == 1)
+      bp = s.front();
+    else
+      bp = buffer::copy(s.c_str(), s.length());
+  }
+}
+
+// bufferlist (encapsulated)
+// Wire format: u32 byte count followed by the bufferlist contents.
+inline void encode(const bufferlist& s, bufferlist& bl)
+{
+  __u32 len = s.length();
+  encode(len, bl);
+  bl.append(s);
+}
+// Same wire format, but the source's buffers are claimed (moved) into bl
+// rather than copied -- 's' is consumed, hence "destructively".
+inline void encode_destructively(bufferlist& s, bufferlist& bl)
+{
+  __u32 len = s.length();
+  encode(len, bl);
+  bl.claim_append(s);
+}
+inline void decode(bufferlist& s, bufferlist::const_iterator& p)
+{
+  __u32 len;
+  decode(len, p);
+  s.clear();
+  p.copy(len, s);
+}
+
+// "nohead" variants omit the length prefix; the caller supplies it.
+inline void encode_nohead(const bufferlist& s, bufferlist& bl)
+{
+  bl.append(s);
+}
+inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p)
+{
+  s.clear();
+  p.copy(len, s);
+}
+
+// Time, since the templates are defined in std::chrono
+
+// Wire format: u32 seconds then u32 nanoseconds (via Clock::to_timespec).
+// Seconds are truncated to 32 bits on the wire.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void encode(const std::chrono::time_point<Clock, Duration>& t,
+	    ceph::bufferlist &bl) {
+  auto ts = Clock::to_timespec(t);
+  // A 32 bit count of seconds causes me vast unhappiness.
+  uint32_t s = ts.tv_sec;
+  uint32_t ns = ts.tv_nsec;
+  encode(s, bl);
+  encode(ns, bl);
+}
+
+// Inverse of the above: rebuild a timespec and convert back through the
+// clock's from_timespec.
+template<typename Clock, typename Duration,
+         typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr>
+void decode(std::chrono::time_point<Clock, Duration>& t,
+	    bufferlist::const_iterator& p) {
+  uint32_t s;
+  uint32_t ns;
+  decode(s, p);
+  decode(ns, p);
+  struct timespec ts = {
+    static_cast<time_t>(s),
+    static_cast<long int>(ns)};
+
+  t = Clock::from_timespec(ts);
+}
+
+// Durations travel as i32 whole seconds plus i32 leftover nanoseconds.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void encode(const std::chrono::duration<Rep, Period>& d,
+	    ceph::bufferlist &bl) {
+  using namespace std::chrono;
+  int32_t s = duration_cast<seconds>(d).count();
+  int32_t ns = (duration_cast<nanoseconds>(d) % seconds(1)).count();
+  encode(s, bl);
+  encode(ns, bl);
+}
+
+// Inverse: sum the seconds and nanoseconds components back into d.
+template<typename Rep, typename Period,
+         typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr>
+void decode(std::chrono::duration<Rep, Period>& d,
+	    bufferlist::const_iterator& p) {
+  int32_t s;
+  int32_t ns;
+  decode(s, p);
+  decode(ns, p);
+  d = std::chrono::seconds(s) + std::chrono::nanoseconds(ns);
+}
+
+// -----------------------------
+// STL container types
+
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp);
+template<typename T>
+inline void encode(const std::optional<T> &p, bufferlist &bl);
+template<typename T>
+inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp);
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl);
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+encode(const std::pair<A,B> &p, bufferlist &bl);
+template<class A, class B,
+ typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>>
+inline std::enable_if_t<!a_traits::supported ||
+ !b_traits::supported>
+decode(std::pair<A,B> &pa, bufferlist::const_iterator &p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T, Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+ bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist& bl);
+template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+ bufferlist::iterator& p);
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl);
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl);
+template<class T, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p);
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl,
+ uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist::const_iterator& p);
+// small_vector
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl);
+template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p);
+// std::map
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported ||
+ !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl, uint64_t features);
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p);
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+ uint64_t features);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl);
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features);
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl);
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl);
+template<class T, size_t N, typename traits = denc_traits<T>>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p);
+
+// full bl decoder
+// Decode o from an entire bufferlist; asserts that every byte of bl was
+// consumed (i.e. the encoding fills the whole list exactly).
+template<class T>
+inline void decode(T &o, const bufferlist& bl)
+{
+  auto p = bl.begin();
+  decode(o, p);
+  ceph_assert(p.end());
+}
+
+// boost optional
+// Wire format: one presence byte (0/1), then the value iff present.
+template<typename T>
+inline void encode(const boost::optional<T> &p, bufferlist &bl)
+{
+  __u8 present = static_cast<bool>(p);
+  encode(present, bl);
+  if (p)
+    encode(p.get(), bl);
+}
+
+// Suppress a spurious -Wuninitialized warning some GCC versions emit for
+// the default-constructed T below; the outer -Wpragmas dance keeps older
+// compilers from complaining about the suppression itself.
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+template<typename T>
+inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 present;
+  decode(present, bp);
+  if (present) {
+    p = T{};
+    decode(p.get(), bp);
+  } else {
+    p = boost::none;
+  }
+}
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std optional
+// Wire format: one presence byte (0/1), then the value iff present
+// (identical to the boost::optional encoding above).
+template<typename T>
+inline void encode(const std::optional<T> &p, bufferlist &bl)
+{
+  __u8 present = static_cast<bool>(p);
+  encode(present, bl);
+  if (p)
+    encode(*p, bl);
+}
+
+// Suppress a spurious -Wuninitialized warning some GCC versions emit for
+// the default-constructed T below.
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+template<typename T>
+inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp)
+{
+  __u8 present;
+  decode(present, bp);
+  if (present) {
+    p = T{};
+    decode(*p, bp);
+  } else {
+    p = std::nullopt;
+  }
+}
+// Fix: restore diagnostics.  The push above had no matching pop (unlike
+// the boost::optional block), so the -Wuninitialized suppression leaked
+// into the remainder of every translation unit including this header.
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// std::tuple
+// Encode each element in declaration order via ceph::for_each.
+template<typename... Ts>
+inline void encode(const std::tuple<Ts...> &t, bufferlist& bl)
+{
+  ceph::for_each(t, [&bl](const auto& e) {
+      encode(e, bl);
+    });
+}
+// Decode each element in the same order as encode.
+template<typename... Ts>
+inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp)
+{
+  ceph::for_each(t, [&bp](auto& e) {
+      decode(e, bp);
+    });
+}
+
+//triple boost::tuple
+// Three-element boost::tuple: encode/decode each member in order.
+template<class A, class B, class C>
+inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl)
+{
+  encode(boost::get<0>(t), bl);
+  encode(boost::get<1>(t), bl);
+  encode(boost::get<2>(t), bl);
+}
+template<class A, class B, class C>
+inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp)
+{
+  decode(boost::get<0>(t), bp);
+  decode(boost::get<1>(t), bp);
+  decode(boost::get<2>(t), bp);
+}
+
+// std::pair<A,B>
+// Encode first then second, threading the feature bits through to both.
+// Enabled only when at least one member lacks new-style denc support.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features)
+{
+  encode(p.first, bl, features);
+  encode(p.second, bl, features);
+}
+// Featureless variant.
+template<class A, class B,
+         typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+  encode(const std::pair<A,B> &p, bufferlist &bl)
+{
+  encode(p.first, bl);
+  encode(p.second, bl);
+}
+// Decode first then second, mirroring the encode order.
+template<class A, class B, typename a_traits, typename b_traits>
+inline std::enable_if_t<!a_traits::supported ||
+			!b_traits::supported>
+  decode(std::pair<A,B> &pa, bufferlist::const_iterator &p)
+{
+  decode(pa.first, p);
+  decode(pa.second, p);
+}
+
+// std::list<T>
+// Wire format: u32 element count followed by each element.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T, Alloc>& ls, bufferlist& bl)
+{
+  __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
+  encode(n, bl);
+  for (auto p = ls.begin(); p != ls.end(); ++p)
+    encode(*p, bl);
+}
+// Features variant: rather than trusting size(), reserve a hole for the
+// count, encode the elements while counting them, then back-patch the
+// count into the hole.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+  using counter_encode_t = ceph_le32;
+  unsigned n = 0;
+  auto filler = bl.append_hole(sizeof(counter_encode_t));
+  for (const auto& item : ls) {
+    // we count on our own because of buggy std::list::size() implementation
+    // which doesn't follow the O(1) complexity constraint C++11 has brought.
+    ++n;
+    encode(item, bl, features);
+  }
+  counter_encode_t en;
+  en = n;
+  filler.copy_in(sizeof(en), reinterpret_cast<char*>(&en));
+}
+
+// Decode: clear the list, then decode each element in place at the back.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  ls.clear();
+  while (n--) {
+    ls.emplace_back();
+    decode(ls.back(), p);
+  }
+}
+
+// std::list<std::shared_ptr<T>>
+// Encodes the pointees, not the pointers.  Every entry is dereferenced
+// unconditionally, so null shared_ptrs are not supported here.
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl)
+{
+  __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
+  encode(n, bl);
+  for (const auto& ref : ls) {
+    encode(*ref, bl);
+  }
+}
+template<class T, class Alloc>
+inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist& bl, uint64_t features)
+{
+  __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
+  encode(n, bl);
+  for (const auto& ref : ls) {
+    encode(*ref, bl, features);
+  }
+}
+// Decode: each element becomes a freshly allocated, non-null shared_ptr.
+template<class T, class Alloc>
+inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
+		   bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  ls.clear();
+  while (n--) {
+    auto ref = std::make_shared<T>();
+    decode(*ref, p);
+    ls.emplace_back(std::move(ref));
+  }
+}
+
+// std::set<T>
+// Wire format: u32 element count followed by each element in set order.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  const __u32 count = s.size();
+  encode(count, bl);
+  for (const auto& elem : s)
+    encode(elem, bl);
+}
+// Decode: clear the target, then insert 'count' decoded elements.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  s.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T elem;
+    decode(elem, p);
+    s.insert(elem);
+  }
+}
+
+// Encode set elements only (no count header); counterpart of decode_nohead.
+template<class T, class Comp, class Alloc, typename traits>
+inline typename std::enable_if<!traits::supported>::type
+  encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  for (auto p = s.begin(); p != s.end(); ++p)
+    encode(*p, bl);
+}
+// Decode exactly 'len' elements into s (count supplied by the caller).
+// Note: does not clear s first, unlike decode().
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    s.insert(v);
+  }
+}
+
+// boost::container::flat_set<T>
+// Same wire format as std::set: u32 count followed by each element.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl)
+{
+  __u32 n = (__u32)(s.size());
+  encode(n, bl);
+  for (const auto& e : s)
+    encode(e, bl);
+}
+// Decode: clear, reserve (flat_set is vector-backed), then insert.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  s.clear();
+  s.reserve(n);
+  while (n--) {
+    T v;
+    decode(v, p);
+    s.insert(v);
+  }
+}
+
+// Encode elements only (no count header).
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist& bl)
+{
+  for (const auto& e : s)
+    encode(e, bl);
+}
+// Decode exactly 'len' elements; does not clear s first.
+// NOTE(review): takes bufferlist::iterator (non-const), unlike the other
+// decoders in this header -- confirm whether the asymmetry is intentional.
+template<class T, class Comp, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
+	      bufferlist::iterator& p)
+{
+  s.reserve(len);
+  for (int i=0; i<len; i++) {
+    T v;
+    decode(v, p);
+    s.insert(v);
+  }
+}
+
+// multiset
+// Wire format: u32 element count followed by each element in set order.
+template<class T, class Comp, class Alloc>
+inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
+{
+  const __u32 count = s.size();
+  encode(count, bl);
+  for (const auto& elem : s)
+    encode(elem, bl);
+}
+// Decode: clear the target, then insert each decoded element.
+template<class T, class Comp, class Alloc>
+inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
+{
+  __u32 count;
+  decode(count, p);
+  s.clear();
+  for (__u32 i = 0; i < count; ++i) {
+    T elem;
+    decode(elem, p);
+    s.insert(elem);
+  }
+}
+
+// std::vector<T>
+// Wire format: u32 element count followed by each element (features passed
+// through to the element encoder).
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 n = (__u32)(v.size());
+  encode(n, bl);
+  for (auto p = v.begin(); p != v.end(); ++p)
+    encode(*p, bl, features);
+}
+// Featureless variant.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  __u32 n = (__u32)(v.size());
+  encode(n, bl);
+  for (auto p = v.begin(); p != v.end(); ++p)
+    encode(*p, bl);
+}
+// Decode: resize, then decode into each slot in place.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  v.resize(n);
+  for (__u32 i=0; i<n; i++)
+    decode(v[i], p);
+}
+
+// Encode vector elements only (no count header); pair of decode_nohead.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
+{
+  for (const auto& elem : v)
+    encode(elem, bl);
+}
+// Decode exactly 'len' elements (count supplied by the caller, not read
+// from the stream); the vector is resized to fit.
+template<class T, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto& elem : v)
+    decode(elem, p);
+}
+
+// small vector
+// boost::container::small_vector uses the same wire format as std::vector:
+// u32 element count followed by the elements.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features)
+{
+  __u32 n = (__u32)(v.size());
+  encode(n, bl);
+  for (const auto& i : v)
+    encode(i, bl, features);
+}
+// Featureless variant.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  __u32 n = (__u32)(v.size());
+  encode(n, bl);
+  for (const auto& i : v)
+    encode(i, bl);
+}
+// Decode: resize, then decode into each slot in place.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  __u32 n;
+  decode(n, p);
+  v.resize(n);
+  for (auto& i : v)
+    decode(i, p);
+}
+
+// "nohead" variants: no count header; caller supplies the length.
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
+{
+  for (const auto& i : v)
+    encode(i, bl);
+}
+template<class T, std::size_t N, class Alloc, typename traits>
+inline std::enable_if_t<!traits::supported>
+  decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
+{
+  v.resize(len);
+  for (auto& i : v)
+    decode(i, p);
+}
+
+
+// vector (shared_ptr)
+// A vector of shared_ptr<T> is encoded as a vector of T values: null
+// pointers are encoded as a default-constructed T(), so T must be
+// default-constructible and the null-ness is NOT preserved on decode.
+template<class T,class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl,
+ uint64_t features)
+{
+ __u32 n = (__u32)(v.size());
+ encode(n, bl);
+ for (const auto& ref : v) {
+ if (ref)
+ encode(*ref, bl, features);
+ else
+ encode(T(), bl, features);
+ }
+}
+// Feature-free variant.
+template<class T, class Alloc>
+inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist& bl)
+{
+ __u32 n = (__u32)(v.size());
+ encode(n, bl);
+ for (const auto& ref : v) {
+ if (ref)
+ encode(*ref, bl);
+ else
+ encode(T(), bl);
+ }
+}
+// Decode: every slot comes back as a freshly allocated non-null pointer.
+template<class T, class Alloc>
+inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
+ bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ v.clear();
+ v.reserve(n);
+ while (n--) {
+ auto ref = std::make_shared<T>();
+ decode(*ref, p);
+ v.emplace_back(std::move(ref));
+ }
+}
+
+// map
+// std::map<T,U>: encoded as a __u32 pair count followed by key,value pairs
+// in key order.  Enabled only when either T or U lacks a denc fast path.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported ||
+ !u_traits::supported>
+ encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+// Feature-aware variant.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+// Decode: clears the map, then default-constructs each value via m[k]
+// before decoding into it — U must be default-constructible.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+// Like decode() but merges into the existing map instead of clearing it;
+// duplicate keys are overwritten.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+// Headerless encode variants: pairs only, no count prefix.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
+{
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+// Headerless decode: caller supplies the pair count `n`.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+
+// boost::container::flat-map
+// Same wire format as std::map (__u32 pair count + key,value pairs in key
+// order); the flat_map overloads additionally reserve() before inserting
+// since flat_map is backed by contiguous storage.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (typename boost::container::flat_map<T,U,Comp>::const_iterator p
+ = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+// Feature-aware variant.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl,
+ uint64_t features)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+// Decode: clear, reserve the wire count, then insert via m[k] (requires U
+// default-constructible).
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ m.reserve(n);
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+// Merge-decode: keeps existing entries; reserve may over-allocate when
+// incoming keys duplicate existing ones, which is harmless.
+template<class T, class U, class Comp, class Alloc>
+inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.reserve(m.size() + n);
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+// Headerless encode variants: pairs only, no count prefix.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl)
+{
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+ inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist& bl, uint64_t features)
+{
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+// Headerless decode: caller supplies the pair count `n`.
+template<class T, class U, class Comp, class Alloc,
+ typename t_traits, typename u_traits>
+inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
+ decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m,
+ bufferlist::const_iterator& p)
+{
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+
+// multimap
+// std::multimap<T,U>: __u32 pair count + key,value pairs.  Duplicate keys
+// are preserved on both encode and decode.
+template<class T, class U, class Comp, class Alloc>
+inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+// Decode: inserts a pair with the decoded key and a default-constructed
+// value, then decodes the value in place through the returned iterator
+// (multimap::insert always succeeds, so the iterator is always valid).
+template<class T, class U, class Comp, class Alloc>
+inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ typename std::pair<T,U> tu = std::pair<T,U>();
+ decode(tu.first, p);
+ typename std::multimap<T,U,Comp,Alloc>::iterator it = m.insert(tu);
+ decode(it->second, p);
+ }
+}
+
+// ceph::unordered_map
+// Encoded as a __u32 pair count + key,value pairs in iteration order.
+// NOTE(review): iteration order of an unordered_map is unspecified, so the
+// byte stream is not canonical across runs — presumably no caller relies on
+// a deterministic encoding of these; verify before using for comparisons.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
+ uint64_t features)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl, features);
+ encode(p->second, bl, features);
+ }
+}
+// Feature-free variant.
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p) {
+ encode(p->first, bl);
+ encode(p->second, bl);
+ }
+}
+// Decode: clears the map, then inserts via m[k] (U default-constructible).
+template<class T, class U, class Hash, class Pred, class Alloc>
+inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ decode(m[k], p);
+ }
+}
+
+// ceph::unordered_set
+// __u32 element count + elements in (unspecified) iteration order.
+template<class T, class Hash, class Pred, class Alloc>
+inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl)
+{
+ __u32 n = (__u32)(m.size());
+ encode(n, bl);
+ for (auto p = m.begin(); p != m.end(); ++p)
+ encode(*p, bl);
+}
+// Decode: clears the set and re-inserts each decoded element.
+template<class T, class Hash, class Pred, class Alloc>
+inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ decode(k, p);
+ m.insert(k);
+ }
+}
+
+// deque
+// std::deque<T>: __u32 element count + elements front-to-back.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features)
+{
+ __u32 n = ls.size();
+ encode(n, bl);
+ for (auto p = ls.begin(); p != ls.end(); ++p)
+ encode(*p, bl, features);
+}
+// Feature-free variant.
+template<class T, class Alloc>
+inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl)
+{
+ __u32 n = ls.size();
+ encode(n, bl);
+ for (auto p = ls.begin(); p != ls.end(); ++p)
+ encode(*p, bl);
+}
+// Decode: clear, then default-construct each element at the back and
+// decode into it, avoiding a temporary copy.
+template<class T, class Alloc>
+inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p)
+{
+ __u32 n;
+ decode(n, p);
+ ls.clear();
+ while (n--) {
+ ls.emplace_back();
+ decode(ls.back(), p);
+ }
+}
+
+// std::array<T, N>
+// Fixed-size array: no count prefix is encoded because N is part of the
+// type; both sides must agree on N for the stream to stay in sync.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features)
+{
+ for (const auto& e : v)
+ encode(e, bl, features);
+}
+// Feature-free variant.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+encode(const std::array<T, N>& v, bufferlist& bl)
+{
+ for (const auto& e : v)
+ encode(e, bl);
+}
+// Decode exactly N elements in place.
+template<class T, size_t N, typename traits>
+inline std::enable_if_t<!traits::supported>
+decode(std::array<T, N>& v, bufferlist::const_iterator& p)
+{
+ for (auto& e : v)
+ decode(e, p);
+}
+}
+
+/*
+ * guards
+ *
+ * These macros wrap a versioned struct encoding in a
+ * (struct_v, struct_compat, struct_len) header so that decoders can
+ * skip fields added by newer encoders and reject streams they cannot
+ * understand.
+ */
+
+/**
+ * start encoding block
+ *
+ * Reserves a hole at the front of the bufferlist for the version,
+ * compat-version and length header; ENCODE_FINISH fills it in once the
+ * final encoded length is known.
+ *
+ * @param v current (code) version of the encoding
+ * @param compat oldest code version that can decode it
+ * @param bl bufferlist to encode to
+ *
+ */
+#define ENCODE_START(v, compat, bl) \
+ __u8 struct_v = v; \
+ __u8 struct_compat = compat; \
+ ceph_le32 struct_len; \
+ auto filler = (bl).append_hole(sizeof(struct_v) + \
+ sizeof(struct_compat) + sizeof(struct_len)); \
+ const auto starting_bl_len = (bl).length(); \
+ using ::ceph::encode; \
+ do {
+
+/**
+ * finish encoding block
+ *
+ * Computes the payload length and back-fills the header reserved by
+ * ENCODE_START.  struct_len is little-endian (ceph_le32) so the raw
+ * copy_in is endian-safe on all hosts.
+ *
+ * @param bl bufferlist we were encoding to
+ * @param new_struct_compat struct-compat value to use (0 = keep the one
+ * given to ENCODE_START)
+ */
+#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat) \
+ } while (false); \
+ if (new_struct_compat) { \
+ struct_compat = new_struct_compat; \
+ } \
+ struct_len = (bl).length() - starting_bl_len; \
+ filler.copy_in(sizeof(struct_v), (char *)&struct_v); \
+ filler.copy_in(sizeof(struct_compat), \
+ (char *)&struct_compat); \
+ filler.copy_in(sizeof(struct_len), (char *)&struct_len);
+
+#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
+
+// Error-message builders for the decode guards below.
+#define DECODE_ERR_OLDVERSION(func, v, compatv) \
+ (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv))
+
+#define DECODE_ERR_PAST(func) \
+ (std::string(func) + " decode past end of struct encoding")
+
+/**
+ * check for very old encoding
+ *
+ * If the encoded data is older than oldestv, raise an exception.
+ *
+ * NOTE(review): the second argument passed to DECODE_ERR_OLDVERSION is
+ * the bare token `v`, which the macro stringizes literally — the thrown
+ * message reads "version v < N" rather than the numeric struct_v.
+ * Presumably intentional upstream; confirm before "fixing".
+ *
+ * @param oldestv oldest version of the code we can successfully decode.
+ */
+#define DECODE_OLDEST(oldestv) \
+ if (struct_v < oldestv) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, oldestv));
+
+/**
+ * start a decoding block
+ *
+ * Reads the (struct_v, struct_compat, struct_len) header written by
+ * ENCODE_START, rejects encodings newer than the code can handle, and
+ * records struct_end so DECODE_FINISH can skip unknown trailing fields.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param bl bufferlist::iterator for the encoded data
+ */
+#define DECODE_START(v, bl) \
+ __u8 struct_v, struct_compat; \
+ using ::ceph::decode; \
+ decode(struct_v, bl); \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ unsigned struct_end = bl.get_off() + struct_len; \
+ do {
+
+/* BEWARE: any change to this macro MUST be also reflected in the duplicative
+ * DECODE_START_LEGACY_COMPAT_LEN! */
+// Legacy-aware decode prologue.  For streams older than `compatv` there is
+// no compat byte; instead `skip_v` bytes of obsolete version payload are
+// skipped (3 for an old 32-bit version, 1 for a 16-bit one — see the
+// _LEN_32/_LEN_16 wrappers).  struct_end stays 0 when the stream predates
+// the length wrapper, which disables DECODE_FINISH's skip/overrun checks.
+#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl) \
+ using ::ceph::decode; \
+ __u8 struct_v; \
+ decode(struct_v, bl); \
+ if (struct_v >= compatv) { \
+ __u8 struct_compat; \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
+ } else if (skip_v) { \
+ if (bl.get_remaining() < skip_v) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ bl += skip_v; \
+ } \
+ unsigned struct_end = 0; \
+ if (struct_v >= lenv) { \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ struct_end = bl.get_off() + struct_len; \
+ } \
+ do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * The old encoding schemes has a __u8 struct_v only, or lacked either
+ * the compat version or length. Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+
+/* BEWARE: this is duplication of __DECODE_START_LEGACY_COMPAT_LEN which
+ * MUST be changed altogether. For the rationale behind code duplication,
+ * please `git blame` and refer to the commit message. */
+// Identical to __DECODE_START_LEGACY_COMPAT_LEN with skip_v == 0 (no
+// obsolete version payload to skip for pre-compat streams).
+#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl) \
+ using ::ceph::decode; \
+ __u8 struct_v; \
+ decode(struct_v, bl); \
+ if (struct_v >= compatv) { \
+ __u8 struct_compat; \
+ decode(struct_compat, bl); \
+ if (v < struct_compat) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION( \
+ __PRETTY_FUNCTION__, v, struct_compat)); \
+ } \
+ unsigned struct_end = 0; \
+ if (struct_v >= lenv) { \
+ __u32 struct_len; \
+ decode(struct_len, bl); \
+ if (struct_len > bl.get_remaining()) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ struct_end = bl.get_off() + struct_len; \
+ } \
+ do {
+
+/**
+ * start a decoding block with legacy support for older encoding schemes
+ *
+ * This version of the macro assumes the legacy encoding had a 32 bit
+ * version
+ *
+ * The old encoding schemes has a __u8 struct_v only, or lacked either
+ * the compat version or length. Skip those fields conditionally.
+ *
+ * Most of the time, v, compatv, and lenv will all match the version
+ * where the structure was switched over to the new macros.
+ *
+ * @param v current version of the encoding that the code supports/encodes
+ * @param compatv oldest version that includes a __u8 compat version field
+ * @param lenv oldest version that includes a __u32 length wrapper
+ * @param bl bufferlist::iterator containing the encoded data
+ */
+#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl)
+
+// Legacy encoding with a 16-bit version: skip the remaining 1 byte.
+#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl) \
+ __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl)
+
+/**
+ * finish decode block
+ *
+ * Throws if the decoder consumed more than struct_len bytes; otherwise
+ * skips any trailing bytes written by a newer encoder.  A struct_end of 0
+ * (legacy stream without a length wrapper) disables both checks.
+ *
+ * @param bl bufferlist::iterator we were decoding from
+ */
+#define DECODE_FINISH(bl) \
+ } while (false); \
+ if (struct_end) { \
+ if (bl.get_off() > struct_end) \
+ throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
+ if (bl.get_off() < struct_end) \
+ bl += struct_end - bl.get_off(); \
+ }
+
+namespace ceph {
+
+/*
+ * Encoders/decoders to read from current offset in a file handle and
+ * encode/decode the data according to argument types.
+ */
+// Read a length-prefixed string from fd: a __u32 byte count followed by
+// that many bytes.  Returns the total number of bytes buffered.
+// NOTE(review): the return values of read_fd are not checked here, so a
+// short read surfaces only as a decode failure — verify callers treat the
+// returned length accordingly.
+inline ssize_t decode_file(int fd, std::string &str)
+{
+ bufferlist bl;
+ __u32 len = 0;
+ bl.read_fd(fd, sizeof(len));
+ decode(len, bl);
+ bl.read_fd(fd, len);
+ decode(str, bl);
+ return bl.length();
+}
+
+// Same wire format as above, but decodes the payload into a bufferptr.
+inline ssize_t decode_file(int fd, bufferptr &bp)
+{
+ bufferlist bl;
+ __u32 len = 0;
+ bl.read_fd(fd, sizeof(len));
+ decode(len, bl);
+ bl.read_fd(fd, len);
+ auto bli = std::cbegin(bl);
+
+ decode(bp, bli);
+ return bl.length();
+}
+}
+
+#endif
diff --git a/src/include/err.h b/src/include/err.h
new file mode 100644
index 000000000..c188e9753
--- /dev/null
+++ b/src/include/err.h
@@ -0,0 +1,31 @@
+#ifndef CEPH_ERR_H
+#define CEPH_ERR_H
+
+/*
+ * adapted from linux 2.6.24 include/linux/err.h
+ *
+ * Lets a pointer-returning function smuggle a negative errno in the
+ * pointer value: addresses in the top MAX_ERRNO bytes of the address
+ * space are treated as error codes rather than valid pointers.
+ */
+#define MAX_ERRNO 4095
+#define IS_ERR_VALUE(x) ((x) >= (uintptr_t)-MAX_ERRNO)
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+/* this generates a warning in c++; caller can do the cast manually
+static inline void *ERR_PTR(long error)
+{
+ return (void *) error;
+}
+*/
+
+/* Recover the (negative) errno encoded in an error pointer. */
+static inline intptr_t PTR_ERR(const void *ptr)
+{
+ return (intptr_t) ptr;
+}
+
+/* True if ptr encodes an errno rather than a real address. */
+static inline bool IS_ERR(const void *ptr)
+{
+ return IS_ERR_VALUE((uintptr_t)ptr);
+}
+
+#endif
diff --git a/src/include/error.h b/src/include/error.h
new file mode 100644
index 000000000..a548d9756
--- /dev/null
+++ b/src/include/error.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Report a system error tagged with the current file/line and exit. */
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__)
+
+/* Runtime assertion that calls exiterror() (which exits) on failure;
+ * the `|| (..., 1)` form makes the macro usable as an expression. */
+#define ASSERT(c) \
+ ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1))
+
+/* print usage error message and exit */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message */
+extern void error(const char *fmt, ...);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/include/event_type.h b/src/include/event_type.h
new file mode 100644
index 000000000..aa6ddedb4
--- /dev/null
+++ b/src/include/event_type.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_TYPE_H
+#define CEPH_COMMON_EVENT_TYPE_H
+
+// Wakeup-notification mechanism selector for the async event framework:
+// none, a pipe pair, or (where available) an eventfd.
+#define EVENT_SOCKET_TYPE_NONE 0
+#define EVENT_SOCKET_TYPE_PIPE 1
+#define EVENT_SOCKET_TYPE_EVENTFD 2
+
+#endif
diff --git a/src/include/expected.hpp b/src/include/expected.hpp
new file mode 100644
index 000000000..740c6ad24
--- /dev/null
+++ b/src/include/expected.hpp
@@ -0,0 +1,2282 @@
+///
+// expected - An implementation of std::expected with extensions
+// Written in 2017 by Simon Brand (@TartanLlama)
+//
+// To the extent possible under law, the author(s) have dedicated all
+// copyright and related and neighboring rights to this software to the
+// public domain worldwide. This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication
+// along with this software. If not, see
+// <http://creativecommons.org/publicdomain/zero/1.0/>.
+///
+
+#ifndef TL_EXPECTED_HPP
+#define TL_EXPECTED_HPP
+
+#define TL_EXPECTED_VERSION_MAJOR 0
+#define TL_EXPECTED_VERSION_MINOR 2
+
+#include <exception>
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+#if defined(__EXCEPTIONS) || defined(_CPPUNWIND)
+#define TL_EXPECTED_EXCEPTIONS_ENABLED
+#endif
+
+#if (defined(_MSC_VER) && _MSC_VER == 1900)
+/// \exclude
+#define TL_EXPECTED_MSVC2015
+#define TL_EXPECTED_MSVC2015_CONSTEXPR
+#else
+#define TL_EXPECTED_MSVC2015_CONSTEXPR constexpr
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC49
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC54
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 && \
+ !defined(__clang__))
+/// \exclude
+#define TL_EXPECTED_GCC55
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
+ !defined(__clang__))
+// GCC < 5 doesn't support overloading on const&& for member functions
+/// \exclude
+#define TL_EXPECTED_NO_CONSTRR
+
+// GCC < 5 doesn't support some standard C++11 type traits
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ std::has_trivial_copy_constructor<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::has_trivial_copy_assign<T>
+
+// This one will be different for GCC 5.7 if it's ever supported
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+ std::is_trivially_destructible<T>
+
+// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks std::vector
+// for non-copyable types
+#elif (defined(__GNUC__) && __GNUC__ < 8 && \
+ !defined(__clang__))
+#ifndef TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+#define TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+namespace tl {
+ namespace detail {
+ template<class T>
+ struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{};
+#ifdef _GLIBCXX_VECTOR
+ template<class T, class A>
+ struct is_trivially_copy_constructible<std::vector<T,A>>
+ : std::is_trivially_copy_constructible<T>{};
+#endif
+ }
+}
+#endif
+
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ tl::detail::is_trivially_copy_constructible<T>
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::is_trivially_copy_assignable<T>
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>
+#else
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+ std::is_trivially_copy_constructible<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+ std::is_trivially_copy_assignable<T>
+/// \exclude
+#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+ std::is_trivially_destructible<T>
+#endif
+
+#if __cplusplus > 201103L
+/// \exclude
+#define TL_EXPECTED_CXX14
+#endif
+
+#ifdef TL_EXPECTED_GCC49
+#define TL_EXPECTED_GCC49_CONSTEXPR
+#else
+#define TL_EXPECTED_GCC49_CONSTEXPR constexpr
+#endif
+
+#if (__cplusplus == 201103L || defined(TL_EXPECTED_MSVC2015) || \
+ defined(TL_EXPECTED_GCC49))
+/// \exclude
+#define TL_EXPECTED_11_CONSTEXPR
+#else
+/// \exclude
+#define TL_EXPECTED_11_CONSTEXPR constexpr
+#endif
+
+namespace tl {
+template <class T, class E> class expected;
+
+// The *_MUTEX include guards below protect these shared helper types from
+// redefinition when several TartanLlama single-header libraries (optional,
+// expected, ...) are included in the same translation unit.
+#ifndef TL_MONOSTATE_INPLACE_MUTEX
+#define TL_MONOSTATE_INPLACE_MUTEX
+/// \brief Used to represent an expected with no data
+class monostate {};
+
+/// \brief A tag type to tell expected to construct its value in-place
+struct in_place_t {
+ explicit in_place_t() = default;
+};
+/// \brief A tag to tell expected to construct its value in-place
+static constexpr in_place_t in_place{};
+#endif
+
+/// Used as a wrapper to store the unexpected value
+template <class E> class unexpected {
+public:
+ static_assert(!std::is_same<E, void>::value, "E must not be void");
+
+ unexpected() = delete;
+ constexpr explicit unexpected(const E &e) : m_val(e) {}
+
+ constexpr explicit unexpected(E &&e) : m_val(std::move(e)) {}
+
+ /// \returns the contained value
+ /// \group unexpected_value
+ constexpr const E &value() const & { return m_val; }
+ /// \group unexpected_value
+ TL_EXPECTED_11_CONSTEXPR E &value() & { return m_val; }
+ /// \group unexpected_value
+ // rvalue overloads move the error out of a temporary unexpected.
+ TL_EXPECTED_11_CONSTEXPR E &&value() && { return std::move(m_val); }
+ /// \exclude
+ constexpr const E &&value() const && { return std::move(m_val); }
+
+private:
+ E m_val;
+};
+
+/// \brief Compares two unexpected objects
+/// \details Simply compares lhs.value() to rhs.value()
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator==(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() == rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator!=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() != rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() < rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() <= rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() > rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+ return lhs.value() >= rhs.value();
+}
+
+/// Create an `unexpected` from `e`, deducing the return type
+///
+/// *Example:*
+/// auto e1 = tl::make_unexpected(42);
+/// unexpected<int> e2 (42); //same semantics
+template <class E>
+unexpected<typename std::decay<E>::type> make_unexpected(E &&e) {
+ return unexpected<typename std::decay<E>::type>(std::forward<E>(e));
+}
+
+/// \brief A tag type to tell expected to construct the unexpected value
+struct unexpect_t {
+ unexpect_t() = default;
+};
+/// \brief A tag to tell expected to construct the unexpected value
+static constexpr unexpect_t unexpect{};
+
+/// \exclude
+namespace detail {
+// Throw `e` when exceptions are available; otherwise mark the path
+// unreachable so -fno-exceptions builds still compile (reaching it then
+// is undefined behavior).
+template<typename E>
+[[noreturn]] TL_EXPECTED_11_CONSTEXPR void throw_exception(E &&e) {
+#ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+ throw std::forward<E>(e);
+#else
+ #ifdef _MSC_VER
+ __assume(0);
+ #else
+ __builtin_unreachable();
+ #endif
+#endif
+}
+
+#ifndef TL_TRAITS_MUTEX
+#define TL_TRAITS_MUTEX
+// C++14-style aliases for brevity
+template <class T> using remove_const_t = typename std::remove_const<T>::type;
+template <class T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <class T> using decay_t = typename std::decay<T>::type;
+template <bool E, class T = void>
+using enable_if_t = typename std::enable_if<E, T>::type;
+template <bool B, class T, class F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+
+// std::conjunction from C++17
+// Short-circuiting logical AND over trait classes.
+template <class...> struct conjunction : std::true_type {};
+template <class B> struct conjunction<B> : B {};
+template <class B, class... Bs>
+struct conjunction<B, Bs...>
+ : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
+
+// std::invoke from C++17
+// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround
+// Member-pointer overload: dispatch through std::mem_fn.  The extra
+// `int = 0` parameter disambiguates it from the general overload below.
+template <typename Fn, typename... Args,
+ typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>{}>,
+ int = 0>
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+ noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
+ -> decltype(std::mem_fn(f)(std::forward<Args>(args)...)) {
+ return std::mem_fn(f)(std::forward<Args>(args)...);
+}
+
+// General callable overload: plain perfect-forwarded call.
+template <typename Fn, typename... Args,
+ typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>{}>>
+constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
+ noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+ -> decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)) {
+ return std::forward<Fn>(f)(std::forward<Args>(args)...);
+}
+
+// std::invoke_result from C++17
+// SFINAE-friendly: the `type` member exists only when the call expression
+// is well-formed.
+template <class F, class, class... Us> struct invoke_result_impl;
+
+template <class F, class... Us>
+struct invoke_result_impl<
+ F, decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...), void()),
+ Us...> {
+ using type = decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...));
+};
+
+template <class F, class... Us>
+using invoke_result = invoke_result_impl<F, void, Us...>;
+
+template <class F, class... Us>
+using invoke_result_t = typename invoke_result<F, Us...>::type;
+#endif
+
+// Trait for checking if a type is a tl::expected
+template <class T> struct is_expected_impl : std::false_type {};
+template <class T, class E>
+struct is_expected_impl<expected<T, E>> : std::true_type {};
+template <class T> using is_expected = is_expected_impl<decay_t<T>>;
+
+// Constraint for expected's perfect-forwarding value constructor: U must
+// construct T and must not be one of the tag/wrapper types that have their
+// own dedicated constructors.
+template <class T, class E, class U>
+using expected_enable_forward_value = detail::enable_if_t<
+ std::is_constructible<T, U &&>::value &&
+ !std::is_same<detail::decay_t<U>, in_place_t>::value &&
+ !std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+ !std::is_same<unexpected<E>, detail::decay_t<U>>::value>;
+
+// Constraint for converting between expected<U,G> and expected<T,E>,
+// mirroring the standard's rules: the conversion must be possible and must
+// not be ambiguous with constructing T from the other expected itself.
+template <class T, class E, class U, class G, class UR, class GR>
+using expected_enable_from_other = detail::enable_if_t<
+ std::is_constructible<T, UR>::value &&
+ std::is_constructible<E, GR>::value &&
+ !std::is_constructible<T, expected<U, G> &>::value &&
+ !std::is_constructible<T, expected<U, G> &&>::value &&
+ !std::is_constructible<T, const expected<U, G> &>::value &&
+ !std::is_constructible<T, const expected<U, G> &&>::value &&
+ !std::is_convertible<expected<U, G> &, T>::value &&
+ !std::is_convertible<expected<U, G> &&, T>::value &&
+ !std::is_convertible<const expected<U, G> &, T>::value &&
+ !std::is_convertible<const expected<U, G> &&, T>::value>;
+
+// `expected<void,E>` support: treat void as satisfying the given trait.
+template <class T, class U>
+using is_void_or = conditional_t<std::is_void<T>::value, std::true_type, U>;
+
+template <class T>
+using is_copy_constructible_or_void =
+ is_void_or<T, std::is_copy_constructible<T>>;
+
+template <class T>
+using is_move_constructible_or_void =
+ is_void_or<T, std::is_move_constructible<T>>;
+
+template <class T>
+using is_copy_assignable_or_void =
+ is_void_or<T, std::is_copy_assignable<T>>;
+
+
+template <class T>
+using is_move_assignable_or_void =
+ is_void_or<T, std::is_move_assignable<T>>;
+
+
+} // namespace detail
+
+/// \exclude
+namespace detail {
+// Tag that selects the union member `m_no_init`, deferring construction of
+// either alternative until later.
+struct no_init_t {};
+static constexpr no_init_t no_init{};
+
+// Implements the storage of the values, and ensures that the destructor is
+// trivial if it can be.
+//
+// This specialization is for where neither `T` or `E` is trivially
+// destructible, so the destructors must be called on destruction of the
+// `expected`
+template <class T, class E, bool = std::is_trivially_destructible<T>::value,
+ bool = std::is_trivially_destructible<E>::value>
+struct expected_storage_base {
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}
+
+ // In-place construction of the success value T.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ // In-place construction of the error value E.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ // Manually destroy whichever union member is active.
+ ~expected_storage_base() {
+ if (m_has_val) {
+ m_val.~T();
+ } else {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// This specialization is for when both `T` and `E` are trivially-destructible,
+// so the destructor of the `expected` can be trivial.
+template <class T, class E> struct expected_storage_base<T, E, true, true> {
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}  // safe to leave nothing constructed: both members are trivially destructible
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ ~expected_storage_base() = default;  // trivial destructor: nothing needs destroying
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// T is trivially-destructible, E is not: only the error side may need destroying.
+template <class T, class E> struct expected_storage_base<T, E, true, false> {
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base(no_init_t)
+ : m_no_init(), m_has_val(false) {}  // constexpr where the compiler allows it (macro works around MSVC 2015)
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ ~expected_storage_base() {  // T's destruction is a no-op; only destroy the error when active
+ if (!m_has_val) {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// E is trivially-destructible, T is not: only the value side may need destroying.
+template <class T, class E> struct expected_storage_base<T, E, false, true> {
+ constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {}
+ constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected_storage_base(in_place_t, Args &&... args)
+ : m_val(std::forward<Args>(args)...), m_has_val(true) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected_storage_base(in_place_t, std::initializer_list<U> il,
+ Args &&... args)
+ : m_val(il, std::forward<Args>(args)...), m_has_val(true) {}
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ ~expected_storage_base() {  // E's destruction is a no-op; only destroy the value when active
+ if (m_has_val) {
+ m_val.~T();
+ }
+ }
+ union {
+ char m_no_init;
+ T m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// `T` is `void`, `E` is trivially-destructible: destructor can be trivial.
+template <class E> struct expected_storage_base<void, E, false, true> {
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base() : m_has_val(true) {}  // "value" state with nothing to store
+ constexpr expected_storage_base(no_init_t) : m_val(), m_has_val(false) {}  // activating the dummy member is the closest thing to "no init" here
+
+ constexpr expected_storage_base(in_place_t) : m_has_val(true) {}  // no args: a void value takes nothing to construct
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ ~expected_storage_base() = default;
+ struct dummy {};  // empty placeholder union member, since `void` cannot be one
+ union {
+ dummy m_val;
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// `T` is `void`, `E` is not trivially-destructible: the error must be destroyed.
+template <class E> struct expected_storage_base<void, E, false, false> {
+ constexpr expected_storage_base() : m_dummy(), m_has_val(true) {}
+ constexpr expected_storage_base(no_init_t) : m_dummy(), m_has_val(false) {}  // no error constructed; derived bases construct it later
+
+ constexpr expected_storage_base(in_place_t) : m_dummy(), m_has_val(true) {}
+
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected_storage_base(unexpect_t, Args &&... args)
+ : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {}
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected_storage_base(unexpect_t,
+ std::initializer_list<U> il,
+ Args &&... args)
+ : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {}
+
+ ~expected_storage_base() {  // only the error member can need destruction
+ if (!m_has_val) {
+ m_unexpect.~unexpected<E>();
+ }
+ }
+
+ union {
+ char m_dummy;  // inactive placeholder for the "value" (void) state
+ unexpected<E> m_unexpect;
+ };
+ bool m_has_val;
+};
+
+// This base class provides some handy member functions which can be used in
+// further derived classes
+template <class T, class E>
+struct expected_operations_base : expected_storage_base<T, E> {
+ using expected_storage_base<T, E>::expected_storage_base;
+
+ template <class... Args> void construct(Args &&... args) noexcept {  // placement-new a T into the union; caller ensures nothing is currently active
+ new (std::addressof(this->m_val)) T(std::forward<Args>(args)...);
+ this->m_has_val = true;
+ }
+
+ template <class Rhs> void construct_with(Rhs &&rhs) noexcept {  // build the value from another expected's value, preserving rhs's value category
+ new (std::addressof(this->m_val)) T(std::forward<Rhs>(rhs).get());
+ this->m_has_val = true;
+ }
+
+ template <class... Args> void construct_error(Args &&... args) noexcept {  // placement-new the unexpected<E> member
+ new (std::addressof(this->m_unexpect))
+ unexpected<E>(std::forward<Args>(args)...);
+ this->m_has_val = false;
+ }
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+
+ // These assign overloads ensure that the most efficient assignment
+ // implementation is used while maintaining the strong exception guarantee.
+ // The problematic case is where rhs has a value, but *this does not.
+ //
+ // This overload handles the case where we can just copy-construct `T`
+ // directly into place without throwing.
+ template <class U = T,
+ detail::enable_if_t<std::is_nothrow_copy_constructible<U>::value>
+ * = nullptr>
+ void assign(const expected_operations_base &rhs) noexcept {
+ if (!this->m_has_val && rhs.m_has_val) {
+ geterr().~unexpected<E>();  // safe: the copy-construct below cannot throw
+ construct(rhs.get());
+ } else {
+ assign_common(rhs);
+ }
+ }
+
+ // This overload handles the case where we can attempt to create a copy of
+ // `T`, then no-throw move it into place if the copy was successful.
+ template <class U = T,
+ detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value &&
+ std::is_nothrow_move_constructible<U>::value>
+ * = nullptr>
+ void assign(const expected_operations_base &rhs) noexcept {
+ if (!this->m_has_val && rhs.m_has_val) {
+ T tmp = rhs.get();  // copy first; if this throws, *this is untouched
+ geterr().~unexpected<E>();
+ construct(std::move(tmp));  // nothrow move into place
+ } else {
+ assign_common(rhs);
+ }
+ }
+
+ // This overload is the worst-case, where we have to move-construct the
+ // unexpected value into temporary storage, then try to copy the T into place.
+ // If the construction succeeds, then everything is fine, but if it throws,
+ // then we move the old unexpected value back into place before rethrowing the
+ // exception.
+ template <class U = T,
+ detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value &&
+ !std::is_nothrow_move_constructible<U>::value>
+ * = nullptr>
+ void assign(const expected_operations_base &rhs) {
+ if (!this->m_has_val && rhs.m_has_val) {
+ auto tmp = std::move(geterr());  // save the old error so we can restore it
+ geterr().~unexpected<E>();
+
+ try {
+ construct(rhs.get());
+ } catch (...) {
+ geterr() = std::move(tmp);  // roll back to the old error, keep strong guarantee
+ throw;
+ }
+ } else {
+ assign_common(rhs);
+ }
+ }
+
+ // These overloads do the same as above, but for rvalues
+ template <class U = T,
+ detail::enable_if_t<std::is_nothrow_move_constructible<U>::value>
+ * = nullptr>
+ void assign(expected_operations_base &&rhs) noexcept {
+ if (!this->m_has_val && rhs.m_has_val) {
+ geterr().~unexpected<E>();
+ construct(std::move(rhs).get());
+ } else {
+ assign_common(std::move(rhs));
+ }
+ }
+
+ template <class U = T,
+ detail::enable_if_t<!std::is_nothrow_move_constructible<U>::value>
+ * = nullptr>
+ void assign(expected_operations_base &&rhs) {
+ if (!this->m_has_val && rhs.m_has_val) {
+ auto tmp = std::move(geterr());  // save the old error for rollback
+ geterr().~unexpected<E>();
+ try {
+ construct(std::move(rhs).get());
+ } catch (...) {
+ geterr() = std::move(tmp);
+ throw;
+ }
+ } else {
+ assign_common(std::move(rhs));
+ }
+ }
+
+ #else
+
+ // If exceptions are disabled then we can just copy-construct
+ void assign(const expected_operations_base &rhs) noexcept {
+ if (!this->m_has_val && rhs.m_has_val) {
+ geterr().~unexpected<E>();
+ construct(rhs.get());
+ } else {
+ assign_common(rhs);
+ }
+ }
+
+ void assign(expected_operations_base &&rhs) noexcept {
+ if (!this->m_has_val && rhs.m_has_val) {
+ geterr().~unexpected<E>();
+ construct(std::move(rhs).get());
+ } else {
+ assign_common(rhs);
+ }
+ }
+
+ #endif
+
+ // The common part of move/copy assigning
+ template <class Rhs> void assign_common(Rhs &&rhs) {
+ if (this->m_has_val) {
+ if (rhs.m_has_val) {
+ get() = std::forward<Rhs>(rhs).get();  // value -> value: plain assignment
+ } else {
+ destroy_val();  // value -> error: end T's lifetime, then construct the error
+ construct_error(std::forward<Rhs>(rhs).geterr());
+ }
+ } else {
+ if (!rhs.m_has_val) {
+ geterr() = std::forward<Rhs>(rhs).geterr();  // error -> error: plain assignment
+ }
+ }  // (error -> value is handled by the assign() overloads above)
+ }
+
+ bool has_value() const { return this->m_has_val; }
+
+ TL_EXPECTED_11_CONSTEXPR T &get() & { return this->m_val; }  // precondition: has_value(); reading the inactive member is UB
+ constexpr const T &get() const & { return this->m_val; }
+ TL_EXPECTED_11_CONSTEXPR T &&get() && { return std::move(this->m_val); }
+#ifndef TL_EXPECTED_NO_CONSTRR
+ constexpr const T &&get() const && { return std::move(this->m_val); }
+#endif
+
+ TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & {  // precondition: !has_value()
+ return this->m_unexpect;
+ }
+ constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; }
+ TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && {
+ return std::move(this->m_unexpect);
+ }
+#ifndef TL_EXPECTED_NO_CONSTRR
+ constexpr const unexpected<E> &&geterr() const && {
+ return std::move(this->m_unexpect);
+ }
+#endif
+
+ constexpr void destroy_val() {  // explicitly end the value's lifetime
+ get().~T();
+ }
+};
+
+// This base class provides some handy member functions which can be used in
+// further derived classes
+template <class E>
+struct expected_operations_base<void, E> : expected_storage_base<void, E> {
+ using expected_storage_base<void, E>::expected_storage_base;
+
+ template <class... Args> void construct() noexcept { this->m_has_val = true; }  // T is void: "constructing" the value is just flipping the flag
+
+ // This function doesn't use its argument, but needs it so that code in
+ // levels above this can work independently of whether T is void
+ template <class Rhs> void construct_with(Rhs &&) noexcept {
+ this->m_has_val = true;
+ }
+
+ template <class... Args> void construct_error(Args &&... args) noexcept {  // placement-new the unexpected<E> member
+ new (std::addressof(this->m_unexpect))
+ unexpected<E>(std::forward<Args>(args)...);
+ this->m_has_val = false;
+ }
+
+ template <class Rhs> void assign(Rhs &&rhs) noexcept {  // no value to copy or destroy, so one overload covers all four state combinations
+ if (!this->m_has_val) {
+ if (rhs.m_has_val) {
+ geterr().~unexpected<E>();  // error -> value: just drop the error
+ construct();
+ } else {
+ geterr() = std::forward<Rhs>(rhs).geterr();  // error -> error
+ }
+ } else {
+ if (!rhs.m_has_val) {
+ construct_error(std::forward<Rhs>(rhs).geterr());  // value -> error
+ }
+ }
+ }
+
+ bool has_value() const { return this->m_has_val; }
+
+ TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & {  // precondition: !has_value()
+ return this->m_unexpect;
+ }
+ constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; }
+ TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && {
+ return std::move(this->m_unexpect);
+ }
+#ifndef TL_EXPECTED_NO_CONSTRR
+ constexpr const unexpected<E> &&geterr() const && {
+ return std::move(this->m_unexpect);
+ }
+#endif
+
+ constexpr void destroy_val() {
+ // no-op: T is void, there is no value to destroy
+ }
+};
+
+// This class manages conditionally having a trivial copy constructor
+// The primary template is selected when T and E are trivially copy constructible
+template <class T, class E,
+ bool = is_void_or<T, TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)>::
+ value &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value>
+struct expected_copy_base : expected_operations_base<T, E> {  // the implicitly-defaulted copy ctor stays trivial
+ using expected_operations_base<T, E>::expected_operations_base;
+};
+
+// This specialization is for when T or E are not trivially copy constructible
+template <class T, class E>
+struct expected_copy_base<T, E, false> : expected_operations_base<T, E> {
+ using expected_operations_base<T, E>::expected_operations_base;
+
+ expected_copy_base() = default;
+ expected_copy_base(const expected_copy_base &rhs)
+ : expected_operations_base<T, E>(no_init) {  // start with no active member, then copy whichever side rhs holds
+ if (rhs.has_value()) {
+ this->construct_with(rhs);
+ } else {
+ this->construct_error(rhs.geterr());
+ }
+ }
+
+ expected_copy_base(expected_copy_base &&rhs) = default;
+ expected_copy_base &operator=(const expected_copy_base &rhs) = default;
+ expected_copy_base &operator=(expected_copy_base &&rhs) = default;
+};
+
+// This class manages conditionally having a trivial move constructor
+// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
+// doesn't implement an analogue to std::is_trivially_move_constructible. We
+// have to make do with a non-trivial move constructor even if T is trivially
+// move constructible
+#ifndef TL_EXPECTED_GCC49
+template <class T, class E,
+ bool = is_void_or<T, std::is_trivially_move_constructible<T>>::value
+ &&std::is_trivially_move_constructible<E>::value>
+struct expected_move_base : expected_copy_base<T, E> {  // trivially movable case: keep the defaulted (trivial) move ctor
+ using expected_copy_base<T, E>::expected_copy_base;
+};
+#else
+template <class T, class E, bool = false> struct expected_move_base;  // GCC 4.9: always take the non-trivial path below
+#endif
+template <class T, class E>
+struct expected_move_base<T, E, false> : expected_copy_base<T, E> {
+ using expected_copy_base<T, E>::expected_copy_base;
+
+ expected_move_base() = default;
+ expected_move_base(const expected_move_base &rhs) = default;
+
+ expected_move_base(expected_move_base &&rhs) noexcept(
+ std::is_nothrow_move_constructible<T>::value)  // NOTE(review): the noexcept spec considers only T, not E
+ : expected_copy_base<T, E>(no_init) {  // no active member yet; move in whichever side rhs holds
+ if (rhs.has_value()) {
+ this->construct_with(std::move(rhs));
+ } else {
+ this->construct_error(std::move(rhs.geterr()));
+ }
+ }
+ expected_move_base &operator=(const expected_move_base &rhs) = default;
+ expected_move_base &operator=(expected_move_base &&rhs) = default;
+};
+
+// This class manages conditionally having a trivial copy assignment operator
+template <class T, class E,
+ bool = is_void_or<
+ T, conjunction<TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T),
+ TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T),
+ TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T)>>::value
+ &&TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(E)::value
+ &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value
+ &&TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(E)::value>
+struct expected_copy_assign_base : expected_move_base<T, E> {  // fully-trivial case: defaulted copy assignment stays trivial
+ using expected_move_base<T, E>::expected_move_base;
+};
+
+template <class T, class E>
+struct expected_copy_assign_base<T, E, false> : expected_move_base<T, E> {  // non-trivial case: delegate to the exception-safe assign()
+ using expected_move_base<T, E>::expected_move_base;
+
+ expected_copy_assign_base() = default;
+ expected_copy_assign_base(const expected_copy_assign_base &rhs) = default;
+
+ expected_copy_assign_base(expected_copy_assign_base &&rhs) = default;
+ expected_copy_assign_base &operator=(const expected_copy_assign_base &rhs) {
+ this->assign(rhs);  // picks the overload matching T's exception guarantees
+ return *this;
+ }
+ expected_copy_assign_base &
+ operator=(expected_copy_assign_base &&rhs) = default;
+};
+
+// This class manages conditionally having a trivial move assignment operator
+// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
+// doesn't implement an analogue to std::is_trivially_move_assignable. We have
+// to make do with a non-trivial move assignment operator even if T is trivially
+// move assignable
+#ifndef TL_EXPECTED_GCC49
+template <class T, class E,
+ bool =
+ is_void_or<T, conjunction<std::is_trivially_destructible<T>,
+ std::is_trivially_move_constructible<T>,
+ std::is_trivially_move_assignable<T>>>::
+ value &&std::is_trivially_destructible<E>::value
+ &&std::is_trivially_move_constructible<E>::value
+ &&std::is_trivially_move_assignable<E>::value>
+struct expected_move_assign_base : expected_copy_assign_base<T, E> {  // fully-trivial case: defaulted move assignment stays trivial
+ using expected_copy_assign_base<T, E>::expected_copy_assign_base;
+};
+#else
+template <class T, class E, bool = false> struct expected_move_assign_base;  // GCC 4.9: always take the non-trivial path below
+#endif
+
+template <class T, class E>
+struct expected_move_assign_base<T, E, false>
+    : expected_copy_assign_base<T, E> {  // non-trivial case: delegate to the exception-safe assign()
+ using expected_copy_assign_base<T, E>::expected_copy_assign_base;
+
+ expected_move_assign_base() = default;
+ expected_move_assign_base(const expected_move_assign_base &rhs) = default;
+
+ expected_move_assign_base(expected_move_assign_base &&rhs) = default;
+
+ expected_move_assign_base &
+ operator=(const expected_move_assign_base &rhs) = default;
+
+ expected_move_assign_base &
+ operator=(expected_move_assign_base &&rhs) noexcept(
+ std::is_nothrow_move_constructible<T>::value
+ &&std::is_nothrow_move_assignable<T>::value) {  // NOTE(review): noexcept spec considers only T, not E
+ this->assign(std::move(rhs));
+ return *this;
+ }
+};
+
+// expected_delete_ctor_base will conditionally delete copy and move
+// constructors depending on whether T is copy/move constructible
+template <class T, class E,
+ bool EnableCopy = (is_copy_constructible_or_void<T>::value &&
+ std::is_copy_constructible<E>::value),
+ bool EnableMove = (is_move_constructible_or_void<T>::value &&
+ std::is_move_constructible<E>::value)>
+struct expected_delete_ctor_base {  // primary: both copy and move construction enabled
+ expected_delete_ctor_base() = default;
+ expected_delete_ctor_base(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default;
+ expected_delete_ctor_base &
+ operator=(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base &
+ operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, true, false> {  // copy-constructible, not move-constructible
+ expected_delete_ctor_base() = default;
+ expected_delete_ctor_base(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete;
+ expected_delete_ctor_base &
+ operator=(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base &
+ operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, false, true> {  // move-constructible, not copy-constructible
+ expected_delete_ctor_base() = default;
+ expected_delete_ctor_base(const expected_delete_ctor_base &) = delete;
+ expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default;
+ expected_delete_ctor_base &
+ operator=(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base &
+ operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T, class E>
+struct expected_delete_ctor_base<T, E, false, false> {  // neither copy- nor move-constructible
+ expected_delete_ctor_base() = default;
+ expected_delete_ctor_base(const expected_delete_ctor_base &) = delete;
+ expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete;
+ expected_delete_ctor_base &
+ operator=(const expected_delete_ctor_base &) = default;
+ expected_delete_ctor_base &
+ operator=(expected_delete_ctor_base &&) noexcept = default;
+};
+
+// expected_delete_assign_base will conditionally delete the copy and move
+// assignment operators depending on whether T and E are copy/move
+// constructible and assignable
+template <class T, class E,
+ bool EnableCopy = (is_copy_constructible_or_void<T>::value &&
+ std::is_copy_constructible<E>::value &&
+ is_copy_assignable_or_void<T>::value &&
+ std::is_copy_assignable<E>::value),
+ bool EnableMove = (is_move_constructible_or_void<T>::value &&
+ std::is_move_constructible<E>::value &&
+ is_move_assignable_or_void<T>::value &&
+ std::is_move_assignable<E>::value)>
+struct expected_delete_assign_base {  // primary: both copy and move assignment enabled
+ expected_delete_assign_base() = default;
+ expected_delete_assign_base(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+ default;
+ expected_delete_assign_base &
+ operator=(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base &
+ operator=(expected_delete_assign_base &&) noexcept = default;
+};
+
+template <class T, class E>
+struct expected_delete_assign_base<T, E, true, false> {  // copy-assignable, not move-assignable
+ expected_delete_assign_base() = default;
+ expected_delete_assign_base(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+ default;
+ expected_delete_assign_base &
+ operator=(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base &
+ operator=(expected_delete_assign_base &&) noexcept = delete;
+};
+
+template <class T, class E>
+struct expected_delete_assign_base<T, E, false, true> {  // move-assignable, not copy-assignable
+ expected_delete_assign_base() = default;
+ expected_delete_assign_base(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+ default;
+ expected_delete_assign_base &
+ operator=(const expected_delete_assign_base &) = delete;
+ expected_delete_assign_base &
+ operator=(expected_delete_assign_base &&) noexcept = default;
+};
+
+template <class T, class E>
+struct expected_delete_assign_base<T, E, false, false> {  // neither copy- nor move-assignable
+ expected_delete_assign_base() = default;
+ expected_delete_assign_base(const expected_delete_assign_base &) = default;
+ expected_delete_assign_base(expected_delete_assign_base &&) noexcept =
+ default;
+ expected_delete_assign_base &
+ operator=(const expected_delete_assign_base &) = delete;
+ expected_delete_assign_base &
+ operator=(expected_delete_assign_base &&) noexcept = delete;
+};
+
+// This is needed to be able to construct the expected_default_ctor_base which
+// follows, while still conditionally deleting the default constructor.
+struct default_constructor_tag {
+ explicit constexpr default_constructor_tag() = default;  // explicit so it never competes with a real default constructor
+};
+
+// expected_default_ctor_base will ensure that expected has a deleted default
+// constructor if T is not default constructible.
+// This specialization is for when T is default constructible
+template <class T, class E,
+ bool Enable =
+ std::is_default_constructible<T>::value || std::is_void<T>::value>
+struct expected_default_ctor_base {  // T default-constructible (or void): keep the default ctor
+ constexpr expected_default_ctor_base() noexcept = default;
+ constexpr expected_default_ctor_base(
+ expected_default_ctor_base const &) noexcept = default;
+ constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept =
+ default;
+ expected_default_ctor_base &
+ operator=(expected_default_ctor_base const &) noexcept = default;
+ expected_default_ctor_base &
+ operator=(expected_default_ctor_base &&) noexcept = default;
+
+ constexpr explicit expected_default_ctor_base(default_constructor_tag) {}  // tag ctor used by expected's own constructors
+};
+
+// This specialization is for when T is not default constructible
+template <class T, class E> struct expected_default_ctor_base<T, E, false> {
+ constexpr expected_default_ctor_base() noexcept = delete;  // deleting this deletes expected's default ctor too
+ constexpr expected_default_ctor_base(
+ expected_default_ctor_base const &) noexcept = default;
+ constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept =
+ default;
+ expected_default_ctor_base &
+ operator=(expected_default_ctor_base const &) noexcept = default;
+ expected_default_ctor_base &
+ operator=(expected_default_ctor_base &&) noexcept = default;
+
+ constexpr explicit expected_default_ctor_base(default_constructor_tag) {}
+};
+} // namespace detail
+
+template <class E> class bad_expected_access : public std::exception {  // exception carrying the error value; presumably thrown by expected::value() — confirm against that definition (outside this chunk)
+public:
+ explicit bad_expected_access(E e) : m_val(std::move(e)) {}  // takes the offending error by value and moves it into storage
+
+ virtual const char *what() const noexcept override {
+ return "Bad expected access";
+ }
+
+ const E &error() const & { return m_val; }  // ref-qualified accessors hand out the stored error in every value category
+ E &error() & { return m_val; }
+ const E &&error() const && { return std::move(m_val); }
+ E &&error() && { return std::move(m_val); }
+
+private:
+ E m_val;
+};
+
+/// An `expected<T, E>` object is an object that contains the storage for
+/// another object and manages the lifetime of this contained object `T`.
+/// Alternatively it could contain the storage for another unexpected object
+/// `E`. The contained object may not be initialized after the expected object
+/// has been initialized, and may not be destroyed before the expected object
+/// has been destroyed. The initialization state of the contained object is
+/// tracked by the expected object.
+template <class T, class E>
+class expected : private detail::expected_move_assign_base<T, E>,
+ private detail::expected_delete_ctor_base<T, E>,
+ private detail::expected_delete_assign_base<T, E>,
+ private detail::expected_default_ctor_base<T, E> {
+ static_assert(!std::is_reference<T>::value, "T must not be a reference");
+ static_assert(!std::is_same<T, std::remove_cv<in_place_t>>::value,
+ "T must not be in_place_t");
+ static_assert(!std::is_same<T, std::remove_cv<unexpect_t>>::value,
+ "T must not be unexpect_t");
+ static_assert(!std::is_same<T, std::remove_cv<unexpected<E>>>::value,
+ "T must not be unexpected<E>");
+ static_assert(!std::is_reference<E>::value, "E must not be a reference");
+
+ T *valptr() { return std::addressof(this->m_val); }
+ const T *valptr() const { return std::addressof(this->m_val); }
+ unexpected<E> *errptr() { return std::addressof(this->m_unexpect); }
+ const unexpected<E> *errptr() const { return std::addressof(this->m_unexpect); }
+
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ U &val() {
+ return this->m_val;
+ }
+ unexpected<E> &err() { return this->m_unexpect; }
+
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ const U &val() const {
+ return this->m_val;
+ }
+ const unexpected<E> &err() const { return this->m_unexpect; }
+
+ using impl_base = detail::expected_move_assign_base<T, E>;
+ using ctor_base = detail::expected_default_ctor_base<T, E>;
+
+public:
+ typedef T value_type;
+ typedef E error_type;
+ typedef unexpected<E> unexpected_type;
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \group and_then
+ /// Carries out some operation which returns an expected on the stored object
+ /// if there is one. \requires `std::invoke(std::forward<F>(f), value())`
+ /// returns an `expected<U>` for some `U`. \returns Let `U` be the result
+ /// of `std::invoke(std::forward<F>(f), value())`. Returns an
+ /// `expected<U>`. The return value is empty if `*this` is empty,
+ /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+ /// is returned.
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) & {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+ template <class F> constexpr auto and_then(F &&f) const & {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+ template <class F> constexpr auto and_then(F &&f) const && {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+
+#else
+ /// \group and_then
+ /// Carries out some operation which returns an expected on the stored object
+ /// if there is one. \requires `std::invoke(std::forward<F>(f), value())`
+ /// returns an `expected<U>` for some `U`. \returns Let `U` be the result
+ /// of `std::invoke(std::forward<F>(f), value())`. Returns an
+ /// `expected<U>`. The return value is empty if `*this` is empty,
+ /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+ /// is returned.
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+ // Fallback branch: the return type is spelled with a trailing decltype
+ // because plain `auto` return-type deduction is unavailable pre-C++14.
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR auto
+ and_then(F &&f) & -> decltype(and_then_impl(*this, std::forward<F>(f))) {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && -> decltype(
+ and_then_impl(std::move(*this), std::forward<F>(f))) {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+ template <class F>
+ constexpr auto and_then(F &&f) const & -> decltype(
+ and_then_impl(*this, std::forward<F>(f))) {
+ return and_then_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group and_then
+ /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+ template <class F>
+ constexpr auto and_then(F &&f) const && -> decltype(
+ and_then_impl(std::move(*this), std::forward<F>(f))) {
+ return and_then_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \brief Carries out some operation on the stored object if there is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// value())`. If `U` is `void`, returns an `expected<void,E>`, otherwise
+ /// returns an `expected<U,E>`. If `*this` is unexpected, the
+ /// result is `*this`, otherwise an `expected<U,E>` is constructed from the
+ /// return value of `std::invoke(std::forward<F>(f), value())` and is
+ /// returned.
+ ///
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &;
+ template <class F> constexpr auto map(F &&f) const & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
+ template <class F> constexpr auto map(F &&f) const && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+#else
+ /// \brief Carries out some operation on the stored object if there is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// value())`. If `U` is `void`, returns an `expected<void,E>`, otherwise
+ /// returns an `expected<U,E>`. If `*this` is unexpected, the
+ /// result is `*this`, otherwise an `expected<U,E>` is constructed from the
+ /// return value of `std::invoke(std::forward<F>(f), value())` and is
+ /// returned.
+ ///
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &;
+ // Pre-C++14 branch: the return type is computed via decltype over the
+ // corresponding expected_map_impl overload.
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(
+ expected_map_impl(std::declval<expected &>(), std::declval<F &&>()))
+ map(F &&f) & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(
+ expected_map_impl(std::declval<expected>(), std::declval<F &&>()))
+ map(F &&f) && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &;
+ template <class F>
+ constexpr decltype(expected_map_impl(std::declval<const expected &>(),
+ std::declval<F &&>()))
+ map(F &&f) const & {
+ return expected_map_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group map
+ /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
+ template <class F>
+ constexpr decltype(expected_map_impl(std::declval<const expected &&>(),
+ std::declval<F &&>()))
+ map(F &&f) const && {
+ return expected_map_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+ /// \brief Carries out some operation on the stored unexpected object if there
+ /// is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// error())`. If `U` is `void`, returns an `expected<T,monostate>`, otherwise
+ /// returns an `expected<T,U>`. If `*this` has an expected
+ /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
+ /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
+ /// returned.
+ ///
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
+ template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
+ template <class F> constexpr auto map_error(F &&f) const & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
+ template <class F> constexpr auto map_error(F &&f) const && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+#else
+ /// \brief Carries out some operation on the stored unexpected object if there
+ /// is one.
+ /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+ /// error())`. Returns an `expected<T,U>`. If `*this` has an expected
+ /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
+ /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
+ /// returned.
+ ///
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &>(),
+ std::declval<F &&>()))
+ map_error(F &&f) & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
+ template <class F>
+ TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &&>(),
+ std::declval<F &&>()))
+ map_error(F &&f) && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
+ template <class F>
+ constexpr decltype(map_error_impl(std::declval<const expected &>(),
+ std::declval<F &&>()))
+ map_error(F &&f) const & {
+ return map_error_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ /// \group map_error
+ /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
+ template <class F>
+ constexpr decltype(map_error_impl(std::declval<const expected &&>(),
+ std::declval<F &&>()))
+ map_error(F &&f) const && {
+ return map_error_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+#endif
+
+ /// \brief Calls `f` if the expected is in the unexpected state
+ /// \requires `F` is invocable with `E`, and `std::invoke_result_t<F, E>`
+ /// must be void or convertible to `expected<T,E>`.
+ /// \effects If `*this` has a value, returns `*this`.
+ /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)(E)` and returns
+ /// `*this`. Otherwise, returns `std::forward<F>(f)(E)`.
+ ///
+ /// \group or_else
+ template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) & {
+ return or_else_impl(*this, std::forward<F>(f));
+ }
+
+ template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) && {
+ return or_else_impl(std::move(*this), std::forward<F>(f));
+ }
+
+ template <class F> expected constexpr or_else(F &&f) const & {
+ return or_else_impl(*this, std::forward<F>(f));
+ }
+
+#ifndef TL_EXPECTED_NO_CONSTRR
+ template <class F> expected constexpr or_else(F &&f) const && {
+ return or_else_impl(std::move(*this), std::forward<F>(f));
+ }
+#endif
+ // Special members are defaulted: copy/move availability is governed by the
+ // storage and constructor base classes this type inherits from.
+ constexpr expected() = default;
+ constexpr expected(const expected &rhs) = default;
+ constexpr expected(expected &&rhs) = default;
+ expected &operator=(const expected &rhs) = default;
+ expected &operator=(expected &&rhs) = default;
+
+ // In-place construction of the success value from `args`.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * =
+ nullptr>
+ constexpr expected(in_place_t, Args &&... args)
+ : impl_base(in_place, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // In-place construction from an initializer list plus trailing `args`.
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr expected(in_place_t, std::initializer_list<U> il, Args &&... args)
+ : impl_base(in_place, il, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \group unexpected_ctor
+ /// \synopsis EXPLICIT constexpr expected(const unexpected<G> &e);
+ // Error-state constructors from unexpected<G>: each comes in an explicit
+ // variant (G not implicitly convertible to E) and an implicit \exclude twin
+ // (G implicitly convertible), selected via the is_convertible enable_if.
+ template <class G = E,
+ detail::enable_if_t<std::is_constructible<E, const G &>::value> * =
+ nullptr,
+ detail::enable_if_t<!std::is_convertible<const G &, E>::value> * =
+ nullptr>
+ explicit constexpr expected(const unexpected<G> &e)
+ : impl_base(unexpect, e.value()),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, const G &>::value> * =
+ nullptr,
+ detail::enable_if_t<std::is_convertible<const G &, E>::value> * = nullptr>
+ constexpr expected(unexpected<G> const &e)
+ : impl_base(unexpect, e.value()),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \group unexpected_ctor
+ /// \synopsis EXPLICIT constexpr expected(unexpected<G> &&e);
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr,
+ detail::enable_if_t<!std::is_convertible<G &&, E>::value> * = nullptr>
+ explicit constexpr expected(unexpected<G> &&e) noexcept(
+ std::is_nothrow_constructible<E, G &&>::value)
+ : impl_base(unexpect, std::move(e.value())),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ template <
+ class G = E,
+ detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr,
+ detail::enable_if_t<std::is_convertible<G &&, E>::value> * = nullptr>
+ constexpr expected(unexpected<G> &&e) noexcept(
+ std::is_nothrow_constructible<E, G &&>::value)
+ : impl_base(unexpect, std::move(e.value())),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // In-place construction of the error value from `args`.
+ template <class... Args,
+ detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * =
+ nullptr>
+ constexpr explicit expected(unexpect_t, Args &&... args)
+ : impl_base(unexpect, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ /// \exclude
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_constructible<
+ E, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ constexpr explicit expected(unexpect_t, std::initializer_list<U> il,
+ Args &&... args)
+ : impl_base(unexpect, il, std::forward<Args>(args)...),
+ ctor_base(detail::default_constructor_tag{}) {}
+
+ // Converting copy/move constructors from expected<U, G>; explicit unless
+ // both the U->T and G->E conversions are implicit. Each copies/moves the
+ // value or the error depending on the source's state.
+ template <class U, class G,
+ detail::enable_if_t<!(std::is_convertible<U const &, T>::value &&
+ std::is_convertible<G const &, E>::value)> * =
+ nullptr,
+ detail::expected_enable_from_other<T, E, U, G, const U &, const G &>
+ * = nullptr>
+ explicit TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(*rhs);
+ } else {
+ this->construct_error(rhs.error());
+ }
+ }
+
+ /// \exclude
+ template <class U, class G,
+ detail::enable_if_t<(std::is_convertible<U const &, T>::value &&
+ std::is_convertible<G const &, E>::value)> * =
+ nullptr,
+ detail::expected_enable_from_other<T, E, U, G, const U &, const G &>
+ * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(*rhs);
+ } else {
+ this->construct_error(rhs.error());
+ }
+ }
+
+ template <
+ class U, class G,
+ detail::enable_if_t<!(std::is_convertible<U &&, T>::value &&
+ std::is_convertible<G &&, E>::value)> * = nullptr,
+ detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr>
+ explicit TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(std::move(*rhs));
+ } else {
+ this->construct_error(std::move(rhs.error()));
+ }
+ }
+
+ /// \exclude
+ template <
+ class U, class G,
+ detail::enable_if_t<(std::is_convertible<U &&, T>::value &&
+ std::is_convertible<G &&, E>::value)> * = nullptr,
+ detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs)
+ : ctor_base(detail::default_constructor_tag{}) {
+ if (rhs.has_value()) {
+ this->construct(std::move(*rhs));
+ } else {
+ this->construct_error(std::move(rhs.error()));
+ }
+ }
+
+ // Forwarding value constructors: wrap `v` as the success value; explicit
+ // when U is not implicitly convertible to T.
+ template <
+ class U = T,
+ detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr,
+ detail::expected_enable_forward_value<T, E, U> * = nullptr>
+ explicit TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v)
+ : expected(in_place, std::forward<U>(v)) {}
+
+ /// \exclude
+ template <
+ class U = T,
+ detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr,
+ detail::expected_enable_forward_value<T, E, U> * = nullptr>
+ TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v)
+ : expected(in_place, std::forward<U>(v)) {}
+
+ // Assignment from a value convertible to T. The nothrow-constructible
+ // overload can destroy the error and construct directly; the second
+ // overload saves the old error and restores it if construction throws.
+ template <
+ class U = T, class G = T,
+ detail::enable_if_t<std::is_nothrow_constructible<T, U &&>::value> * =
+ nullptr,
+ detail::enable_if_t<!std::is_void<G>::value> * = nullptr,
+ detail::enable_if_t<
+ (!std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+ !detail::conjunction<std::is_scalar<T>,
+ std::is_same<T, detail::decay_t<U>>>::value &&
+ std::is_constructible<T, U>::value &&
+ std::is_assignable<G &, U>::value &&
+ std::is_nothrow_move_constructible<E>::value)> * = nullptr>
+ expected &operator=(U &&v) {
+ if (has_value()) {
+ val() = std::forward<U>(v);
+ } else {
+ err().~unexpected<E>();
+ ::new (valptr()) T(std::forward<U>(v));
+ this->m_has_val = true;
+ }
+
+ return *this;
+ }
+
+ /// \exclude
+ // NOTE(review): `v` is a forwarding reference, yet the error-state path
+ // below uses `std::move(v)` rather than `std::forward<U>(v)` (compare the
+ // overload above); for lvalue `U` this moves from the caller's argument.
+ // Looks like std::forward was intended — confirm against upstream
+ // tl::expected before changing vendored code.
+ template <
+ class U = T, class G = T,
+ detail::enable_if_t<!std::is_nothrow_constructible<T, U &&>::value> * =
+ nullptr,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr,
+ detail::enable_if_t<
+ (!std::is_same<expected<T, E>, detail::decay_t<U>>::value &&
+ !detail::conjunction<std::is_scalar<T>,
+ std::is_same<T, detail::decay_t<U>>>::value &&
+ std::is_constructible<T, U>::value &&
+ std::is_assignable<G &, U>::value &&
+ std::is_nothrow_move_constructible<E>::value)> * = nullptr>
+ expected &operator=(U &&v) {
+ if (has_value()) {
+ val() = std::forward<U>(v);
+ } else {
+ auto tmp = std::move(err());
+ err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+ try {
+ ::new (valptr()) T(std::move(v));
+ this->m_has_val = true;
+ } catch (...) {
+ err() = std::move(tmp);
+ throw;
+ }
+ #else
+ ::new (valptr()) T(std::move(v));
+ this->m_has_val = true;
+ #endif
+ }
+
+ return *this;
+ }
+
+ // Assignment from unexpected<G>: overwrites the error, destroying any
+ // stored value first. Constrained to nothrow copy/move construction of
+ // the error type so the state transition cannot throw midway.
+ template <class G = E,
+ detail::enable_if_t<std::is_nothrow_copy_constructible<G>::value &&
+ std::is_assignable<G &, G>::value> * = nullptr>
+ expected &operator=(const unexpected<G> &rhs) {
+ if (!has_value()) {
+ err() = rhs;
+ } else {
+ this->destroy_val();
+ ::new (errptr()) unexpected<E>(rhs);
+ this->m_has_val = false;
+ }
+
+ return *this;
+ }
+
+ template <class G = E,
+ detail::enable_if_t<std::is_nothrow_move_constructible<G>::value &&
+ std::is_move_assignable<G>::value> * = nullptr>
+ expected &operator=(unexpected<G> &&rhs) noexcept {
+ if (!has_value()) {
+ err() = std::move(rhs);
+ } else {
+ this->destroy_val();
+ ::new (errptr()) unexpected<E>(std::move(rhs));
+ this->m_has_val = false;
+ }
+
+ return *this;
+ }
+
+ // (Re)constructs the contained value in place. The nothrow overloads can
+ // destroy the error and construct directly; the potentially-throwing ones
+ // save the old error so it can be restored if construction throws.
+ template <class... Args, detail::enable_if_t<std::is_nothrow_constructible<
+ T, Args &&...>::value> * = nullptr>
+ void emplace(Args &&... args) {
+ if (has_value()) {
+ val() = T(std::forward<Args>(args)...);
+ } else {
+ err().~unexpected<E>();
+ ::new (valptr()) T(std::forward<Args>(args)...);
+ this->m_has_val = true;
+ }
+ }
+
+ /// \exclude
+ template <class... Args, detail::enable_if_t<!std::is_nothrow_constructible<
+ T, Args &&...>::value> * = nullptr>
+ void emplace(Args &&... args) {
+ if (has_value()) {
+ val() = T(std::forward<Args>(args)...);
+ } else {
+ auto tmp = std::move(err());
+ err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+ try {
+ ::new (valptr()) T(std::forward<Args>(args)...);
+ this->m_has_val = true;
+ } catch (...) {
+ err() = std::move(tmp);
+ throw;
+ }
+ #else
+ ::new (valptr()) T(std::forward<Args>(args)...);
+ this->m_has_val = true;
+ #endif
+ }
+ }
+
+ template <class U, class... Args,
+ detail::enable_if_t<std::is_nothrow_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ void emplace(std::initializer_list<U> il, Args &&... args) {
+ if (has_value()) {
+ T t(il, std::forward<Args>(args)...);
+ val() = std::move(t);
+ } else {
+ err().~unexpected<E>();
+ ::new (valptr()) T(il, std::forward<Args>(args)...);
+ this->m_has_val = true;
+ }
+ }
+
+ /// \exclude
+ template <class U, class... Args,
+ detail::enable_if_t<!std::is_nothrow_constructible<
+ T, std::initializer_list<U> &, Args &&...>::value> * = nullptr>
+ void emplace(std::initializer_list<U> il, Args &&... args) {
+ if (has_value()) {
+ T t(il, std::forward<Args>(args)...);
+ val() = std::move(t);
+ } else {
+ auto tmp = std::move(err());
+ err().~unexpected<E>();
+
+ #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED
+ try {
+ ::new (valptr()) T(il, std::forward<Args>(args)...);
+ this->m_has_val = true;
+ } catch (...) {
+ err() = std::move(tmp);
+ throw;
+ }
+ #else
+ ::new (valptr()) T(il, std::forward<Args>(args)...);
+ this->m_has_val = true;
+ #endif
+ }
+ }
+
+ // TODO SFINAE
+ // Exchanges contents with `rhs`. Same-state pairs swap value-with-value or
+ // error-with-error via ADL swap; mixed-state pairs move the error aside,
+ // reconstruct both sides, and flip the engaged flags.
+ void swap(expected &rhs) noexcept(
+ std::is_nothrow_move_constructible<T>::value &&noexcept(
+ swap(std::declval<T &>(), std::declval<T &>())) &&
+ std::is_nothrow_move_constructible<E>::value &&
+ noexcept(swap(std::declval<E &>(), std::declval<E &>()))) {
+ if (has_value() && rhs.has_value()) {
+ using std::swap;
+ swap(val(), rhs.val());
+ } else if (!has_value() && rhs.has_value()) {
+ using std::swap;
+ swap(err(), rhs.err());
+ } else if (has_value()) {
+ auto temp = std::move(rhs.err());
+ ::new (rhs.valptr()) T(val());
+ ::new (errptr()) unexpected_type(std::move(temp));
+ std::swap(this->m_has_val, rhs.m_has_val);
+ } else {
+ auto temp = std::move(this->err());
+ ::new (valptr()) T(rhs.val());
+ ::new (errptr()) unexpected_type(std::move(temp));
+ std::swap(this->m_has_val, rhs.m_has_val);
+ }
+ }
+
+ /// \returns a pointer to the stored value
+ /// \requires a value is stored
+ /// (unchecked access: no test of has_value() is performed here)
+ /// \group pointer
+ constexpr const T *operator->() const { return valptr(); }
+ /// \group pointer
+ TL_EXPECTED_11_CONSTEXPR T *operator->() { return valptr(); }
+
+ /// \returns the stored value
+ /// \requires a value is stored
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ constexpr const U &operator*() const & {
+ return val();
+ }
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &operator*() & {
+ return val();
+ }
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ constexpr const U &&operator*() const && {
+ return std::move(val());
+ }
+ /// \group deref
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &&operator*() && {
+ return std::move(val());
+ }
+
+ /// \returns whether or not the expected has a value
+ /// \group has_value
+ constexpr bool has_value() const noexcept { return this->m_has_val; }
+ /// \group has_value
+ constexpr explicit operator bool() const noexcept { return this->m_has_val; }
+
+ /// \returns the contained value if there is one, otherwise throws
+ /// [bad_expected_access]
+ /// (checked access: the exception carries a copy of the stored error)
+ ///
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR const U &value() const & {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return val();
+ }
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &value() & {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return val();
+ }
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR const U &&value() const && {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return std::move(val());
+ }
+ /// \group value
+ template <class U = T,
+ detail::enable_if_t<!std::is_void<U>::value> * = nullptr>
+ TL_EXPECTED_11_CONSTEXPR U &&value() && {
+ if (!has_value())
+ detail::throw_exception(bad_expected_access<E>(err().value()));
+ return std::move(val());
+ }
+
+ /// \returns the unexpected value
+ /// \requires there is an unexpected value
+ /// (unchecked: unlike value(), no test or throw occurs if a value is stored)
+ /// \group error
+ constexpr const E &error() const & { return err().value(); }
+ /// \group error
+ TL_EXPECTED_11_CONSTEXPR E &error() & { return err().value(); }
+ /// \group error
+ constexpr const E &&error() const && { return std::move(err().value()); }
+ /// \group error
+ TL_EXPECTED_11_CONSTEXPR E &&error() && { return std::move(err().value()); }
+
+ /// \returns the stored value if there is one, otherwise returns `u`
+ /// \group value_or
+ template <class U> constexpr T value_or(U &&v) const & {
+ static_assert(std::is_copy_constructible<T>::value &&
+ std::is_convertible<U &&, T>::value,
+ "T must be copy-constructible and convertible to from U&&");
+ return bool(*this) ? **this : static_cast<T>(std::forward<U>(v));
+ }
+ /// \group value_or
+ template <class U> TL_EXPECTED_11_CONSTEXPR T value_or(U &&v) && {
+ static_assert(std::is_move_constructible<T>::value &&
+ std::is_convertible<U &&, T>::value,
+ "T must be move-constructible and convertible to from U&&");
+ return bool(*this) ? std::move(**this) : static_cast<T>(std::forward<U>(v));
+ }
+};
+
+/// \exclude
+namespace detail {
+// Convenience aliases: value/error types of a (possibly cv-ref qualified)
+// expected, and the rebound result type used by the map-style helpers below.
+template <class Exp> using exp_t = typename detail::decay_t<Exp>::value_type;
+template <class Exp> using err_t = typename detail::decay_t<Exp>::error_type;
+template <class Exp, class Ret> using ret_t = expected<Ret, err_t<Exp>>;
+
+#ifdef TL_EXPECTED_CXX14
+// and_then_impl: invokes `f` with the contained value (with no argument for
+// expected<void>), requiring `f` itself to return an expected; an error in
+// `exp` is propagated unchanged as Ret(unexpect, ...).
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>()))>
+constexpr auto and_then_impl(Exp &&exp, F &&f) {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+ return exp.has_value()
+ ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp))
+ : Ret(unexpect, exp.error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>()))>
+constexpr auto and_then_impl(Exp &&exp, F &&f) {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+ return exp.has_value() ? detail::invoke(std::forward<F>(f))
+ : Ret(unexpect, exp.error());
+}
+#else
+// NOTE(review): `TC` appears unused in the visible code — likely a leftover
+// type-debugging aid; confirm against the full header before removing.
+template <class> struct TC;
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr>
+auto and_then_impl(Exp &&exp, F &&f) -> Ret {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+ return exp.has_value()
+ ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp))
+ : Ret(unexpect, exp.error());
+}
+
+template <class Exp, class F,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr>
+constexpr auto and_then_impl(Exp &&exp, F &&f) -> Ret {
+ static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+
+ return exp.has_value() ? detail::invoke(std::forward<F>(f))
+ : Ret(unexpect, exp.error());
+}
+#endif
+
+#ifdef TL_EXPECTED_CXX14
+// expected_map_impl: applies `f` to the contained value (with no argument for
+// expected<void>); the four overloads cover value/void `exp` crossed with
+// value/void-returning `f`. A void-returning `f` yields expected<void, E>;
+// errors are rewrapped unchanged.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f),
+ *std::forward<Exp>(exp)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = expected<void, err_t<Exp>>;
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp));
+ return result();
+ }
+
+ return result(unexpect, std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto expected_map_impl(Exp &&exp, F &&f) {
+ using result = expected<void, err_t<Exp>>;
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f));
+ return result();
+ }
+
+ return result(unexpect, std::forward<Exp>(exp).error());
+}
+#else
+// Pre-C++14 equivalents of the four overloads above, with explicit trailing
+// return types in place of return-type deduction.
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+
+constexpr auto expected_map_impl(Exp &&exp, F &&f)
+ -> ret_t<Exp, detail::decay_t<Ret>> {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f),
+ *std::forward<Exp>(exp)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ *std::declval<Exp>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+
+auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> {
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp));
+ return {};
+ }
+
+ return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+
+constexpr auto expected_map_impl(Exp &&exp, F &&f)
+ -> ret_t<Exp, detail::decay_t<Ret>> {
+ using result = ret_t<Exp, detail::decay_t<Ret>>;
+
+ return exp.has_value() ? result(detail::invoke(std::forward<F>(f)))
+ : result(unexpect, std::forward<Exp>(exp).error());
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+
+auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> {
+ if (exp.has_value()) {
+ detail::invoke(std::forward<F>(f));
+ return {};
+ }
+
+ return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error());
+}
+#endif
+
+#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
+ !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+ return exp.has_value()
+ ? result(*std::forward<Exp>(exp))
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result(*std::forward<Exp>(exp));
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+ return exp.has_value()
+ ? result()
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+template <class Exp, class F,
+ detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result();
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+#else
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f)
+ -> expected<exp_t<Exp>, detail::decay_t<Ret>> {
+ using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+
+ return exp.has_value()
+ ? result(*std::forward<Exp>(exp))
+ : result(unexpect, detail::invoke(std::forward<F>(f),
+ std::forward<Exp>(exp).error()));
+}
+
+template <class Exp, class F,
+ detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr,
+ class Ret = decltype(detail::invoke(std::declval<F>(),
+ std::declval<Exp>().error())),
+ detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> {
+ using result = expected<exp_t<Exp>, monostate>;
+ if (exp.has_value()) {
+ return result(*std::forward<Exp>(exp));
+ }
+
+ detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+ return result(unexpect, monostate{});
+}
+
+// map_error_impl (pre-C++14, T void, F(error) -> non-void): transform the
+// error with f; a valued expected<void, E> maps to a default-constructed one.
+template <class Exp, class F,
+          detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto map_error_impl(Exp &&exp, F &&f)
+    -> expected<exp_t<Exp>, detail::decay_t<Ret>> {
+  using result = expected<exp_t<Exp>, detail::decay_t<Ret>>;
+
+  return exp.has_value()
+             ? result()
+             : result(unexpect, detail::invoke(std::forward<F>(f),
+                                               std::forward<Exp>(exp).error()));
+}
+
+// map_error_impl (pre-C++14, T void, F(error) -> void): invoke f for its side
+// effect only; both channels collapse (value -> void, error -> monostate).
+template <class Exp, class F,
+          detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> {
+  using result = expected<exp_t<Exp>, monostate>;
+  if (exp.has_value()) {
+    return result();
+  }
+
+  detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+  return result(unexpect, monostate{});
+}
+#endif
+
+#ifdef TL_EXPECTED_CXX14
+// or_else_impl (C++14, F(error) returns an expected): on error, delegate to
+// f and return its expected; on success, forward the original unchanged.
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+constexpr auto or_else_impl(Exp &&exp, F &&f) {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+  return exp.has_value()
+             ? std::forward<Exp>(exp)
+             : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+}
+
+// or_else_impl (C++14, F(error) -> void): invoke f for its side effect on
+// error, then forward the original expected either way (comma operator).
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) {
+  return exp.has_value()
+             ? std::forward<Exp>(exp)
+             : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()),
+                std::forward<Exp>(exp));
+}
+#else
+// or_else_impl (pre-C++14 spelling, F(error) returns an expected): same as
+// the C++14 overload, with the return type spelled out as Ret.
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+auto or_else_impl(Exp &&exp, F &&f) -> Ret {
+  static_assert(detail::is_expected<Ret>::value, "F must return an expected");
+  return exp.has_value()
+             ? std::forward<Exp>(exp)
+             : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error());
+}
+
+// or_else_impl (pre-C++14, F(error) -> void): side-effect-only handler; the
+// original expected is forwarded through unchanged.
+template <class Exp, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              std::declval<Exp>().error())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) {
+  return exp.has_value()
+             ? std::forward<Exp>(exp)
+             : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()),
+                std::forward<Exp>(exp));
+}
+#endif
+} // namespace detail
+
+// Two expecteds are equal iff they are in the same state and the active
+// member (value or error) compares equal.
+template <class T, class E, class U, class F>
+constexpr bool operator==(const expected<T, E> &lhs,
+                          const expected<U, F> &rhs) {
+  return (lhs.has_value() != rhs.has_value())
+             ? false
+             : (!lhs.has_value() ? lhs.error() == rhs.error() : *lhs == *rhs);
+}
+// Negation of the above, expressed via the members' own operator!=.
+template <class T, class E, class U, class F>
+constexpr bool operator!=(const expected<T, E> &lhs,
+                          const expected<U, F> &rhs) {
+  return (lhs.has_value() != rhs.has_value())
+             ? true
+             : (!lhs.has_value() ? lhs.error() != rhs.error() : *lhs != *rhs);
+}
+
+// Compare an expected against a plain value: equal only when the expected
+// holds a value that compares equal to v; an error never equals a value.
+template <class T, class E, class U>
+constexpr bool operator==(const expected<T, E> &x, const U &v) {
+  return x.has_value() ? *x == v : false;
+}
+template <class T, class E, class U>
+constexpr bool operator==(const U &v, const expected<T, E> &x) {
+  return x.has_value() ? *x == v : false;
+}
+template <class T, class E, class U>
+constexpr bool operator!=(const expected<T, E> &x, const U &v) {
+  return x.has_value() ? *x != v : true;
+}
+template <class T, class E, class U>
+constexpr bool operator!=(const U &v, const expected<T, E> &x) {
+  return x.has_value() ? *x != v : true;
+}
+
+// Compare an expected against an unexpected<E>: equal only when the expected
+// is in the error state and the stored errors compare equal.
+template <class T, class E>
+constexpr bool operator==(const expected<T, E> &x, const unexpected<E> &e) {
+  return x.has_value() ? false : x.error() == e.value();
+}
+template <class T, class E>
+constexpr bool operator==(const unexpected<E> &e, const expected<T, E> &x) {
+  return x.has_value() ? false : x.error() == e.value();
+}
+template <class T, class E>
+constexpr bool operator!=(const expected<T, E> &x, const unexpected<E> &e) {
+  return x.has_value() ? true : x.error() != e.value();
+}
+template <class T, class E>
+constexpr bool operator!=(const unexpected<E> &e, const expected<T, E> &x) {
+  return x.has_value() ? true : x.error() != e.value();
+}
+
+// TODO is_swappable
+template <class T, class E,
+ detail::enable_if_t<std::is_move_constructible<T>::value &&
+ std::is_move_constructible<E>::value> * = nullptr>
+void swap(expected<T, E> &lhs,
+ expected<T, E> &rhs) noexcept(noexcept(lhs.swap(rhs))) {
+ lhs.swap(rhs);
+}
+} // namespace tl
+
+#define TL_OPTIONAL_EXPECTED_MUTEX
+#endif
diff --git a/src/include/filepath.h b/src/include/filepath.h
new file mode 100644
index 000000000..d0965ad0c
--- /dev/null
+++ b/src/include/filepath.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEPATH_H
+#define CEPH_FILEPATH_H
+
+/*
+ * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string.
+ * -> should it be different? how? should this[0] be "", with depth 4?
+ *
+ */
+
+
+#include <iosfwd>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "buffer.h"
+#include "encoding.h"
+#include "include/types.h"
+#include "include/fs_types.h"
+
+#include "common/Formatter.h"
+
+
+// filepath -- an (ino, relative path) pair naming a location in the
+// filesystem namespace.  ino==0 means a purely relative path, ino==1 an
+// absolute path (rooted at '/'), any other ino a path relative to that
+// inode.  The path is stored as one slash-separated string; the
+// per-component vector `bits` is a lazily built cache of its segments.
+class filepath {
+  inodeno_t ino = 0;     // base inode.  ino=0 implies pure relative path.
+  std::string path;      // relative path.
+
+  /** bits - path segments
+   * this is ['a', 'b', 'c'] for both the absolute and relative case.
+   *
+   * NOTE: this value is LAZILY maintained... i.e. it's a cache
+   */
+  mutable std::vector<std::string> bits;
+  bool encoded = false;  // true if this path came off the wire (decode());
+                         // empty components are then preserved verbatim.
+
+  // Rebuild `path` from the cached `bits` vector.
+  void rebuild_path() {
+    path.clear();
+    for (unsigned i = 0; i < bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+  // Split `path` on '/' into `bits`.  Empty components are dropped unless
+  // the path was decoded from the wire (see `encoded`).
+  void parse_bits() const {
+    bits.clear();
+    int off = 0;
+    while (off < (int)path.length()) {
+      int nextslash = path.find('/', off);
+      if (nextslash < 0)
+        nextslash = path.length();  // no more slashes
+      if (((nextslash - off) > 0) || encoded) {
+        // skip empty components unless they were introduced deliberately
+        // see commit message for more detail
+        bits.push_back(path.substr(off, nextslash - off));
+      }
+      off = nextslash + 1;
+    }
+  }
+  // Ensure the lazy `bits` cache reflects the current `path`.
+  void ensure_bits() const {
+    if (bits.empty() && path.length() > 0)
+      parse_bits();
+  }
+
+ public:
+  filepath() = default;
+  filepath(std::string_view p, inodeno_t i) : ino(i), path(p) {}
+  filepath(const filepath& o) = default;
+  filepath(inodeno_t i) : ino(i) {}
+  filepath& operator=(const char* path) {
+    set_path(path);
+    return *this;
+  }
+
+  /*
+   * if we are fed a relative path as a string, either set ino=0 (strictly
+   * relative) or 1 (absolute). throw out any leading '/'.
+   */
+  filepath(std::string_view s) { set_path(s); }
+  filepath(const char* s) { set_path(s); }
+
+  // Replace both the path and the base ino wholesale.
+  void set_path(std::string_view s, inodeno_t b) {
+    path = s;
+    ino = b;
+    bits.clear();  // invalidate the cached segments of the previous path
+  }
+  void set_path(std::string_view s) {
+    // note the empty() guard: operator[] on an empty string_view is UB
+    if (!s.empty() && s[0] == '/') {
+      path = s.substr(1);
+      ino = 1;
+    } else {
+      ino = 0;
+      path = s;
+    }
+    bits.clear();
+  }
+
+
+  // accessors
+  inodeno_t get_ino() const { return ino; }
+  const std::string& get_path() const { return path; }
+  const char *c_str() const { return path.c_str(); }
+
+  int length() const { return path.length(); }
+  // Number of path components (parses lazily).
+  unsigned depth() const {
+    ensure_bits();
+    return bits.size();
+  }
+  bool empty() const { return path.length() == 0 && ino == 0; }
+
+  bool absolute() const { return ino == 1; }
+  bool pure_relative() const { return ino == 0; }
+  bool ino_relative() const { return ino > 0; }
+
+  // i-th path component; caller must ensure i < depth().
+  const std::string& operator[](int i) const {
+    ensure_bits();
+    return bits[i];
+  }
+
+  // Final path component; asserts the path is non-empty.
+  const std::string& last_dentry() const {
+    ensure_bits();
+    ceph_assert(!bits.empty());
+    return bits[bits.size() - 1];
+  }
+
+  // First s components as a new path sharing this base ino.
+  filepath prefixpath(int s) const {
+    ensure_bits();  // bits may not have been parsed yet
+    filepath t(ino);
+    for (int i = 0; i < s; i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+  // Everything after the first s components, as a pure-relative path.
+  filepath postfixpath(int s) const {
+    ensure_bits();
+    filepath t;
+    for (unsigned i = s; i < bits.size(); i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+
+
+  // modifiers
+  // string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1)
+  void _set_ino(inodeno_t i) { ino = i; }
+  void clear() {
+    ino = 0;
+    path = "";
+    bits.clear();
+  }
+
+  // Drop the last component, keeping path and cache in sync.
+  void pop_dentry() {
+    ensure_bits();
+    bits.pop_back();
+    rebuild_path();
+  }
+  // Append one component, keeping path and cache in sync.
+  void push_dentry(std::string_view s) {
+    ensure_bits();
+    if (!bits.empty())
+      path += "/";
+    path += s;
+    bits.emplace_back(s);
+  }
+  void push_dentry(const std::string& s) {
+    push_dentry(std::string_view(s));
+  }
+  void push_dentry(const char *cs) {
+    push_dentry(std::string_view(cs, strlen(cs)));
+  }
+  // Prepend one component and rebuild the string form.
+  void push_front_dentry(const std::string& s) {
+    bits.insert(bits.begin(), s);
+    rebuild_path();
+  }
+  // Append all components of a (which must be pure-relative).
+  void append(const filepath& a) {
+    ceph_assert(a.pure_relative());
+    for (unsigned i = 0; i < a.depth(); i++)
+      push_dentry(a[i]);
+  }
+
+  // encoding
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    __u8 struct_v = 1;
+    encode(struct_v, bl);
+    encode(ino, bl);
+    encode(path, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& blp) {
+    using ceph::decode;
+    bits.clear();
+    __u8 struct_v;
+    decode(struct_v, blp);
+    decode(ino, blp);
+    decode(path, blp);
+    encoded = true;  // preserve empty components from here on
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("base_ino", ino);
+    f->dump_string("relative_path", path);
+  }
+  static void generate_test_instances(std::list<filepath*>& o) {
+    o.push_back(new filepath);
+    o.push_back(new filepath("/usr/bin", 0));
+    o.push_back(new filepath("/usr/sbin", 1));
+    o.push_back(new filepath("var/log", 1));
+    o.push_back(new filepath("foo/bar", 101));
+  }
+
+  // Does the path end in "." or ".."?
+  bool is_last_dot_or_dotdot() const {
+    if (depth() > 0) {
+      std::string dname = last_dentry();
+      if (dname == "." || dname == "..") {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  // True when the first component is empty (only possible for decoded
+  // paths, which keep empty components).
+  bool is_last_snap() const {
+    // walk into snapdir?
+    return depth() > 0 && bits[0].length() == 0;
+  }
+};
+
+WRITE_CLASS_ENCODER(filepath)
+
+// Print as "#<ino>/rel/path" for ino-relative paths, or just the relative
+// path when ino==0.
+inline std::ostream& operator<<(std::ostream& out, const filepath& path)
+{
+  if (path.get_ino()) {
+    out << '#' << path.get_ino();
+    if (path.length())
+      out << '/';
+  }
+  return out << path.get_path();
+}
+
+#endif
diff --git a/src/include/frag.h b/src/include/frag.h
new file mode 100644
index 000000000..ec18bddfb
--- /dev/null
+++ b/src/include/frag.h
@@ -0,0 +1,615 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FRAG_H
+#define CEPH_FRAG_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "buffer.h"
+#include "compact_map.h"
+
+#include "ceph_frag.h"
+#include "include/encoding.h"
+#include "include/ceph_assert.h"
+
+#include "common/dout.h"
+
+/*
+ *
+ * the goal here is to use a binary split strategy to partition a namespace.
+ * frag_t represents a particular fragment. bits() tells you the size of the
+ * fragment, and value() it's name. this is roughly analogous to an ip address
+ * and netmask.
+ *
+ * fragtree_t represents an entire namespace and it's partition. it essentially
+ * tells you where fragments are split into other fragments, and by how much
+ * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
+ *
+ * this vaguely resembles a btree, in that when a fragment becomes large or small
+ * we can split or merge, except that there is no guarantee of being balanced.
+ *
+ * presumably we are partitioning the output of a (perhaps specialized) hash
+ * function.
+ */
+
+/**
+ * frag_t
+ *
+ * description of an individual fragment. that is, a particular piece
+ * of the overall namespace.
+ *
+ * this is conceptually analogous to an ip address and netmask.
+ *
+ * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
+ *
+ * we write it as v/b, where v is a value and b is the number of bits.
+ * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
+ * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
+ *
+ * this makes the right most bit of v the "most significant", which is the
+ * opposite of what we usually see.
+ */
+
+/*
+ * TODO:
+ * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
+ * iteration efficient (see, e.g., try_assimilate_children()
+ * - rework frag_t so that we mask the left-most (most significant) bits instead of
+ * the right-most (least significant) bits. just because it's more intuitive, and
+ * matches the network/netmask concept.
+ */
+
+// frag_t: a single namespace fragment, written v/b (value / bits).  All
+// bit-twiddling is delegated to the ceph_frag_* helpers from ceph_frag.h.
+class frag_t {
+  /*
+   * encoding is dictated by frag_* functions in ceph_fs.h. use those
+   * helpers _exclusively_.
+   */
+public:
+  using _frag_t = uint32_t;
+
+  frag_t() = default;
+  // construct from (value, bits); packed by ceph_frag_make
+  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+  // construct from the packed wire representation
+  frag_t(_frag_t e) : _enc(e) { }
+
+  // constructors
+  void from_unsigned(unsigned e) { _enc = e; }
+
+  // accessors
+  unsigned value() const { return ceph_frag_value(_enc); }
+  unsigned bits() const { return ceph_frag_bits(_enc); }
+  unsigned mask() const { return ceph_frag_mask(_enc); }
+  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+  // implicit conversion to the packed representation
+  operator _frag_t() const { return _enc; }
+
+  // tests
+  // does hash value v fall within this fragment?
+  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+  // is fragment sub fully contained in this fragment?
+  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+  // bits()==0 means the whole namespace (0/0)
+  bool is_root() const { return bits() == 0; }
+  frag_t parent() const {
+    ceph_assert(bits() > 0);
+    return frag_t(ceph_frag_parent(_enc));
+  }
+
+  // splitting
+  // i-th of the 2^nb children obtained by splitting this frag by nb bits
+  frag_t make_child(int i, int nb) const {
+    ceph_assert(i < (1<<nb));
+    return frag_t(ceph_frag_make_child(_enc, nb, i));
+  }
+  // append all 2^nb children to `fragments` (any push_back-able container)
+  template<typename T>
+  void split(int nb, T& fragments) const {
+    ceph_assert(nb > 0);
+    unsigned nway = 1 << nb;
+    for (unsigned i=0; i<nway; i++)
+      fragments.push_back(make_child(i, nb));
+  }
+
+  // binary splitting
+  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+  bool is_left() const { return ceph_frag_is_left_child(_enc); }
+  bool is_right() const { return ceph_frag_is_right_child(_enc); }
+  frag_t get_sibling() const {
+    ceph_assert(!is_root());
+    return frag_t(ceph_frag_sibling(_enc));
+  }
+
+  // sequencing (left-to-right traversal of same-level frags)
+  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+  frag_t next() const {
+    ceph_assert(!is_rightmost());
+    return frag_t(ceph_frag_next(_enc));
+  }
+
+  // parse "value/bits" with value in hex; returns false on malformed input
+  bool parse(const char *s) {
+    int pvalue, pbits;
+    int r = sscanf(s, "%x/%d", &pvalue, &pbits);
+    if (r == 2) {
+      *this = frag_t(pvalue, pbits);
+      return true;
+    }
+    return false;
+  }
+
+  // raw (headerless) encoding of the packed 32-bit representation
+  void encode(ceph::buffer::list& bl) const {
+    ceph::encode_raw(_enc, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    __u32 v;
+    ceph::decode_raw(v, p);
+    _enc = v;
+  }
+  // ordering: by value first, then by bits
+  bool operator<(const frag_t& b) const
+  {
+    if (value() != b.value())
+      return value() < b.value();
+    else
+      return bits() < b.bits();
+  }
+private:
+  _frag_t _enc = 0;
+};
+WRITE_CLASS_ENCODER(frag_t)
+
+// Print a frag as its significant bits followed by '*' (e.g. "01*").
+// Starts at bit 23, which presumably matches a 24-bit frag value space in
+// the ceph_frag_* encoding -- TODO confirm against ceph_frag.h.
+inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
+{
+  //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
+  unsigned num = hb.bits();
+  if (num) {
+    unsigned val = hb.value();
+    for (unsigned bit = 23; num; num--, bit--)
+      out << ((val & (1<<bit)) ? '1':'0');
+  }
+  return out << '*';
+}
+
+
+using frag_vec_t = boost::container::small_vector<frag_t, 4>;
+
+/**
+ * fragtree_t -- partition an entire namespace into one or more frag_t's.
+ */
+// fragtree_t: a recursive partition of the namespace.  Stored sparsely as a
+// map from split-point frag to the number of bits it is split by; frags that
+// do not appear as keys are leaves.
+class fragtree_t {
+  // pairs <f, b>:
+  //  frag_t f is split by b bits.
+  //  if child frag_t does not appear, it is not split.
+public:
+  compact_map<frag_t,int32_t> _splits;
+
+public:
+  // -------------
+  // basics
+  void swap(fragtree_t& other) {
+    _splits.swap(other._splits);
+  }
+  void clear() {
+    _splits.clear();
+  }
+
+  // -------------
+  // accessors
+  bool empty() const {
+    return _splits.empty();
+  }
+  // number of bits frag hb is split by, or 0 if hb is not a split point
+  int get_split(const frag_t hb) const {
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    if (p == _splits.end())
+      return 0;
+    else
+      return p->second;
+  }
+
+
+  // is x exactly a leaf of this tree?
+  bool is_leaf(frag_t x) const {
+    frag_vec_t s;
+    get_leaves_under(x, s);
+    //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
+    return s.size() == 1 && s.front() == x;
+  }
+
+  /**
+   * get_leaves -- list all leaves
+   */
+  template<typename T>
+  void get_leaves(T& c) const {
+    return get_leaves_under_split(frag_t(), c);
+  }
+
+  /**
+   * get_leaves_under_split -- list all leaves under a known split point (or root)
+   */
+  template<typename T>
+  void get_leaves_under_split(frag_t under, T& c) const {
+    frag_vec_t s;
+    s.push_back(under);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb)
+	t.split(nb, s);   // queue up children
+      else
+	c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * get_branch -- get branch point at OR above frag @a x
+   *  - may be @a x itself, if @a x is a split
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      if (get_split(x)) return x;   // found it!
+      x = x.parent();
+    }
+  }
+
+  /**
+   * get_branch_above -- get a branch point above frag @a x
+   *  - may be root (frag_t())
+   *  - may NOT be @a x, even if @a x is a split.
+   */
+  frag_t get_branch_above(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      x = x.parent();
+      if (get_split(x)) return x;   // found it!
+    }
+  }
+
+
+  /**
+   * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
+   *  - may be @a x itself, if @a x is a split or leaf
+   *  - may be root (frag_t())
+   */
+  frag_t get_branch_or_leaf(frag_t x) const {
+    frag_t branch = get_branch(x);
+    int nb = get_split(branch);
+    if (nb > 0 &&                                  // if branch is a split, and
+	branch.bits() + nb <= x.bits())            // one of the children is or contains x
+      return frag_t(x.value(), branch.bits()+nb);  // then return that child (it's a leaf)
+    else
+      return branch;
+  }
+
+  /**
+   * get_leaves_under(x, ls) -- search for any leaves fully contained by x
+   */
+  template<typename T>
+  void get_leaves_under(frag_t x, T& c) const {
+    frag_vec_t s;
+    s.push_back(get_branch_or_leaf(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
+	  !x.contains(t))          // x does not contain t,
+	continue;         // then skip
+      int nb = get_split(t);
+      if (nb)
+	t.split(nb, s);   // queue up children
+      else if (x.contains(t))
+	c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * contains(fg) -- does fragtree contain the specific frag @a x
+   */
+  bool contains(frag_t x) const {
+    frag_vec_t s;
+    s.push_back(get_branch(x));
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
+	  !x.contains(t))          // x does not contain t,
+	continue;         // then skip
+      int nb = get_split(t);
+      if (nb) {
+	if (t == x) return false;  // it's split.
+	t.split(nb, s);   // queue up children
+      } else {
+	if (t == x) return true;   // it's there.
+      }
+    }
+    return false;
+  }
+
+  /**
+   * operator[] -- map a (hash?) value to a frag
+   */
+  frag_t operator[](unsigned v) const {
+    frag_t t;
+    while (1) {
+      ceph_assert(t.contains(v));
+      int nb = get_split(t);
+
+      // is this a leaf?
+      if (nb == 0) return t;  // done.
+
+      // pick appropriate child fragment.
+      unsigned nway = 1 << nb;
+      unsigned i;
+      for (i=0; i<nway; i++) {
+	frag_t n = t.make_child(i, nb);
+	if (n.contains(v)) {
+	  t = n;
+	  break;
+	}
+      }
+      ceph_assert(i < nway);
+    }
+  }
+
+
+  // ---------------
+  // modifiers
+  // split leaf x by b bits; optionally fold identically-split siblings into
+  // the parent afterwards (see try_assimilate_children)
+  void split(frag_t x, int b, bool simplify=true) {
+    ceph_assert(is_leaf(x));
+    _splits[x] = b;
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+  // undo a b-bit split at x (asserts it matches the recorded split)
+  void merge(frag_t x, int b, bool simplify=true) {
+    ceph_assert(!is_leaf(x));
+    ceph_assert(_splits[x] == b);
+    _splits.erase(x);
+
+    if (simplify)
+      try_assimilate_children(get_branch_above(x));
+  }
+
+  /*
+   * if all of a given split's children are identically split,
+   * then the children can be assimilated.
+   */
+  void try_assimilate_children(frag_t x) {
+    int nb = get_split(x);
+    if (!nb) return;
+    frag_vec_t children;
+    x.split(nb, children);
+    int childbits = 0;
+    for (auto& frag : children) {
+      int cb = get_split(frag);
+      if (!cb) return;  // nope.
+      if (childbits && cb != childbits) return;  // not the same
+      childbits = cb;
+    }
+    // all children are split with childbits!
+    for (auto& frag : children)
+      _splits.erase(frag);
+    _splits[x] += childbits;
+  }
+
+  // restructure the tree so x becomes a leaf; returns true if anything
+  // changed.  Inserts intermediate splits and merges away anything below x.
+  bool force_to_leaf(CephContext *cct, frag_t x) {
+    if (is_leaf(x))
+      return false;
+
+    lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
+
+    frag_t parent = get_branch_or_leaf(x);
+    ceph_assert(parent.bits() <= x.bits());
+    lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
+
+    // do we need to split from parent to x?
+    if (parent.bits() < x.bits()) {
+      int spread = x.bits() - parent.bits();
+      int nb = get_split(parent);
+      lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
+      if (nb == 0) {
+	// easy: split parent (a leaf) by the difference
+	lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
+	split(parent, spread);
+	ceph_assert(is_leaf(x));
+	return true;
+      }
+      ceph_assert(nb > spread);
+
+      // add an intermediary split
+      merge(parent, nb, false);
+      split(parent, spread, false);
+
+      frag_vec_t subs;
+      parent.split(spread, subs);
+      for (auto& frag : subs) {
+	lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
+	split(frag, nb - spread, false);
+      }
+    }
+
+    // x is now a leaf or split.
+    // hoover up any children.
+    frag_vec_t s;
+    s.push_back(x);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb) {
+	lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
+	merge(t, nb, false);  // merge this point, and
+	t.split(nb, s);       // queue up children
+      }
+    }
+
+    lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
+    ceph_assert(is_leaf(x));
+    return true;
+  }
+
+  // encoding
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(_splits, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    decode(_splits, p);
+  }
+  // encode the split entries without the container's count header
+  void encode_nohead(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
+	 p != _splits.end();
+	 ++p) {
+      encode(p->first, bl);
+      encode(p->second, bl);
+    }
+  }
+  // decode n headerless (frag, bits) entries, replacing current contents
+  void decode_nohead(int n, ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    _splits.clear();
+    while (n-- > 0) {
+      frag_t f;
+      decode(f, p);
+      decode(_splits[f], p);
+    }
+  }
+
+  // multi-line, depth-indented debug dump of the whole tree
+  void print(std::ostream& out) {
+    out << "fragtree_t(";
+    frag_vec_t s;
+    s.push_back(frag_t());
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      // newline + indent?
+      if (t.bits()) {
+	out << std::endl;
+	for (unsigned i=0; i<t.bits(); i++) out << ' ';
+      }
+      int nb = get_split(t);
+      if (nb) {
+	out << t << " %" << nb;
+	t.split(nb, s);   // queue up children
+      } else {
+	out << t;
+      }
+    }
+    out << ")";
+  }
+
+  void dump(ceph::Formatter *f) const {
+    f->open_array_section("splits");
+    for (auto p = _splits.begin(); p != _splits.end(); ++p) {
+      f->open_object_section("split");
+      std::ostringstream frag_str;
+      frag_str << p->first;
+      f->dump_string("frag", frag_str.str());
+      f->dump_int("children", p->second);
+      f->close_section(); // split
+    }
+    f->close_section(); // splits
+  }
+};
+WRITE_CLASS_ENCODER(fragtree_t)
+
+// Two fragtrees are equal iff they record exactly the same split points.
+inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
+  return l._splits == r._splits;
+}
+inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
+  return l._splits != r._splits;
+}
+
+// Compact one-line dump: "fragtree_t(f^b f^b ...)" listing each split point
+// and its bit count (contrast fragtree_t::print(), which is multi-line).
+inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
+{
+  out << "fragtree_t(";
+
+  for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
+       p != ft._splits.end();
+       ++p) {
+    if (p != ft._splits.begin())
+      out << " ";
+    out << p->first << "^" << p->second;
+  }
+  return out << ")";
+}
+
+/**
+ * fragset_t -- a set of fragments
+ */
+/**
+ * fragset_t -- a set of fragments
+ *
+ * Kept simplified: whenever both children of a frag are present they are
+ * replaced by their parent (see simplify()).
+ */
+class fragset_t {
+  std::set<frag_t> _set;
+
+public:
+  const std::set<frag_t> &get() const { return _set; }
+  std::set<frag_t>::const_iterator begin() const { return _set.begin(); }
+  std::set<frag_t>::const_iterator end() const { return _set.end(); }
+
+  bool empty() const { return _set.empty(); }
+
+  // true if f or any ancestor of f is in the set
+  bool contains(frag_t f) const {
+    while (1) {
+      if (_set.count(f)) return true;
+      if (f.bits() == 0) return false;
+      f = f.parent();
+    }
+  }
+
+  void clear() {
+    _set.clear();
+  }
+
+  // insert without re-simplifying (caller may batch-insert then simplify())
+  void insert_raw(frag_t f){
+    _set.insert(f);
+  }
+  // insert and immediately re-establish the simplified invariant
+  void insert(frag_t f) {
+    _set.insert(f);
+    simplify();
+  }
+
+  // repeatedly merge sibling pairs into their parent until stable
+  void simplify() {
+    auto it = _set.begin();
+    while (it != _set.end()) {
+      if (!it->is_root() &&
+	  _set.count(it->get_sibling())) {
+	_set.erase(it->get_sibling());
+	auto ret = _set.insert(it->parent());
+	_set.erase(it);
+	it = ret.first;   // continue from the newly inserted parent
+      } else {
+	++it;
+      }
+    }
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ceph::encode(_set, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    ceph::decode(_set, p);
+  }
+};
+WRITE_CLASS_ENCODER(fragset_t)
+
+
+// Debug dump: delegates to the std::set printer for the underlying frags.
+inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
+{
+  return out << "fragset_t(" << fs.get() << ")";
+}
+
+#endif
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
new file mode 100644
index 000000000..c1932bfcc
--- /dev/null
+++ b/src/include/fs_types.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_INCLUDE_FS_TYPES_H
+#define CEPH_INCLUDE_FS_TYPES_H
+
+#include "types.h"
+class JSONObj;
+
+#define CEPHFS_EBLOCKLISTED 108
+#define CEPHFS_EPERM 1
+#define CEPHFS_ESTALE 116
+#define CEPHFS_ENOSPC 28
+#define CEPHFS_ETIMEDOUT 110
+#define CEPHFS_EIO 5
+#define CEPHFS_ENOTCONN 107
+#define CEPHFS_EEXIST 17
+#define CEPHFS_EINTR 4
+#define CEPHFS_EINVAL 22
+#define CEPHFS_EBADF 9
+#define CEPHFS_EROFS 30
+#define CEPHFS_EAGAIN 11
+#define CEPHFS_EACCES 13
+#define CEPHFS_ELOOP 40
+#define CEPHFS_EISDIR 21
+#define CEPHFS_ENOENT 2
+#define CEPHFS_ENOTDIR 20
+#define CEPHFS_ENAMETOOLONG 36
+#define CEPHFS_EBUSY 16
+#define CEPHFS_EDQUOT 122
+#define CEPHFS_EFBIG 27
+#define CEPHFS_ERANGE 34
+#define CEPHFS_ENXIO 6
+#define CEPHFS_ECANCELED 125
+#define CEPHFS_ENODATA 61
+#define CEPHFS_EOPNOTSUPP 95
+#define CEPHFS_EXDEV 18
+#define CEPHFS_ENOMEM 12
+#define CEPHFS_ENOTRECOVERABLE 131
+#define CEPHFS_ENOSYS 38
+#define CEPHFS_EWOULDBLOCK CEPHFS_EAGAIN
+#define CEPHFS_ENOTEMPTY 39
+#define CEPHFS_EDEADLK 35
+#define CEPHFS_EDEADLOCK CEPHFS_EDEADLK
+#define CEPHFS_EDOM 33
+#define CEPHFS_EMLINK 31
+#define CEPHFS_ETIME 62
+#define CEPHFS_EOLDSNAPC 85
+#define CEPHFS_EFAULT 14
+#define CEPHFS_EISCONN 106
+#define CEPHFS_EMULTIHOP 72
+
+// taken from linux kernel: include/uapi/linux/fcntl.h
+#define CEPHFS_AT_FDCWD -100 /* Special value used to indicate
+ openat should use the current
+ working directory. */
+
+// --------------------------------------
+// ino
+
+typedef uint64_t _inodeno_t;
+
+// inodeno_t: strongly-named wrapper around a 64-bit inode number.
+// Converts implicitly to/from the raw integer; may_alias permits the
+// type-punning its callers do on wire buffers.
+struct inodeno_t {
+  _inodeno_t val;  // the raw 64-bit inode number
+  inodeno_t() : val(0) {}
+  // cppcheck-suppress noExplicitConstructor
+  inodeno_t(_inodeno_t v) : val(v) {}
+  inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; }
+  operator _inodeno_t() const { return val; }
+
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(val, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    using ceph::decode;
+    decode(val, p);
+  }
+} __attribute__ ((__may_alias__));
+WRITE_CLASS_ENCODER(inodeno_t)
+
+// denc (fast-path encode) specialization: an inodeno_t encodes exactly like
+// its raw uint64_t value.
+template<>
+struct denc_traits<inodeno_t> {
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = true;          // fixed-size encoding
+  static constexpr bool need_contiguous = true;
+  static void bound_encode(const inodeno_t &o, size_t& p) {
+    denc(o.val, p);
+  }
+  static void encode(const inodeno_t &o, ceph::buffer::list::contiguous_appender& p) {
+    denc(o.val, p);
+  }
+  static void decode(inodeno_t& o, ceph::buffer::ptr::const_iterator &p) {
+    denc(o.val, p);
+  }
+};
+
+// Print inode numbers in hex with a 0x prefix; restores decimal afterwards.
+inline std::ostream& operator<<(std::ostream& out, const inodeno_t& ino) {
+  return out << std::hex << "0x" << ino.val << std::dec;
+}
+
+namespace std {
+// std::hash specialization so inodeno_t can key unordered containers;
+// delegates to ceph's rjhash over the raw value.
+template<>
+struct hash<inodeno_t> {
+  size_t operator()( const inodeno_t& x ) const {
+    static rjhash<uint64_t> H;  // stateless functor, shared is fine
+    return H(x.val);
+  }
+};
+} // namespace std
+
+
+// file modes
+
+// A file mode is read-only when its write bit (CEPH_FILE_MODE_WR) is clear.
+inline bool file_mode_is_readonly(int mode) {
+  return (mode & CEPH_FILE_MODE_WR) == 0;
+}
+
+
+// dentries
+#define MAX_DENTRY_LEN 255
+
+// --
+namespace ceph {
+ class Formatter;
+}
+void dump(const ceph_file_layout& l, ceph::Formatter *f);
+void dump(const ceph_dir_layout& l, ceph::Formatter *f);
+
+
+
+// file_layout_t
+
+// file_layout_t: describes how a file's bytes are striped across RADOS
+// objects in a pool.  Method bodies (encode/decode/validation) live in the
+// corresponding .cc file.
+struct file_layout_t {
+  // file -> object mapping
+  uint32_t stripe_unit;   ///< stripe unit, in bytes,
+  uint32_t stripe_count;  ///< over this many objects
+  uint32_t object_size;   ///< until objects are this big
+
+  int64_t pool_id;        ///< rados pool id
+  std::string pool_ns;    ///< rados pool namespace
+
+  // defaults leave an invalid layout (all zeros, pool_id -1)
+  file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0)
+    : stripe_unit(su),
+      stripe_count(sc),
+      object_size(os),
+      pool_id(-1) {
+  }
+
+  bool operator==(const file_layout_t&) const = default;
+
+  // 4 MB stripe unit and object size, one object per stripe
+  static file_layout_t get_default() {
+    return file_layout_t(1<<22, 1, 1<<22);
+  }
+
+  // bytes written before the pattern wraps back to the first object
+  uint64_t get_period() const {
+    return static_cast<uint64_t>(stripe_count) * object_size;
+  }
+
+  void from_legacy(const ceph_file_layout& fl);
+  void to_legacy(ceph_file_layout *fl) const;
+
+  bool is_valid() const;
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<file_layout_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(file_layout_t)
+
+std::ostream& operator<<(std::ostream& out, const file_layout_t &layout);
+
+#endif
diff --git a/src/include/function2.hpp b/src/include/function2.hpp
new file mode 100644
index 000000000..613e651c7
--- /dev/null
+++ b/src/include/function2.hpp
@@ -0,0 +1,1581 @@
+
+// Copyright 2015-2018 Denis Blank <denis.blank at outlook dot com>
+// Distributed under the Boost Software License, Version 1.0
+// (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef FU2_INCLUDED_FUNCTION2_HPP_
+#define FU2_INCLUDED_FUNCTION2_HPP_
+
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+// Defines:
+// - FU2_HAS_DISABLED_EXCEPTIONS
+#if defined(FU2_WITH_DISABLED_EXCEPTIONS) || \
+ defined(FU2_MACRO_DISABLE_EXCEPTIONS)
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#else // FU2_WITH_DISABLED_EXCEPTIONS
+#if defined(_MSC_VER)
+#if !defined(_HAS_EXCEPTIONS) || (_HAS_EXCEPTIONS == 0)
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#elif defined(__clang__)
+#if !(__EXCEPTIONS && __has_feature(cxx_exceptions))
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#elif defined(__GNUC__)
+#if !__EXCEPTIONS
+#define FU2_HAS_DISABLED_EXCEPTIONS
+#endif
+#endif
+#endif // FU2_WITH_DISABLED_EXCEPTIONS
+// - FU2_HAS_NO_FUNCTIONAL_HEADER
+#if !defined(FU2_WITH_NO_FUNCTIONAL_HEADER) || \
+ !defined(FU2_NO_FUNCTIONAL_HEADER) || \
+ !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+#define FU2_HAS_NO_FUNCTIONAL_HEADER
+#include <functional>
+#endif
+// - FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#if defined(FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#else // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE
+#if defined(_MSC_VER)
+#if defined(_HAS_CXX17) && _HAS_CXX17
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#endif
+#elif defined(__cpp_noexcept_function_type)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#elif defined(__cplusplus) && (__cplusplus >= 201703L)
+#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
+#endif
+#endif // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE
+
+#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+#include <exception>
+#endif
+
+namespace fu2 {
+inline namespace abi_310 {
+namespace detail {
template <typename Config, typename Property>
class function;

/// Variadic tag type used to carry a parameter pack around as one type.
template <typename...>
struct identity {};

// Equivalent to C++17's std::void_t which targets a bug in GCC
// that prevents correct SFINAE behavior.
// See http://stackoverflow.com/questions/35753920 for details.
template <typename...>
struct deduce_to_void : std::common_type<void> {};

template <typename... T>
using void_t = typename deduce_to_void<T...>::type;

// Copy enabler helper class: deriving from copyable<false> deletes the
// derived class's copy operations while leaving moves intact.
template <bool /*Copyable*/>
struct copyable {};
template <>
struct copyable<false> {
  copyable() = default;
  ~copyable() = default;
  copyable(copyable const&) = delete;
  copyable(copyable&&) = default;
  copyable& operator=(copyable const&) = delete;
  copyable& operator=(copyable&&) = default;
};
+
/// Configuration trait to configure the function_base class.
template <bool Owning, bool Copyable, std::size_t Capacity>
struct config {
  // Is true if the function owns the erased callable
  // (false for non-owning, view-like wrappers).
  static constexpr auto const is_owning = Owning;

  // Is true if the function is copyable.
  static constexpr auto const is_copyable = Copyable;

  // The internal capacity of the function
  // used in small functor optimization.
  static constexpr auto const capacity = Capacity;
};

/// A config which isn't compatible to other configs
template <bool Throws, bool HasStrongExceptGuarantee, typename... Args>
struct property {
  // Is true when the function throws an exception on empty invocation.
  static constexpr auto const is_throwing = Throws;

  // NOTE(review): initialized from Throws, while the
  // HasStrongExceptGuarantee parameter is otherwise unused here -- this
  // looks like an upstream function2 typo. It feeds noexcept
  // specifications elsewhere, so confirm against a newer upstream
  // release before changing it.
  static constexpr auto const is_strong_exception_guaranteed = Throws;
};
+
/// Provides utilities for invoking callable objects
namespace invocation {
/// Invokes the given callable object with the given arguments
template <typename Callable, typename... Args>
constexpr auto invoke(Callable&& callable, Args&&... args) noexcept(
    noexcept(std::forward<Callable>(callable)(std::forward<Args>(args)...)))
    -> decltype(std::forward<Callable>(callable)(std::forward<Args>(args)...)) {

  return std::forward<Callable>(callable)(std::forward<Args>(args)...);
}
/// Invokes the given member function pointer by reference
template <typename T, typename Type, typename Self, typename... Args>
constexpr auto invoke(Type T::*member, Self&& self, Args&&... args) noexcept(
    noexcept((std::forward<Self>(self).*member)(std::forward<Args>(args)...)))
    -> decltype((std::forward<Self>(self).*
                 member)(std::forward<Args>(args)...)) {
  return (std::forward<Self>(self).*member)(std::forward<Args>(args)...);
}
/// Invokes the given member function pointer by pointer
template <typename T, typename Type, typename Self, typename... Args>
constexpr auto invoke(Type T::*member, Self&& self, Args&&... args) noexcept(
    noexcept((std::forward<Self>(self)->*member)(std::forward<Args>(args)...)))
    -> decltype(
        (std::forward<Self>(self)->*member)(std::forward<Args>(args)...)) {
  return (std::forward<Self>(self)->*member)(std::forward<Args>(args)...);
}
/// Invokes the given pointer to a scalar member by reference
template <typename T, typename Type, typename Self>
constexpr auto
invoke(Type T::*member,
       Self&& self) noexcept(noexcept(std::forward<Self>(self).*member))
    -> decltype(std::forward<Self>(self).*member) {
  return (std::forward<Self>(self).*member);
}
/// Invokes the given pointer to a scalar member by pointer
template <typename T, typename Type, typename Self>
constexpr auto
invoke(Type T::*member,
       Self&& self) noexcept(noexcept(std::forward<Self>(self)->*member))
    -> decltype(std::forward<Self>(self)->*member) {
  return std::forward<Self>(self)->*member;
}

/// Deduces to a true type if the callable object can be invoked with
/// the given arguments.
/// We don't use invoke here because MSVC can't evaluate the nested expression
/// SFINAE here.
// Each specialization mirrors one invoke overload above: plain call,
// member function via ref/rvalue/pointer, and data member via the same.
template <typename T, typename Args, typename = void>
struct can_invoke : std::false_type {};
template <typename T, typename... Args>
struct can_invoke<T, identity<Args...>,
                  decltype((void)std::declval<T>()(std::declval<Args>()...))>
    : std::true_type {};
template <typename Pointer, typename T, typename... Args>
struct can_invoke<Pointer, identity<T&, Args...>,
                  decltype((void)((std::declval<T&>().*std::declval<Pointer>())(
                      std::declval<Args>()...)))> : std::true_type {};
template <typename Pointer, typename T, typename... Args>
struct can_invoke<Pointer, identity<T&&, Args...>,
                  decltype(
                      (void)((std::declval<T&&>().*std::declval<Pointer>())(
                          std::declval<Args>()...)))> : std::true_type {};
template <typename Pointer, typename T, typename... Args>
struct can_invoke<Pointer, identity<T*, Args...>,
                  decltype(
                      (void)((std::declval<T*>()->*std::declval<Pointer>())(
                          std::declval<Args>()...)))> : std::true_type {};
template <typename Pointer, typename T>
struct can_invoke<Pointer, identity<T&>,
                  decltype((void)(std::declval<T&>().*std::declval<Pointer>()))>
    : std::true_type {};
template <typename Pointer, typename T>
struct can_invoke<Pointer, identity<T&&>,
                  decltype((void)(std::declval<T&&>().*
                                  std::declval<Pointer>()))> : std::true_type {
};
template <typename Pointer, typename T>
struct can_invoke<Pointer, identity<T*>,
                  decltype(
                      (void)(std::declval<T*>()->*std::declval<Pointer>()))>
    : std::true_type {};

// True when either no noexcept guarantee is required, or invoking T with
// Args is itself noexcept.
template <bool RequiresNoexcept, typename T, typename Args>
struct is_noexcept_correct : std::true_type {};
template <typename T, typename... Args>
struct is_noexcept_correct<true, T, identity<Args...>>
    : std::integral_constant<bool, noexcept(invoke(std::declval<T>(),
                                                   std::declval<Args>()...))> {
};
} // end namespace invocation
+
namespace overloading {
// Merges several callables into a single overload set via recursive
// inheritance, pulling each operator() in with a using-declaration.
template <typename... Args>
struct overload_impl;
template <typename Current, typename Next, typename... Rest>
struct overload_impl<Current, Next, Rest...> : Current,
                                               overload_impl<Next, Rest...> {
  explicit overload_impl(Current current, Next next, Rest... rest)
      : Current(std::move(current)), overload_impl<Next, Rest...>(
                                         std::move(next), std::move(rest)...) {
  }

  using Current::operator();
  using overload_impl<Next, Rest...>::operator();
};
template <typename Current>
struct overload_impl<Current> : Current {
  explicit overload_impl(Current current) : Current(std::move(current)) {
  }

  using Current::operator();
};

/// Returns one callable exposing the operator() overloads of all the
/// given callables.
template <typename... T>
constexpr auto overload(T&&... callables) {
  return overload_impl<std::decay_t<T>...>{std::forward<T>(callables)...};
}
} // namespace overloading
+
+/// Declares the namespace which provides the functionality to work with a
+/// type-erased object.
+namespace type_erasure {
/// Specialization to work with addresses of callable objects
template <typename T, typename = void>
struct address_taker {
  template <typename O>
  static void* take(O&& obj) {
    return std::addressof(obj);
  }
  // restore() casts the erased pointer back to T, preserving the
  // cv-qualification of the pointer it was handed.
  static T& restore(void* ptr) {
    return *static_cast<T*>(ptr);
  }
  static T const& restore(void const* ptr) {
    return *static_cast<T const*>(ptr);
  }
  static T volatile& restore(void volatile* ptr) {
    return *static_cast<T volatile*>(ptr);
  }
  static T const volatile& restore(void const volatile* ptr) {
    return *static_cast<T const volatile*>(ptr);
  }
};
/// Specialization to work with addresses of raw function pointers:
/// the pointer value itself is stored in the void*, not its address.
template <typename T>
struct address_taker<T, std::enable_if_t<std::is_pointer<T>::value>> {
  template <typename O>
  static void* take(O&& obj) {
    return reinterpret_cast<void*>(obj);
  }
  template <typename O>
  static T restore(O ptr) {
    return reinterpret_cast<T>(const_cast<void*>(ptr));
  }
};
+
template <typename Box>
struct box_factory;
/// Store the allocator inside the box
/// (private inheritance enables the empty-base optimization for
/// stateless allocators).
template <bool IsCopyable, typename T, typename Allocator>
struct box : private Allocator {
  friend box_factory<box>;

  T value_;

  explicit box(T value, Allocator allocator)
      : Allocator(std::move(allocator)), value_(std::move(value)) {
  }

  box(box&&) = default;
  box(box const&) = default;
  box& operator=(box&&) = default;
  box& operator=(box const&) = default;
  ~box() = default;
};
/// Non-copyable variant of the box above.
template <typename T, typename Allocator>
struct box<false, T, Allocator> : private Allocator {
  friend box_factory<box>;

  T value_;

  explicit box(T value, Allocator allocator)
      : Allocator(std::move(allocator)), value_(std::move(value)) {
  }

  box(box&&) = default;
  box(box const&) = delete;
  box& operator=(box&&) = default;
  box& operator=(box const&) = delete;
  ~box() = default;
};

template <bool IsCopyable, typename T, typename Allocator>
struct box_factory<box<IsCopyable, T, Allocator>> {
  using real_allocator =
      typename std::allocator_traits<std::decay_t<Allocator>>::
          template rebind_alloc<box<IsCopyable, T, Allocator>>;

  /// Allocates space through the boxed allocator.
  /// Returns raw, uninitialized storage -- the caller placement-news the
  /// box into it.
  static box<IsCopyable, T, Allocator>*
  box_allocate(box<IsCopyable, T, Allocator> const* me) {
    real_allocator allocator(*static_cast<Allocator const*>(me));

    return static_cast<box<IsCopyable, T, Allocator>*>(
        std::allocator_traits<real_allocator>::allocate(allocator, 1U));
  }

  /// Destroys the box through the given allocator.
  /// A copy of the allocator is taken first, since destroying the box
  /// also destroys the allocator stored inside it.
  static void box_deallocate(box<IsCopyable, T, Allocator>* me) {
    real_allocator allocator(*static_cast<Allocator const*>(me));

    me->~box();
    std::allocator_traits<real_allocator>::deallocate(allocator, me, 1U);
  }
};

/// Creates a box containing the given value and allocator
template <bool IsCopyable, typename T,
          typename Allocator = std::allocator<std::decay_t<T>>>
auto make_box(std::integral_constant<bool, IsCopyable>, T&& value,
              Allocator&& allocator = Allocator{}) {
  return box<IsCopyable, std::decay_t<T>, std::decay_t<Allocator>>{
      std::forward<T>(value), std::forward<Allocator>(allocator)};
}

template <typename T>
struct is_box : std::false_type {};
template <bool IsCopyable, typename T, typename Allocator>
struct is_box<box<IsCopyable, T, Allocator>> : std::true_type {};
+
/// Provides access to the pointer to a heap allocated erased object
/// as well as to the inplace storage.
union data_accessor {
  data_accessor() = default;
  explicit constexpr data_accessor(std::nullptr_t) noexcept : ptr_(nullptr) {
  }
  explicit constexpr data_accessor(void* ptr) noexcept : ptr_(ptr) {
  }

  /// The pointer we use if the object is on the heap
  void* ptr_;
  /// The first field of the inplace storage
  std::size_t inplace_storage_;
};

/// See opcode::op_fetch_empty
constexpr void write_empty(data_accessor* accessor, bool empty) noexcept {
  accessor->inplace_storage_ = std::size_t(empty);
}

// Transfer the const (resp. volatile) qualification of the pointee of
// From onto To.
template <typename From, typename To>
using transfer_const_t =
    std::conditional_t<std::is_const<std::remove_pointer_t<From>>::value,
                       std::add_const_t<To>, To>;
template <typename From, typename To>
using transfer_volatile_t =
    std::conditional_t<std::is_volatile<std::remove_pointer_t<From>>::value,
                       std::add_volatile_t<To>, To>;

/// The retriever when the object is allocated inplace.
/// Yields nullptr (via std::align) when T cannot be placed, suitably
/// aligned, inside the remaining capacity.
template <typename T, typename Accessor>
constexpr auto retrieve(std::true_type /*is_inplace*/, Accessor from,
                        std::size_t from_capacity) {
  using type = transfer_const_t<Accessor, transfer_volatile_t<Accessor, void>>*;

  /// Process the command by using the data inside the internal capacity
  auto storage = &(from->inplace_storage_);
  auto inplace = const_cast<void*>(static_cast<type>(storage));
  return type(std::align(alignof(T), sizeof(T), inplace, from_capacity));
}

/// The retriever which is used when the object is allocated
/// through the allocator
template <typename T, typename Accessor>
constexpr auto retrieve(std::false_type /*is_inplace*/, Accessor from,
                        std::size_t /*from_capacity*/) {

  return from->ptr_;
}
+
+namespace invocation_table {
#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
#if defined(FU2_HAS_NO_FUNCTIONAL_HEADER)
/// Local stand-in for std::bad_function_call, used when <functional>
/// was not included; thrown on invocation of an empty throwing function.
struct bad_function_call : std::exception {
  bad_function_call() noexcept {
  }

  char const* what() const noexcept override {
    return "bad function call";
  }
};
// Fix: this directive was '#elif' with no condition, which is ill-formed
// whenever the branch above is not taken; '#else' is what is intended
// (fall back to the standard exception type from <functional>).
#else
using std::bad_function_call;
#endif
#endif
+
// Expands F(const, volatile, noexcept, overload-ref, cast-ref) over every
// cv/ref qualifier combination; the noexcept rows exist only when C++17
// noexcept function types are part of the type system.
#ifdef FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F)                                      \
  F(, , noexcept, , &)                                                         \
  F(const, , noexcept, , &)                                                    \
  F(, volatile, noexcept, , &)                                                 \
  F(const, volatile, noexcept, , &)                                            \
  F(, , noexcept, &, &)                                                        \
  F(const, , noexcept, &, &)                                                   \
  F(, volatile, noexcept, &, &)                                                \
  F(const, volatile, noexcept, &, &)                                           \
  F(, , noexcept, &&, &&)                                                      \
  F(const, , noexcept, &&, &&)                                                 \
  F(, volatile, noexcept, &&, &&)                                              \
  F(const, volatile, noexcept, &&, &&)
#else // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE
#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F)
#endif // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE

// The non-noexcept qualifier rows, plus the noexcept rows when available.
#define FU2_EXPAND_QUALIFIERS(F)                                               \
  F(, , , , &)                                                                 \
  F(const, , , , &)                                                            \
  F(, volatile, , , &)                                                         \
  F(const, volatile, , , &)                                                    \
  F(, , , &, &)                                                                \
  F(const, , , &, &)                                                           \
  F(, volatile, , &, &)                                                        \
  F(const, volatile, , &, &)                                                   \
  F(, , , &&, &&)                                                              \
  F(const, , , &&, &&)                                                         \
  F(, volatile, , &&, &&)                                                      \
  F(const, volatile, , &&, &&)                                                 \
  FU2_EXPAND_QUALIFIERS_NOEXCEPT(F)
+
/// If the function is qualified as noexcept, the call will never throw.
/// (Name is chosen so the macro token-paste throw_or_abort##NOEXCEPT
/// resolves here when NOEXCEPT expands to 'noexcept'.)
template <bool IsNoexcept>
[[noreturn]] void throw_or_abortnoexcept(
    std::integral_constant<bool, IsNoexcept> /*is_throwing*/) noexcept {
  std::abort();
}
/// Calls std::abort on empty function calls
[[noreturn]] inline void
throw_or_abort(std::false_type /*is_throwing*/) noexcept {
  std::abort();
}
/// Throws bad_function_call on empty function calls
[[noreturn]] inline void throw_or_abort(std::true_type /*is_throwing*/) {
#ifdef FU2_HAS_DISABLED_EXCEPTIONS
  throw_or_abort(std::false_type{});
#else
  throw bad_function_call{};
#endif
}
+
template <typename T>
struct function_trait;

// Token-pasted with the NOEXCEPT macro argument below
// (is_noexcept_##NOEXCEPT) to map an empty / 'noexcept' qualifier onto
// std::false_type / std::true_type.
using is_noexcept_ = std::false_type;
using is_noexcept_noexcept = std::true_type;
+
// For each cv/ref/noexcept qualifier combination this defines a
// function_trait specialization exposing the thunk pointer type, the
// invokers for owned (inplace or heap), viewed, and empty storage, and
// whether the signature is noexcept.
#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF)     \
  template <typename Ret, typename... Args>                                    \
  struct function_trait<Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT> {        \
    using pointer_type = Ret (*)(data_accessor CONST VOLATILE*,                \
                                 std::size_t capacity, Args...);               \
    template <typename T, bool IsInplace>                                      \
    struct internal_invoker {                                                  \
      static Ret invoke(data_accessor CONST VOLATILE* data,                    \
                        std::size_t capacity, Args... args) NOEXCEPT {         \
        auto obj = retrieve<T>(std::integral_constant<bool, IsInplace>{},      \
                               data, capacity);                                \
        auto box = static_cast<T CONST VOLATILE*>(obj);                        \
        return invocation::invoke(                                             \
            static_cast<std::decay_t<decltype(box->value_)> CONST VOLATILE     \
                            REF>(box->value_),                                 \
            std::forward<Args>(args)...);                                      \
      }                                                                        \
    };                                                                         \
                                                                               \
    template <typename T>                                                      \
    struct view_invoker {                                                      \
      static Ret invoke(data_accessor CONST VOLATILE* data, std::size_t,       \
                        Args... args) NOEXCEPT {                               \
                                                                               \
        auto ptr = static_cast<void CONST VOLATILE*>(data->ptr_);              \
        return invocation::invoke(address_taker<T>::restore(ptr),              \
                                  std::forward<Args>(args)...);                \
      }                                                                        \
    };                                                                         \
                                                                               \
    template <typename T>                                                      \
    using callable = T CONST VOLATILE REF;                                     \
                                                                               \
    using arguments = identity<Args...>;                                       \
                                                                               \
    using is_noexcept = is_noexcept_##NOEXCEPT;                                \
                                                                               \
    template <bool Throws>                                                     \
    struct empty_invoker {                                                     \
      static Ret invoke(data_accessor CONST VOLATILE* /*data*/,                \
                        std::size_t /*capacity*/, Args... /*args*/) NOEXCEPT { \
        throw_or_abort##NOEXCEPT(std::integral_constant<bool, Throws>{});      \
      }                                                                        \
    };                                                                         \
  };

FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT)
#undef FU2_DEFINE_FUNCTION_TRAIT
+
/// Deduces to the function pointer to the given signature
template <typename Signature>
using function_pointer_of = typename function_trait<Signature>::pointer_type;

// Maps a pack of signatures to a dispatch table; specialized below for the
// single-overload (bare pointer) and multi-overload (tuple) cases.
template <typename... Args>
struct invoke_table;
+
/// We optimize the vtable_t in case there is a single function overload:
/// the "table" degenerates to the bare thunk pointer, saving one
/// indirection on every call.
template <typename First>
struct invoke_table<First> {
  using type = function_pointer_of<First>;

  /// Return the function pointer itself
  template <std::size_t Index>
  static constexpr auto fetch(type pointer) noexcept {
    static_assert(Index == 0U, "The index should be 0 here!");
    return pointer;
  }

  /// Returns the thunk of a single overloaded callable
  template <typename T, bool IsInplace>
  static constexpr type get_invocation_table_of() noexcept {
    return &function_trait<First>::template internal_invoker<T,
                                                             IsInplace>::invoke;
  }
  /// Returns the thunk of a single overloaded callable
  template <typename T>
  static constexpr type get_invocation_view_table_of() noexcept {
    return &function_trait<First>::template view_invoker<T>::invoke;
  }
  /// Returns the thunk of an empty single overloaded callable
  template <bool IsThrowing>
  static constexpr type get_empty_invocation_table() noexcept {
    return &function_trait<First>::template empty_invoker<IsThrowing>::invoke;
  }
};
/// We generate a table in case of multiple function overloads:
/// the type is a pointer to a static tuple of thunks, one per signature.
template <typename First, typename Second, typename... Args>
struct invoke_table<First, Second, Args...> {
  using type =
      std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
                 function_pointer_of<Args>...> const*;

  /// Return the function pointer at the particular index
  template <std::size_t Index>
  static constexpr auto fetch(type table) noexcept {
    return std::get<Index>(*table);
  }

  /// The invocation vtable for a present object
  template <typename T, bool IsInplace>
  struct invocation_vtable : public std::tuple<function_pointer_of<First>,
                                               function_pointer_of<Second>,
                                               function_pointer_of<Args>...> {
    constexpr invocation_vtable() noexcept
        : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
                     function_pointer_of<Args>...>(std::make_tuple(
              &function_trait<First>::template internal_invoker<
                  T, IsInplace>::invoke,
              &function_trait<Second>::template internal_invoker<
                  T, IsInplace>::invoke,
              &function_trait<Args>::template internal_invoker<
                  T, IsInplace>::invoke...)) {
    }
  };

  /// Returns the thunk of a multi-overloaded callable.
  /// The table is a function-local static, so exactly one instance exists
  /// per (T, IsInplace) combination.
  template <typename T, bool IsInplace>
  static type get_invocation_table_of() noexcept {
    static invocation_vtable<T, IsInplace> const table;
    return &table;
  }

  /// The invocation vtable for a present object
  template <typename T>
  struct invocation_view_vtable
      : public std::tuple<function_pointer_of<First>,
                          function_pointer_of<Second>,
                          function_pointer_of<Args>...> {
    constexpr invocation_view_vtable() noexcept
        : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
                     function_pointer_of<Args>...>(std::make_tuple(
              &function_trait<First>::template view_invoker<T>::invoke,
              &function_trait<Second>::template view_invoker<T>::invoke,
              &function_trait<Args>::template view_invoker<T>::invoke...)) {
    }
  };

  /// Returns the thunk of a multi-overloaded callable
  template <typename T>
  static type get_invocation_view_table_of() noexcept {
    static invocation_view_vtable<T> const table;
    return &table;
  }

  /// The invocation table for an empty wrapper
  template <bool IsThrowing>
  struct empty_vtable : public std::tuple<function_pointer_of<First>,
                                          function_pointer_of<Second>,
                                          function_pointer_of<Args>...> {
    constexpr empty_vtable() noexcept
        : std::tuple<function_pointer_of<First>, function_pointer_of<Second>,
                     function_pointer_of<Args>...>(
              std::make_tuple(&function_trait<First>::template empty_invoker<
                                  IsThrowing>::invoke,
                              &function_trait<Second>::template empty_invoker<
                                  IsThrowing>::invoke,
                              &function_trait<Args>::template empty_invoker<
                                  IsThrowing>::invoke...)) {
    }
  };

  /// Returns the thunk table of an empty multi-overloaded callable
  template <bool IsThrowing>
  static type get_empty_invocation_table() noexcept {
    static empty_vtable<IsThrowing> const table;
    return &table;
  }
};
+
template <std::size_t Index, typename Function, typename... Signatures>
class operator_impl;

// Expanded once per qualifier combination: each layer contributes the
// operator() for one signature and inherits the next layer; the second
// (terminal) specialization attaches to the function front-end and also
// disables copying via the copyable<> base when the config is move-only.
#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF)     \
  template <std::size_t Index, typename Function, typename Ret,                \
            typename... Args, typename Next, typename... Signatures>           \
  class operator_impl<Index, Function,                                         \
                      Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT, Next,      \
                      Signatures...>                                           \
      : operator_impl<Index + 1, Function, Next, Signatures...> {              \
                                                                               \
    template <std::size_t, typename, typename...>                              \
    friend class operator_impl;                                                \
                                                                               \
  protected:                                                                   \
    operator_impl() = default;                                                 \
    ~operator_impl() = default;                                                \
    operator_impl(operator_impl const&) = default;                             \
    operator_impl(operator_impl&&) = default;                                  \
    operator_impl& operator=(operator_impl const&) = default;                  \
    operator_impl& operator=(operator_impl&&) = default;                       \
                                                                               \
    using operator_impl<Index + 1, Function, Next, Signatures...>::operator(); \
                                                                               \
    Ret operator()(Args... args) CONST VOLATILE OVL_REF NOEXCEPT {             \
      auto parent = static_cast<Function CONST VOLATILE*>(this);               \
      using erasure_t = std::decay_t<decltype(parent->erasure_)>;              \
                                                                               \
      return erasure_t::template invoke<Index>(                                \
          static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_),         \
          std::forward<Args>(args)...);                                        \
    }                                                                          \
  };                                                                           \
  template <std::size_t Index, typename Config, typename Property,             \
            typename Ret, typename... Args>                                    \
  class operator_impl<Index, function<Config, Property>,                       \
                      Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT>            \
      : copyable<Config::is_owning || Config::is_copyable> {                   \
                                                                               \
    template <std::size_t, typename, typename...>                              \
    friend class operator_impl;                                                \
                                                                               \
  protected:                                                                   \
    operator_impl() = default;                                                 \
    ~operator_impl() = default;                                                \
    operator_impl(operator_impl const&) = default;                             \
    operator_impl(operator_impl&&) = default;                                  \
    operator_impl& operator=(operator_impl const&) = default;                  \
    operator_impl& operator=(operator_impl&&) = default;                       \
                                                                               \
    Ret operator()(Args... args) CONST VOLATILE OVL_REF NOEXCEPT {             \
      auto parent =                                                            \
          static_cast<function<Config, Property> CONST VOLATILE*>(this);       \
      using erasure_t = std::decay_t<decltype(parent->erasure_)>;              \
                                                                               \
      return erasure_t::template invoke<Index>(                                \
          static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_),         \
          std::forward<Args>(args)...);                                        \
    }                                                                          \
  };

FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT)
#undef FU2_DEFINE_FUNCTION_TRAIT
+} // namespace invocation_table
+
+namespace tables {
/// Identifies the action which is dispatched on the erased object
enum class opcode {
  op_move,         //< Move the object and set the vtable
  op_copy,         //< Copy the object and set the vtable
  op_destroy,      //< Destroy the object and reset the vtable
  op_weak_destroy, //< Destroy the object without resetting the vtable
  op_fetch_empty,  //< Stores true or false into the 'to' storage
                   //< to indicate emptiness
};
+
/// Abstraction for a vtable together with a command table:
/// cmd_ handles lifetime operations (move/copy/destroy/emptiness query)
/// while vtable_ dispatches invocations per signature.
/// TODO Add optimization for a single formal argument
/// TODO Add optimization to merge both tables if the function is size
/// optimized
template <typename Property>
class vtable;
template <bool IsThrowing, bool HasStrongExceptGuarantee,
          typename... FormalArgs>
class vtable<property<IsThrowing, HasStrongExceptGuarantee, FormalArgs...>> {
  using command_function_t = void (*)(vtable* /*this*/, opcode /*op*/,
                                      data_accessor* /*from*/,
                                      std::size_t /*from_capacity*/,
                                      data_accessor* /*to*/,
                                      std::size_t /*to_capacity*/);

  using invoke_table_t = invocation_table::invoke_table<FormalArgs...>;

  command_function_t cmd_;
  typename invoke_table_t::type vtable_;

  // Per-box-type implementation of the command protocol; instantiated
  // once for inplace and once for heap-allocated storage.
  template <typename T>
  struct trait {
    static_assert(is_box<T>::value,
                  "The trait must be specialized with a box!");

    /// The command table
    template <bool IsInplace>
    static void process_cmd(vtable* to_table, opcode op, data_accessor* from,
                            std::size_t from_capacity, data_accessor* to,
                            std::size_t to_capacity) {

      switch (op) {
        case opcode::op_move: {
          /// Retrieve the pointer to the object
          auto box = static_cast<T*>(retrieve<T>(
              std::integral_constant<bool, IsInplace>{}, from, from_capacity));
          assert(box && "The object must not be over aligned or null!");

          if (!IsInplace) {
            // Just swap both pointers if we allocated on the heap
            to->ptr_ = from->ptr_;

            // NOTE(review): '_NDEBUG' looks like a typo for 'NDEBUG', so
            // this nulling is currently compiled into release builds too.
            // Harmless (the source vtable is set empty by the caller), but
            // confirm against upstream function2 before changing.
#ifndef _NDEBUG
            // We don't need to null the pointer since we know that
            // we don't own the data anymore through the vtable
            // which is set to empty.
            from->ptr_ = nullptr;
#endif

            to_table->template set_allocated<T>();

          }
          // The object is allocated inplace
          else {
            construct(std::true_type{}, std::move(*box), to_table, to,
                      to_capacity);
            box->~T();
          }
          return;
        }
        case opcode::op_copy: {
          auto box = static_cast<T const*>(retrieve<T>(
              std::integral_constant<bool, IsInplace>{}, from, from_capacity));
          assert(box && "The object must not be over aligned or null!");

          assert(std::is_copy_constructible<T>::value &&
                 "The box is required to be copyable here!");

          // Try to allocate the object inplace
          construct(std::is_copy_constructible<T>{}, *box, to_table, to,
                    to_capacity);
          return;
        }
        case opcode::op_destroy:
        case opcode::op_weak_destroy: {

          assert(!to && !to_capacity && "Arg overflow!");
          auto box = static_cast<T*>(retrieve<T>(
              std::integral_constant<bool, IsInplace>{}, from, from_capacity));

          if (IsInplace) {
            box->~T();
          } else {
            box_factory<T>::box_deallocate(box);
          }

          if (op == opcode::op_destroy) {
            to_table->set_empty();
          }
          return;
        }
        case opcode::op_fetch_empty: {
          // A non-empty vtable always reports "not empty".
          write_empty(to, false);
          return;
        }
      }

      // TODO Use an unreachable intrinsic
      assert(false && "Unreachable!");
      std::exit(-1);
    }

    /// Constructs T from the box: inplace when it fits, otherwise through
    /// the box's allocator; updates to_table to match the storage chosen.
    template <typename Box>
    static void
    construct(std::true_type /*apply*/, Box&& box, vtable* to_table,
              data_accessor* to,
              std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) {
      // Try to allocate the object inplace
      void* storage = retrieve<T>(std::true_type{}, to, to_capacity);
      if (storage) {
        to_table->template set_inplace<T>();
      } else {
        // Allocate the object through the allocator
        to->ptr_ = storage =
            box_factory<std::decay_t<Box>>::box_allocate(std::addressof(box));
        to_table->template set_allocated<T>();
      }
      new (storage) T(std::forward<Box>(box));
    }

    // No-op overload selected when T is not copy constructible.
    template <typename Box>
    static void
    construct(std::false_type /*apply*/, Box&& /*box*/, vtable* /*to_table*/,
              data_accessor* /*to*/,
              std::size_t /*to_capacity*/) noexcept(HasStrongExceptGuarantee) {
    }
  };

  /// The command table used while no object is stored.
  static void empty_cmd(vtable* to_table, opcode op, data_accessor* /*from*/,
                        std::size_t /*from_capacity*/, data_accessor* to,
                        std::size_t /*to_capacity*/) {

    switch (op) {
      case opcode::op_move:
      case opcode::op_copy: {
        // Moving/copying an empty function yields another empty function.
        to_table->set_empty();
        break;
      }
      case opcode::op_destroy:
      case opcode::op_weak_destroy: {
        // Do nothing
        break;
      }
      case opcode::op_fetch_empty: {
        write_empty(to, true);
        break;
      }
    }
  }

public:
  vtable() noexcept = default;

  /// Initialize an object at the given position
  template <typename T>
  static void init(vtable& table, T&& object, data_accessor* to,
                   std::size_t to_capacity) {

    trait<std::decay_t<T>>::construct(std::true_type{}, std::forward<T>(object),
                                      &table, to, to_capacity);
  }

  /// Moves the object at the given position; this vtable is left empty.
  void move(vtable& to_table, data_accessor* from, std::size_t from_capacity,
            data_accessor* to,
            std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) {
    cmd_(&to_table, opcode::op_move, from, from_capacity, to, to_capacity);
    set_empty();
  }

  /// Copies the object at the given position into to/to_table.
  void copy(vtable& to_table, data_accessor const* from,
            std::size_t from_capacity, data_accessor* to,
            std::size_t to_capacity) const {
    cmd_(&to_table, opcode::op_copy, const_cast<data_accessor*>(from),
         from_capacity, to, to_capacity);
  }

  /// Destroys the object at the given position
  void destroy(data_accessor* from,
               std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) {
    cmd_(this, opcode::op_destroy, from, from_capacity, nullptr, 0U);
  }

  /// Destroys the object at the given position without invalidating the
  /// vtable
  void
  weak_destroy(data_accessor* from,
               std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) {
    cmd_(this, opcode::op_weak_destroy, from, from_capacity, nullptr, 0U);
  }

  /// Returns true when the vtable doesn't hold any erased object.
  /// Implemented by dispatching op_fetch_empty into a scratch accessor.
  bool empty() const noexcept {
    data_accessor data;
    cmd_(nullptr, opcode::op_fetch_empty, nullptr, 0U, &data, 0U);
    return bool(data.inplace_storage_);
  }

  /// Invoke the function at the given index
  template <std::size_t Index, typename... Args>
  constexpr auto invoke(Args&&... args) const {
    auto thunk = invoke_table_t::template fetch<Index>(vtable_);
    return thunk(std::forward<Args>(args)...);
  }
  /// Invoke the function at the given index
  template <std::size_t Index, typename... Args>
  constexpr auto invoke(Args&&... args) const volatile {
    auto thunk = invoke_table_t::template fetch<Index>(vtable_);
    return thunk(std::forward<Args>(args)...);
  }

  // The three state setters below keep cmd_ and vtable_ consistent:
  // inplace storage, heap storage, or empty.
  template <typename T>
  void set_inplace() noexcept {
    using type = std::decay_t<T>;
    vtable_ = invoke_table_t::template get_invocation_table_of<type, true>();
    cmd_ = &trait<type>::template process_cmd<true>;
  }

  template <typename T>
  void set_allocated() noexcept {
    using type = std::decay_t<T>;
    vtable_ = invoke_table_t::template get_invocation_table_of<type, false>();
    cmd_ = &trait<type>::template process_cmd<false>;
  }

  void set_empty() noexcept {
    vtable_ = invoke_table_t::template get_empty_invocation_table<IsThrowing>();
    cmd_ = &empty_cmd;
  }
};
+} // namespace tables
+
/// A union which makes the pointer to the heap object share the
/// same space with the internal capacity.
/// The storage type is distinguished by multiple versions of the
/// control and vtable.
template <std::size_t Capacity, typename = void>
struct internal_capacity {
  /// We extend the union through a technique similar to the tail object hack
  typedef union {
    /// Tag to access the structure in a type-safe way
    data_accessor accessor_;
    /// The internal capacity we use to allocate in-place
    /// NOTE(review): std::aligned_storage_t is deprecated in C++23; an
    /// alignas byte array would be the forward-compatible replacement.
    std::aligned_storage_t<Capacity> capacity_;
  } type;
};
/// Degenerate case: capacities smaller than a pointer can never hold an
/// object in-place, so only the accessor is stored.
template <std::size_t Capacity>
struct internal_capacity<Capacity,
                         std::enable_if_t<(Capacity < sizeof(void*))>> {
  typedef struct {
    /// Tag to access the structure in a type-safe way
    data_accessor accessor_;
  } type;
};

/// Owns the capacity union and exposes cv-qualified accessors to it.
template <std::size_t Capacity>
class internal_capacity_holder {
  // Tag to access the structure in a type-safe way
  typename internal_capacity<Capacity>::type storage_;

public:
  constexpr internal_capacity_holder() = default;

  constexpr data_accessor* opaque_ptr() noexcept {
    return &storage_.accessor_;
  }
  constexpr data_accessor const* opaque_ptr() const noexcept {
    return &storage_.accessor_;
  }
  constexpr data_accessor volatile* opaque_ptr() volatile noexcept {
    return &storage_.accessor_;
  }
  constexpr data_accessor const volatile* opaque_ptr() const volatile noexcept {
    return &storage_.accessor_;
  }

  // Note: may exceed the requested Capacity due to union padding.
  static constexpr std::size_t capacity() noexcept {
    return sizeof(storage_);
  }
};
+
+/// An owning erasure
+template <bool IsOwning /* = true*/, typename Config, typename Property>
+class erasure : internal_capacity_holder<Config::capacity> {
+ template <bool, typename, typename>
+ friend class erasure;
+ template <std::size_t, typename, typename...>
+ friend class operator_impl;
+
+ using vtable_t = tables::vtable<Property>;
+
+ vtable_t vtable_;
+
+public:
+ /// Returns the capacity of this erasure
+ static constexpr std::size_t capacity() noexcept {
+ return internal_capacity_holder<Config::capacity>::capacity();
+ }
+
+ constexpr erasure() noexcept {
+ vtable_.set_empty();
+ }
+
+ constexpr erasure(std::nullptr_t) noexcept {
+ vtable_.set_empty();
+ }
+
+ constexpr erasure(erasure&& right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ constexpr erasure(erasure const& right) {
+ right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ template <typename OtherConfig>
+ constexpr erasure(erasure<true, OtherConfig, Property> right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ }
+
+ template <typename T, typename Allocator = std::allocator<std::decay_t<T>>>
+ constexpr erasure(T&& callable, Allocator&& allocator = Allocator{}) {
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable),
+ std::forward<Allocator>(allocator)),
+ this->opaque_ptr(), capacity());
+ }
+
+ ~erasure() {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ }
+
+ constexpr erasure&
+ operator=(std::nullptr_t) noexcept(Property::is_strong_exception_guaranteed) {
+ vtable_.destroy(this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ constexpr erasure& operator=(erasure&& right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ constexpr erasure& operator=(erasure const& right) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ template <typename OtherConfig>
+ constexpr erasure&
+ operator=(erasure<true, OtherConfig, Property> right) noexcept(
+ Property::is_strong_exception_guaranteed) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ template <typename T>
+ constexpr erasure& operator=(T&& callable) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable)),
+ this->opaque_ptr(), capacity());
+ return *this;
+ }
+
+ template <typename T, typename Allocator>
+ void assign(T&& callable, Allocator&& allocator) {
+ vtable_.weak_destroy(this->opaque_ptr(), capacity());
+ vtable_t::init(vtable_,
+ type_erasure::make_box(
+ std::integral_constant<bool, Config::is_copyable>{},
+ std::forward<T>(callable),
+ std::forward<Allocator>(allocator)),
+ this->opaque_ptr(), capacity());
+ }
+
+ /// Returns true when the erasure doesn't hold any erased object
+ constexpr bool empty() const noexcept {
+ return vtable_.empty();
+ }
+
+ /// Invoke the function of the erasure at the given index
+ ///
+ /// We define this out of class to be able to forward the qualified
+ /// erasure correctly.
+ template <std::size_t Index, typename Erasure, typename... Args>
+ static constexpr auto invoke(Erasure&& erasure, Args&&... args) {
+ auto const capacity = erasure.capacity();
+ return erasure.vtable_.template invoke<Index>(
+ std::forward<Erasure>(erasure).opaque_ptr(), capacity,
+ std::forward<Args>(args)...);
+ }
+};
+
+// A non owning erasure
+template </*bool IsOwning = false, */ typename Config, bool IsThrowing,
+ bool HasStrongExceptGuarantee, typename... Args>
+class erasure<false, Config,
+ property<IsThrowing, HasStrongExceptGuarantee, Args...>> {
+ template <bool, typename, typename>
+ friend class erasure;
+ template <std::size_t, typename, typename...>
+ friend class operator_impl;
+
+ using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>;
+
+ using invoke_table_t = invocation_table::invoke_table<Args...>;
+ typename invoke_table_t::type invoke_table_;
+
+ /// The internal pointer to the non owned object
+ data_accessor view_;
+
+public:
+ // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init)
+ constexpr erasure() noexcept
+ : invoke_table_(
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>()),
+ view_(nullptr) {
+ }
+
+ // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init)
+ constexpr erasure(std::nullptr_t) noexcept
+ : invoke_table_(
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>()),
+ view_(nullptr) {
+ }
+
+ // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init)
+ constexpr erasure(erasure&& right) noexcept
+ : invoke_table_(right.invoke_table_), view_(right.view_) {
+ }
+
+ constexpr erasure(erasure const& /*right*/) = default;
+
+ template <typename OtherConfig>
+ // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init)
+ constexpr erasure(erasure<false, OtherConfig, property_t> right) noexcept
+ : invoke_table_(right.invoke_table_), view_(right.view_) {
+ }
+
+ template <typename T>
+ // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init)
+ constexpr erasure(T&& object)
+ : invoke_table_(invoke_table_t::template get_invocation_view_table_of<
+ std::decay_t<T>>()),
+ view_(address_taker<std::decay_t<T>>::take(std::forward<T>(object))) {
+ }
+
+ ~erasure() = default;
+
+ constexpr erasure&
+ operator=(std::nullptr_t) noexcept(HasStrongExceptGuarantee) {
+ invoke_table_ =
+ invoke_table_t::template get_empty_invocation_table<IsThrowing>();
+ view_.ptr_ = nullptr;
+ return *this;
+ }
+
+ constexpr erasure& operator=(erasure&& right) noexcept {
+ invoke_table_ = right.invoke_table_;
+ view_ = right.view_;
+ right = nullptr;
+ return *this;
+ }
+
+ constexpr erasure& operator=(erasure const& /*right*/) = default;
+
+ template <typename OtherConfig>
+ constexpr erasure&
+ operator=(erasure<true, OtherConfig, property_t> right) noexcept {
+ invoke_table_ = right.invoke_table_;
+ view_ = right.view_;
+ return *this;
+ }
+
+ template <typename T>
+ constexpr erasure& operator=(T&& object) {
+ invoke_table_ = invoke_table_t::template get_invocation_view_table_of<
+ std::decay_t<T>>();
+ view_.ptr_ = address_taker<std::decay_t<T>>::take(std::forward<T>(object));
+ return *this;
+ }
+
+ /// Returns true when the erasure doesn't hold any erased object
+ constexpr bool empty() const noexcept {
+ return view_.ptr_ == nullptr;
+ }
+
+ template <std::size_t Index, typename Erasure, typename... T>
+ static constexpr auto invoke(Erasure&& erasure, T&&... args) {
+ auto thunk = invoke_table_t::template fetch<Index>(erasure.invoke_table_);
+ return thunk(&(erasure.view_), 0UL, std::forward<T>(args)...);
+ }
+};
+} // namespace type_erasure
+
+/// Deduces to a true_type if the type T provides the given signature and the
+/// signature is noexcept correct callable.
+template <typename T, typename Signature,
+ typename Trait =
+ type_erasure::invocation_table::function_trait<Signature>>
+struct accepts_one
+ : std::integral_constant<
+ bool, invocation::can_invoke<typename Trait::template callable<T>,
+ typename Trait::arguments>::value &&
+ invocation::is_noexcept_correct<
+ Trait::is_noexcept::value,
+ typename Trait::template callable<T>,
+ typename Trait::arguments>::value> {};
+
+/// Deduces to a true_type if the type T provides all signatures
+template <typename T, typename Signatures, typename = void>
+struct accepts_all : std::false_type {};
+template <typename T, typename... Signatures>
+struct accepts_all<
+ T, identity<Signatures...>,
+ void_t<std::enable_if_t<accepts_one<T, Signatures>::value>...>>
+ : std::true_type {};
+
+template <typename Config, typename T>
+struct assert_wrong_copy_assign {
+ static_assert(!Config::is_copyable ||
+ std::is_copy_constructible<std::decay_t<T>>::value,
+ "Can't wrap a non copyable object into a unique function!");
+
+ using type = void;
+};
+
+template <bool IsStrongExceptGuaranteed, typename T>
+struct assert_no_strong_except_guarantee {
+ static_assert(
+ !IsStrongExceptGuaranteed ||
+ (std::is_nothrow_move_constructible<T>::value &&
+ std::is_nothrow_destructible<T>::value),
+ "Can't wrap a object an object that has no strong exception guarantees "
+ "if this is required by the wrapper!");
+
+ using type = void;
+};
+
+/// SFINAES out if the given callable is not copyable correct to the left one.
+template <typename LeftConfig, typename RightConfig>
+using enable_if_copyable_correct_t =
+ std::enable_if_t<(!LeftConfig::is_copyable || RightConfig::is_copyable)>;
+
+template <typename LeftConfig, typename RightConfig>
+using is_owning_correct =
+ std::integral_constant<bool,
+ (LeftConfig::is_owning == RightConfig::is_owning)>;
+
+/// SFINAES out if the given function2 is not owning correct to this one
+template <typename LeftConfig, typename RightConfig>
+using enable_if_owning_correct_t =
+ std::enable_if_t<is_owning_correct<LeftConfig, RightConfig>::value>;
+
+template <typename Config, bool IsThrowing, bool HasStrongExceptGuarantee,
+ typename... Args>
+class function<Config, property<IsThrowing, HasStrongExceptGuarantee, Args...>>
+ : type_erasure::invocation_table::operator_impl<
+ 0U,
+ function<Config,
+ property<IsThrowing, HasStrongExceptGuarantee, Args...>>,
+ Args...> {
+
+ template <typename, typename>
+ friend class function;
+
+ template <std::size_t, typename, typename...>
+ friend class type_erasure::invocation_table::operator_impl;
+
+ using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>;
+ using erasure_t =
+ type_erasure::erasure<Config::is_owning, Config, property_t>;
+
+ template <typename T>
+ using enable_if_can_accept_all_t =
+ std::enable_if_t<accepts_all<std::decay_t<T>, identity<Args...>>::value>;
+
+ template <typename Function, typename = void>
+ struct is_convertible_to_this : std::false_type {};
+ template <typename RightConfig>
+ struct is_convertible_to_this<
+ function<RightConfig, property_t>,
+ void_t<enable_if_copyable_correct_t<Config, RightConfig>,
+ enable_if_owning_correct_t<Config, RightConfig>>>
+ : std::true_type {};
+
+ template <typename T>
+ using enable_if_not_convertible_to_this =
+ std::enable_if_t<!is_convertible_to_this<std::decay_t<T>>::value>;
+
+ template <typename T>
+ using enable_if_owning_t =
+ std::enable_if_t<std::is_same<T, T>::value && Config::is_owning>;
+
+ template <typename T>
+ using assert_wrong_copy_assign_t =
+ typename assert_wrong_copy_assign<Config, std::decay_t<T>>::type;
+
+ template <typename T>
+ using assert_no_strong_except_guarantee_t =
+ typename assert_no_strong_except_guarantee<HasStrongExceptGuarantee,
+ std::decay_t<T>>::type;
+
+ erasure_t erasure_;
+
+public:
+ /// Default constructor which empty constructs the function
+ function() = default;
+ ~function() = default;
+
+ explicit constexpr function(function const& /*right*/) = default;
+ explicit constexpr function(function&& /*right*/) = default;
+
+ /// Copy construction from another copyable function
+ template <typename RightConfig,
+ std::enable_if_t<RightConfig::is_copyable>* = nullptr,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ constexpr function(function<RightConfig, property_t> const& right)
+ : erasure_(right.erasure_) {
+ }
+
+ /// Move construction from another function
+ template <typename RightConfig,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ constexpr function(function<RightConfig, property_t>&& right)
+ : erasure_(std::move(right.erasure_)) {
+ }
+
+ /// Construction from a callable object which overloads the `()` operator
+ template <typename T, //
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ constexpr function(T&& callable) : erasure_(std::forward<T>(callable)) {
+ }
+ template <typename T, typename Allocator, //
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ enable_if_owning_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ constexpr function(T&& callable, Allocator&& allocator)
+ : erasure_(std::forward<T>(callable),
+ std::forward<Allocator>(allocator)) {
+ }
+
+ /// Empty constructs the function
+ constexpr function(std::nullptr_t np) : erasure_(np) {
+ }
+
+ function& operator=(function const& /*right*/) = default;
+ function& operator=(function&& /*right*/) = default;
+
+ /// Copy assigning from another copyable function
+ template <typename RightConfig,
+ std::enable_if_t<RightConfig::is_copyable>* = nullptr,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ function& operator=(function<RightConfig, property_t> const& right) {
+ erasure_ = right.erasure_;
+ return *this;
+ }
+
+ /// Move assigning from another function
+ template <typename RightConfig,
+ enable_if_copyable_correct_t<Config, RightConfig>* = nullptr,
+ enable_if_owning_correct_t<Config, RightConfig>* = nullptr>
+ function& operator=(function<RightConfig, property_t>&& right) {
+ erasure_ = std::move(right.erasure_);
+ return *this;
+ }
+
+ /// Move assigning from a callable object
+ template <typename T, // ...
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ function& operator=(T&& callable) {
+ erasure_ = std::forward<T>(callable);
+ return *this;
+ }
+
+ /// Clears the function
+ function& operator=(std::nullptr_t np) {
+ erasure_ = np;
+ return *this;
+ }
+
+ /// Returns true when the function is empty
+ bool empty() const noexcept {
+ return erasure_.empty();
+ }
+
+ /// Returns true when the function isn't empty
+ explicit operator bool() const noexcept {
+ return !empty();
+ }
+
+ /// Assigns a new target with an optional allocator
+ template <typename T, typename Allocator = std::allocator<std::decay_t<T>>,
+ enable_if_not_convertible_to_this<T>* = nullptr,
+ enable_if_can_accept_all_t<T>* = nullptr,
+ assert_wrong_copy_assign_t<T>* = nullptr,
+ assert_no_strong_except_guarantee_t<T>* = nullptr>
+ void assign(T&& callable, Allocator&& allocator = Allocator{}) {
+ erasure_.assign(std::forward<T>(callable),
+ std::forward<Allocator>(allocator));
+ }
+
+ /// Swaps this function with the given function
+ void swap(function& other) noexcept(HasStrongExceptGuarantee) {
+ if (&other == this) {
+ return;
+ }
+
+ function cache = std::move(other);
+ other = std::move(*this);
+ *this = std::move(cache);
+ }
+
+ /// Swaps the left function with the right one
+ friend void swap(function& left,
+ function& right) noexcept(HasStrongExceptGuarantee) {
+ left.swap(right);
+ }
+
+ /// Calls the wrapped callable object
+ using type_erasure::invocation_table::operator_impl<
+ 0U, function<Config, property_t>, Args...>::operator();
+};
+
+template <typename Config, typename Property>
+bool operator==(function<Config, Property> const& f, std::nullptr_t) {
+ return !bool(f);
+}
+
+template <typename Config, typename Property>
+bool operator!=(function<Config, Property> const& f, std::nullptr_t) {
+ return bool(f);
+}
+
+template <typename Config, typename Property>
+bool operator==(std::nullptr_t, function<Config, Property> const& f) {
+ return !bool(f);
+}
+
+template <typename Config, typename Property>
+bool operator!=(std::nullptr_t, function<Config, Property> const& f) {
+ return bool(f);
+}
+
+// Default object size of the function
+using object_size = std::integral_constant<std::size_t, 32U>;
+
+// Default capacity for small functor optimization
+using default_capacity =
+ std::integral_constant<std::size_t,
+ object_size::value - (2 * sizeof(void*))>;
+} // namespace detail
+} // namespace abi_310
+
+/// Adaptable function wrapper base for arbitrary functional types.
+template <
+ /// This is a placeholder for future non owning support
+ bool IsOwning,
+ /// Defines whether the function is copyable or not
+ bool IsCopyable,
+ /// Defines the internal capacity of the function
+ /// for small functor optimization.
+ /// The size of the whole function object will be the capacity plus
+ /// the size of two pointers.
+ /// If the capacity is zero, the size will increase through one additional
+ /// pointer so the whole object has the size of 3 * sizeof(void*).
+ std::size_t Capacity,
+ /// Defines whether the function throws an exception on empty function
+ /// call, `std::abort` is called otherwise.
+ bool IsThrowing,
+ /// Defines whether all objects satisfy the strong exception guarantees,
+ /// which means the function type will satisfy the strong exception
+ /// guarantees too.
+ bool HasStrongExceptGuarantee,
+ /// Defines the signature of the function wrapper
+ typename... Signatures>
+using function_base = detail::function<
+ detail::config<IsOwning, IsCopyable, Capacity>,
+ detail::property<IsThrowing, HasStrongExceptGuarantee, Signatures...>>;
+
+/// An owning copyable function wrapper for arbitrary callable types.
+template <typename... Signatures>
+using function = function_base<true, true, detail::default_capacity::value,
+ true, false, Signatures...>;
+
+/// An owning non copyable function wrapper for arbitrary callable types.
+template <typename... Signatures>
+using unique_function =
+ function_base<true, false, detail::default_capacity::value, true, false,
+ Signatures...>;
+
+/// A non owning copyable function wrapper for arbitrary callable types.
+template <typename... Signatures>
+using function_view =
+ function_base<false, true, detail::default_capacity::value, true, false,
+ Signatures...>;
+
+#if !defined(FU2_HAS_DISABLED_EXCEPTIONS)
+/// Exception type that is thrown when invoking empty function objects
+/// and exception support isn't disabled.
+///
+/// Exception suport is enabled if
+/// the template parameter 'Throwing' is set to true (default).
+///
+/// This type will default to std::bad_function_call if the
+/// functional header is used, otherwise the library provides its own type.
+///
+/// You may disable the inclusion of the functionl header
+/// through defining `FU2_WITH_NO_FUNCTIONAL_HEADER`.
+///
+using detail::type_erasure::invocation_table::bad_function_call;
+#endif
+
+/// Returns a callable object, which unifies all callable objects
+/// that were passed to this function.
+///
+/// ```cpp
+/// auto overloaded = fu2::overload([](std::true_type) { return true; },
+/// [](std::false_type) { return false; });
+/// ```
+///
+/// \param callables A pack of callable objects with arbitrary signatures.
+///
+/// \returns A callable object which exposes the
+///
+template <typename... T>
+constexpr auto overload(T&&... callables) {
+ return detail::overloading::overload(std::forward<T>(callables)...);
+}
+} // namespace fu2
+
+#undef FU2_EXPAND_QUALIFIERS
+#undef FU2_EXPAND_QUALIFIERS_NOEXCEPT
+
+#endif // FU2_INCLUDED_FUNCTION2_HPP_
diff --git a/src/include/hash.h b/src/include/hash.h
new file mode 100644
index 000000000..2ab95448b
--- /dev/null
+++ b/src/include/hash.h
@@ -0,0 +1,64 @@
+#ifndef CEPH_HASH_H
+#define CEPH_HASH_H
+
+#include "acconfig.h"
+
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+
+#define hashmix(a,b,c) \
+ a=a-b; a=a-c; a=a^(c>>13); \
+ b=b-c; b=b-a; b=b^(a<<8); \
+ c=c-a; c=c-b; c=c^(b>>13); \
+ a=a-b; a=a-c; a=a^(c>>12); \
+ b=b-c; b=b-a; b=b^(a<<16); \
+ c=c-a; c=c-b; c=c^(b>>5); \
+ a=a-b; a=a-c; a=a^(c>>3); \
+ b=b-c; b=b-a; b=b^(a<<10); \
+ c=c-a; c=c-b; c=c^(b>>15);
+
+
+//namespace ceph {
+
+template <class _Key> struct rjhash { };
+
+inline uint64_t rjhash64(uint64_t key) {
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = key ^ (key >> 24);
+ key = (key + (key << 3)) + (key << 8); // key * 265
+ key = key ^ (key >> 14);
+ key = (key + (key << 2)) + (key << 4); // key * 21
+ key = key ^ (key >> 28);
+ key = key + (key << 31);
+ return key;
+}
+
+inline uint32_t rjhash32(uint32_t a) {
+ a = (a+0x7ed55d16) + (a<<12);
+ a = (a^0xc761c23c) ^ (a>>19);
+ a = (a+0x165667b1) + (a<<5);
+ a = (a+0xd3a2646c) ^ (a<<9);
+ a = (a+0xfd7046c5) + (a<<3);
+ a = (a^0xb55a4f09) ^ (a>>16);
+ return a;
+}
+
+
+template<> struct rjhash<uint32_t> {
+ inline size_t operator()(const uint32_t x) const {
+ return rjhash32(x);
+ }
+};
+
+template<> struct rjhash<uint64_t> {
+ inline size_t operator()(const uint64_t x) const {
+ return rjhash64(x);
+ }
+};
+
+//}
+
+
+
+#endif
diff --git a/src/include/health.h b/src/include/health.h
new file mode 100644
index 000000000..03191eff7
--- /dev/null
+++ b/src/include/health.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "include/encoding.h"
+
+// health_status_t
+enum health_status_t {
+ HEALTH_ERR = 0,
+ HEALTH_WARN = 1,
+ HEALTH_OK = 2,
+};
+
+inline void encode(health_status_t hs, ceph::buffer::list& bl) {
+ using ceph::encode;
+ uint8_t v = hs;
+ encode(v, bl);
+}
+inline void decode(health_status_t& hs, ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ uint8_t v;
+ decode(v, p);
+ hs = health_status_t(v);
+}
+template<>
+struct denc_traits<health_status_t> {
+ static constexpr bool supported = true;
+ static constexpr bool featured = false;
+ static constexpr bool bounded = true;
+ static constexpr bool need_contiguous = false;
+ static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) {
+ p++;
+ }
+ static void encode(const health_status_t& v,
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t f=0) {
+ ::denc((uint8_t)v, p);
+ }
+ static void decode(health_status_t& v, ceph::buffer::ptr::const_iterator& p,
+ uint64_t f=0) {
+ uint8_t tmp;
+ ::denc(tmp, p);
+ v = health_status_t(tmp);
+ }
+ static void decode(health_status_t& v, ceph::buffer::list::const_iterator& p,
+ uint64_t f=0) {
+ uint8_t tmp;
+ ::denc(tmp, p);
+ v = health_status_t(tmp);
+ }
+};
+
+inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) {
+ switch (status) {
+ case HEALTH_ERR:
+ oss << "HEALTH_ERR";
+ break;
+ case HEALTH_WARN:
+ oss << "HEALTH_WARN";
+ break;
+ case HEALTH_OK:
+ oss << "HEALTH_OK";
+ break;
+ }
+ return oss;
+}
+
+inline const char *short_health_string(const health_status_t status) {
+ switch (status) {
+ case HEALTH_ERR:
+ return "ERR";
+ case HEALTH_WARN:
+ return "WRN";
+ case HEALTH_OK:
+ return "OK";
+ default:
+ return "???";
+ }
+}
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 000000000..48d889763
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+#if defined(__GNUC__)
+
+// optimize for the common case, which is very small copies
+static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+ size_t inline_len)
+ __attribute__((always_inline));
+
+void *maybe_inline_memcpy(void *dest, const void *src, size_t l,
+ size_t inline_len)
+{
+ if (l > inline_len) {
+ return memcpy(dest, src, l);
+ }
+ switch (l) {
+ case 8:
+ return __builtin_memcpy(dest, src, 8);
+ case 4:
+ return __builtin_memcpy(dest, src, 4);
+ case 3:
+ return __builtin_memcpy(dest, src, 3);
+ case 2:
+ return __builtin_memcpy(dest, src, 2);
+ case 1:
+ return __builtin_memcpy(dest, src, 1);
+ default:
+ int cursor = 0;
+ while (l >= sizeof(uint64_t)) {
+ __builtin_memcpy((char*)dest + cursor, (char*)src + cursor,
+ sizeof(uint64_t));
+ cursor += sizeof(uint64_t);
+ l -= sizeof(uint64_t);
+ }
+ while (l >= sizeof(uint32_t)) {
+ __builtin_memcpy((char*)dest + cursor, (char*)src + cursor,
+ sizeof(uint32_t));
+ cursor += sizeof(uint32_t);
+ l -= sizeof(uint32_t);
+ }
+ while (l > 0) {
+ *((char*)dest + cursor) = *((char*)src + cursor);
+ cursor++;
+ l--;
+ }
+ }
+ return dest;
+}
+
+#else
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+#endif
+
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+namespace ceph {
+typedef unsigned uint128_t __attribute__ ((mode (TI)));
+}
+using ceph::uint128_t;
+
+static inline bool mem_is_zero(const char *data, size_t len)
+ __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+ // we do have XMM registers in x86-64, so if we need to check at least
+ // 16 bytes, make use of them
+ if (len / sizeof(uint128_t) > 0) {
+ // align data pointer to 16 bytes, otherwise it'll segfault due to bug
+ // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
+ // check up to 15 first bytes while at it.
+ while (((unsigned long long)data) & 15) {
+ if (*(uint8_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint8_t);
+ --len;
+ }
+
+ const char* data_start = data;
+ const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t);
+
+ while (data < max128) {
+ if (*(uint128_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint128_t);
+ }
+ len -= (data - data_start);
+ }
+
+ const char* max = data + len;
+ const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t);
+ while (data < max32) {
+ if (*(uint32_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint32_t);
+ }
+ while (data < max) {
+ if (*(uint8_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint8_t);
+ }
+ return true;
+}
+
+#else // gcc and x86_64
+
+static inline bool mem_is_zero(const char *data, size_t len) {
+ const char *end = data + len;
+ const char* end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t);
+
+ while (data < end64) {
+ if (*(uint64_t*)data != 0) {
+ return false;
+ }
+ data += sizeof(uint64_t);
+ }
+
+ while (data < end) {
+ if (*data != 0) {
+ return false;
+ }
+ ++data;
+ }
+ return true;
+}
+
+#endif // !x86_64
+
+#endif
diff --git a/src/include/int_types.h b/src/include/int_types.h
new file mode 100644
index 000000000..a704ba71d
--- /dev/null
+++ b/src/include/int_types.h
@@ -0,0 +1,56 @@
+#ifndef CEPH_INTTYPES_H
+#define CEPH_INTTYPES_H
+
+#include "acconfig.h"
+
+#include <inttypes.h>
+
+#ifdef __linux__
+#include <linux/types.h>
+#else
+#ifndef HAVE___U8
+typedef uint8_t __u8;
+#endif
+
+#ifndef HAVE___S8
+typedef int8_t __s8;
+#endif
+
+#ifndef HAVE___U16
+typedef uint16_t __u16;
+#endif
+
+#ifndef HAVE___S16
+typedef int16_t __s16;
+#endif
+
+#ifndef HAVE___U32
+typedef uint32_t __u32;
+#endif
+
+#ifndef HAVE___S32
+typedef int32_t __s32;
+#endif
+
+#ifndef HAVE___U64
+typedef uint64_t __u64;
+#endif
+
+#ifndef HAVE___S64
+typedef int64_t __s64;
+#endif
+#endif /* LINUX_TYPES_H */
+
+#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
+#endif
+
+#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE
+#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need
+#endif
+
+#ifndef BOOST_MPL_LIMIT_MAP_SIZE
+#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need
+#endif
+
+#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
new file mode 100644
index 000000000..68b0345a4
--- /dev/null
+++ b/src/include/intarith.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INTARITH_H
+#define CEPH_INTARITH_H
+
+#include <bit>
+#include <climits>
+#include <concepts>
+#include <type_traits>
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) {
+ return (n + d - 1) / d;
+}
+
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) {
+ return (n % d ? (n + d - n % d) : n);
+}
+
+template<typename T, typename U>
+constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) {
+ return (x + (1 << y) - 1) >> y;
+}
+
+/*
+ * Wrappers for various sorts of alignment and rounding. The "align" must
+ * be a power of 2. Often times it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, p2align(1200, 1024) == 1024 (1*align)
+ * eg, p2align(1024, 1024) == 1024 (1*align)
+ * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2align(T x, T align) {
+ return x & -align;
+}
+
+/*
+ * return x % (mod) align
+ * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+template<typename T>
+constexpr inline T p2phase(T x, T align) {
+ return x & (align - 1);
+}
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+template<typename T>
+constexpr inline T p2nphase(T x, T align) {
+ return -x & (align - 1);
+}
+
+/*
+ * return x rounded up to an align boundary
+ * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+template<typename T>
+constexpr inline T p2roundup(T x, T align) {
+ return -(-x & -align);
+}
+
+// count bits (set + any 0's that follow)
+template<std::integral T>
+unsigned cbits(T v) {
+ return (sizeof(v) * CHAR_BIT) - std::countl_zero(std::make_unsigned_t<T>(v));
+}
+
+#endif
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
new file mode 100644
index 000000000..dfb2a306c
--- /dev/null
+++ b/src/include/interval_set.h
@@ -0,0 +1,824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_INTERVAL_SET_H
+#define CEPH_INTERVAL_SET_H
+
+#include <iterator>
+#include <map>
+#include <ostream>
+
+#include "encoding.h"
+
/*
 * *** NOTE ***
 *
 * This class is written to work with a variety of map-like containers,
 * *including* ones that invalidate iterators when they are modified (e.g.,
 * flat_map and btree_map).
 */
+
+template<typename T, template<typename, typename, typename ...> class C = std::map>
+class interval_set {
+ public:
+ using Map = C<T, T>;
+ using value_type = typename Map::value_type;
+ using offset_type = T;
+ using length_type = T;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using size_type = typename Map::size_type;
+
+ class const_iterator;
+
+ class iterator
+ {
+ public:
+ using difference_type = ssize_t;
+ using value_type = typename Map::value_type;
+ using pointer = typename Map::value_type*;
+ using reference = typename Map::value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ explicit iterator(typename Map::iterator iter)
+ : _iter(iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ reference operator*() const {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ offset_type get_start() const {
+ return _iter->first;
+ }
+
+ // Return the interval length.
+ length_type get_len() const {
+ return _iter->second;
+ }
+
+ offset_type get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Set the interval length.
+ void set_len(const length_type& len) {
+ _iter->second = len;
+ }
+
+ // Preincrement
+ iterator& operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ iterator operator++(int)
+ {
+ iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+ // Predecrement
+ iterator& operator--()
+ {
+ --_iter;
+ return *this;
+ }
+
+ // Postdecrement
+ iterator operator--(int)
+ {
+ iterator prev(_iter);
+ --_iter;
+ return prev;
+ }
+
+ friend class interval_set::const_iterator;
+
+ protected:
+ typename Map::iterator _iter;
+ friend class interval_set;
+ };
+
+ class const_iterator
+ {
+ public:
+ using difference_type = ssize_t;
+ using value_type = const typename Map::value_type;
+ using pointer = const typename Map::value_type*;
+ using reference = const typename Map::value_type&;
+ using iterator_category = std::forward_iterator_tag;
+
+ explicit const_iterator(typename Map::const_iterator iter)
+ : _iter(iter)
+ { }
+
+ const_iterator(const iterator &i)
+ : _iter(i._iter)
+ { }
+
+ // For the copy constructor and assignment operator, the compiler-generated functions, which
+ // perform simple bitwise copying, should be fine.
+
+ bool operator==(const const_iterator& rhs) const {
+ return (_iter == rhs._iter);
+ }
+
+ bool operator!=(const const_iterator& rhs) const {
+ return (_iter != rhs._iter);
+ }
+
+ // Dereference this iterator to get a pair.
+ reference operator*() const {
+ return *_iter;
+ }
+
+ // Return the interval start.
+ offset_type get_start() const {
+ return _iter->first;
+ }
+ offset_type get_end() const {
+ return _iter->first + _iter->second;
+ }
+
+ // Return the interval length.
+ length_type get_len() const {
+ return _iter->second;
+ }
+
+ // Preincrement
+ const_iterator& operator++()
+ {
+ ++_iter;
+ return *this;
+ }
+
+ // Postincrement
+ const_iterator operator++(int)
+ {
+ const_iterator prev(_iter);
+ ++_iter;
+ return prev;
+ }
+
+ // Predecrement
+ iterator& operator--()
+ {
+ --_iter;
+ return *this;
+ }
+
+ // Postdecrement
+ iterator operator--(int)
+ {
+ iterator prev(_iter);
+ --_iter;
+ return prev;
+ }
+
+ protected:
+ typename Map::const_iterator _iter;
+ };
+
+ interval_set() = default;
+ interval_set(Map&& other) {
+ m.swap(other);
+ for (const auto& p : m) {
+ _size += p.second;
+ }
+ }
+
+ size_type num_intervals() const
+ {
+ return m.size();
+ }
+
+ iterator begin() {
+ return iterator(m.begin());
+ }
+
+ iterator lower_bound(T start) {
+ return iterator(find_inc_m(start));
+ }
+
+ iterator end() {
+ return iterator(m.end());
+ }
+
+ const_iterator begin() const {
+ return const_iterator(m.begin());
+ }
+
+ const_iterator lower_bound(T start) const {
+ return const_iterator(find_inc(start));
+ }
+
+ const_iterator end() const {
+ return const_iterator(m.end());
+ }
+
+ // helpers
+ private:
+ auto find_inc(T start) const {
+ auto p = m.lower_bound(start); // p->first >= start
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might overlap?
+ if (p->first + p->second <= start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_inc_m(T start) {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might overlap?
+ if (p->first + p->second <= start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_adj(T start) const {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might touch?
+ if (p->first + p->second < start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ auto find_adj_m(T start) {
+ auto p = m.lower_bound(start);
+ if (p != m.begin() &&
+ (p == m.end() || p->first > start)) {
+ --p; // might touch?
+ if (p->first + p->second < start)
+ ++p; // it doesn't.
+ }
+ return p;
+ }
+
+ void intersection_size_asym(const interval_set &s, const interval_set &l) {
+ auto ps = s.m.begin();
+ ceph_assert(ps != s.m.end());
+ auto offset = ps->first;
+ bool first = true;
+ auto mi = m.begin();
+
+ while (1) {
+ if (first)
+ first = false;
+ auto pl = l.find_inc(offset);
+ if (pl == l.m.end())
+ break;
+ while (ps != s.m.end() && ps->first + ps->second <= pl->first)
+ ++ps;
+ if (ps == s.m.end())
+ break;
+ offset = pl->first + pl->second;
+ if (offset <= ps->first) {
+ offset = ps->first;
+ continue;
+ }
+
+ if (*ps == *pl) {
+ do {
+ mi = m.insert(mi, *ps);
+ _size += ps->second;
+ ++ps;
+ ++pl;
+ } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl);
+ if (ps == s.m.end())
+ break;
+ offset = ps->first;
+ continue;
+ }
+
+ auto start = std::max<T>(ps->first, pl->first);
+ auto en = std::min<T>(ps->first + ps->second, offset);
+ ceph_assert(en > start);
+ mi = m.emplace_hint(mi, start, en - start);
+ _size += mi->second;
+ if (ps->first + ps->second <= offset) {
+ ++ps;
+ if (ps == s.m.end())
+ break;
+ offset = ps->first;
+ }
+ }
+ }
+
+ bool subset_size_sym(const interval_set &b) const {
+ auto pa = m.begin(), pb = b.m.begin();
+ const auto a_end = m.end(), b_end = b.m.end();
+
+ while (pa != a_end && pb != b_end) {
+ while (pb->first + pb->second <= pa->first) {
+ ++pb;
+ if (pb == b_end)
+ return false;
+ }
+
+ if (*pa == *pb) {
+ do {
+ ++pa;
+ ++pb;
+ } while (pa != a_end && pb != b_end && *pa == *pb);
+ continue;
+ }
+
+ // interval begins before other
+ if (pa->first < pb->first)
+ return false;
+ // interval is longer than other
+ if (pa->first + pa->second > pb->first + pb->second)
+ return false;
+
+ ++pa;
+ }
+
+ return pa == a_end;
+ }
+
+ public:
+ bool operator==(const interval_set& other) const {
+ return _size == other._size && m == other.m;
+ }
+
+ uint64_t size() const {
+ return _size;
+ }
+
+ void bound_encode(size_t& p) const {
+ denc_traits<Map>::bound_encode(m, p);
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ denc(m, p);
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ denc(m, p);
+ _size = 0;
+ for (const auto& p : m) {
+ _size += p.second;
+ }
+ }
+ void decode(ceph::buffer::list::iterator& p) {
+ denc(m, p);
+ _size = 0;
+ for (const auto& p : m) {
+ _size += p.second;
+ }
+ }
+
+ void encode_nohead(ceph::buffer::list::contiguous_appender& p) const {
+ denc_traits<Map>::encode_nohead(m, p);
+ }
+ void decode_nohead(int n, ceph::buffer::ptr::const_iterator& p) {
+ denc_traits<Map>::decode_nohead(n, m, p);
+ _size = 0;
+ for (const auto& p : m) {
+ _size += p.second;
+ }
+ }
+
+ void clear() {
+ m.clear();
+ _size = 0;
+ }
+
+ bool contains(T i, T *pstart=0, T *plen=0) const {
+ auto p = find_inc(i);
+ if (p == m.end()) return false;
+ if (p->first > i) return false;
+ if (p->first+p->second <= i) return false;
+ ceph_assert(p->first <= i && p->first+p->second > i);
+ if (pstart)
+ *pstart = p->first;
+ if (plen)
+ *plen = p->second;
+ return true;
+ }
+ bool contains(T start, T len) const {
+ auto p = find_inc(start);
+ if (p == m.end()) return false;
+ if (p->first > start) return false;
+ if (p->first+p->second <= start) return false;
+ ceph_assert(p->first <= start && p->first+p->second > start);
+ if (p->first+p->second < start+len) return false;
+ return true;
+ }
+ bool intersects(T start, T len) const {
+ interval_set a;
+ a.insert(start, len);
+ interval_set i;
+ i.intersection_of( *this, a );
+ if (i.empty()) return false;
+ return true;
+ }
+
+ // outer range of set
+ bool empty() const {
+ return m.empty();
+ }
+ offset_type range_start() const {
+ ceph_assert(!empty());
+ auto p = m.begin();
+ return p->first;
+ }
+ offset_type range_end() const {
+ ceph_assert(!empty());
+ auto p = m.rbegin();
+ return p->first + p->second;
+ }
+
+ // interval start after p (where p not in set)
+ bool starts_after(T i) const {
+ ceph_assert(!contains(i));
+ auto p = find_inc(i);
+ if (p == m.end()) return false;
+ return true;
+ }
+ offset_type start_after(T i) const {
+ ceph_assert(!contains(i));
+ auto p = find_inc(i);
+ return p->first;
+ }
+
+ // interval end that contains start
+ offset_type end_after(T start) const {
+ ceph_assert(contains(start));
+ auto p = find_inc(start);
+ return p->first+p->second;
+ }
+
+ void insert(T val) {
+ insert(val, 1);
+ }
+
+ void insert(T start, T len, T *pstart=0, T *plen=0) {
+ //cout << "insert " << start << "~" << len << endl;
+ ceph_assert(len > 0);
+ _size += len;
+ auto p = find_adj_m(start);
+ if (p == m.end()) {
+ m[start] = len; // new interval
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ } else {
+ if (p->first < start) {
+
+ if (p->first + p->second != start) {
+ //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
+ ceph_abort();
+ }
+
+ p->second += len; // append to end
+
+ auto n = p;
+ ++n;
+ if (pstart)
+ *pstart = p->first;
+ if (n != m.end() &&
+ start+len == n->first) { // combine with next, too!
+ p->second += n->second;
+ if (plen)
+ *plen = p->second;
+ m.erase(n);
+ } else {
+ if (plen)
+ *plen = p->second;
+ }
+ } else {
+ if (start+len == p->first) {
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len + p->second;
+ T psecond = p->second;
+ m.erase(p);
+ m[start] = len + psecond; // append to front
+ } else {
+ ceph_assert(p->first > start+len);
+ if (pstart)
+ *pstart = start;
+ if (plen)
+ *plen = len;
+ m[start] = len; // new interval
+ }
+ }
+ }
+ }
+
+ void swap(interval_set& other) {
+ m.swap(other.m);
+ std::swap(_size, other._size);
+ }
+
+ void erase(const iterator &i) {
+ _size -= i.get_len();
+ m.erase(i._iter);
+ }
+
+ void erase(T val) {
+ erase(val, 1);
+ }
+
+ void erase(T start, T len,
+ std::function<bool(T, T)> claim = {}) {
+ auto p = find_inc_m(start);
+
+ _size -= len;
+
+ ceph_assert(p != m.end());
+ ceph_assert(p->first <= start);
+
+ T before = start - p->first;
+ ceph_assert(p->second >= before+len);
+ T after = p->second - before - len;
+ if (before) {
+ if (claim && claim(p->first, before)) {
+ _size -= before;
+ m.erase(p);
+ } else {
+ p->second = before; // shorten bit before
+ }
+ } else {
+ m.erase(p);
+ }
+ if (after) {
+ if (claim && claim(start + len, after)) {
+ _size -= after;
+ } else {
+ m[start + len] = after;
+ }
+ }
+ }
+
+ void subtract(const interval_set &a) {
+ for (const auto& [start, len] : a.m) {
+ erase(start, len);
+ }
+ }
+
+ void insert(const interval_set &a) {
+ for (const auto& [start, len] : a.m) {
+ insert(start, len);
+ }
+ }
+
+
+ void intersection_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ const interval_set *s, *l;
+
+ if (a.size() < b.size()) {
+ s = &a;
+ l = &b;
+ } else {
+ s = &b;
+ l = &a;
+ }
+
+ if (!s->size())
+ return;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (l->size() / s->size() >= 10) {
+ intersection_size_asym(*s, *l);
+ return;
+ }
+
+ auto pa = a.m.begin();
+ auto pb = b.m.begin();
+ auto mi = m.begin();
+
+ while (pa != a.m.end() && pb != b.m.end()) {
+ // passing?
+ if (pa->first + pa->second <= pb->first)
+ { pa++; continue; }
+ if (pb->first + pb->second <= pa->first)
+ { pb++; continue; }
+
+ if (*pa == *pb) {
+ do {
+ mi = m.insert(mi, *pa);
+ _size += pa->second;
+ ++pa;
+ ++pb;
+ } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb);
+ continue;
+ }
+
+ T start = std::max(pa->first, pb->first);
+ T en = std::min(pa->first+pa->second, pb->first+pb->second);
+ ceph_assert(en > start);
+ mi = m.emplace_hint(mi, start, en - start);
+ _size += mi->second;
+ if (pa->first+pa->second > pb->first+pb->second)
+ pb++;
+ else
+ pa++;
+ }
+ }
+ void intersection_of(const interval_set& b) {
+ interval_set a;
+ swap(a);
+ intersection_of(a, b);
+ }
+
+ void union_of(const interval_set &a, const interval_set &b) {
+ ceph_assert(&a != this);
+ ceph_assert(&b != this);
+ clear();
+
+ //cout << "union_of" << endl;
+
+ // a
+ m = a.m;
+ _size = a._size;
+
+ // - (a*b)
+ interval_set ab;
+ ab.intersection_of(a, b);
+ subtract(ab);
+
+ // + b
+ insert(b);
+ return;
+ }
+ void union_of(const interval_set &b) {
+ interval_set a;
+ swap(a);
+ union_of(a, b);
+ }
+ void union_insert(T off, T len) {
+ interval_set a;
+ a.insert(off, len);
+ union_of(a);
+ }
+
+ bool subset_of(const interval_set &big) const {
+ if (!size())
+ return true;
+ if (size() > big.size())
+ return false;
+ if (range_end() > big.range_end())
+ return false;
+
+ /*
+ * Use the lower_bound algorithm for larger size ratios
+ * where it performs better, but not for smaller size
+ * ratios where sequential search performs better.
+ */
+ if (big.size() / size() < 10)
+ return subset_size_sym(big);
+
+ for (const auto& [start, len] : m) {
+ if (!big.contains(start, len)) return false;
+ }
+ return true;
+ }
+
+ /*
+ * build a subset of @other, starting at or after @start, and including
+ * @len worth of values, skipping holes. e.g.,
+ * span_of([5~10,20~5], 8, 5) -> [8~2,20~3]
+ */
+ void span_of(const interval_set &other, T start, T len) {
+ clear();
+ auto p = other.find_inc(start);
+ if (p == other.m.end())
+ return;
+ if (p->first < start) {
+ if (p->first + p->second < start)
+ return;
+ if (p->first + p->second < start + len) {
+ T howmuch = p->second - (start - p->first);
+ insert(start, howmuch);
+ len -= howmuch;
+ p++;
+ } else {
+ insert(start, len);
+ return;
+ }
+ }
+ while (p != other.m.end() && len > 0) {
+ if (p->second < len) {
+ insert(p->first, p->second);
+ len -= p->second;
+ p++;
+ } else {
+ insert(p->first, len);
+ return;
+ }
+ }
+ }
+
+ /*
+ * Move contents of m into another Map. Use that instead of
+ * encoding interval_set into bufferlist then decoding it back into Map.
+ */
+ Map detach() && {
+ return std::move(m);
+ }
+
+private:
+ // data
+ uint64_t _size = 0;
+ Map m; // map start -> len
+};
+
// declare traits explicitly because (1) it's templatized, and (2) we
// want to include _nohead variants.
template<typename T, template<typename, typename, typename ...> class C>
struct denc_traits<interval_set<T, C>> {
private:
  using container_t = interval_set<T, C>;
public:
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  // NOTE(review): denc_traits is instantiated with two arguments here
  // (T, C<T,T>); presumably the second parameter selects a container-aware
  // specialization — confirm against include/denc.h.
  static constexpr bool need_contiguous = denc_traits<T, C<T,T>>::need_contiguous;
  static void bound_encode(const container_t& v, size_t& p) {
    v.bound_encode(p);
  }
  static void encode(const container_t& v,
                     ceph::buffer::list::contiguous_appender& p) {
    v.encode(p);
  }
  static void decode(container_t& v, ceph::buffer::ptr::const_iterator& p) {
    v.decode(p);
  }
  // Non-contiguous decode: only available when the payload does not require
  // contiguous memory.  sizeof(U) makes the condition dependent so SFINAE
  // applies instead of a hard error.
  template<typename U=T>
  static typename std::enable_if<sizeof(U) && !need_contiguous>::type
  decode(container_t& v, ceph::buffer::list::iterator& p) {
    v.decode(p);
  }
  static void encode_nohead(const container_t& v,
                            ceph::buffer::list::contiguous_appender& p) {
    v.encode_nohead(p);
  }
  static void decode_nohead(size_t n, container_t& v,
                            ceph::buffer::ptr::const_iterator& p) {
    v.decode_nohead(n, p);
  }
};
+
+
+template<typename T, template<typename, typename, typename ...> class C>
+inline std::ostream& operator<<(std::ostream& out, const interval_set<T,C> &s) {
+ out << "[";
+ bool first = true;
+ for (const auto& [start, len] : s) {
+ if (!first) out << ",";
+ out << start << "~" << len;
+ first = false;
+ }
+ out << "]";
+ return out;
+}
+
+
+#endif
diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h
new file mode 100644
index 000000000..bf06cfc93
--- /dev/null
+++ b/src/include/ipaddr.h
@@ -0,0 +1,47 @@
#ifndef CEPH_IPADDR_H
#define CEPH_IPADDR_H

class entity_addr_t;

/*
 * Check whether any of the given interface addresses lies in the wanted
 * subnet (`net` masked to `prefix_len` bits).
 */
bool matches_ipv4_in_subnet(const struct ifaddrs& addrs,
                            const struct sockaddr_in* net,
                            unsigned int prefix_len);
bool matches_ipv6_in_subnet(const struct ifaddrs& addrs,
                            const struct sockaddr_in6* net,
                            unsigned int prefix_len);

/*
 * Validate and parse IPv4 or IPv6 network
 *
 * Given a network (e.g. "192.168.0.0/24") and pointers to a sockaddr_storage
 * struct and an unsigned int:
 *
 * if the network string is valid, return true and populate sockaddr_storage
 * and prefix_len;
 *
 * if the network string is invalid, return false.
 */
bool parse_network(const char *s,
                   struct sockaddr_storage *network,
                   unsigned int *prefix_len);
// Overload producing a ceph entity_addr_t instead of a raw sockaddr.
bool parse_network(const char *s,
                   entity_addr_t *network,
                   unsigned int *prefix_len);

// Write into *out the result of masking *addr to the first prefix_len bits
// (i.e. apply the netmask implied by the prefix length).
void netmask_ipv6(const struct in6_addr *addr,
                  unsigned int prefix_len,
                  struct in6_addr *out);

void netmask_ipv4(const struct in_addr *addr,
                  unsigned int prefix_len,
                  struct in_addr *out);

// Does `addr` fall inside `network`/`prefix_len`?
bool network_contains(
  const struct entity_addr_t& network,
  unsigned int prefix_len,
  const struct entity_addr_t& addr);

#endif
diff --git a/src/include/krbd.h b/src/include/krbd.h
new file mode 100644
index 000000000..977d45fe2
--- /dev/null
+++ b/src/include/krbd.h
@@ -0,0 +1,97 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#ifndef CEPH_KRBD_H
#define CEPH_KRBD_H

#include "rados/librados.h"

/*
 * Don't wait for udev add uevents in krbd_map() and udev remove
 * uevents in krbd_unmap*().  Instead, make do with the respective
 * kernel uevents and return as soon as they are received.
 *
 * systemd-udevd sends out udev uevents after it finishes processing
 * the respective kernel uevents, which mostly boils down to executing
 * all matching udev rules.  With this flag set, on return from
 * krbd_map() systemd-udevd may still be poking at the device: it
 * may still be open with tools such as blkid and various ioctls to
 * be run against it, none of the persistent symlinks to the device
 * node may be there, etc.  udev used to be responsible for creating
 * the device node as well, but that has been handled by devtmpfs in
 * the kernel for many years now, so the device node (as returned
 * through @pdevnode) is guaranteed to be there.
 *
 * If set, krbd_map() and krbd_unmap*() can be invoked from any
 * network namespace that is owned by the initial user namespace
 * (which is a formality because things like loading kernel modules
 * and creating block devices are not namespaced and require global
 * privileges, i.e. capabilities in the initial user namespace).
 * Otherwise, krbd_map() and krbd_unmap*() must be invoked from
 * the initial network namespace.
 *
 * If set, krbd_unmap*() doesn't attempt to settle the udev queue
 * before retrying unmap for the last time.  Some EBUSY errors due
 * to systemd-udevd poking at the device at the time krbd_unmap*()
 * is invoked that are otherwise covered by the retry logic may be
 * returned.
 */
#define KRBD_CTX_F_NOUDEV       (1U << 0)

#ifdef __cplusplus
extern "C" {
#endif

struct krbd_ctx;

/*
 * Create/destroy a krbd context from an existing rados cluster
 * configuration.  `flags` is a bitwise OR of KRBD_CTX_F_* values.
 */
int krbd_create_from_context(rados_config_t cct, uint32_t flags,
                             struct krbd_ctx **pctx);
void krbd_destroy(struct krbd_ctx *ctx);

/*
 * Map the given image/snapshot through the kernel rbd driver; on success
 * the device node path is returned through *pdevnode (caller frees —
 * TODO confirm ownership against the implementation).
 */
int krbd_map(struct krbd_ctx *ctx,
             const char *pool_name,
             const char *nspace_name,
             const char *image_name,
             const char *snap_name,
             const char *options,
             char **pdevnode);
/* Query whether the image/snapshot is currently mapped. */
int krbd_is_mapped(struct krbd_ctx *ctx,
                   const char *pool_name,
                   const char *nspace_name,
                   const char *image_name,
                   const char *snap_name,
                   char **pdevnode);

/* Unmap by device node path, or by pool/namespace/image/snap spec. */
int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
               const char *options);
int krbd_unmap_by_spec(struct krbd_ctx *ctx,
                       const char *pool_name,
                       const char *nspace_name,
                       const char *image_name,
                       const char *snap_name,
                       const char *options);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus

namespace ceph {
  class Formatter;
}

/* Dump the current rbd mappings through a ceph Formatter (C++ only). */
int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f);

#endif /* __cplusplus */

#endif /* CEPH_KRBD_H */
diff --git a/src/include/libcephsqlite.h b/src/include/libcephsqlite.h
new file mode 100644
index 000000000..d81cc55e8
--- /dev/null
+++ b/src/include/libcephsqlite.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License version 2.1, as published by
+ * the Free Software Foundation. See file COPYING.
+ *
+ */
+
#ifndef LIBCEPHSQLITE_H
#define LIBCEPHSQLITE_H

/* This loadable extension does not generally require using this header. It is
 * here to allow controlling which version of the library is linked in. See
 * also sqlite3_cephsqlite_init below. Additionally, you may specify which
 * CephContext to use rather than the library instantiating its own and using
 * whatever the default credential is.
 */

#include <sqlite3.h>

#ifdef _WIN32
#  define LIBCEPHSQLITE_API __declspec(dllexport)
#else
#  define LIBCEPHSQLITE_API [[gnu::visibility("default")]]
#endif

#ifdef __cplusplus
extern "C" {
#endif
/* This is the SQLite entry point when loaded as a dynamic library. You also
 * need to ensure SQLite calls this method when using libcephsqlite as a static
 * library or a dynamic library linked at compile time. For the latter case,
 * you can do this by:
 *
 *   sqlite3_auto_extension((void (*)())sqlite3_cephsqlite_init);
 *   sqlite3* db = nullptr;
 *   int rc = sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, nullptr);
 *   if (rc == SQLITE_OK) {      // sqlite3_open_v2 returns SQLITE_OK on success
 *     sqlite3_close(db);
 *   } else {
 *     // failure
 *   }
 *
 * The throwaway database created (name == "") is a memory database opened so
 * that SQLite runs the libcephsqlite initialization routine to register the
 * VFS. After that's done, the VFS is available for a future database open with
 * the VFS set to "ceph":
 *
 *   sqlite3_open_v2("foo:bar/baz.db", &db, SQLITE_OPEN_READWRITE, "ceph");
 *
 * You MUST do this before calling any other libcephsqlite routine so that
 * sqlite3 can pass its API routines to the libcephsqlite extension.
 */

LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api);

/* If you prefer to have libcephsqlite use a CephContext managed by your
 * application, use this routine to set that. libcephsqlite can only have one
 * context globally.
 */

LIBCEPHSQLITE_API int cephsqlite_setcct(class CephContext* cct, char** ident);
#ifdef __cplusplus
}
#endif

#endif
diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h
new file mode 100644
index 000000000..36046b5cc
--- /dev/null
+++ b/src/include/linux_fiemap.h
@@ -0,0 +1,73 @@
+/*
+ * FS_IOC_FIEMAP ioctl infrastructure.
+ *
+ * Some portions copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Authors: Mark Fasheh <mfasheh@suse.com>
+ * Kalpak Shah <kalpak.shah@sun.com>
+ * Andreas Dilger <adilger@sun.com>
+ */
+#ifndef _LINUX_FIEMAP_H
+#define _LINUX_FIEMAP_H
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD_)
+#include <sys/types.h>
+#endif
+
+#include "include/int_types.h"
+
+struct fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent from the beginning of the file */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent from the beginning of the disk */
+ __u64 fe_length; /* length in bytes for this extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_reserved[3];
+};
+
+struct fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace wants (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET (~0ULL)
+
+#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
+ * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
+ * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
+ * Sets EXTENT_NO_BYPASS. */
+#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
+ * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
+ * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
+ * support extents. Result
+ * merged for efficiency. */
+#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other
+ * files. */
+
+#endif /* _LINUX_FIEMAP_H */
diff --git a/src/include/lru.h b/src/include/lru.h
new file mode 100644
index 000000000..3f5069ee3
--- /dev/null
+++ b/src/include/lru.h
@@ -0,0 +1,241 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_LRU_H
+#define CEPH_LRU_H
+
+#include <math.h>
+#include <stdint.h>
+
+#include "common/config.h"
+#include "xlist.h"
+
// Base class for objects tracked by an LRU.  Embeds its own intrusive list
// link, so membership in an LRU costs no separate allocation.
class LRUObject {
public:
  LRUObject() : lru_link(this) {}
  virtual ~LRUObject();   // detaches from its LRU, if any (defined below)

  // pin/unpin item in cache: a pinned item is never chosen for expiry
  void lru_pin();
  void lru_unpin();
  bool lru_is_expireable() const { return !lru_pinned; }

  friend class LRU;
private:
  class LRU *lru{};                  // owning LRU, or null when not inserted
  xlist<LRUObject *>::item lru_link; // intrusive link into top/bottom/pintail
  bool lru_pinned = false;           // pinned items are skipped by expiry
};
+
// Segmented LRU over intrusive xlists.  Three segments are kept:
//   top     - recently/frequently used items
//   bottom  - candidates for expiry
//   pintail - pinned items encountered while searching for expiry victims
// adjust() keeps roughly `midpoint` of the unpinned items in `top`.
// NOTE: xlist::push_front/push_back migrate an item out of whatever list it
// is currently on, which is what all the segment moves below rely on.
class LRU {
public:
  uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); }
  uint64_t lru_get_top() const { return top.size(); }
  uint64_t lru_get_bot() const{ return bottom.size(); }
  uint64_t lru_get_pintail() const { return pintail.size(); }
  uint64_t lru_get_num_pinned() const { return num_pinned; }

  // Set the desired top/bottom split; clamped to [0, 1].
  void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); }

  // Remove every item (pinned or not) from all three segments.
  void lru_clear() {
    while (!top.empty()) {
      lru_remove(top.front());
    }
    while (!bottom.empty()) {
      lru_remove(bottom.front());
    }
    while (!pintail.empty()) {
      lru_remove(pintail.front());
    }
    ceph_assert(num_pinned == 0);
  }

  // insert at top of lru
  void lru_insert_top(LRUObject *o) {
    ceph_assert(!o->lru);
    o->lru = this;
    top.push_front(&o->lru_link);
    if (o->lru_pinned) num_pinned++;
    adjust();
  }

  // insert at mid point in lru (i.e. at the head of the bottom segment)
  void lru_insert_mid(LRUObject *o) {
    ceph_assert(!o->lru);
    o->lru = this;
    bottom.push_front(&o->lru_link);
    if (o->lru_pinned) num_pinned++;
    adjust();
  }

  // insert at bottom of lru
  void lru_insert_bot(LRUObject *o) {
    ceph_assert(!o->lru);
    o->lru = this;
    bottom.push_back(&o->lru_link);
    if (o->lru_pinned) num_pinned++;
    adjust();
  }

  // remove an item; returns it (no-op if not attached to any LRU)
  LRUObject *lru_remove(LRUObject *o) {
    if (!o->lru) return o;
    auto list = o->lru_link.get_list();
    ceph_assert(list == &top || list == &bottom || list == &pintail);
    o->lru_link.remove_myself();
    if (o->lru_pinned) num_pinned--;
    o->lru = nullptr;
    adjust();
    return o;
  }

  // touch item -- move to head of lru (inserting it if necessary)
  bool lru_touch(LRUObject *o) {
    if (!o->lru) {
      lru_insert_top(o);
    } else {
      ceph_assert(o->lru == this);
      auto list = o->lru_link.get_list();
      ceph_assert(list == &top || list == &bottom || list == &pintail);
      top.push_front(&o->lru_link);
      adjust();
    }
    return true;
  }

  // touch item -- move to midpoint (unless already higher); returns false
  // when the item was already in the top segment and was left alone
  bool lru_midtouch(LRUObject *o) {
    if (!o->lru) {
      lru_insert_mid(o);
    } else {
      ceph_assert(o->lru == this);
      auto list = o->lru_link.get_list();
      ceph_assert(list == &top || list == &bottom || list == &pintail);
      if (list == &top) return false;
      bottom.push_front(&o->lru_link);
      adjust();
    }
    return true;
  }

  // touch item -- move to bottom
  bool lru_bottouch(LRUObject *o) {
    if (!o->lru) {
      lru_insert_bot(o);
    } else {
      ceph_assert(o->lru == this);
      auto list = o->lru_link.get_list();
      ceph_assert(list == &top || list == &bottom || list == &pintail);
      bottom.push_back(&o->lru_link);
      adjust();
    }
    return true;
  }

  void lru_touch_entire_pintail() {
    // promote entire pintail to the top lru
    // (push_back migrates each front item out of pintail, so this terminates)
    while (pintail.size() > 0) {
      top.push_back(&pintail.front()->lru_link);
      adjust();
    }
  }

  // expire -- find (but do not remove) the next expireable item; pinned
  // items encountered along the way are parked on pintail
  LRUObject *lru_get_next_expire() {
    adjust();
    // look through tail of bot
    while (bottom.size()) {
      LRUObject *p = bottom.back();
      if (!p->lru_pinned) return p;

      // move to pintail
      pintail.push_front(&p->lru_link);
    }

    // ok, try head then
    while (top.size()) {
      LRUObject *p = top.back();
      if (!p->lru_pinned) return p;

      // move to pintail
      pintail.push_front(&p->lru_link);
    }

    // no luck!
    return NULL;
  }

  // expire -- remove and return the next expireable item, or NULL
  LRUObject *lru_expire() {
    LRUObject *p = lru_get_next_expire();
    if (p)
      return lru_remove(p);
    return NULL;
  }

  void lru_status() {
    //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl;
  }

protected:
  // adjust top/bot balance, as necessary, so that `top` holds about
  // midpoint * (unpinned item count) entries
  void adjust() {
    uint64_t toplen = top.size();
    uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned));
    /* move items from below midpoint (bottom) to top: move midpoint forward */
    for (uint64_t i = toplen; i < topwant; i++) {
      top.push_back(&bottom.front()->lru_link);
    }
    /* or: move items from above midpoint (top) to bottom: move midpoint backwards */
    for (uint64_t i = toplen; i > topwant; i--) {
      bottom.push_front(&top.back()->lru_link);
    }
  }

  uint64_t num_pinned = 0;
  double midpoint = 0.6;   // target fraction of unpinned items kept in `top`

  friend class LRUObject;
private:
  using LRUList = xlist<LRUObject*>;
  LRUList top, bottom, pintail;
};
+
// On destruction, detach from the owning LRU so it never holds a dangling
// pointer to this object.
inline LRUObject::~LRUObject() {
  if (lru) {
    lru->lru_remove(this);
  }
}
+
+inline void LRUObject::lru_pin() {
+ if (lru && !lru_pinned) {
+ lru->num_pinned++;
+ }
+ lru_pinned = true;
+}
+
+inline void LRUObject::lru_unpin() {
+ if (lru && lru_pinned) {
+ lru->num_pinned--;
+
+ // move from pintail -> bot
+ if (lru_link.get_list() == &lru->pintail) {
+ lru->lru_bottouch(this);
+ }
+ }
+ lru_pinned = false;
+}
+
+#endif
diff --git a/src/include/mempool.h b/src/include/mempool.h
new file mode 100644
index 000000000..076c62afe
--- /dev/null
+++ b/src/include/mempool.h
@@ -0,0 +1,557 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _CEPH_INCLUDE_MEMPOOL_H
+#define _CEPH_INCLUDE_MEMPOOL_H
+
+#include <cstddef>
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <list>
+#include <mutex>
+#include <typeinfo>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include "common/Formatter.h"
+#include "common/ceph_atomic.h"
+#include "include/ceph_assert.h"
+#include "include/compact_map.h"
+#include "include/compact_set.h"
+#include "include/compat.h"
+
+
+/*
+
+Memory Pools
+============
+
+A memory pool is a method for accounting the consumption of memory of
+a set of containers.
+
+Memory pools are statically declared (see pool_index_t).
+
+Each memory pool tracks the number of bytes and items it contains.
+
+Allocators can be declared and associated with a type so that they are
+tracked independently of the pool total. This additional accounting
+is optional and only incurs an overhead if the debugging is enabled at
+runtime. This allows developers to see what types are consuming the
+pool resources.
+
+
+Declaring
+---------
+
+Using memory pools is very easy.
+
+To create a new memory pool, simply add a new name into the list of
+memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER". That's
+it. :)
+
+For each memory pool that's created a C++ namespace is also
+automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER).
+That namespace contains a set of common STL containers that are predefined
+with the appropriate allocators.
+
+Thus for mempool "osd" we have automatically available to us:
+
+ mempool::osd::map
+ mempool::osd::multimap
+ mempool::osd::set
+ mempool::osd::multiset
+ mempool::osd::list
+ mempool::osd::vector
+ mempool::osd::unordered_map
+
+
+Putting objects in a mempool
+----------------------------
+
+In order to use a memory pool with a particular type, a few additional
+declarations are needed.
+
+For a class:
+
+ struct Foo {
+ MEMPOOL_CLASS_HELPERS();
+ ...
+ };
+
+Then, in an appropriate .cc file,
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd);
+
+The second argument can generally be identical to the first, except
+when the type contains a nested scope. For example, for
+BlueStore::Onode, we need to do
+
+ MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_meta);
+
+(This is just because we need to name some static variables and we
+can't use :: in a variable name.)
+
+XXX Note: the new operator hard-codes the allocation size to the size of the
+object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot
+incorporate mempools into a base class without also defining a helper/factory
+for the child class as well (as the base class is usually smaller than the
+child class).
+
+In order to use the STL containers, simply use the namespaced variant
+of the container type. For example,
+
+  mempool::osd::vector<int> myvec;
+
+Introspection
+-------------
+
+The simplest way to interrogate the process is with
+
+  Formatter *f = ...
+ mempool::dump(f);
+
+This will dump information about *all* memory pools. When debug mode
+is enabled, the runtime complexity of dump is O(num_shards *
+num_types). When debug mode is disabled it is O(num_shards).
+
+You can also interrogate a specific pool programmatically with
+
+ size_t bytes = mempool::unittest_2::allocated_bytes();
+ size_t items = mempool::unittest_2::allocated_items();
+
+The runtime complexity is O(num_shards).
+
+Note that you cannot easily query per-type, primarily because debug
+mode is optional and you should not rely on that information being
+available.
+
+*/
+
+namespace mempool {
+
+// --------------------------------------------------------------
+// define memory pools
+
// X-macro listing every memory pool.  f(name) is invoked once per
// pool; adding a name here automatically creates its pool_index_t
// enumerator and its mempool::<name> container namespace (see the
// P() expansions below).
#define DEFINE_MEMORY_POOLS_HELPER(f) \
  f(bloom_filter)			  \
  f(bluestore_alloc)		  \
  f(bluestore_cache_data)	  \
  f(bluestore_cache_onode)	  \
  f(bluestore_cache_meta)	  \
  f(bluestore_cache_other)	  \
  f(bluestore_cache_buffer)	  \
  f(bluestore_extent)		  \
  f(bluestore_blob)		  \
  f(bluestore_shared_blob)	  \
  f(bluestore_inline_bl)	  \
  f(bluestore_fsck)		  \
  f(bluestore_txc)		  \
  f(bluestore_writing_deferred)	  \
  f(bluestore_writing)		  \
  f(bluefs)			  \
  f(bluefs_file_reader)		  \
  f(bluefs_file_writer)		  \
  f(buffer_anon)		  \
  f(buffer_meta)		  \
  f(osd)			  \
  f(osd_mapbl)			  \
  f(osd_pglog)			  \
  f(osdmap)			  \
  f(osdmap_mapping)		  \
  f(pgmap)			  \
  f(mds_co)			  \
  f(unittest_1)			  \
  f(unittest_2)
+
+
// give them integer ids
// Each pool becomes a `mempool_<name>` enumerator; num_pools is the
// count, not a real pool.
#define P(x) mempool_##x,
enum pool_index_t {
  DEFINE_MEMORY_POOLS_HELPER(P)
  num_pools // Must be last.
};
#undef P

// Global switch for per-type accounting: when enabled, allocators
// also register a type_t record (see pool_allocator::init).
extern bool debug_mode;
extern void set_debug_mode(bool d);
+
+// --------------------------------------------------------------
+class pool_t;
+
// we shard pool stats across many shard_t's to reduce the amount
// of cacheline ping pong.
enum {
  num_shard_bits = 5
};
enum {
  num_shards = 1 << num_shard_bits  // 32 shards per pool
};
+
+//
+// Align shard to a cacheline.
+//
+// It would be possible to retrieve the value at runtime (for instance
+// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment
+// /proc/cpuinfo). It is easier to hard code the largest cache
+// linesize for all known processors (128 bytes). If the actual cache
+// linesize is smaller on a given processor, it will just waste a few
+// bytes.
+//
struct shard_t {
  ceph::atomic<size_t> bytes = {0};  // bytes accounted via this shard
  ceph::atomic<size_t> items = {0};  // items accounted via this shard
  // pad to a full (assumed worst-case 128-byte) cacheline; layout is
  // locked in by the static_assert below
  char __padding[128 - sizeof(ceph::atomic<size_t>)*2];
} __attribute__ ((aligned (128)));
+
+static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized");
+
// A (item count, byte count) snapshot.  Signed: a single shard's
// counters can transiently go negative when an allocation and its
// matching free are accounted on different shards (shard choice is
// per-thread, see pool_t::pick_a_shard_int).
struct stats_t {
  ssize_t items = 0;
  ssize_t bytes = 0;
  // Emit both counters into the given Formatter.
  void dump(ceph::Formatter *f) const {
    f->dump_int("items", items);
    f->dump_int("bytes", bytes);
  }

  // Accumulate another snapshot into this one.
  stats_t& operator+=(const stats_t& o) {
    items += o.items;
    bytes += o.bytes;
    return *this;
  }
};
+
// Look up the global pool_t instance / printable name for a pool id.
pool_t& get_pool(pool_index_t ix);
const char *get_pool_name(pool_index_t ix);

// Per-type accounting record; only maintained when debug mode is on
// or the type's factory was force-registered.
struct type_t {
  const char *type_name;  // typeid(T).name(); also used as the map key
  size_t item_size;       // sizeof(T) recorded at registration
  ceph::atomic<ssize_t> items = {0}; // signed
};
+
// Hash functor adapting std::type_info for unordered containers.
struct type_info_hash {
  std::size_t operator()(const std::type_info& ti) const {
    // Delegate to the implementation-provided per-type hash.
    const std::size_t h = ti.hash_code();
    return h;
  }
};
+
// One accounting pool.  Byte/item totals are sharded across shard_t's
// to reduce cacheline contention; the per-type map is mutex-protected
// because it is touched only on first registration of a type and when
// gathering stats.
class pool_t {
  shard_t shard[num_shards];

  mutable std::mutex lock;  // only used for types list
  // per-type records keyed by the typeid name pointer.  NOTE(review):
  // lookup compares pointers, which assumes a single name instance
  // per type -- confirm this holds across shared objects.
  std::unordered_map<const char *, type_t> type_map;

public:
  //
  // How much this pool consumes. O(<num_shards>)
  //
  size_t allocated_bytes() const;
  size_t allocated_items() const;

  // Manually credit (positive) or debit (negative) the pool's totals,
  // for memory not routed through a pool_allocator.
  void adjust_count(ssize_t items, ssize_t bytes);

  // Map the calling thread to a shard index.  pthread_self() serves
  // as a cheap quasi-thread-id; the low CEPH_PAGE_SHIFT bits are
  // shifted away because they carry little entropy.
  static size_t pick_a_shard_int() {
    // Dirt cheap, see:
    // https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html
    size_t me = (size_t)pthread_self();
    size_t i = (me >> CEPH_PAGE_SHIFT) & ((1 << num_shard_bits) - 1);
    return i;
  }

  shard_t* pick_a_shard() {
    size_t i = pick_a_shard_int();
    return &shard[i];
  }

  // Find-or-create the accounting record for type `ti`, recording
  // `size` as its item size on first registration.
  type_t *get_type(const std::type_info& ti, size_t size) {
    std::lock_guard<std::mutex> l(lock);
    auto p = type_map.find(ti.name());
    if (p != type_map.end()) {
      return &p->second;
    }
    type_t &t = type_map[ti.name()];
    t.type_name = ti.name();
    t.item_size = size;
    return &t;
  }

  // get pool stats. by_type is not populated if !debug
  void get_stats(stats_t *total,
		 std::map<std::string, stats_t> *by_type) const;

  void dump(ceph::Formatter *f, stats_t *ptotal=0) const;
};
+
+void dump(ceph::Formatter *f);
+
+
// STL allocator for use with containers.  The only per-instance state
// is a pair of pointers into global structures (the pool and optional
// per-type record), so every instance for a given pool is
// interchangeable and containers need no allocator argument.

template<pool_index_t pool_ix, typename T>
class pool_allocator {
  pool_t *pool;            // accounting pool charged by every (de)allocation
  type_t *type = nullptr;  // per-type record; set only in debug mode or
                           // for force-registered factories

public:
  typedef pool_allocator<pool_ix, T> allocator_type;
  typedef T value_type;
  typedef value_type *pointer;
  typedef const value_type * const_pointer;
  typedef value_type& reference;
  typedef const value_type& const_reference;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;

  template<typename U> struct rebind {
    typedef pool_allocator<pool_ix,U> other;
  };

  // Bind to the pool; register per-type accounting when debugging or
  // when explicitly requested (object factories pass true).
  void init(bool force_register) {
    pool = &get_pool(pool_ix);
    if (debug_mode || force_register) {
      type = pool->get_type(typeid(T), sizeof(T));
    }
  }

  pool_allocator(bool force_register=false) {
    init(force_register);
  }
  // Rebinding copy.  Note the per-type registration decision is made
  // afresh (debug_mode only), not inherited from the source.
  template<typename U>
  pool_allocator(const pool_allocator<pool_ix,U>&) {
    init(false);
  }

  // Allocate storage for n objects of T and charge the pool.
  // NOTE(review): new char[] only guarantees fundamental alignment;
  // over-aligned T would need allocate_aligned -- confirm callers.
  T* allocate(size_t n, void *p = nullptr) {
    size_t total = sizeof(T) * n;
    shard_t *shard = pool->pick_a_shard();
    shard->bytes += total;
    shard->items += n;
    if (type) {
      type->items += n;
    }
    T* r = reinterpret_cast<T*>(new char[total]);
    return r;
  }

  // Release storage from allocate() and refund the pool.
  void deallocate(T* p, size_t n) {
    size_t total = sizeof(T) * n;
    shard_t *shard = pool->pick_a_shard();
    shard->bytes -= total;
    shard->items -= n;
    if (type) {
      type->items -= n;
    }
    delete[] reinterpret_cast<char*>(p);
  }

  // As allocate(), but with caller-specified alignment (via
  // posix_memalign); pair with deallocate_aligned().
  T* allocate_aligned(size_t n, size_t align, void *p = nullptr) {
    size_t total = sizeof(T) * n;
    shard_t *shard = pool->pick_a_shard();
    shard->bytes += total;
    shard->items += n;
    if (type) {
      type->items += n;
    }
    char *ptr;
    int rc = ::posix_memalign((void**)(void*)&ptr, align, total);
    if (rc)
      throw std::bad_alloc();
    T* r = reinterpret_cast<T*>(ptr);
    return r;
  }

  void deallocate_aligned(T* p, size_t n) {
    size_t total = sizeof(T) * n;
    shard_t *shard = pool->pick_a_shard();
    shard->bytes -= total;
    shard->items -= n;
    if (type) {
      type->items -= n;
    }
    aligned_free(p);
  }

  void destroy(T* p) {
    p->~T();
  }

  template<class U>
  void destroy(U *p) {
    p->~U();
  }

  void construct(T* p, const T& val) {
    ::new ((void *)p) T(val);
  }

  template<class U, class... Args> void construct(U* p,Args&&... args) {
    ::new((void *)p) U(std::forward<Args>(args)...);
  }

  // All instances for a pool are interchangeable (stateless as far as
  // the standard allocator model is concerned).
  bool operator==(const pool_allocator&) const { return true; }
  bool operator!=(const pool_allocator&) const { return false; }
};
+
+
// Namespace mempool
//
// For each pool <x> this macro generates namespace mempool::<x>
// containing: the pool's id, a pool_allocator alias, aliases of the
// common containers (string/map/set/vector/... ) pre-wired to that
// allocator, and allocated_bytes()/allocated_items() accessors.

#define P(x)								\
  namespace x {								\
    static const mempool::pool_index_t id = mempool::mempool_##x;	\
    template<typename v>						\
    using pool_allocator = mempool::pool_allocator<id,v>;		\
                                                                        \
    using string = std::basic_string<char,std::char_traits<char>,       \
                                     pool_allocator<char>>;             \
                                                                        \
    template<typename k,typename v, typename cmp = std::less<k> >	\
    using map = std::map<k, v, cmp,					\
			 pool_allocator<std::pair<const k,v>>>;		\
                                                                        \
    template<typename k,typename v, typename cmp = std::less<k> >       \
    using compact_map = compact_map<k, v, cmp,                          \
			 pool_allocator<std::pair<const k,v>>>;		\
                                                                        \
    template<typename k,typename v, typename cmp = std::less<k> >       \
    using compact_multimap = compact_multimap<k, v, cmp,                \
			 pool_allocator<std::pair<const k,v>>>;		\
                                                                        \
    template<typename k, typename cmp = std::less<k> >                  \
    using compact_set = compact_set<k, cmp, pool_allocator<k>>;         \
                                                                        \
    template<typename k,typename v, typename cmp = std::less<k> >	\
    using multimap = std::multimap<k,v,cmp,				\
				   pool_allocator<std::pair<const k,	\
							    v>>>;	\
                                                                        \
    template<typename k, typename cmp = std::less<k> >			\
    using set = std::set<k,cmp,pool_allocator<k>>;			\
                                                                        \
    template<typename k, typename cmp = std::less<k> >			\
    using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
                                                                        \
    template<typename k, typename v, typename cmp = std::less<k> >	\
    using flat_map = boost::container::flat_map<k,v,cmp,		\
						pool_allocator<std::pair<k,v>>>; \
                                                                        \
    template<typename v>						\
    using list = std::list<v,pool_allocator<v>>;			\
                                                                        \
    template<typename v>						\
    using vector = std::vector<v,pool_allocator<v>>;			\
                                                                        \
    template<typename k, typename v,					\
	     typename h=std::hash<k>,					\
	     typename eq = std::equal_to<k>>				\
    using unordered_map =						\
      std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
                                                                        \
    inline size_t allocated_bytes() {					\
      return mempool::get_pool(id).allocated_bytes();			\
    }									\
    inline size_t allocated_items() {					\
      return mempool::get_pool(id).allocated_items();			\
    }									\
  };

DEFINE_MEMORY_POOLS_HELPER(P)

#undef P

};
+
+DEFINE_MEMORY_POOLS_HELPER(P)
+
+#undef P
+
+};
+
// the elements allocated by mempool are in the same memory space as
// the ones allocated by the default allocator, so compare them in an
// efficient way: libstdc++'s std::equal is specialized to use memcmp
// if T is integer or pointer.  this is good enough for our usecase.
// use std::is_trivially_copyable<T> to expand the support to more
// types if necessary.

// default-allocator vector vs mempool vector
template<typename T, mempool::pool_index_t pool_index>
bool operator==(const std::vector<T, std::allocator<T>>& lhs,
		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
{
  return (lhs.size() == rhs.size() &&
	  std::equal(lhs.begin(), lhs.end(), rhs.begin()));
}

template<typename T, mempool::pool_index_t pool_index>
bool operator!=(const std::vector<T, std::allocator<T>>& lhs,
		const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs)
{
  return !(lhs == rhs);
}

// mirrored overloads with the operand order swapped, delegating to
// the pair above
template<typename T, mempool::pool_index_t pool_index>
bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
		const std::vector<T, std::allocator<T>>& rhs)
{
  return rhs == lhs;
}

template<typename T, mempool::pool_index_t pool_index>
bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs,
		const std::vector<T, std::allocator<T>>& rhs)
{
  return !(lhs == rhs);
}
+
// Use this for any type that is contained by a container (unless it
// is a class you defined; see below).
//
// DECLARE goes in a header, DEFINE in exactly one .cc.  The {true}
// initializer force-registers per-type accounting even when debug
// mode is off (see pool_allocator::init).
#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool)		\
  namespace mempool {						\
    namespace pool {						\
      extern pool_allocator<obj> alloc_##factoryname;		\
    }								\
  }

#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool)		\
  namespace mempool {						\
    namespace pool {						\
      pool_allocator<obj> alloc_##factoryname = {true};		\
    }								\
  }
+
// Use this for each class that belongs to a mempool. For example,
//
// class T {
//   MEMPOOL_CLASS_HELPERS();
//   ...
// };
//
// Declares scalar operator new/delete (defined by
// MEMPOOL_DEFINE_OBJECT_FACTORY in a .cc) so heap instances are
// accounted to the pool; the array forms abort because the factory
// accounts fixed-size single objects only.
#define MEMPOOL_CLASS_HELPERS()					\
  void *operator new(size_t size);				\
  void *operator new[](size_t size) noexcept {			\
    ceph_abort_msg("no array new");				\
    return nullptr; }						\
  void  operator delete(void *);				\
  void  operator delete[](void *) { ceph_abort_msg("no array delete"); }
+
+
// Use this in some particular .cc file to match each class with a
// MEMPOOL_CLASS_HELPERS().
//
// Note: allocate(1) charges sizeof(obj) regardless of the `size`
// argument -- see the XXX note near the top of this header about
// subclasses of mempool-enabled base classes.
#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool)	\
  MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool)		\
  void *obj::operator new(size_t size) {			\
    return mempool::pool::alloc_##factoryname.allocate(1); \
  }								\
  void obj::operator delete(void *p)  {				\
    return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \
  }
+
+#endif
diff --git a/src/include/msgr.h b/src/include/msgr.h
new file mode 100644
index 000000000..c8ad48ad1
--- /dev/null
+++ b/src/include/msgr.h
@@ -0,0 +1,255 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+#ifndef __KERNEL__
+#include <sys/socket.h> // for struct sockaddr_storage
+#endif
+
+#include "include/int_types.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT_LEGACY 6789 /* legacy default monitor port */
+#define CEPH_MON_PORT_IANA 3300 /* IANA monitor port */
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+
+
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2_PREFIX "ceph v2\n"
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
// For each feature this defines the bare feature bit
// (CEPH_MSGR2_FEATURE_*) and a mask (CEPH_MSGR2_FEATUREMASK_*) that
// also carries the incarnation bits, for use with HAVE_MSGR2_FEATURE.
#define DEFINE_MSGR2_FEATURE(bit, incarnation, name)               \
	const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
	const static uint64_t CEPH_MSGR2_FEATUREMASK_##name =            \
			(1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);

#define HAVE_MSGR2_FEATURE(x, name) \
	(((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))

DEFINE_MSGR2_FEATURE(0, 1, REVISION_1) // msgr2.1
DEFINE_MSGR2_FEATURE(1, 1, COMPRESSION) // on-wire compression

/*
 * Features supported.  Should be everything above.
 */
#define CEPH_MSGR2_SUPPORTED_FEATURES              \
	(CEPH_MSGR2_FEATURE_REVISION_1 |           \
	 CEPH_MSGR2_FEATURE_COMPRESSION |          \
	 0ULL)

#define CEPH_MSGR2_REQUIRED_FEATURES  (0ULL)
+
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
+
+
/*
 * entity_name -- logical name for a process participating in the
 * network, e.g. 'mds0' or 'osd3'.
 *
 * Wire format: packed, little-endian fields.  Do not reorder or
 * resize members.
 */
struct ceph_entity_name {
	__u8 type;      /* CEPH_ENTITY_TYPE_* */
	__le64 num;
} __attribute__ ((packed));

/* entity type bits; powers of two so they can be combined as a mask
 * (CEPH_ENTITY_TYPE_ANY matches all) */
#define CEPH_ENTITY_TYPE_MON    0x01
#define CEPH_ENTITY_TYPE_MDS    0x02
#define CEPH_ENTITY_TYPE_OSD    0x04
#define CEPH_ENTITY_TYPE_CLIENT 0x08
#define CEPH_ENTITY_TYPE_MGR    0x10
#define CEPH_ENTITY_TYPE_AUTH   0x20

#define CEPH_ENTITY_TYPE_ANY    0xFF

/* human-readable name for an entity type (defined elsewhere) */
extern const char *ceph_entity_type_name(int type);
+
/*
 * entity_addr -- network address
 *
 * Wire format: packed.  NOTE(review): in_addr is a raw
 * sockaddr_storage; its internal layout/endianness is handled by the
 * encoding layer, not by this struct -- confirm before reusing.
 */
struct ceph_entity_addr {
	__le32 type;  /* CEPH_ENTITY_ADDR_TYPE_* */
	__le32 nonce;  /* unique id for process (e.g. pid) */
	struct sockaddr_storage in_addr;
} __attribute__ ((packed));

/* (name, address) pair identifying one process instance */
struct ceph_entity_inst {
	struct ceph_entity_name name;
	struct ceph_entity_addr addr;
} __attribute__ ((packed));
+
+
/* used by message exchange protocol: single-byte tags that frame what
 * follows on the wire */
#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
					  incoming connection */
#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
					  with higher cseq */
#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
					  with higher gseq */
#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
#define CEPH_MSGR_TAG_MSG           7  /* message */
#define CEPH_MSGR_TAG_ACK           8  /* message ack */
#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
#define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
#define CEPH_MSGR_TAG_KEEPALIVE2    14
#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15  /* keepalive reply */
#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16  /* ceph v2 doing server challenge */
+
/*
 * connection negotiation
 *
 * Wire format: packed, little-endian; a variable-length authorizer
 * payload of authorizer_len bytes follows each struct on the wire.
 */
struct ceph_msg_connect {
	__le64 features;     /* supported feature bits */
	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
	__le32 global_seq;   /* count connections initiated by this host */
	__le32 connect_seq;  /* count connections initiated in this session */
	__le32 protocol_version;
	__le32 authorizer_protocol;
	__le32 authorizer_len;
	__u8  flags;         /* CEPH_MSG_CONNECT_* */
} __attribute__ ((packed));

/* server's reply; tag is one of the CEPH_MSGR_TAG_* values */
struct ceph_msg_connect_reply {
	__u8 tag;
	__le64 features;     /* feature bits for this session */
	__le32 global_seq;
	__le32 connect_seq;
	__le32 protocol_version;
	__le32 authorizer_len;
	__u8 flags;
} __attribute__ ((packed));

#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
/*
 * message header
 *
 * Wire format: packed, little-endian.  The *_old variant carries full
 * source/original-source instances; the current variant carries only
 * the source entity name plus a compat_version.
 */
struct ceph_msg_header_old {
	__le64 seq;       /* message seq# for this session */
	__le64 tid;       /* transaction id */
	__le16 type;      /* message type */
	__le16 priority;  /* priority.  higher value == higher priority */
	__le16 version;   /* version of message encoding */

	__le32 front_len; /* bytes in main payload */
	__le32 middle_len;/* bytes in middle payload */
	__le32 data_len;  /* bytes of data payload */
	__le16 data_off;  /* sender: include full offset;
			     receiver: mask against ~PAGE_MASK */

	struct ceph_entity_inst src, orig_src;
	__le32 reserved;
	__le32 crc;       /* header crc32c */
} __attribute__ ((packed));

struct ceph_msg_header {
	__le64 seq;       /* message seq# for this session */
	__le64 tid;       /* transaction id */
	__le16 type;      /* message type */
	__le16 priority;  /* priority.  higher value == higher priority */
	__le16 version;   /* version of message encoding */

	__le32 front_len; /* bytes in main payload */
	__le32 middle_len;/* bytes in middle payload */
	__le32 data_len;  /* bytes of data payload */
	__le16 data_off;  /* sender: include full offset;
			     receiver: mask against ~PAGE_MASK */

	struct ceph_entity_name src;

	/* oldest code we think can decode this.  unknown if zero. */
	__le16 compat_version;
	__le16 reserved;
	__le32 crc;       /* header crc32c */
} __attribute__ ((packed));
+
/* msgr2 message header: segment lengths/CRCs live in the frame
 * layout, so this carries only per-message metadata.  Packed,
 * little-endian. */
struct ceph_msg_header2 {
	__le64 seq;       /* message seq# for this session */
	__le64 tid;       /* transaction id */
	__le16 type;      /* message type */
	__le16 priority;  /* priority.  higher value == higher priority */
	__le16 version;   /* version of message encoding */

	__le32 data_pre_padding_len;
	__le16 data_off;  /* sender: include full offset;
			     receiver: mask against ~PAGE_MASK */

	__le64 ack_seq;
	__u8 flags;
	/* oldest code we think can decode this.  unknown if zero. */
	__le16 compat_version;
	__le16 reserved;
} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
/*
 * follows data payload
 * ceph_msg_footer_old does not support digital signatures on messages PLR
 *
 * Packed, little-endian.  flags is a CEPH_MSG_FOOTER_* bitmask.
 */

struct ceph_msg_footer_old {
	__le32 front_crc, middle_crc, data_crc;
	__u8 flags;
} __attribute__ ((packed));

struct ceph_msg_footer {
	__le32 front_crc, middle_crc, data_crc;
	// sig holds the 64 bits of the digital signature for the message PLR
	__le64  sig;
	__u8 flags;
} __attribute__ ((packed));

#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
#define CEPH_MSG_FOOTER_SIGNED	  (1<<2)   /* msg was signed */
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/neorados/RADOS.hpp b/src/include/neorados/RADOS.hpp
new file mode 100644
index 000000000..fa1ac92ae
--- /dev/null
+++ b/src/include/neorados/RADOS.hpp
@@ -0,0 +1,1150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef NEORADOS_RADOS_HPP
+#define NEORADOS_RADOS_HPP
+
+#include <cstddef>
+#include <memory>
+#include <tuple>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <variant>
+
+#include <boost/asio.hpp>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/uuid/uuid.hpp>
+
+#include <boost/system/error_code.hpp>
+
+// Will be in C++20!
+
+#include "include/expected.hpp"
+
+// Had better be in C++20. Why is this not in Boost?
+
+#include "include/function2.hpp"
+
+// Things broken out so we can decode them in Objecter.
+
+#include "include/neorados/RADOS_Decodable.hpp"
+
+// Needed for type erasure and template support. We can't really avoid
+// it.
+
+#include "common/async/completion.h"
+
+// These are needed for RGW, but in general as a 'shiny new interface'
+// we should try to use forward declarations and provide standard alternatives.
+
+#include "include/common_fwd.h"
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+
+#include "common/ceph_time.h"
+
+namespace neorados {
+class Object;
+class IOContext;
+}
+namespace std {
+template<>
+struct hash<neorados::Object>;
+template<>
+struct hash<neorados::IOContext>;
+}
+
+namespace neorados {
+namespace detail {
+class Client;
+}
+
+class RADOS;
+
// Exists mostly so that repeated operations on the same object don't
// have to pay for the string copy to construct an object_t.

// Value type naming a RADOS object.  Implementation is hidden behind
// in-place opaque storage (no heap allocation per Object); all
// members are defined in the .cc.
class Object final {
  friend RADOS;
  friend std::hash<Object>;

public:
  Object();
  Object(const char* s);
  Object(std::string_view s);
  Object(std::string&& s);
  Object(const std::string& s);
  ~Object();

  Object(const Object& o);
  Object& operator =(const Object& o);

  Object(Object&& o);
  Object& operator =(Object&& o);

  operator std::string_view() const;

  friend std::ostream& operator <<(std::ostream& m, const Object& o);
  friend bool operator <(const Object& lhs, const Object& rhs);
  friend bool operator <=(const Object& lhs, const Object& rhs);
  friend bool operator >=(const Object& lhs, const Object& rhs);
  friend bool operator >(const Object& lhs, const Object& rhs);

  friend bool operator ==(const Object& lhs, const Object& rhs);
  friend bool operator !=(const Object& lhs, const Object& rhs);

private:

  // In-place pimpl: sized here, checked against the real impl in the
  // .cc.  NOTE(review): std::aligned_storage_t is deprecated in
  // C++23; an alignas'd byte array is the modern replacement.
  static constexpr std::size_t impl_size = 4 * 8;
  std::aligned_storage_t<impl_size> impl;
};
+
// Not the same as the librados::IoCtx, but it does gather together
// some of the same metadata. Since we're likely to do multiple
// operations in the same pool or namespace, it doesn't make sense to
// redo a bunch of lookups and string copies.

// Value type bundling pool id, namespace, and optional per-op
// placement/snapshot settings.  Implementation hidden behind in-place
// opaque storage; members defined in the .cc.
class IOContext final {
  friend RADOS;
  friend std::hash<IOContext>;

public:

  IOContext();
  explicit IOContext(std::int64_t pool);
  IOContext(std::int64_t _pool, std::string_view _ns);
  IOContext(std::int64_t _pool, std::string&& _ns);
  ~IOContext();

  IOContext(const IOContext& rhs);
  IOContext& operator =(const IOContext& rhs);

  IOContext(IOContext&& rhs);
  IOContext& operator =(IOContext&& rhs);

  // pool id accessors
  std::int64_t pool() const;
  void pool(std::int64_t _pool);

  // object namespace accessors
  std::string_view ns() const;
  void ns(std::string_view _ns);
  void ns(std::string&& _ns);

  // optional locator key (overrides the object name for placement)
  std::optional<std::string_view> key() const;
  void key(std::string_view _key);
  void key(std::string&& _key);
  void clear_key();

  // optional explicit placement hash
  std::optional<std::int64_t> hash() const;
  void hash(std::int64_t _hash);
  void clear_hash();

  // snapshot id used for reads, if any
  std::optional<std::uint64_t> read_snap() const;
  void read_snap(std::optional<std::uint64_t> _snapid);

  // I can't actually move-construct here since snapid_t is its own
  // separate class type, not an alias.
  std::optional<
    std::pair<std::uint64_t,
	      std::vector<std::uint64_t>>> write_snap_context() const;
  void write_snap_context(std::optional<
			  std::pair<std::uint64_t,
			  std::vector<std::uint64_t>>> snapc);

  // whether writes may proceed against a full pool
  bool full_try() const;
  void full_try(bool _full_try);

  friend std::ostream& operator <<(std::ostream& m, const IOContext& o);
  friend bool operator <(const IOContext& lhs, const IOContext& rhs);
  friend bool operator <=(const IOContext& lhs, const IOContext& rhs);
  friend bool operator >=(const IOContext& lhs, const IOContext& rhs);
  friend bool operator >(const IOContext& lhs, const IOContext& rhs);

  friend bool operator ==(const IOContext& lhs, const IOContext& rhs);
  friend bool operator !=(const IOContext& lhs, const IOContext& rhs);

private:

  // In-place pimpl storage; size checked in the .cc.
  static constexpr std::size_t impl_size = 16 * 8;
  std::aligned_storage_t<impl_size> impl;
};
+
// Sentinel namespace value -- presumably "match all namespaces" in
// enumeration operations (mirrors librados' all-nspaces sentinel);
// confirm against callers.
inline constexpr std::string_view all_nspaces("\001");

// Comparison operator for cmpxattr: how a stored xattr is compared
// against the supplied value.
enum class cmpxattr_op : std::uint8_t {
  eq = 1,   // equal
  ne = 2,   // not equal
  gt = 3,   // greater than
  gte = 4,  // greater than or equal
  lt = 5,   // less than
  lte = 6   // less than or equal
};
+
// Bit flags advising the OSD about expected access patterns for an
// object (see WriteOp::set_alloc_hint); combinable as a mask.
namespace alloc_hint {
enum alloc_hint_t {
  sequential_write = 1,
  random_write = 2,
  sequential_read = 4,
  random_read = 8,
  append_only = 16,
  immutable = 32,
  shortlived = 64,
  longlived = 128,
  compressible = 256,
  incompressible = 512
};
}
+
// Base of ReadOp/WriteOp: an ordered batch of sub-operations applied
// to a single object.  Move-only; implementation hidden behind
// in-place opaque storage and defined in the .cc.
class Op {
  friend RADOS;

public:

  Op(const Op&) = delete;
  Op& operator =(const Op&) = delete;
  Op(Op&&);
  Op& operator =(Op&&);
  ~Op();

  // Per-sub-op modifier flags (apply to the most recently added op).
  void set_excl();
  void set_failok();
  void set_fadvise_random();
  void set_fadvise_sequential();
  void set_fadvise_willneed();
  void set_fadvise_dontneed();
  void set_fadvise_nocache();

  // Assertions: the whole operation fails if these don't hold.
  void cmpext(uint64_t off, ceph::buffer::list&& cmp_bl, std::size_t* s);
  void cmpxattr(std::string_view name, cmpxattr_op op,
		const ceph::buffer::list& val);
  void cmpxattr(std::string_view name, cmpxattr_op op, std::uint64_t val);
  void assert_version(uint64_t ver);
  void assert_exists();
  void cmp_omap(const boost::container::flat_map<
		  std::string,
		  std::pair<ceph::buffer::list, int>>& assertions);

  // Object-class calls.  Overloads differ in how the result is
  // delivered: output bufferlist + optional error-code out-param, or
  // a callback (with or without the sub-op's int return value).
  void exec(std::string_view cls, std::string_view method,
	    const ceph::buffer::list& inbl,
	    ceph::buffer::list* out,
	    boost::system::error_code* ec = nullptr);
  void exec(std::string_view cls, std::string_view method,
	    const ceph::buffer::list& inbl,
	    fu2::unique_function<void(boost::system::error_code,
				      const ceph::buffer::list&) &&> f);
  void exec(std::string_view cls, std::string_view method,
	    const ceph::buffer::list& inbl,
	    fu2::unique_function<void(boost::system::error_code, int,
				      const ceph::buffer::list&) &&> f);
  void exec(std::string_view cls, std::string_view method,
	    const ceph::buffer::list& inbl,
	    boost::system::error_code* ec = nullptr);


  // Flags that apply to all ops in the operation vector
  void balance_reads();
  void localize_reads();
  void order_reads_writes();
  void ignore_cache();
  void skiprwlocks();
  void ignore_overlay();
  void full_try();
  void full_force();
  void ignore_redirect();
  void ordersnap();
  void returnvec();

  // Number of sub-operations queued so far.
  std::size_t size() const;
  using Signature = void(boost::system::error_code);
  using Completion = ceph::async::Completion<Signature>;

  friend std::ostream& operator <<(std::ostream& m, const Op& o);
protected:
  Op();
  // In-place pimpl storage; size checked in the .cc.
  static constexpr std::size_t impl_size = 85 * 8;
  std::aligned_storage_t<impl_size> impl;
};
+
// This class is /not/ thread-safe. If you want you can wrap it in
// something that locks it.

// Batch of read sub-operations.  Each method takes out-pointers that
// are filled when the operation completes; the optional error_code
// out-param receives that sub-op's result without failing the whole
// batch.
class ReadOp final : public Op {
  friend RADOS;

public:

  ReadOp() = default;
  ReadOp(const ReadOp&) = delete;
  ReadOp(ReadOp&&) = default;

  ReadOp& operator =(const ReadOp&) = delete;
  ReadOp& operator =(ReadOp&&) = default;

  // Read `len` bytes starting at `off` into *out.
  void read(size_t off, uint64_t len, ceph::buffer::list* out,
	    boost::system::error_code* ec = nullptr);
  void get_xattr(std::string_view name, ceph::buffer::list* out,
		 boost::system::error_code* ec = nullptr);
  void get_omap_header(ceph::buffer::list*,
		       boost::system::error_code* ec = nullptr);

  // As read(), but also reports which extents actually hold data.
  void sparse_read(uint64_t off, uint64_t len,
		   ceph::buffer::list* out,
		   std::vector<std::pair<std::uint64_t, std::uint64_t>>* extents,
		   boost::system::error_code* ec = nullptr);

  void stat(std::uint64_t* size, ceph::real_time* mtime,
	    boost::system::error_code* ec = nullptr);

  // Paged omap-key listing; *truncated is set if more keys remain.
  void get_omap_keys(std::optional<std::string_view> start_after,
		     std::uint64_t max_return,
		     boost::container::flat_set<std::string>* keys,
		     bool* truncated,
		     boost::system::error_code* ec = nullptr);


  void get_xattrs(boost::container::flat_map<std::string,
		                             ceph::buffer::list>* kv,
		  boost::system::error_code* ec = nullptr);

  // Paged omap key/value listing, optionally restricted by prefix.
  void get_omap_vals(std::optional<std::string_view> start_after,
		     std::optional<std::string_view> filter_prefix,
		     uint64_t max_return,
		     boost::container::flat_map<std::string,
		                                ceph::buffer::list>* kv,
		     bool* truncated,
		     boost::system::error_code* ec = nullptr);


  void get_omap_vals_by_keys(const boost::container::flat_set<std::string>& keys,
			     boost::container::flat_map<std::string,
			                                ceph::buffer::list>* kv,
			     boost::system::error_code* ec = nullptr);

  void list_watchers(std::vector<struct ObjWatcher>* watchers,
		     boost::system::error_code* ec = nullptr);

  void list_snaps(struct SnapSet* snaps,
		  boost::system::error_code* ec = nullptr);
};
+
+// Batch of mutations executed atomically against one object.
+// Not thread-safe (see note on Op above).
+class WriteOp final : public Op {
+  friend RADOS;
+public:
+
+  WriteOp() = default;
+  WriteOp(const WriteOp&) = delete;
+  WriteOp(WriteOp&&) = default;
+
+  WriteOp& operator =(const WriteOp&) = delete;
+  WriteOp& operator =(WriteOp&&) = default;
+
+  // Override the mtime recorded for this write batch.
+  void set_mtime(ceph::real_time t);
+  // Create the object; with `exclusive`, presumably fails if it
+  // already exists — confirm against the definition.
+  void create(bool exclusive);
+  void write(uint64_t off, ceph::buffer::list&& bl);
+  // Replace the entire object's contents with bl.
+  void write_full(ceph::buffer::list&& bl);
+  // Write bl repeatedly to cover write_len bytes starting at off.
+  void writesame(std::uint64_t off, std::uint64_t write_len,
+		 ceph::buffer::list&& bl);
+  void append(ceph::buffer::list&& bl);
+  void remove();
+  void truncate(uint64_t off);
+  // Zero out len bytes starting at off.
+  void zero(uint64_t off, uint64_t len);
+  void rmxattr(std::string_view name);
+  void setxattr(std::string_view name,
+		ceph::buffer::list&& bl);
+  // Roll the object back to the given snapshot.
+  void rollback(uint64_t snapid);
+  // Set (insert/overwrite) the given omap key/value pairs.
+  void set_omap(const boost::container::flat_map<std::string,
+		                                 ceph::buffer::list>& map);
+  void set_omap_header(ceph::buffer::list&& bl);
+  void clear_omap();
+  void rm_omap_keys(const boost::container::flat_set<std::string>& to_rm);
+  // Hint expected object/write sizes to the OSD allocator.
+  void set_alloc_hint(uint64_t expected_object_size,
+		      uint64_t expected_write_size,
+		      alloc_hint::alloc_hint_t flags);
+};
+
+
+// Cluster-wide usage statistics, as delivered by RADOS::statfs().
+struct FSStats {
+  uint64_t kb;          // total capacity in KB -- presumably KiB; confirm against OSD accounting
+  uint64_t kb_used;     // space in use, same unit as kb
+  uint64_t kb_avail;    // space available, same unit as kb
+  uint64_t num_objects; // total object count across the cluster
+};
+
+// Per-pool usage statistics, as delivered by RADOS::stat_pools().
+// From librados.h, maybe move into a common file. But I want to see
+// if we need/want to amend/add/remove anything first.
+struct PoolStats {
+  /// space used in bytes
+  uint64_t num_bytes;
+  /// space used in KB
+  uint64_t num_kb;
+  /// number of objects in the pool
+  uint64_t num_objects;
+  /// number of clones of objects
+  uint64_t num_object_clones;
+  /// num_objects * num_replicas
+  uint64_t num_object_copies;
+  /// number of objects missing on primary
+  uint64_t num_objects_missing_on_primary;
+  /// number of objects found on no OSDs
+  uint64_t num_objects_unfound;
+  /// number of objects replicated fewer times than they should be
+  /// (but found on at least one OSD)
+  uint64_t num_objects_degraded;
+  /// number of objects read
+  uint64_t num_rd;
+  /// objects read in KB
+  uint64_t num_rd_kb;
+  /// number of objects written
+  uint64_t num_wr;
+  /// objects written in KB
+  uint64_t num_wr_kb;
+  /// bytes originally provided by user
+  uint64_t num_user_bytes;
+  /// uncompressed size of the data that went through compression
+  uint64_t compressed_bytes_orig;
+  /// size of that data after compression
+  uint64_t compressed_bytes;
+  /// storage actually allocated for the compressed data
+  uint64_t compressed_bytes_alloc;
+};
+
+// Placement group, for PG commands
+struct PG {
+  uint64_t pool;  // pool id the PG belongs to
+  uint32_t seed;  // PG number within the pool -- presumably the ps/seed part of a pgid; confirm
+};
+
+// Opaque position in an object enumeration (see
+// RADOS::enumerate_objects). Totally ordered, copyable, and
+// round-trippable through a string via to_str()/from_str().
+class Cursor final {
+public:
+  // Cursors delimiting the full enumeration range.
+  static Cursor begin();
+  static Cursor end();
+
+  Cursor();
+  Cursor(const Cursor&);
+  Cursor& operator =(const Cursor&);
+  Cursor(Cursor&&);
+  Cursor& operator =(Cursor&&);
+  ~Cursor();
+
+  friend bool operator ==(const Cursor& lhs,
+			  const Cursor& rhs);
+  friend bool operator !=(const Cursor& lhs,
+			  const Cursor& rhs);
+  friend bool operator <(const Cursor& lhs,
+			 const Cursor& rhs);
+  friend bool operator <=(const Cursor& lhs,
+			  const Cursor& rhs);
+  friend bool operator >=(const Cursor& lhs,
+			  const Cursor& rhs);
+  friend bool operator >(const Cursor& lhs,
+			 const Cursor& rhs);
+
+  // Serialize to / parse from a string; from_str returns nullopt on
+  // a string it cannot parse.
+  std::string to_str() const;
+  static std::optional<Cursor> from_str(const std::string& s);
+
+private:
+  // Tag type selecting the end() constructor.
+  struct end_magic_t {};
+  Cursor(end_magic_t);
+  Cursor(void*);
+  friend RADOS;
+  // Pimpl: opaque storage for the real cursor implementation, sized
+  // to 16 * 8 bytes; must match the definition in the .cc.
+  static constexpr std::size_t impl_size = 16 * 8;
+  std::aligned_storage_t<impl_size> impl;
+};
+
+/// Asynchronous RADOS client handle.
+///
+/// Every operation is asynchronous in the Boost.Asio style: each
+/// public template accepts a CompletionToken, wraps its handler in a
+/// type-erased ceph::async::Completion, and forwards to a private
+/// overload. The implementation lives behind the detail::Client
+/// pimpl so this header stays free of internal dependencies.
+///
+/// Fixes relative to the previous revision (all were compile errors
+/// on instantiation): statfs() now calls stat_fs() (no ceph_statfs
+/// member exists); delete_selfmanaged_snap() takes the snap id as
+/// std::uint64_t to match its private overload; the pool overload of
+/// notify_ack() uses SimpleOpSig to match SimpleOpComp; the pool
+/// overload of notify() moves its buffer into the rvalue-reference
+/// parameter of the private overload.
+class RADOS final
+{
+public:
+  /// Interface version of this client, as {major, minor, point}.
+  static constexpr std::tuple<uint32_t, uint32_t, uint32_t> version() {
+    return {0, 0, 1};
+  }
+
+  using BuildSig = void(boost::system::error_code, RADOS);
+  using BuildComp = ceph::async::Completion<BuildSig>;
+  /// Fluent configurator used to construct a connected RADOS handle.
+  class Builder {
+    std::optional<std::string> conf_files;
+    std::optional<std::string> cluster;
+    std::optional<std::string> name;
+    std::vector<std::pair<std::string, std::string>> configs;
+    bool no_default_conf = false;
+    bool no_mon_conf = false;
+
+  public:
+    Builder() = default;
+    Builder& add_conf_file(std::string_view v);
+    Builder& set_cluster(std::string_view c) {
+      cluster = std::string(c);
+      return *this;
+    }
+    Builder& set_name(std::string_view n) {
+      name = std::string(n);
+      return *this;
+    }
+    Builder& set_no_default_conf() {
+      no_default_conf = true;
+      return *this;
+    }
+    Builder& set_no_mon_conf() {
+      no_mon_conf = true;
+      return *this;
+    }
+    Builder& set_conf_option(std::string_view opt, std::string_view val) {
+      configs.emplace_back(std::string(opt), std::string(val));
+      return *this;
+    }
+
+    /// Asynchronously connect and hand the new RADOS to the token's
+    /// handler (signature: BuildSig).
+    template<typename CompletionToken>
+    auto build(boost::asio::io_context& ioctx, CompletionToken&& token) {
+      boost::asio::async_completion<CompletionToken, BuildSig> init(token);
+      build(ioctx,
+	    BuildComp::create(ioctx.get_executor(),
+			      std::move(init.completion_handler)));
+      return init.result.get();
+    }
+
+  private:
+    void build(boost::asio::io_context& ioctx,
+	       std::unique_ptr<BuildComp> c);
+  };
+
+  /// Construct from an already-initialized CephContext.
+  template<typename CompletionToken>
+  static auto make_with_cct(CephContext* cct,
+			    boost::asio::io_context& ioctx,
+			    CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, BuildSig> init(token);
+    make_with_cct(cct, ioctx,
+		  BuildComp::create(ioctx.get_executor(),
+				    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  /// Wrap an existing librados handle (shares its connection).
+  static RADOS make_with_librados(librados::Rados& rados);
+
+  RADOS(const RADOS&) = delete;
+  RADOS& operator =(const RADOS&) = delete;
+
+  RADOS(RADOS&&);
+  RADOS& operator =(RADOS&&);
+
+  ~RADOS();
+
+  CephContext* cct();
+
+  using executor_type = boost::asio::io_context::executor_type;
+  executor_type get_executor() const;
+  boost::asio::io_context& get_io_context();
+
+  /// Execute a ReadOp against an object; per-op data is delivered
+  /// through the pointers stored in the op, raw payload through *bl.
+  template<typename CompletionToken>
+  auto execute(const Object& o, const IOContext& ioc, ReadOp&& op,
+	       ceph::buffer::list* bl,
+	       CompletionToken&& token, uint64_t* objver = nullptr,
+	       const blkin_trace_info* trace_info = nullptr) {
+    boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+    execute(o, ioc, std::move(op), bl,
+	    ReadOp::Completion::create(get_executor(),
+				       std::move(init.completion_handler)),
+	    objver, trace_info);
+    return init.result.get();
+  }
+
+  /// Execute a WriteOp against an object.
+  template<typename CompletionToken>
+  auto execute(const Object& o, const IOContext& ioc, WriteOp&& op,
+	       CompletionToken&& token, uint64_t* objver = nullptr,
+	       const blkin_trace_info* trace_info = nullptr) {
+    boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+    execute(o, ioc, std::move(op),
+	    Op::Completion::create(get_executor(),
+				   std::move(init.completion_handler)),
+	    objver, trace_info);
+    return init.result.get();
+  }
+
+  /// Convenience overloads addressing the pool by id rather than by
+  /// a prepared IOContext.
+  template<typename CompletionToken>
+  auto execute(const Object& o, std::int64_t pool,
+	       ReadOp&& op,
+	       ceph::buffer::list* bl,
+	       CompletionToken&& token,
+	       std::optional<std::string_view> ns = {},
+	       std::optional<std::string_view> key = {},
+	       uint64_t* objver = nullptr) {
+    boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+    execute(o, pool, std::move(op), bl,
+	    ReadOp::Completion::create(get_executor(),
+				       std::move(init.completion_handler)),
+	    ns, key, objver);
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto execute(const Object& o, std::int64_t pool, WriteOp&& op,
+	       CompletionToken&& token,
+	       std::optional<std::string_view> ns = {},
+	       std::optional<std::string_view> key = {},
+	       uint64_t* objver = nullptr) {
+    boost::asio::async_completion<CompletionToken, Op::Signature> init(token);
+    execute(o, pool, std::move(op),
+	    Op::Completion::create(get_executor(),
+				   std::move(init.completion_handler)),
+	    ns, key, objver);
+    return init.result.get();
+  }
+
+  boost::uuids::uuid get_fsid() const noexcept;
+
+  // --- Pool operations ---
+
+  using LookupPoolSig = void(boost::system::error_code,
+			     std::int64_t);
+  using LookupPoolComp = ceph::async::Completion<LookupPoolSig>;
+  /// Resolve a pool name to its id.
+  template<typename CompletionToken>
+  auto lookup_pool(std::string_view name,
+		   CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, LookupPoolSig> init(token);
+    lookup_pool(name,
+		LookupPoolComp::create(get_executor(),
+				       std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  std::optional<uint64_t> get_pool_alignment(int64_t pool_id);
+
+  using LSPoolsSig = void(std::vector<std::pair<std::int64_t, std::string>>);
+  using LSPoolsComp = ceph::async::Completion<LSPoolsSig>;
+  /// List all pools as (id, name) pairs.
+  template<typename CompletionToken>
+  auto list_pools(CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, LSPoolsSig> init(token);
+    list_pools(LSPoolsComp::create(get_executor(),
+				   std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  using SimpleOpSig = void(boost::system::error_code);
+  using SimpleOpComp = ceph::async::Completion<SimpleOpSig>;
+  template<typename CompletionToken>
+  auto create_pool_snap(int64_t pool, std::string_view snapName,
+			CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    create_pool_snap(pool, snapName,
+		     SimpleOpComp::create(get_executor(),
+					  std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  using SMSnapSig = void(boost::system::error_code, std::uint64_t);
+  using SMSnapComp = ceph::async::Completion<SMSnapSig>;
+  /// Allocate a self-managed snapshot id on the pool; the id is
+  /// passed to the handler.
+  template<typename CompletionToken>
+  auto allocate_selfmanaged_snap(int64_t pool,
+				 CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SMSnapSig> init(token);
+    allocate_selfmanaged_snap(pool,
+			      SMSnapComp::create(
+				get_executor(),
+				std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto delete_pool_snap(int64_t pool, std::string_view snapName,
+			CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    delete_pool_snap(pool, snapName,
+		     SimpleOpComp::create(get_executor(),
+					  std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  /// Delete a self-managed snapshot by id.
+  // Fixed: the snap parameter is the std::uint64_t id allocated by
+  // allocate_selfmanaged_snap(); the previous std::string_view
+  // parameter could not be passed to the private overload below and
+  // failed to compile on instantiation.
+  template<typename CompletionToken>
+  auto delete_selfmanaged_snap(int64_t pool, std::uint64_t snap,
+			       CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    delete_selfmanaged_snap(pool, snap,
+			    SimpleOpComp::create(
+			      get_executor(),
+			      std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto create_pool(std::string_view name, std::optional<int> crush_rule,
+		   CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    create_pool(name, crush_rule,
+		SimpleOpComp::create(get_executor(),
+				     std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto delete_pool(std::string_view name,
+		   CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    delete_pool(name,
+		SimpleOpComp::create(get_executor(),
+				     std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto delete_pool(int64_t pool,
+		   CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    delete_pool(pool,
+		SimpleOpComp::create(get_executor(),
+				     std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  using PoolStatSig = void(boost::system::error_code,
+			   boost::container::flat_map<std::string,
+			                              PoolStats>, bool);
+  using PoolStatComp = ceph::async::Completion<PoolStatSig>;
+  /// Per-pool statistics for the named pools.
+  template<typename CompletionToken>
+  auto stat_pools(const std::vector<std::string>& pools,
+		  CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, PoolStatSig> init(token);
+    stat_pools(pools,
+	       PoolStatComp::create(get_executor(),
+				    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  using StatFSSig = void(boost::system::error_code,
+			 FSStats);
+  using StatFSComp = ceph::async::Completion<StatFSSig>;
+  /// Cluster usage statistics, optionally restricted to one pool.
+  template<typename CompletionToken>
+  auto statfs(std::optional<int64_t> pool,
+	      CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, StatFSSig> init(token);
+    // Fixed: forward to stat_fs(), the private overload declared
+    // below; there is no ceph_statfs member, so the previous call
+    // failed to compile on instantiation.
+    stat_fs(pool, StatFSComp::create(get_executor(),
+				     std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  // --- Watch/notify ---
+
+  /// Callback invoked for each notify received on a watch.
+  using WatchCB = fu2::unique_function<void(boost::system::error_code,
+					    uint64_t notify_id,
+					    uint64_t cookie,
+					    uint64_t notifier_id,
+					    ceph::buffer::list&& bl)>;
+
+  using WatchSig = void(boost::system::error_code ec,
+			uint64_t cookie);
+  using WatchComp = ceph::async::Completion<WatchSig>;
+  /// Register a watch; the cookie passed to the handler identifies
+  /// the watch for unwatch()/watch_check().
+  template<typename CompletionToken>
+  auto watch(const Object& o, const IOContext& ioc,
+	     std::optional<std::chrono::seconds> timeout,
+	     WatchCB&& cb, CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, WatchSig> init(token);
+    watch(o, ioc, timeout, std::move(cb),
+	  WatchComp::create(get_executor(),
+			    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto watch(const Object& o, std::int64_t pool,
+	     std::optional<std::chrono::seconds> timeout,
+	     WatchCB&& cb, CompletionToken&& token,
+	     std::optional<std::string_view> ns = {},
+	     std::optional<std::string_view> key = {}) {
+    boost::asio::async_completion<CompletionToken, WatchSig> init(token);
+    watch(o, pool, timeout, std::move(cb),
+	  WatchComp::create(get_executor(),
+			    std::move(init.completion_handler)),
+	  ns, key);
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto notify_ack(const Object& o,
+		  const IOContext& ioc,
+		  uint64_t notify_id,
+		  uint64_t cookie,
+		  ceph::buffer::list&& bl,
+		  CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    notify_ack(o, ioc, notify_id, cookie, std::move(bl),
+	       SimpleOpComp::create(get_executor(),
+				    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto notify_ack(const Object& o,
+		  std::int64_t pool,
+		  uint64_t notify_id,
+		  uint64_t cookie,
+		  ceph::buffer::list&& bl,
+		  CompletionToken&& token,
+		  std::optional<std::string_view> ns = {},
+		  std::optional<std::string_view> key = {}) {
+    // Fixed: the completion is a SimpleOpComp, so the token must be
+    // bound to SimpleOpSig; the previous WatchSig binding produced a
+    // handler with the wrong signature and failed to compile.
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    notify_ack(o, pool, notify_id, cookie, std::move(bl),
+	       SimpleOpComp::create(get_executor(),
+				    std::move(init.completion_handler)),
+	       ns, key);
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto unwatch(uint64_t cookie, const IOContext& ioc,
+	       CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    unwatch(cookie, ioc,
+	    SimpleOpComp::create(get_executor(),
+				 std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto unwatch(uint64_t cookie, std::int64_t pool,
+	       CompletionToken&& token,
+	       std::optional<std::string_view> ns = {},
+	       std::optional<std::string_view> key = {}) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    unwatch(cookie, pool,
+	    SimpleOpComp::create(get_executor(),
+				 std::move(init.completion_handler)),
+	    ns, key);
+    return init.result.get();
+  }
+
+  // This is one of those places where having to force everything into
+  // a .cc file is really infuriating. If we had modules, that would
+  // let us separate out the implementation details without
+  // sacrificing all the benefits of templates.
+  using VoidOpSig = void();
+  using VoidOpComp = ceph::async::Completion<VoidOpSig>;
+  /// Wait for all in-flight watch/notify callbacks to drain.
+  template<typename CompletionToken>
+  auto flush_watch(CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, VoidOpSig> init(token);
+    flush_watch(VoidOpComp::create(get_executor(),
+				   std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  using NotifySig = void(boost::system::error_code, ceph::buffer::list);
+  using NotifyComp = ceph::async::Completion<NotifySig>;
+  /// Send a notify to an object's watchers; the handler receives the
+  /// aggregated replies.
+  template<typename CompletionToken>
+  auto notify(const Object& oid, const IOContext& ioc, ceph::buffer::list&& bl,
+	      std::optional<std::chrono::milliseconds> timeout,
+	      CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, NotifySig> init(token);
+    notify(oid, ioc, std::move(bl), timeout,
+	   NotifyComp::create(get_executor(),
+			      std::move(init.completion_handler)));
+
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto notify(const Object& oid, std::int64_t pool, ceph::buffer::list&& bl,
+	      std::optional<std::chrono::milliseconds> timeout,
+	      CompletionToken&& token,
+	      std::optional<std::string_view> ns = {},
+	      std::optional<std::string_view> key = {}) {
+    boost::asio::async_completion<CompletionToken, NotifySig> init(token);
+    // Fixed: the private overload takes the buffer by rvalue
+    // reference, so the lvalue parameter must be moved (the IOContext
+    // overload above already did this).
+    notify(oid, pool, std::move(bl), timeout,
+	   NotifyComp::create(get_executor(),
+			      std::move(init.completion_handler)),
+	   ns, key);
+
+    return init.result.get();
+  }
+
+  // --- Object enumeration ---
+
+  // The versions with pointers are fine for coroutines, but
+  // extraordinarily unappealing for callback-oriented programming.
+  using EnumerateSig = void(boost::system::error_code,
+			    std::vector<Entry>,
+			    Cursor);
+  using EnumerateComp = ceph::async::Completion<EnumerateSig>;
+  /// List up to `max` objects in [begin, end); the handler receives
+  /// the entries and a cursor to resume from.
+  template<typename CompletionToken>
+  auto enumerate_objects(const IOContext& ioc, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, EnumerateSig> init(token);
+    enumerate_objects(ioc, begin, end, max, filter,
+		      EnumerateComp::create(get_executor(),
+					    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto enumerate_objects(std::int64_t pool, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 CompletionToken&& token,
+			 std::optional<std::string_view> ns = {},
+			 std::optional<std::string_view> key = {}) {
+    boost::asio::async_completion<CompletionToken, EnumerateSig> init(token);
+    enumerate_objects(pool, begin, end, max, filter,
+		      EnumerateComp::create(get_executor(),
+					    std::move(init.completion_handler)),
+		      ns, key);
+    return init.result.get();
+  }
+
+  // --- Administrative commands ---
+
+  using CommandSig = void(boost::system::error_code,
+			  std::string, ceph::buffer::list);
+  using CommandComp = ceph::async::Completion<CommandSig>;
+  template<typename CompletionToken>
+  auto osd_command(int osd, std::vector<std::string>&& cmd,
+		   ceph::buffer::list&& in, CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+    osd_command(osd, std::move(cmd), std::move(in),
+		CommandComp::create(get_executor(),
+				    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+  template<typename CompletionToken>
+  auto pg_command(PG pg, std::vector<std::string>&& cmd,
+		  ceph::buffer::list&& in, CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, CommandSig> init(token);
+    pg_command(pg, std::move(cmd), std::move(in),
+	       CommandComp::create(get_executor(),
+				   std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto mon_command(std::vector<std::string> command,
+		   const ceph::buffer::list& bl,
+		   std::string* outs, ceph::buffer::list* outbl,
+		   CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    mon_command(command, bl, outs, outbl,
+		SimpleOpComp::create(get_executor(),
+				     std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto enable_application(std::string_view pool, std::string_view app_name,
+			  bool force, CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    enable_application(pool, app_name, force,
+		       SimpleOpComp::create(get_executor(),
+					    std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto blocklist_add(std::string_view client_address,
+		     std::optional<std::chrono::seconds> expire,
+		     CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    blocklist_add(client_address, expire,
+		  SimpleOpComp::create(get_executor(),
+				       std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  template<typename CompletionToken>
+  auto wait_for_latest_osd_map(CompletionToken&& token) {
+    boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token);
+    wait_for_latest_osd_map(
+      SimpleOpComp::create(get_executor(), std::move(init.completion_handler)));
+    return init.result.get();
+  }
+
+  uint64_t instance_id() const;
+
+private:
+
+  RADOS();
+
+  friend Builder;
+
+  RADOS(std::unique_ptr<detail::Client> impl);
+  static void make_with_cct(CephContext* cct,
+			    boost::asio::io_context& ioctx,
+			    std::unique_ptr<BuildComp> c);
+
+  // Type-erased implementations of the public templates above; all
+  // are defined in the .cc against detail::Client.
+  void execute(const Object& o, const IOContext& ioc, ReadOp&& op,
+	       ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c,
+	       uint64_t* objver, const blkin_trace_info* trace_info);
+
+  void execute(const Object& o, const IOContext& ioc, WriteOp&& op,
+	       std::unique_ptr<Op::Completion> c, uint64_t* objver,
+	       const blkin_trace_info* trace_info);
+
+  void execute(const Object& o, std::int64_t pool, ReadOp&& op,
+	       ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c,
+	       std::optional<std::string_view> ns,
+	       std::optional<std::string_view> key,
+	       uint64_t* objver);
+
+  void execute(const Object& o, std::int64_t pool, WriteOp&& op,
+	       std::unique_ptr<Op::Completion> c,
+	       std::optional<std::string_view> ns,
+	       std::optional<std::string_view> key,
+	       uint64_t* objver);
+
+  void lookup_pool(std::string_view name, std::unique_ptr<LookupPoolComp> c);
+  void list_pools(std::unique_ptr<LSPoolsComp> c);
+  void create_pool_snap(int64_t pool, std::string_view snapName,
+			std::unique_ptr<SimpleOpComp> c);
+  void allocate_selfmanaged_snap(int64_t pool, std::unique_ptr<SMSnapComp> c);
+  void delete_pool_snap(int64_t pool, std::string_view snapName,
+			std::unique_ptr<SimpleOpComp> c);
+  void delete_selfmanaged_snap(int64_t pool, std::uint64_t snap,
+			       std::unique_ptr<SimpleOpComp> c);
+  void create_pool(std::string_view name, std::optional<int> crush_rule,
+		   std::unique_ptr<SimpleOpComp> c);
+  void delete_pool(std::string_view name,
+		   std::unique_ptr<SimpleOpComp> c);
+  void delete_pool(int64_t pool,
+		   std::unique_ptr<SimpleOpComp> c);
+  void stat_pools(const std::vector<std::string>& pools,
+		  std::unique_ptr<PoolStatComp> c);
+  void stat_fs(std::optional<std::int64_t> pool,
+	       std::unique_ptr<StatFSComp> c);
+
+  void watch(const Object& o, const IOContext& ioc,
+	     std::optional<std::chrono::seconds> timeout,
+	     WatchCB&& cb, std::unique_ptr<WatchComp> c);
+  void watch(const Object& o, std::int64_t pool,
+	     std::optional<std::chrono::seconds> timeout,
+	     WatchCB&& cb, std::unique_ptr<WatchComp> c,
+	     std::optional<std::string_view> ns,
+	     std::optional<std::string_view> key);
+  tl::expected<ceph::timespan, boost::system::error_code>
+  watch_check(uint64_t cookie);
+  void notify_ack(const Object& o,
+		  const IOContext& _ioc,
+		  uint64_t notify_id,
+		  uint64_t cookie,
+		  ceph::buffer::list&& bl,
+		  std::unique_ptr<SimpleOpComp>);
+  void notify_ack(const Object& o,
+		  std::int64_t pool,
+		  uint64_t notify_id,
+		  uint64_t cookie,
+		  ceph::buffer::list&& bl,
+		  std::unique_ptr<SimpleOpComp>,
+		  std::optional<std::string_view> ns,
+		  std::optional<std::string_view> key);
+  void unwatch(uint64_t cookie, const IOContext& ioc,
+	       std::unique_ptr<SimpleOpComp>);
+  void unwatch(uint64_t cookie, std::int64_t pool,
+	       std::unique_ptr<SimpleOpComp>,
+	       std::optional<std::string_view> ns,
+	       std::optional<std::string_view> key);
+  void notify(const Object& oid, const IOContext& ioctx,
+	      ceph::buffer::list&& bl,
+	      std::optional<std::chrono::milliseconds> timeout,
+	      std::unique_ptr<NotifyComp> c);
+  void notify(const Object& oid, std::int64_t pool,
+	      ceph::buffer::list&& bl,
+	      std::optional<std::chrono::milliseconds> timeout,
+	      std::unique_ptr<NotifyComp> c,
+	      std::optional<std::string_view> ns,
+	      std::optional<std::string_view> key);
+  void flush_watch(std::unique_ptr<VoidOpComp>);
+
+  void enumerate_objects(const IOContext& ioc, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 std::vector<Entry>* ls,
+			 Cursor* cursor,
+			 std::unique_ptr<SimpleOpComp> c);
+  void enumerate_objects(std::int64_t pool, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 std::vector<Entry>* ls,
+			 Cursor* cursor,
+			 std::unique_ptr<SimpleOpComp> c,
+			 std::optional<std::string_view> ns,
+			 std::optional<std::string_view> key);
+  void enumerate_objects(const IOContext& ioc, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 std::unique_ptr<EnumerateComp> c);
+  void enumerate_objects(std::int64_t pool, const Cursor& begin,
+			 const Cursor& end, const std::uint32_t max,
+			 const ceph::buffer::list& filter,
+			 std::unique_ptr<EnumerateComp> c,
+			 std::optional<std::string_view> ns,
+			 std::optional<std::string_view> key);
+  void osd_command(int osd, std::vector<std::string>&& cmd,
+		   ceph::buffer::list&& in, std::unique_ptr<CommandComp> c);
+  void pg_command(PG pg, std::vector<std::string>&& cmd,
+		  ceph::buffer::list&& in, std::unique_ptr<CommandComp> c);
+
+  void mon_command(std::vector<std::string> command,
+		   const ceph::buffer::list& bl,
+		   std::string* outs, ceph::buffer::list* outbl,
+		   std::unique_ptr<SimpleOpComp> c);
+
+  void enable_application(std::string_view pool, std::string_view app_name,
+			  bool force, std::unique_ptr<SimpleOpComp> c);
+
+  void blocklist_add(std::string_view client_address,
+		     std::optional<std::chrono::seconds> expire,
+		     std::unique_ptr<SimpleOpComp> c);
+
+  void wait_for_latest_osd_map(std::unique_ptr<SimpleOpComp> c);
+
+  // Proxy object to provide access to low-level RADOS messaging clients
+  std::unique_ptr<detail::Client> impl;
+};
+
+// Error codes specific to this client; registered with Boost.System
+// below so they convert to boost::system::error_code.
+enum class errc {
+  pool_dne = 1,		// requested pool does not exist
+  invalid_snapcontext	// snap context rejected -- presumably by the OSD; confirm in .cc
+};
+
+// Category instance for the codes above; defined in the .cc.
+const boost::system::error_category& error_category() noexcept;
+}
+
+// Register neorados::errc with Boost.System: enabling
+// is_error_code_enum lets an errc value convert implicitly to
+// boost::system::error_code via make_error_code() below.
+namespace boost::system {
+template<>
+struct is_error_code_enum<::neorados::errc> {
+  static const bool value = true;
+};
+
+// errc values are concrete codes, not portable conditions, so the
+// error_condition enum conversion stays disabled.
+template<>
+struct is_error_condition_enum<::neorados::errc> {
+  static const bool value = false;
+};
+}
+
+namespace neorados {
+// Adapters required by Boost.System so that neorados::errc values
+// interoperate with boost::system::error_code.
+
+// Explicit conversion: picked up via ADL by the error_code machinery.
+inline boost::system::error_code make_error_code(errc e) noexcept {
+  return boost::system::error_code{static_cast<int>(e), error_category()};
+}
+
+// Implicit conversion: used when comparing codes against conditions.
+inline boost::system::error_condition make_error_condition(errc e) noexcept {
+  return boost::system::error_condition{static_cast<int>(e), error_category()};
+}
+}
+
+// std::hash specializations for the core identifier types, so they
+// can key unordered containers. Definitions live in the .cc.
+namespace std {
+template<>
+struct hash<neorados::Object> {
+  size_t operator ()(const neorados::Object& r) const;
+};
+template<>
+struct hash<neorados::IOContext> {
+  size_t operator ()(const neorados::IOContext& r) const;
+};
+} // namespace std
+
+#endif // NEORADOS_RADOS_HPP
diff --git a/src/include/neorados/RADOS_Decodable.hpp b/src/include/neorados/RADOS_Decodable.hpp
new file mode 100644
index 000000000..83d065b3f
--- /dev/null
+++ b/src/include/neorados/RADOS_Decodable.hpp
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef NEORADOS_RADOS_DECODABLE_HPP
+#define NEORADOS_RADOS_DECODABLE_HPP
+
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <fmt/core.h>
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+namespace neorados {
+// One object returned by enumeration: namespace, object name, and
+// key/locator. Default-constructs to three empty strings.
+struct Entry {
+  std::string nspace;
+  std::string oid;
+  std::string locator;
+
+  Entry() {}
+  // Sink constructor: parameters are taken by value and moved into
+  // place. Fixed: `locator` is now moved like its siblings instead
+  // of being copied.
+  Entry(std::string nspace, std::string oid, std::string locator) :
+    nspace(std::move(nspace)), oid(std::move(oid)),
+    locator(std::move(locator)) {}
+};
+// Lexicographic comparison over (nspace, oid, locator).
+// Fixed: the right-hand operand is now taken by const reference; it
+// was previously passed by value, copying all three strings on every
+// comparison.
+inline bool operator ==(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) ==
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator !=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) !=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator <(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) <
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator <=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) <=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator >=(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) >=
+    std::tie(r.nspace, r.oid, r.locator);
+}
+inline bool operator >(const Entry& l, const Entry& r) {
+  return std::tie(l.nspace, l.oid, l.locator) >
+    std::tie(r.nspace, r.oid, r.locator);
+}
+
+// Render an entry as "[nspace/]oid[@locator]"; empty namespace and
+// empty locator are omitted along with their separators.
+inline std::ostream& operator <<(std::ostream& out, const Entry& entry) {
+  if (!entry.nspace.empty()) {
+    out << entry.nspace << '/';
+  }
+  out << entry.oid;
+  if (!entry.locator.empty()) {
+    out << '@' << entry.locator;
+  }
+  return out;
+}
+
+// Metadata for one clone of a snapshotted object, as reported by
+// ReadOp::list_snaps().
+struct CloneInfo {
+  uint64_t cloneid = 0;
+  std::vector<uint64_t> snaps; // ascending
+  std::vector<std::pair<uint64_t, uint64_t>> overlap;// with next newest
+  uint64_t size = 0;           // clone size in bytes -- presumably; confirm against decoder
+  CloneInfo() = default;
+};
+
+// All clones of one object plus the newest snapshot id it has seen;
+// the result type of ReadOp::list_snaps().
+struct SnapSet {
+  std::vector<CloneInfo> clones; // ascending
+  std::uint64_t seq = 0; // newest snapid seen by the object
+  SnapSet() = default;
+};
+
+// One client watching an object, as reported by
+// ReadOp::list_watchers().
+struct ObjWatcher {
+  /// Address of the Watcher
+  std::string addr;
+  /// Watcher ID
+  std::int64_t watcher_id;
+  /// Cookie
+  std::uint64_t cookie;
+  /// Timeout in Seconds
+  std::uint32_t timeout_seconds;
+};
+}
+
+namespace std {
+// Hash an Entry by combining the hashes of its three string fields.
+// Fixed: the argument is now taken by const reference; passing by
+// value copied all three strings on every hash.
+template<>
+struct hash<::neorados::Entry> {
+  std::size_t operator ()(const ::neorados::Entry& e) const {
+    hash<std::string> h;
+    // Shift each component so permutations of equal strings hash
+    // differently before XOR-combining.
+    return (h(e.nspace) << 2) ^ (h(e.oid) << 1) ^ h(e.locator);
+  }
+};
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<neorados::Entry> : ostream_formatter {};
+#endif
+
+#endif // RADOS_DECODABLE_HPP
diff --git a/src/include/neorados/buffer_fwd.h b/src/include/neorados/buffer_fwd.h
new file mode 120000
index 000000000..bd1f6f1b0
--- /dev/null
+++ b/src/include/neorados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/neorados/completion.h b/src/include/neorados/completion.h
new file mode 120000
index 000000000..100678fc2
--- /dev/null
+++ b/src/include/neorados/completion.h
@@ -0,0 +1 @@
+../../common/async/completion.h \ No newline at end of file
diff --git a/src/include/object.h b/src/include/object.h
new file mode 100644
index 000000000..4564af86e
--- /dev/null
+++ b/src/include/object.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECT_H
+#define CEPH_OBJECT_H
+
+#include <cstdint>
+#include <cstdio>
+#include <iomanip>
+#include <iosfwd>
+#include <string>
+#include <string>
+#include <string_view>
+
+#include "include/rados.h"
+#include "include/unordered_map.h"
+
+#include "hash.h"
+#include "encoding.h"
+#include "ceph_hash.h"
+
+struct object_t { // a RADOS object name: thin wrapper around std::string, ordered/hashed by name alone
+ std::string name;
+
+ object_t() {}
+ // cppcheck-suppress noExplicitConstructor
+ object_t(const char *s) : name(s) {} // implicit conversion from C string is intentional (suppression above)
+ // cppcheck-suppress noExplicitConstructor
+ object_t(const std::string& s) : name(s) {} // implicit conversion from std::string is intentional
+ object_t(std::string&& s) : name(std::move(s)) {}
+ object_t(std::string_view s) : name(s) {}
+
+ auto operator<=>(const object_t&) const noexcept = default; // all comparisons delegate to std::string's lexicographic order
+
+ void swap(object_t& o) {
+ name.swap(o.name);
+ }
+ void clear() {
+ name.clear();
+ }
+
+ void encode(ceph::buffer::list &bl) const { // wire format is just the encoded name string
+ using ceph::encode;
+ encode(name, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ decode(name, bl);
+ }
+};
+WRITE_CLASS_ENCODER(object_t)
+
+inline std::ostream& operator<<(std::ostream& out, const object_t& o) {
+ return out << o.name;
+}
+
+namespace std {
+template<> struct hash<object_t> {
+ size_t operator()(const object_t& r) const {
+ //static hash<string> H;
+ //return H(r.name);
+ return ceph_str_hash_linux(r.name.c_str(), r.name.length());
+ }
+};
+} // namespace std
+
+
+struct file_object_t { // (ino, block-number) pair that renders as an object name "<ino hex>.<bno 8-digit hex>"
+ uint64_t ino, bno;
+ mutable char buf[34]; // cached formatted name: 16+1+16+NUL worst case; mutable so const c_str() can fill it lazily
+
+ file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) {
+ buf[0] = 0; // empty string marks "not formatted yet"
+ }
+
+ const char *c_str() const { // format on first use, then reuse the cached buffer
+ if (!buf[0])
+ snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)bno);
+ return buf;
+ }
+
+ operator object_t() { // implicit conversion to object_t via the formatted name
+ return object_t(c_str());
+ }
+};
+
+
+// ---------------------------
+// snaps
+
+struct snapid_t { // strong-ish typedef for a snapshot id; CEPH_NOSNAP ("head") and CEPH_SNAPDIR are reserved sentinels (see rados.h)
+ uint64_t val;
+ // cppcheck-suppress noExplicitConstructor
+ snapid_t(uint64_t v=0) : val(v) {} // implicit conversion from uint64_t is intentional
+ snapid_t operator+=(snapid_t o) { val += o.val; return *this; }
+ snapid_t operator++() { ++val; return *this; } // pre-increment only; no post-increment defined
+ operator uint64_t() const { return val; } // implicit conversion back enables uint64_t comparisons/arithmetic
+};
+
+inline void encode(snapid_t i, ceph::buffer::list &bl) { // encoded as the raw uint64_t value
+ using ceph::encode;
+ encode(i.val, bl);
+}
+inline void decode(snapid_t &i, ceph::buffer::list::const_iterator &p) { // mirror of encode: decode a raw uint64_t
+ using ceph::decode;
+ decode(i.val, p);
+}
+
+template<>
+struct denc_traits<snapid_t> { // denc support: snapid_t encodes exactly like its underlying uint64_t
+ static constexpr bool supported = true;
+ static constexpr bool featured = false; // encoding takes no feature flags
+ static constexpr bool bounded = true; // bound_encode yields a fixed upper bound (sizeof uint64_t)
+ static constexpr bool need_contiguous = true; // decode reads from a contiguous buffer iterator
+ static void bound_encode(const snapid_t& o, size_t& p) {
+ denc(o.val, p);
+ }
+ static void encode(const snapid_t &o, ceph::buffer::list::contiguous_appender& p) {
+ denc(o.val, p);
+ }
+ static void decode(snapid_t& o, ceph::buffer::ptr::const_iterator &p) {
+ denc(o.val, p);
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const snapid_t& s) { // symbolic names for the two sentinels, hex for everything else
+ if (s == CEPH_NOSNAP)
+ return out << "head"; // CEPH_NOSNAP == the live ("head") revision
+ else if (s == CEPH_SNAPDIR)
+ return out << "snapdir"; // CEPH_SNAPDIR == the hidden .snap dir
+ else
+ return out << std::hex << s.val << std::dec; // restore decimal mode after printing
+}
+
+
+struct sobject_t { // "snapped object": an object name plus the snapshot id it refers to
+ object_t oid;
+ snapid_t snap;
+
+ sobject_t() : snap(0) {}
+ sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {}
+
+ auto operator<=>(const sobject_t&) const noexcept = default; // ordered by (oid, snap)
+
+ void swap(sobject_t& o) {
+ oid.swap(o.oid); // string swap avoids copying the name
+ snapid_t t = snap;
+ snap = o.snap;
+ o.snap = t;
+ }
+
+ void encode(ceph::buffer::list& bl) const { // wire format: oid then snap, in that order
+ using ceph::encode;
+ encode(oid, bl);
+ encode(snap, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) { // must mirror encode's field order
+ using ceph::decode;
+ decode(oid, bl);
+ decode(snap, bl);
+ }
+};
+WRITE_CLASS_ENCODER(sobject_t)
+
+inline std::ostream& operator<<(std::ostream& out, const sobject_t &o) {
+ return out << o.oid << "/" << o.snap;
+}
+namespace std {
+template<> struct hash<sobject_t> { // combines the name hash with an integer hash of the snap id
+ size_t operator()(const sobject_t &r) const {
+ static hash<object_t> H; // stateless hashers; static avoids re-construction per call
+ static rjhash<uint64_t> I;
+ return H(r.oid) ^ I(r.snap);
+ }
+};
+} // namespace std
+
+#endif
diff --git a/src/include/object_fmt.h b/src/include/object_fmt.h
new file mode 100644
index 000000000..33df5e3fb
--- /dev/null
+++ b/src/include/object_fmt.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+/**
+ * \file fmtlib formatters for some object.h structs
+ */
+#include <fmt/format.h>
+
+#include "object.h"
+
+
+template <>
+struct fmt::formatter<snapid_t> { // fmtlib formatter mirroring object.h's operator<< for snapid_t
+
+ constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } // accepts no format-spec options
+
+ template <typename FormatContext>
+ auto format(const snapid_t& snp, FormatContext& ctx) const
+ {
+ if (snp == CEPH_NOSNAP) { // live ("head") revision
+ return fmt::format_to(ctx.out(), "head");
+ }
+ if (snp == CEPH_SNAPDIR) { // hidden .snap dir
+ return fmt::format_to(ctx.out(), "snapdir");
+ }
+ return fmt::format_to(ctx.out(), "{:x}", snp.val); // plain ids print as bare hex, matching operator<<
+ }
+};
diff --git a/src/include/on_exit.h b/src/include/on_exit.h
new file mode 100644
index 000000000..c412ab33e
--- /dev/null
+++ b/src/include/on_exit.h
@@ -0,0 +1,49 @@
+#ifndef CEPH_ON_EXIT_H
+#define CEPH_ON_EXIT_H
+
+#include <pthread.h>
+#include <vector>
+
+#include "include/ceph_assert.h"
+/*
+ * Create a static instance at the file level to get callbacks called when the
+ * process exits via main() or exit().
+ */
+
+class OnExitManager { // runs registered callbacks from its destructor; intended as a file-level static (see header comment)
+ public:
+ typedef void (*callback_t)(void *arg);
+
+ OnExitManager() {
+ int ret = pthread_mutex_init(&lock_, NULL);
+ ceph_assert(ret == 0); // mutex init must succeed; there is no error path here
+ }
+
+ ~OnExitManager() { // invoke every callback in registration order, then drop them all
+ pthread_mutex_lock(&lock_);
+ std::vector<struct cb>::iterator it;
+ for (it = funcs_.begin(); it != funcs_.end(); it++) {
+ it->func(it->arg);
+ }
+ funcs_.clear(); // ensure callbacks run at most once
+ pthread_mutex_unlock(&lock_);
+ }
+
+ void add_callback(callback_t func, void *arg) { // thread-safe registration; arg is passed back verbatim at exit
+ pthread_mutex_lock(&lock_);
+ struct cb callback = { func, arg };
+ funcs_.push_back(callback);
+ pthread_mutex_unlock(&lock_);
+ }
+
+ private:
+ struct cb { // (function, opaque argument) pair
+ callback_t func;
+ void *arg;
+ };
+
+ std::vector<struct cb> funcs_;
+ pthread_mutex_t lock_; // NOTE(review): never pthread_mutex_destroy'ed — harmless for a process-lifetime static, confirm intentional
+};
+
+#endif
diff --git a/src/include/page.h b/src/include/page.h
new file mode 100644
index 000000000..db6e20585
--- /dev/null
+++ b/src/include/page.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_PAGE_H
+#define CEPH_PAGE_H
+
+namespace ceph {
+ // these are in common/page.cc
+ extern unsigned _page_size;
+ extern unsigned long _page_mask;
+ extern unsigned _page_shift;
+}
+
+#endif
+
+
+#define CEPH_PAGE_SIZE ceph::_page_size
+#define CEPH_PAGE_MASK ceph::_page_mask
+#define CEPH_PAGE_SHIFT ceph::_page_shift
+
+
diff --git a/src/include/rados.h b/src/include/rados.h
new file mode 100644
index 000000000..eac3a2159
--- /dev/null
+++ b/src/include/rados.h
@@ -0,0 +1,700 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include "msgr.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a, /* memcmp semantics: <0, 0, >0 over the 16 raw fsid bytes */
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg pool types
+ *
+ * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are
+ * duplicated here only for CrushCompiler's benefit.
+ */
+#define CEPH_PG_TYPE_REPLICATED 1
+/* #define CEPH_PG_TYPE_RAID4 2 never implemented */
+#define CEPH_PG_TYPE_ERASURE 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ *
+ * ** This function is released to the public domain by the author. **
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask; /* low bits already land inside [0, b): use them directly */
+ else
+ return x & (bmask >> 1); /* otherwise fold into the lower half, which is < b since b > bmask/2 */
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
+#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
+#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */
+#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
+#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */
+#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */
+#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */
+#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */
+#define CEPH_OSD_STOP (1<<12) /* osd has been stopped by admin */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), deprecated since mimic */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), deprecated since mimic */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */
+#define CEPH_OSDMAP_NOAUTOSCALE (1<<23) /* block pg autoscale */
+
+/* these are hidden in 'ceph status' view */
+#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS | \
+ CEPH_OSDMAP_RECOVERY_DELETES | \
+ CEPH_OSDMAP_SORTBITWISE | \
+ CEPH_OSDMAP_PURGED_SNAPDIRS | \
+ CEPH_OSDMAP_PGLOG_HARDLIMIT)
+#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS)
+
+/*
+ * major ceph release numbers
+ */
+#define CEPH_RELEASE_ARGONAUT 1
+#define CEPH_RELEASE_BOBTAIL 2
+#define CEPH_RELEASE_CUTTLEFISH 3
+#define CEPH_RELEASE_DUMPLING 4
+#define CEPH_RELEASE_EMPEROR 5
+#define CEPH_RELEASE_FIREFLY 6
+#define CEPH_RELEASE_GIANT 7
+#define CEPH_RELEASE_HAMMER 8
+#define CEPH_RELEASE_INFERNALIS 9
+#define CEPH_RELEASE_JEWEL 10
+#define CEPH_RELEASE_KRAKEN 11
+#define CEPH_RELEASE_LUMINOUS 12
+#define CEPH_RELEASE_MIMIC 13
+#define CEPH_RELEASE_NAUTILUS 14
+#define CEPH_RELEASE_OCTOPUS 15
+#define CEPH_RELEASE_PACIFIC 16
+#define CEPH_RELEASE_QUINCY 17
+#define CEPH_RELEASE_REEF 18
+#define CEPH_RELEASE_MAX 19 /* highest + 1 */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+// LEAVE UNUSED 0x0600 used to be multiobject ops
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAPRMKEYRANGE, __CEPH_OSD_OP(WR, DATA, 44), "omap-rm-key-range") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
+ /* was copy-get-classic */ \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /* cache pin/unpin */ \
+ f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \
+ f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \
+ \
+ /* ESX/SCSI */ \
+ f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \
+ f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \
+ \
+ /* Extensible */ \
+ f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \
+ f(SET_CHUNK, __CEPH_OSD_OP(CACHE, DATA, 40), "set-chunk") \
+ f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \
+ f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \
+ f(TIER_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 43), "tier-flush") \
+ f(TIER_EVICT, __CEPH_OSD_OP(CACHE, DATA, 44), "tier-evict") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ /* 8 used to be scrub-stop */ \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \
+ f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \
+ f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \
+ f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls")
+
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+static inline int ceph_osd_op_type_data(int op) /* TYPE nibble == DATA */
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op) /* TYPE nibble == ATTR */
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op) /* TYPE nibble == EXEC */
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op) /* TYPE nibble == PG */
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op) /* entire MODE nibble must equal SUB */
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op) /* RD bit set, excluding the CALL special case */
+{
+ return (op & CEPH_OSD_OP_MODE_RD) &&
+ op != CEPH_OSD_OP_CALL; /* CALL carries RD but is not a read; see the note in the op table above */
+}
+static inline int ceph_osd_op_mode_modify(int op) /* WR bit set */
+{
+ return op & CEPH_OSD_OP_MODE_WR;
+}
+static inline int ceph_osd_op_mode_cache(int op) /* CACHE bit set */
+{
+ return op & CEPH_OSD_OP_MODE_CACHE;
+}
+static inline bool ceph_osd_op_uses_extent(int op) /* true when the op's payload uses ceph_osd_op.extent */
+{
+ switch(op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_MAPEXT:
+ case CEPH_OSD_OP_MASKTRUNC:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_TRUNCATE:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_APPEND:
+ case CEPH_OSD_OP_TRIMTRUNC:
+ case CEPH_OSD_OP_CMPEXT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * and objclass.h. Any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id
+ */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+ CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */
+ CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */
+ CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */
+ CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
+};
+
+#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLOCKLISTED 108 /* blocklisted */
+#define EBLACKLISTED 108 /* deprecated */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* use provided truncate_{seq,size} (copy-from2 only) */
+};
+
+#define CEPH_OSD_COPY_FROM_FLAGS \
+ (CEPH_OSD_COPY_FROM_FLAG_FLUSH | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | \
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | \
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED | \
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)
+
+enum {
+ CEPH_OSD_TMAP2OMAP_NULLOK = 1,
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+enum {
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0,
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1,
+ CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+const char *ceph_osd_alloc_hint_flag_name(int f);
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+const char *ceph_osd_backoff_op_name(int op);
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 count;
+ __le32 start_epoch; /* for the pgls sequence */
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __u32 gen; /* registration generation */
+ __u32 timeout; /* connection timeout */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 max; /* max data in reply */
+ } __attribute__ ((packed)) copy_get;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
+ struct {
+ struct ceph_timespec stamp;
+ } __attribute__ ((packed)) hit_set_get;
+ struct {
+ __u8 flags;
+ } __attribute__ ((packed)) tmap2omap;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le64 data_length;
+ } __attribute__ ((packed)) writesame;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le32 chunk_size;
+ __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
+ } __attribute__ ((packed)) checksum;
+ } __attribute__ ((packed));
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * Check the compatibility of struct ceph_osd_op
+ * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) +
+ * sizeof(ceph_osd_op::flags) +
+ * sizeof(ceph_osd_op::extent) +
+ * sizeof(ceph_osd_op::payload_len))
+ */
+#ifdef __cplusplus
+static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4),
+ "sizeof(ceph_osd_op) breaks the compatibility");
+#endif
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
new file mode 120000
index 000000000..51fc03be1
--- /dev/null
+++ b/src/include/rados/buffer.h
@@ -0,0 +1 @@
+../buffer.h \ No newline at end of file
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 120000
index 000000000..bd1f6f1b0
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h
new file mode 120000
index 000000000..19ef4317e
--- /dev/null
+++ b/src/include/rados/crc32c.h
@@ -0,0 +1 @@
+../crc32c.h \ No newline at end of file
diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h
new file mode 120000
index 000000000..48f0d4436
--- /dev/null
+++ b/src/include/rados/inline_memory.h
@@ -0,0 +1 @@
+../inline_memory.h \ No newline at end of file
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
new file mode 100644
index 000000000..858804c3a
--- /dev/null
+++ b/src/include/rados/librados.h
@@ -0,0 +1,4156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOS_H
+#define CEPH_LIBRADOS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include "rados_types.h"
+
+#include <sys/time.h>
+
+#ifndef CEPH_OSD_TMAP_SET
+/* These are also defined in rados.h and objclass.h. Keep them in sync! */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c'
+#define CEPH_OSD_TMAP_RM 'r'
+#endif
+
+#define LIBRADOS_VER_MAJOR 3
+#define LIBRADOS_VER_MINOR 0
+#define LIBRADOS_VER_EXTRA 0
+
+#define LIBRADOS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+
+#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
+
+#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
+#define LIBRADOS_SUPPORTS_GETADDRS 1
+#define LIBRADOS_SUPPORTS_APP_METADATA 1
+
+/* RADOS lock flags
+ * They are also defined in cls_lock_types.h. Keep them in sync!
+ */
+#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0)
+#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW
+#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1)
+
+/*
+ * Constants for rados_write_op_create().
+ */
+#define LIBRADOS_CREATE_EXCLUSIVE 1
+#define LIBRADOS_CREATE_IDEMPOTENT 0
+
+/*
+ * Flags that can be set on a per-op basis via
+ * rados_read_op_set_flags() and rados_write_op_set_flags().
+ */
+enum {
+ // fail a create operation if the object already exists
+ LIBRADOS_OP_FLAG_EXCL = 0x1,
+ // allow the transaction to succeed even if the flagged op fails
+ LIBRADOS_OP_FLAG_FAILOK = 0x2,
+ // indicate read/write op random
+ LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4,
+ // indicate read/write op sequential
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8,
+ // indicate read/write data will be accessed in the near future (by someone)
+ LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10,
+  // indicate read/write data will not be accessed in the near future (by anyone)
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20,
+  // indicate read/write data will not be accessed again (by *this* client)
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40,
+ // optionally support FUA (force unit access) on write requests
+ LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80,
+};
+
+#define CEPH_RADOS_API
+
+/**
+ * @name xattr comparison operations
+ * Operators for comparing xattrs on objects, and aborting the
+ * rados_read_op or rados_write_op transaction if the comparison
+ * fails.
+ *
+ * @{
+ */
+enum {
+ LIBRADOS_CMPXATTR_OP_EQ = 1,
+ LIBRADOS_CMPXATTR_OP_NE = 2,
+ LIBRADOS_CMPXATTR_OP_GT = 3,
+ LIBRADOS_CMPXATTR_OP_GTE = 4,
+ LIBRADOS_CMPXATTR_OP_LT = 5,
+ LIBRADOS_CMPXATTR_OP_LTE = 6
+};
+/** @} */
+
+/**
+ * @name Operation Flags
+ * Flags for rados_read_op_operate(), rados_write_op_operate(),
+ * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
+ * See librados.hpp for details.
+ * @{
+ */
+enum {
+ LIBRADOS_OPERATION_NOFLAG = 0,
+ LIBRADOS_OPERATION_BALANCE_READS = 1,
+ LIBRADOS_OPERATION_LOCALIZE_READS = 2,
+ LIBRADOS_OPERATION_ORDER_READS_WRITES = 4,
+ LIBRADOS_OPERATION_IGNORE_CACHE = 8,
+ LIBRADOS_OPERATION_SKIPRWLOCKS = 16,
+ LIBRADOS_OPERATION_IGNORE_OVERLAY = 32,
+ /* send requests to cluster despite the cluster or pool being marked
+ full; ops will either succeed (e.g., delete) or return EDQUOT or
+ ENOSPC. */
+ LIBRADOS_OPERATION_FULL_TRY = 64,
+ /*
+ * Mainly for delete op
+ */
+ LIBRADOS_OPERATION_FULL_FORCE = 128,
+ LIBRADOS_OPERATION_IGNORE_REDIRECT = 256,
+ LIBRADOS_OPERATION_ORDERSNAP = 512,
+ /* enable/allow >0 return values and payloads on write/update */
+ LIBRADOS_OPERATION_RETURNVEC = 1024,
+};
+/** @} */
+
+/**
+ * @name Alloc hint flags
+ * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2()
+ * indicating future IO patterns.
+ * @{
+ */
+enum {
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+/** @} */
+
+typedef enum {
+ LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0,
+ LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1,
+ LIBRADOS_CHECKSUM_TYPE_CRC32C = 2
+} rados_checksum_type_t;
+
+/*
+ * snap id constants
+ */
+#define LIBRADOS_SNAP_HEAD UINT64_C(-2)
+#define LIBRADOS_SNAP_DIR UINT64_C(-1)
+
+/**
+ * @typedef rados_t
+ *
+ * A handle for interacting with a RADOS cluster. It encapsulates all
+ * RADOS client configuration, including username, key for
+ * authentication, logging, and debugging. Talking to different clusters
+ * -- or to the same cluster with different users -- requires
+ * different cluster handles.
+ */
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif //VOIDPTR_RADOS_T
+
+/**
+ * @typedef rados_config_t
+ *
+ * A handle for the ceph configuration context for the rados_t cluster
+ * instance. This can be used to share configuration context/state
+ * (e.g., logging configuration) between librados instance.
+ *
+ * @warning The config context does not have independent reference
+ * counting. As such, a rados_config_t handle retrieved from a given
+ * rados_t is only valid as long as that rados_t.
+ */
+typedef void *rados_config_t;
+
+/**
+ * @typedef rados_ioctx_t
+ *
+ * An io context encapsulates a few settings for all I/O operations
+ * done on it:
+ * - pool - set when the io context is created (see rados_ioctx_create())
+ * - snapshot context for writes (see
+ * rados_ioctx_selfmanaged_snap_set_write_ctx())
+ * - snapshot id to read from (see rados_ioctx_snap_set_read())
+ * - object locator for all single-object operations (see
+ * rados_ioctx_locator_set_key())
+ * - namespace for all single-object operations (see
+ * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES
+ * before rados_nobjects_list_open() will list all objects in all
+ * namespaces.
+ *
+ * @warning Changing any of these settings is not thread-safe -
+ * librados users must synchronize any of these changes on their own,
+ * or use separate io contexts for each thread
+ */
+typedef void *rados_ioctx_t;
+
+/**
+ * @typedef rados_list_ctx_t
+ *
+ * An iterator for listing the objects in a pool.
+ * Used with rados_nobjects_list_open(),
+ * rados_nobjects_list_next(), rados_nobjects_list_next2(), and
+ * rados_nobjects_list_close().
+ */
+typedef void *rados_list_ctx_t;
+
+/**
+ * @typedef rados_object_list_cursor
+ *
+ * The cursor used with rados_enumerate_objects
+ * and accompanying methods.
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct {
+
+ /// oid length
+ size_t oid_length;
+ /// name of the object
+ char *oid;
+ /// namespace length
+ size_t nspace_length;
+ /// the object namespace
+ char *nspace;
+ /// locator length
+ size_t locator_length;
+ /// object locator
+ char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool.
+ */
+struct rados_pool_stat_t {
+ /// space used in bytes
+ uint64_t num_bytes;
+ /// space used in KB
+ uint64_t num_kb;
+ /// number of objects in the pool
+ uint64_t num_objects;
+ /// number of clones of objects
+ uint64_t num_object_clones;
+ /// num_objects * num_replicas
+ uint64_t num_object_copies;
+ /// number of objects missing on primary
+ uint64_t num_objects_missing_on_primary;
+ /// number of objects found on no OSDs
+ uint64_t num_objects_unfound;
+ /// number of objects replicated fewer times than they should be
+ /// (but found on at least one OSD)
+ uint64_t num_objects_degraded;
+ /// number of objects read
+ uint64_t num_rd;
+ /// objects read in KB
+ uint64_t num_rd_kb;
+ /// number of objects written
+ uint64_t num_wr;
+ /// objects written in KB
+ uint64_t num_wr_kb;
+ /// bytes originally provided by user
+ uint64_t num_user_bytes;
+ /// bytes passed compression
+ uint64_t compressed_bytes_orig;
+ /// bytes resulted after compression
+ uint64_t compressed_bytes;
+ /// bytes allocated at storage
+ uint64_t compressed_bytes_alloc;
+};
+
+/**
+ * @struct rados_cluster_stat_t
+ * Cluster-wide usage information
+ */
+struct rados_cluster_stat_t {
+ /// total device size
+ uint64_t kb;
+ /// total used
+ uint64_t kb_used;
+ /// total available/free
+ uint64_t kb_avail;
+ /// number of objects
+ uint64_t num_objects;
+};
+
+/**
+ * @typedef rados_write_op_t
+ *
+ * An object write operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_write_op() rados_release_write_op()
+ * - Extended attribute manipulation: rados_write_op_cmpxattr()
+ * rados_write_op_cmpxattr(), rados_write_op_setxattr(),
+ * rados_write_op_rmxattr()
+ * - Object map key/value pairs: rados_write_op_omap_set(),
+ * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(),
+ * rados_write_op_omap_cmp()
+ * - Object properties: rados_write_op_assert_exists(),
+ * rados_write_op_assert_version()
+ * - Creating objects: rados_write_op_create()
+ * - IO on objects: rados_write_op_append(), rados_write_op_write(), rados_write_op_zero
+ * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove,
+ * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
+ * - Hints: rados_write_op_set_alloc_hint()
+ * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
+ */
+typedef void *rados_write_op_t;
+
+/**
+ * @typedef rados_read_op_t
+ *
+ * An object read operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ * rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ * rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ * rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ * rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using librados.
+ *
+ * @{
+ */
+
+/**
+ * Create a handle for communicating with a RADOS cluster.
+ *
+ * Ceph environment variables are read when this is called, so if
+ * $CEPH_ARGS specifies everything you need to connect, no further
+ * configuration is necessary.
+ *
+ * @param cluster where to store the handle
+ * @param id the user to connect as (i.e. admin, not client.admin)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id);
+
+/**
+ * Extended version of rados_create.
+ *
+ * Like rados_create, but
+ * 1) don't assume 'client\.'+id; allow full specification of name
+ * 2) allow specification of cluster name
+ * 3) flags for future expansion
+ */
+CEPH_RADOS_API int rados_create2(rados_t *pcluster,
+ const char *const clustername,
+ const char * const name, uint64_t flags);
+
+/**
+ * Initialize a cluster handle from an existing configuration.
+ *
+ * Share configuration state with another rados_t instance.
+ *
+ * @param cluster where to store the handle
+ * @param cct the existing configuration to use
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
+ rados_config_t cct);
+
+/**
+ * Ping the monitor with ID mon_id, storing the resulting reply in
+ * buf (if specified) with a maximum size of len.
+ *
+ * The result buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param mon_id [in] ID of the monitor to ping
+ * @param outstr [out] double pointer with the resulting reply
+ * @param outstrlen [out] pointer with the size of the reply in outstr
+ */
+CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id,
+ char **outstr, size_t *outstrlen);
+
+/**
+ * Connect to the cluster.
+ *
+ * @note BUG: Before calling this, calling a function that communicates with the
+ * cluster will crash.
+ *
+ * @pre The cluster handle is configured with at least a monitor
+ * address. If cephx is enabled, a client name and secret must also be
+ * set.
+ *
+ * @post If this succeeds, any function in librados may be used
+ *
+ * @param cluster The cluster to connect to.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_connect(rados_t cluster);
+
+/**
+ * Disconnects from the cluster.
+ *
+ * For clean up, this is only necessary after rados_connect() has
+ * succeeded.
+ *
+ * @warning This does not guarantee any asynchronous writes have
+ * completed. To do that, you must call rados_aio_flush() on all open
+ * io contexts.
+ *
+ * @warning We implicitly call rados_watch_flush() on shutdown. If
+ * there are watches being used, this should be done explicitly before
+ * destroying the relevant IoCtx. We do it here as a safety measure.
+ *
+ * @post the cluster handle cannot be used again
+ *
+ * @param cluster the cluster to shutdown
+ */
+CEPH_RADOS_API void rados_shutdown(rados_t cluster);
+
+/** @} init */
+
+/**
+ * @name Configuration
+ * These functions read and update Ceph configuration for a cluster
+ * handle. Any configuration changes must be done before connecting to
+ * the cluster.
+ *
+ * Options that librados users might want to set include:
+ * - mon_host
+ * - auth_supported
+ * - key, keyfile, or keyring when using cephx
+ * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
+ * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
+ *
+ * See docs.ceph.com for information about available configuration options.
+ *
+ * @{
+ */
+
+/**
+ * Configure the cluster handle using a Ceph config file
+ *
+ * If path is NULL, the default locations are searched, and the first
+ * found is used. The locations are:
+ * - $CEPH_CONF (environment variable)
+ * - /etc/ceph/ceph.conf
+ * - ~/.ceph/config
+ * - ceph.conf (in the current working directory)
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param path path to a Ceph configuration file
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path);
+
+/**
+ * Configure the cluster handle with command line arguments
+ *
+ * argv can contain any common Ceph command line option, including any
+ * configuration parameter prefixed by '--' and replacing spaces with
+ * dashes or underscores. For example, the following options are equivalent:
+ * - --mon-host 10.0.0.1:6789
+ * - --mon_host 10.0.0.1:6789
+ * - -m 10.0.0.1:6789
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc,
+ const char **argv);
+
+
+/**
+ * Configure the cluster handle with command line arguments, returning
+ * any remainders. Same rados_conf_parse_argv, except for extra
+ * remargv argument to hold returns unrecognized arguments.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @param remargv char* array for returned unrecognized arguments
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc,
+ const char **argv,
+ const char **remargv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cluster cluster handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var);
+
+/**
+ * Set a configuration option
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param option option to set
+ * @param value value of the option
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when the option is not a Ceph configuration option
+ */
+CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option,
+ const char *value);
+
+/**
+ * Get the value of a configuration option
+ *
+ * @param cluster configuration to read
+ * @param option which option to read
+ * @param buf where to write the configuration value
+ * @param len the size of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENAMETOOLONG if the buffer is too short to contain the
+ * requested value
+ */
+CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option,
+ char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+ struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ *
+ * @param cluster the cluster to shutdown
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings. The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
+ * @param cluster cluster handle
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len);
+
+/**
+ * List inconsistent placement groups of the given pool
+ *
+ * Gets a list of inconsistent placement groups as NULL-terminated strings.
+ * The placement group names will be placed in the supplied buffer one after
+ * another. After the last name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the placement group entries we need, we will
+ * fill as much as we can.
+ *
+ * @param cluster cluster handle
+ * @param pool pool ID
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool,
+ char *buf, size_t len);
+
+/**
+ * Get a configuration handle for a rados cluster handle
+ *
+ * This handle is valid only as long as the cluster handle is valid.
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param require_osd_release [out] minimum compatible OSD version
+ * based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+ int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param min_compat_client [out] minimum compatible client version
+ * based upon the current features
+ * @param require_min_compat_client [out] required minimum client version
+ * based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+ int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+ rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+ rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous
+ * requests on it, but you should not use an io context again after
+ * calling this function on it.
+ *
+ * @warning This does not guarantee any asynchronous
+ * writes have completed. You must call rados_aio_flush()
+ * on the io context before destroying it to do that.
+ *
+ * @warning If this ioctx is used by rados_watch, the caller needs to
+ * be sure that all registered watches are disconnected via
+ * rados_unwatch() and that rados_watch_flush() is called. This
+ * ensures that a racing watch callback does not make use of a
+ * destroyed ioctx.
+ *
+ * @param io the io context to dispose of
+ */
+CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io);
+
+/**
+ * Get configuration handle for a pool handle
+ *
+ * @param io pool handle
+ * @returns rados_config_t for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io);
+
+/**
+ * Get the cluster handle used by this rados_ioctx_t
+ * Note that this is a weak reference, and should not
+ * be destroyed via rados_shutdown().
+ *
+ * @param io the io context
+ * @returns the cluster handle for this io context
+ */
+CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io);
+
+/**
+ * Get pool usage statistics
+ *
+ * Fills in a rados_pool_stat_t after querying the cluster.
+ *
+ * @param io determines which pool to query
+ * @param stats where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io,
+ struct rados_pool_stat_t *stats);
+
+/**
+ * Get the id of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name which pool to look up
+ * @returns id of the pool
+ * @returns -ENOENT if the pool is not found
+ */
+CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster,
+ const char *pool_name);
+
+/**
+ * Get the name of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param id the id of the pool
+ * @param buf where to store the pool name
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id,
+ char *buf, size_t maxlen);
+
+/**
+ * Create a pool with default settings
+ *
+ * The default crush rule is rule 0.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name);
+
+/**
+ * Create a pool owned by a specific auid.
+ *
+ * DEPRECATED: auid support has been removed, and this call will be removed in a future
+ * release.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid)
+ __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+ const char *pool_name,
+ uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid,
+ uint8_t crush_rule_num)
+ __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool ID of the pool to query
+ * @param base_tier [out] base tier, or \c pool if tiering is not configured
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
+ int64_t* base_tier);
+
+/**
+ * Delete a pool and all data inside it
+ *
+ * The pool is removed from the cluster immediately,
+ * but the actual data is deleted in the background.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool_name which pool to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
+
+/**
+ * Attempt to change an io context's associated auid "owner"
+ *
+ * DEPRECATED: auid support has been removed and this call has no effect.
+ *
+ * Requires that you have write permission on both the current and new
+ * auid.
+ *
+ * @param io reference to the pool to change.
+ * @param auid the auid you wish the io to have.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid)
+ __attribute__((deprecated));
+
+
+/**
+ * Get the auid of a pool
+ *
+ * DEPRECATED: auid support has been removed and this call always reports
+ * CEPH_AUTH_UID_DEFAULT (-1).
+ *
+ * @param io pool to query
+ * @param auid where to store the auid
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid)
+ __attribute__((deprecated));
+
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param req 1 if alignment is supported, 0 if not.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+ int *req);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+ uint64_t *alignment);
+
+/**
+ * Get the pool id of the io context
+ *
+ * @param io the io context to query
+ * @returns the id of the pool the io context uses
+ */
+CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io);
+
+/**
+ * Get the pool name of the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} pools */
+
+/**
+ * @name Object Locators
+ *
+ * @{
+ */
+
+/**
+ * Set the key for mapping objects to pgs within an io context.
+ *
+ * The key is used instead of the object name to determine which
+ * placement groups an object is put in. This affects all subsequent
+ * operations of the io context - until a different locator key is
+ * set, all objects in this io context will be placed in the same pg.
+ *
+ * @param io the io context to change
+ * @param key the key to use as the object locator, or NULL to discard
+ * any previously set key
+ */
+CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io,
+ const char *key);
+
+/**
+ * Set the namespace for objects within an io context
+ *
+ * The namespace specification further refines a pool into different
+ * domains. The mapping of objects to pgs is also based on this
+ * value.
+ *
+ * @param io the io context to change
+ * @param nspace the name to use as the namespace, or NULL to use the
+ * default namespace
+ */
+CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
+ const char *nspace);
+
+/**
+ * Get the namespace for objects within the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} obj_loc */
+
+/**
+ * @name Listing Objects
+ * @{
+ */
+/**
+ * Start listing objects in a pool
+ *
+ * @param io the pool to list from
+ * @param ctx the handle to store list context in
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io,
+ rados_list_ctx_t *ctx);
+
+/**
+ * Return hash position of iterator, rounded to the current PG
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @returns current hash position, rounded to the current pg
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx);
+
+/**
+ * Reposition object iterator to a different hash position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param pos hash position to move to
+ * @returns actual (rounded) position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek(rados_list_ctx_t ctx,
+ uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor cursor);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes allow listing objects whose names contain \0 (the NUL
+ * character), e.g. in *entry. It is unusual to see such object names,
+ * but a bug in a client raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace,
+ size_t *entry_size,
+ size_t *key_size,
+ size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+ rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io);
+
+/**
+ * Check if a cursor has reached the end of a pool
+ *
+ * @param io ioctx
+ * @param cur cursor
+ * @returns 1 if the cursor has reached the end of the pool, 0 otherwise
+ */
+CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Release a cursor
+ *
+ * Release a cursor. The handle may not be used after this point.
+ *
+ * @param io ioctx
+ * @param cur cursor
+ */
+CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Compare two cursor positions
+ *
+ * Compare two cursors, and indicate whether the first cursor precedes,
+ * matches, or follows the second.
+ *
+ * @param io ioctx
+ * @param lhs first cursor
+ * @param rhs second cursor
+ * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs
+ */
+CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io,
+ rados_object_list_cursor lhs, rados_object_list_cursor rhs);
+
+/**
+ * @return the number of items set in the results array
+ */
+CEPH_RADOS_API int rados_object_list(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t result_size,
+ const char *filter_buf,
+ const size_t filter_buf_len,
+ rados_object_list_item *results,
+ rados_object_list_cursor *next);
+
+CEPH_RADOS_API void rados_object_list_free(
+ const size_t result_size,
+ rados_object_list_item *results);
+
+/**
+ * Obtain cursors delineating a subset of a range. Use this
+ * when you want to split up the work of iterating over the
+ * global namespace. Expected use case is when you are iterating
+ * in parallel, with `m` workers, and each worker taking an id `n`.
+ *
+ * @param io ioctx
+ * @param start start of the range to be sliced up (inclusive)
+ * @param finish end of the range to be sliced up (exclusive)
+ * @param n which of the m chunks you would like to get cursors for
+ * @param m how many chunks to divide start-finish into
+ * @param split_start cursor populated with start of the subrange (inclusive)
+ * @param split_finish cursor populated with end of the subrange (exclusive)
+ */
+CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t n,
+ const size_t m,
+ rados_object_list_cursor *split_start,
+ rados_object_list_cursor *split_finish);
+
+
+/** @} Listing Objects */
+
+/**
+ * @name Snapshots
+ *
+ * RADOS snapshots are based upon sequence numbers that form a
+ * snapshot context. They are pool-specific. The snapshot context
+ * consists of the current snapshot sequence number for a pool, and an
+ * array of sequence numbers at which snapshots were taken, in
+ * descending order. Whenever a snapshot is created or deleted, the
+ * snapshot sequence number for the pool is increased. To add a new
+ * snapshot, the new snapshot sequence number must be increased and
+ * added to the snapshot context.
+ *
+ * There are two ways to manage these snapshot contexts:
+ * -# within the RADOS cluster
+ * These are called pool snapshots, and store the snapshot context
+ * in the OSDMap. These represent a snapshot of all the objects in
+ * a pool.
+ * -# within the RADOS clients
+ * These are called self-managed snapshots, and push the
+ * responsibility for keeping track of the snapshot context to the
+ * clients. For every write, the client must send the snapshot
+ * context. In librados, this is accomplished with
+ * rados_selfmanaged_snap_set_write_ctx(). These are more
+ * difficult to manage, but are restricted to specific objects
+ * instead of applying to an entire pool.
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname)
+ __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ * snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+ rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+ *
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid,
+ rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid the id of the snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid,
+ rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+ const char *oid,
+ rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+ rados_snap_t seq,
+ rados_snap_t *snaps,
+ int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+ int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+ rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+ char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
+ time_t *t);
+
+/** @} Snapshots */
+
+/**
+ * @name Synchronous I/O
+ * Writes are replicated to a number of OSDs based on the
+ * configuration of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_aio_wait_for_complete(). For greater data safety, use the
+ * asynchronous functions and rados_aio_wait_for_safe().
+ *
+ * @{
+ */
+
+/**
+ * Return the version of the last object read or written to.
+ *
+ * This exposes the internal version number of the last object read or
+ * written via this io context
+ *
+ * @param io the io context to check
+ * @returns last read or written object version
+ */
+CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object, starting at
+ * offset *off*. The value of *len* must be <= UINT_MAX/2.
+ *
+ * @note This will never return a positive value not equal to len.
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Write the same *data_len* bytes from *buf* multiple times into the
+ * *oid* object. *write_len* bytes are written in total, which must be
+ * a multiple of *data_len*. The value of *write_len* and *data_len*
+ * must be <= UINT_MAX/2.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Append *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf,
+ size_t len, uint64_t off);
+
+/**
+ * Compute checksum from object data
+ *
+ * The io context determines the snapshot to checksum, if any was set
+ * by rados_ioctx_snap_set_read(). The length of the init_value and
+ * resulting checksum are dependent upon the checksum type:
+ *
+ * XXHASH64: le64
+ * XXHASH32: le32
+ * CRC32C: le32
+ *
+ * The checksum result is encoded the following manner:
+ *
+ * le32 num_checksum_chunks
+ * {
+ * leXX checksum for chunk (where XX = appropriate size for the checksum type)
+ * } * num_checksum_chunks
+ *
+ * @param io the context in which to perform the checksum
+ * @param oid the name of the object to checksum
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param len the number of bytes to checksum
+ * @param off the offset to start checksumming in the object
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result
+ * @param checksum_len the number of bytes available for the result
+ * @return negative error code on failure
+ */
+CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid,
+ rados_checksum_type_t type,
+ const char *init_value, size_t init_value_len,
+ size_t len, uint64_t off, size_t chunk_size,
+ char *pchecksum, size_t checksum_len);
+
+/**
+ * Delete an object
+ *
+ * @note This does not delete any snapshots of the object.
+ *
+ * @param io the pool to delete the object from
+ * @param oid the name of the object to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @param io the context in which to truncate
+ * @param oid the name of the object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
+ uint64_t size);
+
+/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len,
+ uint64_t off);
+
+/**
+ * @name Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name, const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Get the next omap key/value pair on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key is
+ * null-terminated, and val has length len. If the end of the list has
+ * been reached, key and val are NULL, and len is 0. key and val will
+ * not be accessible after rados_omap_get_end() is called on iter, so
+ * if they are needed after that they should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *len);
+
+/**
+ * Get the next omap key/value pair on the object. Note that it's
+ * perfectly safe to mix calls to rados_omap_get_next and
+ * rados_omap_get_next2.
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key has length
+ * keylen and val has length vallen. If the end of the list has
+ * been reached, key and val are NULL, and keylen and vallen is 0.
+ * key and val will not be accessible after rados_omap_get_end()
+ * is called on iter, so if they are needed after that they
+ * should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param key_len where to store the number of bytes in key
+ * @param val_len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *key_len,
+ size_t *val_len);
+
+/**
+ * Return number of elements in the iterator
+ *
+ * @param iter the iterator of which to return the size
+ */
+CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter);
+
+/**
+ * Close the omap iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter);
+
+/**
+ * Get object size and most recent update time from the OSD.
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize,
+ time_t *pmtime);
+
+CEPH_RADOS_API int rados_stat2(rados_ioctx_t io, const char *o, uint64_t *psize,
+ struct timespec *pmtime);
+
+/**
+ * Execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns the length of the output, or
+ * -ERANGE if buf does not have enough space to store it (for methods that return data). For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len, char *buf,
+ size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_completion_t *pc);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete callback corresponds to operation being acked.
+ *
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is committed
+ * on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_completion_t *pc);
+
+/**
+ * Block until an operation completes
+ *
+ * This means it is in memory on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c);
+
+/**
+ * Block until an operation is safe
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the append is safe and complete
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write an entire object
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ * Queues the write_full and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write_full is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write the same buffer multiple times
+ *
+ * Queues the writesame and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the writesame is safe and complete
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @note only the 'complete' callback of the completion will be called.
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param completion what to do when the read is complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ char *buf, size_t len, uint64_t off);
+
+/**
+ * Block until all pending writes in an io context are safe
+ *
+ * This is not equivalent to calling rados_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @note BUG: always returns 0, should be void or accept a timeout
+ *
+ * @param io the context to flush
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io);
+
+
+/**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * rados_aio_flush().
+ *
+ * @param io the context to flush
+ * @param completion what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io,
+ rados_completion_t completion);
+
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, time_t *pmtime);
+
+CEPH_RADOS_API int rados_aio_stat2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, struct timespec *pmtime);
+
+/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off);
+
+/**
+ * Cancel async operation
+ *
+ * @param io ioctx
+ * @param completion completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param o name of the object
+ * @param completion what to do when the exec completes
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len,
+ char *buf, size_t out_len);
+
+/** @} Asynchronous I/O */
+
+/**
+ * @name Asynchronous Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Asynchronously get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param completion what to do when the getxattr completes
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Asynchronously set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ *
+ * See rados_watch() and rados_notify() for more details.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_watchcb_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param opcode undefined
+ * @param ver version of the watched object
+ * @param arg application-specific data
+ *
+ * @note BUG: opcode is an internal detail that shouldn't be exposed
+ * @note BUG: ver is unused
+ */
+typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg);
+
+/**
+ * @typedef rados_watchcb2_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param arg opaque user-defined value provided to rados_watch2()
+ * @param notify_id an id for this notify event
+ * @param handle the watcher handle we are notifying
+ * @param notifier_id the unique client id for the notifier
+ * @param data payload from the notifier
+ * @param data_len length of payload buffer
+ */
+typedef void (*rados_watchcb2_t)(void *arg,
+ uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ void *data,
+ size_t data_len);
+
+/**
+ * @typedef rados_watcherrcb_t
+ *
+ * Callback activated when we encounter an error with the watch session.
+ * This can happen when the location of the objects moves within the
+ * cluster and we fail to register our watch with the new object location,
+ * or when our connection with the object OSD is otherwise interrupted and
+ * we may have missed notify events.
+ *
+ * @param pre opaque user-defined value provided to rados_watch2()
+ * @param cookie the internal id assigned to the watch session
+ * @param err error code
+ */
+ typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @note BUG: librados should provide a way for watchers to notice connection resets
+ * @note BUG: the ver parameter does not work, and -ERANGE will never be returned
+ * (See URL tracker.ceph.com/issues/2592)
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param ver expected version of the object
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param arg application defined data to pass when watchcb is called
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the version of the object is greater than ver
+ */
+CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
+ uint64_t *cookie,
+ rados_watchcb_t watchcb, void *arg)
+ __attribute__((deprecated));
+
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to the
+ * primary OSD for a watched object, the watch will be removed after
+ * a timeout configured with osd_client_watch_timeout.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after the number of seconds configured in the timeout parameter.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+ __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronously unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+ rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+ const char *buf, int buf_len)
+ __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param o the name of the object
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms, char **reply_buffer,
+ size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms,
+ char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Decode a notify response
+ *
+ * Decode a notify response (from rados_aio_notify() call) into acks and
+ * timeout arrays.
+ *
+ * @param reply_buffer buffer from rados_aio_notify() call
+ * @param reply_buffer_len reply_buffer length
+ * @param acks pointer to struct notify_ack_t pointer
+ * @param nr_acks pointer to ack count
+ * @param timeouts pointer to notify_timeout_t pointer
+ * @param nr_timeouts pointer to timeout count
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len,
+ struct notify_ack_t **acks, size_t *nr_acks,
+ struct notify_timeout_t **timeouts, size_t *nr_timeouts);
+
+/**
+ * Free notify allocated buffer
+ *
+ * Release memory allocated by rados_decode_notify_response() call
+ *
+ * @param acks notify_ack_t struct (from rados_decode_notify_response())
+ * @param nr_acks ack count
+ * @param timeouts notify_timeout_t struct (from rados_decode_notify_response())
+ */
+CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks,
+ struct notify_timeout_t *timeouts);
+
+/**
+ * Acknowledge receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+ uint64_t notify_id, uint64_t cookie,
+ const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is nonblocking; the completion will be invoked once
+ * all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
+ * @name Hints
+ *
+ * @{
+ */
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @note the ownership of a write operation is passed to the function
+ * performing the operation, so the same instance of @c rados_write_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op,
+ int flags);
+
+/**
+ * Ensure that the object exists before writing
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before writing. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_write_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_write_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param write_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that given xattr satisfies comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+ const char *name,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+ const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE
+   (the operation will error if the object already exists)
+ * or LIBRADOS_CREATE_IDEMPOTENT
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+ int exclusive,
+ const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len,
+ uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+
+/**
+ * Write the same buffer multiple times
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param data_len length of buffer
+ * @param write_len total number of bytes to write, as a multiple of @c data_len
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op,
+ const char *buffer,
+ size_t data_len,
+ size_t write_len,
+ uint64_t offset);
+
+/**
+ * Append to end of object.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+/**
+ * Remove object
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
+
+/**
+ * Truncate an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to truncate to
+ */
+CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
+ uint64_t offset);
+
+/**
+ * Zero part of an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to zero
+ * @param len length to zero
+ */
+CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
+ uint64_t offset,
+ uint64_t len);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * @param write_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ int *prval);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *lens,
+ size_t num);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param key_lens array of lengths corresponding to each key
+ * @param val_lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *key_lens,
+ const size_t *val_lens,
+ size_t num);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to remove
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op,
+ char const* const* keys,
+ size_t keys_len);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to remove
+ * @param key_lens array of size_t values representing length of each key
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op,
+ char const* const* keys,
+ const size_t* key_lens,
+ size_t keys_len);
+
+
+/**
+ * Remove key/value pairs from an object whose keys are in the range
+ * [key_begin, key_end)
+ *
+ * @param write_op operation to add this action to
+ * @param key_begin the lower bound of the key range to remove
+ * @param key_begin_len length of key_begin
+ * @param key_end the upper bound of the key range to remove
+ * @param key_end_len length of key_end
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op,
+ const char *key_begin,
+ size_t key_begin_len,
+ const char *key_end,
+ size_t key_end_len);
+
+/**
+ * Remove all key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+
+CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @note the ownership of a read operation is passed to the function
+ * performing the operation, so the same instance of @c rados_read_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_read_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_read_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+ rados_xattrs_iter_t *iter,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+ uint64_t *psize,
+ time_t *pmtime,
+ int *prval);
+
+CEPH_RADOS_API void rados_read_op_stat2(rados_read_op_t read_op,
+ uint64_t *psize,
+ struct timespec *pmtime,
+ int *prval);
+/**
+ * Read bytes from offset into buffer.
+ *
+ * prlen will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the
+ * object.
+ *
+ * @param read_op operation to add this action to
+ * @param offset offset to read from
+ * @param len length of buffer
+ * @param buffer where to put the data
+ * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
+ uint64_t offset,
+ size_t len,
+ char *buffer,
+ size_t *bytes_read,
+ int *prval);
+
+/**
+ * Compute checksum from object data
+ *
+ * @param read_op operation to add this action to
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param offset the offset to start checksumming in the object
+ * @param len the number of bytes to checksum
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result for this action
+ * @param checksum_len the number of bytes available for the result
+ * @param prval where to store the return value for this action
+ */
+CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op,
+ rados_checksum_type_t type,
+ const char *init_value,
+ size_t init_value_len,
+ uint64_t offset, size_t len,
+ size_t chunk_size, char *pchecksum,
+ size_t checksum_len, int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * The output buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf where to put librados-allocated output buffer
+ * @param out_len length of out_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char **out_buf,
+ size_t *out_len,
+ int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * If the output buffer is too small, prval will
+ * be set to -ERANGE and used_len will be 0.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf user-provided buffer to read into
+ * @param out_len length of out_buf in bytes
+ * @param used_len where to store the number of bytes read into out_buf
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char *out_buf,
+ size_t out_len,
+ size_t *used_len,
+ int *prval);
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to null-terminated keys to get
+ * @param keys_len the number of strings in keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t keys_len,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to keys to get
+ * @param num_keys the number of strings in keys
+ * @param key_lens array of size_t's describing each key len (in bytes)
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t num_keys,
+ const size_t* key_lens,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Perform a read operation synchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ const char *oid,
+ int flags);
+
+/**
+ * Perform a read operation asynchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ int flags);
+
+/** @} Object Operations */
+
+/**
+ * Take an exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
+ const char * name, const char * cookie,
+ const char * desc,
+ struct timeval * duration,
+ uint8_t flags);
+
+/**
+ * Take a shared lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag The tag of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o,
+ const char * name, const char * cookie,
+ const char * tag, const char * desc,
+ struct timeval * duration, uint8_t flags);
+
+/**
+ * Release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie);
+
+/**
+ * Asynchronous release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie,
+ rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+ const char *name, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified client.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param client the client currently holding the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ * @returns -EINVAL if the client cannot be parsed
+ */
+CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o,
+ const char *name, const char *client,
+ const char *cookie);
+
+/**
+ * Blocklists the specified client from the OSDs
+ *
+ * @param cluster cluster handle
+ * @param client_address client address
+ * @param expire_seconds number of seconds to blocklist (0 for default)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_blocklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds);
+CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds)
+ __attribute__((deprecated));
+
+/**
+ * Gets addresses of the RADOS session, suitable for blocklisting.
+ *
+ * @param cluster cluster handle
+ * @param addrs the output string.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs);
+
+CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io);
+
+CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io);
+
+/**
+ * Enable an application on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+ const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffers are filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata, key_len and val_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name,
+ char *keys, size_t *key_len,
+ char *values,
+ size_t *vals_len);
+
+/**
+ * @name Mon/OSD/PG Commands
+ *
+ * These interfaces send commands relating to the monitor, OSD, or PGs.
+ *
+ * @{
+ */
+
+/**
+ * Send monitor command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr tell command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name mgr name to target
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command_target(
+ rados_t cluster,
+ const char *name,
+ const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send monitor command to a specific monitor.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name target monitor's name
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * free a rados-allocated buffer
+ *
+ * Release memory allocated by librados calls like rados_mon_command().
+ *
+ * @param buf buffer pointer
+ */
+CEPH_RADOS_API void rados_buffer_free(char *buf);
+
+CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback_t)(void *arg,
+ const char *line,
+ const char *who,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+ const char *line,
+ const char *channel,
+ const char *who,
+ const char *name,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+ rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+ rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NULL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+ rados_t cluster,
+ const char *service,
+ const char *daemon,
+ const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ *
+ * @param cluster rados cluster handle
+ * @param status_dict status dict
+ */
+CEPH_RADOS_API int rados_service_update_status(
+ rados_t cluster,
+ const char *status_dict);
+
+/** @} Mon/OSD/PG commands */
+
+/*
+ * These methods are no longer supported and return -ENOTSUP where possible.
+ */
+CEPH_RADOS_API int rados_objects_list_open(
+ rados_ioctx_t io,
+ rados_list_ctx_t *ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_seek(
+ rados_list_ctx_t ctx,
+ uint32_t pos) __attribute__((deprecated));
+CEPH_RADOS_API int rados_objects_list_next(
+ rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key) __attribute__((deprecated));
+CEPH_RADOS_API void rados_objects_list_close(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
new file mode 100644
index 000000000..cb8261af1
--- /dev/null
+++ b/src/include/rados/librados.hpp
@@ -0,0 +1,1568 @@
+#ifndef __LIBRADOS_HPP
+#define __LIBRADOS_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <utility>
+#include "buffer.h"
+
+#include "librados.h"
+#include "librados_fwd.hpp"
+#include "rados_types.hpp"
+
+namespace libradosstriper
+{
+ class RadosStriper;
+}
+
+namespace neorados { class RADOS; }
+
+namespace librados {
+
+using ceph::bufferlist;
+
+struct AioCompletionImpl;
+struct IoCtxImpl;
+struct ListObjectImpl;
+class NObjectIteratorImpl;
+struct ObjListCtx;
+class ObjectOperationImpl;
+struct PlacementGroupImpl;
+struct PoolAsyncCompletionImpl;
+
+typedef struct rados_cluster_stat_t cluster_stat_t;
+typedef struct rados_pool_stat_t pool_stat_t;
+
+typedef void *list_ctx_t;
+typedef uint64_t auid_t;
+typedef void *config_t;
+
+typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+} locker_t;
+
+typedef std::map<std::string, pool_stat_t> stats_map;
+
+typedef void *completion_t;
+typedef void (*callback_t)(completion_t cb, void *arg);
+
+inline namespace v14_2_0 {
+
+ class IoCtx;
+ class RadosClient;
+
+ class CEPH_RADOS_API ListObject
+ {
+ public:
+ const std::string& get_nspace() const;
+ const std::string& get_oid() const;
+ const std::string& get_locator() const;
+
+ ListObject();
+ ~ListObject();
+ ListObject( const ListObject&);
+ ListObject& operator=(const ListObject& rhs);
+ private:
+ ListObject(ListObjectImpl *impl);
+
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& out, const ListObject& lop);
+
+ ListObjectImpl *impl;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop);
+
+ class CEPH_RADOS_API NObjectIterator;
+
+ class CEPH_RADOS_API ObjectCursor
+ {
+ public:
+ ObjectCursor();
+ ObjectCursor(const ObjectCursor &rhs);
+ explicit ObjectCursor(rados_object_list_cursor c);
+ ~ObjectCursor();
+ ObjectCursor& operator=(const ObjectCursor& rhs);
+ bool operator<(const ObjectCursor &rhs) const;
+ bool operator==(const ObjectCursor &rhs) const;
+ void set(rados_object_list_cursor c);
+
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ std::string to_str() const;
+ bool from_str(const std::string& s);
+
+ protected:
+ rados_object_list_cursor c_cursor;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ class CEPH_RADOS_API NObjectIterator {
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = ListObject;
+ using difference_type = std::ptrdiff_t;
+ using pointer = ListObject*;
+ using reference = ListObject&;
+ static const NObjectIterator __EndObjectIterator;
+ NObjectIterator(): impl(NULL) {}
+ ~NObjectIterator();
+ NObjectIterator(const NObjectIterator &rhs);
+ NObjectIterator& operator=(const NObjectIterator& rhs);
+
+ bool operator==(const NObjectIterator& rhs) const;
+ bool operator!=(const NObjectIterator& rhs) const;
+ const ListObject& operator*() const;
+ const ListObject* operator->() const;
+ NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions
+ NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+
+ /// get current hash position of the iterator, rounded to the current pg
+ uint32_t get_pg_hash_position() const;
+
+ /// move the iterator to a given hash position. this may (will!) be rounded
+ /// to the nearest pg. errors are thrown as exceptions
+ uint32_t seek(uint32_t pos);
+
+ /// move the iterator to a given cursor position. errors are thrown as exceptions
+ uint32_t seek(const ObjectCursor& cursor);
+
+ /// get current cursor position
+ ObjectCursor get_cursor();
+
+ /**
+ * Configure PGLS filter to be applied OSD-side (requires caller
+ * to know/understand the format expected by the OSD)
+ */
+ void set_filter(const bufferlist &bl);
+
+ private:
+ NObjectIterator(ObjListCtx *ctx_);
+ void get_next();
+ NObjectIteratorImpl *impl;
+ };
+
+ class CEPH_RADOS_API ObjectItem
+ {
+ public:
+ std::string oid;
+ std::string nspace;
+ std::string locator;
+ };
+
+ /// DEPRECATED; do not use
+ class CEPH_RADOS_API WatchCtx {
+ public:
+ virtual ~WatchCtx();
+ virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0;
+ };
+
+ class CEPH_RADOS_API WatchCtx2 {
+ public:
+ virtual ~WatchCtx2();
+ /**
+ * Callback activated when we receive a notify event.
+ *
+ * @param notify_id unique id for this notify event
+ * @param cookie the watcher we are notifying
+ * @param notifier_id the unique client id of the notifier
+ * @param bl opaque notify payload (from the notifier)
+ */
+ virtual void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) = 0;
+
+ /**
+ * Callback activated when we encounter an error with the watch.
+ *
+ * Errors we may see:
+ * -ENOTCONN : our watch was disconnected
+ * -ETIMEDOUT : our watch is still valid, but we may have missed
+ * a notify event.
+ *
+ * @param cookie the watcher with the problem
+ * @param err error
+ */
+ virtual void handle_error(uint64_t cookie, int err) = 0;
+ };
+
+ struct CEPH_RADOS_API AioCompletion {
+ AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {}
+ ~AioCompletion();
+ int set_complete_callback(void *cb_arg, callback_t cb);
+ int set_safe_callback(void *cb_arg, callback_t cb)
+ __attribute__ ((deprecated));
+ int wait_for_complete();
+ int wait_for_safe() __attribute__ ((deprecated));
+ int wait_for_complete_and_cb();
+ int wait_for_safe_and_cb() __attribute__ ((deprecated));
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ int get_version() __attribute__ ((deprecated));
+ uint64_t get_version64();
+ void release();
+ AioCompletionImpl *pc;
+ };
+
+ struct CEPH_RADOS_API PoolAsyncCompletion {
+ PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {}
+ ~PoolAsyncCompletion();
+ int set_callback(void *cb_arg, callback_t cb);
+ int wait();
+ bool is_complete();
+ int get_return_value();
+ void release();
+ PoolAsyncCompletionImpl *pc;
+ };
+
+ /**
+ * These are per-op flags which may be different among
+ * ops added to an ObjectOperation.
+ */
+ enum ObjectOperationFlags {
+ OP_EXCL = LIBRADOS_OP_FLAG_EXCL,
+ OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK,
+ OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM,
+ OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL,
+ OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED,
+ OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE,
+ };
+
+ class CEPH_RADOS_API ObjectOperationCompletion {
+ public:
+ virtual ~ObjectOperationCompletion() {}
+ virtual void handle_completion(int r, bufferlist& outbl) = 0;
+ };
+
+ /**
+ * These flags apply to the ObjectOperation as a whole.
+ *
+ * Prior to octopus BALANCE_READS and LOCALIZE_READS should only
+ * be used when reading from data you're certain won't change, like
+ * a snapshot, or where eventual consistency is ok. Since octopus
+ * (get_min_compatible_osd() >= CEPH_RELEASE_OCTOPUS) both are safe
+ * for general use.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will make a write followed by a read sequence be preserved.
+ *
+ * IGNORE_CACHE will skip the caching logic on the OSD that normally
+ * handles promotion of objects between tiers. This allows an operation
+ * to operate (or read) the cached (or uncached) object, even if it is
+ * not coherent.
+ *
+ * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and
+ * process the op directly on the destination pool. This is useful
+ * for CACHE_FLUSH and CACHE_EVICT operations.
+ */
+ enum ObjectOperationGlobalFlags {
+ OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG,
+ OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS,
+ OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS,
+ OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+ OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE,
+ OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS,
+ OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+ // send requests to cluster despite the cluster or pool being
+ // marked full; ops will either succeed (e.g., delete) or return
+ // EDQUOT or ENOSPC
+ OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY,
+ // mainly for delete
+ OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE,
+ OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT,
+ OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP,
+ // enable/allow return value and per-op return code/buffers
+ OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC,
+ };
+
+ /*
+ * Alloc hint flags for the alloc_hint operation.
+ */
+ enum AllocHintFlags {
+ ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ ALLOC_HINT_FLAG_LONGLIVED = 128,
+ ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+ };
+
+ /*
+ * ObjectOperation : compound object operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectOperation
+ {
+ public:
+ ObjectOperation();
+ virtual ~ObjectOperation();
+
+ ObjectOperation(const ObjectOperation&) = delete;
+ ObjectOperation& operator=(const ObjectOperation&) = delete;
+
+ /**
+ * Move constructor.
+ * \warning A moved from ObjectOperation is invalid and may not be used for
+ * any purpose. This is a hard contract violation and will
+ * kill your program.
+ */
+ ObjectOperation(ObjectOperation&&);
+ ObjectOperation& operator =(ObjectOperation&&);
+
+ size_t size();
+ void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated));
+ // flags is a bitwise OR of ObjectOperationFlags values
+ void set_op_flags2(int flags);
+
+ void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval);
+ void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
+ void cmpxattr(const char *name, uint8_t op, uint64_t v);
+ void exec(const char *cls, const char *method, bufferlist& inbl);
+ void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval);
+ void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion);
+ /**
+ * Guard operation with a check that object version == ver
+ *
+ * @param ver [in] version to check
+ */
+ void assert_version(uint64_t ver);
+
+ /**
+ * Guard operation with a check that the object already exists
+ */
+ void assert_exists();
+
+ /**
+ * get key/value pairs for specified keys
+ *
+ * @param assertions [in] comparison assertions
+ * @param prval [out] place error code in prval upon completion
+ *
+ * assertions has the form of mappings from keys to (comparison rval, assertion)
+ * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ].
+ *
+ * That is, to assert that the value at key 'foo' is greater than 'bar':
+ *
+ * ObjectReadOperation op;
+ * int r;
+ * map<string, pair<bufferlist, int> > assertions;
+ * bufferlist bar(string('bar'));
+ * assertions['foo'] = make_pair(bar, CEPH_OSD_CMP_XATTR_OP_GT);
+ * op.omap_cmp(assertions, &r);
+ */
+ void omap_cmp(
+ const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+ int *prval);
+
+ protected:
+ ObjectOperationImpl* impl;
+ friend class IoCtx;
+ friend class Rados;
+ };
+
+ /*
+ * ObjectWriteOperation : compound object write operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+ {
+ protected:
+ time_t *unused;
+ public:
+ ObjectWriteOperation() : unused(NULL) {}
+ ~ObjectWriteOperation() override {}
+
+ ObjectWriteOperation(ObjectWriteOperation&&) = default;
+ ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default;
+
+ void mtime(time_t *pt);
+ void mtime2(struct timespec *pts);
+
+ void create(bool exclusive);
+ void create(bool exclusive,
+ const std::string& category); ///< NOTE: category is unused
+
+ void write(uint64_t off, const bufferlist& bl);
+ void write_full(const bufferlist& bl);
+ void writesame(uint64_t off, uint64_t write_len,
+ const bufferlist& bl);
+ void append(const bufferlist& bl);
+ void remove();
+ void truncate(uint64_t off);
+ void zero(uint64_t off, uint64_t len);
+ void rmxattr(const char *name);
+ void setxattr(const char *name, const bufferlist& bl);
+ void setxattr(const char *name, const bufferlist&& bl);
+ void tmap_update(const bufferlist& cmdbl);
+ void tmap_put(const bufferlist& bl);
+ void selfmanaged_snap_rollback(uint64_t snapid);
+
+ /**
+ * Rollback an object to the specified snapshot id
+ *
+ * Used with pool snapshots
+ *
+ * @param snapid [in] snapshot id specified
+ */
+ void snap_rollback(uint64_t snapid);
+
+ /**
+ * set keys and values according to map
+ *
+ * @param map [in] keys and values to set
+ */
+ void omap_set(const std::map<std::string, bufferlist> &map);
+
+ /**
+ * set header
+ *
+ * @param bl [in] header to set
+ */
+ void omap_set_header(const bufferlist &bl);
+
+ /**
+ * Clears omap contents
+ */
+ void omap_clear();
+
+ /**
+ * Clears keys in to_rm
+ *
+ * @param to_rm [in] keys to remove
+ */
+ void omap_rm_keys(const std::set<std::string> &to_rm);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress).
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t src_fadvise_flags);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress). Instead of
+ * copying truncate_seq and truncate_size from the source object it receives
+ * these values as parameters.
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param truncate_seq truncate sequence for the destination object
+ * @param truncate_size truncate size for the destination object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from2(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t truncate_seq,
+ uint64_t truncate_size, uint32_t src_fadvise_flags);
+
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
+ /**
+ * Set allocation hint for an object
+ *
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags flags ()
+ */
+ void set_alloc_hint(uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ void set_alloc_hint2(uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ void cache_pin();
+ void cache_unpin();
+
+ /**
+ * Extensible tier
+ *
+ * Set redirect target
+ */
+ void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+ uint64_t tgt_version, int flag = 0);
+ void tier_promote();
+ void unset_manifest();
+
+ friend class IoCtx;
+ };
+
+ /*
+ * ObjectReadOperation : compound object operation that return value
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
+ {
+ public:
+ ObjectReadOperation() {}
+ ~ObjectReadOperation() override {}
+
+ // move-only, like the base ObjectOperation
+ ObjectReadOperation(ObjectReadOperation&&) = default;
+ ObjectReadOperation& operator =(ObjectReadOperation&&) = default;
+
+ /// get object size and modification time
+ void stat(uint64_t *psize, time_t *pmtime, int *prval);
+ /// as stat(), but with a struct timespec for higher-resolution mtime
+ void stat2(uint64_t *psize, struct timespec *pts, int *prval);
+ /// get one extended attribute by name
+ void getxattr(const char *name, bufferlist *pbl, int *prval);
+ /// get all extended attributes of the object
+ void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
+ /// read len bytes starting at off into *pbl
+ void read(size_t off, uint64_t len, bufferlist *pbl, int *prval);
+ /// compute a checksum of type `type` over [off, off+len), one result per chunk_size
+ void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
+ uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
+ int *prval);
+
+ /**
+ * see aio_sparse_read()
+ */
+ void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
+ bufferlist *data_bl, int *prval,
+ uint64_t truncate_size = 0,
+ uint32_t truncate_seq = 0);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+
+ /**
+ * omap_get_keys: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_keys2: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys2(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_header: get header from object omap
+ *
+ * @param header [out] place header here upon completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_header(bufferlist *header, int *prval);
+
+ /**
+ * get key/value pairs for specified keys
+ *
+ * @param keys [in] keys to get
+ * @param map [out] place key/value pairs found here on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals_by_keys(const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map,
+ int *prval);
+
+ /**
+ * list_watchers: Get list watchers of object
+ *
+ * @param out_watchers [out] place returned values in out_watchers on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);
+
+ /**
+ * list snapshot clones associated with a logical object
+ *
+ * This will include a record for each version of the object,
+ * include the "HEAD" (which will have a cloneid of SNAP_HEAD).
+ * Each clone includes a vector of snap ids for which it is
+ * defined to exist.
+ *
+ * NOTE: this operation must be submitted from an IoCtx with a
+ * read snapid of SNAP_DIR for reliable results.
+ *
+ * @param out_snaps [out] pointer to resulting snap_set_t
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_snaps(snap_set_t *out_snaps, int *prval);
+
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
+ /**
+ * flush a cache tier object to backing tier; will block racing
+ * updates.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_flush();
+
+ /**
+ * Flush a cache tier object to backing tier; will EAGAIN if we race
+ * with an update. Must be used with the SKIPRWLOCKS flag.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_try_flush();
+
+ /**
+ * evict a clean cache tier object
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promote on the OSD (that is then evicted).
+ */
+ void cache_evict();
+
+ /**
+ * Extensible tier
+ *
+ * set_chunk: make a chunk pointing a part of the source object at the target
+ * object
+ *
+ * @param src_offset [in] source offset to indicate the start position of
+ * a chunk in the source object
+ * @param src_length [in] source length to set the length of the chunk
+ * @param tgt_ioctx [in] IoCtx for the pool holding the target object
+ * @param tgt_oid [in] target object's id to set a chunk
+ * @param tgt_offset [in] the start position of the target object
+ * @param flag [in] flag for the source object
+ *
+ */
+ void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+ std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+ /**
+ * flush a manifest tier object to backing tier, performing deduplication;
+ * will block racing updates.
+ *
+ * Invoking tier_flush() implicitly makes a manifest object even if
+ * the target object is not manifest.
+ */
+ void tier_flush();
+ /**
+ * evict a manifest tier object to backing tier; will block racing
+ * updates.
+ */
+ void tier_evict();
+ };
+
+ /* IoCtx : This is a context in which we can perform I/O.
+ * It includes a Pool,
+ *
+ * Typical use (error checking omitted):
+ *
+ * IoCtx p;
+ * rados.ioctx_create("my_pool", p);
+ * p.stat("oid", &psize, &pmtime);
+ * ... etc ...
+ *
+ * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+ * that is used for watch events to ensure that racing callbacks
+ * have completed.
+ */
+ class CEPH_RADOS_API IoCtx
+ {
+ public:
+ IoCtx();
+ /// wrap an existing C-API rados_ioctx_t handle in an IoCtx
+ static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+ IoCtx(const IoCtx& rhs);
+ IoCtx& operator=(const IoCtx& rhs);
+ IoCtx(IoCtx&& rhs) noexcept;
+ IoCtx& operator=(IoCtx&& rhs) noexcept;
+
+ ~IoCtx();
+
+ /// true if this IoCtx is bound to a pool (non-null impl)
+ bool is_valid() const;
+
+ // Close our pool handle
+ void close();
+
+ // deep copy
+ void dup(const IoCtx& rhs);
+
+ // set pool auid
+ int set_auid(uint64_t auid_)
+ __attribute__ ((deprecated));
+
+ // set pool auid
+ int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+
+ // get pool auid
+ int get_auid(uint64_t *auid_)
+ __attribute__ ((deprecated));
+
+ uint64_t get_instance_id() const;
+
+ std::string get_pool_name();
+
+ // pool striping/alignment requirements (v2 forms report errors via return code)
+ bool pool_requires_alignment();
+ int pool_requires_alignment2(bool * req);
+ uint64_t pool_required_alignment();
+ int pool_required_alignment2(uint64_t * alignment);
+
+ // create an object
+ int create(const std::string& oid, bool exclusive);
+ int create(const std::string& oid, bool exclusive,
+ const std::string& category); ///< category is unused
+
+ /**
+ * write bytes to an object at a specified offset
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ /**
+ * append bytes to an object
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int append(const std::string& oid, bufferlist& bl, size_t len);
+ /**
+ * replace object contents with provided data
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write_full(const std::string& oid, bufferlist& bl);
+ int writesame(const std::string& oid, bufferlist& bl,
+ size_t write_len, uint64_t off);
+ int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ int checksum(const std::string& o, rados_checksum_type_t type,
+ const bufferlist &init_value_bl, size_t len, uint64_t off,
+ size_t chunk_size, bufferlist *pbl);
+ int remove(const std::string& oid);
+ int remove(const std::string& oid, int flags);
+ int trunc(const std::string& oid, uint64_t size);
+ int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+ int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
+ int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
+ int getxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
+ int setxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int rmxattr(const std::string& oid, const char *name);
+ int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
+ /// synchronously invoke an object-class method on the OSD
+ int exec(const std::string& oid, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist& outbl);
+ /**
+ * modify object tmap based on encoded update sequence
+ *
+ * NOTE: this call steals the contents of @param bl
+ */
+ int tmap_update(const std::string& oid, bufferlist& cmdbl);
+
+ // synchronous omap accessors; see the ObjectReadOperation variants
+ // for per-parameter documentation
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_keys(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys);
+ int omap_get_keys2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore);
+ int omap_get_header(const std::string& oid,
+ bufferlist *bl);
+ int omap_get_vals_by_keys(const std::string& oid,
+ const std::set<std::string>& keys,
+ std::map<std::string, bufferlist> *vals);
+ int omap_set(const std::string& oid,
+ const std::map<std::string, bufferlist>& map);
+ int omap_set_header(const std::string& oid,
+ const bufferlist& bl);
+ int omap_clear(const std::string& oid);
+ int omap_rm_keys(const std::string& oid,
+ const std::set<std::string>& keys);
+
+ /// set the snapshot from which subsequent reads are served
+ void snap_set_read(snap_t seq);
+ int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);
+
+ // Create a snapshot with a given name
+ int snap_create(const char *snapname);
+
+ // Look up a snapshot by name.
+ // Returns 0 on success; error code otherwise
+ int snap_lookup(const char *snapname, snap_t *snap);
+
+ // Gets a timestamp for a snap
+ int snap_get_stamp(snap_t snapid, time_t *t);
+
+ // Gets the name of a snap
+ int snap_get_name(snap_t snapid, std::string *s);
+
+ // Remove a snapshot from this pool
+ int snap_remove(const char *snapname);
+
+ int snap_list(std::vector<snap_t> *snaps);
+
+ int snap_rollback(const std::string& oid, const char *snapname);
+
+ // Deprecated name kept for backward compatibility - same as snap_rollback()
+ int rollback(const std::string& oid, const char *snapname)
+ __attribute__ ((deprecated));
+
+ int selfmanaged_snap_create(uint64_t *snapid);
+ void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);
+
+ int selfmanaged_snap_remove(uint64_t snapid);
+ void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);
+
+ int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);
+
+ // Advisory locking on rados objects.
+ int lock_exclusive(const std::string &oid, const std::string &name,
+ const std::string &cookie,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int lock_shared(const std::string &oid, const std::string &name,
+ const std::string &cookie, const std::string &tag,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie);
+
+ /// forcibly release another client's advisory lock
+ int break_lock(const std::string &oid, const std::string &name,
+ const std::string &client, const std::string &cookie);
+
+ int list_lockers(const std::string &oid, const std::string &name,
+ int *exclusive,
+ std::string *tag,
+ std::list<librados::locker_t> *lockers);
+
+
+ /// Start enumerating objects for a pool. Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from a hash position.
+ /// Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(uint32_t start_hash_position,
+ const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from cursor. Errors are
+ /// thrown as exceptions.
+ NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
+ const bufferlist &filter=bufferlist());
+ /// Iterator indicating the end of a pool
+ const NObjectIterator& nobjects_end() const;
+
+ /// Get cursor for pool beginning
+ ObjectCursor object_list_begin();
+
+ /// Get cursor for pool end
+ ObjectCursor object_list_end();
+
+ /// Check whether a cursor is at the end of a pool
+ bool object_list_is_end(const ObjectCursor &oc);
+
+ /// List some objects between two cursors
+ int object_list(const ObjectCursor &start, const ObjectCursor &finish,
+ const size_t result_count,
+ const bufferlist &filter,
+ std::vector<ObjectItem> *result,
+ ObjectCursor *next);
+
+ /// Generate cursors that include the N out of Mth slice of the pool
+ void object_list_slice(
+ const ObjectCursor start,
+ const ObjectCursor finish,
+ const size_t n,
+ const size_t m,
+ ObjectCursor *split_start,
+ ObjectCursor *split_finish);
+
+ /**
+ * List available hit set objects
+ *
+ * @param hash [in] hash position to query
+ * @param c [in] completion
+ * @param pls [out] list of available intervals
+ */
+ int hit_set_list(uint32_t hash, AioCompletion *c,
+ std::list< std::pair<time_t, time_t> > *pls);
+
+ /**
+ * Retrieve hit set for a given hash, and time
+ *
+ * @param hash [in] hash position
+ * @param c [in] completion
+ * @param stamp [in] time interval that falls within the hit set's interval
+ * @param pbl [out] buffer to store the result in
+ */
+ int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
+ bufferlist *pbl);
+
+ /// get the version of the last object read or written via this IoCtx
+ uint64_t get_last_version();
+
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off);
+ /**
+ * Asynchronously read from an object at a particular snapshot
+ *
+ * This is the same as normal aio_read, except that it chooses
+ * the snapshot to read from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param pbl where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off);
+ /**
+ * Asynchronously read existing extents from an object at a
+ * particular snapshot
+ *
+ * This is the same as normal aio_sparse_read, except that it chooses
+ * the snapshot to read from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * m will be filled in with a map of extents in the object,
+ * mapping offsets to lengths (in bytes) within the range
+ * requested. The data for all of the extents are stored
+ * back-to-back in offset order in data_bl.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param m where to store the map of extents
+ * @param data_bl where to store the data
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off, uint64_t snapid);
+ /**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param off object byte offset at which to start the comparison
+ * @param cmp_bl buffer containing bytes to be compared with object contents
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+ int aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl);
+ int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len, uint64_t off);
+ int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len);
+ int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
+ int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t write_len, uint64_t off);
+
+ /**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param oid the name of the object
+ * @param c what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than SNAP_HEAD
+ */
+ int aio_remove(const std::string& oid, AioCompletion *c);
+ int aio_remove(const std::string& oid, AioCompletion *c, int flags);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * aio_flush().
+ *
+ * @param c what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush_async(AioCompletion *c);
+ int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
+ int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
+ int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * Cancel aio operation
+ *
+ * @param c completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_cancel(AioCompletion *c);
+
+ int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist *outbl);
+
+ /*
+ * asynchronous version of unlock
+ */
+ int aio_unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie, AioCompletion *c);
+
+ // compound object operations
+ int operate(const std::string& oid, ObjectWriteOperation *op);
+ int operate(const std::string& oid, ObjectWriteOperation *op, int flags);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
+ /**
+ * Schedule an async write operation with explicit snapshot parameters
+ *
+ * This is the same as the first aio_operate(), except that it
+ * gets the snapshot context from its arguments instead of the
+ * IoCtx internal state.
+ *
+ * @param oid the object to operate on
+ * @param c what to do when the operation is complete and safe
+ * @param op which operations to perform
+ * @param seq latest selfmanaged snapshot sequence number for this object
+ * @param snaps currently existing selfmanaged snapshot ids for this object
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps, int flags,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, bufferlist *pbl);
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, snap_t snapid, int flags,
+ bufferlist *pbl)
+ __attribute__ ((deprecated));
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info);
+
+ // watch/notify
+ int watch2(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int watch3(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int unwatch2(uint64_t handle);
+ int aio_unwatch(uint64_t handle, AioCompletion *c);
+ /**
+ * Send a notify event to watchers
+ *
+ * Upon completion the pbl bufferlist reply payload will be
+ * encoded like so:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ *
+ */
+ int notify2(const std::string& o, ///< object
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ int aio_notify(const std::string& o, ///< object
+ AioCompletion *c, ///< completion when notify completes
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ /*
+ * Decode a notify response into acks and timeout vectors.
+ */
+ void decode_notify_response(bufferlist &bl,
+ std::vector<librados::notify_ack_t> *acks,
+ std::vector<librados::notify_timeout_t> *timeouts);
+
+ int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
+ int list_snaps(const std::string& o, snap_set_t *out_snaps);
+ void set_notify_timeout(uint32_t timeout);
+
+ /// acknowledge a notify we received.
+ void notify_ack(const std::string& o, ///< watched object
+ uint64_t notify_id, ///< notify id
+ uint64_t cookie, ///< our watch handle
+ bufferlist& bl); ///< optional reply payload
+
+ /***
+ * check on watch validity
+ *
+ * Check if a watch is valid. If so, return the number of
+ * milliseconds since we last confirmed its liveness. If there is
+ * a known error, return it.
+ *
+ * If there is an error, the watch is no longer valid, and should
+ * be destroyed with unwatch(). If the user is still interested in
+ * the object, a new watch should be created with watch().
+ *
+ * @param cookie watch handle
+ * @returns ms since last confirmed valid, or error
+ */
+ int watch_check(uint64_t cookie);
+
+ // old, deprecated versions
+ int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
+ librados::WatchCtx *ctx) __attribute__ ((deprecated));
+ int notify(const std::string& o, uint64_t ver, bufferlist& bl)
+ __attribute__ ((deprecated));
+ int unwatch(const std::string& o, uint64_t cookie)
+ __attribute__ ((deprecated));
+
+ /**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it
+ * was submitted with a OP_FAILOK flag set) and is not guaranteed
+ * to do anything on the backend.
+ *
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+ int set_alloc_hint(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ int set_alloc_hint2(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ // assert version for next sync operations
+ void set_assert_version(uint64_t ver);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * @param o the name of the object
+ * @returns 0 on success, negative error code on failure
+ */
+ int cache_pin(const std::string& o);
+ int cache_unpin(const std::string& o);
+
+ std::string get_pool_name() const;
+
+ /// set the object locator key used for subsequent operations
+ void locator_set_key(const std::string& key);
+ /// set the namespace used for subsequent operations
+ void set_namespace(const std::string& nspace);
+ std::string get_namespace() const;
+
+ /// numeric id of the pool this IoCtx is bound to
+ int64_t get_id();
+
+ // deprecated versions
+ uint32_t get_object_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+ uint32_t get_object_pg_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+
+ int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
+ int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);
+
+ config_t cct();
+
+ void set_osdmap_full_try()
+ __attribute__ ((deprecated));
+ void unset_osdmap_full_try()
+ __attribute__ ((deprecated));
+
+ bool get_pool_full_try();
+ void set_pool_full_try();
+ void unset_pool_full_try();
+
+ // pool application metadata (e.g. tagging a pool for 'rbd' or 'rgw')
+ int application_enable(const std::string& app_name, bool force);
+ int application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
+ private:
+ /* You can only get IoCtx instances from Rados */
+ IoCtx(IoCtxImpl *io_ctx_impl_);
+
+ friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+ friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
+ friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl
+ friend class ObjectReadOperation; // set_chunk needs to see our IoCtxImpl
+
+ IoCtxImpl *io_ctx_impl;
+ };
+
+ /// A placement group identifier; parse() fills it from its textual form.
+ struct CEPH_RADOS_API PlacementGroup {
+ PlacementGroup();
+ PlacementGroup(const PlacementGroup&);
+ // NOTE(review): copy ctor is declared but copy-assignment is not; with a
+ // unique_ptr member the assignment is implicitly deleted — confirm intended.
+ ~PlacementGroup();
+ /// parse a textual pg id; returns true on success, false on bad input
+ bool parse(const char*);
+ std::unique_ptr<PlacementGroupImpl> impl;
+ };
+
+ /// print a PlacementGroup in its textual form
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&);
+
+ class CEPH_RADOS_API Rados
+ {
+ public:
+ /// report the librados version as major/minor/extra
+ static void version(int *major, int *minor, int *extra);
+
+ Rados();
+ explicit Rados(IoCtx& ioctx);
+ ~Rados();
+ /// wrap an existing C-API rados_t cluster handle
+ static void from_rados_t(rados_t cluster, Rados &rados);
+
+ // initialize the handle (choose one init* form), then connect()
+ int init(const char * const id);
+ int init2(const char * const name, const char * const clustername,
+ uint64_t flags);
+ int init_with_context(config_t cct_);
+ config_t cct();
+ int connect();
+ void shutdown();
+ int watch_flush();
+ int aio_watch_flush(AioCompletion*);
+ // configuration: read/parse/set/get config options for this handle
+ int conf_read_file(const char * const path) const;
+ int conf_parse_argv(int argc, const char ** argv) const;
+ int conf_parse_argv_remainder(int argc, const char ** argv,
+ const char ** remargv) const;
+ int conf_parse_env(const char *env) const;
+ int conf_set(const char *option, const char *value);
+ int conf_get(const char *option, std::string &val);
+
+ int service_daemon_register(
+ const std::string& service, ///< service name (e.g., 'rgw')
+ const std::string& name, ///< daemon name (e.g., 'gwfoo')
+ const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
+ int service_daemon_update_status(
+ std::map<std::string,std::string>&& status);
+
+ // pool creation/deletion (auid forms are deprecated)
+ int pool_create(const char *name);
+ int pool_create(const char *name, uint64_t auid)
+ __attribute__ ((deprecated));
+ int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule(const char *name, uint8_t crush_rule);
+ int pool_create_async(const char *name, PoolAsyncCompletion *c);
+ int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
+ int pool_get_base_tier(int64_t pool, int64_t* base_tier);
+ int pool_delete(const char *name);
+ int pool_delete_async(const char *name, PoolAsyncCompletion *c);
+ int64_t pool_lookup(const char *name);
+ int pool_reverse_lookup(int64_t id, std::string *name);
+
+ uint64_t get_instance_id();
+
+ int get_min_compatible_osd(int8_t* require_osd_release);
+ int get_min_compatible_client(int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+ // send admin commands to the monitors, mgr, an OSD, or a PG
+ int mon_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int mgr_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+
+ // open an IoCtx on a pool by name or numeric id
+ int ioctx_create(const char *name, IoCtx &pioctx);
+ int ioctx_create2(int64_t pool_id, IoCtx &pioctx);
+
+ // Features useful for test cases
+ void test_blocklist_self(bool set);
+
+ /* pool info */
+ int pool_list(std::list<std::string>& v);
+ int pool_list2(std::list<std::pair<int64_t, std::string> >& v);
+ int get_pool_stats(std::list<std::string>& v,
+ stats_map& result);
+ /// deprecated; use simpler form. categories no longer supported.
+ int get_pool_stats(std::list<std::string>& v,
+ std::map<std::string, stats_map>& stats);
+ /// deprecated; categories no longer supported
+ int get_pool_stats(std::list<std::string>& v,
+ std::string& category,
+ std::map<std::string, stats_map>& stats);
+ /// check if pool has selfmanaged snaps
+ bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname);
+
+ int cluster_stat(cluster_stat_t& result);
+ int cluster_fsid(std::string *fsid);
+
+ /**
+ * List inconsistent placement groups in the given pool
+ *
+ * @param pool_id the pool id
+ * @param pgs [out] the inconsistent PGs
+ */
+ int get_inconsistent_pgs(int64_t pool_id,
+ std::vector<PlacementGroup>* pgs);
+ /**
+ * List the inconsistent objects found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param objects [out] the objects where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_objects(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_obj_t>* objects,
+ uint32_t* interval);
+ /**
+ * List the inconsistent snapsets found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param snapsets [out] the objects where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_snapsets(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_snapset_t>* snapset,
+ uint32_t* interval);
+
+ /// get/wait for the most recent osdmap
+ int wait_for_latest_osdmap();
+
+ int blocklist_add(const std::string& client_address,
+ uint32_t expire_seconds);
+
+ std::string get_addrs() const;
+
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion handler, even if the pool_create_async()
+ * and/or pool_delete_async() fails and does not send the async request
+ */
+ static PoolAsyncCompletion *pool_async_create_completion();
+
+ // -- aio --
+ static AioCompletion *aio_create_completion();
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete,
+ callback_t cb_safe)
+ __attribute__ ((deprecated));
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete);
+
+ friend std::ostream& operator<<(std::ostream &oss, const Rados& r);
+ private:
+ friend class neorados::RADOS;
+
+ // We don't allow assignment or copying
+ Rados(const Rados& rhs);
+ const Rados& operator=(const Rados& rhs);
+ RadosClient *client;
+ };
+
+} // namespace v14_2_0
+} // namespace librados
+
+#endif
+
diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp
new file mode 100644
index 000000000..396f3a838
--- /dev/null
+++ b/src/include/rados/librados_fwd.hpp
@@ -0,0 +1,34 @@
+#ifndef __LIBRADOS_FWD_HPP
+#define __LIBRADOS_FWD_HPP
+
+struct blkin_trace_info;
+
+namespace libradosstriper {
+
+class RadosStriper;
+
+} // namespace libradosstriper
+
+namespace librados {
+inline namespace v14_2_0 {
+
+class AioCompletion;
+class IoCtx;
+class ListObject;
+class NObjectIterator;
+class ObjectCursor;
+class ObjectItem;
+class ObjectOperation;
+class ObjectOperationCompletion;
+class ObjectReadOperation;
+class ObjectWriteOperation;
+class PlacementGroup;
+class PoolAsyncCompletion;
+class Rados;
+class WatchCtx;
+class WatchCtx2;
+
+} // inline namespace v14_2_0
+} // namespace librados
+
+#endif // __LIBRADOS_FWD_HPP
diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h
new file mode 100644
index 000000000..c20e96bed
--- /dev/null
+++ b/src/include/rados/librgw.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_LIBRGW_H
+#define CEPH_LIBRGW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_VER_MAJOR 1
+#define LIBRGW_VER_MINOR 1
+#define LIBRGW_VER_EXTRA 0
+
+#define LIBRGW_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA)
+
+typedef void* librgw_t;
+int librgw_create(librgw_t *rgw, int argc, char **argv);
+void librgw_shutdown(librgw_t rgw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRGW_H */
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 000000000..80ae69d25
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#define CEPH_CLS_API [[gnu::visibility("default")]]
+
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+
+#define CLS_INIT(name) \
+CEPH_CLS_API void __cls_init()
+
+#define CLS_METHOD_RD 0x1 /// method executes read operations
+#define CLS_METHOD_WR 0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...) \
+ cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+ __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+ class ceph::buffer::list *inbl, class ceph::buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass
+ * @param method
+ * @param flags
+ * @param class_call
+ * @param handle
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+ cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx
+ * @param exclusive
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx
+ * @param size
+ * @param mtime
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param outbl
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param inbl
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx
+ * @param key
+ * @param outbl
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx
+ * @param key
+ * @param inbl
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *inbl);
+
+#endif
+
+#endif
diff --git a/src/include/rados/page.h b/src/include/rados/page.h
new file mode 120000
index 000000000..cf983e838
--- /dev/null
+++ b/src/include/rados/page.h
@@ -0,0 +1 @@
+../page.h \ No newline at end of file
diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h
new file mode 100644
index 000000000..d308341ec
--- /dev/null
+++ b/src/include/rados/rados_types.h
@@ -0,0 +1,41 @@
+#ifndef CEPH_RADOS_TYPES_H
+#define CEPH_RADOS_TYPES_H
+
+#include <stdint.h>
+
+/**
+ * @struct obj_watch_t
+ * One item from list_watchers
+ */
+struct obj_watch_t {
+ /// Address of the Watcher
+ char addr[256];
+ /// Watcher ID
+ int64_t watcher_id;
+ /// Cookie
+ uint64_t cookie;
+ /// Timeout in Seconds
+ uint32_t timeout_seconds;
+};
+
+struct notify_ack_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+ char *payload;
+ uint64_t payload_len;
+};
+
+struct notify_timeout_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+
+/**
+ *
+ * Pass as nspace argument to rados_ioctx_set_namespace()
+ * before calling rados_nobjects_list_open() to return
+ * all objects in all namespaces.
+ */
+#define LIBRADOS_ALL_NSPACES "\001"
+
+#endif
diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp
new file mode 100644
index 000000000..84023579b
--- /dev/null
+++ b/src/include/rados/rados_types.hpp
@@ -0,0 +1,341 @@
+#ifndef CEPH_RADOS_TYPES_HPP
+#define CEPH_RADOS_TYPES_HPP
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "buffer.h"
+#include "rados_types.h"
+
+namespace librados {
+
+typedef uint64_t snap_t;
+
+enum {
+ SNAP_HEAD = (uint64_t)(-2),
+ SNAP_DIR = (uint64_t)(-1)
+};
+
+struct clone_info_t {
+ snap_t cloneid;
+ std::vector<snap_t> snaps; // ascending
+ std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest
+ uint64_t size;
+ clone_info_t() : cloneid(0), size(0) {}
+};
+
+struct snap_set_t {
+ std::vector<clone_info_t> clones; // ascending
+ snap_t seq; // newest snapid seen by the object
+ snap_set_t() : seq(0) {}
+};
+
+struct object_id_t {
+ std::string name;
+ std::string nspace;
+ std::string locator;
+ snap_t snap = 0;
+ object_id_t() = default;
+ object_id_t(const std::string& name,
+ const std::string& nspace,
+ const std::string& locator,
+ snap_t snap)
+ : name(name),
+ nspace(nspace),
+ locator(locator),
+ snap(snap)
+ {}
+};
+
+struct err_t {
+ enum : uint64_t {
+ SHARD_MISSING = 1 << 1,
+ SHARD_STAT_ERR = 1 << 2,
+ SHARD_READ_ERR = 1 << 3,
+ DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old
+ DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+ OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old
+ OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+ SIZE_MISMATCH_OI = 1 << 11, // Old
+ SIZE_MISMATCH_INFO = 1 << 11,
+ SHARD_EC_HASH_MISMATCH = 1 << 12,
+ SHARD_EC_SIZE_MISMATCH = 1 << 13,
+ OI_ATTR_MISSING = 1 << 14, // Old
+ INFO_MISSING = 1 << 14,
+ OI_ATTR_CORRUPTED = 1 << 15, // Old
+ INFO_CORRUPTED = 1 << 15,
+ SS_ATTR_MISSING = 1 << 16, // Old
+ SNAPSET_MISSING = 1 << 16,
+ SS_ATTR_CORRUPTED = 1 << 17, // Old
+ SNAPSET_CORRUPTED = 1 << 17,
+ OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old
+ OBJ_SIZE_INFO_MISMATCH = 1 << 18,
+ HINFO_MISSING = 1 << 19,
+ HINFO_CORRUPTED = 1 << 20
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+ static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+ bool has_shard_missing() const {
+ return errors & SHARD_MISSING;
+ }
+ bool has_stat_error() const {
+ return errors & SHARD_STAT_ERR;
+ }
+ bool has_read_error() const {
+ return errors & SHARD_READ_ERR;
+ }
+ bool has_data_digest_mismatch_oi() const { // Compatibility
+ return errors & DATA_DIGEST_MISMATCH_OI;
+ }
+ bool has_data_digest_mismatch_info() const {
+ return errors & DATA_DIGEST_MISMATCH_INFO;
+ }
+ bool has_omap_digest_mismatch_oi() const { // Compatibility
+ return errors & OMAP_DIGEST_MISMATCH_OI;
+ }
+ bool has_omap_digest_mismatch_info() const {
+ return errors & OMAP_DIGEST_MISMATCH_INFO;
+ }
+ bool has_size_mismatch_oi() const { // Compatibility
+ return errors & SIZE_MISMATCH_OI;
+ }
+ bool has_size_mismatch_info() const {
+ return errors & SIZE_MISMATCH_INFO;
+ }
+ bool has_ec_hash_error() const {
+ return errors & SHARD_EC_HASH_MISMATCH;
+ }
+ bool has_ec_size_error() const {
+ return errors & SHARD_EC_SIZE_MISMATCH;
+ }
+ bool has_oi_attr_missing() const { // Compatibility
+ return errors & OI_ATTR_MISSING;
+ }
+ bool has_info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool has_oi_attr_corrupted() const { // Compatibility
+ return errors & OI_ATTR_CORRUPTED;
+ }
+ bool has_info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool has_ss_attr_missing() const { // Compatibility
+ return errors & SS_ATTR_MISSING;
+ }
+ bool has_snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool has_ss_attr_corrupted() const { // Compatibility
+ return errors & SS_ATTR_CORRUPTED;
+ }
+ bool has_snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_obj_size_oi_mismatch() const { // Compatibility
+ return errors & OBJ_SIZE_OI_MISMATCH;
+ }
+ bool has_obj_size_info_mismatch() const {
+ return errors & OBJ_SIZE_INFO_MISMATCH;
+ }
+ bool has_hinfo_missing() const {
+ return errors & HINFO_MISSING;
+ }
+ bool has_hinfo_corrupted() const {
+ return errors & HINFO_CORRUPTED;
+ }
+};
+
+struct shard_info_t : err_t {
+ std::map<std::string, ceph::bufferlist> attrs;
+ uint64_t size = -1;
+ bool omap_digest_present = false;
+ uint32_t omap_digest = 0;
+ bool data_digest_present = false;
+ uint32_t data_digest = 0;
+ bool selected_oi = false;
+ bool primary = false;
+};
+
+struct osd_shard_t {
+ int32_t osd;
+ int8_t shard;
+};
+
+inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) {
+ if (lhs.osd < rhs.osd)
+ return true;
+ else if (lhs.osd > rhs.osd)
+ return false;
+ else
+ return lhs.shard < rhs.shard;
+}
+
+struct obj_err_t {
+ enum : uint64_t {
+ OBJECT_INFO_INCONSISTENCY = 1 << 1,
+ // XXX: Can an older rados binary work if these bits stay the same?
+ DATA_DIGEST_MISMATCH = 1 << 4,
+ OMAP_DIGEST_MISMATCH = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ ATTR_VALUE_MISMATCH = 1 << 7,
+ ATTR_NAME_MISMATCH = 1 << 8,
+ SNAPSET_INCONSISTENCY = 1 << 9,
+ HINFO_INCONSISTENCY = 1 << 10,
+ SIZE_TOO_LARGE = 1 << 11,
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH
+ |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE;
+ static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
+ bool has_object_info_inconsistency() const {
+ return errors & OBJECT_INFO_INCONSISTENCY;
+ }
+ bool has_data_digest_mismatch() const {
+ return errors & DATA_DIGEST_MISMATCH;
+ }
+ bool has_omap_digest_mismatch() const {
+ return errors & OMAP_DIGEST_MISMATCH;
+ }
+ bool has_size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool has_attr_value_mismatch() const {
+ return errors & ATTR_VALUE_MISMATCH;
+ }
+ bool has_attr_name_mismatch() const {
+ return errors & ATTR_NAME_MISMATCH;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_snapset_inconsistency() const {
+ return errors & SNAPSET_INCONSISTENCY;
+ }
+ bool has_hinfo_inconsistency() const {
+ return errors & HINFO_INCONSISTENCY;
+ }
+ bool has_size_too_large() const {
+ return errors & SIZE_TOO_LARGE;
+ }
+};
+
+struct inconsistent_obj_t : obj_err_t {
+ inconsistent_obj_t() = default;
+ inconsistent_obj_t(const object_id_t& object)
+ : object{object}, version(0)
+ {}
+ object_id_t object;
+ uint64_t version; // XXX: Redundant with object info attr
+ std::map<osd_shard_t, shard_info_t> shards;
+ err_t union_shards;
+};
+
+struct inconsistent_snapset_t {
+ inconsistent_snapset_t() = default;
+ inconsistent_snapset_t(const object_id_t& head)
+ : object{head}
+ {}
+ enum {
+ SNAPSET_MISSING = 1 << 0,
+ SNAPSET_CORRUPTED = 1 << 1,
+ CLONE_MISSING = 1 << 2,
+ SNAP_ERROR = 1 << 3,
+ HEAD_MISMATCH = 1 << 4, // Unused
+ HEADLESS_CLONE = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ OI_MISSING = 1 << 7, // Old
+ INFO_MISSING = 1 << 7,
+ OI_CORRUPTED = 1 << 8, // Old
+ INFO_CORRUPTED = 1 << 8,
+ EXTRA_CLONES = 1 << 9,
+ };
+ uint64_t errors = 0;
+ object_id_t object;
+ // Extra clones
+ std::vector<snap_t> clones;
+ std::vector<snap_t> missing;
+ ceph::bufferlist ss_bl;
+
+ bool ss_attr_missing() const { // Compatibility
+ return errors & SNAPSET_MISSING;
+ }
+ bool snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool ss_attr_corrupted() const { // Compatibility
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool clone_missing() const {
+ return errors & CLONE_MISSING;
+ }
+ bool snapset_mismatch() const { // Compatibility
+ return errors & SNAP_ERROR;
+ }
+ bool snapset_error() const {
+ return errors & SNAP_ERROR;
+ }
+ bool head_mismatch() const { // Compatibility
+ return false;
+ }
+ bool headless() const {
+ return errors & HEADLESS_CLONE;
+ }
+ bool size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool oi_attr_missing() const { // Compatibility
+ return errors & OI_MISSING;
+ }
+ bool info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool oi_attr_corrupted() const { // Compatibility
+ return errors & OI_CORRUPTED;
+ }
+ bool info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool extra_clones() const {
+ return errors & EXTRA_CLONES;
+ }
+};
+
+/**
+ * @var all_nspaces
+ * Pass as nspace argument to IoCtx::set_namespace()
+ * before calling nobjects_begin() to iterate
+ * through all objects in all namespaces.
+ */
+const std::string all_nspaces(LIBRADOS_ALL_NSPACES);
+
+struct notify_ack_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+ ceph::bufferlist payload_bl;
+};
+
+struct notify_timeout_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+}
+#endif
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
new file mode 100644
index 000000000..e1ea45593
--- /dev/null
+++ b/src/include/rados/rgw_file.h
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * convert RGW commands to file commands
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef RADOS_RGW_FILE_H
+#define RADOS_RGW_FILE_H
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "librgw.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_FILE_VER_MAJOR 1
+#define LIBRGW_FILE_VER_MINOR 2
+#define LIBRGW_FILE_VER_EXTRA 0
+
+#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
+
+/*
+ * object types
+ */
+enum rgw_fh_type {
+ RGW_FS_TYPE_NIL = 0,
+ RGW_FS_TYPE_FILE,
+ RGW_FS_TYPE_DIRECTORY,
+ RGW_FS_TYPE_SYMBOLIC_LINK,
+};
+
+/*
+ * dynamic allocated handle to support nfs handle
+ */
+
+/* content-addressable hash */
+struct rgw_fh_hk {
+ uint64_t bucket;
+ uint64_t object;
+};
+
+struct rgw_file_handle
+{
+ /* content-addressable hash */
+ struct rgw_fh_hk fh_hk;
+ void *fh_private; /* librgw private data */
+ /* object type */
+ enum rgw_fh_type fh_type;
+};
+
+struct rgw_fs
+{
+ librgw_t rgw;
+ void *fs_private;
+ struct rgw_file_handle* root_fh;
+};
+
+
+/* XXX mount info hypothetical--emulate Unix, support at least
+ * UUID-length fsid */
+struct rgw_statvfs {
+ uint64_t f_bsize; /* file system block size */
+ uint64_t f_frsize; /* fragment size */
+ uint64_t f_blocks; /* size of fs in f_frsize units */
+ uint64_t f_bfree; /* # free blocks */
+ uint64_t f_bavail; /* # free blocks for unprivileged users */
+ uint64_t f_files; /* # inodes */
+ uint64_t f_ffree; /* # free inodes */
+ uint64_t f_favail; /* # free inodes for unprivileged users */
+ uint64_t f_fsid[2]; /* file system ID */
+ uint64_t f_flag; /* mount flags */
+ uint64_t f_namemax; /* maximum filename length */
+};
+
+
+void rgwfile_version(int *major, int *minor, int *extra);
+
+/*
+ lookup object by name (POSIX style)
+*/
+#define RGW_LOOKUP_FLAG_NONE 0x0000
+#define RGW_LOOKUP_FLAG_CREATE 0x0001
+#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */
+#define RGW_LOOKUP_FLAG_DIR 0x0004
+#define RGW_LOOKUP_FLAG_FILE 0x0008
+
+#define RGW_LOOKUP_TYPE_FLAGS \
+ (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE)
+
+int rgw_lookup(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *path,
+ struct rgw_file_handle **fh,
+ struct stat *st, uint32_t mask, uint32_t flags);
+
+/*
+ lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ * release file handle
+ */
+#define RGW_FH_RELE_FLAG_NONE 0x0000
+
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ attach rgw namespace
+*/
+#define RGW_MOUNT_FLAG_NONE 0x0000
+
+int rgw_mount(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+/*
+ register invalidate callbacks
+*/
+#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000
+
+typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk);
+
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+ void *arg, uint32_t flags);
+
+/*
+ detach rgw namespace
+*/
+#define RGW_UMOUNT_FLAG_NONE 0x0000
+
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags);
+
+
+/*
+ get filesystem attributes
+*/
+#define RGW_STATFS_FLAG_NONE 0x0000
+
+int rgw_statfs(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ struct rgw_statvfs *vfs_st,
+ uint32_t flags);
+
+
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE 1
+#define RGW_SETATTR_UID 2
+#define RGW_SETATTR_GID 4
+#define RGW_SETATTR_MTIME 8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE 32
+#define RGW_SETATTR_CTIME 64
+
+/*
+ create file
+*/
+#define RGW_CREATE_FLAG_NONE 0x0000
+
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a symbolic link
+ */
+#define RGW_CREATELINK_FLAG_NONE 0x0000
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, const char *link_path, struct stat *st,
+ uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a new directory
+*/
+#define RGW_MKDIR_FLAG_NONE 0x0000
+
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ rename object
+*/
+#define RGW_RENAME_FLAG_NONE 0x0000
+
+int rgw_rename(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *olddir, const char* old_name,
+ struct rgw_file_handle *newdir, const char* new_name,
+ uint32_t flags);
+
+/*
+ remove file or directory
+*/
+#define RGW_UNLINK_FLAG_NONE 0x0000
+
+int rgw_unlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char* path,
+ uint32_t flags);
+
+/*
+ read directory content
+*/
+typedef int (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset,
+ struct stat *st, uint32_t mask,
+ uint32_t flags);
+
+#define RGW_READDIR_FLAG_NONE 0x0000
+#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */
+
+int rgw_readdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, uint64_t *offset,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags);
+
+/*
+ get unix attributes for object
+*/
+#define RGW_GETATTR_FLAG_NONE 0x0000
+
+int rgw_getattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t flags);
+
+/*
+ set unix attributes for object
+*/
+#define RGW_SETATTR_FLAG_NONE 0x0000
+
+int rgw_setattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+/*
+ truncate file
+*/
+#define RGW_TRUNCATE_FLAG_NONE 0x0000
+
+int rgw_truncate(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t size,
+ uint32_t flags);
+
+/*
+ open file
+*/
+#define RGW_OPEN_FLAG_NONE 0x0000
+#define RGW_OPEN_FLAG_CREATE 0x0001
+#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */
+#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */
+
+int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ uint32_t posix_flags, uint32_t flags);
+
+/*
+ close file
+*/
+
+#define RGW_CLOSE_FLAG_NONE 0x0000
+#define RGW_CLOSE_FLAG_RELE 0x0001
+
+int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ read data from file
+*/
+#define RGW_READ_FLAG_NONE 0x0000
+
+int rgw_read(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ read symbolic link
+*/
+#define RGW_READLINK_FLAG_NONE 0x0000
+
+int rgw_readlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ write data to file
+*/
+#define RGW_WRITE_FLAG_NONE 0x0000
+
+int rgw_write(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_written, void *buffer,
+ uint32_t flags);
+
+#define RGW_UIO_NONE 0x0000
+#define RGW_UIO_GIFT 0x0001
+#define RGW_UIO_FREE 0x0002
+#define RGW_UIO_BUFQ 0x0004
+
+struct rgw_uio;
+typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t);
+
+/* buffer vector descriptors */
+struct rgw_vio {
+ void *vio_p1;
+ void *vio_u1;
+ void *vio_base;
+ int32_t vio_len;
+};
+
+struct rgw_uio {
+ rgw_uio_release uio_rele;
+ void *uio_p1;
+ void *uio_u1;
+ uint64_t uio_offset;
+ uint64_t uio_resid;
+ uint32_t uio_cnt;
+ uint32_t uio_flags;
+ struct rgw_vio *uio_vio; /* appended vectors */
+};
+
+typedef struct rgw_uio rgw_uio;
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+int rgw_writev(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+/*
+ sync written data
+*/
+#define RGW_FSYNC_FLAG_NONE 0x0000
+
+int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ NFS commit operation
+*/
+
+#define RGW_COMMIT_FLAG_NONE 0x0000
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint64_t offset, uint64_t length, uint32_t flags);
+
+/*
+ extended attributes
+ */
+typedef struct rgw_xattrstr
+{
+ char *val;
+ uint32_t len;
+} rgw_xattrstr;
+
+typedef struct rgw_xattr
+{
+ rgw_xattrstr key;
+ rgw_xattrstr val;
+} rgw_xattr;
+
+typedef struct rgw_xattrlist
+{
+ rgw_xattr *xattrs;
+ uint32_t xattr_cnt;
+} rgw_xattrlist;
+
+#define RGW_GETXATTR_FLAG_NONE 0x0000
+
+typedef int (*rgw_getxattr_cb)(rgw_xattrlist *attrs, void *arg,
+ uint32_t flags);
+
+int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg,
+ uint32_t flags);
+
+#define RGW_LSXATTR_FLAG_NONE 0x0000
+#define RGW_LSXATTR_FLAG_STOP 0x0001
+
+int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrstr *filter_prefix /* unimplemented for now */,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags);
+
+#define RGW_SETXATTR_FLAG_NONE 0x0000
+
+int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#define RGW_RMXATTR_FLAG_NONE 0x0000
+
+int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RADOS_RGW_FILE_H */
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
new file mode 100644
index 000000000..a35345f7d
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.h
@@ -0,0 +1,620 @@
+#ifndef CEPH_LIBRADOSSTRIPER_H
+#define CEPH_LIBRADOSSTRIPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "../rados/librados.h"
+
+#define LIBRADOSSTRIPER_VER_MAJOR 0
+#define LIBRADOSSTRIPER_VER_MINOR 0
+#define LIBRADOSSTRIPER_VER_EXTRA 0
+
+#define LIBRADOSSTRIPER_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+
+#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA)
+
+/**
+ * @typedef rados_striper_t
+ *
+ * A handle for interacting with striped objects in a RADOS cluster.
+ */
+typedef void *rados_striper_t;
+
+/**
+ * @defgroup libradosstriper_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ *
+ * @{
+ */
+
+/**
+ * Creates a rados striper using the given io context
+ * Striper has initially default object layout.
+ * See rados_striper_set_object_layout_*() to change this
+ *
+ * @param ioctx the rados context to use
+ * @param striper where to store the rados striper
+ * @returns 0 on success, negative error code on failure
+ */
+ int rados_striper_create(rados_ioctx_t ioctx,
+ rados_striper_t *striper);
+
+/**
+ * Destroys a rados striper
+ *
+ * @param striper the striper to destroy
+ */
+void rados_striper_destroy(rados_striper_t striper);
+
+/**
+ * Sets the object layout's stripe unit of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_unit the stripe_unit value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+ unsigned int stripe_unit);
+
+/**
+ * Sets the object layout's stripe count of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_count the stripe_count value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+ unsigned int stripe_count);
+
+/**
+ * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param object_size the object_size value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+ unsigned int object_size);
+
+/** @} init */
+
+/**
+ * @defgroup libradosstriper_h_synch_io Synchronous I/O
+ * Writes are striped to several rados objects which are then
+ * replicated to a number of OSDs based on the configuration
+ * of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_striper_ioctx_wait_for_complete().
+ *
+ * @{
+ */
+
+/**
+ * Synchronously write data to a striped object at the specified offset
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_write(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously write an entire striped object
+ *
+ * The striped object is filled with the provided data. If the striped object exists,
+ * it is truncated and then written.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_write_full(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Append data to an object
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_append(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Synchronously read data from a striped object at the specified offset
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+int rados_striper_read(rados_striper_t striper,
+ const char *soid,
+ char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_remove(rados_striper_t striper,
+ const char* soid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @note the truncation is not fully atomic. The metadata part is,
+ * so the behavior will be atomic from user point of view when
+ * the object size is reduced. However, in case of failure, old data
+ * may stay around, hidden. They may reappear if the object size is
+ * later grown, instead of the expected 0s. When growing the
+ * object and in case of failure, the new 0 data may not be
+ * fully created. This can lead to ENOENT errors when
+ * writing/reading the missing parts.
+ * @note the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ * @param striper the striper in which the truncation will occur
+ * @param soid the name of the striped object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size);
+
+/** @} Synchronous I/O */
+
+/**
+ * @defgroup libradosstriper_h_xattrs Xattrs
+ * Extended attributes are stored as extended attributes on the
+ * first rados regular object of the striped object.
+ * Thus, they have the same limitations as the underlying
+ * rados extended attributes.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the getxattr will occur
+ * @param oid name of the striped object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+int rados_striper_getxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ char *buf,
+ size_t len);
+
+/**
+ * Set an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the setxattr will occur
+ * @param oid name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_setxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from a striped object.
+ *
+ * @param striper the striper in which the rmxattr will occur
+ * @param oid name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_rmxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on a striped object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param striper the striper in which the getxattrs will occur
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs(rados_striper_t striper,
+ const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the striped object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name,
+ const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+void rados_striper_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Synchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_stat(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ time_t *pmtime);
+
+int rados_striper_stat2(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ struct timespec *pmtime);
+
+/**
+ * @defgroup libradosstriper_h_asynch_io Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_striper_multi_completion_t
+ * Represents the state of a set of asynchronous operations
+ * it contains the aggregated return value once the operations complete
+ * and can be used to block until all operations are complete and/or safe.
+ */
+typedef void *rados_striper_multi_completion_t;
+
+/**
+ * Constructs a multi completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+int rados_striper_multi_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_striper_multi_completion_t *pc);
+
+/**
+ * Block until all operations complete
+ *
+ * This means data is in memory on all replicas.
+ *
+ * @param c operations to wait for
+ * (no return value)
+ */
+void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * (no return value)
+ */
+void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations complete and callback completes
+ *
+ * This means data is in memory on all replicas and can be read.
+ *
+ * @param c operations to wait for
+ * (no return value)
+ */
+void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe and callback has completed
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * (no return value)
+ */
+void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation and callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe and has the callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Get the return value of a multi asynchronous operation
+ *
+ * The return value is set when all operations are complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operations to inspect
+ * @returns aggregated return value of the operations
+ */
+int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c);
+
+/**
+ * Release a multi asynchronous IO completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c multi completion to release
+ */
+void rados_striper_multi_aio_release(rados_striper_multi_completion_t c);
+
+/**
+ * Asynchronously write data to a striped object at the specified offset
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously appends data to a striped object
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_append(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously fills an object with the provided data.
+ * If the object exists, it is truncated and then written.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write_full(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously read data from a striped object at the specified offset
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the read is safe and complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_read(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ char *buf,
+ const size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, negative error code on failure
+ */
+
+int rados_striper_aio_remove(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion);
+
+/**
+ * Block until all pending writes in a striper are safe
+ *
+ * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @param striper the striper in which the flush will occur
+ * (no return value)
+*/
+void rados_striper_aio_flush(rados_striper_t striper);
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param completion what to do when the stats is complete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_stat(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ time_t *pmtime);
+
+int rados_striper_aio_stat2(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ struct timespec *pmtime);
+/** @} Asynchronous I/O */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
new file mode 100644
index 000000000..fb790b0d7
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -0,0 +1,241 @@
+#ifndef __LIBRADOSSTRIPER_HPP
+#define __LIBRADOSSTRIPER_HPP
+
+#include <string.h>
+#include <string>
+#include <map>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+
+#include "libradosstriper.h"
+
+namespace libradosstriper
+{
+ struct RadosStriperImpl;
+ struct MultiAioCompletionImpl;
+
+ /*
+ * Completion object for multiple asynchronous IO
+ * It allows to internally handle several "requests"
+ */
+ struct MultiAioCompletion {
+ MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {}
+ ~MultiAioCompletion();
+ int set_complete_callback(void *cb_arg, librados::callback_t cb);
+ int set_safe_callback(void *cb_arg, librados::callback_t cb) __attribute__ ((deprecated));
+ void wait_for_complete();
+ void wait_for_safe() __attribute__ ((deprecated));
+ void wait_for_complete_and_cb();
+ void wait_for_safe_and_cb() __attribute__ ((deprecated));
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ void release();
+ MultiAioCompletionImpl *pc;
+ };
+
+ /* RadosStriper : This class allows to perform read/writes on striped objects
+ *
+ * Typical use (error checking omitted):
+ *
+ * RadosStriper rs;
+ * RadosStriper::striper_create(ioctx, &rs);
+ * bufferlist bl;
+ * ... put data in bl ...
+ * rs.write(object_name, bl, len, offset);
+ * bufferlist bl2;
+ * rs.read(object_name, &bl2, len, offset);
+ * ...
+ */
+ class RadosStriper
+ {
+ public:
+
+ /*
+ * constructor
+ */
+ RadosStriper();
+
+ /*
+ * builds the C counter part of a RadosStriper
+ */
+ static void to_rados_striper_t(RadosStriper &striper,
+ rados_striper_t *s);
+
+ /*
+ * copy constructor
+ */
+ RadosStriper(const RadosStriper& rs);
+
+ /*
+ * operator=
+ */
+ RadosStriper& operator=(const RadosStriper& rs);
+
+ /*
+ * destructor
+ * Internally calling close() if an object is currently opened
+ */
+ ~RadosStriper();
+
+ /*
+ * create method
+ */
+ static int striper_create(librados::IoCtx& ioctx,
+ RadosStriper *striper);
+
+ /*
+ * set object layout's stripe unit
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_unit(unsigned int stripe_unit);
+
+ /*
+ * set object layout's stripe count
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_count(unsigned int stripe_count);
+
+ /*
+ * set object layout's object size
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_object_size(unsigned int object_size);
+
+ /**
+ * Get the value of an extended attribute on a striped object
+ */
+ int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Set the value of an extended attribute on a striped object
+ */
+ int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Delete an extended attribute from a striped object
+ */
+ int rmxattr(const std::string& oid, const char *name);
+
+ /**
+ * Start iterating over xattrs on a striped object.
+ */
+ int getxattrs(const std::string& oid,
+ std::map<std::string, ceph::bufferlist>& attrset);
+
+ /**
+ * synchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * synchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write_full(const std::string& soid, const ceph::bufferlist& bl);
+
+ /**
+ * synchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * asynchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
+
+ /**
+ * asynchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * synchronously read from the striped object at the specified offset.
+ */
+ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously read from the striped object at the specified offset.
+ */
+ int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off);
+
+ /**
+ * synchronously get striped object stats (size/mtime)
+ */
+ int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * asynchronously get striped object stats (size/mtime)
+ */
+ int aio_stat(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, struct timespec *pts);
+
+ /**
+ * deletes a striped object.
+ * There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ */
+ int remove(const std::string& soid);
+ int remove(const std::string& soid, int flags);
+
+ /**
+ * asynchronous remove of striped objects
+ * See synchronous version for comments on (lack of) atomicity
+ */
+ int aio_remove(const std::string& soid, librados::AioCompletion *c);
+ int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags);
+
+ /**
+ * Resizes a striped object
+ * the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ */
+ int trunc(const std::string& oid, uint64_t size);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * creation of multi aio completion objects
+ */
+ static MultiAioCompletion *multi_aio_create_completion();
+ static MultiAioCompletion *multi_aio_create_completion(void *cb_arg,
+ librados::callback_t cb_complete,
+ librados::callback_t cb_safe);
+
+ private:
+ RadosStriperImpl *rados_striper_impl;
+
+ };
+
+}
+
+#endif
diff --git a/src/include/random.h b/src/include/random.h
new file mode 100644
index 000000000..f2e3e37bc
--- /dev/null
+++ b/src/include/random.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#ifndef CEPH_RANDOM_H
+#define CEPH_RANDOM_H 1
+
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <boost/optional.hpp>
+
+// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494
+#ifdef __MINGW32__
+#include <boost/random/random_device.hpp>
+
+using random_device_t = boost::random::random_device;
+#else
+using random_device_t = std::random_device;
+#endif
+
+// Basic random number facility (see N3551 for inspiration):
+namespace ceph::util {
+
+inline namespace version_1_0_3 {
+
+namespace detail {
+
+template <typename T0, typename T1>
+using larger_of = typename std::conditional<
+ sizeof(T0) >= sizeof(T1),
+ T0, T1>
+ ::type;
+
+// avoid mixing floating point and integers:
+template <typename NumberT0, typename NumberT1>
+using has_compatible_numeric_types =
+ std::disjunction<
+ std::conjunction<
+ std::is_floating_point<NumberT0>, std::is_floating_point<NumberT1>
+ >,
+ std::conjunction<
+ std::is_integral<NumberT0>, std::is_integral<NumberT1>
+ >
+ >;
+
+
+// Select the larger of type compatible numeric types:
+template <typename NumberT0, typename NumberT1>
+using select_number_t = std::enable_if_t<detail::has_compatible_numeric_types<NumberT0, NumberT1>::value,
+ detail::larger_of<NumberT0, NumberT1>>;
+
+} // namespace detail
+
+namespace detail {
+
+// Choose default distribution for appropriate types:
+template <typename NumberT,
+ bool IsIntegral>
+struct select_distribution
+{
+ using type = std::uniform_int_distribution<NumberT>;
+};
+
+template <typename NumberT>
+struct select_distribution<NumberT, false>
+{
+ using type = std::uniform_real_distribution<NumberT>;
+};
+
+template <typename NumberT>
+using default_distribution = typename
+ select_distribution<NumberT, std::is_integral<NumberT>::value>::type;
+
+} // namespace detail
+
+namespace detail {
+
+template <typename EngineT>
+EngineT& engine();
+
+template <typename MutexT, typename EngineT,
+ typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT seed, MutexT& m, EngineT& e)
+{
+ std::lock_guard<MutexT> lg(m);
+ e.seed(seed);
+}
+
+template <typename MutexT, typename EngineT>
+void randomize_rng(MutexT& m, EngineT& e)
+{
+ random_device_t rd;
+
+ std::lock_guard<MutexT> lg(m);
+ e.seed(rd());
+}
+
+template <typename EngineT = std::default_random_engine,
+ typename SeedT = typename EngineT::result_type>
+void randomize_rng(const SeedT n)
+{
+ detail::engine<EngineT>().seed(n);
+}
+
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+ random_device_t rd;
+ detail::engine<EngineT>().seed(rd());
+}
+
+template <typename EngineT>
+EngineT& engine()
+{
+ thread_local boost::optional<EngineT> rng_engine;
+
+ if (!rng_engine) {
+ rng_engine.emplace(EngineT());
+ randomize_rng<EngineT>();
+ }
+
+ return *rng_engine;
+}
+
+} // namespace detail
+
+namespace detail {
+
+template <typename NumberT,
+ typename DistributionT = detail::default_distribution<NumberT>,
+ typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+ EngineT& e)
+{
+ DistributionT d { min, max };
+
+ using param_type = typename DistributionT::param_type;
+ return d(e, param_type { min, max });
+}
+
+template <typename NumberT,
+ typename MutexT,
+ typename DistributionT = detail::default_distribution<NumberT>,
+ typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max,
+ MutexT& m, EngineT& e)
+{
+ DistributionT d { min, max };
+
+ using param_type = typename DistributionT::param_type;
+
+ std::lock_guard<MutexT> lg(m);
+ return d(e, param_type { min, max });
+}
+
+template <typename NumberT,
+ typename DistributionT = detail::default_distribution<NumberT>,
+ typename EngineT>
+NumberT generate_random_number(const NumberT min, const NumberT max)
+{
+ return detail::generate_random_number<NumberT, DistributionT, EngineT>
+ (min, max, detail::engine<EngineT>());
+}
+
+template <typename MutexT,
+ typename EngineT,
+ typename NumberT = int,
+ typename DistributionT = detail::default_distribution<NumberT>>
+NumberT generate_random_number(MutexT& m, EngineT& e)
+{
+ return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT>
+ (0, std::numeric_limits<NumberT>::max(), m, e);
+}
+
+template <typename NumberT, typename MutexT, typename EngineT>
+NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e)
+{
+ return generate_random_number<NumberT>(0, max, m, e);
+}
+
+} // namespace detail
+
+template <typename EngineT = std::default_random_engine>
+void randomize_rng()
+{
+ detail::randomize_rng<EngineT>();
+}
+
+template <typename NumberT = int,
+ typename DistributionT = detail::default_distribution<NumberT>,
+ typename EngineT = std::default_random_engine>
+NumberT generate_random_number()
+{
+ return detail::generate_random_number<NumberT, DistributionT, EngineT>
+ (0, std::numeric_limits<NumberT>::max());
+}
+
+template <typename NumberT0, typename NumberT1,
+ typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+ >
+NumberT generate_random_number(const NumberT0 min, const NumberT1 max)
+{
+ return detail::generate_random_number<NumberT,
+ detail::default_distribution<NumberT>,
+ std::default_random_engine>
+ (static_cast<NumberT>(min), static_cast<NumberT>(max));
+}
+
+template <typename NumberT0, typename NumberT1,
+ typename DistributionT,
+ typename EngineT,
+ typename NumberT = detail::select_number_t<NumberT0, NumberT1>
+ >
+NumberT generate_random_number(const NumberT min, const NumberT max,
+ EngineT& e)
+{
+ return detail::generate_random_number<NumberT,
+ DistributionT,
+ EngineT>(static_cast<NumberT>(min), static_cast<NumberT>(max), e);
+}
+
+template <typename NumberT>
+NumberT generate_random_number(const NumberT max)
+{
+ return generate_random_number<NumberT>(0, max);
+}
+
+// Function object:
+template <typename NumberT>
+class random_number_generator final
+{
+ std::mutex l;
+ random_device_t rd;
+ std::default_random_engine e;
+
+ using seed_type = typename decltype(e)::result_type;
+
+ public:
+ using number_type = NumberT;
+ using random_engine_type = decltype(e);
+ using random_device_type = decltype(rd);
+
+ public:
+ random_device_type& random_device() noexcept { return rd; }
+ random_engine_type& random_engine() noexcept { return e; }
+
+ public:
+ random_number_generator() {
+ detail::randomize_rng(l, e);
+ }
+
+ explicit random_number_generator(const seed_type seed) {
+ detail::randomize_rng(seed, l, e);
+ }
+
+ random_number_generator(random_number_generator&& rhs)
+ : e(std::move(rhs.e))
+ {}
+
+ public:
+ random_number_generator(const random_number_generator&) = delete;
+ random_number_generator& operator=(const random_number_generator&) = delete;
+
+ public:
+ NumberT operator()() {
+ return detail::generate_random_number(l, e);
+ }
+
+ NumberT operator()(const NumberT max) {
+ return detail::generate_random_number<NumberT>(max, l, e);
+ }
+
+ NumberT operator()(const NumberT min, const NumberT max) {
+ return detail::generate_random_number<NumberT>(min, max, l, e);
+ }
+
+ public:
+ void seed(const seed_type n) {
+ detail::randomize_rng(n, l, e);
+ }
+};
+
+template <typename NumberT>
+random_number_generator(const NumberT max) -> random_number_generator<NumberT>;
+
+} // inline namespace version_*
+
+} // namespace ceph::util
+
+#endif
diff --git a/src/include/rangeset.h b/src/include/rangeset.h
new file mode 100644
index 000000000..e7e3d047c
--- /dev/null
+++ b/src/include/rangeset.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_RANGESET_H
+#define CEPH_RANGESET_H
+
+/*
+ *
+ * my first container with iterator! it's pretty ugly.
+ *
+ */
+
+#include <map>
+
+//typedef int T;
+
+// Internal storage for rangeset: a map of disjoint inclusive
+// [first,last] ranges, keyed by each range's first element.
+template <class T>
+struct _rangeset_base {
+  map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+
+  typedef typename map<T,T>::iterator mapit;
+
+  // get iterator for range including val. or ranges.end().
+  mapit get_range_for(T val) {
+    mapit it = ranges.lower_bound(val);  // first range with first >= val
+    if (it == ranges.end()) {
+      // nothing starts at or after val; the last range may still cover it
+      typename map<T,T>::reverse_iterator rit = ranges.rbegin();
+      if (rit == ranges.rend()) return ranges.end();
+      if (rit->first <= val && rit->second >= val)
+        return ranges.find(rit->first);
+      return ranges.end();
+    } else {
+      // exact hit on a range's start
+      if (it->first == val) return it;
+      // Bug fix: the original read 'return\n  it--;', which parsed as
+      // 'return it--;', so the step back to the preceding range never
+      // happened and interior values were reported as not contained.
+      // Also guard against decrementing begin().
+      if (it == ranges.begin()) return ranges.end();
+      it--;
+      if (it->first <= val && it->second >= val)
+        return it;
+      return ranges.end();
+    }
+  }
+
+};
+
+
+// Input iterator over the individual values of a rangeset, expanding
+// each [first,last] range into its member values one at a time.
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  //typedef typename map<T,T>::iterator mapit;
+
+  // NOTE(review): this member holds a *copy* of the range map, while
+  // 'it' (set in the constructor) is an iterator into the caller's
+  // original map; operator++ then compares 'it' against the copy's
+  // end() — iterators from two different containers.  Looks unsafe;
+  // confirm before relying on this iterator.
+  map<T,T> ranges;
+  typename map<T,T>::iterator it;   // current range
+  T current;                        // current value within *it
+
+public:
+  // cons
+  rangeset_iterator() {}
+
+  // Position the iterator at range 'it' within 'ranges', starting at
+  // that range's first value (when not at end).
+  rangeset_iterator(typename map<T,T>::iterator& it, map<T,T>& ranges) {
+    this->ranges = ranges;          // copies the whole map
+    this->it = it;                  // still aliases the caller's map
+    if (this->it != ranges.end())
+      current = it->first;
+  }
+
+  bool operator==(rangeset_iterator<T> rit) {
+    return (it == rit.it && rit.current == current);
+  }
+  bool operator!=(rangeset_iterator<T> rit) {
+    return (it != rit.it) || (rit.current != current);
+  }
+
+  T& operator*() {
+    return current;
+  }
+
+  // Advance to the next value, moving to the next range once the
+  // current one is exhausted.
+  // NOTE(review): despite the post-increment signature this returns the
+  // already-advanced *this (pre-increment semantics), and it
+  // dereferences 'it' without an end() check — confirm callers never
+  // increment an end iterator.
+  rangeset_iterator<T> operator++(int) {
+    if (current < it->second)
+      current++;
+    else {
+      it++;
+      if (it != ranges.end())
+        current = it->first;
+    }
+
+    return *this;
+  }
+};
+
+
+// A set of values of type T stored compactly as a map of disjoint
+// inclusive [first,last] ranges, with a cached total element count.
+template <class T>
+class rangeset
+{
+  typedef typename map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  inodeno_t _size;   // total number of individual values across all ranges
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  // Raw access to the underlying range map.
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  // Insert the inclusive range [v1,v2] directly (no overlap checking).
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // true iff val falls inside one of the stored ranges
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    ceph_assert(!empty());
+    return true;
+  }
+
+  // Insert a single value, merging with the adjacent range on either
+  // side (or both) when possible.  val must not already be present.
+  void insert(T val) {
+    ceph_assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+
+    if (left != theset.ranges.end() &&
+        right != theset.ranges.end()) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+      _size++;
+      return;
+    }
+
+    if (left != theset.ranges.end()) {
+      // add to left range
+      left->second = val;
+      _size++;
+      return;
+    }
+
+    if (right != theset.ranges.end()) {
+      // add to right range
+      theset.ranges.insert(pair<T,T>(val, right->second));
+      theset.ranges.erase(val+1);
+      _size++;
+      return;
+    }
+
+    // new range
+    theset.ranges.insert(pair<T,T>(val,val));
+    _size++;
+    return;
+  }
+
+  // Number of individual values contained (not the number of ranges).
+  unsigned size() {
+    return _size;  // bug fix: was 'return size();' — unconditional infinite recursion
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      ceph_assert(_size == 0);
+      return true;
+    }
+    ceph_assert(_size>0);
+    return false;
+  }
+
+
+  // Smallest value in the set; the set must not be empty.
+  T first() {
+    ceph_assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+
+  // Remove a single value, shrinking or splitting its range as needed.
+  void erase(T val) {
+    ceph_assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    ceph_assert(it != theset.ranges.end());
+
+    // entire range
+    if (val == it->first && val == it->second) {
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // beginning
+    if (val == it->first) {
+      theset.ranges.insert(pair<T,T>(val+1, it->second));
+      theset.ranges.erase(it);
+      _size--;
+      return;
+    }
+
+    // end
+    if (val == it->second) {
+      it->second = val-1;
+      _size--;
+      return;
+    }
+
+    // middle split
+    theset.ranges.insert(pair<T,T>(it->first, val-1));
+    theset.ranges.insert(pair<T,T>(val+1, it->second));
+    theset.ranges.erase(it);
+    _size--;
+    return;
+  }
+
+  // Print each range to cout as " first-last", one per line.
+  void dump() {
+    for (typename map<T,T>::iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         it++) {
+      cout << " " << it->first << "-" << it->second << endl;
+    }
+  }
+
+};
+
+
+#endif
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
new file mode 100644
index 000000000..31c73b38f
--- /dev/null
+++ b/src/include/rbd/features.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_FEATURES_H
+#define CEPH_RBD_FEATURES_H
+
+#define RBD_FEATURE_LAYERING (1ULL<<0)
+#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
+#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF (1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
+#define RBD_FEATURE_JOURNALING (1ULL<<6)
+#define RBD_FEATURE_DATA_POOL (1ULL<<7)
+#define RBD_FEATURE_OPERATIONS (1ULL<<8)
+#define RBD_FEATURE_MIGRATING (1ULL<<9)
+#define RBD_FEATURE_NON_PRIMARY (1ULL<<10)
+#define RBD_FEATURE_DIRTY_CACHE (1ULL<<11)
+
+#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN)
+
+#define RBD_FEATURE_NAME_LAYERING "layering"
+#define RBD_FEATURE_NAME_STRIPINGV2 "striping"
+#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock"
+#define RBD_FEATURE_NAME_OBJECT_MAP "object-map"
+#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff"
+#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten"
+#define RBD_FEATURE_NAME_JOURNALING "journaling"
+#define RBD_FEATURE_NAME_DATA_POOL "data-pool"
+#define RBD_FEATURE_NAME_OPERATIONS "operations"
+#define RBD_FEATURE_NAME_MIGRATING "migrating"
+#define RBD_FEATURE_NAME_NON_PRIMARY "non-primary"
+#define RBD_FEATURE_NAME_DIRTY_CACHE "dirty-cache"
+
+/// features that make an image inaccessible for read or write by
+/// clients that don't understand them
+#define RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that make an image unwritable by clients that don't understand them
+#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY)
+
+#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
+ RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_DEEP_FLATTEN | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that may be dynamically enabled or disabled
+#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+#define RBD_FEATURES_MUTABLE_INTERNAL (RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that may be dynamically disabled
+#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN)
+
+/// features that only work when used with a single client
+/// using the image for writes
+#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_JOURNALING | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that will be implicitly enabled
+#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \
+ RBD_FEATURE_DATA_POOL | \
+ RBD_FEATURE_FAST_DIFF | \
+ RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING | \
+ RBD_FEATURE_NON_PRIMARY | \
+ RBD_FEATURE_DIRTY_CACHE)
+
+/// features that cannot be controlled by the user
+#define RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \
+ RBD_FEATURE_MIGRATING)
+
+#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0)
+#define RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1)
+#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2)
+#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3)
+
+#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent"
+#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child"
+#define RBD_OPERATION_FEATURE_NAME_GROUP "group"
+#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash"
+
+/// all valid operation features
+#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \
+ RBD_OPERATION_FEATURE_CLONE_CHILD | \
+ RBD_OPERATION_FEATURE_GROUP | \
+ RBD_OPERATION_FEATURE_SNAP_TRASH)
+
+#endif
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
new file mode 100644
index 000000000..7ae20e4dd
--- /dev/null
+++ b/src/include/rbd/librbd.h
@@ -0,0 +1,1549 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRBD_H
+#define CEPH_LIBRBD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <stdbool.h>
+#include <string.h>
+#include <sys/uio.h>
+#include "../rados/librados.h"
+#include "features.h"
+
+#define LIBRBD_VER_MAJOR 1
+#define LIBRBD_VER_MINOR 18
+#define LIBRBD_VER_EXTRA 0
+
+#define LIBRBD_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra)
+
+#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+
+#define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_AIO_OPEN 1
+#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1
+#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE_IOVEC 1
+#define LIBRBD_SUPPORTS_LOCKING 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
+#define LIBRBD_SUPPORTS_IOVEC 1
+#define LIBRBD_SUPPORTS_WATCH 0
+#define LIBRBD_SUPPORTS_WRITESAME 1
+#define LIBRBD_SUPPORTS_WRITE_ZEROES 1
+#define LIBRBD_SUPPORTS_ENCRYPTION 1
+#define LIBRBD_SUPPORTS_ENCRYPTION_LOAD2 1
+
+#if __GNUC__ >= 4
+ #define CEPH_RBD_API __attribute__ ((visibility ("default")))
+ #define CEPH_RBD_DEPRECATED __attribute__((deprecated))
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#else
+ #define CEPH_RBD_API
+ #define CEPH_RBD_DEPRECATED
+#endif
+
+#define RBD_FLAG_OBJECT_MAP_INVALID (1<<0)
+#define RBD_FLAG_FAST_DIFF_INVALID (1<<1)
+
+#define RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID ""
+
+typedef void *rbd_image_t;
+typedef void *rbd_image_options_t;
+typedef void *rbd_pool_stats_t;
+
+typedef void *rbd_completion_t;
+typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg);
+
+typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
+
+typedef void (*rbd_update_callback_t)(void *arg);
+
+typedef enum {
+ RBD_SNAP_NAMESPACE_TYPE_USER = 0,
+ RBD_SNAP_NAMESPACE_TYPE_GROUP = 1,
+ RBD_SNAP_NAMESPACE_TYPE_TRASH = 2,
+ RBD_SNAP_NAMESPACE_TYPE_MIRROR = 3,
+} rbd_snap_namespace_type_t;
+
+typedef struct {
+ char *id;
+ char *name;
+} rbd_image_spec_t;
+
+typedef struct {
+ int64_t pool_id;
+ char *pool_name;
+ char *pool_namespace;
+ char *image_id;
+ char *image_name;
+ bool trash;
+} rbd_linked_image_spec_t;
+
+typedef struct {
+ uint64_t id;
+ rbd_snap_namespace_type_t namespace_type;
+ char *name;
+} rbd_snap_spec_t;
+
+typedef struct {
+ uint64_t id;
+ uint64_t size;
+ const char *name;
+} rbd_snap_info_t;
+
+typedef struct {
+ const char *pool_name;
+ const char *image_name;
+ const char *image_id;
+ bool trash;
+} rbd_child_info_t;
+
+#define RBD_MAX_IMAGE_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+#define RBD_SNAP_CREATE_SKIP_QUIESCE (1 << 0)
+#define RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR (1 << 1)
+
+#define RBD_SNAP_REMOVE_UNPROTECT (1 << 0)
+#define RBD_SNAP_REMOVE_FLATTEN (1 << 1)
+#define RBD_SNAP_REMOVE_FORCE (RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN)
+
+/**
+ * These types are used in set_image_notification to indicate the type of event
+ * socket passed in.
+ */
+enum {
+ EVENT_TYPE_PIPE = 1,
+ EVENT_TYPE_EVENTFD = 2
+};
+
+typedef struct {
+ uint64_t size;
+ uint64_t obj_size;
+ uint64_t num_objs;
+ int order;
+ char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */
+ int64_t parent_pool; /* deprecated */
+ char parent_name[RBD_MAX_IMAGE_NAME_SIZE]; /* deprecated */
+} rbd_image_info_t;
+
+typedef enum {
+ RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */
+ RBD_MIRROR_MODE_IMAGE, /* mirroring enabled on a per-image basis */
+ RBD_MIRROR_MODE_POOL /* mirroring enabled on all journaled images */
+} rbd_mirror_mode_t;
+
+typedef enum {
+ RBD_MIRROR_PEER_DIRECTION_RX = 0,
+ RBD_MIRROR_PEER_DIRECTION_TX = 1,
+ RBD_MIRROR_PEER_DIRECTION_RX_TX = 2
+} rbd_mirror_peer_direction_t;
+
+typedef struct {
+ char *uuid;
+ char *cluster_name;
+ char *client_name;
+} rbd_mirror_peer_t CEPH_RBD_DEPRECATED;
+
+typedef struct {
+ char *uuid;
+ rbd_mirror_peer_direction_t direction;
+ char *site_name;
+ char *mirror_uuid;
+ char *client_name;
+ time_t last_seen;
+} rbd_mirror_peer_site_t;
+
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host"
+#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY "key"
+
+typedef enum {
+ RBD_MIRROR_IMAGE_MODE_JOURNAL = 0,
+ RBD_MIRROR_IMAGE_MODE_SNAPSHOT = 1,
+} rbd_mirror_image_mode_t;
+
+typedef enum {
+ RBD_MIRROR_IMAGE_DISABLING = 0,
+ RBD_MIRROR_IMAGE_ENABLED = 1,
+ RBD_MIRROR_IMAGE_DISABLED = 2
+} rbd_mirror_image_state_t;
+
+typedef struct {
+ char *global_id;
+ rbd_mirror_image_state_t state;
+ bool primary;
+} rbd_mirror_image_info_t;
+
+typedef enum {
+ MIRROR_IMAGE_STATUS_STATE_UNKNOWN = 0,
+ MIRROR_IMAGE_STATUS_STATE_ERROR = 1,
+ MIRROR_IMAGE_STATUS_STATE_SYNCING = 2,
+ MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3,
+ MIRROR_IMAGE_STATUS_STATE_REPLAYING = 4,
+ MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5,
+ MIRROR_IMAGE_STATUS_STATE_STOPPED = 6,
+} rbd_mirror_image_status_state_t;
+
+typedef struct {
+ char *name;
+ rbd_mirror_image_info_t info;
+ rbd_mirror_image_status_state_t state;
+ char *description;
+ time_t last_update;
+ bool up;
+} rbd_mirror_image_status_t CEPH_RBD_DEPRECATED;
+
+typedef struct {
+ char *mirror_uuid;
+ rbd_mirror_image_status_state_t state;
+ char *description;
+ time_t last_update;
+ bool up;
+} rbd_mirror_image_site_status_t;
+
+typedef struct {
+ char *name;
+ rbd_mirror_image_info_t info;
+ uint32_t site_statuses_count;
+ rbd_mirror_image_site_status_t *site_statuses;
+} rbd_mirror_image_global_status_t;
+
+typedef enum {
+ RBD_GROUP_IMAGE_STATE_ATTACHED,
+ RBD_GROUP_IMAGE_STATE_INCOMPLETE
+} rbd_group_image_state_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+ rbd_group_image_state_t state;
+} rbd_group_image_info_t;
+
+typedef struct {
+ char *name;
+ int64_t pool;
+} rbd_group_info_t;
+
+typedef enum {
+ RBD_GROUP_SNAP_STATE_INCOMPLETE,
+ RBD_GROUP_SNAP_STATE_COMPLETE
+} rbd_group_snap_state_t;
+
+typedef struct {
+ char *name;
+ rbd_group_snap_state_t state;
+} rbd_group_snap_info_t;
+
+typedef struct {
+ int64_t group_pool;
+ char *group_name;
+ char *group_snap_name;
+} rbd_snap_group_namespace_t;
+
+typedef enum {
+ RBD_SNAP_MIRROR_STATE_PRIMARY,
+ RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED,
+ RBD_SNAP_MIRROR_STATE_NON_PRIMARY,
+ RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED
+} rbd_snap_mirror_state_t;
+
+typedef struct {
+ rbd_snap_mirror_state_t state;
+ size_t mirror_peer_uuids_count;
+ char *mirror_peer_uuids;
+ bool complete;
+ char *primary_mirror_uuid;
+ uint64_t primary_snap_id;
+ uint64_t last_copied_object_number;
+} rbd_snap_mirror_namespace_t;
+
+typedef enum {
+ RBD_LOCK_MODE_EXCLUSIVE = 0,
+ RBD_LOCK_MODE_SHARED = 1,
+} rbd_lock_mode_t;
+
+CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
+
+/* image options */
+enum {
+ RBD_IMAGE_OPTION_FORMAT = 0,
+ RBD_IMAGE_OPTION_FEATURES = 1,
+ RBD_IMAGE_OPTION_ORDER = 2,
+ RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
+ RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+ RBD_IMAGE_OPTION_JOURNAL_ORDER = 5,
+ RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6,
+ RBD_IMAGE_OPTION_JOURNAL_POOL = 7,
+ RBD_IMAGE_OPTION_FEATURES_SET = 8,
+ RBD_IMAGE_OPTION_FEATURES_CLEAR = 9,
+ RBD_IMAGE_OPTION_DATA_POOL = 10,
+ RBD_IMAGE_OPTION_FLATTEN = 11,
+ RBD_IMAGE_OPTION_CLONE_FORMAT = 12,
+ RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE = 13,
+};
+
+typedef enum {
+ RBD_TRASH_IMAGE_SOURCE_USER = 0,
+ RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1,
+ RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2,
+ RBD_TRASH_IMAGE_SOURCE_REMOVING = 3,
+ RBD_TRASH_IMAGE_SOURCE_USER_PARENT = 4,
+} rbd_trash_image_source_t;
+
+typedef struct {
+ char *id;
+ char *name;
+ rbd_trash_image_source_t source;
+ time_t deletion_time;
+ time_t deferment_end_time;
+} rbd_trash_image_info_t;
+
+typedef struct {
+ char *addr;
+ int64_t id;
+ uint64_t cookie;
+} rbd_image_watcher_t;
+
+typedef enum {
+ RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1,
+ RBD_IMAGE_MIGRATION_STATE_ERROR = 0,
+ RBD_IMAGE_MIGRATION_STATE_PREPARING = 1,
+ RBD_IMAGE_MIGRATION_STATE_PREPARED = 2,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3,
+ RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4,
+ RBD_IMAGE_MIGRATION_STATE_ABORTING = 5,
+} rbd_image_migration_state_t;
+
+typedef struct {
+ int64_t source_pool_id;
+ char *source_pool_namespace;
+ char *source_image_name;
+ char *source_image_id;
+ int64_t dest_pool_id;
+ char *dest_pool_namespace;
+ char *dest_image_name;
+ char *dest_image_id;
+ rbd_image_migration_state_t state;
+ char *state_description;
+} rbd_image_migration_status_t;
+
+typedef enum {
+ RBD_CONFIG_SOURCE_CONFIG = 0,
+ RBD_CONFIG_SOURCE_POOL = 1,
+ RBD_CONFIG_SOURCE_IMAGE = 2,
+} rbd_config_source_t;
+
+typedef struct {
+ char *name;
+ char *value;
+ rbd_config_source_t source;
+} rbd_config_option_t;
+
+typedef enum {
+ RBD_POOL_STAT_OPTION_IMAGES,
+ RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS,
+ RBD_POOL_STAT_OPTION_TRASH_IMAGES,
+ RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS
+} rbd_pool_stat_option_t;
+
+/* rbd_write_zeroes / rbd_aio_write_zeroes flags */
+enum {
+ RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = (1U<<0), /* fully allocated zeroed extent */
+};
+
+typedef enum {
+ RBD_ENCRYPTION_FORMAT_LUKS1 = 0,
+ RBD_ENCRYPTION_FORMAT_LUKS2 = 1,
+ RBD_ENCRYPTION_FORMAT_LUKS = 2
+} rbd_encryption_format_t;
+
+typedef enum {
+ RBD_ENCRYPTION_ALGORITHM_AES128 = 0,
+ RBD_ENCRYPTION_ALGORITHM_AES256 = 1
+} rbd_encryption_algorithm_t;
+
+typedef void *rbd_encryption_options_t;
+
+typedef struct {
+ rbd_encryption_format_t format;
+ rbd_encryption_options_t opts;
+ size_t opts_size;
+} rbd_encryption_spec_t;
+
+typedef struct {
+ rbd_encryption_algorithm_t alg;
+ const char* passphrase;
+ size_t passphrase_size;
+} rbd_encryption_luks1_format_options_t;
+
+typedef struct {
+ rbd_encryption_algorithm_t alg;
+ const char* passphrase;
+ size_t passphrase_size;
+} rbd_encryption_luks2_format_options_t;
+
+typedef struct {
+ const char* passphrase;
+ size_t passphrase_size;
+} rbd_encryption_luks_format_options_t;
+
+CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
+CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
+ int optname, const char* optval);
+CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts,
+ int optname, uint64_t optval);
+CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts,
+ int optname, char* optval,
+ size_t maxlen);
+CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts,
+ int optname, uint64_t* optval);
+CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts,
+ int optname, bool* is_set);
+CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname);
+CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts);
+
+/* helpers */
+CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image);
+CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images,
+ size_t num_images);
+CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image);
+CEPH_RBD_API void rbd_linked_image_spec_list_cleanup(
+ rbd_linked_image_spec_t *images, size_t num_images);
+CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap);
+
+/* images */
+CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
+ int *order);
+CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order);
+/**
+ * create new rbd image
+ *
+ * The stripe_unit must be a factor of the object size (1 << order).
+ * The stripe_count can be one (no intra-object striping) or greater
+ * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the
+ * stripe_unit != the object size and the stripe_count is != 1.
+ *
+ * @param io ioctx
+ * @param name image name
+ * @param size image size in bytes
+ * @param features initial feature bits
+ * @param order object/block size, as a power of two (object size == 1 << order)
+ * @param stripe_unit stripe unit size, in bytes.
+ * @param stripe_count number of objects to stripe over before looping
+ * @return 0 on success, or negative error code
+ */
+CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order);
+CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, uint64_t features, int *c_order,
+ uint64_t stripe_unit, int stripe_count);
+CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+ const char *p_snapname, rados_ioctx_t c_ioctx,
+ const char *c_name, rbd_image_options_t c_opts);
+CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
+CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+ const char *destname);
+
+CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name,
+ uint64_t delay);
+CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info);
+CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info);
+CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io,
+ rbd_trash_image_info_t *trash_entries,
+ size_t *num_entries);
+CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries,
+ size_t num_entries);
+CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold);
+CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts,
+ float threshold, librbd_progress_fn_t cb,
+ void* cbdata);
+CEPH_RBD_API int rbd_trash_remove(rados_ioctx_t io, const char *id, bool force);
+CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io,
+ const char *id,
+ bool force,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id,
+ const char *name);
+
+/* migration */
+CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx,
+ const char *image_name,
+ rados_ioctx_t dest_ioctx,
+ const char *dest_image_name,
+ rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_prepare_import(
+ const char *source_spec, rados_ioctx_t dest_ioctx,
+ const char *dest_image_name, rbd_image_options_t opts);
+CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx,
+ const char *image_name);
+CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx,
+ const char *image_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx,
+ const char *image_name,
+ rbd_image_migration_status_t *status,
+ size_t status_size);
+CEPH_RBD_API void rbd_migration_status_cleanup(
+ rbd_image_migration_status_t *status);
+
+/* pool mirroring */
+CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster,
+ char *name, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster,
+ const char *name);
+
+CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t *mirror_mode);
+CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx,
+ rbd_mirror_mode_t mirror_mode);
+
+CEPH_RBD_API int rbd_mirror_uuid_get(rados_ioctx_t io_ctx,
+ char *uuid, size_t *max_len);
+
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_create(
+ rados_ioctx_t io_ctx, char *token, size_t *max_len);
+CEPH_RBD_API int rbd_mirror_peer_bootstrap_import(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_direction_t direction,
+ const char *token);
+
+CEPH_RBD_API int rbd_mirror_peer_site_add(
+ rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length,
+ rbd_mirror_peer_direction_t direction, const char *site_name,
+ const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_name(
+ rados_ioctx_t io_ctx, const char *uuid, const char *site_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_client_name(
+ rados_ioctx_t io_ctx, const char *uuid, const char *client_name);
+CEPH_RBD_API int rbd_mirror_peer_site_set_direction(
+ rados_ioctx_t io_ctx, const char *uuid,
+ rbd_mirror_peer_direction_t direction);
+CEPH_RBD_API int rbd_mirror_peer_site_remove(
+ rados_ioctx_t io_ctx, const char *uuid);
+CEPH_RBD_API int rbd_mirror_peer_site_list(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_site_t *peers, int *max_peers);
+CEPH_RBD_API void rbd_mirror_peer_site_list_cleanup(
+ rbd_mirror_peer_site_t *peers, int max_peers);
+CEPH_RBD_API int rbd_mirror_peer_site_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_value_len, size_t *key_value_count);
+CEPH_RBD_API int rbd_mirror_peer_site_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t key_value_count);
+
+CEPH_RBD_API int rbd_mirror_image_global_status_list(
+ rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_global_status_t *images, size_t *len);
+CEPH_RBD_API void rbd_mirror_image_global_status_list_cleanup(
+ char **image_ids, rbd_mirror_image_global_status_t *images, size_t len);
+
+/* rbd_mirror_peer_ commands are deprecated to rbd_mirror_peer_site_
+ * equivalents */
+CEPH_RBD_API int rbd_mirror_peer_add(
+ rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length,
+ const char *cluster_name, const char *client_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_remove(
+ rados_ioctx_t io_ctx, const char *uuid)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_list(
+ rados_ioctx_t io_ctx, rbd_mirror_peer_t *peers, int *max_peers)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_mirror_peer_list_cleanup(
+ rbd_mirror_peer_t *peers, int max_peers)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_client(
+ rados_ioctx_t io_ctx, const char *uuid, const char *client_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_cluster(
+ rados_ioctx_t io_ctx, const char *uuid, const char *cluster_name)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_get_attributes(
+ rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+ char *values, size_t *max_value_len, size_t *key_value_count)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_peer_set_attributes(
+ rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+ size_t key_value_count)
+ CEPH_RBD_DEPRECATED;
+
+/* rbd_mirror_image_status_list_ commands are deprecated in favor of the
+ * rbd_mirror_image_global_status_list_ commands */
+
+CEPH_RBD_API int rbd_mirror_image_status_list(
+ rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_status_t *images, size_t *len)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_mirror_image_status_list_cleanup(
+ char **image_ids, rbd_mirror_image_status_t *images, size_t len)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_mirror_image_status_summary(
+ rados_ioctx_t io_ctx, rbd_mirror_image_status_state_t *states, int *counts,
+ size_t *maxlen);
+
+CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx,
+ const char *start_id,
+ size_t max, char **image_ids,
+ char **instance_ids,
+ size_t *len);
+CEPH_RBD_API void rbd_mirror_image_instance_id_list_cleanup(char **image_ids,
+ char **instance_ids,
+ size_t len);
+CEPH_RBD_API int rbd_mirror_image_info_list(
+ rados_ioctx_t io_ctx, rbd_mirror_image_mode_t *mode_filter,
+ const char *start_id, size_t max, char **image_ids,
+ rbd_mirror_image_mode_t *mode_entries,
+ rbd_mirror_image_info_t *info_entries, size_t *num_entries);
+CEPH_RBD_API void rbd_mirror_image_info_list_cleanup(
+ char **image_ids, rbd_mirror_image_info_t *info_entries,
+ size_t num_entries);
+
+/* pool metadata */
+CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key,
+ char *value, size_t *val_len);
+CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key,
+ const char *value);
+CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx,
+ const char *key);
+CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start,
+ uint64_t max, char *keys,
+ size_t *key_len, char *values,
+ size_t *vals_len);
+
+CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+
+CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+
+/**
+ * Open an image in read-only mode.
+ *
+ * This is intended for use by clients that cannot write to a block
+ * device due to cephx restrictions. There will be no watch
+ * established on the header object, since a watch is a write. This
+ * means the metadata reported about this image (parents, snapshots,
+ * size, etc.) may become stale. This should not be used for
+ * long-running operations, unless you can be sure that one of these
+ * properties changing is safe.
+ *
+ * Attempting to write to a read-only image will return -EROFS.
+ *
+ * @param io ioctx to determine the pool the image is in
+ * @param name image name
+ * @param image where to store newly opened image handle
+ * @param snap_name name of snapshot to open at, or NULL for no snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name);
+CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id,
+ rbd_image_t *image, const char *snap_name,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_features_to_string(uint64_t features, char *str_features,
+ size_t *size);
+CEPH_RBD_API int rbd_features_from_string(const char *str_features, uint64_t *features);
+CEPH_RBD_API int rbd_close(rbd_image_t image);
+CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c);
+CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size);
+CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info,
+ size_t infosize);
+CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old);
+CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size);
+CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features);
+CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features,
+ uint8_t enabled);
+CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features);
+CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit);
+CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image,
+ uint64_t *stripe_count);
+
+CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image,
+ struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap);
+CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len);
+CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len);
+CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image,
+ char *prefix, size_t prefix_len);
+CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image);
+
+CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
+ char *parent_poolname, size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_poolname,
+ size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snapname,
+ size_t psnapnamelen)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_get_parent(rbd_image_t image,
+ rbd_linked_image_spec_t *parent_image,
+ rbd_snap_spec_t *parent_snap);
+
+CEPH_RBD_API int rbd_get_migration_source_spec(rbd_image_t image,
+ char* source_spec,
+ size_t* max_len);
+
+CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
+CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info,
+ size_t group_info_size);
+CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
+
+/* exclusive lock feature */
+CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner);
+CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode);
+CEPH_RBD_API int rbd_lock_release(rbd_image_t image);
+CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image,
+ rbd_lock_mode_t *lock_mode,
+ char **lock_owners,
+ size_t *max_lock_owners);
+CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners,
+ size_t lock_owner_count);
+CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+ const char *lock_owner);
+
+/* object map feature */
+CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
+ librbd_progress_fn_t cb, void *cbdata);
+
+CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
+ const char *destname);
+CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
+CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname, rbd_image_options_t dest_opts,
+ size_t sparse_size);
+CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
+ const char *destname,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image,
+ rados_ioctx_t dest_p,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb, void *cbdata,
+ size_t sparse_size);
+
+/* deep copy */
+CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts);
+CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image,
+ rados_ioctx_t dest_io_ctx,
+ const char *destname,
+ rbd_image_options_t dest_opts,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/* encryption */
+
+/*
+ * Format the image using the encryption spec specified by
+ * (format, opts, opts_size) tuple.
+ *
+ * For a flat (i.e. non-cloned) image, the new encryption is loaded
+ * implicitly, calling rbd_encryption_load() afterwards is not needed.
+ * If existing encryption is already loaded, it is automatically
+ * replaced with the new encryption.
+ *
+ * For a cloned image, the new encryption must be loaded explicitly.
+ * Existing encryption (if any) must not be loaded.
+ */
+CEPH_RBD_API int rbd_encryption_format(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size);
+/*
+ * Load the encryption spec specified by (format, opts, opts_size)
+ * tuple for the image and all ancestor images. If an ancestor image
+ * which does not match any encryption format known to librbd is
+ * encountered, it - along with remaining ancestor images - is
+ * interpreted as plaintext.
+ */
+CEPH_RBD_API int rbd_encryption_load(rbd_image_t image,
+ rbd_encryption_format_t format,
+ rbd_encryption_options_t opts,
+ size_t opts_size);
+/*
+ * Load encryption specs. The first spec in the passed array is
+ * applied to the image itself, the second spec is applied to its
+ * ancestor image, the third spec is applied to the ancestor of
+ * that ancestor image and so on.
+ *
+ * If not enough specs are passed, the last spec is reused exactly as
+ * in rbd_encryption_load(). If an ancestor image for which the last
+ * spec is being reused turns out to not match any encryption format
+ * known to librbd, it - along with remaining ancestor images - is
+ * interpreted as plaintext.
+ */
+CEPH_RBD_API int rbd_encryption_load2(rbd_image_t image,
+ const rbd_encryption_spec_t *specs,
+ size_t spec_count);
+
+/* snapshots */
+CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+ int *max_snaps);
+CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps);
+CEPH_RBD_API int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists);
+CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_create2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name,
+ uint32_t flags, librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image,
+ const char *snapname,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname,
+ const char* dstsnapsname);
+/**
+ * Prevent a snapshot from being deleted until it is unprotected.
+ *
+ * @param snap_name which snapshot to protect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if snap is already protected
+ */
+CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name);
+/**
+ * Allow a snapshot to be deleted.
+ *
+ * @param snap_name which snapshot to unprotect
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snap is not protected
+ */
+CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name);
+/**
+ * Determine whether a snapshot is protected.
+ *
+ * @param snap_name which snapshot to query
+ * @param is_protected where to store the result (0 or 1)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+ int *is_protected);
+/**
+ * Get the current snapshot limit for an image. If no limit is set,
+ * UINT64_MAX is returned.
+ *
+ * @param limit pointer where the limit will be stored on success
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit);
+
+/**
+ * Set a limit for the number of snapshots that may be taken of an image.
+ *
+ * @param limit the maximum number of snapshots allowed in the future.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit);
+
+/**
+ * Get the timestamp of a snapshot for an image.
+ *
+ * @param snap_id the snap id of a snapshot of input image.
+ * @param timestamp the timestamp of input snapshot.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp);
+
+CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname);
+CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id);
+CEPH_RBD_API int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len);
+CEPH_RBD_API int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id);
+
+CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_namespace_type_t *namespace_type);
+CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap,
+ size_t group_snap_size);
+CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image,
+ uint64_t snap_id,
+ char* original_name,
+ size_t max_length);
+CEPH_RBD_API int rbd_snap_get_mirror_namespace(
+ rbd_image_t image, uint64_t snap_id,
+ rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size);
+CEPH_RBD_API int rbd_snap_mirror_namespace_cleanup(
+ rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size);
+
+CEPH_RBD_API int rbd_flatten(rbd_image_t image);
+
+CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size);
+
+CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image,
+ size_t sparse_size,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+/**
+ * List all images that are cloned from the image at the
+ * snapshot that is set via rbd_snap_set().
+ *
+ * This iterates over all pools, so it should be run by a user with
+ * read access to all of them. pools_len and images_len are filled in
+ * with the number of bytes put into the pools and images buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the pool and image names
+ * of the children, with a '\0' after each.
+ *
+ * @param image which image (and implicitly snapshot) to list clones of
+ * @param pools buffer in which to store pool names
+ * @param pools_len number of bytes in pools buffer
+ * @param images buffer in which to store image names
+ * @param images_len number of bytes in images buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools,
+ size_t *pools_len, char *images,
+ size_t *images_len)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_list_children2(rbd_image_t image,
+ rbd_child_info_t *children,
+ int *max_children)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child)
+ CEPH_RBD_DEPRECATED;
+CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children,
+ size_t num_children)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_list_children3(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+CEPH_RBD_API int rbd_list_descendants(rbd_image_t image,
+ rbd_linked_image_spec_t *images,
+ size_t *max_images);
+
+/**
+ * @defgroup librbd_h_locking Advisory Locking
+ *
+ * An rbd image may be locked exclusively, or shared, to facilitate
+ * e.g. live migration where the image may be open in two places at once.
+ * These locks are intended to guard against more than one client
+ * writing to an image without coordination. They don't need to
+ * be used for snapshots, since snapshots are read-only.
+ *
+ * Currently locks only guard against locks being acquired.
+ * They do not prevent anything else.
+ *
+ * A locker is identified by the internal rados client id of the
+ * holder and a user-defined cookie. This (client id, cookie) pair
+ * must be unique for each locker.
+ *
+ * A shared lock also has a user-defined tag associated with it. Each
+ * additional shared lock must specify the same tag or lock
+ * acquisition will fail. This can be used by e.g. groups of hosts
+ * using a clustered filesystem on top of an rbd image to make sure
+ * they're accessing the correct image.
+ *
+ * @{
+ */
+/**
+ * List clients that have locked the image and information about the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the image
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Take an exclusive lock on the image.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie);
+
+/**
+ * Take a shared lock on the image.
+ *
+ * Other clients may also take a shared lock, as long as they use the
+ * same tag.
+ *
+ * @param image the image to lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag user-defined identifier for this shared use of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie,
+ const char *tag);
+
+/**
+ * Release a shared or exclusive lock on the image.
+ *
+ * @param image the image to unlock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie);
+
+/**
+ * Release a shared or exclusive lock that was taken by the specified client.
+ *
+ * @param image the image to unlock
+ * @param client the entity holding the lock (as given by rbd_list_lockers())
+ * @param cookie user-defined identifier for the instance of the lock to break
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client,
+ const char *cookie);
+
+/** @} locking */
+
+/* I/O */
+CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+ char *buf, int op_flags);
+/* DEPRECATED; use rbd_read_iterate2 */
+CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+
+/**
+ * iterate read over an image
+ *
+ * Reads each region of the image and calls the callback. If the
+ * buffer pointer passed to the callback is NULL, the given extent is
+ * defined to be zeros (a hole). Normally the granularity for the
+ * callback is the image stripe size.
+ *
+ * @param image image to read
+ * @param ofs offset to start from
+ * @param len bytes of source image to cover
+ * @param cb callback for each region
+ * @returns 0 on success, error otherwise
+ */
+CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *),
+ void *arg);
+/**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent 1 if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image,
+ const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ uint8_t include_parent, uint8_t whole_object,
+ int (*cb)(uint64_t, size_t, int, void *),
+ void *arg);
+CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, int op_flags);
+CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
+CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+ const char *buf, size_t data_len,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs,
+ size_t len, int zero_flags,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs,
+ size_t len, const char *cmp_buf,
+ const char *buf,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c);
+
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, rbd_completion_t c,
+ int op_flags);
+CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c);
+/*
+ * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+ char *buf, rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+ int iovcnt, uint64_t off, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+ const char *buf, size_t data_len,
+ rbd_completion_t c, int op_flags);
+CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off,
+ size_t len, rbd_completion_t c,
+ int zero_flags, int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image,
+ uint64_t off, size_t len,
+ const char *cmp_buf,
+ const char *buf,
+ rbd_completion_t c,
+ uint64_t *mismatch_off,
+ int op_flags);
+CEPH_RBD_API ssize_t rbd_aio_compare_and_writev(rbd_image_t image,
+ uint64_t off,
+ const struct iovec *cmp_iov,
+ int cmp_iovcnt,
+ const struct iovec *iov,
+ int iovcnt,
+ rbd_completion_t c,
+ uint64_t *mismatch_off,
+ int op_flags);
+
+CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg,
+ rbd_callback_t complete_cb,
+ rbd_completion_t *c);
+CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c);
+CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c);
+CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c);
+CEPH_RBD_API void rbd_aio_release(rbd_completion_t c);
+CEPH_RBD_API int rbd_flush(rbd_image_t image);
+/**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image);
+
+CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp);
+
+CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len);
+CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value);
+CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key);
+/**
+ * List all metadata entries associated with this image.
+ *
+ * This iterates over all metadata entries; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are
+ * still filled in, but the data is not and -ERANGE is returned.
+ * Otherwise, the buffers are filled with the keys and values
+ * of the image, with a '\0' after each.
+ *
+ * @param image which image to list metadata of
+ * @param start_after which name to begin listing after
+ * (use the empty string to start at the beginning)
+ * @param max the maximum number of names to list (if 0 means no limit)
+ * @param keys buffer in which to store metadata keys
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store metadata values
+ * @param vals_len number of bytes in values buffer
+ * @returns number of children on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+ char *keys, size_t *key_len, char *values, size_t *vals_len);
+
+// RBD image mirroring support functions
+CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image) CEPH_RBD_DEPRECATED;
+CEPH_RBD_API int rbd_mirror_image_enable2(rbd_image_t image,
+ rbd_mirror_image_mode_t mode);
+CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force);
+CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image);
+CEPH_RBD_API int rbd_mirror_image_create_snapshot(rbd_image_t image,
+ uint64_t *snap_id);
+CEPH_RBD_API int rbd_mirror_image_create_snapshot2(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id);
+CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+CEPH_RBD_API void rbd_mirror_image_get_info_cleanup(
+ rbd_mirror_image_info_t *mirror_image_info);
+CEPH_RBD_API int rbd_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode);
+
+CEPH_RBD_API int rbd_mirror_image_get_global_status(
+ rbd_image_t image,
+ rbd_mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size);
+CEPH_RBD_API void rbd_mirror_image_global_status_cleanup(
+ rbd_mirror_image_global_status_t *mirror_image_global_status);
+
+CEPH_RBD_API int rbd_mirror_image_get_status(
+ rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image,
+ char *instance_id,
+ size_t *id_max_length);
+CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image,
+ rbd_mirror_image_info_t *mirror_image_info,
+ size_t info_size,
+ rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_mode(rbd_image_t image,
+ rbd_mirror_image_mode_t *mode,
+ rbd_completion_t c);
+
+CEPH_RBD_API int rbd_aio_mirror_image_get_global_status(
+ rbd_image_t image,
+ rbd_mirror_image_global_status_t *mirror_global_image_status,
+ size_t status_size, rbd_completion_t c);
+CEPH_RBD_API int rbd_aio_mirror_image_get_status(
+ rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status,
+ size_t status_size, rbd_completion_t c)
+ CEPH_RBD_DEPRECATED;
+
+CEPH_RBD_API int rbd_aio_mirror_image_create_snapshot(rbd_image_t image,
+ uint32_t flags,
+ uint64_t *snap_id,
+ rbd_completion_t c);
+
+// RBD groups support functions
+CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name);
+CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size);
+CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+ const char *dest_name);
+CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info,
+ size_t group_info_size);
+
+/**
+ * Register an image metadata change watcher.
+ *
+ * @param image the image to watch
+ * @param handle where to store the internal id assigned to this watch
+ * @param watch_cb what to do when a notify is received on this image
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+ rbd_update_callback_t watch_cb, void *arg);
+
+/**
+ * Unregister an image watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle);
+
+/**
+ * List any watchers of an image.
+ *
+ * Watchers will be allocated and stored in the passed watchers array. If there
+ * are more watchers than max_watchers, -ERANGE will be returned and the number
+ * of watchers will be stored in max_watchers.
+ *
+ * The caller should call rbd_watchers_list_cleanup when finished with the list
+ * of watchers.
+ *
+ * @param image the image to list watchers for.
+ * @param watchers an array to store watchers in.
+ * @param max_watchers capacity of the watchers array.
+ * @returns 0 on success, negative error code on failure.
+ * @returns -ERANGE if there are too many watchers for the passed array.
+ * @returns the number of watchers in max_watchers.
+ */
+CEPH_RBD_API int rbd_watchers_list(rbd_image_t image,
+ rbd_image_watcher_t *watchers,
+ size_t *max_watchers);
+
+CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+ size_t num_watchers);
+
+CEPH_RBD_API int rbd_config_image_list(rbd_image_t image,
+ rbd_config_option_t *options,
+ int *max_options);
+CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+ int max_options);
+
+CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_name);
+CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+ const char *group_name,
+ rados_ioctx_t image_p,
+ const char *image_id);
+CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images,
+ size_t group_image_info_size,
+ size_t num_entries);
+
+CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_create2(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ uint32_t flags);
+CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *old_snap_name,
+ const char *new_snap_name);
+CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p,
+ const char *group_name,
+ rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t *num_entries);
+CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps,
+ size_t group_snap_info_size,
+ size_t num_entries);
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name);
+CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p,
+ const char *group_name,
+ const char *snap_name,
+ librbd_progress_fn_t cb,
+ void *cbdata);
+
+CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io,
+ const char *namespace_name);
+CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names,
+ size_t *size);
+CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io,
+ const char *namespace_name,
+ bool *exists);
+
+CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force);
+
+CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats);
+CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats);
+CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats,
+ int stat_option,
+ uint64_t* stat_val);
+CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats);
+
+/**
+ * Register a quiesce/unquiesce watcher.
+ *
+ * @param image the image to watch
+ * @param quiesce_cb what to do when librbd wants to quiesce
+ * @param unquiesce_cb what to do when librbd wants to unquiesce
+ * @param arg opaque value to pass to the callbacks
+ * @param handle where to store the internal id assigned to this watch
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_quiesce_watch(rbd_image_t image,
+ rbd_update_callback_t quiesce_cb,
+ rbd_update_callback_t unquiesce_cb,
+ void *arg, uint64_t *handle);
+
+/**
+ * Notify quiesce is complete
+ *
+ * @param image the image to notify
+ * @param handle which watch is complete
+ * @param r the return code
+ */
+CEPH_RBD_API void rbd_quiesce_complete(rbd_image_t image, uint64_t handle,
+ int r);
+
+/**
+ * Unregister a quiesce/unquiesce watcher.
+ *
+ * @param image the image to unwatch
+ * @param handle which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RBD_API int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle);
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRBD_H */
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
new file mode 100644
index 000000000..5d307cded
--- /dev/null
+++ b/src/include/rbd/librbd.hpp
@@ -0,0 +1,869 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __LIBRBD_HPP
+#define __LIBRBD_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <vector>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+#include "librbd.h"
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+namespace librbd {
+
+ using librados::IoCtx;
+
+ class Image;
+ class ImageOptions;
+ class PoolStats;
+ typedef void *image_ctx_t;
+ typedef void *completion_t;
+ typedef void (*callback_t)(completion_t cb, void *arg);
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ } image_spec_t;
+
+ typedef struct {
+ int64_t pool_id;
+ std::string pool_name;
+ std::string pool_namespace;
+ std::string image_id;
+ std::string image_name;
+ bool trash;
+ } linked_image_spec_t;
+
+ typedef rbd_snap_namespace_type_t snap_namespace_type_t;
+
+ typedef struct {
+ uint64_t id;
+ snap_namespace_type_t namespace_type;
+ std::string name;
+ } snap_spec_t;
+
+ typedef struct {
+ uint64_t id;
+ uint64_t size;
+ std::string name;
+ } snap_info_t;
+
+ typedef struct {
+ int64_t group_pool;
+ std::string group_name;
+ std::string group_snap_name;
+ } snap_group_namespace_t;
+
+ typedef rbd_snap_mirror_state_t snap_mirror_state_t;
+
+ typedef struct {
+ snap_mirror_state_t state;
+ std::set<std::string> mirror_peer_uuids;
+ bool complete;
+ std::string primary_mirror_uuid;
+ uint64_t primary_snap_id;
+ uint64_t last_copied_object_number;
+ } snap_mirror_namespace_t;
+
+ typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+ } locker_t;
+
+ typedef rbd_mirror_peer_direction_t mirror_peer_direction_t;
+
+ typedef struct {
+ std::string uuid;
+ std::string cluster_name;
+ std::string client_name;
+ } mirror_peer_t CEPH_RBD_DEPRECATED;
+
+ typedef struct {
+ std::string uuid;
+ mirror_peer_direction_t direction;
+ std::string site_name;
+ std::string mirror_uuid;
+ std::string client_name;
+ time_t last_seen;
+ } mirror_peer_site_t;
+
+ typedef rbd_mirror_image_mode_t mirror_image_mode_t;
+ typedef rbd_mirror_image_state_t mirror_image_state_t;
+
+ typedef struct {
+ std::string global_id;
+ mirror_image_state_t state;
+ bool primary;
+ } mirror_image_info_t;
+
+ typedef rbd_mirror_image_status_state_t mirror_image_status_state_t;
+
+ typedef struct {
+ std::string name;
+ mirror_image_info_t info;
+ mirror_image_status_state_t state;
+ std::string description;
+ time_t last_update;
+ bool up;
+ } mirror_image_status_t CEPH_RBD_DEPRECATED;
+
+ typedef struct {
+ std::string mirror_uuid;
+ mirror_image_status_state_t state;
+ std::string description;
+ time_t last_update;
+ bool up;
+ } mirror_image_site_status_t;
+
+ typedef struct {
+ std::string name;
+ mirror_image_info_t info;
+ std::vector<mirror_image_site_status_t> site_statuses;
+ } mirror_image_global_status_t;
+
+ typedef rbd_group_image_state_t group_image_state_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ group_image_state_t state;
+ } group_image_info_t;
+
+ typedef struct {
+ std::string name;
+ int64_t pool;
+ } group_info_t;
+
+ typedef rbd_group_snap_state_t group_snap_state_t;
+
+ typedef struct {
+ std::string name;
+ group_snap_state_t state;
+ } group_snap_info_t;
+
+ typedef rbd_image_info_t image_info_t;
+
+ class CEPH_RBD_API ProgressContext
+ {
+ public:
+ virtual ~ProgressContext();
+ virtual int update_progress(uint64_t offset, uint64_t total) = 0;
+ };
+
+ typedef struct {
+ std::string id;
+ std::string name;
+ rbd_trash_image_source_t source;
+ time_t deletion_time;
+ time_t deferment_end_time;
+ } trash_image_info_t;
+
+ typedef struct {
+ std::string pool_name;
+ std::string image_name;
+ std::string image_id;
+ bool trash;
+ } child_info_t;
+
+ typedef struct {
+ std::string addr;
+ int64_t id;
+ uint64_t cookie;
+ } image_watcher_t;
+
+ typedef rbd_image_migration_state_t image_migration_state_t;
+
+ typedef struct {
+ int64_t source_pool_id;
+ std::string source_pool_namespace;
+ std::string source_image_name;
+ std::string source_image_id;
+ int64_t dest_pool_id;
+ std::string dest_pool_namespace;
+ std::string dest_image_name;
+ std::string dest_image_id;
+ image_migration_state_t state;
+ std::string state_description;
+ } image_migration_status_t;
+
+ typedef rbd_config_source_t config_source_t;
+
+ typedef struct {
+ std::string name;
+ std::string value;
+ config_source_t source;
+ } config_option_t;
+
+ typedef rbd_encryption_format_t encryption_format_t;
+ typedef rbd_encryption_algorithm_t encryption_algorithm_t;
+ typedef rbd_encryption_options_t encryption_options_t;
+ typedef rbd_encryption_spec_t encryption_spec_t;
+
+ typedef struct {
+ encryption_algorithm_t alg;
+ std::string passphrase;
+ } encryption_luks1_format_options_t;
+
+ typedef struct {
+ encryption_algorithm_t alg;
+ std::string passphrase;
+ } encryption_luks2_format_options_t;
+
+ typedef struct {
+ std::string passphrase;
+ } encryption_luks_format_options_t;
+
+class CEPH_RBD_API RBD
+{
+public:
+ RBD();
+ ~RBD();
+
+ // This must be dynamically allocated with new, and
+ // must be released with release().
+ // Do not use delete.
+ struct AioCompletion {
+ void *pc;
+ AioCompletion(void *cb_arg, callback_t complete_cb);
+ bool is_complete();
+ int wait_for_complete();
+ ssize_t get_return_value();
+ void *get_arg();
+ void release();
+ };
+
+ void version(int *major, int *minor, int *extra);
+
+ int open(IoCtx& io_ctx, Image& image, const char *name);
+ int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id);
+ int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname);
+ int aio_open(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+ // see librbd.h
+ int open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname);
+ int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname);
+ int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name,
+ const char *snapname, RBD::AioCompletion *c);
+ int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id,
+ const char *snapname, RBD::AioCompletion *c);
+ int features_to_string(uint64_t features, std::string *str_features);
+ int features_from_string(const std::string str_features, uint64_t *features);
+
+ int list(IoCtx& io_ctx, std::vector<std::string>& names)
+ CEPH_RBD_DEPRECATED;
+ int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images);
+
+ int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order);
+ int create2(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order);
+ int create3(IoCtx& io_ctx, const char *name, uint64_t size,
+ uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count);
+ int create4(IoCtx& io_ctx, const char *name, uint64_t size,
+ ImageOptions& opts);
+ int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order);
+ int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, uint64_t features,
+ int *c_order, uint64_t stripe_unit, int stripe_count);
+ int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+ IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
+ int remove(IoCtx& io_ctx, const char *name);
+ int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
+ int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
+
+ int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay);
+ int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info);
+ int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries);
+ int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold);
+ int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold,
+ ProgressContext &pctx);
+ int trash_remove(IoCtx &io_ctx, const char *image_id, bool force);
+ int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+ bool force, ProgressContext &pctx);
+ int trash_restore(IoCtx &io_ctx, const char *id, const char *name);
+
+ // Migration
+ int migration_prepare(IoCtx& io_ctx, const char *image_name,
+ IoCtx& dest_io_ctx, const char *dest_image_name,
+ ImageOptions& opts);
+ int migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx,
+ const char *dest_image_name, ImageOptions& opts);
+ int migration_execute(IoCtx& io_ctx, const char *image_name);
+ int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_abort(IoCtx& io_ctx, const char *image_name);
+ int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_commit(IoCtx& io_ctx, const char *image_name);
+ int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name,
+ ProgressContext &prog_ctx);
+ int migration_status(IoCtx& io_ctx, const char *image_name,
+ image_migration_status_t *status, size_t status_size);
+
+ // RBD pool mirroring support functions
+ int mirror_site_name_get(librados::Rados& rados, std::string* site_name);
+ int mirror_site_name_set(librados::Rados& rados,
+ const std::string& site_name);
+
+ int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
+ int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
+
+ int mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid);
+
+ int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token);
+ int mirror_peer_bootstrap_import(IoCtx& io_ctx,
+ mirror_peer_direction_t direction,
+ const std::string &token);
+
+ int mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid,
+ mirror_peer_direction_t direction,
+ const std::string &site_name,
+ const std::string &client_name);
+ int mirror_peer_site_set_name(IoCtx& io_ctx, const std::string& uuid,
+ const std::string &site_name);
+ int mirror_peer_site_set_client_name(IoCtx& io_ctx, const std::string& uuid,
+ const std::string &client_name);
+ int mirror_peer_site_set_direction(IoCtx& io_ctx, const std::string& uuid,
+ mirror_peer_direction_t direction);
+ int mirror_peer_site_remove(IoCtx& io_ctx, const std::string& uuid);
+ int mirror_peer_site_list(IoCtx& io_ctx,
+ std::vector<mirror_peer_site_t> *peers);
+ int mirror_peer_site_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals);
+ int mirror_peer_site_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals);
+
+ int mirror_image_global_status_list(
+ IoCtx& io_ctx, const std::string &start_id, size_t max,
+ std::map<std::string, mirror_image_global_status_t> *images);
+ int mirror_image_status_summary(IoCtx& io_ctx,
+ std::map<mirror_image_status_state_t, int> *states);
+ int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id,
+ size_t max, std::map<std::string, std::string> *sevice_ids);
+ int mirror_image_info_list(IoCtx& io_ctx, mirror_image_mode_t *mode_filter,
+ const std::string &start_id, size_t max,
+ std::map<std::string, std::pair<mirror_image_mode_t,
+ mirror_image_info_t>> *entries);
+
+ /// mirror_peer_ commands are deprecated to mirror_peer_site_ equivalents
+ int mirror_peer_add(IoCtx& io_ctx, std::string *uuid,
+ const std::string &cluster_name,
+ const std::string &client_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &client_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid,
+ const std::string &cluster_name)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_get_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ std::map<std::string, std::string> *key_vals)
+ CEPH_RBD_DEPRECATED;
+ int mirror_peer_set_attributes(
+ IoCtx& io_ctx, const std::string &uuid,
+ const std::map<std::string, std::string>& key_vals)
+ CEPH_RBD_DEPRECATED;
+
+ /// mirror_image_status_list command is deprecated to
+ /// mirror_image_global_status_list
+
+ int mirror_image_status_list(
+ IoCtx& io_ctx, const std::string &start_id, size_t max,
+ std::map<std::string, mirror_image_status_t> *images)
+ CEPH_RBD_DEPRECATED;
+
+ // RBD groups support functions
+ int group_create(IoCtx& io_ctx, const char *group_name);
+ int group_remove(IoCtx& io_ctx, const char *group_name);
+ int group_list(IoCtx& io_ctx, std::vector<std::string> *names);
+ int group_rename(IoCtx& io_ctx, const char *src_group_name,
+ const char *dest_group_name);
+
+ int group_image_add(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_name);
+ int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name,
+ IoCtx& image_io_ctx, const char *image_id);
+ int group_image_list(IoCtx& io_ctx, const char *group_name,
+ std::vector<group_image_info_t> *images,
+ size_t group_image_info_size);
+
+ int group_snap_create(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_create2(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name, uint32_t flags);
+ int group_snap_remove(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rename(IoCtx& group_ioctx, const char *group_name,
+ const char *old_snap_name, const char *new_snap_name);
+ int group_snap_list(IoCtx& group_ioctx, const char *group_name,
+ std::vector<group_snap_info_t> *snaps,
+ size_t group_snap_info_size);
+ int group_snap_rollback(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name);
+ int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name,
+ const char *snap_name,
+ ProgressContext& pctx);
+
+ int namespace_create(IoCtx& ioctx, const char *namespace_name);
+ int namespace_remove(IoCtx& ioctx, const char *namespace_name);
+ int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names);
+ int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists);
+
+ int pool_init(IoCtx& io_ctx, bool force);
+ int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats);
+
+ int pool_metadata_get(IoCtx &io_ctx, const std::string &key,
+ std::string *value);
+ int pool_metadata_set(IoCtx &io_ctx, const std::string &key,
+ const std::string &value);
+ int pool_metadata_remove(IoCtx &io_ctx, const std::string &key);
+ int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max,
+ std::map<std::string, ceph::bufferlist> *pairs);
+
+ int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options);
+
+private:
+ /* We don't allow assignment or copying */
+ RBD(const RBD& rhs);
+ const RBD& operator=(const RBD& rhs);
+};
+
+class CEPH_RBD_API ImageOptions {
+public:
+ ImageOptions();
+ ImageOptions(rbd_image_options_t opts);
+ ImageOptions(const ImageOptions &imgopts);
+ ~ImageOptions();
+
+ int set(int optname, const std::string& optval);
+ int set(int optname, uint64_t optval);
+ int get(int optname, std::string* optval) const;
+ int get(int optname, uint64_t* optval) const;
+ int is_set(int optname, bool* is_set);
+ int unset(int optname);
+ void clear();
+ bool empty() const;
+
+private:
+ friend class RBD;
+ friend class Image;
+
+ rbd_image_options_t opts;
+};
+
+class CEPH_RBD_API PoolStats {
+public:
+ PoolStats();
+ ~PoolStats();
+
+ PoolStats(const PoolStats&) = delete;
+ PoolStats& operator=(const PoolStats&) = delete;
+
+ int add(rbd_pool_stat_option_t option, uint64_t* opt_val);
+
+private:
+ friend class RBD;
+
+ rbd_pool_stats_t pool_stats;
+};
+
+class CEPH_RBD_API UpdateWatchCtx {
+public:
+ virtual ~UpdateWatchCtx() {}
+ /**
+ * Callback activated when we receive a notify event.
+ */
+ virtual void handle_notify() = 0;
+};
+
+class CEPH_RBD_API QuiesceWatchCtx {
+public:
+ virtual ~QuiesceWatchCtx() {}
+ /**
+ * Callback activated when we want to quiesce.
+ */
+ virtual void handle_quiesce() = 0;
+
+ /**
+ * Callback activated when we want to unquiesce.
+ */
+ virtual void handle_unquiesce() = 0;
+};
+
+class CEPH_RBD_API Image
+{
+public:
+ Image();
+ ~Image();
+
+ int close();
+ int aio_close(RBD::AioCompletion *c);
+
+ int resize(uint64_t size);
+ int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx);
+ int resize_with_progress(uint64_t size, ProgressContext& pctx);
+ int stat(image_info_t &info, size_t infosize);
+ int get_name(std::string *name);
+ int get_id(std::string *id);
+ std::string get_block_name_prefix();
+ int64_t get_data_pool_id();
+ int parent_info(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_snapname)
+ CEPH_RBD_DEPRECATED;
+ int parent_info2(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_id, std::string *parent_snapname)
+ CEPH_RBD_DEPRECATED;
+ int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap);
+
+ int get_migration_source_spec(std::string* source_spec);
+
+ int old_format(uint8_t *old);
+ int size(uint64_t *size);
+ int get_group(group_info_t *group_info, size_t group_info_size);
+ int features(uint64_t *features);
+ int update_features(uint64_t features, bool enabled);
+ int get_op_features(uint64_t *op_features);
+ int overlap(uint64_t *overlap);
+ int get_flags(uint64_t *flags);
+ int set_image_notification(int fd, int type);
+
+ /* exclusive lock feature */
+ int is_exclusive_lock_owner(bool *is_owner);
+ int lock_acquire(rbd_lock_mode_t lock_mode);
+ int lock_release();
+ int lock_get_owners(rbd_lock_mode_t *lock_mode,
+ std::list<std::string> *lock_owners);
+ int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner);
+
+ /* object map feature */
+ int rebuild_object_map(ProgressContext &prog_ctx);
+
+ int check_object_map(ProgressContext &prog_ctx);
+
+ int copy(IoCtx& dest_io_ctx, const char *destname);
+ int copy2(Image& dest);
+ int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts,
+ size_t sparse_size);
+ int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ProgressContext &prog_ctx);
+ int copy_with_progress2(Image& dest, ProgressContext &prog_ctx);
+ int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+ int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx,
+ size_t sparse_size);
+
+ /* deep copy */
+ int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
+ int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+ ImageOptions& opts, ProgressContext &prog_ctx);
+
+ /* encryption */
+ int encryption_format(encryption_format_t format, encryption_options_t opts,
+ size_t opts_size);
+ int encryption_load(encryption_format_t format, encryption_options_t opts,
+ size_t opts_size);
+ int encryption_load2(const encryption_spec_t *specs, size_t spec_count);
+
+ /* striping */
+ uint64_t get_stripe_unit() const;
+ uint64_t get_stripe_count() const;
+
+ int get_create_timestamp(struct timespec *timestamp);
+ int get_access_timestamp(struct timespec *timestamp);
+ int get_modify_timestamp(struct timespec *timestamp);
+
+ int flatten();
+ int flatten_with_progress(ProgressContext &prog_ctx);
+
+ int sparsify(size_t sparse_size);
+ int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx);
+ /**
+ * Returns a pair of poolname, imagename for each clone
+ * of this image at the currently set snapshot.
+ */
+ int list_children(std::set<std::pair<std::string, std::string> > *children)
+ CEPH_RBD_DEPRECATED;
+ /**
+ * Returns a structure of poolname, imagename, imageid and trash flag
+ * for each clone of this image at the currently set snapshot.
+ */
+ int list_children2(std::vector<librbd::child_info_t> *children)
+ CEPH_RBD_DEPRECATED;
+ int list_children3(std::vector<linked_image_spec_t> *images);
+ int list_descendants(std::vector<linked_image_spec_t> *images);
+
+ /* advisory locking (see librbd.h for details) */
+ int list_lockers(std::list<locker_t> *lockers,
+ bool *exclusive, std::string *tag);
+ int lock_exclusive(const std::string& cookie);
+ int lock_shared(const std::string& cookie, const std::string& tag);
+ int unlock(const std::string& cookie);
+ int break_lock(const std::string& client, const std::string& cookie);
+
+ /* snapshots */
+ int snap_list(std::vector<snap_info_t>& snaps);
+ /* DEPRECATED; use snap_exists2 */
+ bool snap_exists(const char *snapname) CEPH_RBD_DEPRECATED;
+ int snap_exists2(const char *snapname, bool *exists);
+ int snap_create(const char *snapname);
+ int snap_create2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+ int snap_remove(const char *snapname);
+ int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx);
+ int snap_remove_by_id(uint64_t snap_id);
+ int snap_rollback(const char *snap_name);
+ int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx);
+ int snap_protect(const char *snap_name);
+ int snap_unprotect(const char *snap_name);
+ int snap_is_protected(const char *snap_name, bool *is_protected);
+ int snap_set(const char *snap_name);
+ int snap_set_by_id(uint64_t snap_id);
+ int snap_get_name(uint64_t snap_id, std::string *snap_name);
+ int snap_get_id(const std::string snap_name, uint64_t *snap_id);
+ int snap_rename(const char *srcname, const char *dstname);
+ int snap_get_limit(uint64_t *limit);
+ int snap_set_limit(uint64_t limit);
+ int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp);
+ int snap_get_namespace_type(uint64_t snap_id,
+ snap_namespace_type_t *namespace_type);
+ int snap_get_group_namespace(uint64_t snap_id,
+ snap_group_namespace_t *group_namespace,
+ size_t snap_group_namespace_size);
+ int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name);
+ int snap_get_mirror_namespace(
+ uint64_t snap_id, snap_mirror_namespace_t *mirror_namespace,
+ size_t snap_mirror_namespace_size);
+
+ /* I/O */
+ ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+ int64_t read_iterate(uint64_t ofs, size_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ int read_iterate2(uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, const char *, void *), void *arg);
+ /**
+ * get difference between two versions of an image
+ *
+ * This will return the differences between two versions of an image
+ * via a callback, which gets the offset and length and a flag
+ * indicating whether the extent exists (1), or is known/defined to
+ * be zeros (a hole, 0). If the source snapshot name is NULL, we
+ * interpret that as the beginning of time and return all allocated
+ * regions of the image. The end version is whatever is currently
+ * selected for the image handle (either a snapshot or the writeable
+ * head).
+ *
+ * @param fromsnapname start snapshot name, or NULL
+ * @param ofs start offset
+ * @param len len in bytes of region to report on
+ * @param include_parent true if full history diff should include parent
+ * @param whole_object 1 if diff extents should cover whole object
+ * @param cb callback to call for each allocated region
+ * @param arg argument to pass to the callback
+ * @returns 0 on success, or negative error code on error
+ */
+ int diff_iterate(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+ int diff_iterate2(const char *fromsnapname,
+ uint64_t ofs, uint64_t len,
+ bool include_parent, bool whole_object,
+ int (*cb)(uint64_t, size_t, int, void *), void *arg);
+
+ ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags);
+
+ int discard(uint64_t ofs, uint64_t len);
+ ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags);
+ ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags);
+
+ /**
+ * compare and write from/to image
+ *
+ * Compare data in compare bufferlist to data at offset in image.
+ * len bytes of the compare bufferlist are compared, i.e. the compare
+ * bufferlist has to be at least len bytes long.
+ * If the compare is successful len bytes from the write bufferlist
+ * are written to the image, i.e. the write bufferlist also has to be
+ * at least len bytes long.
+ * If the compare is unsuccessful no data is written and the
+ * offset in the bufferlist where the compare first differed
+ * is returned through mismatch_off.
+ *
+ * @param off offset in image
+ * @param len length of compare, length of write
+ * @param cmp_bl bufferlist to compare from
+ * @param bl bufferlist to write to image if compare succeeds
+ * @param c aio completion to notify when compare and write is complete
+ * @param mismatch_off (out) offset in bufferlist where compare first differed
+ * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+ ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+ ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
+
+ int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
+ int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+ int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+ int zero_flags, int op_flags);
+
+ int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+ ceph::bufferlist& bl, RBD::AioCompletion *c,
+ uint64_t *mismatch_off, int op_flags);
+
+ /**
+ * read async from image
+ *
+ * The target bufferlist is populated with references to buffers
+ * that contain the data for the given extent of the image.
+ *
+ * NOTE: If caching is enabled, the bufferlist will directly
+ * reference buffers in the cache to avoid an unnecessary data copy.
+ * As a result, if the user intends to modify the buffer contents
+ * directly, they should make a copy first (unconditionally, or when
+ * the reference count on ther underlying buffer is more than 1).
+ *
+ * @param off offset in image
+ * @param len length of read
+ * @param bl bufferlist to read into
+ * @param c aio completion to notify when read is complete
+ */
+ int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+ /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+ int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl,
+ RBD::AioCompletion *c, int op_flags);
+
+ int flush();
+ /**
+ * Start a flush if caching is enabled. Get a callback when
+ * the currently pending writes are on disk.
+ *
+ * @param image the image to flush writes to
+ * @param c what to call when flushing is complete
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush(RBD::AioCompletion *c);
+
+ /**
+ * Drop any cached data for this image
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int invalidate_cache();
+
+ int poll_io_events(RBD::AioCompletion **comps, int numcomp);
+
+ int metadata_get(const std::string &key, std::string *value);
+ int metadata_set(const std::string &key, const std::string &value);
+ int metadata_remove(const std::string &key);
+ /**
+ * Returns a pair of key/value for this image
+ */
+ int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs);
+
+ // RBD image mirroring support functions
+ int mirror_image_enable() CEPH_RBD_DEPRECATED;
+ int mirror_image_enable2(mirror_image_mode_t mode);
+ int mirror_image_disable(bool force);
+ int mirror_image_promote(bool force);
+ int mirror_image_demote();
+ int mirror_image_resync();
+ int mirror_image_create_snapshot(uint64_t *snap_id);
+ int mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id);
+ int mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size);
+ int mirror_image_get_mode(mirror_image_mode_t *mode);
+ int mirror_image_get_global_status(
+ mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size);
+ int mirror_image_get_status(
+ mirror_image_status_t *mirror_image_status, size_t status_size)
+ CEPH_RBD_DEPRECATED;
+ int mirror_image_get_instance_id(std::string *instance_id);
+ int aio_mirror_image_promote(bool force, RBD::AioCompletion *c);
+ int aio_mirror_image_demote(RBD::AioCompletion *c);
+ int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+ size_t info_size, RBD::AioCompletion *c);
+ int aio_mirror_image_get_mode(mirror_image_mode_t *mode,
+ RBD::AioCompletion *c);
+ int aio_mirror_image_get_global_status(
+ mirror_image_global_status_t *mirror_image_global_status,
+ size_t status_size, RBD::AioCompletion *c);
+ int aio_mirror_image_get_status(
+ mirror_image_status_t *mirror_image_status, size_t status_size,
+ RBD::AioCompletion *c)
+ CEPH_RBD_DEPRECATED;
+ int aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id,
+ RBD::AioCompletion *c);
+
+ int update_watch(UpdateWatchCtx *ctx, uint64_t *handle);
+ int update_unwatch(uint64_t handle);
+
+ int list_watchers(std::list<image_watcher_t> &watchers);
+
+ int config_list(std::vector<config_option_t> *options);
+
+ int quiesce_watch(QuiesceWatchCtx *ctx, uint64_t *handle);
+ int quiesce_unwatch(uint64_t handle);
+ void quiesce_complete(uint64_t handle, int r);
+
+private:
+ friend class RBD;
+
+ Image(const Image& rhs);
+ const Image& operator=(const Image& rhs);
+
+ image_ctx_t ctx;
+};
+
+} // namespace librbd
+
+#if __GNUC__ >= 4
+ #pragma GCC diagnostic pop
+#endif
+
+#endif // __LIBRBD_HPP
diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h
new file mode 100644
index 000000000..54852caa8
--- /dev/null
+++ b/src/include/rbd/object_map_types.h
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H
+#define CEPH_RBD_OBJECT_MAP_TYPES_H
+
+#include "include/int_types.h"
+
+static const uint8_t OBJECT_NONEXISTENT = 0;
+static const uint8_t OBJECT_EXISTS = 1;
+static const uint8_t OBJECT_PENDING = 2;
+static const uint8_t OBJECT_EXISTS_CLEAN = 3;
+
+#endif // CEPH_RBD_OBJECT_MAP_TYPES_H
diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h
new file mode 100644
index 000000000..35a1a8bc3
--- /dev/null
+++ b/src/include/rbd_types.h
@@ -0,0 +1,159 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include "include/types.h"
+#include "rbd/features.h"
+
+/* New-style rbd image 'foo' consists of objects
+ * rbd_id.foo - id of image
+ * rbd_header.<id> - image metadata
+ * rbd_object_map.<id> - optional image object map
+ * rbd_data.<id>.00000000
+ * rbd_data.<id>.00000001
+ * ... - data
+ */
+
+#define RBD_HEADER_PREFIX "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."
+#define RBD_DATA_PREFIX "rbd_data."
+#define RBD_ID_PREFIX "rbd_id."
+
+/*
+ * old-style rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * rb.<idhi>.<idlo>.00000000
+ * rb.<idhi>.<idlo>.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+#define RBD_NAMESPACE "rbd_namespace"
+#define RBD_TASK "rbd_task"
+
+/*
+ * rbd_children object in each pool contains omap entries
+ * that map parent (poolid, imageid, snapid) to a list of children
+ * (imageids; snapids aren't required because we get all the snapshot
+ * info from a read of the child's header object anyway).
+ *
+ * The clone operation writes a new item to this child list, and rm or
+ * flatten removes an item, and may remove the whole entry if no children
+ * exist after the rm/flatten.
+ *
+ * When attempting to remove a parent, all pools are searched for
+ * rbd_children objects with entries referring to that parent; if any
+ * exist (and those children exist), the parent removal is prevented.
+ */
+#define RBD_CHILDREN "rbd_children"
+#define RBD_LOCK_NAME "rbd_lock"
+
+/**
+ * rbd_mirroring object in each pool contains pool-specific settings
+ * for configuring mirroring.
+ */
+#define RBD_MIRRORING "rbd_mirroring"
+
+/**
+ * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used
+ * for pool-level coordination between rbd-mirror daemons.
+ */
+#define RBD_MIRROR_LEADER "rbd_mirror_leader"
+#define RBD_MIRROR_INSTANCE_PREFIX "rbd_mirror_instance."
+
+#define RBD_MAX_OBJ_NAME_SIZE 96
+#define RBD_MAX_BLOCK_NAME_SIZE 24
+
+/**
+ * Maximum string length of the RBD v2 image id (not including
+ * null termination). This limit was derived from the existing
+ * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data."
+ * prefix and null termination.
+ */
+#define RBD_MAX_IMAGE_ID_LENGTH 14
+
+/**
+ * Maximum string length of the RBD block object name prefix (not including
+ * null termination).
+ *
+ * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra>
+ * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id>
+ *
+ * Note: new features might require increasing this maximum prefix length.
+ */
+#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_MIGRATE_HEADER_TEXT "<<< Migrating RBD Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+#define RBD_GROUP_INVALID_POOL (-1)
+
+#define RBD_GROUP_HEADER_PREFIX "rbd_group_header."
+
+#define RBD_GROUP_DIRECTORY "rbd_group_directory"
+
+#define RBD_TRASH "rbd_trash"
+
+/**
+ * MON config-key prefix for storing optional remote cluster connectivity
+ * parameters
+ */
+#define RBD_MIRROR_CONFIG_KEY_PREFIX "rbd/mirror/"
+#define RBD_MIRROR_SITE_NAME_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "site_name"
+#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id"
+#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX RBD_MIRROR_CONFIG_KEY_PREFIX "peer/"
+
+struct rbd_info {  // on-disk record in the rbd_info object: tracks the highest allocated image id
+  ceph_le64 max_id;  // little-endian on-disk encoding (ceph_le64)
+} __attribute__ ((packed));
+
+struct rbd_obj_snap_ondisk {  // per-snapshot entry appended after rbd_obj_header_ondisk on disk
+  ceph_le64 id;  // snapshot id
+  ceph_le64 image_size;  // image size at snapshot time, bytes
+} __attribute__((packed));
+
+struct rbd_obj_header_ondisk {  // v1 (old-style) image header as stored in the <name>.rbd object; fixed little-endian layout
+  char text[40];  // RBD_HEADER_TEXT banner
+  char block_name[RBD_MAX_BLOCK_NAME_SIZE];  // prefix for the image's data objects
+  char signature[4];  // RBD_HEADER_SIGNATURE ("RBD")
+  char version[8];  // RBD_HEADER_VERSION string
+  struct {
+    __u8 order;  // object size as a power of two (object size = 1 << order)
+    __u8 crypt_type;  // RBD_CRYPT_NONE
+    __u8 comp_type;  // RBD_COMP_NONE
+    __u8 unused;  // padding
+  } __attribute__((packed)) options;
+  ceph_le64 image_size;  // current image size, bytes
+  ceph_le64 snap_seq;  // most recent snapshot sequence number
+  ceph_le32 snap_count;  // number of entries in snaps[]
+  ceph_le32 reserved;
+  ceph_le64 snap_names_len;  // total length of the snapshot-name blob following snaps[]
+  struct rbd_obj_snap_ondisk snaps[0];  // flexible array: snap_count entries follow the header
+} __attribute__((packed));
+
+enum {
+ RBD_PROTECTION_STATUS_UNPROTECTED = 0,
+ RBD_PROTECTION_STATUS_UNPROTECTING = 1,
+ RBD_PROTECTION_STATUS_PROTECTED = 2,
+ RBD_PROTECTION_STATUS_LAST = 3
+};
+
+#endif
diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h
new file mode 100644
index 000000000..eacc65e7b
--- /dev/null
+++ b/src/include/scope_guard.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SCOPE_GUARD
+#define SCOPE_GUARD
+
+#include <utility>
+
+template <typename F>
+struct scope_guard {  // RAII helper: invokes the stored callable F exactly once when the guard is destroyed
+  F f;
+  scope_guard() = delete;  // a guard without an action is meaningless
+  scope_guard(const scope_guard &) = delete;  // non-copyable: the action must run exactly once
+  scope_guard(scope_guard &&) = default;
+  scope_guard & operator=(const scope_guard &) = delete;
+  scope_guard & operator=(scope_guard &&) = default;
+  scope_guard(const F& f) : f(f) {}  // copy the callable in
+  scope_guard(F &&f) : f(std::move(f)) {}  // move the callable in
+  template<typename... Args>
+  scope_guard(std::in_place_t, Args&& ...args) : f(std::forward<Args>(args)...) {}  // construct F in place from args
+  ~scope_guard() {
+    std::move(f)(); // Support at-most-once functions
+  }
+};
+
+template <typename F>
+[[nodiscard("Unassigned scope guards will execute immediately")]]
+scope_guard<F> make_scope_guard(F &&f) {  // deduce F and wrap f; result must be bound to a named variable
+  return scope_guard<F>(std::forward<F>(f));
+}
+
+template<typename F, typename... Args>
+[[nodiscard("Unassigned scope guards will execute immediately")]]
+scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&& ...args) {  // construct the guard's callable F in place from args
+  return { std::in_place, std::forward<Args>(args)... };
+}
+
+#endif
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 000000000..14b5efa1d
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+#include <sys/socket.h>
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */
+#ifndef MSG_NOSIGNAL
+# define MSG_NOSIGNAL 0
+# ifdef SO_NOSIGPIPE
+# define CEPH_USE_SO_NOSIGPIPE
+# else
+# define CEPH_USE_SIGPIPE_BLOCKER
+# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!"
+# endif
+#endif
+
+int socket_cloexec(int domain, int type, int protocol);
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2]);
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen);
+
+#endif
diff --git a/src/include/spinlock.h b/src/include/spinlock.h
new file mode 100644
index 000000000..3f12bdc00
--- /dev/null
+++ b/src/include/spinlock.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * @author Jesse Williamson <jwilliamson@suse.de>
+ *
+*/
+
+#ifndef CEPH_SPINLOCK_HPP
+#define CEPH_SPINLOCK_HPP
+
+#include <atomic>
+
+namespace ceph {
+inline namespace version_1_0 {
+
+class spinlock;
+
+inline void spin_lock(std::atomic_flag& lock);
+inline void spin_unlock(std::atomic_flag& lock);
+inline void spin_lock(ceph::spinlock& lock);
+inline void spin_unlock(ceph::spinlock& lock);
+
+/* A pre-packaged spinlock type modelling BasicLockable: */
+class spinlock final  // busy-wait lock over std::atomic_flag; models BasicLockable (usable with std::lock_guard)
+{
+  std::atomic_flag af = ATOMIC_FLAG_INIT;
+
+  public:
+  void lock() {  // spins until the flag is acquired; see ceph::spin_lock below
+    ceph::spin_lock(af);
+  }
+
+  void unlock() noexcept {  // releases the flag with release ordering
+    ceph::spin_unlock(af);
+  }
+};
+
+// Free functions:
+inline void spin_lock(std::atomic_flag& lock)  // busy-wait until we observe the flag clear and set it
+{
+  while(lock.test_and_set(std::memory_order_acquire))  // acquire ordering pairs with the release in spin_unlock
+    ;
+}
+
+inline void spin_unlock(std::atomic_flag& lock)  // release the flag; publishes writes made under the lock
+{
+  lock.clear(std::memory_order_release);
+}
+
+inline void spin_lock(std::atomic_flag *lock)  // pointer convenience overload
+{
+  spin_lock(*lock);
+}
+
+inline void spin_unlock(std::atomic_flag *lock)  // pointer convenience overload
+{
+  spin_unlock(*lock);
+}
+
+inline void spin_lock(ceph::spinlock& lock)  // overload for the packaged spinlock type
+{
+  lock.lock();
+}
+
+inline void spin_unlock(ceph::spinlock& lock)
+{
+  lock.unlock();
+}
+
+inline void spin_lock(ceph::spinlock *lock)  // pointer convenience overload
+{
+  spin_lock(*lock);
+}
+
+inline void spin_unlock(ceph::spinlock *lock)
+{
+  spin_unlock(*lock);
+}
+
+} // inline namespace (version)
+} // namespace ceph
+
+#endif
diff --git a/src/include/stat.h b/src/include/stat.h
new file mode 100644
index 000000000..19398758e
--- /dev/null
+++ b/src/include/stat.h
@@ -0,0 +1,145 @@
+#ifndef CEPH_STAT_H
+#define CEPH_STAT_H
+
+#include <acconfig.h>
+
+#include <sys/stat.h>
+
+/*
+ * Access time-related `struct stat` members.
+ *
+ * Note that for each of the stat member get/set functions below, setting a
+ * high-res value (stat_set_*_nsec) on a platform without high-res support is
+ * a no-op.
+ */
+
+#ifdef HAVE_STAT_ST_MTIM_TV_NSEC  // platforms with POSIX.1-2008 struct timespec members (st_mtim etc.), e.g. Linux
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)  // nanosecond part of mtime
+{
+  return st->st_mtim.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return st->st_atim.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return st->st_ctim.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctim.tv_nsec = nsec;
+}
+
+#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC)  // BSD/macOS spelling: st_*timespec members
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+  return st->st_mtimespec.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return st->st_atimespec.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return st->st_ctimespec.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctimespec.tv_nsec = nsec;
+}
+
+#else  // no high-resolution timestamps: getters return 0, setters are documented no-ops
+
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+#endif
+
+/*
+ * Access second-resolution `struct stat` members.
+ */
+
+static inline uint32_t stat_get_mtime_sec(struct stat *st)  // NOTE(review): narrows time_t to 32 bits — presumably acceptable here, verify for post-2106 times
+{
+  return st->st_mtime;
+}
+
+static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_mtime = sec;
+}
+
+static inline uint32_t stat_get_atime_sec(struct stat *st)  // same 32-bit narrowing as stat_get_mtime_sec
+{
+  return st->st_atime;
+}
+
+static inline void stat_set_atime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_atime = sec;
+}
+
+static inline uint32_t stat_get_ctime_sec(struct stat *st)  // same 32-bit narrowing as stat_get_mtime_sec
+{
+  return st->st_ctime;
+}
+
+static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec)
+{
+  st->st_ctime = sec;
+}
+
+#endif
diff --git a/src/include/statlite.h b/src/include/statlite.h
new file mode 100644
index 000000000..0ff4b04e7
--- /dev/null
+++ b/src/include/statlite.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_STATLITE_H
+#define CEPH_STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "include/compat.h"
+
+struct statlite {
+ dev_t st_dev; /* device */
+ ino_t st_ino; /* inode */
+ mode_t st_mode; /* protection */
+ nlink_t st_nlink; /* number of hard links */
+ uid_t st_uid; /* user ID of owner */
+ gid_t st_gid; /* group ID of owner */
+ dev_t st_rdev; /* device type (if inode device)*/
+ unsigned long st_litemask; /* bit mask for optional fields */
+ /***************************************************************/
+ /**** Remaining fields are optional according to st_litemask ***/
+ off_t st_size; /* total size, in bytes */
+ blksize_t st_blksize; /* blocksize for filesystem I/O */
+ blkcnt_t st_blocks; /* number of blocks allocated */
+ struct timespec st_atim; /* Time of last access. */
+ struct timespec st_mtim; /* Time of last modification. */
+ struct timespec st_ctim; /* Time of last status change. */
+ //time_t st_atime; /* time of last access */
+ //time_t st_mtime; /* time of last modification */
+ //time_t st_ctime; /* time of last change */
+};
+
+#define S_STATLITE_SIZE 1
+#define S_STATLITE_BLKSIZE 2
+#define S_STATLITE_BLOCKS 4
+#define S_STATLITE_ATIME 8
+#define S_STATLITE_MTIME 16
+#define S_STATLITE_CTIME 32
+
+#define S_REQUIRESIZE(m) ((m) | S_STATLITE_SIZE) // parenthesize arg so compound expressions expand correctly
+#define S_REQUIREBLKSIZE(m) ((m) | S_STATLITE_BLKSIZE)
+#define S_REQUIREBLOCKS(m) ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m) ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m) ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m) ((m) | S_STATLITE_CTIME)
+
+#define S_ISVALIDSIZE(m) ((m) & S_STATLITE_SIZE) // test whether the field is valid per st_litemask
+#define S_ISVALIDBLKSIZE(m) ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m) ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m) ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m) ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m) ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus {  // readdirplus result: directory entry plus full stat attributes
+  struct dirent d_dirent; /* dirent struct for this entry */
+  struct stat d_stat; /* attributes for this entry */
+  int d_stat_err;/* errno for d_stat, or 0 */
+};
+struct dirent_lite {  // lightweight variant: attributes carried in a statlite (fields gated by st_litemask)
+  struct dirent d_dirent; /* dirent struct for this entry */
+  struct statlite d_stat; /* attributes for this entry */
+  int d_stat_err;/* errno for d_stat, or 0 */
+};
+
+}
+#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
new file mode 100644
index 000000000..cad76c1d6
--- /dev/null
+++ b/src/include/str_list.h
@@ -0,0 +1,97 @@
+#ifndef CEPH_STRLIST_H
+#define CEPH_STRLIST_H
+
+#include <list>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace ceph {
+
+/// Split a string using the given delimiters, passing each piece as a
+/// (non-null-terminated) std::string_view to the callback.
+template <typename Func> // where Func(std::string_view) is a valid call
+void for_each_substr(std::string_view s, const char *delims, Func&& f)
+{
+  auto pos = s.find_first_not_of(delims);  // skip any leading delimiters
+  while (pos != s.npos) {
+    s.remove_prefix(pos); // trim delims from the front
+    auto end = s.find_first_of(delims);  // end of current token (npos => token runs to end of string)
+    f(s.substr(0, end));  // views reference the caller's string; no allocation
+    pos = s.find_first_not_of(delims, end);  // start of the next token, if any
+  }
+}
+
+} // namespace ceph
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ const char *delims,
+ std::list<std::string>& str_list);
+
+std::list<std::string> get_str_list(const std::string& str,
+ const char *delims = ";,= \t");
+
+/**
+ * Split **str** into a vector of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+void get_str_vec(std::string_view str, std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a vector of strings, using the **delims** delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+void get_str_vec(std::string_view str,
+ const char *delims,
+ std::vector<std::string>& str_vec);
+
+std::vector<std::string> get_str_vec(std::string_view str,
+ const char *delims = ";,= \t");
+
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ *
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ *
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
+inline std::string str_join(const std::vector<std::string>& v, const std::string& sep)
+{
+  if (v.empty())
+    return std::string();  // empty input yields empty string, no separator emitted
+  auto i = v.cbegin();
+  std::string r = *i;  // first element with no leading separator
+  for (++i; i != v.cend(); ++i) {
+    r += sep;  // separator goes between elements only
+    r += *i;
+  }
+  return r;
+}
+
+#endif
diff --git a/src/include/str_map.h b/src/include/str_map.h
new file mode 100644
index 000000000..7f354fd46
--- /dev/null
+++ b/src/include/str_map.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_STRMAP_H
+#define CEPH_STRMAP_H
+
+#define CONST_DELIMS ",;\t\n "
+
+#include <map>
+#include <string>
+#include <sstream>
+
+template <typename Func>  // Func(std::string_view key, std::string_view value); split s on delims, then each token on '='
+void for_each_pair(std::string_view s, const char* delims, Func&& f)
+{
+  auto pos = s.find_first_not_of(delims);  // skip leading delimiters
+  while (pos != s.npos) {
+    s.remove_prefix(pos); // trim delims from the front
+    auto end = s.find_first_of(delims);  // end of this key[=value] token
+    auto kv = s.substr(0, end);
+    if (auto equal = kv.find('='); equal != kv.npos) {
+      f(kv.substr(0, equal), kv.substr(equal + 1));  // key=value pair
+    } else {
+      f(kv.substr(0, equal), std::string_view());  // bare key: equal==npos so substr yields the whole token; value is empty
+    }
+    pos = s.find_first_not_of(delims, end);  // advance to the next token
+  }
+}
+
+using str_map_t = std::map<std::string,std::string>;
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read
+ * from it. The format of **str** is either a well formed JSON object
+ * or a custom key[=value] plain text format.
+ *
+ * JSON is tried first. If successfully parsed into a JSON object, it
+ * is copied into **str_map** verbatim. If it is not a JSON object ( a
+ * string, integer etc. ), -EINVAL is returned and **ss** is set to
+ * a human readable error message.
+ *
+ * If **str** is no valid JSON and if **fallback_to_plain** is set to true
+ * (default: true) it is assumed to be a string containing white space
+ * separated key=value pairs. A white space is either space, tab or newline.
+ * Function **get_str_map** will be leveraged to parse the plain-text
+ * key/value pairs.
+ *
+ * @param [in] str JSON or plain text key/value pairs
+ * @param [out] ss human readable message on error
+ * @param [out] str_map key/value pairs read from str
+ * @param [in] fallback_to_plain attempt parsing as plain-text if json fails
+ * @return **0** on success or a -EINVAL on error.
+ */
+int get_json_str_map(
+ const std::string &str,
+ std::ostream &ss,
+ str_map_t* str_map,
+ bool fallback_to_plain = true);
+
+/**
+ * Parse **str** and set **str_map** with the key/value pairs read from
+ * it. The format of **str** is a number of custom key[=value] pairs in
+ * plain text format.
+ *
+ * The string will be parsed taking **delims** as field delimiters for
+ * key/values. The value is optional resulting in an empty string when
+ * not provided. For example, using white space as delimiters:
+ *
+ * insert your own=political/ideological statement=here
+ *
+ * will be parsed into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political/ideological",
+ * "statement": "here" }
+ *
+ * Alternative delimiters may be provided. For instance, specifying
+ * "white space and slash", for the above statement, would be parsed
+ * into:
+ *
+ * { "insert": "",
+ * "your": "",
+ * "own": "political",
+ * "ideological": "",
+ * "statement": "here" }
+ *
+ * See how adding '/' to the delimiters field will spawn a new key without
+ * a set value.
+ *
+ * Always returns 0, as there is no condition for failure.
+ *
+ * @param [in] str plain text key/value pairs
+ * @param [in] delims field delimiters to be used for parsing str
+ * @param [out] str_map key/value pairs parsed from str
+ * @return **0**
+ */
+int get_str_map(
+ const std::string &str,
+ str_map_t* str_map,
+ const char *delims = CONST_DELIMS);
+
+// an alternate form (as we never fail):
+str_map_t get_str_map(
+ const std::string& str,
+ const char* delim = CONST_DELIMS);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is not available in **str_map**, and if **def_val** is
+ * not-NULL then returns **def_val**. Otherwise checks if the value of
+ * **key** is an empty string and if so will return **key**.
+ * If the map contains **key**, the function returns the value of **key**.
+ *
+ * @param[in] str_map Map to obtain **key** from
+ * @param[in] key The key to search for in the map
+ * @param[in] def_val The value to return in case **key** is not present
+ */
+std::string get_str_map_value(
+ const str_map_t& str_map,
+ const std::string &key,
+ const std::string *def_val = nullptr);
+
+/**
+ * Returns the value of **key** in **str_map** if available.
+ *
+ * If **key** is available in **str_map** returns the value of **key**.
+ *
+ * If **key** is not available in **str_map**, and if **def_key**
+ * is not-NULL and available in **str_map**, then returns the value
+ * of **def_key**.
+ *
+ * Otherwise returns an empty string.
+ *
+ * @param[in] str_map Map to obtain **key** or **def_key** from
+ * @param[in] key Key to obtain the value of from **str_map**
+ * @param[in] def_key Key to fallback to if **key** is not present
+ * in **str_map**
+ */
+std::string get_str_map_key(
+ const str_map_t& str_map,
+ const std::string &key,
+ const std::string *fallback_key = nullptr);
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+int get_conf_str_map_helper(
+ const std::string &str,
+ std::ostringstream &oss,
+ str_map_t* str_map,
+ const std::string &default_key);
+
+std::string get_value_via_strmap(
+ const std::string& conf_string,
+ std::string_view default_key);
+
+std::string get_value_via_strmap(
+ const std::string& conf_string,
+ const std::string& key,
+ std::string_view default_key);
+
+#endif
diff --git a/src/include/stringify.h b/src/include/stringify.h
new file mode 100644
index 000000000..1b2a130c9
--- /dev/null
+++ b/src/include/stringify.h
@@ -0,0 +1,33 @@
+#ifndef __CEPH_STRINGIFY_H
+#define __CEPH_STRINGIFY_H
+
+#include <string>
+#include <sstream>
+
+#include "include/types.h"
+
+template<typename T>
+inline std::string stringify(const T& a) {  // format a via its operator<< and return the text
+#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
+  static __thread std::ostringstream ss;  // reuse one thread-local stream to avoid per-call construction (gcc only)
+  ss.str("");  // clears the buffer; NOTE(review): stream state/flags persist across calls — presumably never modified here
+#else
+  std::ostringstream ss;
+#endif
+  ss << a;
+  return ss.str();
+}
+
+template <class T, class A>  // T: string-like with empty()/append(); A: input iterator over appendable values
+T joinify(const A &begin, const A &end, const T &t)
+{
+  T result;
+  for (A it = begin; it != end; it++) {
+    if (!result.empty())  // separator only between elements, never leading
+      result.append(t);
+    result.append(*it);
+  }
+  return result;
+}
+
+#endif
diff --git a/src/include/timegm.h b/src/include/timegm.h
new file mode 100644
index 000000000..fb970432d
--- /dev/null
+++ b/src/include/timegm.h
@@ -0,0 +1,79 @@
+// (C) Copyright Howard Hinnant
+// (C) Copyright 2010-2011 Vicente J. Botet Escriba
+// Use, modification and distribution are subject to the Boost Software License,
+// Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt).
+
+//===-------------------------- locale ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+static int32_t is_leap(int32_t year) {  // Gregorian rule: leap iff divisible by 4, except centuries unless divisible by 400
+  if(year % 400 == 0)
+    return 1;
+  if(year % 100 == 0)
+    return 0;
+  if(year % 4 == 0)
+    return 1;
+  return 0;
+}
+
+static int32_t days_from_0(int32_t year) {  // days from year 0 to the start of `year` (proleptic Gregorian)
+  year--;
+  return 365 * year + (year / 400) - (year/100) + (year / 4);
+}
+
+int32_t static days_from_1970(int32_t year) {  // days from 1970-01-01 to the start of `year`; negative before 1970
+  static const int days_from_0_to_1970 = days_from_0(1970);
+  return days_from_0(year) - days_from_0_to_1970;
+}
+
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {  // zero-based day of year; month is 1-12, day is 1-31
+  static const int32_t days[2][12] =  // cumulative days before each month, [0]=common year, [1]=leap year
+  {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+
+  return days[is_leap(year)][month-1] + day - 1;
+}
+
+static time_t internal_timegm(tm const *t) {  // portable timegm(): broken-down UTC time -> seconds since epoch; ignores tm_isdst
+  int year = t->tm_year + 1900;  // tm_year is years since 1900
+  int month = t->tm_mon;  // tm_mon is 0-based; normalized below if out of [0,11]
+  if(month > 11)
+  {
+    year += month/12;  // carry whole years out of an oversized month
+    month %= 12;
+  }
+  else if(month < 0)
+  {
+    int years_diff = (-month + 11)/12;  // borrow enough years to make month non-negative
+    year -= years_diff;
+    month+=12 * years_diff;
+  }
+  month++;  // convert to 1-based for days_from_1jan
+  int day = t->tm_mday;
+  int day_of_year = days_from_1jan(year,month,day);
+  int days_since_epoch = days_from_1970(year) + day_of_year ;
+
+  time_t seconds_in_day = 3600 * 24;
+  time_t result = seconds_in_day * days_since_epoch + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+
+  return result;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
new file mode 100644
index 000000000..a76360db4
--- /dev/null
+++ b/src/include/types.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_TYPES_H
+#define CEPH_TYPES_H
+
+// this is needed for ceph_fs to compile in userland
+#include "int_types.h"
+#include "byteorder.h"
+
+#include "uuid.h"
+
+#include <netinet/in.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "rbd_types.h"
+
+#ifdef __cplusplus
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+#endif
+
+extern "C" {
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <list>
+#include <set>
+#include <boost/container/flat_set.hpp>
+#include <boost/container/flat_map.hpp>
+#include <map>
+#include <vector>
+#include <optional>
+#include <ostream>
+#include <iomanip>
+
+
+#include "include/unordered_map.h"
+
+#include "object.h"
+#include "intarith.h"
+
+#include "acconfig.h"
+
+#include "assert.h"
+
+// DARWIN compatibility
+#ifdef __APPLE__
+typedef long long loff_t;
+typedef long long off64_t;
+#define O_DIRECT 00040000
+#endif
+
+// FreeBSD compatibility
+#ifdef __FreeBSD__
+typedef off_t loff_t;
+typedef off_t off64_t;
+#endif
+
+#if defined(__sun) || defined(_AIX)
+typedef off_t loff_t;
+#endif
+
+
+// -- io helpers --
+
+// Forward declare all the I/O helpers so strict ADL can find them in
+// the case of containers of containers. I'm tempted to abstract this
+// stuff using template templates like I did for denc.
+
+namespace std {
+template<class A, class B>
+inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v);
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v);
+template<typename... Ts>
+inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t);
+template<typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t);
+template<class A, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset);
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m);
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m);
+}
+
// Forward declarations for the boost tuple/container inserters.
namespace boost {
// NOTE(review): declared here as a variadic boost::tuple inserter, but
// the definition below covers only the 3-element boost::tuples::tuple —
// confirm which overload callers actually rely on.
template<typename... Ts>
inline std::ostream& operator<<(std::ostream& out, const boost::tuple<Ts...> &t);

namespace container {
template<class A, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset);
template<class A, class B, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& iset);
}
}
+
+namespace std {
// Print a pair as "first,second" (no surrounding brackets).
template<class A, class B>
inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) {
  out << v.first << "," << v.second;
  return out;
}
+
// Print a vector as "[e0,e1,...]".
template<class A, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) {
  out << "[";
  for (auto it = v.begin(); it != v.end(); ++it) {
    if (it != v.begin())
      out << ",";
    out << *it;
  }
  out << "]";
  return out;
}
+
+template<class A, std::size_t N, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v) {
+ bool first = true;
+ out << "[";
+ for (const auto& p : v) {
+ if (!first) out << ",";
+ out << p;
+ first = false;
+ }
+ out << "]";
+ return out;
+}
+
// Print a deque as "<e0,e1,...>".
template<class A, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v) {
  out << "<";
  bool first = true;
  for (const auto& e : v) {
    if (!first)
      out << ",";
    out << e;
    first = false;
  }
  out << ">";
  return out;
}
+
// Print a tuple's elements comma-separated, with no brackets.
// NOTE(review): relies on ceph::for_each (defined elsewhere) visiting
// the elements in order — confirm it is in scope wherever this header
// is included.
template<typename... Ts>
inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t) {
  // Mutable lambda: `i` counts elements so a comma follows all but the last.
  auto f = [n = sizeof...(Ts), i = 0U, &out](const auto& e) mutable {
    out << e;
    if (++i != n)
      out << ",";
  };
  ceph::for_each(t, f);
  return out;
}
+
+// Mimics boost::optional
+template<typename T>
+inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t) {
+ if (!t)
+ out << "--" ;
+ else
+ out << ' ' << *t ;
+ return out;
+}
+
// Print a list as a bare comma-separated element sequence.
template<class A, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) {
  bool first = true;
  for (const auto& e : ilist) {
    if (!first)
      out << ",";
    out << e;
    first = false;
  }
  return out;
}
+
// Print a set as a bare comma-separated element sequence (sorted order).
template<class A, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) {
  bool first = true;
  for (const auto& e : iset) {
    if (!first)
      out << ",";
    out << e;
    first = false;
  }
  return out;
}
+
// Print a multiset as a bare comma-separated element sequence.
template<class A, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) {
  bool first = true;
  for (const auto& e : iset) {
    if (!first)
      out << ",";
    out << e;
    first = false;
  }
  return out;
}
+
// Print a map as "{k0=v0,k1=v1,...}".
template<class A, class B, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m)
{
  out << "{";
  bool first = true;
  for (const auto& [k, v] : m) {
    if (!first)
      out << ",";
    out << k << "=" << v;
    first = false;
  }
  out << "}";
  return out;
}
+
// Print a multimap as "{{k0=v0,k1=v1,...}}" (double braces mark the
// multimap form, distinguishing it from the map inserter above).
template<class A, class B, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m)
{
  out << "{{";
  bool first = true;
  for (const auto& [k, v] : m) {
    if (!first)
      out << ",";
    out << k << "=" << v;
    first = false;
  }
  out << "}}";
  return out;
}
+
+} // namespace std
+
+namespace boost {
+namespace tuples {
+template<typename A, typename B, typename C>
+inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t) {
+ return out << boost::get<0>(t) << ","
+ << boost::get<1>(t) << ","
+ << boost::get<2>(t);
+}
+}
+namespace container {
+template<class A, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset) {
+ for (auto it = iset.begin();
+ it != iset.end();
+ ++it) {
+ if (it != iset.begin()) out << ",";
+ out << *it;
+ }
+ return out;
+}
+
+template<class A, class B, class Comp, class Alloc>
+inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m) {
+ for (auto it = m.begin();
+ it != m.end();
+ ++it) {
+ if (it != m.begin()) out << ",";
+ out << it->first << "=" << it->second;
+ }
+ return out;
+}
+}
+} // namespace boost
+
+
+
+/*
+ * comparators for stl containers
+ */
+// for ceph::unordered_map:
+// ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals;
// Equality functor for NUL-terminated C strings; intended as the
// KeyEqual of a ceph::unordered_map keyed on const char*.
struct eqstr
{
  bool operator()(const char* a, const char* b) const
  {
    return 0 == strcmp(a, b);
  }
};
+
+// for set, map
// Strict weak ordering on NUL-terminated C strings, for use as the
// comparator of std::set / std::map keyed on const char*.
struct ltstr
{
  bool operator()(const char* a, const char* b) const
  {
    return strcmp(a, b) < 0;
  }
};
+
+
+namespace ceph {
+ class Formatter;
+}
+
+#include "encoding.h"
+
+WRITE_RAW_ENCODER(ceph_fsid)
+WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_dir_layout)
+WRITE_RAW_ENCODER(ceph_mds_session_head)
+WRITE_RAW_ENCODER(ceph_mds_request_head_legacy)
+WRITE_RAW_ENCODER(ceph_mds_request_release)
+WRITE_RAW_ENCODER(ceph_filelock)
+WRITE_RAW_ENCODER(ceph_mds_caps_head)
+WRITE_RAW_ENCODER(ceph_mds_caps_export_body)
+WRITE_RAW_ENCODER(ceph_mds_caps_non_export_body)
+WRITE_RAW_ENCODER(ceph_mds_cap_peer)
+WRITE_RAW_ENCODER(ceph_mds_cap_release)
+WRITE_RAW_ENCODER(ceph_mds_cap_item)
+WRITE_RAW_ENCODER(ceph_mds_lease)
+WRITE_RAW_ENCODER(ceph_mds_snap_head)
+WRITE_RAW_ENCODER(ceph_mds_snap_realm)
+WRITE_RAW_ENCODER(ceph_mds_reply_head)
+WRITE_RAW_ENCODER(ceph_mds_reply_cap)
+WRITE_RAW_ENCODER(ceph_mds_cap_reconnect)
+WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect)
+WRITE_RAW_ENCODER(ceph_frag_tree_split)
+WRITE_RAW_ENCODER(ceph_osd_reply_head)
+WRITE_RAW_ENCODER(ceph_osd_op)
+WRITE_RAW_ENCODER(ceph_msg_header)
+WRITE_RAW_ENCODER(ceph_msg_footer)
+WRITE_RAW_ENCODER(ceph_msg_footer_old)
+WRITE_RAW_ENCODER(ceph_mon_subscribe_item)
+
+WRITE_RAW_ENCODER(ceph_mon_statfs)
+WRITE_RAW_ENCODER(ceph_mon_statfs_reply)
+
+// ----------------------
+// some basic types
+
+// NOTE: these must match ceph_fs.h typedefs
+typedef uint64_t ceph_tid_t; // transaction id
+typedef uint64_t version_t;
+typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
+
+// --------------------------------------
+// identify individual mount clients by 64bit value
+
// 64-bit identifier for an individual mount client.
// NOTE(review): the default of -2 appears to mean "unset/anonymous" —
// inferred from the default argument only; confirm against callers.
struct client_t {
  int64_t v;

  // cppcheck-suppress noExplicitConstructor
  client_t(int64_t _v = -2) : v(_v) {}

  // Wire format is just the raw 64-bit value.
  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    encode(v, bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    decode(v, bl);
  }
};
WRITE_CLASS_ENCODER(client_t)
+
// client_t equality/ordering compares the raw 64-bit id.
static inline bool operator==(const client_t& l, const client_t& r) { return l.v == r.v; }
static inline bool operator!=(const client_t& l, const client_t& r) { return l.v != r.v; }
static inline bool operator<(const client_t& l, const client_t& r) { return l.v < r.v; }
static inline bool operator<=(const client_t& l, const client_t& r) { return l.v <= r.v; }
static inline bool operator>(const client_t& l, const client_t& r) { return l.v > r.v; }
static inline bool operator>=(const client_t& l, const client_t& r) { return l.v >= r.v; }

// Mixed comparisons against a bare numeric id.
static inline bool operator>=(const client_t& l, int64_t o) { return l.v >= o; }
static inline bool operator<(const client_t& l, int64_t o) { return l.v < o; }

// Print a client as its numeric id.
inline std::ostream& operator<<(std::ostream& out, const client_t& c) {
  return out << c.v;
}
+
+
+
+// --
+
// NOTE(review): anonymous namespace in a header gives every including
// TU its own copy of format_u; presumably intentional here — confirm.
namespace {
// Render `v` scaled down to `n` with the unit suffix `u` appended.
//   v     original (unscaled) value
//   n     v divided down by `mult`
//   index magnitude index; 0 means no scaling was applied
//   mult  divisor that produced `n` from `v`
//   u     unit suffix, e.g. "k" or " MiB"
// Fixed: the format macro was PRId64 (signed) while `n` is uint64_t;
// use PRIu64 so the varargs type matches the conversion.
inline std::ostream& format_u(std::ostream& out, const uint64_t v, const uint64_t n,
			      const int index, const uint64_t mult, const char* u)
  {
    char buffer[32];

    if (index == 0) {
      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
    } else if ((v % mult) == 0) {
      // If this is an even multiple of the base, always display
      // without any decimal fraction.
      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
    } else {
      // We want to choose a precision that reflects the best choice
      // for fitting in 5 characters. This can get rather tricky when
      // we have numbers that are very close to an order of magnitude.
      // For example, when displaying 10239 (which is really 9.999K),
      // we want only a single place of precision for 10.0K. We could
      // develop some complex heuristics for this, but it's much
      // easier just to try each combination in turn.
      int i;
      for (i = 2; i >= 0; i--) {
	if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
	  static_cast<double>(v) / mult, u) <= 7)
	  break;
      }
    }

    return out << buffer;
  }
}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * decimal unit prefix (the classic SI units). No actual unit will be added.
+ */
+struct si_u_t {
+ uint64_t v;
+ explicit si_u_t(uint64_t _v) : v(_v) {};
+};
+
+inline std::ostream& operator<<(std::ostream& out, const si_u_t& b)
+{
+ uint64_t n = b.v;
+ int index = 0;
+ uint64_t mult = 1;
+ const char* u[] = {"", "k", "M", "G", "T", "P", "E"};
+
+ while (n >= 1000 && index < 7) {
+ n /= 1000;
+ index++;
+ mult *= 1000;
+ }
+
+ return format_u(out, b.v, n, index, mult, u[index]);
+}
+
+/*
+ * Use this struct to pretty print values that should be formatted with a
+ * binary unit prefix (IEC units). Since binary unit prefixes are to be used for
+ * "multiples of units in data processing, data transmission, and digital
+ * information" (so bits and bytes) and so far bits are not printed, the unit
+ * "B" for "byte" is added besides the multiplier.
+ */
+struct byte_u_t {
+ uint64_t v;
+ explicit byte_u_t(uint64_t _v) : v(_v) {};
+};
+
+inline std::ostream& operator<<(std::ostream& out, const byte_u_t& b)
+{
+ uint64_t n = b.v;
+ int index = 0;
+ const char* u[] = {" B", " KiB", " MiB", " GiB", " TiB", " PiB", " EiB"};
+
+ while (n >= 1024 && index < 7) {
+ n /= 1024;
+ index++;
+ }
+
+ return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]);
+}
+
// Render a mon subscription item as "<start>" (one-shot) or "<start>+"
// (ongoing); the trailing '+' marks a non-ONETIME subscription.
inline std::ostream& operator<<(std::ostream& out, const ceph_mon_subscribe_item& i)
{
  return out << (long)i.start
	     << ((i.flags & CEPH_SUBSCRIBE_ONETIME) ? "" : "+");
}
+
// Wrapper to pretty-print a float weight value.
struct weightf_t {
  float v;
  // cppcheck-suppress noExplicitConstructor
  weightf_t(float _v) : v(_v) {}
};

// Prints "-" for clearly-negative weights, "0" for (near-)zero, and
// otherwise the value with a fixed 5-digit fraction.
inline std::ostream& operator<<(std::ostream& out, const weightf_t& w)
{
  if (w.v < -0.01F) {
    return out << "-";
  } else if (w.v < 0.000001F) {
    return out << "0";
  } else {
    // Fixed: restore BOTH precision and format flags afterwards; the
    // original restored precision but leaked std::fixed onto the
    // caller's stream, changing how later floats were printed.
    const std::streamsize p = out.precision();
    const std::ios_base::fmtflags f = out.flags();
    out << std::fixed << std::setprecision(5) << w.v;
    out.flags(f);
    out.precision(p);
    return out;
  }
}
+
// Small (8-bit) shard identifier with a NO_SHARD sentinel whose value
// is defined out of line.
struct shard_id_t {
  int8_t id;

  shard_id_t() : id(0) {}
  explicit shard_id_t(int8_t _id) : id(_id) {}

  // Implicit narrowing back to the raw id for arithmetic/indexing use.
  operator int8_t() const { return id; }

  // Sentinel meaning "no particular shard" (defined elsewhere).
  const static shard_id_t NO_SHARD;

  // Wire format is the raw signed byte.
  void encode(ceph::buffer::list &bl) const {
    using ceph::encode;
    encode(id, bl);
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
    using ceph::decode;
    decode(id, bl);
  }

  bool operator==(const shard_id_t&) const = default;
  auto operator<=>(const shard_id_t&) const = default;
};
WRITE_CLASS_ENCODER(shard_id_t)
std::ostream &operator<<(std::ostream &lhs, const shard_id_t &rhs);
+
+#if defined(__sun) || defined(_AIX) || defined(__APPLE__) || \
+ defined(__FreeBSD__) || defined(_WIN32)
+extern "C" {
+__s32 ceph_to_hostos_errno(__s32 e);
+__s32 hostos_to_ceph_errno(__s32 e);
+}
+#else
+#define ceph_to_hostos_errno(e) (e)
+#define hostos_to_ceph_errno(e) (e)
+#endif
+
// 32-bit error code translated between the host OS errno space and
// ceph's wire errno space on encode/decode, so codes stay portable
// across platforms (a no-op on Linux, per the macros above).
struct errorcode32_t {
  int32_t code;

  errorcode32_t() : code(0) {}
  // cppcheck-suppress noExplicitConstructor
  explicit errorcode32_t(int32_t i) : code(i) {}

  operator int() const { return code; }
  // NOTE(review): overloading unary & is surprising — taking the
  // address of an errorcode32_t yields &code, not the object address.
  int* operator&() { return &code; }
  errorcode32_t& operator=(int32_t i) {
    code = i;
    return *this;
  }
  bool operator==(const errorcode32_t&) const = default;
  auto operator<=>(const errorcode32_t&) const = default;

  // Encode in ceph's errno space; decode translates back to host errnos.
  void encode(ceph::buffer::list &bl) const {
    using ceph::encode;
    __s32 newcode = hostos_to_ceph_errno(code);
    encode(newcode, bl);
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
    using ceph::decode;
    decode(code, bl);
    code = ceph_to_hostos_errno(code);
  }
};
WRITE_CLASS_ENCODER(errorcode32_t)
+
+template <uint8_t S>
+struct sha_digest_t {
+ constexpr static uint32_t SIZE = S;
+ // TODO: we might consider std::array in the future. Avoiding it for now
+ // as sha_digest_t is a part of our public API.
+ unsigned char v[S] = {0};
+
+ std::string to_str() const {
+ char str[S * 2 + 1] = {0};
+ str[0] = '\0';
+ for (size_t i = 0; i < S; i++) {
+ ::sprintf(&str[i * 2], "%02x", static_cast<int>(v[i]));
+ }
+ return std::string(str);
+ }
+ sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); };
+ sha_digest_t() {}
+
+ bool operator==(const sha_digest_t& r) const {
+ return ::memcmp(v, r.v, SIZE) == 0;
+ }
+ bool operator!=(const sha_digest_t& r) const {
+ return ::memcmp(v, r.v, SIZE) != 0;
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ // copy to avoid reinterpret_cast, is_pod and other nasty things
+ using ceph::encode;
+ std::array<unsigned char, SIZE> tmparr;
+ memcpy(tmparr.data(), v, SIZE);
+ encode(tmparr, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ std::array<unsigned char, SIZE> tmparr;
+ decode(tmparr, bl);
+ memcpy(v, tmparr.data(), SIZE);
+ }
+};
+
+template<uint8_t S>
+inline std::ostream &operator<<(std::ostream &out, const sha_digest_t<S> &b) {
+ std::string str = b.to_str();
+ return out << str;
+}
+
+#if FMT_VERSION >= 90000
+template <uint8_t S> struct fmt::formatter<sha_digest_t<S>> : fmt::ostream_formatter {};
+#endif
+
+using sha1_digest_t = sha_digest_t<20>;
+WRITE_CLASS_ENCODER(sha1_digest_t)
+
+using sha256_digest_t = sha_digest_t<32>;
+WRITE_CLASS_ENCODER(sha256_digest_t)
+
+using sha512_digest_t = sha_digest_t<64>;
+
+using md5_digest_t = sha_digest_t<16>;
+WRITE_CLASS_ENCODER(md5_digest_t)
+
+
+#endif
diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h
new file mode 100644
index 000000000..aee5f5a76
--- /dev/null
+++ b/src/include/unordered_map.h
@@ -0,0 +1,11 @@
+#ifndef CEPH_UNORDERED_MAP_H
+#define CEPH_UNORDERED_MAP_H
+
+#include <unordered_map>
+
// Re-export the standard hash-map containers under the ceph namespace,
// so ceph code spells them ceph::unordered_map / ceph::unordered_multimap.
namespace ceph {
  using std::unordered_map;
  using std::unordered_multimap;
}
+
+#endif
diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h
new file mode 100644
index 000000000..e30e1799e
--- /dev/null
+++ b/src/include/unordered_set.h
@@ -0,0 +1,10 @@
+#ifndef CEPH_UNORDERED_SET_H
+#define CEPH_UNORDERED_SET_H
+
+#include <unordered_set>
+
// Re-export std::unordered_set under the ceph namespace, matching the
// companion unordered_map aliases.
namespace ceph {
  using std::unordered_set;
}
+
+#endif
diff --git a/src/include/uses_allocator.h b/src/include/uses_allocator.h
new file mode 100644
index 000000000..35cdbd709
--- /dev/null
+++ b/src/include/uses_allocator.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// Derived from:
+/* uses_allocator.h -*-C++-*-
+ *
+ * Copyright (C) 2016 Pablo Halpern <phalpern@halpernwightsoftware.com>
+ * Distributed under the Boost Software License - Version 1.0
+ */
+// Downloaded from https://github.com/phalpern/uses-allocator.git
+
+#pragma once
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace ceph {
+
namespace internal {
// Expand the tuple elements selected by Indexes into T's constructor.
template <class T, class Tuple, std::size_t... Indexes>
T make_from_tuple_imp(Tuple&& t, std::index_sequence<Indexes...>)
{
  return T(std::get<Indexes>(std::forward<Tuple>(t))...);
}
} // namespace internal

// Build a T from the elements of args_tuple (pre-C++17 equivalent of
// std::make_from_tuple).
template<class T, class Tuple>
T make_from_tuple(Tuple&& args_tuple)
{
  constexpr auto arity = std::tuple_size_v<std::decay_t<Tuple>>;
  return internal::make_from_tuple_imp<T>(std::forward<Tuple>(args_tuple),
                                          std::make_index_sequence<arity>{});
}
+
+////////////////////////////////////////////////////////////////////////
+
+// Forward declaration
+template <class T, class Alloc, class... Args>
+auto uses_allocator_construction_args(const Alloc& a, Args&&... args);
+
namespace internal {

// Trait: does T take an allocator of type A in its constructors?
template <class T, class A>
struct has_allocator : std::uses_allocator<T, A> { };

// Specialization of `has_allocator` for `std::pair`: a pair is
// allocator-aware if either of its element types is.
template <class T1, class T2, class A>
struct has_allocator<std::pair<T1, T2>, A>
  : std::integral_constant<bool, has_allocator<T1, A>::value ||
                                 has_allocator<T2, A>::value>
{
};

template <bool V> using boolean_constant = std::integral_constant<bool, V>;

// Trait: is T a specialization of std::pair?
template <class T> struct is_pair : std::false_type { };

template <class T1, class T2>
struct is_pair<std::pair<T1, T2>> : std::true_type { };

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles types for which `has_allocator<T, Alloc>` is false.
template <class T, class Unused1, class Unused2, class Alloc, class... Args>
auto uses_allocator_args_imp(Unused1 /* is_pair */,
                             std::false_type /* has_allocator */,
                             Unused2 /* uses prefix allocator arg */,
                             const Alloc& /* ignored */,
                             Args&&... args)
{
  // Allocator is ignored
  return std::forward_as_tuple(std::forward<Args>(args)...);
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is
// true and constructor `T(allocator_arg_t, a, args...)` is valid.
template <class T, class Alloc, class... Args>
auto uses_allocator_args_imp(std::false_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::true_type /* uses prefix allocator arg */,
                             const Alloc& a,
                             Args&&... args)
{
  // Allocator added to front of argument list, after `allocator_arg`.
  return std::tuple<std::allocator_arg_t, const Alloc&,
                    Args&&...>(std::allocator_arg, a, std::forward<Args>(args)...);
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is
// true and constructor `T(allocator_arg_t, a, args...)` NOT valid.
// This function will produce invalid results unless `T(args..., a)` is valid.
// (NOTE(review): the element-type parameter is named T1 here but plays
// the same role as T in the other overloads.)
template <class T1, class Alloc, class... Args>
auto uses_allocator_args_imp(std::false_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a,
                             Args&&... args)
{
  // Allocator added to end of argument list
  return std::forward_as_tuple(std::forward<Args>(args)..., a);
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles specializations of `T` = `std::pair` for which
// `has_allocator<T, Alloc>` is true for either or both of the elements and
// piecewise_construct arguments are passed in.
template <class T, class Alloc, class Tuple1, class Tuple2>
auto uses_allocator_args_imp(std::true_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a,
                             std::piecewise_construct_t,
                             Tuple1&& x, Tuple2&& y)
{
  using T1 = typename T::first_type;
  using T2 = typename T::second_type;

  // Recurse into each element's argument tuple so the allocator is
  // injected on whichever side needs it.
  return std::make_tuple(
    std::piecewise_construct,
    std::apply([&a](auto&&... args1) -> auto {
        return uses_allocator_construction_args<T1>(
          a, std::forward<decltype(args1)>(args1)...);
      }, std::forward<Tuple1>(x)),
    std::apply([&a](auto&&... args2) -> auto {
        return uses_allocator_construction_args<T2>(
          a, std::forward<decltype(args2)>(args2)...);
      }, std::forward<Tuple2>(y))
  );
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles specializations of `T` = `std::pair` for which
// `has_allocator<T, Alloc>` is true for either or both of the elements and
// no other constructor arguments are passed in.
template <class T, class Alloc>
auto uses_allocator_args_imp(std::true_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a)
{
  // using T1 = typename T::first_type;
  // using T2 = typename T::second_type;

  // return std::make_tuple(
  //   piecewise_construct,
  //   uses_allocator_construction_args<T1>(a),
  //   uses_allocator_construction_args<T2>(a));

  // Delegate to the piecewise overload with two empty tuples.
  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
                                             std::tuple<>{}, std::tuple<>{});
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles specializations of `T` = `std::pair` for which
// `has_allocator<T, Alloc>` is true for either or both of the elements and
// a single argument of type const-lvalue-of-pair is passed in.
template <class T, class Alloc, class U1, class U2>
auto uses_allocator_args_imp(std::true_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a,
                             const std::pair<U1, U2>& arg)
{
  // using T1 = typename T::first_type;
  // using T2 = typename T::second_type;

  // return std::make_tuple(
  //   piecewise_construct,
  //   uses_allocator_construction_args<T1>(a, arg.first),
  //   uses_allocator_construction_args<T2>(a, arg.second));

  // Copy-construct each element piecewise from the source pair.
  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
                                             std::forward_as_tuple(arg.first),
                                             std::forward_as_tuple(arg.second));
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles specializations of `T` = `std::pair` for which
// `has_allocator<T, Alloc>` is true for either or both of the elements and
// a single argument of type rvalue-of-pair is passed in.
template <class T, class Alloc, class U1, class U2>
auto uses_allocator_args_imp(std::true_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a,
                             std::pair<U1, U2>&& arg)
{
  // using T1 = typename T::first_type;
  // using T2 = typename T::second_type;

  // return std::make_tuple(
  //   piecewise_construct,
  //   uses_allocator_construction_args<T1>(a, forward<U1>(arg.first)),
  //   uses_allocator_construction_args<T2>(a, forward<U2>(arg.second)));

  // Move-construct each element piecewise from the source pair.
  return uses_allocator_construction_args<T>(a, std::piecewise_construct,
                                             std::forward_as_tuple(std::forward<U1>(arg.first)),
                                             std::forward_as_tuple(std::forward<U2>(arg.second)));
}

// Return a tuple of arguments appropriate for uses-allocator construction
// with allocator `Alloc` and ctor arguments `Args`.
// This overload handles specializations of `T` = `std::pair` for which
// `has_allocator<T, Alloc>` is true for either or both of the elements and
// two additional constructor arguments are passed in.
template <class T, class Alloc, class U1, class U2>
auto uses_allocator_args_imp(std::true_type /* is_pair */,
                             std::true_type /* has_allocator */,
                             std::false_type /* prefix allocator arg */,
                             const Alloc& a,
                             U1&& arg1, U2&& arg2)
{
  // using T1 = typename T::first_type;
  // using T2 = typename T::second_type;

  // return std::make_tuple(
  //   piecewise_construct,
  //   uses_allocator_construction_args<T1>(a, forward<U1>(arg1)),
  //   uses_allocator_construction_args<T2>(a, forward<U2>(arg2)));

  // Forward one argument to each element, piecewise.
  return uses_allocator_construction_args<T>(
    a, std::piecewise_construct,
    std::forward_as_tuple(std::forward<U1>(arg1)),
    std::forward_as_tuple(std::forward<U2>(arg2)));
}

} // close namespace internal
+
// Entry point: compute the tuple of constructor arguments for
// uses-allocator construction of T from (a, args...). Dispatch is on
// whether T is a pair, whether it is allocator-aware, and whether it
// accepts a leading (allocator_arg_t, Alloc) parameter pair.
template <class T, class Alloc, class... Args>
auto uses_allocator_construction_args(const Alloc& a, Args&&... args)
{
  using namespace internal;
  return uses_allocator_args_imp<T>(is_pair<T>(),
                                    has_allocator<T, Alloc>(),
                                    std::is_constructible<T, std::allocator_arg_t,
                                                          Alloc, Args...>(),
                                    a, std::forward<Args>(args)...);
}
+
// Construct and return a T, injecting the allocator per the
// uses-allocator protocol (analogous to C++20
// std::make_obj_using_allocator).
template <class T, class Alloc, class... Args>
T make_obj_using_allocator(const Alloc& a, Args&&... args)
{
  return make_from_tuple<T>(
    uses_allocator_construction_args<T>(a, std::forward<Args>(args)...));
}
+
// Placement-construct a T at `p` with allocator injection and return
// `p` (analogous to C++20 std::uninitialized_construct_using_allocator).
template <class T, class Alloc, class... Args>
T* uninitialized_construct_using_allocator(T* p,
                                           const Alloc& a,
                                           Args&&... args)
{
  return std::apply([p](auto&&... args2){
      return ::new(static_cast<void*>(p))
        T(std::forward<decltype(args2)>(args2)...);
    }, uses_allocator_construction_args<T>(
      a, std::forward<Args>(args)...));
}
+
+} // namespace ceph
diff --git a/src/include/util.h b/src/include/util.h
new file mode 100644
index 000000000..acad4a52c
--- /dev/null
+++ b/src/include/util.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#ifndef CEPH_UTIL_H
+#define CEPH_UTIL_H
+
+#include "common/Formatter.h"
+#include "include/types.h"
+
+std::string bytes2str(uint64_t count);
+
// Filesystem capacity snapshot, filled in by get_fs_stats() below.
struct ceph_data_stats
{
  uint64_t byte_total;  // total capacity, bytes
  uint64_t byte_used;   // bytes in use
  uint64_t byte_avail;  // bytes still available
  int avail_percent;    // available space as an integer percent
                        // (presumably of byte_total — confirm in
                        // get_fs_stats's implementation)

  ceph_data_stats() :
    byte_total(0),
    byte_used(0),
    byte_avail(0),
    avail_percent(0)
  { }

  // Emit all four fields to the given (non-null) Formatter.
  void dump(ceph::Formatter *f) const {
    ceph_assert(f != NULL);
    f->dump_int("total", byte_total);
    f->dump_int("used", byte_used);
    f->dump_int("avail", byte_avail);
    f->dump_int("avail_percent", avail_percent);
  }

  // Versioned wire encoding (v1).
  void encode(ceph::buffer::list &bl) const {
    ENCODE_START(1, 1, bl);
    encode(byte_total, bl);
    encode(byte_used, bl);
    encode(byte_avail, bl);
    encode(avail_percent, bl);
    ENCODE_FINISH(bl);
  }

  void decode(ceph::buffer::list::const_iterator &p) {
    DECODE_START(1, p);
    decode(byte_total, p);
    decode(byte_used, p);
    decode(byte_avail, p);
    decode(avail_percent, p);
    DECODE_FINISH(p);
  }

  // Samples for encode/decode round-trip tests: one default instance
  // and one half-used 1 MiB filesystem.
  static void generate_test_instances(std::list<ceph_data_stats*>& ls) {
    ls.push_back(new ceph_data_stats);
    ls.push_back(new ceph_data_stats);
    ls.back()->byte_total = 1024*1024;
    ls.back()->byte_used = 512*1024;
    ls.back()->byte_avail = 512*1024;
    ls.back()->avail_percent = 50;
  }
};
typedef struct ceph_data_stats ceph_data_stats_t;
WRITE_CLASS_ENCODER(ceph_data_stats)
+
+int get_fs_stats(ceph_data_stats_t &stats, const char *path);
+
+/// get memory limit for the current cgroup
+int get_cgroup_memory_limit(uint64_t *limit);
+
+/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo
+void collect_sys_info(std::map<std::string, std::string> *m, CephContext *cct);
+
+#ifdef _WIN32
+/// Retrieve the actual Windows version, regardless of the app manifest.
+int get_windows_version(POSVERSIONINFOEXW ver);
+#endif
+
+/// dump service ids grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service id hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(ceph::Formatter* f,
+ const std::map<std::string, std::list<int> >& services,
+ const char* type);
+/// dump service names grouped by their host to the specified formatter
+/// @param f formatter for the output
+/// @param services a map from hostname to a list of service name hosted by this host
+/// @param type the service type of given @p services, for example @p osd or @p mon.
+void dump_services(ceph::Formatter* f, const std::map<std::string,
+ std::list<std::string> >& services, const char* type);
+
+std::string cleanbin(ceph::buffer::list &bl, bool &b64, bool show = false);
+std::string cleanbin(std::string &str);
+
namespace ceph::util {

/// True if `s` compares equal to at least one of the candidates;
/// false when no candidates are given.
template <typename ...XS>
bool match_str(const std::string& s, const XS& ...xs)
{
  bool matched = false;
  ((matched = matched || (s == xs)), ...);
  return matched;
}

} // namespace ceph::util
+#endif /* CEPH_UTIL_H */
diff --git a/src/include/utime.cc b/src/include/utime.cc
new file mode 100644
index 000000000..2252a1ca4
--- /dev/null
+++ b/src/include/utime.cc
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "utime.h"
+#include "common/Formatter.h"
+
+// Emit the raw tv_sec/tv_nsec fields to the formatter under the keys
+// "seconds" and "nanoseconds".
+void utime_t::dump(ceph::Formatter *f) const
+{
+  f->dump_int("seconds", tv.tv_sec);
+  f->dump_int("nanoseconds", tv.tv_nsec);
+}
+
+// Produce sample instances for encode/decode round-trip tests:
+// a zero time, a max-seconds time, and a max-nanoseconds time.
+void utime_t::generate_test_instances(std::list<utime_t*>& o)
+{
+  o.push_back(new utime_t());
+  o.push_back(new utime_t());
+  // was: static_cast<__u32>((1L << 32) - 1) — shifting a 32-bit long
+  // by 32 is undefined behavior on LP32/LLP64 targets; express the
+  // intended value (0xffffffff) directly instead.
+  o.back()->tv.tv_sec = std::numeric_limits<__u32>::max();
+  o.push_back(new utime_t());
+  o.back()->tv.tv_nsec = std::numeric_limits<__u32>::max();
+}
diff --git a/src/include/utime.h b/src/include/utime.h
new file mode 100644
index 000000000..fad66af79
--- /dev/null
+++ b/src/include/utime.h
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_UTIME_H
+#define CEPH_UTIME_H
+
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+
+#if defined(WITH_SEASTAR)
+#include <seastar/core/lowres_clock.hh>
+#endif
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/timegm.h"
+#include "common/strtol.h"
+#include "common/ceph_time.h"
+#include "common/safe_io.h"
+#include "common/SubProcess.h"
+#include "include/denc.h"
+
+
+// --------
+// utime_t
+
+inline __u32 cap_to_u32_max(__u64 t) {
+ return std::min(t, (__u64)std::numeric_limits<uint32_t>::max());
+}
+/* WARNING: If add member in utime_t, please make sure the encode/decode function
+ * work well. For little-endian machine, we should make sure there is no padding
+ * in 32-bit machine and 64-bit machine.
+ * You should also modify the padding_check function.
+ */
+class utime_t {
+public:
+  struct {
+    __u32 tv_sec, tv_nsec;
+  } tv;
+
+ public:
+  bool is_zero() const {
+    return (tv.tv_sec == 0) && (tv.tv_nsec == 0);
+  }
+
+  // Carry whole seconds out of tv_nsec so it ends up in
+  // [0, 999999999].  Uses >= (the original used >) so that exactly
+  // one second's worth of nanoseconds is carried as well, keeping
+  // values canonical for the field-wise equality operators.
+  void normalize() {
+    if (tv.tv_nsec >= 1000000000ul) {
+      tv.tv_sec = cap_to_u32_max(tv.tv_sec + tv.tv_nsec / (1000000000ul));
+      tv.tv_nsec %= 1000000000ul;
+    }
+  }
+
+  // cons
+  utime_t() { tv.tv_sec = 0; tv.tv_nsec = 0; }
+  utime_t(time_t s, int n) { tv.tv_sec = s; tv.tv_nsec = n; normalize(); }
+  utime_t(const struct ceph_timespec &v) {
+    decode_timeval(&v);
+  }
+  utime_t(const struct timespec v)
+  {
+    // NOTE: this is used by ceph_clock_now() so should be kept
+    // as thin as possible.
+    tv.tv_sec = v.tv_sec;
+    tv.tv_nsec = v.tv_nsec;
+  }
+  // conversion from ceph::real_time/coarse_real_time
+  template <typename Clock, typename std::enable_if_t<
+    ceph::converts_to_timespec_v<Clock>>* = nullptr>
+  explicit utime_t(const std::chrono::time_point<Clock>& t)
+    : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor
+
+  // Negative durations clamp to zero in both fields.
+  template<class Rep, class Period>
+  explicit utime_t(const std::chrono::duration<Rep, Period>& dur) {
+    using common_t = std::common_type_t<Rep, int>;
+    tv.tv_sec = std::max<common_t>(std::chrono::duration_cast<std::chrono::seconds>(dur).count(), 0);
+    tv.tv_nsec = std::max<common_t>((std::chrono::duration_cast<std::chrono::nanoseconds>(dur) %
+				     std::chrono::seconds(1)).count(), 0);
+  }
+#if defined(WITH_SEASTAR)
+  explicit utime_t(const seastar::lowres_system_clock::time_point& t) {
+    tv.tv_sec = std::chrono::duration_cast<std::chrono::seconds>(
+      t.time_since_epoch()).count();
+    tv.tv_nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(
+      t.time_since_epoch() % std::chrono::seconds(1)).count();
+  }
+  explicit operator seastar::lowres_system_clock::time_point() const noexcept {
+    using clock_t = seastar::lowres_system_clock;
+    return clock_t::time_point{std::chrono::duration_cast<clock_t::duration>(
+      std::chrono::seconds{tv.tv_sec} + std::chrono::nanoseconds{tv.tv_nsec})};
+  }
+#endif
+
+  utime_t(const struct timeval &v) {
+    set_from_timeval(&v);
+  }
+  utime_t(const struct timeval *v) {
+    set_from_timeval(v);
+  }
+  void to_timespec(struct timespec *ts) const {
+    ts->tv_sec = tv.tv_sec;
+    ts->tv_nsec = tv.tv_nsec;
+  }
+  void set_from_double(double d) {
+    tv.tv_sec = (__u32)trunc(d);
+    tv.tv_nsec = (__u32)((d - (double)tv.tv_sec) * 1000000000.0);
+  }
+
+  ceph::real_time to_real_time() const {
+    ceph_timespec ts;
+    encode_timeval(&ts);
+    return ceph::real_clock::from_ceph_timespec(ts);
+  }
+
+  // accessors
+  time_t sec() const { return tv.tv_sec; }
+  long usec() const { return tv.tv_nsec/1000; }
+  int nsec() const { return tv.tv_nsec; }
+
+  // ref accessors/modifiers
+  __u32& sec_ref() { return tv.tv_sec; }
+  __u32& nsec_ref() { return tv.tv_nsec; }
+
+  uint64_t to_nsec() const {
+    return (uint64_t)tv.tv_nsec + (uint64_t)tv.tv_sec * 1000000000ull;
+  }
+  uint64_t to_msec() const {
+    return (uint64_t)tv.tv_nsec / 1000000ull + (uint64_t)tv.tv_sec * 1000ull;
+  }
+
+  void copy_to_timeval(struct timeval *v) const {
+    v->tv_sec = tv.tv_sec;
+    v->tv_usec = tv.tv_nsec/1000;
+  }
+  void set_from_timeval(const struct timeval *v) {
+    tv.tv_sec = v->tv_sec;
+    tv.tv_nsec = v->tv_usec*1000;
+  }
+  // The raw-memory encode/decode below requires the struct to be
+  // exactly its two 32-bit fields with no padding.
+  void padding_check() {
+    static_assert(
+      sizeof(utime_t) ==
+        sizeof(tv.tv_sec) +
+        sizeof(tv.tv_nsec)
+      ,
+      "utime_t have padding");
+  }
+  void encode(ceph::buffer::list &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+    // on LE hosts the in-memory layout matches the wire format
+    bl.append((char *)(this), sizeof(__u32) + sizeof(__u32));
+#else
+    using ceph::encode;
+    encode(tv.tv_sec, bl);
+    encode(tv.tv_nsec, bl);
+#endif
+  }
+  void decode(ceph::buffer::list::const_iterator &p) {
+#if defined(CEPH_LITTLE_ENDIAN)
+    p.copy(sizeof(__u32) + sizeof(__u32), (char *)(this));
+#else
+    using ceph::decode;
+    decode(tv.tv_sec, p);
+    decode(tv.tv_nsec, p);
+#endif
+  }
+
+  DENC(utime_t, v, p) {
+    denc(v.tv.tv_sec, p);
+    denc(v.tv.tv_nsec, p);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<utime_t*>& o);
+
+  void encode_timeval(struct ceph_timespec *t) const {
+    t->tv_sec = tv.tv_sec;
+    t->tv_nsec = tv.tv_nsec;
+  }
+  void decode_timeval(const struct ceph_timespec *t) {
+    tv.tv_sec = t->tv_sec;
+    tv.tv_nsec = t->tv_nsec;
+  }
+
+  // Rounding helpers truncate toward the start of the local-time
+  // minute/hour/day (note: local time, not UTC).
+  utime_t round_to_minute() {
+    struct tm bdt;
+    time_t tt = sec();
+    localtime_r(&tt, &bdt);
+    bdt.tm_sec = 0;
+    tt = mktime(&bdt);
+    return utime_t(tt, 0);
+  }
+
+  utime_t round_to_hour() {
+    struct tm bdt;
+    time_t tt = sec();
+    localtime_r(&tt, &bdt);
+    bdt.tm_sec = 0;
+    bdt.tm_min = 0;
+    tt = mktime(&bdt);
+    return utime_t(tt, 0);
+  }
+
+  utime_t round_to_day() {
+    struct tm bdt;
+    time_t tt = sec();
+    localtime_r(&tt, &bdt);
+    bdt.tm_sec = 0;
+    bdt.tm_min = 0;
+    bdt.tm_hour = 0;
+    tt = mktime(&bdt);
+    return utime_t(tt, 0);
+  }
+
+  // cast to double
+  operator double() const {
+    return (double)sec() + ((double)nsec() / 1000000000.0f);
+  }
+  operator ceph_timespec() const {
+    ceph_timespec ts;
+    ts.tv_sec = sec();
+    ts.tv_nsec = nsec();
+    return ts;
+  }
+
+  void sleep() const {
+    struct timespec ts;
+    to_timespec(&ts);
+    nanosleep(&ts, NULL);
+  }
+
+  // output
+  std::ostream& gmtime(std::ostream& out, bool legacy_form=false) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');
+    if (sec() < ((time_t)(60*60*24*365*10))) {
+      // raw seconds. this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      // conform to http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // 4-digit year
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday;
+      if (legacy_form) {
+	out << ' ';
+      } else {
+	out << 'T';
+      }
+      out << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(6) << usec();
+      out << "Z";
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // output, nanosecond precision
+  std::ostream& gmtime_nsec(std::ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');
+    if (sec() < ((time_t)(60*60*24*365*10))) {
+      // raw seconds. this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      // conform to http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // 4-digit year
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday
+	  << 'T'
+	  << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(9) << nsec();
+      out << "Z";
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // output, asctime(3) style
+  std::ostream& asctime(std::ostream& out) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');
+    if (sec() < ((time_t)(60*60*24*365*10))) {
+      // raw seconds. this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      struct tm bdt;
+      time_t tt = sec();
+      gmtime_r(&tt, &bdt);
+
+      char buf[128];
+      asctime_r(&bdt, buf);
+      int len = strlen(buf);
+      if (buf[len - 1] == '\n')
+        buf[len - 1] = '\0';
+      out << buf;
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  std::ostream& localtime(std::ostream& out, bool legacy_form=false) const {
+    out.setf(std::ios::right);
+    char oldfill = out.fill();
+    out.fill('0');
+    if (sec() < ((time_t)(60*60*24*365*10))) {
+      // raw seconds. this looks like a relative time.
+      out << (long)sec() << "." << std::setw(6) << usec();
+    } else {
+      // this looks like an absolute time.
+      // conform to http://en.wikipedia.org/wiki/ISO_8601
+      struct tm bdt;
+      time_t tt = sec();
+      localtime_r(&tt, &bdt);
+      out << std::setw(4) << (bdt.tm_year+1900)  // 4-digit year
+	  << '-' << std::setw(2) << (bdt.tm_mon+1)
+	  << '-' << std::setw(2) << bdt.tm_mday;
+      if (legacy_form) {
+	out << ' ';
+      } else {
+	out << 'T';
+      }
+      out << std::setw(2) << bdt.tm_hour
+	  << ':' << std::setw(2) << bdt.tm_min
+	  << ':' << std::setw(2) << bdt.tm_sec;
+      out << "." << std::setw(6) << usec();
+      if (!legacy_form) {
+	char buf[32] = { 0 };
+	strftime(buf, sizeof(buf), "%z", &bdt);
+	out << buf;
+      }
+    }
+    out.fill(oldfill);
+    out.unsetf(std::ios::right);
+    return out;
+  }
+
+  // Shell out to /bin/date to parse an arbitrary date string.
+  // Returns 0 and fills *result on success, <0 on failure.
+  static int invoke_date(const std::string& date_str, utime_t *result) {
+    char buf[256];
+
+    SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE,
+			SubProcess::KEEP);
+    bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL);
+
+    int r = bin_date.spawn();
+    if (r < 0) return r;
+
+    // read at most sizeof(buf)-1 so we can null-terminate below;
+    // safe_read() does not terminate the buffer itself.
+    ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf) - 1);
+
+    r = bin_date.join();
+    if (r || n <= 0) return -EINVAL;
+    buf[n] = '\0';
+
+    uint64_t epoch, nsec;
+    std::istringstream iss(buf);
+
+    iss >> epoch;
+    iss >> nsec;
+
+    *result = utime_t(epoch, nsec);
+
+    return 0;
+  }
+
+
+  // Parse "YYYY-MM-DD[ T]HH:MM:SS[.frac][±zzzz]" or "sec.usec".
+  // epoch/nsec/out_date/out_time are each optional (may be nullptr).
+  // Returns 0 on success, -EINVAL on parse failure.
+  static int parse_date(const std::string& date, uint64_t *epoch, uint64_t *nsec,
+                        std::string *out_date=nullptr,
+			std::string *out_time=nullptr) {
+    struct tm tm;
+    memset(&tm, 0, sizeof(tm));
+
+    if (nsec)
+      *nsec = 0;
+
+    const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm);
+    if (p) {
+      if (*p == ' ' || *p == 'T') {
+	p++;
+	// strptime doesn't understand fractional/decimal seconds, and
+	// it also only takes format chars or literals, so we have to
+	// get creative.  Copy the remaining input into fmt and
+	// overwrite the digit positions with conversion specifiers,
+	// leaving the input's ':' separators in place.
+	char fmt[32] = {0};
+	strncpy(fmt, p, sizeof(fmt) - 1);
+	fmt[0] = '%';
+	fmt[1] = 'H';
+	fmt[2] = ':';
+	fmt[3] = '%';
+	fmt[4] = 'M';
+	fmt[6] = '%';
+	fmt[7] = 'S';
+	const char *subsec = 0;
+	char *q = fmt + 8;
+	if (*q == '.') {
+	  ++q;
+	  subsec = p + 9;  // first digit after "HH:MM:SS."
+	  q = fmt + 9;
+	  while (*q && isdigit(*q)) {
+	    ++q;
+	  }
+	}
+	// look for tz...
+	if (*q == '-' || *q == '+') {
+	  *q = '%';
+	  *(q+1) = 'z';
+	  *(q+2) = 0;
+	}
+	p = strptime(p, fmt, &tm);
+	if (!p) {
+	  return -EINVAL;
+	}
+        if (nsec && subsec) {
+          unsigned i;
+          char buf[10]; /* 9 digit + null termination */
+	  // right-pad the fractional digits with zeros to nanoseconds
+          for (i = 0; (i < sizeof(buf) - 1) && isdigit(*subsec); ++i, ++subsec) {
+            buf[i] = *subsec;
+          }
+          for (; i < sizeof(buf) - 1; ++i) {
+            buf[i] = '0';
+          }
+          buf[i] = '\0';
+	  std::string err;
+          *nsec = (uint64_t)strict_strtol(buf, 10, &err);
+          if (!err.empty()) {
+            return -EINVAL;
+          }
+        }
+      }
+    } else {
+      // fall back to "seconds.microseconds"
+      int sec, usec;
+      int r = sscanf(date.c_str(), "%d.%d", &sec, &usec);
+      if (r != 2) {
+        return -EINVAL;
+      }
+
+      time_t tt = sec;
+      gmtime_r(&tt, &tm);
+
+      if (nsec) {
+        *nsec = (uint64_t)usec * 1000;
+      }
+    }
+
+  #ifndef _WIN32
+    // apply the tm_gmtoff manually below, since none of mktime,
+    // gmtime, and localtime seem to do it. zero it out here just in
+    // case some other libc *does* apply it. :(
+    auto gmtoff = tm.tm_gmtoff;
+    tm.tm_gmtoff = 0;
+  #else
+    auto gmtoff = _timezone;
+  #endif /* _WIN32 */
+
+    time_t t = internal_timegm(&tm);
+    // only write through epoch when the caller asked for it; the old
+    // code applied gmtoff through the pointer unconditionally and
+    // would crash on a nullptr epoch.
+    if (epoch)
+      *epoch = (uint64_t)t - gmtoff;
+
+    if (out_date) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%Y-%m-%d", &tm);
+      *out_date = buf;
+    }
+    if (out_time) {
+      char buf[32];
+      strftime(buf, sizeof(buf), "%H:%M:%S", &tm);
+      *out_time = buf;
+    }
+
+    return 0;
+  }
+
+  bool parse(const std::string& s) {
+    uint64_t epoch, nsec;
+    int r = parse_date(s, &epoch, &nsec);
+    if (r < 0) {
+      return false;
+    }
+    *this = utime_t(epoch, nsec);
+    return true;
+  }
+};
+WRITE_CLASS_ENCODER(utime_t)
+WRITE_CLASS_DENC(utime_t)
+
+// arithmetic operators
+// Sum of two utime_t values.  Seconds are widened to 64 bits before
+// adding so cap_to_u32_max() sees the true sum and can saturate; the
+// utime_t constructor normalizes any nanosecond carry.
+inline utime_t operator+(const utime_t& l, const utime_t& r) {
+  const __u64 sec_sum = (__u64)l.sec() + (__u64)r.sec();
+  const int nsec_sum = l.nsec() + r.nsec();
+  return utime_t(cap_to_u32_max(sec_sum), nsec_sum);
+}
+// Add r to l in place; seconds saturate at the 32-bit maximum and
+// normalize() carries nanosecond overflow into the seconds field.
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  l.sec_ref() = cap_to_u32_max((__u64)l.sec() + r.sec());
+  l.nsec_ref() += r.nsec();
+  l.normalize();
+  return l;
+}
+// Add a floating-point number of seconds to l in place.  The integer
+// part saturates at the 32-bit maximum; the fractional part is added
+// as nanoseconds and folded in by normalize().
+inline utime_t& operator+=(utime_t& l, double f) {
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;
+  l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)fs);
+  l.nsec_ref() += (long)ns;
+  l.normalize();
+  return l;
+}
+
+// Difference of two utime_t values, borrowing one second when r has
+// more nanoseconds than l.  NOTE(review): the fields are unsigned, so
+// the subtraction wraps when l < r — callers are expected to pass
+// l >= r.
+inline utime_t operator-(const utime_t& l, const utime_t& r) {
+  return utime_t( l.sec() - r.sec() - (l.nsec()<r.nsec() ? 1:0),
+		  l.nsec() - r.nsec() + (l.nsec()<r.nsec() ? 1000000000:0) );
+}
+// Subtract r from l in place, borrowing from the seconds field when
+// the nanosecond subtraction would underflow.
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  l.sec_ref() -= r.sec();
+  if (l.nsec() < r.nsec()) {
+    l.sec_ref()--;
+    l.nsec_ref() += 1000000000L - r.nsec();
+  } else {
+    l.nsec_ref() -= r.nsec();
+  }
+  return l;
+}
+// Subtract a floating-point number of seconds from l in place.
+inline utime_t& operator-=(utime_t& l, double f) {
+  double fs = trunc(f);
+  double ns = (f - fs) * 1000000000.0;
+  l.sec_ref() -= (long)fs;
+  long nsl = (long)ns;
+  if (nsl) {
+    // borrow a second up front, subtract the fractional nanoseconds,
+    // and let normalize() carry back anything that reached a second
+    l.sec_ref()--;
+    l.nsec_ref() = 1000000000L + l.nsec_ref() - nsl;
+  }
+  l.normalize();
+  return l;
+}
+
+
+// comparators
+// Ordering comparators: seconds compare first, nanoseconds break
+// ties.  <= and >= are derived by negating > and <.
+inline bool operator>(const utime_t& a, const utime_t& b)
+{
+  return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.nsec() > b.nsec());
+}
+inline bool operator<=(const utime_t& a, const utime_t& b)
+{
+  return !(operator>(a, b));
+}
+inline bool operator<(const utime_t& a, const utime_t& b)
+{
+  return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.nsec() < b.nsec());
+}
+inline bool operator>=(const utime_t& a, const utime_t& b)
+{
+  return !(operator<(a, b));
+}
+
+// Field-wise (in)equality.  Values must be normalized for this to be
+// meaningful: 1s+0ns and 0s+1000000000ns would compare unequal.
+inline bool operator==(const utime_t& a, const utime_t& b)
+{
+  return a.sec() == b.sec() && a.nsec() == b.nsec();
+}
+inline bool operator!=(const utime_t& a, const utime_t& b)
+{
+  return a.sec() != b.sec() || a.nsec() != b.nsec();
+}
+
+
+// output
+
+// ostream
+// Default stream output renders the time via utime_t::localtime()
+// (local-time ISO-8601 style for absolute times, raw seconds for
+// small/relative values).
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  return t.localtime(out);
+}
+
+// Render a utime_t interpreted as a duration, delegating to the
+// common ceph::timespan_str() helper.
+inline std::string utimespan_str(const utime_t& age) {
+  auto age_ts = ceph::timespan(age.nsec()) + std::chrono::seconds(age.sec());
+  return ceph::timespan_str(age_ts);
+}
+
+#endif
diff --git a/src/include/utime_fmt.h b/src/include/utime_fmt.h
new file mode 100644
index 000000000..e7a98d209
--- /dev/null
+++ b/src/include/utime_fmt.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+/**
+ * \file fmtlib formatter for utime_t
+ */
+#include <fmt/chrono.h>
+#include <fmt/format.h>
+
+#include "include/utime.h"
+
+/// fmtlib formatter for utime_t.  "{}" gives microsecond precision
+/// with a UTC offset; "{:s}" selects the short format (millisecond
+/// precision, no offset).
+template <>
+struct fmt::formatter<utime_t> {
+  // Consume an optional 's' presentation spec to enable short_format.
+  template <typename ParseContext>
+  constexpr auto parse(ParseContext& ctx)
+  {
+    auto it = ctx.begin();
+    if (it != ctx.end() && *it == 's') {
+      short_format = true;
+      ++it;
+    }
+    return it;
+  }
+
+  template <typename FormatContext>
+  auto format(const utime_t& utime, FormatContext& ctx)
+  {
+    // mirrors utime_t::localtime(): values below ~10 years are
+    // treated as relative times and printed as raw seconds
+    if (utime.sec() < ((time_t)(60 * 60 * 24 * 365 * 10))) {
+      // raw seconds. this looks like a relative time.
+      return fmt::format_to(ctx.out(), "{}.{:06}", (long)utime.sec(),
+			    utime.usec());
+    }
+
+    // this looks like an absolute time.
+    // conform to http://en.wikipedia.org/wiki/ISO_8601
+    // (unless short_format is set)
+    auto aslocal = fmt::localtime(utime.sec());
+    if (short_format) {
+      return fmt::format_to(ctx.out(), "{:%FT%T}.{:03}", aslocal,
+			    utime.usec() / 1000);
+    }
+    return fmt::format_to(ctx.out(), "{:%FT%T}.{:06}{:%z}", aslocal,
+			  utime.usec(), aslocal);
+  }
+
+  // selected by the 's' presentation spec in parse()
+  bool short_format{false};
+};
diff --git a/src/include/uuid.cc b/src/include/uuid.cc
new file mode 100644
index 000000000..106fc1db5
--- /dev/null
+++ b/src/include/uuid.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "uuid.h"
+#include "common/Formatter.h"
+
+// Emit the canonical string form of the uuid under the key "uuid".
+void uuid_d::dump(ceph::Formatter *f) const
+{
+  f->dump_stream("uuid") << to_string();
+}
+
+// Produce sample instances for encode/decode round-trip tests,
+// exercising the narrow/wide char and braced/plain input forms that
+// boost's string_generator accepts.
+void uuid_d::generate_test_instances(std::list<uuid_d*>& o)
+{
+  // these are sourced from examples at
+  // https://www.boost.org/doc/libs/1_62_0/libs/uuid/uuid.html#Synopsis_generators
+  boost::uuids::string_generator gen;
+  o.push_back(new uuid_d());
+  o.back()->uuid = gen("{01234567-89ab-cdef-0123-456789abcdef}");
+  o.push_back(new uuid_d());
+  o.back()->uuid = gen(L"01234567-89ab-cdef-0123-456789abcdef");
+  o.push_back(new uuid_d());
+  o.back()->uuid = gen(std::string("0123456789abcdef0123456789abcdef"));
+  o.push_back(new uuid_d());
+  o.back()->uuid = gen(std::wstring(L"01234567-89ab-cdef-0123-456789abcdef"));
+}
diff --git a/src/include/uuid.h b/src/include/uuid.h
new file mode 100644
index 000000000..f6ef9878d
--- /dev/null
+++ b/src/include/uuid.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef _CEPH_UUID_H
+#define _CEPH_UUID_H
+
+/*
+ * Thin C++ wrapper around libuuid.
+ */
+
+#include "encoding.h"
+#include "random.h"
+
+#include <ostream>
+#include <random>
+
+#include <boost/uuid/uuid.hpp>
+#include <boost/uuid/uuid_generators.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+namespace ceph {
+ class Formatter;
+}
+
+// Thin value-type wrapper around a boost uuid with ceph
+// encode/decode support.
+struct uuid_d {
+  boost::uuids::uuid uuid;
+
+  // default-constructs to the nil (all-zero) uuid
+  uuid_d() {
+    boost::uuids::nil_generator gen;
+    uuid = gen();
+  }
+
+  bool is_zero() const {
+    return uuid.is_nil();
+  }
+
+  void generate_random() {
+    random_device_t rng;
+    boost::uuids::basic_random_generator gen(rng);
+    uuid = gen();
+  }
+
+  // Parse a uuid string; returns false (leaving uuid in whatever
+  // state the failed generator left it) if s is not a valid uuid.
+  bool parse(const char *s) {
+    try {
+      boost::uuids::string_generator gen;
+      uuid = gen(s);
+      return true;
+    } catch (std::runtime_error& e) {
+      return false;
+    }
+  }
+  // Write the 36-character canonical form plus a NUL terminator;
+  // the destination buffer must hold at least 37 bytes.
+  void print(char *s) const {
+    memcpy(s, boost::uuids::to_string(uuid).c_str(), 37);
+  }
+
+  std::string to_string() const {
+    return boost::uuids::to_string(uuid);
+  }
+
+  // raw bytes of the uuid (non-owning view into this object)
+  const char *bytes() const {
+    return (const char*)uuid.data;
+  }
+
+  void encode(::ceph::buffer::list::contiguous_appender& p) const {
+    p.append(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
+  }
+
+  void bound_encode(size_t& p) const {
+    p += sizeof(uuid);
+  }
+
+  // Raw memcpy over the whole object — assumes uuid_d contains
+  // exactly the uuid bytes with no padding (see the matching raw
+  // append in encode()).
+  void decode(::ceph::buffer::ptr::const_iterator& p) {
+    assert((p.get_end() - p.get_pos()) >= (int)sizeof(*this));
+    memcpy((char *)this, p.get_pos_add(sizeof(*this)), sizeof(*this));
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<uuid_d*>& o);
+};
+WRITE_CLASS_DENC_BOUNDED(uuid_d)
+
+// Stream the canonical string form; 37 bytes covers the 36-char
+// representation plus the NUL that print() writes.
+inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
+  char b[37];
+  u.print(b);
+  return out << b;
+}
+
+// (In)equality delegates to boost's byte-wise uuid comparison.
+inline bool operator==(const uuid_d& l, const uuid_d& r) {
+  return l.uuid == r.uuid;
+}
+inline bool operator!=(const uuid_d& l, const uuid_d& r) {
+  return l.uuid != r.uuid;
+}
+// Ordering compares the raw 16-byte uuids directly.  boost's
+// operator< is a lexicographic byte compare, which orders identically
+// to comparing the canonical hex string forms (fixed-position
+// lowercase hex digits) but avoids allocating two temporary strings
+// per comparison.
+inline bool operator<(const uuid_d& l, const uuid_d& r) {
+  return l.uuid < r.uuid;
+}
+inline bool operator>(const uuid_d& l, const uuid_d& r) {
+  return r.uuid < l.uuid;
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<uuid_d> : fmt::ostream_formatter {};
+#endif
+
+#endif
diff --git a/src/include/win32/arpa/inet.h b/src/include/win32/arpa/inet.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/arpa/inet.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/dlfcn.h b/src/include/win32/dlfcn.h
new file mode 100644
index 000000000..32e51f16f
--- /dev/null
+++ b/src/include/win32/dlfcn.h
@@ -0,0 +1 @@
+#include "../dlfcn_compat.h"
diff --git a/src/include/win32/fs_compat.h b/src/include/win32/fs_compat.h
new file mode 100644
index 000000000..deeedf071
--- /dev/null
+++ b/src/include/win32/fs_compat.h
@@ -0,0 +1,47 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// Those definitions allow handling information coming from Ceph and should
+// not be passed to Windows functions.
+
+#pragma once
+
+#define S_IFLNK 0120000
+
+#define S_ISTYPE(m, TYPE) ((m & S_IFMT) == TYPE)
+#define S_ISLNK(m) S_ISTYPE(m, S_IFLNK)
+#define S_ISUID 04000
+#define S_ISGID 02000
+#define S_ISVTX 01000
+
+#define LOCK_SH 1
+#define LOCK_EX 2
+#define LOCK_NB 4
+#define LOCK_UN 8
+#define LOCK_MAND 32
+#define LOCK_READ 64
+#define LOCK_WRITE 128
+#define LOCK_RW 192
+
+#define AT_SYMLINK_NOFOLLOW 0x100
+#define AT_REMOVEDIR 0x200
+
+#define MAXSYMLINKS 65000
+
+#define O_DIRECTORY 0200000
+#define O_NOFOLLOW 0400000
+
+#define XATTR_CREATE 1
+#define XATTR_REPLACE 2
+
+typedef unsigned int uid_t;
+typedef unsigned int gid_t;
diff --git a/src/include/win32/ifaddrs.h b/src/include/win32/ifaddrs.h
new file mode 100644
index 000000000..45e1a362c
--- /dev/null
+++ b/src/include/win32/ifaddrs.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2002-2016 Free Software Foundation, Inc.
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef IFADDRS_H
+#define IFADDRS_H
+
+#include "winsock_compat.h"
+#include <ifdef.h>
+
+struct ifaddrs {
+ struct ifaddrs *ifa_next; /* Next item in list */
+ char *ifa_name; /* Name of interface */
+ unsigned int ifa_flags; /* Flags from SIOCGIFFLAGS */
+ struct sockaddr *ifa_addr; /* Address of interface */
+ struct sockaddr *ifa_netmask; /* Netmask of interface */
+
+ struct sockaddr_storage in_addrs;
+ struct sockaddr_storage in_netmasks;
+
+ char ad_name[IF_MAX_STRING_SIZE];
+ size_t speed;
+};
+
+int getifaddrs(struct ifaddrs **ifap);
+void freeifaddrs(struct ifaddrs *ifa);
+
+#endif
diff --git a/src/include/win32/netdb.h b/src/include/win32/netdb.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/netdb.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/netinet/in.h b/src/include/win32/netinet/in.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/netinet/in.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/netinet/ip.h b/src/include/win32/netinet/ip.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/netinet/ip.h
diff --git a/src/include/win32/netinet/tcp.h b/src/include/win32/netinet/tcp.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/netinet/tcp.h
diff --git a/src/include/win32/poll.h b/src/include/win32/poll.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/poll.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/sys/errno.h b/src/include/win32/sys/errno.h
new file mode 100644
index 000000000..339f4fc10
--- /dev/null
+++ b/src/include/win32/sys/errno.h
@@ -0,0 +1 @@
+#include <errno.h>
diff --git a/src/include/win32/sys/select.h b/src/include/win32/sys/select.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/include/win32/sys/select.h
diff --git a/src/include/win32/sys/socket.h b/src/include/win32/sys/socket.h
new file mode 100644
index 000000000..44983f03f
--- /dev/null
+++ b/src/include/win32/sys/socket.h
@@ -0,0 +1 @@
+#include "winsock_compat.h"
diff --git a/src/include/win32/sys/statvfs.h b/src/include/win32/sys/statvfs.h
new file mode 100644
index 000000000..73a892b88
--- /dev/null
+++ b/src/include/win32/sys/statvfs.h
@@ -0,0 +1,36 @@
+#ifndef _SYS_STATVFS_H
+#define _SYS_STATVFS_H 1
+
+typedef unsigned __int64 fsfilcnt64_t;
+typedef unsigned __int64 fsblkcnt64_t;
+typedef unsigned __int64 fsblkcnt_t;
+
+struct statvfs
+{
+ unsigned long int f_bsize;
+ unsigned long int f_frsize;
+ fsblkcnt64_t f_blocks;
+ fsblkcnt64_t f_bfree;
+ fsblkcnt64_t f_bavail;
+ fsfilcnt64_t f_files;
+ fsfilcnt64_t f_ffree;
+ fsfilcnt64_t f_favail;
+ unsigned long int f_fsid;
+ unsigned long int f_flag;
+ unsigned long int f_namemax;
+ int __f_spare[6];
+};
+struct flock {
+ short l_type;
+ short l_whence;
+ off_t l_start;
+ off_t l_len;
+ pid_t l_pid;
+};
+
+#define F_RDLCK 0
+#define F_WRLCK 1
+#define F_UNLCK 2
+#define F_SETLK 6
+
+#endif /* _SYS_STATVFS_H */
diff --git a/src/include/win32/sys/uio.h b/src/include/win32/sys/uio.h
new file mode 100644
index 000000000..15e95be7f
--- /dev/null
+++ b/src/include/win32/sys/uio.h
@@ -0,0 +1 @@
+#include "include/compat.h"
diff --git a/src/include/win32/sys/un.h b/src/include/win32/sys/un.h
new file mode 100644
index 000000000..d08940b2c
--- /dev/null
+++ b/src/include/win32/sys/un.h
@@ -0,0 +1 @@
+#include "include/win32/winsock_compat.h"
diff --git a/src/include/win32/syslog.h b/src/include/win32/syslog.h
new file mode 100644
index 000000000..28389e0b9
--- /dev/null
+++ b/src/include/win32/syslog.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2013, 2015 Cloudbase Solutions Srl
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.You may obtain
+ * a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef SYSLOG_H
+#define SYSLOG_H 1
+
+#define LOG_EMERG 0 /* system is unusable */
+#define LOG_ALERT 1 /* action must be taken immediately */
+#define LOG_CRIT 2 /* critical conditions */
+#define LOG_ERR 3 /* error conditions */
+#define LOG_WARNING 4 /* warning conditions */
+#define LOG_NOTICE 5 /* normal but significant condition */
+#define LOG_INFO 6 /* informational */
+#define LOG_DEBUG 7 /* debug-level messages */
+
+#define LOG_KERN (0<<3) /* kernel messages */
+#define LOG_USER (1<<3) /* user-level messages */
+#define LOG_MAIL (2<<3) /* mail system */
+#define LOG_DAEMON (3<<3) /* system daemons */
+#define LOG_AUTH (4<<3) /* security/authorization messages */
+#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */
+#define LOG_LPR (6<<3) /* line printer subsystem */
+#define LOG_NEWS (7<<3) /* network news subsystem */
+#define LOG_UUCP (8<<3) /* UUCP subsystem */
+#define LOG_CRON (9<<3) /* clock daemon */
+#define LOG_AUTHPRIV (10<<3) /* security/authorization messages */
+#define LOG_FTP (11<<3) /* FTP daemon */
+
+#define LOG_LOCAL0 (16<<3) /* reserved for local use */
+#define LOG_LOCAL1 (17<<3) /* reserved for local use */
+#define LOG_LOCAL2 (18<<3) /* reserved for local use */
+#define LOG_LOCAL3 (19<<3) /* reserved for local use */
+#define LOG_LOCAL4 (20<<3) /* reserved for local use */
+#define LOG_LOCAL5 (21<<3) /* reserved for local use */
+#define LOG_LOCAL6 (22<<3) /* reserved for local use */
+#define LOG_LOCAL7 (23<<3) /* reserved for local use */
+
+#define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */
+ /* extract priority */
+#define LOG_PRI(p) ((p) & LOG_PRIMASK)
+
+
+// Intentional no-op stub: this Windows shim only implements message
+// emission via the syslog() declared below; there is no log handle
+// to open, so ident/option/facility are ignored.
+static inline void
+openlog(const char *ident, int option, int facility)
+{
+}
+
+void
+syslog(int priority, const char *format, ...);
+
+#endif /* syslog.h */
diff --git a/src/include/win32/win32_errno.h b/src/include/win32/win32_errno.h
new file mode 100644
index 000000000..dd8ff8474
--- /dev/null
+++ b/src/include/win32/win32_errno.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// We're going to preserve the error numbers defined by the Windows SDK but not
+// by Mingw headers. For others, we're going to use numbers greater than 256 to
+// avoid unintended overlaps.
+
+#ifndef WIN32_ERRNO_H
+#define WIN32_ERRNO_H 1
+
+#include <errno.h>
+
+#include "include/int_types.h"
+
+#ifndef EBADMSG
+#define EBADMSG 104
+#endif
+
+#ifndef ENODATA
+#define ENODATA 120
+#endif
+
+#ifndef ENOLINK
+#define ENOLINK 121
+#endif
+
+#ifndef ENOMSG
+#define ENOMSG 122
+#endif
+
+#ifndef ENOTRECOVERABLE
+#define ENOTRECOVERABLE 127
+#endif
+
+#ifndef ETIME
+#define ETIME 137
+#endif
+
+#ifndef ETXTBSY
+#define ETXTBSY 139
+#endif
+
+#ifndef ENODATA
+#define ENODATA 120
+#endif
+
+#define ESTALE 256
+#define EREMOTEIO 257
+
+#ifndef EBADE
+#define EBADE 258
+#endif
+
+#define EUCLEAN 259
+#define EREMCHG 260
+#define EKEYREJECTED 261
+#define EREMOTE 262
+
+// Not used at moment. Full coverage ensures that remote errors will be
+// converted and handled properly.
+#define EADV 263
+#define EBADFD 264
+#define EBADR 265
+#define EBADRQC 266
+#define EBADSLT 267
+#define EBFONT 268
+#define ECHRNG 269
+#define ECOMM 270
+#define EDOTDOT 271
+#define EHOSTDOWN 272
+#define EHWPOISON 273
+// Defined by Boost.
+#ifndef EIDRM
+#define EIDRM 274
+#endif
+#define EISNAM 275
+#define EKEYEXPIRED 276
+#define EKEYREVOKED 277
+#define EL2HLT 278
+#define EL2NSYNC 279
+#define EL3HLT 280
+#define EL3RST 281
+#define ELIBACC 282
+#define ELIBBAD 283
+#define ELIBEXEC 284
+#define ELIBMAX 285
+#define ELIBSCN 286
+#define ELNRNG 287
+#define EMEDIUMTYPE 288
+#define EMULTIHOP 289
+#define ENAVAIL 290
+#define ENOANO 291
+#define ENOCSI 292
+#define ENOKEY 293
+#define ENOMEDIUM 294
+#define ENONET 295
+#define ENOPKG 296
+#ifndef ENOSR
+#define ENOSR 297
+#endif
+#ifndef ENOSTR
+#define ENOSTR 298
+#endif
+#define ENOTNAM 299
+#define ENOTUNIQ 300
+#define EPFNOSUPPORT 301
+#define ERFKILL 302
+#define ESOCKTNOSUPPORT 303
+#define ESRMNT 304
+#define ESTRPIPE 305
+#define ETOOMANYREFS 306
+#define EUNATCH 307
+#define EUSERS 308
+#define EXFULL 309
+#define ENOTBLK 310
+
+#ifndef EDQUOT
+#define EDQUOT 311
+#endif
+
+#define ESHUTDOWN 312
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+__s32 wsae_to_errno(__s32 r);
+__u32 errno_to_ntstatus(__s32 r);
+__u32 cephfs_errno_to_ntstatus_map(int cephfs_errno);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // WIN32_ERRNO_H
diff --git a/src/include/win32/winsock_compat.h b/src/include/win32/winsock_compat.h
new file mode 100644
index 000000000..990cc4823
--- /dev/null
+++ b/src/include/win32/winsock_compat.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2019 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WINSOCK_COMPAT_H
+#define WINSOCK_COMPAT_H 1
+
+#include "winsock_wrapper.h"
+
+#ifndef poll
+#define poll WSAPoll
+#endif
+
+// afunix.h is available starting with Windows SDK 17063. Still, it wasn't
+// picked up by mingw yet, for which reason we're going to define sockaddr_un
+// here.
+#ifndef _AFUNIX_
+#define UNIX_PATH_MAX 108
+
/* Drop-in definition of the Unix-domain socket address structure,
 * mirroring the layout introduced by afunix.h in Windows SDK 17063
 * (not yet shipped with mingw, per the note above this guard). */
typedef struct sockaddr_un
{
 ADDRESS_FAMILY sun_family; /* AF_UNIX */
 char sun_path[UNIX_PATH_MAX]; /* pathname */
} SOCKADDR_UN, *PSOCKADDR_UN;
+
+#define SIO_AF_UNIX_GETPEERPID _WSAIOR(IOC_VENDOR, 256)
+#endif /* _AFUNIX_ */
+
+#endif /* WINSOCK_COMPAT_H */
diff --git a/src/include/win32/winsock_wrapper.h b/src/include/win32/winsock_wrapper.h
new file mode 100644
index 000000000..1bb951a9d
--- /dev/null
+++ b/src/include/win32/winsock_wrapper.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2020 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WINSOCK_WRAPPER_H
+#define WINSOCK_WRAPPER_H 1
+
+#ifdef __cplusplus
+// Boost complains if winsock2.h (or windows.h) is included before asio.hpp.
+#include <boost/asio.hpp>
+#endif
+
+#include <winsock2.h>
+#include <ws2ipdef.h>
+#include <ws2tcpip.h>
+
+#endif /* WINSOCK_WRAPPER_H */
diff --git a/src/include/xlist.h b/src/include/xlist.h
new file mode 100644
index 000000000..76d0ddccd
--- /dev/null
+++ b/src/include/xlist.h
@@ -0,0 +1,237 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XLIST_H
+#define CEPH_XLIST_H
+
+#include <iterator>
+#include <cstdlib>
+#include <ostream>
+
+#include "include/ceph_assert.h"
+
+template<typename T>
+class xlist {
+public:
+ class item {
+ public:
+ item(T i) : _item(i) {}
+ ~item() {
+ ceph_assert(!is_on_list());
+ }
+
+ item(const item& other) = delete;
+ item(item&& other) = delete;
+ const item& operator= (const item& right) = delete;
+ item& operator= (item&& right) = delete;
+
+ xlist* get_list() { return _list; }
+ bool is_on_list() const { return _list ? true:false; }
+ bool remove_myself() {
+ if (_list) {
+ _list->remove(this);
+ ceph_assert(_list == 0);
+ return true;
+ } else
+ return false;
+ }
+ void move_to_front() {
+ ceph_assert(_list);
+ _list->push_front(this);
+ }
+ void move_to_back() {
+ ceph_assert(_list);
+ _list->push_back(this);
+ }
+
+ private:
+ friend xlist;
+ T _item;
+ item *_prev = nullptr, *_next = nullptr;
+ xlist *_list = nullptr;
+ };
+
+ typedef item* value_type;
+ typedef item* const_reference;
+
+private:
+ item *_front, *_back;
+ size_t _size;
+
+public:
+ xlist(const xlist& other) {
+ _front = other._front;
+ _back = other._back;
+ _size = other._size;
+ }
+
+ xlist() : _front(0), _back(0), _size(0) {}
+ ~xlist() {
+ ceph_assert(_size == 0);
+ ceph_assert(_front == 0);
+ ceph_assert(_back == 0);
+ }
+
+ size_t size() const {
+ ceph_assert((bool)_front == (bool)_size);
+ return _size;
+ }
+ bool empty() const {
+ ceph_assert((bool)_front == (bool)_size);
+ return _front == 0;
+ }
+
+ void clear() {
+ while (_front)
+ remove(_front);
+ ceph_assert((bool)_front == (bool)_size);
+ }
+
+ void push_front(item *i) {
+ if (i->_list)
+ i->_list->remove(i);
+
+ i->_list = this;
+ i->_next = _front;
+ i->_prev = 0;
+ if (_front)
+ _front->_prev = i;
+ else
+ _back = i;
+ _front = i;
+ _size++;
+ }
+ void push_back(item *i) {
+ if (i->_list)
+ i->_list->remove(i);
+
+ i->_list = this;
+ i->_next = 0;
+ i->_prev = _back;
+ if (_back)
+ _back->_next = i;
+ else
+ _front = i;
+ _back = i;
+ _size++;
+ }
+ void remove(item *i) {
+ ceph_assert(i->_list == this);
+
+ if (i->_prev)
+ i->_prev->_next = i->_next;
+ else
+ _front = i->_next;
+ if (i->_next)
+ i->_next->_prev = i->_prev;
+ else
+ _back = i->_prev;
+ _size--;
+
+ i->_list = 0;
+ i->_next = i->_prev = 0;
+ ceph_assert((bool)_front == (bool)_size);
+ }
+
+ T front() { return static_cast<T>(_front->_item); }
+ const T front() const { return static_cast<const T>(_front->_item); }
+
+ T back() { return static_cast<T>(_back->_item); }
+ const T back() const { return static_cast<const T>(_back->_item); }
+
+ void pop_front() {
+ ceph_assert(!empty());
+ remove(_front);
+ }
+ void pop_back() {
+ ceph_assert(!empty());
+ remove(_back);
+ }
+
+ class iterator {
+ private:
+ item *cur;
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = T;
+ using difference_type = std::ptrdiff_t;
+ using pointer = T*;
+ using reference = T&;
+ iterator(item *i = 0) : cur(i) {}
+ T operator*() { return static_cast<T>(cur->_item); }
+ iterator& operator++() {
+ ceph_assert(cur);
+ ceph_assert(cur->_list);
+ cur = cur->_next;
+ return *this;
+ }
+ bool end() const { return cur == 0; }
+ friend bool operator==(const iterator& lhs, const iterator& rhs) {
+ return lhs.cur == rhs.cur;
+ }
+ friend bool operator!=(const iterator& lhs, const iterator& rhs) {
+ return lhs.cur != rhs.cur;
+ }
+ };
+
+ iterator begin() { return iterator(_front); }
+ iterator end() { return iterator(NULL); }
+
+ class const_iterator {
+ private:
+ item *cur;
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = T;
+ using difference_type = std::ptrdiff_t;
+ using pointer = const T*;
+ using reference = const T&;
+
+ const_iterator(item *i = 0) : cur(i) {}
+ const T operator*() { return static_cast<const T>(cur->_item); }
+ const_iterator& operator++() {
+ ceph_assert(cur);
+ ceph_assert(cur->_list);
+ cur = cur->_next;
+ return *this;
+ }
+ bool end() const { return cur == 0; }
+ friend bool operator==(const const_iterator& lhs,
+ const const_iterator& rhs) {
+ return lhs.cur == rhs.cur;
+ }
+ friend bool operator!=(const const_iterator& lhs,
+ const const_iterator& rhs) {
+ return lhs.cur != rhs.cur;
+ }
+ };
+
+ const_iterator begin() const { return const_iterator(_front); }
+ const_iterator end() const { return const_iterator(NULL); }
+
+ friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) {
+ bool first = true;
+ for (const auto &item : list) {
+ if (!first) {
+ oss << ", ";
+ }
+ oss << *item; /* item should be a pointer */
+ first = false;
+ }
+ return oss;
+ }
+};
+
+
+#endif